Diffstat (limited to 'llvm/test/CodeGen/NVPTX')
226 files changed, 9184 insertions, 6152 deletions
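Most of the RUN-line churn in the diffs below follows one pattern: `%if` guards keyed to a specific ptxas release (e.g. `ptxas-11.4`, or `ptxas && !ptxas-12.0` on the 32-bit `-mtriple=nvptx` runs) are replaced by capability predicates — `ptxas-isa-M.N` for the PTX ISA version a test emits, `ptxas-sm_XX` for the target architecture, and `ptxas-ptr32` for 32-bit codegen, which ptxas dropped in CUDA 12.0 (hence the old `!ptxas-12.0` guards). As a rough, hypothetical sketch only — this is not the real llvm/test/lit.cfg.py, and the version-to-ISA table is deliberately partial — such features could be derived by probing the local ptxas:

import re
import subprocess

# Illustrative only: PTX ISA versions a given ptxas might accept.
# Extend from the PTX ISA release notes as needed.
KNOWN_ISAS = [(6, 0), (6, 1), (6, 2), (6, 3), (6, 4), (6, 5),
              (7, 0), (7, 1), (7, 2), (7, 3), (7, 4), (7, 5),
              (7, 6), (7, 7), (7, 8), (8, 0)]

def ptxas_features(ptxas="ptxas"):
    """Return lit-style feature names for the installed ptxas."""
    out = subprocess.run([ptxas, "--version"], capture_output=True,
                         text=True, check=True).stdout
    cuda = tuple(map(int, re.search(r"release (\d+)\.(\d+)", out).groups()))
    # CUDA 11.x shipped PTX ISA 7.x with a matching minor version; other
    # releases would need a lookup table, so this is deliberately partial.
    max_isa = (7, cuda[1]) if cuda[0] == 11 else (8, 0)
    feats = {"ptxas"}
    feats |= {f"ptxas-isa-{hi}.{lo}" for hi, lo in KNOWN_ISAS
              if (hi, lo) <= max_isa}
    if cuda < (12, 0):
        feats.add("ptxas-ptr32")  # 32-bit codegen was removed in CUDA 12.0
    # ptxas-sm_XX features would be added the same way, from a table of the
    # architectures each CUDA release accepts (sm_80 needs CUDA >= 11.0, etc.).
    return feats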
diff --git a/llvm/test/CodeGen/NVPTX/access-non-generic.ll b/llvm/test/CodeGen/NVPTX/access-non-generic.ll index 601a352..9eb5048 100644 --- a/llvm/test/CodeGen/NVPTX/access-non-generic.ll +++ b/llvm/test/CodeGen/NVPTX/access-non-generic.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix PTX ; RUN: opt -mtriple=nvptx-- < %s -S -passes=infer-address-spaces | FileCheck %s --check-prefix IR ; RUN: opt -mtriple=nvptx64-- < %s -S -passes=infer-address-spaces | FileCheck %s --check-prefix IR -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} @array = internal addrspace(3) global [10 x float] zeroinitializer, align 4 diff --git a/llvm/test/CodeGen/NVPTX/activemask.ll b/llvm/test/CodeGen/NVPTX/activemask.ll index aa3c581..18918c5 100644 --- a/llvm/test/CodeGen/NVPTX/activemask.ll +++ b/llvm/test/CodeGen/NVPTX/activemask.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -O2 -mcpu=sm_52 -mattr=+ptx62 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx62 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx62 | %ptxas-verify %} declare i32 @llvm.nvvm.activemask() diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll b/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll index 00b1789..929196f 100644 --- a/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll +++ b/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s -check-prefixes=NOPTRCONV ; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 --nvptx-short-ptr | FileCheck %s -check-prefixes=PTRCONV -; RUN: %if ptxas-12.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas-12.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 --nvptx-short-ptr | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 --nvptx-short-ptr | %ptxas-verify -arch=sm_90 %} ; ALL-LABEL: conv_shared_cluster_to_generic define i32 @conv_shared_cluster_to_generic(ptr addrspace(7) %ptr) { diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast.ll b/llvm/test/CodeGen/NVPTX/addrspacecast.ll index 86008a1..e7212ce 100644 --- a/llvm/test/CodeGen/NVPTX/addrspacecast.ll +++ b/llvm/test/CodeGen/NVPTX/addrspacecast.ll @@ -1,7 +1,7 @@ ; RUN: llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,CLS32 ; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,NOPTRCONV,CLS64 ; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s -check-prefixes=ALL,PTRCONV,CLS64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %} diff --git 
a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll index abc873e..fab60bd 100644 --- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll +++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll @@ -10,19 +10,20 @@ declare {float, float} @bars({float, float} %input) define void @test_v2f32(<2 x float> %input, ptr %output) { ; CHECK-LABEL: test_v2f32( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v2f32_param_0]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: .param .align 8 .b8 retval0[8]; -; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; ; CHECK-NEXT: call.uni (retval0), barv, (param0); -; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [retval0]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: ld.param.b64 %rd4, [test_v2f32_param_1]; -; CHECK-NEXT: st.b64 [%rd4], %rd2; +; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_1]; +; CHECK-NEXT: st.v2.b32 [%rd1], {%r3, %r4}; ; CHECK-NEXT: ret; %call = tail call <2 x float> @barv(<2 x float> %input) store <2 x float> %call, ptr %output, align 8 @@ -32,24 +33,28 @@ define void @test_v2f32(<2 x float> %input, ptr %output) { define void @test_v3f32(<3 x float> %input, ptr %output) { ; CHECK-LABEL: test_v3f32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_0]; -; CHECK-NEXT: ld.param.b32 %r1, [test_v3f32_param_0+8]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v3f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r3, [test_v3f32_param_0+8]; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: .param .align 16 .b8 retval0[16]; -; CHECK-NEXT: st.param.b32 [param0+8], %r1; -; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: st.param.b32 [param0+8], %r3; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; ; CHECK-NEXT: call.uni (retval0), barv3, (param0); -; CHECK-NEXT: ld.param.b32 %r2, [retval0+8]; -; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-NEXT: ld.param.b32 %r4, [retval0+8]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [retval0]; ; CHECK-NEXT: } // callseq 1 -; CHECK-NEXT: ld.param.b64 %rd4, [test_v3f32_param_1]; -; CHECK-NEXT: st.b32 [%rd4+8], %r2; -; CHECK-NEXT: st.b64 [%rd4], %rd2; +; CHECK-NEXT: cvt.u64.u32 %rd1, %r5; +; CHECK-NEXT: cvt.u64.u32 %rd2, %r6; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 32; +; CHECK-NEXT: or.b64 %rd4, %rd1, %rd3; +; CHECK-NEXT: ld.param.b64 %rd5, [test_v3f32_param_1]; +; CHECK-NEXT: st.b32 [%rd5+8], %r4; +; CHECK-NEXT: st.b64 [%rd5], %rd4; ; CHECK-NEXT: ret; %call = tail call <3 x float> @barv3(<3 x float> %input) ; Make sure we don't load more values than we need to.
@@ -60,7 +65,7 @@ define void @test_v3f32(<3 x float> %input, ptr %output) { define void @test_a2f32([2 x float] %input, ptr %output) { ; CHECK-LABEL: test_a2f32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: @@ -87,7 +92,7 @@ define void @test_a2f32([2 x float] %input, ptr %output) { define void @test_s2f32({float, float} %input, ptr %output) { ; CHECK-LABEL: test_s2f32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: diff --git a/llvm/test/CodeGen/NVPTX/alias.ll b/llvm/test/CodeGen/NVPTX/alias.ll index 01761c2..d5d0c76 100644 --- a/llvm/test/CodeGen/NVPTX/alias.ll +++ b/llvm/test/CodeGen/NVPTX/alias.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx64 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx64 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx64 | %ptxas-verify %} define i32 @a() { ret i32 0 } @b = internal alias i32 (), ptr @a diff --git a/llvm/test/CodeGen/NVPTX/annotations.ll b/llvm/test/CodeGen/NVPTX/annotations.ll index 5360e89..8972953 100644 --- a/llvm/test/CodeGen/NVPTX/annotations.ll +++ b/llvm/test/CodeGen/NVPTX/annotations.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} @texture = internal addrspace(1) global i64 0, align 8 diff --git a/llvm/test/CodeGen/NVPTX/applypriority.ll b/llvm/test/CodeGen/NVPTX/applypriority.ll index 23b1bda..92092a7 100644 --- a/llvm/test/CodeGen/NVPTX/applypriority.ll +++ b/llvm/test/CodeGen/NVPTX/applypriority.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| FileCheck --check-prefixes=CHECK-PTX64 %s
-; RUN: %if ptxas-11.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| %ptxas-verify -arch=sm_80 %}
+; RUN: %if ptxas-sm_80 && ptxas-isa-7.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| %ptxas-verify -arch=sm_80 %}
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll b/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll index ce71d3a7..500ff4f 100644 --- a/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll +++ b/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %} ;; These tests should run for all targets diff --git a/llvm/test/CodeGen/NVPTX/arithmetic-int.ll b/llvm/test/CodeGen/NVPTX/arithmetic-int.ll index 1fbfd0a..5e02a7d 100644 --- a/llvm/test/CodeGen/NVPTX/arithmetic-int.ll +++ b/llvm/test/CodeGen/NVPTX/arithmetic-int.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ;; These tests should run for all targets diff --git a/llvm/test/CodeGen/NVPTX/async-copy.ll b/llvm/test/CodeGen/NVPTX/async-copy.ll index cefb8ede..0d8e230 100644 --- a/llvm/test/CodeGen/NVPTX/async-copy.ll +++ b/llvm/test/CodeGen/NVPTX/async-copy.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s -; RUN: %if ptxas-11.0 && ! 
ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare void @llvm.nvvm.cp.async.wait.group(i32) diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll index b19f6d5..392cd8b 100644 --- a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll +++ b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll @@ -4,12 +4,12 @@ ; CHECK: error: unsupported cmpxchg ; CHECK: error: unsupported cmpxchg ; CHECK: error: unsupported cmpxchg -define void @bitwise_i128(ptr %0, i128 %1) { +define void @bitwise_i256(ptr %0, i256 %1) { entry: - %2 = atomicrmw and ptr %0, i128 %1 monotonic, align 16 - %3 = atomicrmw or ptr %0, i128 %1 monotonic, align 16 - %4 = atomicrmw xor ptr %0, i128 %1 monotonic, align 16 - %5 = atomicrmw xchg ptr %0, i128 %1 monotonic, align 16 + %2 = atomicrmw and ptr %0, i256 %1 monotonic, align 16 + %3 = atomicrmw or ptr %0, i256 %1 monotonic, align 16 + %4 = atomicrmw xor ptr %0, i256 %1 monotonic, align 16 + %5 = atomicrmw xchg ptr %0, i256 %1 monotonic, align 16 ret void } @@ -17,11 +17,11 @@ entry: ; CHECK: error: unsupported cmpxchg ; CHECK: error: unsupported cmpxchg ; CHECK: error: unsupported cmpxchg -define void @minmax_i128(ptr %0, i128 %1) { +define void @minmax_i256(ptr %0, i256 %1) { entry: - %2 = atomicrmw min ptr %0, i128 %1 monotonic, align 16 - %3 = atomicrmw max ptr %0, i128 %1 monotonic, align 16 - %4 = atomicrmw umin ptr %0, i128 %1 monotonic, align 16 - %5 = atomicrmw umax ptr %0, i128 %1 monotonic, align 16 + %2 = atomicrmw min ptr %0, i256 %1 monotonic, align 16 + %3 = atomicrmw max ptr %0, i256 %1 monotonic, align 16 + %4 = atomicrmw umin ptr %0, i256 %1 monotonic, align 16 + %5 = atomicrmw umax ptr %0, i256 %1 monotonic, align 16 ret void } diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll index 94b3f0a..88fae7a 100644 --- a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefixes=ALL,SM30 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s --check-prefixes=ALL,SM60 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas-sm_60 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} ; CHECK-LABEL: fadd_double define void @fadd_double(ptr %0, double %1) { diff --git a/llvm/test/CodeGen/NVPTX/atomics-b128.ll b/llvm/test/CodeGen/NVPTX/atomics-b128.ll new file mode 100644 index 0000000..b2a3f94 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/atomics-b128.ll @@ -0,0 +1,1033 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: not llc < %s -mcpu=sm_90 -mattr=+ptx82 2>&1 | FileCheck %s --check-prefix=ERROR +; RUN: not llc < %s -mcpu=sm_80 -mattr=+ptx84 2>&1 | FileCheck %s --check-prefix=ERROR +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx84 | FileCheck %s --check-prefix=CHECK +; RUN: %if ptxas-sm_90 && ptxas-isa-8.4 %{ llc < %s 
-mcpu=sm_90 -mattr=+ptx84 | %ptxas-verify -arch=sm_90 %} + +;; TODO: Update cmpxchg.py so that it can automatically generate the IR for +;; these test cases. + +target triple = "nvptx64-nvidia-cuda" + +;; Check that the first couple of error messages are correct. +; ERROR: error: unsupported cmpxchg +; ERROR: error: unsupported cmpxchg + +define i128 @test_xchg_generic(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_generic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_generic_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_generic_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_global(ptr addrspace(1) %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_global( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_global_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_global_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.global.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr addrspace(1) %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_shared(ptr addrspace(3) %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_shared( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_shared_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_shared_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.shared.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr addrspace(3) %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_shared_cluster(ptr addrspace(7) %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_shared_cluster( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_shared_cluster_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_shared_cluster_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.shared::cluster.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr addrspace(7) %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_block(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_block( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_block_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_block_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg 
.b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.cta.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt syncscope("block") release + ret i128 %old +} + +define i128 @test_xchg_cluster(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_cluster( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_cluster_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_cluster_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.cluster.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt syncscope("cluster") release + ret i128 %old +} + +define i128 @test_xchg_gpu(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_gpu( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_gpu_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_gpu_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.gpu.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt syncscope("device") release + ret i128 %old +} + +define i128 @test_xchg_sys(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_sys( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_sys_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_sys_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_relaxed(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_relaxed( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_relaxed_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_relaxed_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.relaxed.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt monotonic + ret i128 %old +} + +define i128 @test_xchg_acquire(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_acquire( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_acquire_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_acquire_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: 
atom.acquire.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt acquire + ret i128 %old +} + +define i128 @test_xchg_release(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_release( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_release_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_release_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_acq_rel(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_acq_rel( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_acq_rel_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_acq_rel_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.acq_rel.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt acq_rel + ret i128 %old +} + +define i128 @test_cmpxchg_generic(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_generic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_generic_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_generic_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_generic_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_global(ptr addrspace(1) %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_global( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_global_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_global_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_global_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.sys.global.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i128 %cmp, i128 %new monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_shared(ptr addrspace(3) %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_shared( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; 
CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_shared_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_shared_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_shared_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.sys.shared.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i128 %cmp, i128 %new monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_block(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_block( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_block_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_block_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_block_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.cta.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new syncscope("block") monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_cluster(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_cluster( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_cluster_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_cluster_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_cluster_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.cluster.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new syncscope("cluster") monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_gpu(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_gpu( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_gpu_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_gpu_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_gpu_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.gpu.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new syncscope("device") monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_shared_cluster(ptr addrspace(7) %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_shared_cluster( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // 
%bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_shared_cluster_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_shared_cluster_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_shared_cluster_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr addrspace(7) %addr, i128 %cmp, i128 %new monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_monotonic_monotonic(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_monotonic_monotonic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_monotonic_monotonic_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_monotonic_monotonic_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_monotonic_monotonic_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_monotonic_acquire(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_monotonic_acquire( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_monotonic_acquire_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_monotonic_acquire_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_monotonic_acquire_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new monotonic acquire + ret i128 %new +} + +define i128 @test_cmpxchg_monotonic_seq_cst(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_monotonic_seq_cst( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_monotonic_seq_cst_param_0]; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_monotonic_seq_cst_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_monotonic_seq_cst_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new monotonic seq_cst + ret i128 %new +} + +define i128 @test_cmpxchg_acquire_monotonic(ptr %addr, i128 %cmp, 
i128 %new) { +; CHECK-LABEL: test_cmpxchg_acquire_monotonic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acquire_monotonic_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acquire_monotonic_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acquire_monotonic_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acquire monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_acquire_acquire(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_acquire_acquire( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acquire_acquire_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acquire_acquire_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acquire_acquire_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acquire acquire + ret i128 %new +} + +define i128 @test_cmpxchg_acquire_seq_cst(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_acquire_seq_cst( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acquire_seq_cst_param_0]; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acquire_seq_cst_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acquire_seq_cst_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acquire seq_cst + ret i128 %new +} + +define i128 @test_cmpxchg_release_monotonic(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_release_monotonic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_release_monotonic_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_release_monotonic_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_release_monotonic_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.release.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new release monotonic + ret 
i128 %new +} + +define i128 @test_cmpxchg_release_acquire(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_release_acquire( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_release_acquire_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_release_acquire_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_release_acquire_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acq_rel.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new release acquire + ret i128 %new +} + +define i128 @test_cmpxchg_release_seq_cst(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_release_seq_cst( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_release_seq_cst_param_0]; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_release_seq_cst_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_release_seq_cst_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new release seq_cst + ret i128 %new +} + +define i128 @test_cmpxchg_acq_rel_monotonic(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_acq_rel_monotonic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acq_rel_monotonic_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acq_rel_monotonic_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acq_rel_monotonic_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acq_rel.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acq_rel monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_acq_rel_acquire(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_acq_rel_acquire( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acq_rel_acquire_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acq_rel_acquire_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acq_rel_acquire_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acq_rel.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + 
%pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acq_rel acquire + ret i128 %new +} + +define i128 @test_cmpxchg_acq_rel_seq_cst(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_acq_rel_seq_cst( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acq_rel_seq_cst_param_0]; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acq_rel_seq_cst_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acq_rel_seq_cst_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acq_rel seq_cst + ret i128 %new +} + +define i128 @test_cmpxchg_seq_cst_monotonic(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_seq_cst_monotonic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_seq_cst_monotonic_param_0]; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_seq_cst_monotonic_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_seq_cst_monotonic_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new seq_cst monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_seq_cst_acquire(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_seq_cst_acquire( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_seq_cst_acquire_param_0]; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_seq_cst_acquire_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_seq_cst_acquire_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new seq_cst acquire + ret i128 %new +} + +define i128 @test_cmpxchg_seq_cst_seq_cst(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_seq_cst_seq_cst( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_seq_cst_seq_cst_param_0]; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_seq_cst_seq_cst_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_seq_cst_seq_cst_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, 
[%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new seq_cst seq_cst + ret i128 %new +} + +define i128 @test_atomicrmw_and(ptr %ptr, i128 %val) { +; CHECK-LABEL: test_atomicrmw_and( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_and_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_and_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; +; CHECK-NEXT: $L__BB34_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: and.b64 %rd6, %rd11, %rd4; +; CHECK-NEXT: and.b64 %rd7, %rd12, %rd5; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; +; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; +; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0; +; CHECK-NEXT: mov.b64 %rd11, %rd1; +; CHECK-NEXT: mov.b64 %rd12, %rd2; +; CHECK-NEXT: @%p1 bra $L__BB34_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; + %ret = atomicrmw and ptr %ptr, i128 %val monotonic + ret i128 %ret +} + +define i128 @test_atomicrmw_or(ptr %ptr, i128 %val) { +; CHECK-LABEL: test_atomicrmw_or( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_or_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_or_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; +; CHECK-NEXT: $L__BB35_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: or.b64 %rd6, %rd11, %rd4; +; CHECK-NEXT: or.b64 %rd7, %rd12, %rd5; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; +; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; +; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0; +; CHECK-NEXT: mov.b64 %rd11, %rd1; +; CHECK-NEXT: mov.b64 %rd12, %rd2; +; CHECK-NEXT: @%p1 bra $L__BB35_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; + %ret = atomicrmw or ptr %ptr, i128 %val monotonic + ret i128 %ret +} + +define i128 @test_atomicrmw_xor(ptr %ptr, i128 %val) { +; CHECK-LABEL: test_atomicrmw_xor( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_xor_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_xor_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; +; CHECK-NEXT: $L__BB36_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xor.b64 %rd6, %rd11, %rd4; +; CHECK-NEXT: xor.b64 %rd7, %rd12, %rd5; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; 
CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; +; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; +; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0; +; CHECK-NEXT: mov.b64 %rd11, %rd1; +; CHECK-NEXT: mov.b64 %rd12, %rd2; +; CHECK-NEXT: @%p1 bra $L__BB36_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; + %ret = atomicrmw xor ptr %ptr, i128 %val monotonic + ret i128 %ret +} + +define i128 @test_atomicrmw_min(ptr %ptr, i128 %val) { +; CHECK-LABEL: test_atomicrmw_min( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<7>; +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_min_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_min_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; +; CHECK-NEXT: $L__BB37_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: setp.lt.u64 %p1, %rd11, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: and.pred %p3, %p2, %p1; +; CHECK-NEXT: setp.lt.s64 %p4, %rd12, %rd5; +; CHECK-NEXT: or.pred %p5, %p3, %p4; +; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; +; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; +; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; +; CHECK-NEXT: mov.b64 %rd11, %rd1; +; CHECK-NEXT: mov.b64 %rd12, %rd2; +; CHECK-NEXT: @%p6 bra $L__BB37_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; + %ret = atomicrmw min ptr %ptr, i128 %val monotonic + ret i128 %ret +} + +define i128 @test_atomicrmw_max(ptr %ptr, i128 %val) { +; CHECK-LABEL: test_atomicrmw_max( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<7>; +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_max_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_max_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; +; CHECK-NEXT: $L__BB38_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: setp.gt.u64 %p1, %rd11, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: and.pred %p3, %p2, %p1; +; CHECK-NEXT: setp.gt.s64 %p4, %rd12, %rd5; +; CHECK-NEXT: or.pred %p5, %p3, %p4; +; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; +; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; +; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; +; CHECK-NEXT: mov.b64 %rd11, %rd1; +; CHECK-NEXT: mov.b64 %rd12, 
%rd2; +; CHECK-NEXT: @%p6 bra $L__BB38_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; + %ret = atomicrmw max ptr %ptr, i128 %val monotonic + ret i128 %ret +} + +define i128 @test_atomicrmw_umin(ptr %ptr, i128 %val) { +; CHECK-LABEL: test_atomicrmw_umin( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<7>; +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_umin_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_umin_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; +; CHECK-NEXT: $L__BB39_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: setp.lt.u64 %p1, %rd11, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: and.pred %p3, %p2, %p1; +; CHECK-NEXT: setp.lt.u64 %p4, %rd12, %rd5; +; CHECK-NEXT: or.pred %p5, %p3, %p4; +; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; +; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; +; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; +; CHECK-NEXT: mov.b64 %rd11, %rd1; +; CHECK-NEXT: mov.b64 %rd12, %rd2; +; CHECK-NEXT: @%p6 bra $L__BB39_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; + %ret = atomicrmw umin ptr %ptr, i128 %val monotonic + ret i128 %ret +} + +define i128 @test_atomicrmw_umax(ptr %ptr, i128 %val) { +; CHECK-LABEL: test_atomicrmw_umax( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<7>; +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_umax_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_umax_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; +; CHECK-NEXT: $L__BB40_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: setp.gt.u64 %p1, %rd11, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: and.pred %p3, %p2, %p1; +; CHECK-NEXT: setp.gt.u64 %p4, %rd12, %rd5; +; CHECK-NEXT: or.pred %p5, %p3, %p4; +; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; +; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; +; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; +; CHECK-NEXT: mov.b64 %rd11, %rd1; +; CHECK-NEXT: mov.b64 %rd12, %rd2; +; CHECK-NEXT: @%p6 bra $L__BB40_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; + %ret = atomicrmw umax ptr %ptr, i128 %val monotonic + ret i128 %ret +} + + +@si128 = internal addrspace(3) global i128 0, align 16 + +define void @test_atomicrmw_xchg_const() { +; CHECK-LABEL: test_atomicrmw_xchg_const( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; 
CHECK-NEXT: // demoted variable +; CHECK-NEXT: .shared .align 16 .b8 si128[16]; +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %rd1, 0; +; CHECK-NEXT: mov.b64 %rd2, 23; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd1}; +; CHECK-NEXT: atom.relaxed.sys.shared.exch.b128 dst, [si128], amt; +; CHECK-NEXT: mov.b128 {%rd3, %rd4}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: ret; + %res = atomicrmw xchg ptr addrspace(3) @si128, i128 23 monotonic + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm60.ll b/llvm/test/CodeGen/NVPTX/atomics-sm60.ll index 2e11323..ae10526 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm60.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas-sm_60 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas-sm_60 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} ; CHECK-LABEL: .func test( define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, double %d) { diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll index f710d7f..e2762ba 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll @@ -2,9 +2,9 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK64 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx62 | FileCheck %s --check-prefixes=CHECKPTX62 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx62 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.3 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.2 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx62 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-nvidia-cuda" @@ -47,90 +47,90 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62: { ; CHECKPTX62-NEXT: .reg .pred %p<5>; ; CHECKPTX62-NEXT: .reg .b16 %rs<11>; -; CHECKPTX62-NEXT: .reg .b32 %r<58>; +; CHECKPTX62-NEXT: .reg .b32 %r<50>; ; CHECKPTX62-EMPTY: ; CHECKPTX62-NEXT: // %bb.0: ; CHECKPTX62-NEXT: ld.param.b16 %rs1, [test_param_3]; -; CHECKPTX62-NEXT: ld.param.b32 %r23, [test_param_2]; -; CHECKPTX62-NEXT: ld.param.b32 %r22, [test_param_1]; -; CHECKPTX62-NEXT: ld.param.b32 %r24, [test_param_0]; -; CHECKPTX62-NEXT: and.b32 %r1, %r24, -4; -; CHECKPTX62-NEXT: and.b32 %r25, %r24, 3; -; CHECKPTX62-NEXT: shl.b32 %r2, %r25, 3; -; CHECKPTX62-NEXT: mov.b32 %r26, 65535; -; CHECKPTX62-NEXT: shl.b32 %r27, %r26, %r2; -; CHECKPTX62-NEXT: not.b32 %r3, %r27; -; CHECKPTX62-NEXT: ld.b32 %r54, [%r1]; +; CHECKPTX62-NEXT: ld.param.b32 %r15, [test_param_2]; +; 
CHECKPTX62-NEXT: ld.param.b32 %r14, [test_param_1]; +; CHECKPTX62-NEXT: ld.param.b32 %r16, [test_param_0]; +; CHECKPTX62-NEXT: and.b32 %r1, %r16, -4; +; CHECKPTX62-NEXT: and.b32 %r17, %r16, 3; +; CHECKPTX62-NEXT: shl.b32 %r2, %r17, 3; +; CHECKPTX62-NEXT: mov.b32 %r18, 65535; +; CHECKPTX62-NEXT: shl.b32 %r19, %r18, %r2; +; CHECKPTX62-NEXT: not.b32 %r3, %r19; +; CHECKPTX62-NEXT: ld.b32 %r46, [%r1]; ; CHECKPTX62-NEXT: $L__BB0_1: // %atomicrmw.start45 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r28, %r54, %r2; -; CHECKPTX62-NEXT: cvt.u16.u32 %rs2, %r28; +; CHECKPTX62-NEXT: shr.u32 %r20, %r46, %r2; +; CHECKPTX62-NEXT: cvt.u16.u32 %rs2, %r20; ; CHECKPTX62-NEXT: add.rn.f16 %rs3, %rs2, %rs1; -; CHECKPTX62-NEXT: cvt.u32.u16 %r29, %rs3; -; CHECKPTX62-NEXT: shl.b32 %r30, %r29, %r2; -; CHECKPTX62-NEXT: and.b32 %r31, %r54, %r3; -; CHECKPTX62-NEXT: or.b32 %r32, %r31, %r30; -; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32; -; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r6, %r54; -; CHECKPTX62-NEXT: mov.b32 %r54, %r6; +; CHECKPTX62-NEXT: cvt.u32.u16 %r21, %rs3; +; CHECKPTX62-NEXT: shl.b32 %r22, %r21, %r2; +; CHECKPTX62-NEXT: and.b32 %r23, %r46, %r3; +; CHECKPTX62-NEXT: or.b32 %r24, %r23, %r22; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24; +; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r4, %r46; +; CHECKPTX62-NEXT: mov.b32 %r46, %r4; ; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1; ; CHECKPTX62-NEXT: // %bb.2: // %atomicrmw.end44 -; CHECKPTX62-NEXT: ld.b32 %r55, [%r1]; +; CHECKPTX62-NEXT: ld.b32 %r47, [%r1]; ; CHECKPTX62-NEXT: $L__BB0_3: // %atomicrmw.start27 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r33, %r55, %r2; -; CHECKPTX62-NEXT: cvt.u16.u32 %rs4, %r33; +; CHECKPTX62-NEXT: shr.u32 %r25, %r47, %r2; +; CHECKPTX62-NEXT: cvt.u16.u32 %rs4, %r25; ; CHECKPTX62-NEXT: mov.b16 %rs5, 0x3C00; ; CHECKPTX62-NEXT: add.rn.f16 %rs6, %rs4, %rs5; -; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs6; -; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r2; -; CHECKPTX62-NEXT: and.b32 %r36, %r55, %r3; -; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37; -; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r9, %r55; -; CHECKPTX62-NEXT: mov.b32 %r55, %r9; +; CHECKPTX62-NEXT: cvt.u32.u16 %r26, %rs6; +; CHECKPTX62-NEXT: shl.b32 %r27, %r26, %r2; +; CHECKPTX62-NEXT: and.b32 %r28, %r47, %r3; +; CHECKPTX62-NEXT: or.b32 %r29, %r28, %r27; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29; +; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r5, %r47; +; CHECKPTX62-NEXT: mov.b32 %r47, %r5; ; CHECKPTX62-NEXT: @%p2 bra $L__BB0_3; ; CHECKPTX62-NEXT: // %bb.4: // %atomicrmw.end26 -; CHECKPTX62-NEXT: and.b32 %r10, %r22, -4; -; CHECKPTX62-NEXT: shl.b32 %r38, %r22, 3; -; CHECKPTX62-NEXT: and.b32 %r11, %r38, 24; -; CHECKPTX62-NEXT: mov.b32 %r39, 65535; -; CHECKPTX62-NEXT: shl.b32 %r40, %r39, %r11; -; CHECKPTX62-NEXT: not.b32 %r12, %r40; -; CHECKPTX62-NEXT: ld.global.b32 %r56, [%r10]; +; CHECKPTX62-NEXT: and.b32 %r6, %r14, -4; +; CHECKPTX62-NEXT: shl.b32 %r30, %r14, 3; +; CHECKPTX62-NEXT: and.b32 %r7, %r30, 24; +; CHECKPTX62-NEXT: mov.b32 %r31, 65535; +; CHECKPTX62-NEXT: shl.b32 %r32, %r31, %r7; +; CHECKPTX62-NEXT: not.b32 %r8, %r32; +; CHECKPTX62-NEXT: ld.global.b32 %r48, [%r6]; ; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r41, %r56, %r11; -; CHECKPTX62-NEXT: cvt.u16.u32 %rs7, %r41; +; CHECKPTX62-NEXT: shr.u32 %r33, %r48, %r7; 
+; CHECKPTX62-NEXT: cvt.u16.u32 %rs7, %r33; ; CHECKPTX62-NEXT: add.rn.f16 %rs8, %rs7, %rs1; -; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs8; -; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11; -; CHECKPTX62-NEXT: and.b32 %r44, %r56, %r12; -; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45; -; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r15, %r56; -; CHECKPTX62-NEXT: mov.b32 %r56, %r15; +; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs8; +; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r7; +; CHECKPTX62-NEXT: and.b32 %r36, %r48, %r8; +; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35; +; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37; +; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r9, %r48; +; CHECKPTX62-NEXT: mov.b32 %r48, %r9; ; CHECKPTX62-NEXT: @%p3 bra $L__BB0_5; ; CHECKPTX62-NEXT: // %bb.6: // %atomicrmw.end8 -; CHECKPTX62-NEXT: and.b32 %r16, %r23, -4; -; CHECKPTX62-NEXT: shl.b32 %r46, %r23, 3; -; CHECKPTX62-NEXT: and.b32 %r17, %r46, 24; -; CHECKPTX62-NEXT: mov.b32 %r47, 65535; -; CHECKPTX62-NEXT: shl.b32 %r48, %r47, %r17; -; CHECKPTX62-NEXT: not.b32 %r18, %r48; -; CHECKPTX62-NEXT: ld.shared.b32 %r57, [%r16]; +; CHECKPTX62-NEXT: and.b32 %r10, %r15, -4; +; CHECKPTX62-NEXT: shl.b32 %r38, %r15, 3; +; CHECKPTX62-NEXT: and.b32 %r11, %r38, 24; +; CHECKPTX62-NEXT: mov.b32 %r39, 65535; +; CHECKPTX62-NEXT: shl.b32 %r40, %r39, %r11; +; CHECKPTX62-NEXT: not.b32 %r12, %r40; +; CHECKPTX62-NEXT: ld.shared.b32 %r49, [%r10]; ; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r49, %r57, %r17; -; CHECKPTX62-NEXT: cvt.u16.u32 %rs9, %r49; +; CHECKPTX62-NEXT: shr.u32 %r41, %r49, %r11; +; CHECKPTX62-NEXT: cvt.u16.u32 %rs9, %r41; ; CHECKPTX62-NEXT: add.rn.f16 %rs10, %rs9, %rs1; -; CHECKPTX62-NEXT: cvt.u32.u16 %r50, %rs10; -; CHECKPTX62-NEXT: shl.b32 %r51, %r50, %r17; -; CHECKPTX62-NEXT: and.b32 %r52, %r57, %r18; -; CHECKPTX62-NEXT: or.b32 %r53, %r52, %r51; -; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53; -; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r21, %r57; -; CHECKPTX62-NEXT: mov.b32 %r57, %r21; +; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs10; +; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11; +; CHECKPTX62-NEXT: and.b32 %r44, %r49, %r12; +; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43; +; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45; +; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r13, %r49; +; CHECKPTX62-NEXT: mov.b32 %r49, %r13; ; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7; ; CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end ; CHECKPTX62-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index f96fd30..e6c6a73 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -2,9 +2,9 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK64 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | FileCheck %s --check-prefixes=CHECKPTX71 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | %ptxas-verify -arch=sm_86 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 && ptxas-ptr32 %{ llc < 
%s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_86 && ptxas-isa-7.1 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | %ptxas-verify -arch=sm_86 %} target triple = "nvptx64-nvidia-cuda" @@ -47,93 +47,93 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71: { ; CHECKPTX71-NEXT: .reg .pred %p<5>; ; CHECKPTX71-NEXT: .reg .b16 %rs<14>; -; CHECKPTX71-NEXT: .reg .b32 %r<58>; +; CHECKPTX71-NEXT: .reg .b32 %r<50>; ; CHECKPTX71-EMPTY: ; CHECKPTX71-NEXT: // %bb.0: ; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3]; -; CHECKPTX71-NEXT: ld.param.b32 %r23, [test_param_2]; -; CHECKPTX71-NEXT: ld.param.b32 %r22, [test_param_1]; -; CHECKPTX71-NEXT: ld.param.b32 %r24, [test_param_0]; -; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4; -; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3; -; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3; -; CHECKPTX71-NEXT: mov.b32 %r26, 65535; -; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2; -; CHECKPTX71-NEXT: not.b32 %r3, %r27; -; CHECKPTX71-NEXT: ld.b32 %r54, [%r1]; +; CHECKPTX71-NEXT: ld.param.b32 %r15, [test_param_2]; +; CHECKPTX71-NEXT: ld.param.b32 %r14, [test_param_1]; +; CHECKPTX71-NEXT: ld.param.b32 %r16, [test_param_0]; +; CHECKPTX71-NEXT: and.b32 %r1, %r16, -4; +; CHECKPTX71-NEXT: and.b32 %r17, %r16, 3; +; CHECKPTX71-NEXT: shl.b32 %r2, %r17, 3; +; CHECKPTX71-NEXT: mov.b32 %r18, 65535; +; CHECKPTX71-NEXT: shl.b32 %r19, %r18, %r2; +; CHECKPTX71-NEXT: not.b32 %r3, %r19; +; CHECKPTX71-NEXT: ld.b32 %r46, [%r1]; ; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2; -; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28; +; CHECKPTX71-NEXT: shr.u32 %r20, %r46, %r2; +; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r20; ; CHECKPTX71-NEXT: mov.b16 %rs3, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs4, %rs2, %rs3, %rs1; -; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4; -; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2; -; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3; -; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30; -; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32; -; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r6, %r54; -; CHECKPTX71-NEXT: mov.b32 %r54, %r6; +; CHECKPTX71-NEXT: cvt.u32.u16 %r21, %rs4; +; CHECKPTX71-NEXT: shl.b32 %r22, %r21, %r2; +; CHECKPTX71-NEXT: and.b32 %r23, %r46, %r3; +; CHECKPTX71-NEXT: or.b32 %r24, %r23, %r22; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24; +; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r4, %r46; +; CHECKPTX71-NEXT: mov.b32 %r46, %r4; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; ; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44 -; CHECKPTX71-NEXT: ld.b32 %r55, [%r1]; +; CHECKPTX71-NEXT: ld.b32 %r47, [%r1]; ; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2; -; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r33; +; CHECKPTX71-NEXT: shr.u32 %r25, %r47, %r2; +; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r25; ; CHECKPTX71-NEXT: mov.b16 %rs6, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs7, %rs5, %rs6, %rs6; -; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs7; -; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2; -; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3; -; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37; -; CHECKPTX71-NEXT: setp.ne.b32 %p2, 
%r9, %r55; -; CHECKPTX71-NEXT: mov.b32 %r55, %r9; +; CHECKPTX71-NEXT: cvt.u32.u16 %r26, %rs7; +; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2; +; CHECKPTX71-NEXT: and.b32 %r28, %r47, %r3; +; CHECKPTX71-NEXT: or.b32 %r29, %r28, %r27; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29; +; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r5, %r47; +; CHECKPTX71-NEXT: mov.b32 %r47, %r5; ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; ; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end26 -; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4; -; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3; -; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24; -; CHECKPTX71-NEXT: mov.b32 %r39, 65535; -; CHECKPTX71-NEXT: shl.b32 %r40, %r39, %r11; -; CHECKPTX71-NEXT: not.b32 %r12, %r40; -; CHECKPTX71-NEXT: ld.global.b32 %r56, [%r10]; +; CHECKPTX71-NEXT: and.b32 %r6, %r14, -4; +; CHECKPTX71-NEXT: shl.b32 %r30, %r14, 3; +; CHECKPTX71-NEXT: and.b32 %r7, %r30, 24; +; CHECKPTX71-NEXT: mov.b32 %r31, 65535; +; CHECKPTX71-NEXT: shl.b32 %r32, %r31, %r7; +; CHECKPTX71-NEXT: not.b32 %r8, %r32; +; CHECKPTX71-NEXT: ld.global.b32 %r48, [%r6]; ; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11; -; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r41; +; CHECKPTX71-NEXT: shr.u32 %r33, %r48, %r7; +; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r33; ; CHECKPTX71-NEXT: mov.b16 %rs9, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs10, %rs8, %rs9, %rs1; -; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs10; -; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; -; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12; -; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45; -; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r15, %r56; -; CHECKPTX71-NEXT: mov.b32 %r56, %r15; +; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs10; +; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r7; +; CHECKPTX71-NEXT: and.b32 %r36, %r48, %r8; +; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; +; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37; +; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r9, %r48; +; CHECKPTX71-NEXT: mov.b32 %r48, %r9; ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; ; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8 -; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4; -; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3; -; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24; -; CHECKPTX71-NEXT: mov.b32 %r47, 65535; -; CHECKPTX71-NEXT: shl.b32 %r48, %r47, %r17; -; CHECKPTX71-NEXT: not.b32 %r18, %r48; -; CHECKPTX71-NEXT: ld.shared.b32 %r57, [%r16]; +; CHECKPTX71-NEXT: and.b32 %r10, %r15, -4; +; CHECKPTX71-NEXT: shl.b32 %r38, %r15, 3; +; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24; +; CHECKPTX71-NEXT: mov.b32 %r39, 65535; +; CHECKPTX71-NEXT: shl.b32 %r40, %r39, %r11; +; CHECKPTX71-NEXT: not.b32 %r12, %r40; +; CHECKPTX71-NEXT: ld.shared.b32 %r49, [%r10]; ; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17; -; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r49; +; CHECKPTX71-NEXT: shr.u32 %r41, %r49, %r11; +; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r41; ; CHECKPTX71-NEXT: mov.b16 %rs12, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs13, %rs11, %rs12, %rs1; -; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs13; -; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17; -; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18; -; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51; -; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53; -; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r21, %r57; -; 
CHECKPTX71-NEXT: mov.b32 %r57, %r21; +; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs13; +; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; +; CHECKPTX71-NEXT: and.b32 %r44, %r49, %r12; +; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; +; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45; +; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r13, %r49; +; CHECKPTX71-NEXT: mov.b32 %r49, %r13; ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; ; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end ; CHECKPTX71-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll b/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll index e6636d7..d406f9c 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 | FileCheck %s -check-prefixes=CHECK,CHECK32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas-sm_60 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas-sm_60 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} ; CHECK-LABEL: .func test_atomics_scope( define void @test_atomics_scope(ptr %fp, float %f, diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll index 04a58cf..6ea02f3 100644 --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll @@ -425,40 +425,40 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; ; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<20>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [atomicrmw_add_f16_generic_param_1]; ; CHECK-NEXT: ld.param.b64 %rd2, [atomicrmw_add_f16_generic_param_0]; ; CHECK-NEXT: and.b64 %rd1, %rd2, -4; -; CHECK-NEXT: cvt.u32.u64 %r6, %rd2; -; CHECK-NEXT: and.b32 %r7, %r6, 3; -; CHECK-NEXT: shl.b32 %r1, %r7, 3; -; CHECK-NEXT: mov.b32 %r8, 65535; -; CHECK-NEXT: shl.b32 %r9, %r8, %r1; -; CHECK-NEXT: not.b32 %r2, %r9; -; CHECK-NEXT: ld.b32 %r19, [%rd1]; -; CHECK-NEXT: cvt.f32.f16 %r12, %rs1; +; CHECK-NEXT: cvt.u32.u64 %r4, %rd2; +; CHECK-NEXT: and.b32 %r5, %r4, 3; +; CHECK-NEXT: shl.b32 %r1, %r5, 3; +; CHECK-NEXT: mov.b32 %r6, 65535; +; CHECK-NEXT: shl.b32 %r7, %r6, %r1; +; CHECK-NEXT: not.b32 %r2, %r7; +; CHECK-NEXT: ld.b32 %r17, [%rd1]; +; CHECK-NEXT: cvt.f32.f16 %r10, %rs1; ; CHECK-NEXT: $L__BB24_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u32 %r10, %r19, %r1; -; CHECK-NEXT: cvt.u16.u32 %rs2, %r10; -; CHECK-NEXT: cvt.f32.f16 %r11, %rs2; -; CHECK-NEXT: add.rn.f32 %r13, %r11, %r12; -; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r13; -; CHECK-NEXT: cvt.u32.u16 %r14, %rs3; -; CHECK-NEXT: shl.b32 %r15, %r14, %r1; -; CHECK-NEXT: and.b32 %r16, %r19, %r2; -; CHECK-NEXT: or.b32 %r17, %r16, %r15; +; CHECK-NEXT: shr.u32 %r8, %r17, %r1; +; CHECK-NEXT: cvt.u16.u32 %rs2, %r8; +; CHECK-NEXT: cvt.f32.f16 %r9, %rs2; +; CHECK-NEXT: add.rn.f32 %r11, %r9, %r10; +; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r11; +; CHECK-NEXT: cvt.u32.u16 %r12, %rs3; +; CHECK-NEXT: shl.b32 %r13, %r12, %r1; +; CHECK-NEXT: and.b32 %r14, %r17, %r2; +; CHECK-NEXT: or.b32 %r15, %r14, %r13; ; CHECK-NEXT: membar.sys; -; CHECK-NEXT: atom.cas.b32 %r5, [%rd1], %r19, %r17; -; 
CHECK-NEXT: setp.ne.b32 %p1, %r5, %r19; -; CHECK-NEXT: mov.b32 %r19, %r5; +; CHECK-NEXT: atom.cas.b32 %r3, [%rd1], %r17, %r15; +; CHECK-NEXT: setp.ne.b32 %p1, %r3, %r17; +; CHECK-NEXT: mov.b32 %r17, %r3; ; CHECK-NEXT: @%p1 bra $L__BB24_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: shr.u32 %r18, %r5, %r1; -; CHECK-NEXT: st.param.b16 [func_retval0], %r18; +; CHECK-NEXT: shr.u32 %r16, %r3, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %r16; ; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr %addr, half %val seq_cst ret half %ret diff --git a/llvm/test/CodeGen/NVPTX/b52037.ll b/llvm/test/CodeGen/NVPTX/b52037.ll index b6317df..268a897 100644 --- a/llvm/test/CodeGen/NVPTX/b52037.ll +++ b/llvm/test/CodeGen/NVPTX/b52037.ll @@ -4,7 +4,7 @@ ; https://bugs.llvm.org/show_bug.cgi?id=52037 for the gory details. ; ; RUN: llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -O3 -o - %s | FileCheck %s -; RUN: %if ptxas %{ llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -O3 -o - %s | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 %{ llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -O3 -o - %s | %ptxas-verify -arch=sm_70 %} ; CHECK-LABEL: .visible .entry barney( ; CHECK-NOT: .local{{.*}}__local_depot diff --git a/llvm/test/CodeGen/NVPTX/barrier.ll b/llvm/test/CodeGen/NVPTX/barrier.ll index a3b0d21..f2d6f23 100644 --- a/llvm/test/CodeGen/NVPTX/barrier.ll +++ b/llvm/test/CodeGen/NVPTX/barrier.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} declare void @llvm.nvvm.bar.warp.sync(i32) declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index aee58a0..4d930cd 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -3,9 +3,9 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s -; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} -; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %} -; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" @@ -688,25 +688,25 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; 
SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; SM70-NEXT: cvt.u32.u16 %r5, %rs2; +; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; SM70-NEXT: cvt.u32.u16 %r5, %rs8; ; SM70-NEXT: shl.b32 %r6, %r5, 16; -; SM70-NEXT: cvt.u32.u16 %r7, %rs1; +; SM70-NEXT: cvt.u32.u16 %r7, %rs7; ; SM70-NEXT: shl.b32 %r8, %r7, 16; -; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; SM70-NEXT: cvt.u32.u16 %r9, %rs4; +; SM70-NEXT: cvt.u32.u16 %r9, %rs6; ; SM70-NEXT: shl.b32 %r10, %r9, 16; -; SM70-NEXT: cvt.u32.u16 %r11, %rs3; +; SM70-NEXT: cvt.u32.u16 %r11, %rs5; ; SM70-NEXT: shl.b32 %r12, %r11, 16; -; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r4; -; SM70-NEXT: cvt.u32.u16 %r13, %rs6; +; SM70-NEXT: cvt.u32.u16 %r13, %rs4; ; SM70-NEXT: shl.b32 %r14, %r13, 16; -; SM70-NEXT: cvt.u32.u16 %r15, %rs5; +; SM70-NEXT: cvt.u32.u16 %r15, %rs3; ; SM70-NEXT: shl.b32 %r16, %r15, 16; -; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; SM70-NEXT: cvt.u32.u16 %r17, %rs8; +; SM70-NEXT: cvt.u32.u16 %r17, %rs2; ; SM70-NEXT: shl.b32 %r18, %r17, 16; -; SM70-NEXT: cvt.u32.u16 %r19, %rs7; +; SM70-NEXT: cvt.u32.u16 %r19, %rs1; ; SM70-NEXT: shl.b32 %r20, %r19, 16; ; SM70-NEXT: st.param.v4.b32 [func_retval0+16], {%r20, %r18, %r16, %r14}; ; SM70-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r10, %r8, %r6}; @@ -721,18 +721,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-NEXT: // %bb.0: ; SM80-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; SM80-NEXT: cvt.f32.bf16 %r5, %rs2; -; SM80-NEXT: cvt.f32.bf16 %r6, %rs1; -; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; SM80-NEXT: cvt.f32.bf16 %r7, %rs4; -; SM80-NEXT: cvt.f32.bf16 %r8, %rs3; -; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r4; -; SM80-NEXT: cvt.f32.bf16 %r9, %rs6; -; SM80-NEXT: cvt.f32.bf16 %r10, %rs5; -; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; SM80-NEXT: cvt.f32.bf16 %r11, %rs8; -; SM80-NEXT: cvt.f32.bf16 %r12, %rs7; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; SM80-NEXT: cvt.f32.bf16 %r5, %rs8; +; SM80-NEXT: cvt.f32.bf16 %r6, %rs7; +; SM80-NEXT: cvt.f32.bf16 %r7, %rs6; +; SM80-NEXT: cvt.f32.bf16 %r8, %rs5; +; SM80-NEXT: cvt.f32.bf16 %r9, %rs4; +; SM80-NEXT: cvt.f32.bf16 %r10, %rs3; +; SM80-NEXT: cvt.f32.bf16 %r11, %rs2; +; SM80-NEXT: cvt.f32.bf16 %r12, %rs1; ; SM80-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM80-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM80-NEXT: ret; @@ -746,18 +746,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs1; -; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs3; -; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs6; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs5; -; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs8; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs7; +; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, 
%r3; +; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM80-FTZ-NEXT: ret; @@ -771,18 +771,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; SM90-NEXT: cvt.f32.bf16 %r5, %rs2; -; SM90-NEXT: cvt.f32.bf16 %r6, %rs1; -; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; SM90-NEXT: cvt.f32.bf16 %r7, %rs4; -; SM90-NEXT: cvt.f32.bf16 %r8, %rs3; -; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r4; -; SM90-NEXT: cvt.f32.bf16 %r9, %rs6; -; SM90-NEXT: cvt.f32.bf16 %r10, %rs5; -; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; SM90-NEXT: cvt.f32.bf16 %r11, %rs8; -; SM90-NEXT: cvt.f32.bf16 %r12, %rs7; +; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; SM90-NEXT: cvt.f32.bf16 %r5, %rs8; +; SM90-NEXT: cvt.f32.bf16 %r6, %rs7; +; SM90-NEXT: cvt.f32.bf16 %r7, %rs6; +; SM90-NEXT: cvt.f32.bf16 %r8, %rs5; +; SM90-NEXT: cvt.f32.bf16 %r9, %rs4; +; SM90-NEXT: cvt.f32.bf16 %r10, %rs3; +; SM90-NEXT: cvt.f32.bf16 %r11, %rs2; +; SM90-NEXT: cvt.f32.bf16 %r12, %rs1; ; SM90-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM90-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM90-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll index 80627a0..2c4aa6b 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 --enable-unsafe-fp-math | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 --enable-unsafe-fp-math | %ptxas-verify -arch=sm_80 %} +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -22,7 +22,7 @@ define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 { ; CHECK-NEXT: cvt.rn.bf16x2.f32 %r5, %r4, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; - %r = call <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a) + %r = call afn <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a) ret <2 x bfloat> %r } @@ -41,7 +41,7 @@ define <2 x bfloat> @test_cos(<2 x bfloat> %a) #0 #1 { ; CHECK-NEXT: cvt.rn.bf16x2.f32 %r5, %r4, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; - %r = call <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) + %r = call afn <2 x bfloat> 
@llvm.cos.f16(<2 x bfloat> %a) ret <2 x bfloat> %r } diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index b4641d0..3c6fb4b 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s -; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} -; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -204,7 +204,7 @@ declare <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b) #0 define <2 x bfloat> @test_call(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; CHECK-LABEL: test_call( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_call_param_0]; @@ -711,11 +711,11 @@ define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; -; CHECK-NEXT: and.b32 %r3, %r2, -2147450880; -; CHECK-NEXT: and.b32 %r4, %r1, 2147450879; -; CHECK-NEXT: or.b32 %r5, %r4, %r3; +; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_param_1]; +; CHECK-NEXT: and.b32 %r2, %r1, -2147450880; +; CHECK-NEXT: ld.param.b32 %r3, [test_copysign_param_0]; +; CHECK-NEXT: and.b32 %r4, %r3, 2147450879; +; CHECK-NEXT: or.b32 %r5, %r4, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; %r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b) diff --git a/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll b/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll new file mode 100644 index 0000000..a0a99fe --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx90 | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +; Test "blocksareclusters" attribute with full "reqntid" and "cluster_dim" +; attributes. +define ptx_kernel void @kernel1(ptr %input, ptr %output) #0 #1 #2 { +; CHECK-LABEL: kernel1( +; CHECK: .reqntid 1024, 1, 1 +; CHECK-NEXT: .reqnctapercluster 2, 2, 2 +; CHECK-NEXT: .blocksareclusters +; CHECK-NEXT: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ret; + ret void +} + +; Test "blocksareclusters" attribute with single dimension "reqntid" and +; "cluster_dim" attributes. 
+define ptx_kernel void @kernel2(ptr %input, ptr %output) #0 #3 #4 { +; CHECK-LABEL: kernel2( +; CHECK: .reqntid 1024 +; CHECK-NEXT: .reqnctapercluster 2 +; CHECK-NEXT: .blocksareclusters // @kernel2 +; CHECK-NEXT: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ret; + ret void +} + +; Test "blocksareclusters" attribute with two dimensions(not z dimension) +; "reqntid" and "cluster_dim" attributes. +define ptx_kernel void @kernel3(ptr %input, ptr %output) #0 #5 #6 { +; CHECK-LABEL: kernel3( +; CHECK: .reqntid 512, 2 +; CHECK-NEXT: .reqnctapercluster 2, 2 +; CHECK-NEXT: .blocksareclusters // @kernel3 +; CHECK-NEXT: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ret; + ret void +} + +attributes #0 = { "nvvm.blocksareclusters" } + +attributes #1 = { "nvvm.reqntid"="1024,1,1" } +attributes #2 = { "nvvm.cluster_dim"="2,2,2" } + +attributes #3 = { "nvvm.reqntid"="1024" } +attributes #4 = { "nvvm.cluster_dim"="2" } + +attributes #5 = { "nvvm.reqntid"="512,2" } +attributes #6 = { "nvvm.cluster_dim"="2,2" } diff --git a/llvm/test/CodeGen/NVPTX/bmsk.ll b/llvm/test/CodeGen/NVPTX/bmsk.ll index d5b2786..dee5a76 100644 --- a/llvm/test/CodeGen/NVPTX/bmsk.ll +++ b/llvm/test/CodeGen/NVPTX/bmsk.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-7.6 %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-unknown-cuda" diff --git a/llvm/test/CodeGen/NVPTX/branch-fold.mir b/llvm/test/CodeGen/NVPTX/branch-fold.mir index ca6f49f..c9abe3f 100644 --- a/llvm/test/CodeGen/NVPTX/branch-fold.mir +++ b/llvm/test/CodeGen/NVPTX/branch-fold.mir @@ -57,7 +57,7 @@ body: | ; CHECK-NEXT: bb.2.bb1: ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[ADDi64ri:%[0-9]+]]:b64 = ADDi64ri [[ADDi64ri]], 1 + ; CHECK-NEXT: [[ADDi64ri:%[0-9]+]]:b64 = ADD64ri [[ADDi64ri]], 1 ; CHECK-NEXT: [[SETP_s64ri:%[0-9]+]]:b1 = SETP_i64ri [[ADDi64ri]], 1, 2 ; CHECK-NEXT: CBranch [[SETP_s64ri]], %bb.2 ; CHECK-NEXT: {{ $}} @@ -76,7 +76,7 @@ body: | bb.2.bb1: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - %5:b64 = ADDi64ri %5, 1 + %5:b64 = ADD64ri %5, 1 %4:b1 = SETP_i64ri %5, 1, 2 CBranch %4, %bb.2 diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll index 0d1d6da..e3d1c80 100644 --- a/llvm/test/CodeGen/NVPTX/bswap.ll +++ b/llvm/test/CodeGen/NVPTX/bswap.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | FileCheck -check-prefixes CHECK,PTX70 %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | %ptxas-verify %} +; RUN: %if ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | %ptxas-verify %} ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | FileCheck -check-prefixes CHECK,PTX71 %s -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} +; RUN: %if ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll 
b/llvm/test/CodeGen/NVPTX/bug26185-2.ll index 46172b1..4e11f58 100644 --- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll +++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll @@ -16,7 +16,7 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p ; CHECK: .maxntid 1, 1, 1 ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-NEXT: .reg .b64 %rd<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %bb ; CHECK-NEXT: ld.param.b64 %rd1, [spam_param_0]; @@ -25,9 +25,10 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p ; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3; ; CHECK-NEXT: ld.param.b64 %rd5, [spam_param_1]; ; CHECK-NEXT: ld.global.nc.s16 %r1, [%rd4+16]; -; CHECK-NEXT: ld.global.b64 %rd6, [%rd5]; -; CHECK-NEXT: mad.wide.s32 %rd7, %r1, %r1, %rd6; -; CHECK-NEXT: st.global.b64 [%rd5], %rd7; +; CHECK-NEXT: mul.wide.s32 %rd6, %r1, %r1; +; CHECK-NEXT: ld.global.b64 %rd7, [%rd5]; +; CHECK-NEXT: add.s64 %rd8, %rd6, %rd7; +; CHECK-NEXT: st.global.b64 [%rd5], %rd8; ; CHECK-NEXT: ret; bb: %tmp5 = add nsw i64 %arg3, 8 diff --git a/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll b/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll index 9988d5b..ed43b42 100644 --- a/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll +++ b/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_70 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 %{ llc < %s -mcpu=sm_70 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-nvidia-cuda" @@ -11,7 +11,7 @@ declare %struct.double2 @add(ptr align(16) byval(%struct.double2), ptr align(16) define void @call_byval(ptr %out, ptr %in1, ptr %in2) { ; CHECK-LABEL: call_byval( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<12>; +; CHECK-NEXT: .reg .b64 %rd<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [call_byval_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/byval-const-global.ll b/llvm/test/CodeGen/NVPTX/byval-const-global.ll index b4934e1a..81e7edf 100644 --- a/llvm/test/CodeGen/NVPTX/byval-const-global.ll +++ b/llvm/test/CodeGen/NVPTX/byval-const-global.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_70 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 %{ llc < %s -mcpu=sm_70 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/calling-conv.ll b/llvm/test/CodeGen/NVPTX/calling-conv.ll index 74b99ef..0bec7e6 100644 --- a/llvm/test/CodeGen/NVPTX/calling-conv.ll +++ b/llvm/test/CodeGen/NVPTX/calling-conv.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} diff --git a/llvm/test/CodeGen/NVPTX/cluster-dim.ll b/llvm/test/CodeGen/NVPTX/cluster-dim.ll index 196b967..a8101f6 100644 --- a/llvm/test/CodeGen/NVPTX/cluster-dim.ll +++ b/llvm/test/CodeGen/NVPTX/cluster-dim.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 
5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck -check-prefixes=CHECK80 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 | FileCheck -check-prefixes=CHECK90 %s -; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %} define ptx_kernel void @kernel_func_clusterxyz() "nvvm.cluster_dim"="3,5,7" { ; CHECK80-LABEL: kernel_func_clusterxyz( diff --git a/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll index c8b79df..d930d18 100644 --- a/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll +++ b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 %s | FileCheck %s --check-prefixes=CHECK,CHECK-PTX-SHARED64 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %} ; RUN: llc -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 %s | FileCheck %s --check-prefixes=CHECK,CHECK-PTX-SHARED64 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_101a %} ; RUN: llc -o - -mcpu=sm_120a -march=nvptx64 -mattr=+ptx86 %s | FileCheck %s --check-prefixes=CHECK,CHECK-PTX-SHARED64 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | %ptxas-verify -arch=sm_120a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_120a %} +; RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | %ptxas-verify -arch=sm_120a %} +; RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_120a %} define void @nvvm_clusterlaunchcontrol_try_cancel_multicast( ; CHECK-PTX-SHARED64-LABEL: nvvm_clusterlaunchcontrol_try_cancel_multicast( diff --git a/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol.ll b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol.ll index a8ccfc5..234fb66 100644 --- 
a/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol.ll +++ b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | FileCheck %s --check-prefixes=CHECK,CHECK-PTX-SHARED64 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100 %} define void @nvvm_clusterlaunchcontrol_try_cancel( ; CHECK-PTX-SHARED64-LABEL: nvvm_clusterlaunchcontrol_try_cancel( diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 63c389c3..d895c71 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -1,47 +1,46 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60 -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas-sm_60 && ptxas-isa-5.0 %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %} define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-LABEL: monotonic_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.param.b8 %r7, [monotonic_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: 
setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB0_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB0_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB0_1; ; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new @@ -52,42 +51,41 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.param.b8 %r7, [monotonic_acquire_i8_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB1_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB1_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB1_1; ; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new @@ -98,43 +96,42 @@ define i8 
@monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB2_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB2_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB2_1; ; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new @@ -145,42 +142,41 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.param.b8 %r7, [acquire_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB3_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB3_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB3_1; ; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new @@ -191,42 +187,41 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.param.b8 %r7, [acquire_acquire_i8_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: 
or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB4_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB4_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB4_1; ; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new @@ -237,43 +232,42 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [acquire_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB5_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB5_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB5_1; ; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new @@ -284,42 +278,41 @@ define i8 @release_monotonic_i8_global_cta(ptr 
addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [release_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB6_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB6_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB6_1; ; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new @@ -330,43 +323,42 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [release_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, 
%r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB7_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB7_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB7_1; ; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new @@ -377,43 +369,42 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [release_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; 
SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB8_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB8_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB8_1; ; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new @@ -424,43 +415,42 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB9_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB9_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB9_1; ; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new @@ -471,43 +461,42 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, 
i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB10_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB10_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB10_1; ; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -518,43 +507,42 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, 
%r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB11_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB11_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB11_1; ; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new @@ -565,43 +553,42 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, 
%r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB12_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB12_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB12_1; ; SM60-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new @@ -612,43 +599,42 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [seq_cst_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB13_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB13_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB13_1; ; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new @@ -659,43 +645,42 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, 
i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB14_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB14_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB14_1; ; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new @@ -706,40 +691,40 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 
%r19, %r15, %r2; +; SM60-NEXT: ld.param.b16 %r7, [monotonic_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB15_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB15_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB15_1; ; SM60-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic ret i16 %new @@ -750,41 +735,41 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b16 %r7, [monotonic_acquire_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 
%p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB16_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB16_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB16_1; ; SM60-NEXT: $L__BB16_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire ret i16 %new @@ -795,42 +780,42 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB17_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB17_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB17_1; ; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst ret i16 %new @@ -841,41 +826,41 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg 
.b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b16 %r7, [acquire_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB18_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB18_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB18_1; ; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ret i16 %new @@ -886,41 +871,41 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b16 %r7, [acquire_acquire_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; 
+; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB19_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB19_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB19_1; ; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ret i16 %new @@ -931,42 +916,42 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [acquire_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB20_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: 
Header=BB20_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB20_1; ; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new @@ -977,41 +962,41 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [release_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB21_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB21_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB21_1; ; SM60-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ret i16 %new @@ -1022,42 +1007,42 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, 
[release_acquire_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [release_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB22_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB22_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB22_1; ; SM60-NEXT: $L__BB22_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire ret i16 %new @@ -1068,42 +1053,42 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [release_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, 
%r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB23_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB23_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB23_1; ; SM60-NEXT: $L__BB23_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new @@ -1114,42 +1099,42 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB24_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB24_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, 
%r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB24_1; ; SM60-NEXT: $L__BB24_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ret i16 %new @@ -1160,42 +1145,42 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [acq_rel_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB25_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB25_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB25_1; ; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ret i16 %new @@ -1206,42 +1191,42 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB26_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB26_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB26_1; ; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst ret i16 %new @@ -1252,42 +1237,42 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 
%r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB27_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB27_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB27_1; ; SM60-NEXT: $L__BB27_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic ret i16 %new @@ -1298,42 +1283,42 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [seq_cst_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB28_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB28_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, 
%r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB28_1; ; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire ret i16 %new @@ -1344,42 +1329,42 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB29_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB29_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB29_1; ; SM60-NEXT: $L__BB29_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst ret i16 %new @@ -1899,43 +1884,42 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 
%r9, [acq_rel_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB60_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB60_1; ; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -1997,43 +1981,42 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, 
%r1; +; SM60-NEXT: ld.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB64_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB64_1; ; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -2044,43 +2027,42 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.shared.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB65_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB65_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: 
@%p2 bra $L__BB65_1; ; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index 5cb344d..76220ee 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -1,47 +1,46 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70 -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.3 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-LABEL: monotonic_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.param.b8 %r7, [monotonic_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = 
cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new @@ -52,42 +51,41 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.param.b8 %r7, [monotonic_acquire_i8_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new @@ -98,43 +96,42 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; 
SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB2_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new @@ -145,42 +142,41 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.param.b8 %r7, [acquire_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; 
SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB3_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new @@ -191,42 +187,41 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.param.b8 %r7, [acquire_acquire_i8_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB4_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 
[func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new @@ -237,43 +232,42 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acquire_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB5_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB5_1; ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new @@ -284,42 +278,41 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [release_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; 
SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB6_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB6_1; ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new @@ -330,43 +323,42 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [release_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner 
Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB7_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB7_1; ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new @@ -377,43 +369,42 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [release_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB8_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB8_1; ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; 
-; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new @@ -424,43 +415,42 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB9_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB9_1; ; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new @@ -471,43 +461,42 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_global_cta_param_1]; ; 
SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB10_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB10_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB10_1; ; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -518,43 +507,42 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: 
and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB11_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB11_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB11_1; ; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new @@ -565,43 +553,42 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB12_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB12_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, 
%r6; ; SM70-NEXT: @%p2 bra $L__BB12_1; ; SM70-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new @@ -612,43 +599,42 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [seq_cst_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB13_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB13_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB13_1; ; SM70-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new @@ -659,43 +645,42 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, 
[seq_cst_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB14_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB14_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB14_1; ; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new @@ -706,40 +691,40 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b16 %r7, [monotonic_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: 
shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB15_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB15_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB15_1; ; SM70-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic ret i16 %new @@ -750,41 +735,41 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b16 %r7, [monotonic_acquire_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB16_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB16_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, 
%r6; ; SM70-NEXT: @%p2 bra $L__BB16_1; ; SM70-NEXT: $L__BB16_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire ret i16 %new @@ -795,42 +780,42 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB17_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB17_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB17_1; ; SM70-NEXT: $L__BB17_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst ret i16 %new @@ -841,41 +826,41 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, 
[acquire_monotonic_i16_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b16 %r7, [acquire_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB18_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB18_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB18_1; ; SM70-NEXT: $L__BB18_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ret i16 %new @@ -886,41 +871,41 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b16 %r7, [acquire_acquire_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: 
and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB19_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB19_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB19_1; ; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ret i16 %new @@ -931,42 +916,42 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [acquire_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB20_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB20_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra 
$L__BB20_1; ; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new @@ -977,41 +962,41 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [release_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB21_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB21_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB21_1; ; SM70-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ret i16 %new @@ -1022,42 +1007,42 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM70-NEXT: 
ld.param.b16 %r7, [release_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB22_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB22_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB22_1; ; SM70-NEXT: $L__BB22_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire ret i16 %new @@ -1068,42 +1053,42 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [release_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; 
SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB23_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB23_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB23_1; ; SM70-NEXT: $L__BB23_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new @@ -1114,42 +1099,42 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB24_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB24_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; 
SM70-NEXT: @%p2 bra $L__BB24_1; ; SM70-NEXT: $L__BB24_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ret i16 %new @@ -1160,42 +1145,42 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [acq_rel_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB25_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB25_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB25_1; ; SM70-NEXT: $L__BB25_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ret i16 %new @@ -1206,42 +1191,42 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, 
[acq_rel_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB26_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB26_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB26_1; ; SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst ret i16 %new @@ -1252,42 +1237,42 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 
%r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB27_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB27_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB27_1; ; SM70-NEXT: $L__BB27_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic ret i16 %new @@ -1298,42 +1283,42 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [seq_cst_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB28_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB28_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 
%p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB28_1; ; SM70-NEXT: $L__BB28_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire ret i16 %new @@ -1344,42 +1329,42 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB29_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB29_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB29_1; ; SM70-NEXT: $L__BB29_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst ret i16 %new @@ -1899,43 +1884,42 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: 
ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB60_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB60_1; ; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -1997,43 +1981,42 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, 
%r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB64_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB64_1; ; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -2044,43 +2027,42 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.shared.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB65_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; 
+; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB65_1; ; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index 7cb2590..4cdedb2 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -1,47 +1,46 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.7 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-LABEL: monotonic_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.param.b8 %r7, [monotonic_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB0_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB0_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB0_1; ; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 
[func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new @@ -52,42 +51,41 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.param.b8 %r7, [monotonic_acquire_i8_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB1_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB1_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB1_1; ; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new @@ -98,43 +96,42 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: 
cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB2_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB2_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB2_1; ; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new @@ -145,42 +142,41 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.param.b8 %r7, [acquire_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB3_1: // 
%partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB3_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB3_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB3_1; ; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new @@ -191,42 +187,41 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.param.b8 %r7, [acquire_acquire_i8_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB4_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB4_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB4_1; ; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; 
SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new @@ -237,43 +232,42 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [acquire_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB5_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB5_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB5_1; ; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new @@ -284,42 +278,41 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, 
[release_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB6_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB6_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB6_1; ; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new @@ -330,43 +323,42 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [release_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, 
[%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB7_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB7_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB7_1; ; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new @@ -377,43 +369,42 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [release_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB8_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB8_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 
%r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB8_1; ; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new @@ -424,43 +415,42 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB9_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB9_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB9_1; ; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new @@ -471,43 +461,42 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: 
ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB10_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB10_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB10_1; ; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -518,43 +507,42 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, 
%rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB11_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB11_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB11_1; ; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new @@ -565,43 +553,42 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB12_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB12_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; 
SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB12_1; ; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new @@ -612,43 +599,42 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [seq_cst_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB13_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB13_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB13_1; ; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new @@ -659,43 +645,42 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; ; 
SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB14_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB14_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB14_1; ; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new @@ -706,40 +691,40 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b16 %r7, [monotonic_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, 
%r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB15_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB15_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB15_1; ; SM90-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic ret i16 %new @@ -750,41 +735,41 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b16 %r7, [monotonic_acquire_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB16_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB16_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: 
mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB16_1; ; SM90-NEXT: $L__BB16_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire ret i16 %new @@ -795,42 +780,42 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB17_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB17_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB17_1; ; SM90-NEXT: $L__BB17_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst ret i16 %new @@ -841,41 +826,41 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; ; SM90-NEXT: 
ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b16 %r7, [acquire_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB18_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB18_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB18_1; ; SM90-NEXT: $L__BB18_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ret i16 %new @@ -886,41 +871,41 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b16 %r7, [acquire_acquire_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, 
%rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB19_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB19_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB19_1; ; SM90-NEXT: $L__BB19_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ret i16 %new @@ -931,42 +916,42 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [acquire_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB20_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB20_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; 
SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB20_1; ; SM90-NEXT: $L__BB20_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new @@ -977,41 +962,41 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [release_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB21_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB21_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB21_1; ; SM90-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ret i16 %new @@ -1022,42 +1007,42 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, 
[release_acquire_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [release_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB22_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB22_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB22_1; ; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire ret i16 %new @@ -1068,42 +1053,42 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [release_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; 
SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB23_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB23_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB23_1; ; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new @@ -1114,42 +1099,42 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB24_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB24_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, 
%r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB24_1; ; SM90-NEXT: $L__BB24_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ret i16 %new @@ -1160,42 +1145,42 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [acq_rel_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB25_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB25_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB25_1; ; SM90-NEXT: $L__BB25_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ret i16 %new @@ -1206,42 +1191,42 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; 
; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB26_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB26_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB26_1; ; SM90-NEXT: $L__BB26_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst ret i16 %new @@ -1252,42 +1237,42 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; 
+; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB27_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB27_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB27_1; ; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic ret i16 %new @@ -1298,42 +1283,42 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [seq_cst_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB28_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB28_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; 
SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB28_1; ; SM90-NEXT: $L__BB28_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire ret i16 %new @@ -1344,42 +1329,42 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB29_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB29_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB29_1; ; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst ret i16 %new @@ -1899,43 +1884,42 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, 
[acq_rel_acquire_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB60_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB60_1; ; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -2014,43 +1998,42 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: 
mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB65_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB65_1; ; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -2061,43 +2044,42 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.shared.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB66_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB66_1 
Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB66_1; ; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index 237e423..ec37025 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_32 | FileCheck %s --check-prefixes=SM30,CHECK ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_32 | %ptxas-verify %} ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=SM70,CHECK -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} ; TODO: these are system scope, but are compiled to gpu scope.. ; TODO: these are seq_cst, but are compiled to relaxed.. @@ -14,82 +14,80 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.b8 %r9, [relaxed_sys_i8_param_1]; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 255; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: and.b32 %r15, %r14, 255; -; SM30-NEXT: shl.b32 %r3, %r15, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r16, [%rd1]; -; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: ld.param.b8 %r7, [relaxed_sys_i8_param_1]; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 255; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r17, %r20, %r3; -; SM30-NEXT: or.b32 %r18, %r20, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB0_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB0_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM30-NEXT: mov.b32 %r20, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; 
+; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB0_1; ; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: relaxed_sys_i8( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [relaxed_sys_i8_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.param.b8 %r7, [relaxed_sys_i8_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: relaxed_sys_i8( ; SM90: { @@ -140,84 +138,82 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.b8 %r9, [acquire_sys_i8_param_1]; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 255; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: and.b32 %r15, %r14, 255; -; SM30-NEXT: shl.b32 %r3, %r15, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r16, [%rd1]; -; SM30-NEXT: and.b32 %r20, 
%r16, %r2; +; SM30-NEXT: ld.param.b8 %r7, [acquire_sys_i8_param_1]; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 255; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r17, %r20, %r3; -; SM30-NEXT: or.b32 %r18, %r20, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB1_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB1_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM30-NEXT: mov.b32 %r20, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB1_1; ; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: acquire_sys_i8( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_sys_i8_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.param.b8 %r7, [acquire_sys_i8_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: 
and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: acquire_sys_i8( ; SM90: { @@ -269,84 +265,82 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b8 %r9, [release_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r7, [release_sys_i8_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 255; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: and.b32 %r15, %r14, 255; -; SM30-NEXT: shl.b32 %r3, %r15, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r16, [%rd1]; -; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 255; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r17, %r20, %r3; -; SM30-NEXT: or.b32 %r18, %r20, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB2_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB2_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM30-NEXT: mov.b32 %r20, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB2_1; ; SM30-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: release_sys_i8( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [release_sys_i8_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 
%r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB2_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: release_sys_i8( ; SM90: { @@ -398,86 +392,84 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b8 %r9, [acq_rel_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r7, [acq_rel_sys_i8_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 255; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: and.b32 %r15, %r14, 255; -; SM30-NEXT: shl.b32 %r3, %r15, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r16, [%rd1]; -; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 255; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r17, %r20, %r3; -; SM30-NEXT: or.b32 %r18, %r20, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB3_3; ; 
SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB3_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM30-NEXT: mov.b32 %r20, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB3_1; ; SM30-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: acq_rel_sys_i8( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acq_rel_sys_i8_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB3_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: acq_rel_sys_i8( ; SM90: { @@ -530,86 +522,84 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b8 %r9, [seq_cst_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r7, [seq_cst_sys_i8_param_1]; ; 
SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 255; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: and.b32 %r15, %r14, 255; -; SM30-NEXT: shl.b32 %r3, %r15, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r16, [%rd1]; -; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 255; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r17, %r20, %r3; -; SM30-NEXT: or.b32 %r18, %r20, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB4_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB4_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM30-NEXT: mov.b32 %r20, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB4_1; ; SM30-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: seq_cst_sys_i8( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [seq_cst_sys_i8_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; 
SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB4_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: seq_cst_sys_i8( ; SM90: { @@ -663,80 +653,80 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1]; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 65535; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r15, [%rd1]; -; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: ld.param.b16 %r7, [relaxed_sys_i16_param_1]; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 65535; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r16, %r19, %r3; -; SM30-NEXT: or.b32 %r17, %r19, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB5_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB5_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM30-NEXT: mov.b32 %r19, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB5_1; ; SM30-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: relaxed_sys_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; 
-; SM70-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b16 %r7, [relaxed_sys_i16_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB5_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB5_1; ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: relaxed_sys_i16( ; SM90: { @@ -786,82 +776,82 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1]; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 65535; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r15, [%rd1]; -; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: ld.param.b16 %r7, [acquire_sys_i16_param_1]; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 65535; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r16, %r19, %r3; -; SM30-NEXT: or.b32 %r17, %r19, %r4; -; SM30-NEXT: atom.cas.b32 
%r7, [%rd1], %r17, %r16; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB6_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB6_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM30-NEXT: mov.b32 %r19, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB6_1; ; SM30-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: acquire_sys_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b16 %r7, [acquire_sys_i16_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB6_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB6_1; ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: acquire_sys_i16( ; SM90: { @@ -912,82 +902,82 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b16 %rs1, 
[release_sys_i16_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r7, [release_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 65535; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r15, [%rd1]; -; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 65535; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r16, %r19, %r3; -; SM30-NEXT: or.b32 %r17, %r19, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB7_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB7_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM30-NEXT: mov.b32 %r19, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB7_1; ; SM30-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: release_sys_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [release_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; 
SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB7_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB7_1; ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: release_sys_i16( ; SM90: { @@ -1038,84 +1028,84 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r7, [acq_rel_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 65535; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r15, [%rd1]; -; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 65535; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r16, %r19, %r3; -; SM30-NEXT: or.b32 %r17, %r19, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB8_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB8_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM30-NEXT: mov.b32 %r19, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB8_1; ; SM30-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: acq_rel_sys_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: 
// %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [acq_rel_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB8_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB8_1; ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: acq_rel_sys_i16( ; SM90: { @@ -1168,84 +1158,84 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r7, [seq_cst_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 65535; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r15, [%rd1]; -; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 65535; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 
%r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r16, %r19, %r3; -; SM30-NEXT: or.b32 %r17, %r19, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB9_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB9_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM30-NEXT: mov.b32 %r19, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB9_1; ; SM30-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: seq_cst_sys_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [seq_cst_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB9_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB9_1; ; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: seq_cst_sys_i16( ; SM90: { diff --git 
a/llvm/test/CodeGen/NVPTX/combine-mad.ll b/llvm/test/CodeGen/NVPTX/combine-mad.ll index da303b7..04d1932 100644 --- a/llvm/test/CodeGen/NVPTX/combine-mad.ll +++ b/llvm/test/CodeGen/NVPTX/combine-mad.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -O1 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -O1 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -O1 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -O1 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -O1 | %ptxas-verify %} define i32 @test1(i32 %n, i32 %m) { @@ -189,7 +189,7 @@ declare i32 @use(i32 %0, i32 %1) define i32 @test_mad_multi_use(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: test_mad_multi_use( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_mad_multi_use_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/combine-min-max.ll b/llvm/test/CodeGen/NVPTX/combine-min-max.ll index e7140ab..c055008 100644 --- a/llvm/test/CodeGen/NVPTX/combine-min-max.ll +++ b/llvm/test/CodeGen/NVPTX/combine-min-max.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -O3 | FileCheck %s --check-prefixes=CHECK,SM90 ; RUN: llc < %s -mcpu=sm_20 -O3 | FileCheck %s --check-prefixes=CHECK,SM20 -; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_90 -mattr=+ptx80 -O3 | %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_20 -O3 | %ptxas-verify %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mcpu=sm_90 -mattr=+ptx80 -O3 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 -O3 | %ptxas-verify %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/combine-wide.ll b/llvm/test/CodeGen/NVPTX/combine-wide.ll index ed4a2b6..b5948d3 100644 --- a/llvm/test/CodeGen/NVPTX/combine-wide.ll +++ b/llvm/test/CodeGen/NVPTX/combine-wide.ll @@ -9,14 +9,15 @@ define i64 @t1(i32 %a, i32 %b, i64 %c) { ; O1-LABEL: t1( ; O1: { ; O1-NEXT: .reg .b32 %r<3>; -; O1-NEXT: .reg .b64 %rd<3>; +; O1-NEXT: .reg .b64 %rd<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b32 %r1, [t1_param_0]; ; O1-NEXT: ld.param.b32 %r2, [t1_param_1]; -; O1-NEXT: ld.param.b64 %rd1, [t1_param_2]; -; O1-NEXT: mad.wide.s32 %rd2, %r1, %r2, %rd1; -; O1-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NEXT: mul.wide.s32 %rd1, %r1, %r2; +; O1-NEXT: ld.param.b64 %rd2, [t1_param_2]; +; O1-NEXT: add.s64 %rd3, %rd2, %rd1; +; O1-NEXT: st.param.b64 [func_retval0], %rd3; ; O1-NEXT: ret; ; ; O0-LABEL: t1( @@ -44,14 +45,15 @@ define i64 @t2(i32 %a, i32 %b, i64 %c) { ; O1-LABEL: t2( ; O1: { ; O1-NEXT: .reg .b32 %r<3>; -; O1-NEXT: .reg .b64 %rd<3>; +; O1-NEXT: .reg .b64 %rd<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b32 %r1, [t2_param_0]; ; O1-NEXT: ld.param.b32 %r2, [t2_param_1]; -; O1-NEXT: ld.param.b64 %rd1, [t2_param_2]; -; O1-NEXT: mad.wide.s32 %rd2, %r1, %r2, %rd1; -; O1-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NEXT: mul.wide.s32 %rd1, %r1, %r2; +; O1-NEXT: ld.param.b64 %rd2, [t2_param_2]; +; O1-NEXT: add.s64 %rd3, %rd1, %rd2; +; O1-NEXT: st.param.b64 [func_retval0], %rd3; ; O1-NEXT: ret; ; ; O0-LABEL: t2( @@ -79,13 +81,14 @@ define i64 @t3(i32 %a, i32 %b) { ; O1-LABEL: t3( ; O1: { ; O1-NEXT: .reg .b32 %r<3>; -; 
O1-NEXT: .reg .b64 %rd<2>; +; O1-NEXT: .reg .b64 %rd<3>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b32 %r1, [t3_param_0]; ; O1-NEXT: ld.param.b32 %r2, [t3_param_1]; -; O1-NEXT: mad.wide.s32 %rd1, %r1, %r2, 1; -; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: mul.wide.s32 %rd1, %r1, %r2; +; O1-NEXT: add.s64 %rd2, %rd1, 1; +; O1-NEXT: st.param.b64 [func_retval0], %rd2; ; O1-NEXT: ret; ; ; O0-LABEL: t3( @@ -112,13 +115,14 @@ define i64 @t4(i32 %a, i64 %c) { ; O1-LABEL: t4( ; O1: { ; O1-NEXT: .reg .b32 %r<2>; -; O1-NEXT: .reg .b64 %rd<3>; +; O1-NEXT: .reg .b64 %rd<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b32 %r1, [t4_param_0]; ; O1-NEXT: ld.param.b64 %rd1, [t4_param_1]; -; O1-NEXT: mad.wide.s32 %rd2, %r1, 3, %rd1; -; O1-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NEXT: mul.wide.s32 %rd2, %r1, 3; +; O1-NEXT: add.s64 %rd3, %rd1, %rd2; +; O1-NEXT: st.param.b64 [func_retval0], %rd3; ; O1-NEXT: ret; ; ; O0-LABEL: t4( @@ -145,12 +149,13 @@ define i64 @t4_1(i32 %a, i64 %c) { ; O1-LABEL: t4_1( ; O1: { ; O1-NEXT: .reg .b32 %r<2>; -; O1-NEXT: .reg .b64 %rd<2>; +; O1-NEXT: .reg .b64 %rd<3>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b32 %r1, [t4_1_param_0]; -; O1-NEXT: mad.wide.s32 %rd1, %r1, 3, 5; -; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: mul.wide.s32 %rd1, %r1, 3; +; O1-NEXT: add.s64 %rd2, %rd1, 5; +; O1-NEXT: st.param.b64 [func_retval0], %rd2; ; O1-NEXT: ret; ; ; O0-LABEL: t4_1( @@ -176,14 +181,15 @@ define i64 @t5(i32 %a, i32 %b, i64 %c) { ; O1-LABEL: t5( ; O1: { ; O1-NEXT: .reg .b32 %r<3>; -; O1-NEXT: .reg .b64 %rd<3>; +; O1-NEXT: .reg .b64 %rd<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b32 %r1, [t5_param_0]; ; O1-NEXT: ld.param.b32 %r2, [t5_param_1]; -; O1-NEXT: ld.param.b64 %rd1, [t5_param_2]; -; O1-NEXT: mad.wide.u32 %rd2, %r1, %r2, %rd1; -; O1-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NEXT: mul.wide.u32 %rd1, %r1, %r2; +; O1-NEXT: ld.param.b64 %rd2, [t5_param_2]; +; O1-NEXT: add.s64 %rd3, %rd2, %rd1; +; O1-NEXT: st.param.b64 [func_retval0], %rd3; ; O1-NEXT: ret; ; ; O0-LABEL: t5( @@ -211,14 +217,15 @@ define i64 @t6(i32 %a, i32 %b, i64 %c) { ; O1-LABEL: t6( ; O1: { ; O1-NEXT: .reg .b32 %r<3>; -; O1-NEXT: .reg .b64 %rd<3>; +; O1-NEXT: .reg .b64 %rd<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b32 %r1, [t6_param_0]; ; O1-NEXT: ld.param.b32 %r2, [t6_param_1]; -; O1-NEXT: ld.param.b64 %rd1, [t6_param_2]; -; O1-NEXT: mad.wide.u32 %rd2, %r1, %r2, %rd1; -; O1-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NEXT: mul.wide.u32 %rd1, %r1, %r2; +; O1-NEXT: ld.param.b64 %rd2, [t6_param_2]; +; O1-NEXT: add.s64 %rd3, %rd1, %rd2; +; O1-NEXT: st.param.b64 [func_retval0], %rd3; ; O1-NEXT: ret; ; ; O0-LABEL: t6( @@ -932,14 +939,15 @@ define i32 @t32(i16 %a, i16 %b, i32 %c) { ; O1-LABEL: t32( ; O1: { ; O1-NEXT: .reg .b16 %rs<3>; -; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b32 %r<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b16 %rs1, [t32_param_0]; ; O1-NEXT: ld.param.b16 %rs2, [t32_param_1]; -; O1-NEXT: ld.param.b32 %r1, [t32_param_2]; -; O1-NEXT: mad.wide.s16 %r2, %rs1, %rs2, %r1; -; O1-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NEXT: mul.wide.s16 %r1, %rs1, %rs2; +; O1-NEXT: ld.param.b32 %r2, [t32_param_2]; +; O1-NEXT: add.s32 %r3, %r2, %r1; +; O1-NEXT: st.param.b32 [func_retval0], %r3; ; O1-NEXT: ret; ; ; O0-LABEL: t32( @@ -967,14 +975,15 @@ define i32 @t33(i16 %a, i16 %b, i32 %c) { ; O1-LABEL: t33( ; O1: { ; O1-NEXT: .reg .b16 %rs<3>; -; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b32 %r<4>; ; 
O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b16 %rs1, [t33_param_0]; ; O1-NEXT: ld.param.b16 %rs2, [t33_param_1]; -; O1-NEXT: ld.param.b32 %r1, [t33_param_2]; -; O1-NEXT: mad.wide.s16 %r2, %rs1, %rs2, %r1; -; O1-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NEXT: mul.wide.s16 %r1, %rs1, %rs2; +; O1-NEXT: ld.param.b32 %r2, [t33_param_2]; +; O1-NEXT: add.s32 %r3, %r2, %r1; +; O1-NEXT: st.param.b32 [func_retval0], %r3; ; O1-NEXT: ret; ; ; O0-LABEL: t33( @@ -1002,13 +1011,14 @@ define i32 @t34(i16 %a, i16 %b) { ; O1-LABEL: t34( ; O1: { ; O1-NEXT: .reg .b16 %rs<3>; -; O1-NEXT: .reg .b32 %r<2>; +; O1-NEXT: .reg .b32 %r<3>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b16 %rs1, [t34_param_0]; ; O1-NEXT: ld.param.b16 %rs2, [t34_param_1]; -; O1-NEXT: mad.wide.s16 %r1, %rs1, %rs2, 1; -; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: mul.wide.s16 %r1, %rs1, %rs2; +; O1-NEXT: add.s32 %r2, %r1, 1; +; O1-NEXT: st.param.b32 [func_retval0], %r2; ; O1-NEXT: ret; ; ; O0-LABEL: t34( @@ -1035,13 +1045,14 @@ define i32 @t35(i16 %a, i32 %c) { ; O1-LABEL: t35( ; O1: { ; O1-NEXT: .reg .b16 %rs<2>; -; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b32 %r<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b16 %rs1, [t35_param_0]; ; O1-NEXT: ld.param.b32 %r1, [t35_param_1]; -; O1-NEXT: mad.wide.s16 %r2, %rs1, 3, %r1; -; O1-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NEXT: mul.wide.s16 %r2, %rs1, 3; +; O1-NEXT: add.s32 %r3, %r1, %r2; +; O1-NEXT: st.param.b32 [func_retval0], %r3; ; O1-NEXT: ret; ; ; O0-LABEL: t35( @@ -1068,12 +1079,13 @@ define i32 @t36(i16 %a, i32 %c) { ; O1-LABEL: t36( ; O1: { ; O1-NEXT: .reg .b16 %rs<2>; -; O1-NEXT: .reg .b32 %r<2>; +; O1-NEXT: .reg .b32 %r<3>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b16 %rs1, [t36_param_0]; -; O1-NEXT: mad.wide.s16 %r1, %rs1, 3, 5; -; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: mul.wide.s16 %r1, %rs1, 3; +; O1-NEXT: add.s32 %r2, %r1, 5; +; O1-NEXT: st.param.b32 [func_retval0], %r2; ; O1-NEXT: ret; ; ; O0-LABEL: t36( @@ -1099,14 +1111,15 @@ define i32 @t37(i16 %a, i16 %b, i32 %c) { ; O1-LABEL: t37( ; O1: { ; O1-NEXT: .reg .b16 %rs<3>; -; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b32 %r<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b16 %rs1, [t37_param_0]; ; O1-NEXT: ld.param.b16 %rs2, [t37_param_1]; -; O1-NEXT: ld.param.b32 %r1, [t37_param_2]; -; O1-NEXT: mad.wide.u16 %r2, %rs1, %rs2, %r1; -; O1-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NEXT: mul.wide.u16 %r1, %rs1, %rs2; +; O1-NEXT: ld.param.b32 %r2, [t37_param_2]; +; O1-NEXT: add.s32 %r3, %r2, %r1; +; O1-NEXT: st.param.b32 [func_retval0], %r3; ; O1-NEXT: ret; ; ; O0-LABEL: t37( @@ -1134,14 +1147,15 @@ define i32 @t38(i16 %a, i16 %b, i32 %c) { ; O1-LABEL: t38( ; O1: { ; O1-NEXT: .reg .b16 %rs<3>; -; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b32 %r<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b16 %rs1, [t38_param_0]; ; O1-NEXT: ld.param.b16 %rs2, [t38_param_1]; -; O1-NEXT: ld.param.b32 %r1, [t38_param_2]; -; O1-NEXT: mad.wide.u16 %r2, %rs1, %rs2, %r1; -; O1-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NEXT: mul.wide.u16 %r1, %rs1, %rs2; +; O1-NEXT: ld.param.b32 %r2, [t38_param_2]; +; O1-NEXT: add.s32 %r3, %r1, %r2; +; O1-NEXT: st.param.b32 [func_retval0], %r3; ; O1-NEXT: ret; ; ; O0-LABEL: t38( diff --git a/llvm/test/CodeGen/NVPTX/common-linkage.ll b/llvm/test/CodeGen/NVPTX/common-linkage.ll index 2ea5f7f..c5bf25b 100644 --- a/llvm/test/CodeGen/NVPTX/common-linkage.ll +++ b/llvm/test/CodeGen/NVPTX/common-linkage.ll @@ -1,7 +1,7 @@ ; 
RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -mattr=+ptx43 | FileCheck %s --check-prefixes CHECK,PTX43 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -mattr=+ptx50 | FileCheck %s --check-prefixes CHECK,PTX50 -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx43 | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx50 | %ptxas-verify %} +; RUN: %if ptxas-isa-4.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx43 | %ptxas-verify %} +; RUN: %if ptxas-isa-5.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx50 | %ptxas-verify %} ; PTX43: .weak .global .align 4 .u32 g ; PTX50: .common .global .align 4 .u32 g diff --git a/llvm/test/CodeGen/NVPTX/compare-int.ll b/llvm/test/CodeGen/NVPTX/compare-int.ll index 9338172d..9c93d18 100644 --- a/llvm/test/CodeGen/NVPTX/compare-int.ll +++ b/llvm/test/CodeGen/NVPTX/compare-int.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ;; These tests should run for all targets diff --git a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll index dd3e4ec..3304f18 100644 --- a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll +++ b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 %{ llc < %s -march=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %} %struct.64 = type <{ i64 }> declare i64 @callee(ptr %p); @@ -9,7 +9,7 @@ declare i64 @callee_variadic(ptr %p, ...); define %struct.64 @test_return_type_mismatch(ptr %p) { ; CHECK-LABEL: test_return_type_mismatch( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<40>; +; CHECK-NEXT: .reg .b64 %rd<32>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_return_type_mismatch_param_0]; @@ -29,35 +29,35 @@ define %struct.64 @test_return_type_mismatch(ptr %p) { ; CHECK-NEXT: ld.param.b8 %rd9, [retval0+1]; ; CHECK-NEXT: ld.param.b8 %rd10, [retval0]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: shl.b64 %rd13, %rd9, 8; -; CHECK-NEXT: or.b64 %rd14, %rd13, %rd10; -; CHECK-NEXT: shl.b64 %rd16, %rd8, 16; -; CHECK-NEXT: shl.b64 %rd18, %rd7, 24; -; CHECK-NEXT: or.b64 %rd19, %rd18, %rd16; -; CHECK-NEXT: or.b64 %rd20, %rd19, %rd14; -; CHECK-NEXT: shl.b64 %rd23, %rd5, 8; -; CHECK-NEXT: or.b64 %rd24, %rd23, %rd6; -; CHECK-NEXT: shl.b64 %rd26, %rd4, 16; -; CHECK-NEXT: shl.b64 %rd28, %rd3, 24; -; CHECK-NEXT: or.b64 %rd29, %rd28, %rd26; -; CHECK-NEXT: or.b64 %rd30, %rd29, %rd24; -; CHECK-NEXT: shl.b64 %rd31, %rd30, 32; -; CHECK-NEXT: or.b64 %rd32, %rd31, %rd20; +; CHECK-NEXT: shl.b64 %rd11, %rd9, 8; +; CHECK-NEXT: or.b64 %rd12, %rd11, %rd10; +; CHECK-NEXT: shl.b64 %rd13, %rd8, 16; +; CHECK-NEXT: shl.b64 %rd14, %rd7, 24; +; CHECK-NEXT: or.b64 %rd15, %rd14, %rd13; +; CHECK-NEXT: or.b64 %rd16, %rd15, %rd12; +; CHECK-NEXT: shl.b64 %rd17, %rd5, 8; +; CHECK-NEXT: or.b64 %rd18, %rd17, %rd6; +; CHECK-NEXT: shl.b64 
%rd19, %rd4, 16; +; CHECK-NEXT: shl.b64 %rd20, %rd3, 24; +; CHECK-NEXT: or.b64 %rd21, %rd20, %rd19; +; CHECK-NEXT: or.b64 %rd22, %rd21, %rd18; +; CHECK-NEXT: shl.b64 %rd23, %rd22, 32; +; CHECK-NEXT: or.b64 %rd24, %rd23, %rd16; ; CHECK-NEXT: st.param.b8 [func_retval0], %rd10; -; CHECK-NEXT: shr.u64 %rd33, %rd32, 56; -; CHECK-NEXT: st.param.b8 [func_retval0+7], %rd33; -; CHECK-NEXT: shr.u64 %rd34, %rd32, 48; -; CHECK-NEXT: st.param.b8 [func_retval0+6], %rd34; -; CHECK-NEXT: shr.u64 %rd35, %rd32, 40; -; CHECK-NEXT: st.param.b8 [func_retval0+5], %rd35; -; CHECK-NEXT: shr.u64 %rd36, %rd32, 32; -; CHECK-NEXT: st.param.b8 [func_retval0+4], %rd36; -; CHECK-NEXT: shr.u64 %rd37, %rd32, 24; -; CHECK-NEXT: st.param.b8 [func_retval0+3], %rd37; -; CHECK-NEXT: shr.u64 %rd38, %rd32, 16; -; CHECK-NEXT: st.param.b8 [func_retval0+2], %rd38; -; CHECK-NEXT: shr.u64 %rd39, %rd32, 8; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rd39; +; CHECK-NEXT: shr.u64 %rd25, %rd24, 56; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %rd25; +; CHECK-NEXT: shr.u64 %rd26, %rd24, 48; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %rd26; +; CHECK-NEXT: shr.u64 %rd27, %rd24, 40; +; CHECK-NEXT: st.param.b8 [func_retval0+5], %rd27; +; CHECK-NEXT: shr.u64 %rd28, %rd24, 32; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rd28; +; CHECK-NEXT: shr.u64 %rd29, %rd24, 24; +; CHECK-NEXT: st.param.b8 [func_retval0+3], %rd29; +; CHECK-NEXT: shr.u64 %rd30, %rd24, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+2], %rd30; +; CHECK-NEXT: shr.u64 %rd31, %rd24, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rd31; ; CHECK-NEXT: ret; %ret = call %struct.64 @callee(ptr %p) ret %struct.64 %ret @@ -66,7 +66,7 @@ define %struct.64 @test_return_type_mismatch(ptr %p) { define i64 @test_param_type_mismatch(ptr %p) { ; CHECK-LABEL: test_param_type_mismatch( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 1, 0 @@ -87,7 +87,7 @@ define i64 @test_param_type_mismatch(ptr %p) { define i64 @test_param_count_mismatch(ptr %p) { ; CHECK-LABEL: test_param_count_mismatch( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_param_count_mismatch_param_0]; @@ -111,7 +111,7 @@ define i64 @test_param_count_mismatch(ptr %p) { define %struct.64 @test_return_type_mismatch_variadic(ptr %p) { ; CHECK-LABEL: test_return_type_mismatch_variadic( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<40>; +; CHECK-NEXT: .reg .b64 %rd<32>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_return_type_mismatch_variadic_param_0]; @@ -131,35 +131,35 @@ define %struct.64 @test_return_type_mismatch_variadic(ptr %p) { ; CHECK-NEXT: ld.param.b8 %rd9, [retval0+1]; ; CHECK-NEXT: ld.param.b8 %rd10, [retval0]; ; CHECK-NEXT: } // callseq 3 -; CHECK-NEXT: shl.b64 %rd13, %rd9, 8; -; CHECK-NEXT: or.b64 %rd14, %rd13, %rd10; -; CHECK-NEXT: shl.b64 %rd16, %rd8, 16; -; CHECK-NEXT: shl.b64 %rd18, %rd7, 24; -; CHECK-NEXT: or.b64 %rd19, %rd18, %rd16; -; CHECK-NEXT: or.b64 %rd20, %rd19, %rd14; -; CHECK-NEXT: shl.b64 %rd23, %rd5, 8; -; CHECK-NEXT: or.b64 %rd24, %rd23, %rd6; -; CHECK-NEXT: shl.b64 %rd26, %rd4, 16; -; CHECK-NEXT: shl.b64 %rd28, %rd3, 24; -; CHECK-NEXT: or.b64 %rd29, %rd28, %rd26; -; CHECK-NEXT: or.b64 %rd30, %rd29, %rd24; -; CHECK-NEXT: shl.b64 %rd31, %rd30, 32; -; CHECK-NEXT: or.b64 %rd32, %rd31, %rd20; +; CHECK-NEXT: shl.b64 %rd11, %rd9, 8; +; CHECK-NEXT: or.b64 %rd12, %rd11, %rd10; +; 
CHECK-NEXT: shl.b64 %rd13, %rd8, 16; +; CHECK-NEXT: shl.b64 %rd14, %rd7, 24; +; CHECK-NEXT: or.b64 %rd15, %rd14, %rd13; +; CHECK-NEXT: or.b64 %rd16, %rd15, %rd12; +; CHECK-NEXT: shl.b64 %rd17, %rd5, 8; +; CHECK-NEXT: or.b64 %rd18, %rd17, %rd6; +; CHECK-NEXT: shl.b64 %rd19, %rd4, 16; +; CHECK-NEXT: shl.b64 %rd20, %rd3, 24; +; CHECK-NEXT: or.b64 %rd21, %rd20, %rd19; +; CHECK-NEXT: or.b64 %rd22, %rd21, %rd18; +; CHECK-NEXT: shl.b64 %rd23, %rd22, 32; +; CHECK-NEXT: or.b64 %rd24, %rd23, %rd16; ; CHECK-NEXT: st.param.b8 [func_retval0], %rd10; -; CHECK-NEXT: shr.u64 %rd33, %rd32, 56; -; CHECK-NEXT: st.param.b8 [func_retval0+7], %rd33; -; CHECK-NEXT: shr.u64 %rd34, %rd32, 48; -; CHECK-NEXT: st.param.b8 [func_retval0+6], %rd34; -; CHECK-NEXT: shr.u64 %rd35, %rd32, 40; -; CHECK-NEXT: st.param.b8 [func_retval0+5], %rd35; -; CHECK-NEXT: shr.u64 %rd36, %rd32, 32; -; CHECK-NEXT: st.param.b8 [func_retval0+4], %rd36; -; CHECK-NEXT: shr.u64 %rd37, %rd32, 24; -; CHECK-NEXT: st.param.b8 [func_retval0+3], %rd37; -; CHECK-NEXT: shr.u64 %rd38, %rd32, 16; -; CHECK-NEXT: st.param.b8 [func_retval0+2], %rd38; -; CHECK-NEXT: shr.u64 %rd39, %rd32, 8; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rd39; +; CHECK-NEXT: shr.u64 %rd25, %rd24, 56; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %rd25; +; CHECK-NEXT: shr.u64 %rd26, %rd24, 48; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %rd26; +; CHECK-NEXT: shr.u64 %rd27, %rd24, 40; +; CHECK-NEXT: st.param.b8 [func_retval0+5], %rd27; +; CHECK-NEXT: shr.u64 %rd28, %rd24, 32; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rd28; +; CHECK-NEXT: shr.u64 %rd29, %rd24, 24; +; CHECK-NEXT: st.param.b8 [func_retval0+3], %rd29; +; CHECK-NEXT: shr.u64 %rd30, %rd24, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+2], %rd30; +; CHECK-NEXT: shr.u64 %rd31, %rd24, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rd31; ; CHECK-NEXT: ret; %ret = call %struct.64 (ptr, ...) 
@callee_variadic(ptr %p) ret %struct.64 %ret @@ -168,7 +168,7 @@ define %struct.64 @test_return_type_mismatch_variadic(ptr %p) { define i64 @test_param_type_mismatch_variadic(ptr %p) { ; CHECK-LABEL: test_param_type_mismatch_variadic( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_param_type_mismatch_variadic_param_0]; @@ -190,7 +190,7 @@ define i64 @test_param_type_mismatch_variadic(ptr %p) { define i64 @test_param_count_mismatch_variadic(ptr %p) { ; CHECK-LABEL: test_param_count_mismatch_variadic( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_param_count_mismatch_variadic_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/convert-fp.ll b/llvm/test/CodeGen/NVPTX/convert-fp.ll index debaade..59b33b1b 100644 --- a/llvm/test/CodeGen/NVPTX/convert-fp.ll +++ b/llvm/test/CodeGen/NVPTX/convert-fp.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} define i16 @cvt_u16_f32(float %x) { diff --git a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll index a2fc8da..9e850e7 100644 --- a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll +++ b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ;; Integer conversions happen implicitly by loading/storing the proper types diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100.ll b/llvm/test/CodeGen/NVPTX/convert-sm100.ll index 88d0f32..a89b35c 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm100.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm100.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} declare i32 @llvm.nvvm.f2tf32.rn.satfinite(float %f1) declare i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float %f1) diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll index c8b7014..16bd0da 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll @@ -2,9 +2,9 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | FileCheck %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a
-mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | %ptxas-verify -arch=sm_120a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | %ptxas-verify -arch=sm_120a %} define i16 @cvt_rn_sf_e2m3x2_f32(float %f1, float %f2) { ; CHECK-LABEL: cvt_rn_sf_e2m3x2_f32( diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80.ll b/llvm/test/CodeGen/NVPTX/convert-sm80.ll index 9ddeb2b..edf1739 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm80.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm80.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} define <2 x bfloat> @cvt_rn_bf16x2_f32(float %f1, float %f2) { diff --git a/llvm/test/CodeGen/NVPTX/convert-sm89.ll b/llvm/test/CodeGen/NVPTX/convert-sm89.ll index 30fd76f..616dcfa 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm89.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm89.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_89 -mattr=+ptx81 | FileCheck %s -; RUN: %if ptxas-12.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_89 -mattr=+ptx81 | %ptxas-verify -arch=sm_89 %} +; RUN: %if ptxas-sm_89 && ptxas-isa-8.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_89 -mattr=+ptx81 | %ptxas-verify -arch=sm_89 %} ; CHECK-LABEL: cvt_rn_e4m3x2_f32 define i16 @cvt_rn_e4m3x2_f32(float %f1, float %f2) { diff --git a/llvm/test/CodeGen/NVPTX/convert-sm90.ll b/llvm/test/CodeGen/NVPTX/convert-sm90.ll index c74ceac..af88ede 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm90.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78| FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %} declare i32 @llvm.nvvm.f2tf32.rn(float %f1) declare i32 @llvm.nvvm.f2tf32.rn.relu(float %f1) diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll index 1e6b046..a22f2165 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| 
%ptxas-verify -arch=sm_100 %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll index 5cfa25d..b5c43fd2 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll index a7e6bec..57342dc 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll index 843446a..a52fab6 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s 
-mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll index 9b485803..1f4c62a 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll index 4325405..3863c19d 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll index ef4a8fb..6296d5a 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck 
--check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll index 112dab1..e5ae387 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll index 54e861e..7d04ada 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll index e0aceaf..b0fe77c 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 
-mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll index 6bf8f03..ccc3e94 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll index cf166f8..f5478db 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll index 4045b8b..2dac6c4 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll index 
2ef44ff..037ecea 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll index 3b5bd16..8684ac3 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll index 46a0263..e800523 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK,CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll b/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll new file mode 100644 index 0000000..2e68208 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll @@ -0,0 +1,59 @@ 
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + + +;; Confirm the mov.b64 of global_smem is CSE'd. We need to make things a bit +;; complex with a loop to make this interesting. +define i32 @test_mov_sym(i32 %offset1, i32 %offset2, i1 %cond) { +; CHECK-LABEL: test_mov_sym( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<4>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b8 %rs1, [test_mov_sym_param_2]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [test_mov_sym_param_0]; +; CHECK-NEXT: cvt.s64.s32 %rd1, %r1; +; CHECK-NEXT: mov.b64 %rd2, global_smem; +; CHECK-NEXT: add.s64 %rd3, %rd2, %rd1; +; CHECK-NEXT: ld.shared.b32 %r4, [%rd3]; +; CHECK-NEXT: not.pred %p2, %p1; +; CHECK-NEXT: @%p2 bra $L__BB0_4; +; CHECK-NEXT: // %bb.1: // %if1.preheader +; CHECK-NEXT: ld.param.b32 %r2, [test_mov_sym_param_1]; +; CHECK-NEXT: setp.ne.b32 %p3, %r1, %r2; +; CHECK-NEXT: $L__BB0_2: // %if1 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: @%p3 bra $L__BB0_2; +; CHECK-NEXT: // %bb.3: // %if2 +; CHECK-NEXT: cvt.s64.s32 %rd4, %r2; +; CHECK-NEXT: add.s64 %rd5, %rd2, %rd4; +; CHECK-NEXT: ld.shared.b32 %r3, [%rd5]; +; CHECK-NEXT: add.s32 %r4, %r4, %r3; +; CHECK-NEXT: $L__BB0_4: // %end +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; +entry: + %gep = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i32 %offset1 + %val = load i32, ptr addrspace(3) %gep + br i1 %cond, label %if1, label %end +if1: + %cond2 = icmp eq i32 %offset1, %offset2 + br i1 %cond2, label %if2, label %if1 +if2: + %gep2 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i32 %offset2 + %val2 = load i32, ptr addrspace(3) %gep2 + %add = add i32 %val, %val2 + br label %end +end: + %ret = phi i32 [ %add, %if2 ], [ %val, %entry ] + ret i32 %ret +} diff --git a/llvm/test/CodeGen/NVPTX/discard.ll b/llvm/test/CodeGen/NVPTX/discard.ll index ce72f5f..dca0a0d 100644 --- a/llvm/test/CodeGen/NVPTX/discard.ll +++ b/llvm/test/CodeGen/NVPTX/discard.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| FileCheck --check-prefixes=CHECK-PTX64 %s
-; RUN: %if ptxas-11.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| %ptxas-verify -arch=sm_80 %}
+; RUN: %if ptxas-sm_80 && ptxas-isa-7.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| %ptxas-verify -arch=sm_80 %}
target triple = "nvptx64-nvidia-cuda"
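The gating change in the hunk above is the pattern this patch applies across all of these tests: each %ptxas-verify RUN line is now guarded by the capabilities it actually needs (a ptxas-sm_XX feature for the target architecture plus a ptxas-isa-X.Y feature for the PTX ISA version, or ptxas-ptr32 for 32-bit-pointer runs) instead of by a specific ptxas release such as ptxas-11.4 or ptxas-12.8. As a minimal sketch of the resulting test shape, assuming the ptxas-* lit features are defined by the NVPTX test configuration (the kernel and file below are illustrative only, not part of the patch):

; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74 | FileCheck %s
; RUN: %if ptxas-sm_80 && ptxas-isa-7.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74 | %ptxas-verify -arch=sm_80 %}

target triple = "nvptx64-nvidia-cuda"

; The FileCheck RUN line always executes. The %ptxas-verify RUN line runs
; only when the locally installed ptxas can both target sm_80 and accept
; PTX ISA 7.4; otherwise lit skips it instead of failing.
define i32 @sketch_add(i32 %a, i32 %b) {
; CHECK-LABEL: sketch_add(
; CHECK: add.s32
  %r = add i32 %a, %b
  ret i32 %r
}

Keying the guard on architecture and ISA rather than on a toolchain version means it keeps working as new CUDA releases ship: any newer ptxas that still supports sm_80 and ISA 7.4 satisfies the same predicate.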
diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll index 2841e67..01cd70d 100644 --- a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll +++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -o - -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s -; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" @@ -166,23 +166,23 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-LABEL: test_distributed_shared_cluster_cmpxchg( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<11>; -; CHECK-NEXT: .reg .b32 %r<53>; +; CHECK-NEXT: .reg .b32 %r<43>; ; CHECK-NEXT: .reg .b64 %rd<12>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.b64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0]; -; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r24, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r25, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r26, [%rd2], 1, 0; -; CHECK-NEXT: atom.release.sys.shared::cluster.cas.b32 %r27, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r28, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r29, [%rd2], 1, 0; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r14, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r15, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r16, [%rd2], 1, 0; +; CHECK-NEXT: atom.release.sys.shared::cluster.cas.b32 %r17, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r18, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r19, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r30, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r20, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r31, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r21, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r32, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r22, [%rd2], 1, 0; ; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0; ; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0; ; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0; @@ -196,92 +196,92 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: fence.sc.sys; ; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0; ; CHECK-NEXT: and.b64 %rd1, %rd2, -4; -; CHECK-NEXT: cvt.u32.u64 %r33, %rd2; -; CHECK-NEXT: and.b32 %r34, %r33, 3; -; CHECK-NEXT: shl.b32 %r1, %r34, 3; -; CHECK-NEXT: mov.b32 %r35, 65535; -; CHECK-NEXT: shl.b32 %r36, %r35, %r1; -; CHECK-NEXT: not.b32 %r2, %r36; -; CHECK-NEXT: mov.b32 %r37, 1; -; CHECK-NEXT: shl.b32 %r3, %r37, %r1; -; CHECK-NEXT: ld.shared::cluster.b32 %r38, [%rd1]; -; CHECK-NEXT: and.b32 %r48, %r38, %r2; +; CHECK-NEXT: cvt.u32.u64 %r23, %rd2; +; CHECK-NEXT: and.b32 %r24, %r23, 3; +; CHECK-NEXT: shl.b32 %r1, %r24, 3; +; 
CHECK-NEXT: mov.b32 %r25, 65535; +; CHECK-NEXT: shl.b32 %r26, %r25, %r1; +; CHECK-NEXT: not.b32 %r2, %r26; +; CHECK-NEXT: mov.b32 %r27, 1; +; CHECK-NEXT: shl.b32 %r3, %r27, %r1; +; CHECK-NEXT: ld.shared::cluster.b32 %r28, [%rd1]; +; CHECK-NEXT: and.b32 %r38, %r28, %r2; ; CHECK-NEXT: $L__BB4_1: // %partword.cmpxchg.loop33 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or.b32 %r39, %r48, %r3; -; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48; -; CHECK-NEXT: setp.eq.b32 %p1, %r6, %r39; +; CHECK-NEXT: or.b32 %r29, %r38, %r3; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r4, [%rd1], %r29, %r38; +; CHECK-NEXT: setp.eq.b32 %p1, %r4, %r29; ; CHECK-NEXT: @%p1 bra $L__BB4_3; ; CHECK-NEXT: // %bb.2: // %partword.cmpxchg.failure32 ; CHECK-NEXT: // in Loop: Header=BB4_1 Depth=1 -; CHECK-NEXT: and.b32 %r7, %r6, %r2; -; CHECK-NEXT: setp.ne.b32 %p2, %r48, %r7; -; CHECK-NEXT: mov.b32 %r48, %r7; +; CHECK-NEXT: and.b32 %r5, %r4, %r2; +; CHECK-NEXT: setp.ne.b32 %p2, %r38, %r5; +; CHECK-NEXT: mov.b32 %r38, %r5; ; CHECK-NEXT: @%p2 bra $L__BB4_1; ; CHECK-NEXT: $L__BB4_3: // %partword.cmpxchg.end31 -; CHECK-NEXT: ld.shared::cluster.b32 %r40, [%rd1]; -; CHECK-NEXT: and.b32 %r49, %r40, %r2; +; CHECK-NEXT: ld.shared::cluster.b32 %r30, [%rd1]; +; CHECK-NEXT: and.b32 %r39, %r30, %r2; ; CHECK-NEXT: $L__BB4_4: // %partword.cmpxchg.loop23 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or.b32 %r41, %r49, %r3; -; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49; -; CHECK-NEXT: setp.eq.b32 %p3, %r10, %r41; +; CHECK-NEXT: or.b32 %r31, %r39, %r3; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r6, [%rd1], %r31, %r39; +; CHECK-NEXT: setp.eq.b32 %p3, %r6, %r31; ; CHECK-NEXT: @%p3 bra $L__BB4_6; ; CHECK-NEXT: // %bb.5: // %partword.cmpxchg.failure22 ; CHECK-NEXT: // in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: and.b32 %r11, %r10, %r2; -; CHECK-NEXT: setp.ne.b32 %p4, %r49, %r11; -; CHECK-NEXT: mov.b32 %r49, %r11; +; CHECK-NEXT: and.b32 %r7, %r6, %r2; +; CHECK-NEXT: setp.ne.b32 %p4, %r39, %r7; +; CHECK-NEXT: mov.b32 %r39, %r7; ; CHECK-NEXT: @%p4 bra $L__BB4_4; ; CHECK-NEXT: $L__BB4_6: // %partword.cmpxchg.end21 ; CHECK-NEXT: fence.acq_rel.sys; ; CHECK-NEXT: fence.acq_rel.sys; -; CHECK-NEXT: ld.shared::cluster.b32 %r42, [%rd1]; -; CHECK-NEXT: and.b32 %r50, %r42, %r2; +; CHECK-NEXT: ld.shared::cluster.b32 %r32, [%rd1]; +; CHECK-NEXT: and.b32 %r40, %r32, %r2; ; CHECK-NEXT: $L__BB4_7: // %partword.cmpxchg.loop13 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or.b32 %r43, %r50, %r3; -; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50; -; CHECK-NEXT: setp.eq.b32 %p5, %r14, %r43; +; CHECK-NEXT: or.b32 %r33, %r40, %r3; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r8, [%rd1], %r33, %r40; +; CHECK-NEXT: setp.eq.b32 %p5, %r8, %r33; ; CHECK-NEXT: @%p5 bra $L__BB4_9; ; CHECK-NEXT: // %bb.8: // %partword.cmpxchg.failure12 ; CHECK-NEXT: // in Loop: Header=BB4_7 Depth=1 -; CHECK-NEXT: and.b32 %r15, %r14, %r2; -; CHECK-NEXT: setp.ne.b32 %p6, %r50, %r15; -; CHECK-NEXT: mov.b32 %r50, %r15; +; CHECK-NEXT: and.b32 %r9, %r8, %r2; +; CHECK-NEXT: setp.ne.b32 %p6, %r40, %r9; +; CHECK-NEXT: mov.b32 %r40, %r9; ; CHECK-NEXT: @%p6 bra $L__BB4_7; ; CHECK-NEXT: $L__BB4_9: // %partword.cmpxchg.end11 ; CHECK-NEXT: fence.acq_rel.sys; -; CHECK-NEXT: ld.shared::cluster.b32 %r44, [%rd1]; -; CHECK-NEXT: and.b32 %r51, %r44, %r2; +; CHECK-NEXT: ld.shared::cluster.b32 %r34, [%rd1]; +; CHECK-NEXT: 
and.b32 %r41, %r34, %r2; ; CHECK-NEXT: $L__BB4_10: // %partword.cmpxchg.loop3 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or.b32 %r45, %r51, %r3; -; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51; -; CHECK-NEXT: setp.eq.b32 %p7, %r18, %r45; +; CHECK-NEXT: or.b32 %r35, %r41, %r3; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r10, [%rd1], %r35, %r41; +; CHECK-NEXT: setp.eq.b32 %p7, %r10, %r35; ; CHECK-NEXT: @%p7 bra $L__BB4_12; ; CHECK-NEXT: // %bb.11: // %partword.cmpxchg.failure2 ; CHECK-NEXT: // in Loop: Header=BB4_10 Depth=1 -; CHECK-NEXT: and.b32 %r19, %r18, %r2; -; CHECK-NEXT: setp.ne.b32 %p8, %r51, %r19; -; CHECK-NEXT: mov.b32 %r51, %r19; +; CHECK-NEXT: and.b32 %r11, %r10, %r2; +; CHECK-NEXT: setp.ne.b32 %p8, %r41, %r11; +; CHECK-NEXT: mov.b32 %r41, %r11; ; CHECK-NEXT: @%p8 bra $L__BB4_10; ; CHECK-NEXT: $L__BB4_12: // %partword.cmpxchg.end1 ; CHECK-NEXT: fence.acq_rel.sys; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: ld.shared::cluster.b32 %r46, [%rd1]; -; CHECK-NEXT: and.b32 %r52, %r46, %r2; +; CHECK-NEXT: ld.shared::cluster.b32 %r36, [%rd1]; +; CHECK-NEXT: and.b32 %r42, %r36, %r2; ; CHECK-NEXT: $L__BB4_13: // %partword.cmpxchg.loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or.b32 %r47, %r52, %r3; -; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52; -; CHECK-NEXT: setp.eq.b32 %p9, %r22, %r47; +; CHECK-NEXT: or.b32 %r37, %r42, %r3; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r12, [%rd1], %r37, %r42; +; CHECK-NEXT: setp.eq.b32 %p9, %r12, %r37; ; CHECK-NEXT: @%p9 bra $L__BB4_15; ; CHECK-NEXT: // %bb.14: // %partword.cmpxchg.failure ; CHECK-NEXT: // in Loop: Header=BB4_13 Depth=1 -; CHECK-NEXT: and.b32 %r23, %r22, %r2; -; CHECK-NEXT: setp.ne.b32 %p10, %r52, %r23; -; CHECK-NEXT: mov.b32 %r52, %r23; +; CHECK-NEXT: and.b32 %r13, %r12, %r2; +; CHECK-NEXT: setp.ne.b32 %p10, %r42, %r13; +; CHECK-NEXT: mov.b32 %r42, %r13; ; CHECK-NEXT: @%p10 bra $L__BB4_13; ; CHECK-NEXT: $L__BB4_15: // %partword.cmpxchg.end ; CHECK-NEXT: fence.acq_rel.sys; diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll index 06fb8d2..7714127 100644 --- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll @@ -4,15 +4,15 @@ ; RUN: llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK-32 ; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK-64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %} +; RUN: %if ptxas-isa-7.3 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %} +; RUN: %if ptxas-isa-7.3 %{ llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %} ; CHECK-FAILS: in function test_dynamic_stackalloc{{.*}}: Support for dynamic alloca introduced in PTX ISA version 7.3 and requires target sm_52. 
define i32 @test_dynamic_stackalloc(i64 %n) { ; CHECK-32-LABEL: test_dynamic_stackalloc( ; CHECK-32: { -; CHECK-32-NEXT: .reg .b32 %r<8>; +; CHECK-32-NEXT: .reg .b32 %r<7>; ; CHECK-32-EMPTY: ; CHECK-32-NEXT: // %bb.0: ; CHECK-32-NEXT: ld.param.b32 %r1, [test_dynamic_stackalloc_param_0]; @@ -32,7 +32,7 @@ define i32 @test_dynamic_stackalloc(i64 %n) { ; ; CHECK-64-LABEL: test_dynamic_stackalloc( ; CHECK-64: { -; CHECK-64-NEXT: .reg .b32 %r<3>; +; CHECK-64-NEXT: .reg .b32 %r<2>; ; CHECK-64-NEXT: .reg .b64 %rd<6>; ; CHECK-64-EMPTY: ; CHECK-64-NEXT: // %bb.0: diff --git a/llvm/test/CodeGen/NVPTX/elect.ll b/llvm/test/CodeGen/NVPTX/elect.ll index b65fa5a..a61d2da 100644 --- a/llvm/test/CodeGen/NVPTX/elect.ll +++ b/llvm/test/CodeGen/NVPTX/elect.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s -; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll index d61a63c..6d67ed0 100644 --- a/llvm/test/CodeGen/NVPTX/extractelement.ll +++ b/llvm/test/CodeGen/NVPTX/extractelement.ll @@ -31,16 +31,16 @@ define i1 @test_v2i8_load(ptr %a) { ; CHECK-LABEL: test_v2i8_load( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; -; CHECK-NEXT: .reg .b16 %rs<7>; +; CHECK-NEXT: .reg .b16 %rs<5>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_v2i8_load_param_0]; ; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd1]; -; CHECK-NEXT: or.b16 %rs5, %rs1, %rs2; -; CHECK-NEXT: and.b16 %rs6, %rs5, 255; -; CHECK-NEXT: setp.eq.b16 %p1, %rs6, 0; +; CHECK-NEXT: or.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: and.b16 %rs4, %rs3, 255; +; CHECK-NEXT: setp.eq.b16 %p1, %rs4, 0; ; CHECK-NEXT: selp.b32 %r1, -1, 0, %p1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/f16-abs.ll b/llvm/test/CodeGen/NVPTX/f16-abs.ll index 4025b38..f5354a3 100644 --- a/llvm/test/CodeGen/NVPTX/f16-abs.ll +++ b/llvm/test/CodeGen/NVPTX/f16-abs.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx60 \ ; RUN: -O0 -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefix CHECK-NOF16 %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_53 %{ \ ; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx60 \ ; RUN: -O0 -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_53 \ @@ -14,7 +14,7 @@ ; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 --nvptx-no-f16-math \ ; RUN: -O0 -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefix CHECK-NOF16 %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_53 %{ \ ; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 --nvptx-no-f16-math \ ; RUN: -O0 -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_53 \ @@ -24,7 +24,7 @@ ; RUN: llc < %s -mcpu=sm_52 -mattr=+ptx65 \ ; RUN: -O0 -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefix CHECK-NOF16 %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_52 %{ \ ; RUN: llc < %s -mcpu=sm_52 -mattr=+ptx65 \ ; RUN: -O0 -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_52 \ @@ -34,7 +34,7 @@ ; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 \ ; RUN: -O0 
diff --git a/llvm/test/CodeGen/NVPTX/elect.ll b/llvm/test/CodeGen/NVPTX/elect.ll
index b65fa5a..a61d2da 100644
--- a/llvm/test/CodeGen/NVPTX/elect.ll
+++ b/llvm/test/CodeGen/NVPTX/elect.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s
-; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll
index d61a63c..6d67ed0 100644
--- a/llvm/test/CodeGen/NVPTX/extractelement.ll
+++ b/llvm/test/CodeGen/NVPTX/extractelement.ll
@@ -31,16 +31,16 @@ define i1 @test_v2i8_load(ptr %a) {
; CHECK-LABEL: test_v2i8_load(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
-; CHECK-NEXT: .reg .b16 %rs<7>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_v2i8_load_param_0];
; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd1];
-; CHECK-NEXT: or.b16 %rs5, %rs1, %rs2;
-; CHECK-NEXT: and.b16 %rs6, %rs5, 255;
-; CHECK-NEXT: setp.eq.b16 %p1, %rs6, 0;
+; CHECK-NEXT: or.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: and.b16 %rs4, %rs3, 255;
+; CHECK-NEXT: setp.eq.b16 %p1, %rs4, 0;
; CHECK-NEXT: selp.b32 %r1, -1, 0, %p1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/f16-abs.ll b/llvm/test/CodeGen/NVPTX/f16-abs.ll
index 4025b38..f5354a3 100644
--- a/llvm/test/CodeGen/NVPTX/f16-abs.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-abs.ll
@@ -4,7 +4,7 @@
; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx60 \
; RUN: -O0 -disable-post-ra -verify-machineinstrs \
; RUN: | FileCheck -check-prefix CHECK-NOF16 %s
-; RUN: %if ptxas %{ \
+; RUN: %if ptxas-sm_53 %{ \
; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx60 \
; RUN: -O0 -disable-post-ra -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_53 \
@@ -14,7 +14,7 @@
; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 --nvptx-no-f16-math \
; RUN: -O0 -disable-post-ra -verify-machineinstrs \
; RUN: | FileCheck -check-prefix CHECK-NOF16 %s
-; RUN: %if ptxas %{ \
+; RUN: %if ptxas-sm_53 %{ \
; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 --nvptx-no-f16-math \
; RUN: -O0 -disable-post-ra -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_53 \
@@ -24,7 +24,7 @@
; RUN: llc < %s -mcpu=sm_52 -mattr=+ptx65 \
; RUN: -O0 -disable-post-ra -verify-machineinstrs \
; RUN: | FileCheck -check-prefix CHECK-NOF16 %s
-; RUN: %if ptxas %{ \
+; RUN: %if ptxas-sm_52 %{ \
; RUN: llc < %s -mcpu=sm_52 -mattr=+ptx65 \
; RUN: -O0 -disable-post-ra -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_52 \
@@ -34,7 +34,7 @@
; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 \
; RUN: -O0 -disable-post-ra -verify-machineinstrs \
; RUN: | FileCheck -check-prefix CHECK-F16-ABS %s
-; RUN: %if ptxas %{ \
+; RUN: %if ptxas-sm_53 %{ \
; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 \
; RUN: -O0 -disable-post-ra -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_53 \
diff --git a/llvm/test/CodeGen/NVPTX/f16-ex2.ll b/llvm/test/CodeGen/NVPTX/f16-ex2.ll
index ae70946..ee79f9d 100644
--- a/llvm/test/CodeGen/NVPTX/f16-ex2.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-ex2.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s
-; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
+; RUN: %if ptxas-sm_75 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
target triple = "nvptx64-nvidia-cuda"
declare half @llvm.nvvm.ex2.approx.f16(half)
diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
index 2b7e418..4e2f7ea 100644
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -3,7 +3,7 @@
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: -mattr=+ptx60 \
; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-F16-NOFTZ %s
-; RUN: %if ptxas %{ \
+; RUN: %if ptxas-sm_53 && ptxas-isa-6.0 %{ \
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: -mattr=+ptx60 \
@@ -14,7 +14,7 @@
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: -denormal-fp-math-f32=preserve-sign -mattr=+ptx60 \
; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16-FTZ %s
-; RUN: %if ptxas %{ \
+; RUN: %if ptxas-sm_53 && ptxas-isa-6.0 %{ \
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: -denormal-fp-math-f32=preserve-sign -mattr=+ptx60 \
@@ -25,7 +25,7 @@
; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
; RUN: -verify-machineinstrs -mattr=+ptx60 \
; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-NOF16 %s
-; RUN: %if ptxas %{ \
+; RUN: %if ptxas-sm_53 %{ \
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
; RUN: | %ptxas-verify -arch=sm_53 \
@@ -34,7 +34,7 @@
; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-NOF16 %s
-; RUN: %if ptxas %{ \
+; RUN: %if ptxas-sm_52 %{ \
; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_52 \
@@ -886,8 +886,8 @@ define half @test_sqrt(half %a) #0 {
; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[RF]];
; CHECK: st.param.b16 [func_retval0], [[R]];
; CHECK: ret;
-define half @test_sin(half %a) #0 #1 {
-  %r = call half @llvm.sin.f16(half %a)
+define half @test_sin(half %a) #0 {
+  %r = call afn half @llvm.sin.f16(half %a)
  ret half %r
}
@@ -900,8 +900,8 @@ define half @test_sin(half %a) #0 #1 {
; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[RF]];
; CHECK: st.param.b16 [func_retval0], [[R]];
; CHECK: ret;
-define half @test_cos(half %a) #0 #1 {
-  %r = call half @llvm.cos.f16(half %a)
+define half @test_cos(half %a) #0 {
+  %r = call afn half @llvm.cos.f16(half %a)
  ret half %r
}
@@ -1183,4 +1183,3 @@ define <2 x half> @test_neg_f16x2(<2 x half> noundef %arg) #0 {
}
attributes #0 = { nounwind }
-attributes #1 = { "unsafe-fp-math" = "true" }
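The hunks above also drop the module-level attribute group `#1` (`"unsafe-fp-math"="true"`) and instead put the `afn` ("approximate functions") fast-math flag directly on the `llvm.sin.f16`/`llvm.cos.f16` call sites, so the approximate lowering is requested per call rather than per function. A minimal sketch of the resulting pattern (hypothetical function name; it mirrors `test_sin` above, which the CHECK lines lower through an f32 round trip and `sin.approx.f32`):

define half @sin_approx(half %x) {
  ; afn permits substituting an approximate expansion of the intrinsic
  %r = call afn half @llvm.sin.f16(half %x)
  ret half %r
}
declare half @llvm.sin.f16(half)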
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index d4fcea3..e9143d5 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-F16 %s
-; RUN: %if ptxas %{ \
+; RUN: %if ptxas-sm_53 %{ \
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_53 \
@@ -13,7 +13,7 @@
; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
; RUN: -verify-machineinstrs \
; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s
-; RUN: %if ptxas %{ \
+; RUN: %if ptxas-sm_53 %{ \
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
; RUN: -verify-machineinstrs \
@@ -23,7 +23,7 @@
; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 \
; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s
-; RUN: %if ptxas %{ \
+; RUN: %if ptxas-sm_52 %{ \
; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 \
; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_52 \
@@ -455,7 +455,7 @@ declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0
define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-LABEL: test_call(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r2, [test_call_param_1];
@@ -478,7 +478,7 @@ define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-LABEL: test_call_flipped(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1];
@@ -501,7 +501,7 @@ define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-LABEL: test_tailcall_flipped(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1];
@@ -596,18 +596,15 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
; CHECK-F16: {
; CHECK-F16-NEXT: .reg .pred %p<3>;
; CHECK-F16-NEXT: .reg .b32 %r<9>;
-; CHECK-F16-NEXT: .reg .b64 %rd<3>;
; CHECK-F16-EMPTY:
; CHECK-F16-NEXT: // %bb.0:
-; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
-; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
-; CHECK-F16-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1];
-; CHECK-F16-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0];
-; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2;
-; CHECK-F16-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-F16-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-F16-NEXT: selp.f32 %r7, %r6, %r4, %p2;
-; CHECK-F16-NEXT: selp.f32 %r8, %r5, %r3, %p1;
+; CHECK-F16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1];
+; CHECK-F16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0];
+; CHECK-F16-NEXT: ld.param.b32 %r6, [test_select_cc_f32_f16_param_3];
+; CHECK-F16-NEXT: ld.param.b32 %r5, [test_select_cc_f32_f16_param_2];
+; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r5, %r6;
+; CHECK-F16-NEXT: selp.f32 %r7, %r2, %r4, %p2;
+; CHECK-F16-NEXT: selp.f32 %r8, %r1, %r3, %p1;
; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
; CHECK-F16-NEXT: ret;
;
@@ -616,25 +613,22 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
; CHECK-NOF16-NEXT: .reg .pred %p<3>;
; CHECK-NOF16-NEXT: .reg .b16 %rs<5>;
; CHECK-NOF16-NEXT: .reg .b32 %r<13>;
-; CHECK-NOF16-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
-; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
-; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
-; CHECK-NOF16-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1];
-; CHECK-NOF16-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0];
-; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs1;
-; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3;
-; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs2;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs4;
-; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r6, %r5;
-; CHECK-NOF16-NEXT: mov.b64 {%r7, %r8}, %rd2;
-; CHECK-NOF16-NEXT: mov.b64 {%r9, %r10}, %rd1;
-; CHECK-NOF16-NEXT: selp.f32 %r11, %r10, %r8, %p2;
-; CHECK-NOF16-NEXT: selp.f32 %r12, %r9, %r7, %p1;
+; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1];
+; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0];
+; CHECK-NOF16-NEXT: ld.param.b32 %r6, [test_select_cc_f32_f16_param_3];
+; CHECK-NOF16-NEXT: ld.param.b32 %r5, [test_select_cc_f32_f16_param_2];
+; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r6;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1;
+; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r5;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
+; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r8, %r7;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs4;
+; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r10, %r9;
+; CHECK-NOF16-NEXT: selp.f32 %r11, %r2, %r4, %p2;
+; CHECK-NOF16-NEXT: selp.f32 %r12, %r1, %r3, %p1;
; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r11};
; CHECK-NOF16-NEXT: ret;
<2 x half> %c, <2 x half> %d) #0 {
@@ -649,17 +643,14 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<7>;
; CHECK-NEXT: .reg .b32 %r<7>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f16_f32_param_3];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f16_f32_param_2];
+; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f16_f32_param_3];
+; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f16_f32_param_2];
; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_f16_f32_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_f16_f32_param_0];
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NEXT: setp.neu.f32 %p1, %r5, %r3;
-; CHECK-NEXT: setp.neu.f32 %p2, %r6, %r4;
+; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r5;
+; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r6;
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
; CHECK-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2;
@@ -1501,11 +1492,9 @@ define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<4>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0];
; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %r2;
; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r1;
; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1};
@@ -1674,7 +1663,7 @@ define <2 x half> @test_sqrt(<2 x half> %a) #0 {
;   ret <2 x half> %r
;}
-define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
+define <2 x half> @test_sin(<2 x half> %a) #0 {
; CHECK-LABEL: test_sin(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
@@ -1692,11 +1681,11 @@ define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
; CHECK-NEXT: mov.b32 %r6, {%rs4, %rs3};
; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
; CHECK-NEXT: ret;
-  %r = call <2 x half> @llvm.sin.f16(<2 x half> %a)
+  %r = call afn <2 x half> @llvm.sin.f16(<2 x half> %a)
  ret <2 x half> %r
}
-define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
+define <2 x half> @test_cos(<2 x half> %a) #0 {
; CHECK-LABEL: test_cos(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
@@ -1714,7 +1703,7 @@ define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
; CHECK-NEXT: mov.b32 %r6, {%rs4, %rs3};
; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
; CHECK-NEXT: ret;
-  %r = call <2 x half> @llvm.cos.f16(<2 x half> %a)
+  %r = call afn <2 x half> @llvm.cos.f16(<2 x half> %a)
  ret <2 x half> %r
}
@@ -1928,12 +1917,10 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
; CHECK-F16: {
; CHECK-F16-NEXT: .reg .b16 %rs<3>;
; CHECK-F16-NEXT: .reg .b32 %r<8>;
-; CHECK-F16-NEXT: .reg .b64 %rd<2>;
; CHECK-F16-EMPTY:
; CHECK-F16-NEXT: // %bb.0:
-; CHECK-F16-NEXT: ld.param.b64 %rd1, [test_copysign_f32_param_1];
+; CHECK-F16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1];
; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0];
-; CHECK-F16-NEXT: mov.b64 {%r2, %r3}, %rd1;
; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs1, %r3;
; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs2, %r2;
; CHECK-F16-NEXT: mov.b32 %r4, {%rs2, %rs1};
@@ -1947,21 +1934,19 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
; CHECK-NOF16: {
; CHECK-NOF16-NEXT: .reg .b16 %rs<9>;
; CHECK-NOF16-NEXT: .reg .b32 %r<6>;
-; CHECK-NOF16-NEXT: .reg .b64 %rd<2>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
-; CHECK-NOF16-NEXT: ld.param.b64 %rd1, [test_copysign_f32_param_1];
+; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1];
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0];
-; CHECK-NOF16-NEXT: mov.b64 {%r2, %r3}, %rd1;
-; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767;
; CHECK-NOF16-NEXT: and.b32 %r4, %r3, -2147483648;
-; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r4; }
-; CHECK-NOF16-NEXT: or.b16 %rs5, %rs3, %rs4;
-; CHECK-NOF16-NEXT: and.b16 %rs6, %rs1, 32767;
+; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r4; }
+; CHECK-NOF16-NEXT: mov.b32 {%rs2, %rs3}, %r1;
+; CHECK-NOF16-NEXT: and.b16 %rs4, %rs3, 32767;
+; CHECK-NOF16-NEXT: or.b16 %rs5, %rs4, %rs1;
; CHECK-NOF16-NEXT: and.b32 %r5, %r2, -2147483648;
-; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r5; }
-; CHECK-NOF16-NEXT: or.b16 %rs8, %rs6, %rs7;
+; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r5; }
+; CHECK-NOF16-NEXT: and.b16 %rs7, %rs2, 32767;
+; CHECK-NOF16-NEXT: or.b16 %rs8, %rs7, %rs6;
; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs8, %rs5};
; CHECK-NOF16-NEXT: ret;
  %tb = fptrunc <2 x float> %b to <2 x half>
@@ -2330,4 +2315,3 @@ define void @test_store_2xhalf(ptr %p1, ptr %p2, <2 x half> %v) {
attributes #0 = { nounwind }
-attributes #1 = { "unsafe-fp-math" = "true" }
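A second theme in the f16x2 hunks above, and in the f32x2 tests below, is how a pair of 32-bit lanes (`<2 x float>`, or the select operands holding `<2 x half>` data) is read from the parameter space: instead of loading one packed `.b64` and unpacking it with `mov.b64 {lo, hi}`, the updated checks expect a single `ld.param.v2.b32`, which eliminates the temporary `%rd` registers and shrinks the `.reg` declarations. A hedged before/after sketch (hypothetical function name; the PTX comments paraphrase the CHECK lines in these files):

define <2 x half> @trunc_pair(<2 x float> %a) {
  ; old lowering: ld.param.b64 %rd1, [...]; mov.b64 {%r1, %r2}, %rd1;
  ; new lowering: ld.param.v2.b32 {%r1, %r2}, [trunc_pair_param_0];
  %r = fptrunc <2 x float> %a to <2 x half>
  ret <2 x half> %r
}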
diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
index fd92375..796d80d 100644
--- a/llvm/test/CodeGen/NVPTX/f32-ex2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s
-; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %}
+; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %}
target triple = "nvptx-nvidia-cuda"
declare float @llvm.nvvm.ex2.approx.f(float)
diff --git a/llvm/test/CodeGen/NVPTX/f32-lg2.ll b/llvm/test/CodeGen/NVPTX/f32-lg2.ll
index 29dede0..4f9e370 100644
--- a/llvm/test/CodeGen/NVPTX/f32-lg2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32-lg2.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mcpu=sm_20 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s
-; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 -mattr=+ptx32 | %ptxas-verify %}
+; RUN: %if ptxas-isa-3.2 %{ llc < %s -mcpu=sm_20 -mattr=+ptx32 | %ptxas-verify %}
target triple = "nvptx64-nvidia-cuda"
declare float @llvm.nvvm.lg2.approx.f(float)
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index b84a0ec..217bb48 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -2,13 +2,13 @@
; ## Full FP32x2 support enabled by default.
; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \
; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-NOF32X2 %s
-; RUN: %if ptxas-12.7 %{ \
+; RUN: %if ptxas-sm_80 %{ \
; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \
; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_80 \
; RUN: %}
; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \
; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-F32X2 %s
-; RUN: %if ptxas-12.7 %{ \
+; RUN: %if ptxas-sm_100 %{ \
; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \
; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \
; RUN: %}
@@ -30,12 +30,10 @@ define <2 x float> @test_ret_const() #0 {
define float @test_extract_0(<2 x float> %a) #0 {
; CHECK-NOF32X2-LABEL: test_extract_0(
; CHECK-NOF32X2: {
-; CHECK-NOF32X2-NEXT: .reg .b32 %r<2>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_extract_0_param_0];
-; CHECK-NOF32X2-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_0_param_0];
; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NOF32X2-NEXT: ret;
;
@@ -56,13 +54,11 @@ define float @test_extract_0(<2 x float> %a) #0 {
define float @test_extract_1(<2 x float> %a) #0 {
; CHECK-NOF32X2-LABEL: test_extract_1(
; CHECK-NOF32X2: {
-; CHECK-NOF32X2-NEXT: .reg .b32 %r<2>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_extract_1_param_0];
-; CHECK-NOF32X2-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; }
-; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_1_param_0];
+; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NOF32X2-NEXT: ret;
;
; CHECK-F32X2-LABEL: test_extract_1(
@@ -79,27 +75,57 @@ define float @test_extract_1(<2 x float> %a) #0 {
  ret float %e
}
-; NOTE: disabled as -O3 miscompiles this into pointer arithmetic on
-; test_extract_i_param_0 where the symbol's address is not taken first (that
-; is, moved to a temporary)
-; define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
-;   %e = extractelement <2 x float> %a, i64 %idx
-;   ret float %e
-; }
+define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
+; CHECK-NOF32X2-LABEL: test_extract_i(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .local .align 8 .b8 __local_depot3[8];
+; CHECK-NOF32X2-NEXT: .reg .b64 %SP;
+; CHECK-NOF32X2-NEXT: .reg .b64 %SPL;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<6>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: mov.b64 %SPL, __local_depot3;
+; CHECK-NOF32X2-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_i_param_0];
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1];
+; CHECK-NOF32X2-NEXT: st.v2.b32 [%SP], {%r1, %r2};
+; CHECK-NOF32X2-NEXT: and.b64 %rd2, %rd1, 1;
+; CHECK-NOF32X2-NEXT: shl.b64 %rd3, %rd2, 2;
+; CHECK-NOF32X2-NEXT: add.u64 %rd4, %SP, 0;
+; CHECK-NOF32X2-NEXT: or.b64 %rd5, %rd4, %rd3;
+; CHECK-NOF32X2-NEXT: ld.b32 %r3, [%rd5];
+; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_extract_i(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<2>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_extract_i_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_extract_i_param_0];
+; CHECK-F32X2-NEXT: setp.eq.b64 %p1, %rd2, 0;
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: selp.f32 %r3, %r1, %r2, %p1;
+; CHECK-F32X2-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-F32X2-NEXT: ret;
+  %e = extractelement <2 x float> %a, i64 %idx
+  ret float %e
+}
define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NOF32X2-LABEL: test_fadd(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fadd_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r1, %r3;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -121,11 +147,9 @@ define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_0(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_param_0];
; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40000000;
; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f3F800000;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -152,11 +176,9 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_1(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_param_0];
; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40000000;
; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f3F800000;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -183,20 +205,15 @@ define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
; CHECK-NOF32X2-LABEL: test_fadd_v4(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<13>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd4;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, %r1;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r7, %r8}, %rd3;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r9, %r10}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r11, %r10, %r8;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r12, %r9, %r7;
-; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r6, %r5};
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r9, %r4, %r8;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r10, %r3, %r7;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r11, %r2, %r6;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r12, %r1, %r5;
+; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
; CHECK-NOF32X2-NEXT: ret;
;
; CHECK-F32X2-LABEL: test_fadd_v4(
@@ -218,17 +235,14 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_0_v4(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40800000;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f40400000;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, 0f40000000;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, 0f3F800000;
-; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3};
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, 0f40800000;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, 0f40400000;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r2, 0f40000000;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r1, 0f3F800000;
+; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
; CHECK-F32X2-LABEL: test_fadd_imm_0_v4(
@@ -256,17 +270,14 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_1_v4(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40800000;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f40400000;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, 0f40000000;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, 0f3F800000;
-; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3};
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_1_v4_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, 0f40800000;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, 0f40400000;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r2, 0f40000000;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r1, 0f3F800000;
+; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
; CHECK-F32X2-LABEL: test_fadd_imm_1_v4(
@@ -294,15 +305,12 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NOF32X2-LABEL: test_fsub(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fsub_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fsub_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NOF32X2-NEXT: sub.rn.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: sub.rn.f32 %r6, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fsub_param_0];
+; CHECK-NOF32X2-NEXT: sub.rn.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: sub.rn.f32 %r6, %r1, %r3;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -321,18 +329,29 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
}
define <2 x float> @test_fneg(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fneg(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fneg_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: neg.f32 %r3, %r2;
-; CHECK-NEXT: neg.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fneg(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_param_0];
+; CHECK-NOF32X2-NEXT: neg.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: neg.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fneg(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fneg_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: neg.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: neg.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
  %r = fneg <2 x float> %a
  ret <2 x float> %r
}
@@ -341,15 +360,12 @@ define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NOF32X2-LABEL: test_fmul(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fmul_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fmul_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NOF32X2-NEXT: mul.rn.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: mul.rn.f32 %r6, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_param_0];
+; CHECK-NOF32X2-NEXT: mul.rn.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: mul.rn.f32 %r6, %r1, %r3;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -368,50 +384,85 @@ define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
}
define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fdiv(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fdiv_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: div.rn.f32 %r5, %r4, %r2;
-; CHECK-NEXT: div.rn.f32 %r6, %r3, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fdiv(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_param_0];
+; CHECK-NOF32X2-NEXT: div.rn.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: div.rn.f32 %r6, %r1, %r3;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fdiv(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fdiv_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fdiv_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: div.rn.f32 %r5, %r4, %r2;
+; CHECK-F32X2-NEXT: div.rn.f32 %r6, %r3, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-F32X2-NEXT: ret;
  %r = fdiv <2 x float> %a, %b
  ret <2 x float> %r
}
define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_frem(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b32 %r<15>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_frem_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: div.rn.f32 %r5, %r4, %r2;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5;
-; CHECK-NEXT: neg.f32 %r7, %r6;
-; CHECK-NEXT: fma.rn.f32 %r8, %r7, %r2, %r4;
-; CHECK-NEXT: testp.infinite.f32 %p1, %r2;
-; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1;
-; CHECK-NEXT: div.rn.f32 %r10, %r3, %r1;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r11, %r10;
-; CHECK-NEXT: neg.f32 %r12, %r11;
-; CHECK-NEXT: fma.rn.f32 %r13, %r12, %r1, %r3;
-; CHECK-NEXT: testp.infinite.f32 %p2, %r1;
-; CHECK-NEXT: selp.f32 %r14, %r3, %r13, %p2;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_frem(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<15>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_param_0];
+; CHECK-NOF32X2-NEXT: div.rn.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r6, %r5;
+; CHECK-NOF32X2-NEXT: neg.f32 %r7, %r6;
+; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r7, %r4, %r2;
+; CHECK-NOF32X2-NEXT: testp.infinite.f32 %p1, %r4;
+; CHECK-NOF32X2-NEXT: selp.f32 %r9, %r2, %r8, %p1;
+; CHECK-NOF32X2-NEXT: div.rn.f32 %r10, %r1, %r3;
+; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r11, %r10;
+; CHECK-NOF32X2-NEXT: neg.f32 %r12, %r11;
+; CHECK-NOF32X2-NEXT: fma.rn.f32 %r13, %r12, %r3, %r1;
+; CHECK-NOF32X2-NEXT: testp.infinite.f32 %p2, %r3;
+; CHECK-NOF32X2-NEXT: selp.f32 %r14, %r1, %r13, %p2;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_frem(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<15>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_frem_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_frem_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: div.rn.f32 %r5, %r4, %r2;
+; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r6, %r5;
+; CHECK-F32X2-NEXT: neg.f32 %r7, %r6;
+; CHECK-F32X2-NEXT: fma.rn.f32 %r8, %r7, %r2, %r4;
+; CHECK-F32X2-NEXT: testp.infinite.f32 %p1, %r2;
+; CHECK-F32X2-NEXT: selp.f32 %r9, %r4, %r8, %p1;
+; CHECK-F32X2-NEXT: div.rn.f32 %r10, %r3, %r1;
+; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r11, %r10;
+; CHECK-F32X2-NEXT: neg.f32 %r12, %r11;
+; CHECK-F32X2-NEXT: fma.rn.f32 %r13, %r12, %r1, %r3;
+; CHECK-F32X2-NEXT: testp.infinite.f32 %p2, %r1;
+; CHECK-F32X2-NEXT: selp.f32 %r14, %r3, %r13, %p2;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9};
+; CHECK-F32X2-NEXT: ret;
  %r = frem <2 x float> %a, %b
  ret <2 x float> %r
}
@@ -420,15 +471,12 @@ define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK-NOF32X2-LABEL: test_fadd_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fadd_ftz_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_ftz_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_ftz_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r1, %r3;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -450,11 +498,9 @@ define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_0_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_ftz_param_0];
; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40000000;
; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f3F800000;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -481,11 +527,9 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_1_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_ftz_param_0];
; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40000000;
; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f3F800000;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -512,20 +556,15 @@ define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 {
; CHECK-NOF32X2-LABEL: test_fadd_v4_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<13>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_ftz_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd4;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, %r1;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r7, %r8}, %rd3;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r9, %r10}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r11, %r10, %r8;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r12, %r9, %r7;
-; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r6, %r5};
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_ftz_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_ftz_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r9, %r4, %r8;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r10, %r3, %r7;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r11, %r2, %r6;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r12, %r1, %r5;
+; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
; CHECK-NOF32X2-NEXT: ret;
;
; CHECK-F32X2-LABEL: test_fadd_v4_ftz(
@@ -547,17 +586,14 @@ define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_0_v4_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40800000;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f40400000;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r6, 0f40000000;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r5, 0f3F800000;
-; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3};
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_ftz_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, 0f40800000;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, 0f40400000;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r2, 0f40000000;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r1, 0f3F800000;
+; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
; CHECK-F32X2-LABEL: test_fadd_imm_0_v4_ftz(
@@ -585,17 +621,14 @@ define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_1_v4_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40800000;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f40400000;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r6, 0f40000000;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r5, 0f3F800000;
-; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3};
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_1_v4_ftz_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, 0f40800000;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, 0f40400000;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r2, 0f40000000;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r1, 0f3F800000;
+; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
; CHECK-F32X2-LABEL: test_fadd_imm_1_v4_ftz(
@@ -623,15 +656,12 @@ define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK-NOF32X2-LABEL: test_fsub_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fsub_ftz_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fsub_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r6, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_ftz_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fsub_ftz_param_0];
+; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r6, %r1, %r3;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -650,18 +680,29 @@ define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 {
}
define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 {
-; CHECK-LABEL: test_fneg_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fneg_ftz_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: neg.ftz.f32 %r3, %r2;
-; CHECK-NEXT: neg.ftz.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fneg_ftz(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_ftz_param_0];
+; CHECK-NOF32X2-NEXT: neg.ftz.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: neg.ftz.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fneg_ftz(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fneg_ftz_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: neg.ftz.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: neg.ftz.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
  %r = fneg <2 x float> %a
  ret <2 x float> %r
}
@@ -670,15 +711,12 @@ define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK-NOF32X2-LABEL: test_fmul_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fmul_ftz_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fmul_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r6, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_ftz_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_ftz_param_0];
+; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r6, %r1, %r3;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -700,17 +738,13 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c)
; CHECK-NOF32X2-LABEL: test_fma_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd3, [test_fma_ftz_param_2];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fma_ftz_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fma_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd3;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r7, %r6, %r4, %r2;
-; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r8, %r5, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_ftz_param_2];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_ftz_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_ftz_param_0];
+; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r7, %r2, %r4, %r6;
+; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r8, %r1, %r3, %r5;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -730,65 +764,112 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c)
}
define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 {
-; CHECK-LABEL: test_fdiv_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_ftz_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fdiv_ftz_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: div.rn.ftz.f32 %r5, %r4, %r2;
-; CHECK-NEXT: div.rn.ftz.f32 %r6, %r3, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fdiv_ftz(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_ftz_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_ftz_param_0];
+; CHECK-NOF32X2-NEXT: div.rn.ftz.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: div.rn.ftz.f32 %r6, %r1, %r3;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fdiv_ftz(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fdiv_ftz_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fdiv_ftz_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: div.rn.ftz.f32 %r5, %r4, %r2;
+; CHECK-F32X2-NEXT: div.rn.ftz.f32 %r6, %r3, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-F32X2-NEXT: ret;
  %r = fdiv <2 x float> %a, %b
  ret <2 x float> %r
}
define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 {
-; CHECK-LABEL: test_frem_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b32 %r<15>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_ftz_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_frem_ftz_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: div.rn.ftz.f32 %r5, %r4, %r2;
-; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5;
-; CHECK-NEXT: neg.ftz.f32 %r7, %r6;
-; CHECK-NEXT: fma.rn.ftz.f32 %r8, %r7, %r2, %r4;
-; CHECK-NEXT: testp.infinite.f32 %p1, %r2;
-; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1;
-; CHECK-NEXT: div.rn.ftz.f32 %r10, %r3, %r1;
-; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10;
-; CHECK-NEXT: neg.ftz.f32 %r12, %r11;
-; CHECK-NEXT: fma.rn.ftz.f32 %r13, %r12, %r1, %r3;
-; CHECK-NEXT: testp.infinite.f32 %p2, %r1;
-; CHECK-NEXT: selp.f32 %r14, %r3, %r13, %p2;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_frem_ftz(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<15>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_ftz_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_ftz_param_0];
+; CHECK-NOF32X2-NEXT: div.rn.ftz.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5;
+; CHECK-NOF32X2-NEXT: neg.ftz.f32 %r7, %r6;
+; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r8, %r7, %r4, %r2;
+; CHECK-NOF32X2-NEXT: testp.infinite.f32 %p1, %r4;
+; CHECK-NOF32X2-NEXT: selp.f32 %r9, %r2, %r8, %p1;
+; CHECK-NOF32X2-NEXT: div.rn.ftz.f32 %r10, %r1, %r3;
+; CHECK-NOF32X2-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10;
+; CHECK-NOF32X2-NEXT: neg.ftz.f32 %r12, %r11;
+; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r13, %r12, %r3, %r1;
+; CHECK-NOF32X2-NEXT: testp.infinite.f32 %p2, %r3;
+; CHECK-NOF32X2-NEXT: selp.f32 %r14, %r1, %r13, %p2;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_frem_ftz(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<15>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_frem_ftz_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_frem_ftz_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: div.rn.ftz.f32 %r5, %r4, %r2;
+; CHECK-F32X2-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5;
+; CHECK-F32X2-NEXT: neg.ftz.f32 %r7, %r6;
+; CHECK-F32X2-NEXT: fma.rn.ftz.f32 %r8, %r7, %r2, %r4;
+; CHECK-F32X2-NEXT: testp.infinite.f32 %p1, %r2;
+; CHECK-F32X2-NEXT: selp.f32 %r9, %r4, %r8, %p1;
+; CHECK-F32X2-NEXT: div.rn.ftz.f32 %r10, %r3, %r1;
+; CHECK-F32X2-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10;
+; CHECK-F32X2-NEXT: neg.ftz.f32 %r12, %r11;
+; CHECK-F32X2-NEXT: fma.rn.ftz.f32 %r13, %r12, %r1, %r3;
+; CHECK-F32X2-NEXT: testp.infinite.f32 %p2, %r1;
+; CHECK-F32X2-NEXT: selp.f32 %r14, %r3, %r13, %p2;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9};
+; CHECK-F32X2-NEXT: ret;
  %r = frem <2 x float> %a, %b
  ret <2 x float> %r
}
define void @test_ldst_v2f32(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: test_ldst_v2f32(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v2f32_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v2f32_param_0];
-; CHECK-NEXT: ld.b64 %rd3, [%rd1];
-; CHECK-NEXT: st.b64 [%rd2], %rd3;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_ldst_v2f32(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v2f32_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v2f32_param_0];
+; CHECK-NOF32X2-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NOF32X2-NEXT: st.v2.b32 [%rd2], {%r1, %r2};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_ldst_v2f32(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v2f32_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v2f32_param_0];
+; CHECK-F32X2-NEXT: ld.b64 %rd3, [%rd1];
+; CHECK-F32X2-NEXT: st.b64 [%rd2], %rd3;
+; CHECK-F32X2-NEXT: ret;
  %t1 = load <2 x float>, ptr %a
  store <2 x float> %t1, ptr %b, align 32
  ret void
@@ -814,34 +895,60 @@ define void @test_ldst_v3f32(ptr %a, ptr %b) #0 {
}
define void @test_ldst_v4f32(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: test_ldst_v4f32(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4f32_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4f32_param_0];
-; CHECK-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1];
-; CHECK-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_ldst_v4f32(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v4f32_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v4f32_param_0];
+; CHECK-NOF32X2-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NOF32X2-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_ldst_v4f32(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b64 %rd<5>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v4f32_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v4f32_param_0];
+; CHECK-F32X2-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1];
+; CHECK-F32X2-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4};
+; CHECK-F32X2-NEXT: ret;
  %t1 = load <4 x float>, ptr %a
  store <4 x float> %t1, ptr %b, align 32
  ret void
}
define void @test_ldst_v8f32(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: test_ldst_v8f32(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v8f32_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v8f32_param_0];
-; CHECK-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1];
-; CHECK-NEXT: ld.v2.b64 {%rd5, %rd6}, [%rd1+16];
-; CHECK-NEXT: st.v2.b64 [%rd2+16], {%rd5, %rd6};
-; CHECK-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_ldst_v8f32(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v8f32_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v8f32_param_0];
+; CHECK-NOF32X2-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NOF32X2-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; CHECK-NOF32X2-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; CHECK-NOF32X2-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_ldst_v8f32(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b64 %rd<7>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v8f32_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v8f32_param_0];
+; CHECK-F32X2-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1];
+; CHECK-F32X2-NEXT: ld.v2.b64 {%rd5, %rd6}, [%rd1+16];
+; CHECK-F32X2-NEXT: st.v2.b64 [%rd2+16], {%rd5, %rd6};
+; CHECK-F32X2-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4};
+; CHECK-F32X2-NEXT: ret;
  %t1 = load <8 x float>, ptr %a
  store <8 x float> %t1, ptr %b, align 32
  ret void
@@ -850,571 +957,982 @@ define void @test_ldst_v8f32(ptr %a, ptr %b) #0 {
declare <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) #0
define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_call(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_call_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_call_param_0];
-; CHECK-NEXT: { // callseq 0, 0
-; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-NEXT: st.param.b64 [param1], %rd2;
-; CHECK-NEXT: st.param.b64 [param0], %rd1;
-; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
-; CHECK-NEXT: ld.param.b64 %rd3, [retval0];
-; CHECK-NEXT: } // callseq 0
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_call(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_call_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_call_param_0];
+; CHECK-NOF32X2-NEXT: { // callseq 0, 0
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param1], {%r3, %r4};
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
+; CHECK-NOF32X2-NEXT: call.uni (retval0), test_callee, (param0, param1);
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [retval0];
+; CHECK-NOF32X2-NEXT: } // callseq 0
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r5, %r6};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_call(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_call_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_call_param_0];
+; CHECK-F32X2-NEXT: { // callseq 0, 0
+; CHECK-F32X2-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-F32X2-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-F32X2-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-F32X2-NEXT: st.param.b64 [param1], %rd2;
+; CHECK-F32X2-NEXT: st.param.b64 [param0], %rd1;
+; CHECK-F32X2-NEXT: call.uni (retval0), test_callee, (param0, param1);
+; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-F32X2-NEXT: } // callseq 0
+; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-F32X2-NEXT: ret;
  %r = call <2 x float> @test_callee(<2 x float> %a, <2 x float> %b)
  ret <2 x float> %r
}
define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_call_flipped(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_call_flipped_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_call_flipped_param_0];
-; CHECK-NEXT: { // callseq 1, 0
-; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-NEXT: st.param.b64 [param1], %rd1;
-; CHECK-NEXT: st.param.b64 [param0], %rd2;
-; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
-; CHECK-NEXT: ld.param.b64 %rd3, [retval0];
-; CHECK-NEXT: } // callseq 1
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_call_flipped(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_call_flipped_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_call_flipped_param_0];
+; CHECK-NOF32X2-NEXT: { // callseq 1, 0
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param1], {%r1, %r2};
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param0], {%r3, %r4};
+; CHECK-NOF32X2-NEXT: call.uni (retval0), test_callee, (param0, param1);
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [retval0];
+; CHECK-NOF32X2-NEXT: } // callseq 1
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r5, %r6};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_call_flipped(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_call_flipped_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_call_flipped_param_0];
+; CHECK-F32X2-NEXT: { // callseq 1, 0
+; CHECK-F32X2-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-F32X2-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-F32X2-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-F32X2-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-F32X2-NEXT: st.param.b64 [param0], %rd2;
+; CHECK-F32X2-NEXT: call.uni (retval0), test_callee, (param0, param1);
+; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-F32X2-NEXT: } // callseq 1
+; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-F32X2-NEXT: ret;
  %r = call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a)
  ret <2 x float> %r
}
define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_tailcall_flipped(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_tailcall_flipped_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0];
-; CHECK-NEXT: { // callseq 2, 0
-; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-NEXT: st.param.b64 [param1], %rd1;
-; CHECK-NEXT: st.param.b64 [param0], %rd2;
-; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
-; CHECK-NEXT: ld.param.b64 %rd3, [retval0];
-; CHECK-NEXT: } // callseq 2
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_tailcall_flipped(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_tailcall_flipped_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_tailcall_flipped_param_0];
+; CHECK-NOF32X2-NEXT: { // callseq 2, 0
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param1], {%r1, %r2};
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param0], {%r3, %r4};
+; CHECK-NOF32X2-NEXT: call.uni (retval0), test_callee, (param0, param1);
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32
{%r5, %r6}, [retval0]; +; CHECK-NOF32X2-NEXT: } // callseq 2 +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r5, %r6}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_tailcall_flipped( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_tailcall_flipped_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0]; +; CHECK-F32X2-NEXT: { // callseq 2, 0 +; CHECK-F32X2-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-F32X2-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-F32X2-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-F32X2-NEXT: st.param.b64 [param1], %rd1; +; CHECK-F32X2-NEXT: st.param.b64 [param0], %rd2; +; CHECK-F32X2-NEXT: call.uni (retval0), test_callee, (param0, param1); +; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-F32X2-NEXT: } // callseq 2 +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; %r = tail call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a) ret <2 x float> %r } define <2 x float> @test_select(<2 x float> %a, <2 x float> %b, i1 zeroext %c) #0 { -; CHECK-LABEL: test_select( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<2>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2]; -; CHECK-NEXT: and.b16 %rs2, %rs1, 1; -; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-NEXT: ld.param.b64 %rd2, [test_select_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_select_param_0]; -; CHECK-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_select( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<2>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.b8 %rs1, [test_select_param_2]; +; CHECK-NOF32X2-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NOF32X2-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_param_0]; +; CHECK-NOF32X2-NEXT: selp.f32 %r5, %r2, %r4, %p1; +; CHECK-NOF32X2-NEXT: selp.f32 %r6, %r1, %r3, %p1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_select( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<2>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b8 %rs1, [test_select_param_2]; +; CHECK-F32X2-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-F32X2-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_select_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_select_param_0]; +; CHECK-F32X2-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; %r = select i1 %c, <2 x float> %a, <2 x float> %b ret <2 x float> %r } define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) #0 { -; CHECK-LABEL: test_select_cc( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b32 %r<11>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd4, [test_select_cc_param_3]; -; CHECK-NEXT: ld.param.b64 
%rd3, [test_select_cc_param_2]; -; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3; -; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r1; -; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r2; -; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2; -; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1; -; CHECK-NEXT: selp.f32 %r9, %r8, %r6, %p2; -; CHECK-NEXT: selp.f32 %r10, %r7, %r5, %p1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_select_cc( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<11>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r7, %r8}, [test_select_cc_param_3]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_param_0]; +; CHECK-NOF32X2-NEXT: setp.neu.f32 %p1, %r5, %r7; +; CHECK-NOF32X2-NEXT: setp.neu.f32 %p2, %r6, %r8; +; CHECK-NOF32X2-NEXT: selp.f32 %r9, %r2, %r4, %p2; +; CHECK-NOF32X2-NEXT: selp.f32 %r10, %r1, %r3, %p1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_select_cc( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<11>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd4, [test_select_cc_param_3]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [test_select_cc_param_2]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_select_cc_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_select_cc_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-F32X2-NEXT: setp.neu.f32 %p1, %r3, %r1; +; CHECK-F32X2-NEXT: setp.neu.f32 %p2, %r4, %r2; +; CHECK-F32X2-NEXT: mov.b64 {%r5, %r6}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r7, %r8}, %rd1; +; CHECK-F32X2-NEXT: selp.f32 %r9, %r8, %r6, %p2; +; CHECK-F32X2-NEXT: selp.f32 %r10, %r7, %r5, %p1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9}; +; CHECK-F32X2-NEXT: ret; %cc = fcmp une <2 x float> %c, %d %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b ret <2 x float> %r } define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2 x float> %c, <2 x float> %d) #0 { -; CHECK-LABEL: test_select_cc_f64_f32( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<9>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1]; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0]; -; CHECK-NEXT: ld.param.b64 %rd6, [test_select_cc_f64_f32_param_3]; -; CHECK-NEXT: ld.param.b64 %rd5, [test_select_cc_f64_f32_param_2]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd6; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd5; -; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r1; -; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r2; -; CHECK-NEXT: selp.f64 %rd7, %rd2, %rd4, %p2; -; CHECK-NEXT: selp.f64 %rd8, %rd1, %rd3, %p1; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd7}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_select_cc_f64_f32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 
%r<5>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f64_f32_param_3]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f64_f32_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0]; +; CHECK-NOF32X2-NEXT: setp.neu.f32 %p1, %r1, %r3; +; CHECK-NOF32X2-NEXT: setp.neu.f32 %p2, %r2, %r4; +; CHECK-NOF32X2-NEXT: selp.f64 %rd5, %rd2, %rd4, %p2; +; CHECK-NOF32X2-NEXT: selp.f64 %rd6, %rd1, %rd3, %p1; +; CHECK-NOF32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_select_cc_f64_f32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<9>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1]; +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd6, [test_select_cc_f64_f32_param_3]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd5, [test_select_cc_f64_f32_param_2]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd6; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd5; +; CHECK-F32X2-NEXT: setp.neu.f32 %p1, %r3, %r1; +; CHECK-F32X2-NEXT: setp.neu.f32 %p2, %r4, %r2; +; CHECK-F32X2-NEXT: selp.f64 %rd7, %rd2, %rd4, %p2; +; CHECK-F32X2-NEXT: selp.f64 %rd8, %rd1, %rd3, %p1; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd7}; +; CHECK-F32X2-NEXT: ret; %cc = fcmp une <2 x float> %c, %d %r = select <2 x i1> %cc, <2 x double> %a, <2 x double> %b ret <2 x double> %r } define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x double> %c, <2 x double> %d) #0 { -; CHECK-LABEL: test_select_cc_f32_f64( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-NEXT: .reg .b64 %rd<7>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_select_cc_f32_f64_param_3]; -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_2]; -; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f64_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f64_param_0]; -; CHECK-NEXT: setp.neu.f64 %p1, %rd3, %rd5; -; CHECK-NEXT: setp.neu.f64 %p2, %rd4, %rd6; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: selp.f32 %r5, %r4, %r2, %p2; -; CHECK-NEXT: selp.f32 %r6, %r3, %r1, %p1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_select_cc_f32_f64( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_3]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f32_f64_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f64_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f64_param_0]; +; CHECK-NOF32X2-NEXT: setp.neu.f64 %p1, %rd1, %rd3; +; CHECK-NOF32X2-NEXT: setp.neu.f64 %p2, %rd2, %rd4; +; CHECK-NOF32X2-NEXT: selp.f32 %r5, %r2, %r4, %p2; +; CHECK-NOF32X2-NEXT: selp.f32 %r6, %r1, %r3, 
%p1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_select_cc_f32_f64( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_select_cc_f32_f64_param_3]; +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_2]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f64_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f64_param_0]; +; CHECK-F32X2-NEXT: setp.neu.f64 %p1, %rd3, %rd5; +; CHECK-F32X2-NEXT: setp.neu.f64 %p2, %rd4, %rd6; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: selp.f32 %r5, %r4, %r2, %p2; +; CHECK-F32X2-NEXT: selp.f32 %r6, %r3, %r1, %p1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-F32X2-NEXT: ret; %cc = fcmp une <2 x double> %c, %d %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b ret <2 x float> %r } define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_une( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_une_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_une_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.neu.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.neu.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_une( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_une_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_une_param_0]; +; CHECK-NOF32X2-NEXT: setp.neu.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.neu.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_une( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_une_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_une_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.neu.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.neu.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp une <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> 
@test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_ueq( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ueq_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ueq_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.equ.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.equ.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_ueq( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ueq_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ueq_param_0]; +; CHECK-NOF32X2-NEXT: setp.equ.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.equ.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_ueq( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ueq_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ueq_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.equ.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.equ.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp ueq <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_ugt( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ugt_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ugt_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.gtu.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.gtu.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_ugt( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ugt_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ugt_param_0]; +; CHECK-NOF32X2-NEXT: 
setp.gtu.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.gtu.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_ugt( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ugt_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ugt_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.gtu.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.gtu.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp ugt <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_uge( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uge_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_uge_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.geu.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.geu.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_uge( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uge_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uge_param_0]; +; CHECK-NOF32X2-NEXT: setp.geu.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.geu.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_uge( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_uge_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_uge_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.geu.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.geu.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp uge <2 x float> 
%a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_ult( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ult_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ult_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.ltu.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.ltu.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_ult( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ult_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ult_param_0]; +; CHECK-NOF32X2-NEXT: setp.ltu.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.ltu.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_ult( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ult_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ult_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.ltu.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.ltu.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp ult <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_ule( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ule_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ule_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.leu.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.leu.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_ule( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ule_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, 
[test_fcmp_ule_param_0]; +; CHECK-NOF32X2-NEXT: setp.leu.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.leu.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_ule( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ule_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ule_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.leu.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.leu.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp ule <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_uno( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uno_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_uno_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.nan.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.nan.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_uno( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uno_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uno_param_0]; +; CHECK-NOF32X2-NEXT: setp.nan.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.nan.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_uno( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_uno_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_uno_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.nan.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.nan.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; 
CHECK-F32X2-NEXT: ret; %r = fcmp uno <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_one( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_one_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_one_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.ne.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.ne.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_one( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_one_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_one_param_0]; +; CHECK-NOF32X2-NEXT: setp.ne.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.ne.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_one( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_one_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_one_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.ne.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.ne.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp one <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_oeq( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oeq_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_oeq_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.eq.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.eq.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_oeq( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oeq_param_1]; +; 
CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oeq_param_0]; +; CHECK-NOF32X2-NEXT: setp.eq.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.eq.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_oeq( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_oeq_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_oeq_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.eq.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.eq.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp oeq <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_ogt( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ogt_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ogt_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.gt.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.gt.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_ogt( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ogt_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ogt_param_0]; +; CHECK-NOF32X2-NEXT: setp.gt.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.gt.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_ogt( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ogt_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ogt_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.gt.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.gt.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: 
st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp ogt <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_oge( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oge_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_oge_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.ge.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.ge.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_oge( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oge_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oge_param_0]; +; CHECK-NOF32X2-NEXT: setp.ge.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.ge.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_oge( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_oge_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_oge_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.ge.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.ge.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp oge <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_olt( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_olt_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_olt_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.lt.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.lt.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_olt( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, 
[test_fcmp_olt_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_olt_param_0]; +; CHECK-NOF32X2-NEXT: setp.lt.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.lt.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_olt( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_olt_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_olt_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.lt.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.lt.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp olt <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_ole( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ole_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ole_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.le.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.le.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_ole( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ole_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ole_param_0]; +; CHECK-NOF32X2-NEXT: setp.le.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.le.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_ole( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ole_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ole_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.le.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.le.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, 
%p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp ole <2 x float> %a, %b ret <2 x i1> %r } define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_fcmp_ord( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ord_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ord_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: setp.num.f32 %p1, %r4, %r2; -; CHECK-NEXT: setp.num.f32 %p2, %r3, %r1; -; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; -; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fcmp_ord( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ord_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ord_param_0]; +; CHECK-NOF32X2-NEXT: setp.num.f32 %p1, %r2, %r4; +; CHECK-NOF32X2-NEXT: setp.num.f32 %p2, %r1, %r3; +; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fcmp_ord( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b16 %rs<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ord_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ord_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: setp.num.f32 %p1, %r4, %r2; +; CHECK-F32X2-NEXT: setp.num.f32 %p2, %r3, %r1; +; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F32X2-NEXT: ret; %r = fcmp ord <2 x float> %a, %b ret <2 x i1> %r } define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 { -; CHECK-LABEL: test_fptosi_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i32_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rzi.s32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rzi.s32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fptosi_i32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i32_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rzi.s32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rzi.s32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fptosi_i32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 
%rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fptosi_i32_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rzi.s32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rzi.s32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = fptosi <2 x float> %a to <2 x i32> ret <2 x i32> %r } define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 { -; CHECK-LABEL: test_fptosi_i64( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i64_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rzi.s64.f32 %rd2, %r2; -; CHECK-NEXT: cvt.rzi.s64.f32 %rd3, %r1; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fptosi_i64( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i64_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rzi.s64.f32 %rd1, %r2; +; CHECK-NOF32X2-NEXT: cvt.rzi.s64.f32 %rd2, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fptosi_i64( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fptosi_i64_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rzi.s64.f32 %rd2, %r2; +; CHECK-F32X2-NEXT: cvt.rzi.s64.f32 %rd3, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-F32X2-NEXT: ret; %r = fptosi <2 x float> %a to <2 x i64> ret <2 x i64> %r } define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 { -; CHECK-LABEL: test_fptoui_2xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi32_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rzi.u32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rzi.u32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fptoui_2xi32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi32_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rzi.u32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rzi.u32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fptoui_2xi32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi32_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rzi.u32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rzi.u32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = fptoui <2 x float> %a to <2 x i32> ret <2 x i32> %r } define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 { -; CHECK-LABEL: test_fptoui_2xi64( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; 
CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi64_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rzi.u64.f32 %rd2, %r2; -; CHECK-NEXT: cvt.rzi.u64.f32 %rd3, %r1; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fptoui_2xi64( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi64_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rzi.u64.f32 %rd1, %r2; +; CHECK-NOF32X2-NEXT: cvt.rzi.u64.f32 %rd2, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fptoui_2xi64( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi64_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rzi.u64.f32 %rd2, %r2; +; CHECK-F32X2-NEXT: cvt.rzi.u64.f32 %rd3, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-F32X2-NEXT: ret; %r = fptoui <2 x float> %a to <2 x i64> ret <2 x i64> %r } @@ -1485,16 +2003,14 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 { ; CHECK-NOF32X2-LABEL: test_uitofp_2xi32_fadd( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_uitofp_2xi32_fadd_param_1]; ; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_1]; -; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r3, %r1; -; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r4, %r2; -; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, %r4; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, %r3; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r5, %r1; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r6, %r2; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r4, %r6; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r3, %r5; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -1518,48 +2034,81 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 { } define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 { -; CHECK-LABEL: test_fptrunc_2xdouble( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fptrunc_2xdouble_param_0]; -; CHECK-NEXT: cvt.rn.f32.f64 %r1, %rd2; -; CHECK-NEXT: cvt.rn.f32.f64 %r2, %rd1; -; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1}; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fptrunc_2xdouble( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fptrunc_2xdouble_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.f64 %r1, %rd2; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.f64 %r2, %rd1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fptrunc_2xdouble( +; 
CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fptrunc_2xdouble_param_0]; +; CHECK-F32X2-NEXT: cvt.rn.f32.f64 %r1, %rd2; +; CHECK-F32X2-NEXT: cvt.rn.f32.f64 %r2, %rd1; +; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; %r = fptrunc <2 x double> %a to <2 x float> ret <2 x float> %r } define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 { -; CHECK-LABEL: test_fpext_2xdouble( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fpext_2xdouble_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.f64.f32 %rd2, %r2; -; CHECK-NEXT: cvt.f64.f32 %rd3, %r1; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fpext_2xdouble( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fpext_2xdouble_param_0]; +; CHECK-NOF32X2-NEXT: cvt.f64.f32 %rd1, %r2; +; CHECK-NOF32X2-NEXT: cvt.f64.f32 %rd2, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fpext_2xdouble( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fpext_2xdouble_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.f64.f32 %rd2, %r2; +; CHECK-F32X2-NEXT: cvt.f64.f32 %rd3, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-F32X2-NEXT: ret; %r = fpext <2 x float> %a to <2 x double> ret <2 x double> %r } define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 { -; CHECK-LABEL: test_bitcast_2xfloat_to_2xi32( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_2xi32_param_0]; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_bitcast_2xfloat_to_2xi32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xfloat_to_2xi32_param_0]; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_bitcast_2xfloat_to_2xi32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_2xi32_param_0]; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-F32X2-NEXT: ret; %r = bitcast <2 x float> %a to <2 x i32> ret <2 x i32> %r } @@ -1591,31 +2140,51 @@ define <2 x float> @test_bitcast_double_to_2xfloat(double %a) #0 { } define double @test_bitcast_2xfloat_to_double(<2 x float> %a) #0 { -; CHECK-LABEL: test_bitcast_2xfloat_to_double( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_double_param_0]; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; -; 
CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_bitcast_2xfloat_to_double( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xfloat_to_double_param_0]; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_bitcast_2xfloat_to_double( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_double_param_0]; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-F32X2-NEXT: ret; %r = bitcast <2 x float> %a to double ret double %r } define <2 x float> @test_sqrt(<2 x float> %a) #0 { -; CHECK-LABEL: test_sqrt( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_sqrt_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: sqrt.rn.f32 %r3, %r2; -; CHECK-NEXT: sqrt.rn.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_sqrt( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sqrt_param_0]; +; CHECK-NOF32X2-NEXT: sqrt.rn.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: sqrt.rn.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_sqrt( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_sqrt_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: sqrt.rn.f32 %r3, %r2; +; CHECK-F32X2-NEXT: sqrt.rn.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.sqrt(<2 x float> %a) ret <2 x float> %r } @@ -1627,37 +2196,59 @@ define <2 x float> @test_sqrt(<2 x float> %a) #0 { ; ret <2 x float> %r ;} -define <2 x float> @test_sin(<2 x float> %a) #0 #1 { -; CHECK-LABEL: test_sin( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_sin_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: sin.approx.f32 %r3, %r2; -; CHECK-NEXT: sin.approx.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; - %r = call <2 x float> @llvm.sin(<2 x float> %a) +define <2 x float> @test_sin(<2 x float> %a) #0 { +; CHECK-NOF32X2-LABEL: test_sin( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sin_param_0]; +; CHECK-NOF32X2-NEXT: sin.approx.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: sin.approx.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_sin( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_sin_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: sin.approx.f32 %r3, 
%r2; +; CHECK-F32X2-NEXT: sin.approx.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; + %r = call afn <2 x float> @llvm.sin(<2 x float> %a) ret <2 x float> %r } -define <2 x float> @test_cos(<2 x float> %a) #0 #1 { -; CHECK-LABEL: test_cos( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_cos_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cos.approx.f32 %r3, %r2; -; CHECK-NEXT: cos.approx.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; - %r = call <2 x float> @llvm.cos(<2 x float> %a) +define <2 x float> @test_cos(<2 x float> %a) #0 { +; CHECK-NOF32X2-LABEL: test_cos( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_cos_param_0]; +; CHECK-NOF32X2-NEXT: cos.approx.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cos.approx.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_cos( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_cos_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cos.approx.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cos.approx.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; + %r = call afn <2 x float> @llvm.cos(<2 x float> %a) ret <2 x float> %r } @@ -1708,17 +2299,13 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 ; CHECK-NOF32X2-LABEL: test_fma( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd3, [test_fma_param_2]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fma_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fma_param_0]; -; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd3; -; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r5, %r3, %r1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_param_0]; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r2, %r4, %r6; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r1, %r3, %r5; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -1738,266 +2325,448 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 } define <2 x float> @test_fabs(<2 x float> %a) #0 { -; CHECK-LABEL: test_fabs( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fabs_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: abs.f32 %r3, %r2; -; CHECK-NEXT: abs.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_fabs( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; 
CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fabs_param_0]; +; CHECK-NOF32X2-NEXT: abs.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: abs.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fabs( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fabs_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: abs.f32 %r3, %r2; +; CHECK-F32X2-NEXT: abs.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.fabs(<2 x float> %a) ret <2 x float> %r } define <2 x float> @test_minnum(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_minnum( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_minnum_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_minnum_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: min.f32 %r5, %r4, %r2; -; CHECK-NEXT: min.f32 %r6, %r3, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_minnum( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_minnum_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_minnum_param_0]; +; CHECK-NOF32X2-NEXT: min.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: min.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_minnum( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_minnum_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_minnum_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: min.f32 %r5, %r4, %r2; +; CHECK-F32X2-NEXT: min.f32 %r6, %r3, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.minnum(<2 x float> %a, <2 x float> %b) ret <2 x float> %r } define <2 x float> @test_maxnum(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_maxnum( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_maxnum_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_maxnum_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; -; CHECK-NEXT: max.f32 %r5, %r4, %r2; -; CHECK-NEXT: max.f32 %r6, %r3, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_maxnum( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_maxnum_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_maxnum_param_0]; +; CHECK-NOF32X2-NEXT: max.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: max.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], 
{%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_maxnum( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_maxnum_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_maxnum_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-F32X2-NEXT: max.f32 %r5, %r4, %r2; +; CHECK-F32X2-NEXT: max.f32 %r6, %r3, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.maxnum(<2 x float> %a, <2 x float> %b) ret <2 x float> %r } define <2 x float> @test_copysign(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_copysign( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_copysign_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_copysign_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NEXT: copysign.f32 %r5, %r4, %r2; -; CHECK-NEXT: copysign.f32 %r6, %r3, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_copysign( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_param_0]; +; CHECK-NOF32X2-NEXT: copysign.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: copysign.f32 %r6, %r3, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_copysign( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_copysign_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_copysign_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-F32X2-NEXT: copysign.f32 %r5, %r4, %r2; +; CHECK-F32X2-NEXT: copysign.f32 %r6, %r3, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b) ret <2 x float> %r } define <2 x float> @test_copysign_f64(<2 x float> %a, <2 x double> %b) #0 { -; CHECK-LABEL: test_copysign_f64( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<8>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_copysign_f64_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_copysign_f64_param_0]; -; CHECK-NEXT: shr.u64 %rd4, %rd3, 63; -; CHECK-NEXT: and.b64 %rd5, %rd4, 1; -; CHECK-NEXT: setp.ne.b64 %p1, %rd5, 0; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: abs.f32 %r3, %r2; -; CHECK-NEXT: neg.f32 %r4, %r3; -; CHECK-NEXT: selp.f32 %r5, %r4, %r3, %p1; -; CHECK-NEXT: shr.u64 %rd6, %rd2, 63; -; CHECK-NEXT: and.b64 %rd7, %rd6, 1; -; CHECK-NEXT: setp.ne.b64 %p2, %rd7, 0; -; CHECK-NEXT: abs.f32 %r6, %r1; -; CHECK-NEXT: neg.f32 %r7, %r6; -; CHECK-NEXT: selp.f32 %r8, %r7, %r6, %p2; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_copysign_f64( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: 
.reg .pred %p<3>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_f64_param_0]; +; CHECK-NOF32X2-NEXT: abs.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: neg.f32 %r4, %r3; +; CHECK-NOF32X2-NEXT: shr.u64 %rd3, %rd2, 63; +; CHECK-NOF32X2-NEXT: and.b64 %rd4, %rd3, 1; +; CHECK-NOF32X2-NEXT: setp.ne.b64 %p1, %rd4, 0; +; CHECK-NOF32X2-NEXT: selp.f32 %r5, %r4, %r3, %p1; +; CHECK-NOF32X2-NEXT: abs.f32 %r6, %r1; +; CHECK-NOF32X2-NEXT: neg.f32 %r7, %r6; +; CHECK-NOF32X2-NEXT: shr.u64 %rd5, %rd1, 63; +; CHECK-NOF32X2-NEXT: and.b64 %rd6, %rd5, 1; +; CHECK-NOF32X2-NEXT: setp.ne.b64 %p2, %rd6, 0; +; CHECK-NOF32X2-NEXT: selp.f32 %r8, %r7, %r6, %p2; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_copysign_f64( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<3>; +; CHECK-F32X2-NEXT: .reg .b32 %r<9>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<8>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_copysign_f64_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_copysign_f64_param_0]; +; CHECK-F32X2-NEXT: shr.u64 %rd4, %rd3, 63; +; CHECK-F32X2-NEXT: and.b64 %rd5, %rd4, 1; +; CHECK-F32X2-NEXT: setp.ne.b64 %p1, %rd5, 0; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: abs.f32 %r3, %r2; +; CHECK-F32X2-NEXT: neg.f32 %r4, %r3; +; CHECK-F32X2-NEXT: selp.f32 %r5, %r4, %r3, %p1; +; CHECK-F32X2-NEXT: shr.u64 %rd6, %rd2, 63; +; CHECK-F32X2-NEXT: and.b64 %rd7, %rd6, 1; +; CHECK-F32X2-NEXT: setp.ne.b64 %p2, %rd7, 0; +; CHECK-F32X2-NEXT: abs.f32 %r6, %r1; +; CHECK-F32X2-NEXT: neg.f32 %r7, %r6; +; CHECK-F32X2-NEXT: selp.f32 %r8, %r7, %r6, %p2; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5}; +; CHECK-F32X2-NEXT: ret; %tb = fptrunc <2 x double> %b to <2 x float> %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %tb) ret <2 x float> %r } define <2 x double> @test_copysign_extended(<2 x float> %a, <2 x float> %b) #0 { -; CHECK-LABEL: test_copysign_extended( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_copysign_extended_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_copysign_extended_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NEXT: copysign.f32 %r5, %r3, %r1; -; CHECK-NEXT: copysign.f32 %r6, %r4, %r2; -; CHECK-NEXT: cvt.f64.f32 %rd3, %r6; -; CHECK-NEXT: cvt.f64.f32 %rd4, %r5; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_copysign_extended( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_extended_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_extended_param_0]; +; CHECK-NOF32X2-NEXT: copysign.f32 %r5, %r3, %r1; +; CHECK-NOF32X2-NEXT: copysign.f32 %r6, %r4, %r2; +; CHECK-NOF32X2-NEXT: cvt.f64.f32 %rd1, %r6; +; CHECK-NOF32X2-NEXT: cvt.f64.f32 %rd2, %r5; +; CHECK-NOF32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: 
test_copysign_extended( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<7>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_copysign_extended_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_copysign_extended_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-F32X2-NEXT: copysign.f32 %r5, %r3, %r1; +; CHECK-F32X2-NEXT: copysign.f32 %r6, %r4, %r2; +; CHECK-F32X2-NEXT: cvt.f64.f32 %rd3, %r6; +; CHECK-F32X2-NEXT: cvt.f64.f32 %rd4, %r5; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b) %xr = fpext <2 x float> %r to <2 x double> ret <2 x double> %xr } define <2 x float> @test_floor(<2 x float> %a) #0 { -; CHECK-LABEL: test_floor( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_floor_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rmi.f32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rmi.f32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_floor( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_floor_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rmi.f32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rmi.f32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_floor( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_floor_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rmi.f32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rmi.f32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.floor(<2 x float> %a) ret <2 x float> %r } define <2 x float> @test_ceil(<2 x float> %a) #0 { -; CHECK-LABEL: test_ceil( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_ceil_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rpi.f32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rpi.f32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_ceil( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_ceil_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rpi.f32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rpi.f32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_ceil( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_ceil_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rpi.f32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rpi.f32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 
[func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.ceil(<2 x float> %a) ret <2 x float> %r } define <2 x float> @test_trunc(<2 x float> %a) #0 { -; CHECK-LABEL: test_trunc( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_trunc_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rzi.f32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_trunc( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_trunc( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_trunc_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.trunc(<2 x float> %a) ret <2 x float> %r } define <2 x float> @test_rint(<2 x float> %a) #0 { -; CHECK-LABEL: test_rint( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_rint_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_rint( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_rint_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_rint( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_rint_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.rint(<2 x float> %a) ret <2 x float> %r } define <2 x float> @test_nearbyint(<2 x float> %a) #0 { -; CHECK-LABEL: test_nearbyint( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_nearbyint_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_nearbyint( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; 
CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_nearbyint_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_nearbyint( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_nearbyint_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.nearbyint(<2 x float> %a) ret <2 x float> %r } define <2 x float> @test_roundeven(<2 x float> %a) #0 { -; CHECK-LABEL: test_roundeven( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_roundeven_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; -; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_roundeven( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_roundeven_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_roundeven( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_roundeven_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.roundeven(<2 x float> %a) ret <2 x float> %r } ; check the use of sign mask and 0.5 to implement round define <2 x float> @test_round(<2 x float> %a) #0 { -; CHECK-LABEL: test_round( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<19>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_round_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: and.b32 %r3, %r2, -2147483648; -; CHECK-NEXT: or.b32 %r4, %r3, 1056964608; -; CHECK-NEXT: add.rn.f32 %r5, %r2, %r4; -; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5; -; CHECK-NEXT: abs.f32 %r7, %r2; -; CHECK-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000; -; CHECK-NEXT: selp.f32 %r8, %r2, %r6, %p1; -; CHECK-NEXT: cvt.rzi.f32.f32 %r9, %r2; -; CHECK-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000; -; CHECK-NEXT: selp.f32 %r10, %r9, %r8, %p2; -; CHECK-NEXT: and.b32 %r11, %r1, -2147483648; -; CHECK-NEXT: or.b32 %r12, %r11, 1056964608; -; CHECK-NEXT: add.rn.f32 %r13, %r1, %r12; -; CHECK-NEXT: cvt.rzi.f32.f32 %r14, %r13; -; CHECK-NEXT: abs.f32 %r15, %r1; -; CHECK-NEXT: setp.gt.f32 %p3, %r15, 0f4B000000; -; CHECK-NEXT: selp.f32 %r16, %r1, %r14, %p3; -; CHECK-NEXT: cvt.rzi.f32.f32 %r17, %r1; -; CHECK-NEXT: setp.lt.f32 %p4, %r15, 
0f3F000000; -; CHECK-NEXT: selp.f32 %r18, %r17, %r16, %p4; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r18, %r10}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_round( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .pred %p<5>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<19>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_round_param_0]; +; CHECK-NOF32X2-NEXT: and.b32 %r3, %r2, -2147483648; +; CHECK-NOF32X2-NEXT: or.b32 %r4, %r3, 1056964608; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r6, %r5; +; CHECK-NOF32X2-NEXT: abs.f32 %r7, %r2; +; CHECK-NOF32X2-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000; +; CHECK-NOF32X2-NEXT: selp.f32 %r8, %r2, %r6, %p1; +; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r9, %r2; +; CHECK-NOF32X2-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000; +; CHECK-NOF32X2-NEXT: selp.f32 %r10, %r9, %r8, %p2; +; CHECK-NOF32X2-NEXT: and.b32 %r11, %r1, -2147483648; +; CHECK-NOF32X2-NEXT: or.b32 %r12, %r11, 1056964608; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r13, %r1, %r12; +; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r14, %r13; +; CHECK-NOF32X2-NEXT: abs.f32 %r15, %r1; +; CHECK-NOF32X2-NEXT: setp.gt.f32 %p3, %r15, 0f4B000000; +; CHECK-NOF32X2-NEXT: selp.f32 %r16, %r1, %r14, %p3; +; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r17, %r1; +; CHECK-NOF32X2-NEXT: setp.lt.f32 %p4, %r15, 0f3F000000; +; CHECK-NOF32X2-NEXT: selp.f32 %r18, %r17, %r16, %p4; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r18, %r10}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_round( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .pred %p<5>; +; CHECK-F32X2-NEXT: .reg .b32 %r<19>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_round_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: and.b32 %r3, %r2, -2147483648; +; CHECK-F32X2-NEXT: or.b32 %r4, %r3, 1056964608; +; CHECK-F32X2-NEXT: add.rn.f32 %r5, %r2, %r4; +; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r6, %r5; +; CHECK-F32X2-NEXT: abs.f32 %r7, %r2; +; CHECK-F32X2-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000; +; CHECK-F32X2-NEXT: selp.f32 %r8, %r2, %r6, %p1; +; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r9, %r2; +; CHECK-F32X2-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000; +; CHECK-F32X2-NEXT: selp.f32 %r10, %r9, %r8, %p2; +; CHECK-F32X2-NEXT: and.b32 %r11, %r1, -2147483648; +; CHECK-F32X2-NEXT: or.b32 %r12, %r11, 1056964608; +; CHECK-F32X2-NEXT: add.rn.f32 %r13, %r1, %r12; +; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r14, %r13; +; CHECK-F32X2-NEXT: abs.f32 %r15, %r1; +; CHECK-F32X2-NEXT: setp.gt.f32 %p3, %r15, 0f4B000000; +; CHECK-F32X2-NEXT: selp.f32 %r16, %r1, %r14, %p3; +; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r17, %r1; +; CHECK-F32X2-NEXT: setp.lt.f32 %p4, %r15, 0f3F000000; +; CHECK-F32X2-NEXT: selp.f32 %r18, %r17, %r16, %p4; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r18, %r10}; +; CHECK-F32X2-NEXT: ret; %r = call <2 x float> @llvm.round(<2 x float> %a) ret <2 x float> %r } @@ -2006,17 +2775,13 @@ define <2 x float> @test_fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) ; CHECK-NOF32X2-LABEL: test_fmuladd( ; CHECK-NOF32X2: { ; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd3, [test_fmuladd_param_2]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fmuladd_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fmuladd_param_0]; -; 
CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd3; -; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r5, %r3, %r1; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fmuladd_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmuladd_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmuladd_param_0]; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r2, %r4, %r6; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r1, %r3, %r5; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -2036,16 +2801,25 @@ define <2 x float> @test_fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) } define <2 x float> @test_shufflevector(<2 x float> %a) #0 { -; CHECK-LABEL: test_shufflevector( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_shufflevector_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_shufflevector( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_shufflevector_param_0]; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_shufflevector( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_shufflevector_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-F32X2-NEXT: ret; %s = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> <i32 1, i32 0> ret <2 x float> %s } @@ -2053,14 +2827,12 @@ define <2 x float> @test_shufflevector(<2 x float> %a) #0 { define <2 x float> @test_insertelement(<2 x float> %a, float %x) #0 { ; CHECK-NOF32X2-LABEL: test_insertelement( ; CHECK-NOF32X2: { -; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; -; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-NEXT: .reg .b32 %r<4>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.b32 %r1, [test_insertelement_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_insertelement_param_0]; -; CHECK-NOF32X2-NEXT: { .reg .b32 tmp; mov.b64 {%r2, tmp}, %rd1; } -; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_insertelement_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b32 %r3, [test_insertelement_param_1]; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r3}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_insertelement( @@ -2109,36 +2881,60 @@ define <2 x float> @test_uitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { } define void @test_trunc_to_v2bf16(<2 x float> %a, ptr %p) { -; CHECK-LABEL: test_trunc_to_v2bf16( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2bf16_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2bf16_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; -; CHECK-NEXT: st.b32 [%rd2], %r3; -; 
CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_trunc_to_v2bf16( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<4>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2bf16_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2bf16_param_1]; +; CHECK-NOF32X2-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; +; CHECK-NOF32X2-NEXT: st.b32 [%rd1], %r3; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_trunc_to_v2bf16( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<4>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2bf16_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2bf16_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; +; CHECK-F32X2-NEXT: st.b32 [%rd2], %r3; +; CHECK-F32X2-NEXT: ret; %trunc = fptrunc <2 x float> %a to <2 x bfloat> store <2 x bfloat> %trunc, ptr %p ret void } define void @test_trunc_to_v2f16(<2 x float> %a, ptr %p) { -; CHECK-LABEL: test_trunc_to_v2f16( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2f16_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2f16_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-NEXT: cvt.rn.f16x2.f32 %r3, %r2, %r1; -; CHECK-NEXT: st.b32 [%rd2], %r3; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_trunc_to_v2f16( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<4>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2f16_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2f16_param_1]; +; CHECK-NOF32X2-NEXT: cvt.rn.f16x2.f32 %r3, %r2, %r1; +; CHECK-NOF32X2-NEXT: st.b32 [%rd1], %r3; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_trunc_to_v2f16( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<4>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2f16_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2f16_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rn.f16x2.f32 %r3, %r2, %r1; +; CHECK-F32X2-NEXT: st.b32 [%rd2], %r3; +; CHECK-F32X2-NEXT: ret; %trunc = fptrunc <2 x float> %a to <2 x half> store <2 x half> %trunc, ptr %p ret void @@ -2146,5 +2942,4 @@ define void @test_trunc_to_v2f16(<2 x float> %a, ptr %p) { attributes #0 = { nounwind } -attributes #1 = { "unsafe-fp-math" = "true" } attributes #2 = { "denormal-fp-math"="preserve-sign" } diff --git a/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll b/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll index 30f9dcc..18b5351 100644 --- a/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_80 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} target triple = 
"nvptx-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/fast-math.ll b/llvm/test/CodeGen/NVPTX/fast-math.ll index 5eda3a1..8561c60 100644 --- a/llvm/test/CodeGen/NVPTX/fast-math.ll +++ b/llvm/test/CodeGen/NVPTX/fast-math.ll @@ -22,7 +22,7 @@ define float @sqrt_div(float %a, float %b) { ret float %t2 } -define float @sqrt_div_fast(float %a, float %b) #0 { +define float @sqrt_div_fast(float %a, float %b) { ; CHECK-LABEL: sqrt_div_fast( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<5>; @@ -34,29 +34,25 @@ define float @sqrt_div_fast(float %a, float %b) #0 { ; CHECK-NEXT: div.approx.f32 %r4, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; - %t1 = tail call float @llvm.sqrt.f32(float %a) - %t2 = fdiv float %t1, %b + %t1 = tail call afn float @llvm.sqrt.f32(float %a) + %t2 = fdiv afn float %t1, %b ret float %t2 } -define float @sqrt_div_fast_ninf(float %a, float %b) #0 { +define float @sqrt_div_fast_ninf(float %a, float %b) { ; CHECK-LABEL: sqrt_div_fast_ninf( ; CHECK: { -; CHECK-NEXT: .reg .pred %p<2>; -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [sqrt_div_fast_ninf_param_0]; ; CHECK-NEXT: sqrt.approx.f32 %r2, %r1; -; CHECK-NEXT: abs.f32 %r3, %r1; -; CHECK-NEXT: setp.lt.f32 %p1, %r3, 0f00800000; -; CHECK-NEXT: selp.f32 %r4, 0f00000000, %r2, %p1; -; CHECK-NEXT: ld.param.b32 %r5, [sqrt_div_fast_ninf_param_1]; -; CHECK-NEXT: div.approx.f32 %r6, %r4, %r5; -; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ld.param.b32 %r3, [sqrt_div_fast_ninf_param_1]; +; CHECK-NEXT: div.approx.f32 %r4, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %t1 = tail call ninf afn float @llvm.sqrt.f32(float %a) - %t2 = fdiv float %t1, %b + %t2 = fdiv afn float %t1, %b ret float %t2 } @@ -77,7 +73,7 @@ define float @sqrt_div_ftz(float %a, float %b) #1 { ret float %t2 } -define float @sqrt_div_fast_ftz(float %a, float %b) #0 #1 { +define float @sqrt_div_fast_ftz(float %a, float %b) #1 { ; CHECK-LABEL: sqrt_div_fast_ftz( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<5>; @@ -89,35 +85,32 @@ define float @sqrt_div_fast_ftz(float %a, float %b) #0 #1 { ; CHECK-NEXT: div.approx.ftz.f32 %r4, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; - %t1 = tail call float @llvm.sqrt.f32(float %a) - %t2 = fdiv float %t1, %b + %t1 = tail call afn float @llvm.sqrt.f32(float %a) + %t2 = fdiv afn float %t1, %b ret float %t2 } -define float @sqrt_div_fast_ftz_ninf(float %a, float %b) #0 #1 { +define float @sqrt_div_fast_ftz_ninf(float %a, float %b) #1 { ; CHECK-LABEL: sqrt_div_fast_ftz_ninf( ; CHECK: { -; CHECK-NEXT: .reg .pred %p<2>; -; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [sqrt_div_fast_ftz_ninf_param_0]; -; CHECK-NEXT: setp.eq.ftz.f32 %p1, %r1, 0f00000000; ; CHECK-NEXT: sqrt.approx.ftz.f32 %r2, %r1; -; CHECK-NEXT: selp.f32 %r3, 0f00000000, %r2, %p1; -; CHECK-NEXT: ld.param.b32 %r4, [sqrt_div_fast_ftz_ninf_param_1]; -; CHECK-NEXT: div.approx.ftz.f32 %r5, %r3, %r4; -; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ld.param.b32 %r3, [sqrt_div_fast_ftz_ninf_param_1]; +; CHECK-NEXT: div.approx.ftz.f32 %r4, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %t1 = tail call ninf afn float @llvm.sqrt.f32(float %a) - %t2 = fdiv float %t1, %b + %t2 = fdiv afn float %t1, %b ret float %t2 } ; There are no fast-math or ftz versions of sqrt and div 
for f64. We use ; reciprocal(rsqrt(x)) for sqrt(x), and emit a vanilla divide. -define double @sqrt_div_fast_ftz_f64(double %a, double %b) #0 #1 { +define double @sqrt_div_fast_ftz_f64(double %a, double %b) #1 { ; CHECK-LABEL: sqrt_div_fast_ftz_f64( ; CHECK: { ; CHECK-NEXT: .reg .b64 %rd<5>; @@ -134,22 +127,17 @@ define double @sqrt_div_fast_ftz_f64(double %a, double %b) #0 #1 { ret double %t2 } -define double @sqrt_div_fast_ftz_f64_ninf(double %a, double %b) #0 #1 { +define double @sqrt_div_fast_ftz_f64_ninf(double %a, double %b) #1 { ; CHECK-LABEL: sqrt_div_fast_ftz_f64_ninf( ; CHECK: { -; CHECK-NEXT: .reg .pred %p<2>; -; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [sqrt_div_fast_ftz_f64_ninf_param_0]; -; CHECK-NEXT: abs.f64 %rd2, %rd1; -; CHECK-NEXT: setp.lt.f64 %p1, %rd2, 0d0010000000000000; -; CHECK-NEXT: rsqrt.approx.f64 %rd3, %rd1; -; CHECK-NEXT: rcp.approx.ftz.f64 %rd4, %rd3; -; CHECK-NEXT: selp.f64 %rd5, 0d0000000000000000, %rd4, %p1; -; CHECK-NEXT: ld.param.b64 %rd6, [sqrt_div_fast_ftz_f64_ninf_param_1]; -; CHECK-NEXT: div.rn.f64 %rd7, %rd5, %rd6; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd7; +; CHECK-NEXT: sqrt.rn.f64 %rd2, %rd1; +; CHECK-NEXT: ld.param.b64 %rd3, [sqrt_div_fast_ftz_f64_ninf_param_1]; +; CHECK-NEXT: div.rn.f64 %rd4, %rd2, %rd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-NEXT: ret; %t1 = tail call ninf afn double @llvm.sqrt.f64(double %a) %t2 = fdiv double %t1, %b @@ -172,7 +160,7 @@ define float @rsqrt(float %a) { ret float %ret } -define float @rsqrt_fast(float %a) #0 { +define float @rsqrt_fast(float %a) { ; CHECK-LABEL: rsqrt_fast( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; @@ -182,12 +170,12 @@ define float @rsqrt_fast(float %a) #0 { ; CHECK-NEXT: rsqrt.approx.f32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; - %b = tail call float @llvm.sqrt.f32(float %a) - %ret = fdiv float 1.0, %b + %b = tail call afn float @llvm.sqrt.f32(float %a) + %ret = fdiv afn float 1.0, %b ret float %ret } -define float @rsqrt_fast_ftz(float %a) #0 #1 { +define float @rsqrt_fast_ftz(float %a) #1 { ; CHECK-LABEL: rsqrt_fast_ftz( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; @@ -197,8 +185,8 @@ define float @rsqrt_fast_ftz(float %a) #0 #1 { ; CHECK-NEXT: rsqrt.approx.ftz.f32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; - %b = tail call float @llvm.sqrt.f32(float %a) - %ret = fdiv float 1.0, %b + %b = tail call afn float @llvm.sqrt.f32(float %a) + %ret = fdiv afn float 1.0, %b ret float %ret } @@ -263,35 +251,7 @@ define float @fcos_approx_afn(float %a) { ret float %r } -define float @fsin_approx(float %a) #0 { -; CHECK-LABEL: fsin_approx( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [fsin_approx_param_0]; -; CHECK-NEXT: sin.approx.f32 %r2, %r1; -; CHECK-NEXT: st.param.b32 [func_retval0], %r2; -; CHECK-NEXT: ret; - %r = tail call float @llvm.sin.f32(float %a) - ret float %r -} - -define float @fcos_approx(float %a) #0 { -; CHECK-LABEL: fcos_approx( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [fcos_approx_param_0]; -; CHECK-NEXT: cos.approx.f32 %r2, %r1; -; CHECK-NEXT: st.param.b32 [func_retval0], %r2; -; CHECK-NEXT: ret; - %r = tail call float @llvm.cos.f32(float %a) - ret float %r -} - -define float @fsin_approx_ftz(float %a) #0 #1 { +define float @fsin_approx_ftz(float %a) 
#1 { ; CHECK-LABEL: fsin_approx_ftz( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; @@ -301,11 +261,11 @@ define float @fsin_approx_ftz(float %a) #0 #1 { ; CHECK-NEXT: sin.approx.ftz.f32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; - %r = tail call float @llvm.sin.f32(float %a) + %r = tail call afn float @llvm.sin.f32(float %a) ret float %r } -define float @fcos_approx_ftz(float %a) #0 #1 { +define float @fcos_approx_ftz(float %a) #1 { ; CHECK-LABEL: fcos_approx_ftz( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; @@ -315,7 +275,7 @@ define float @fcos_approx_ftz(float %a) #0 #1 { ; CHECK-NEXT: cos.approx.ftz.f32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; - %r = tail call float @llvm.cos.f32(float %a) + %r = tail call afn float @llvm.cos.f32(float %a) ret float %r } @@ -423,7 +383,7 @@ define float @repeated_div_recip_allowed_ftz_sel(i1 %pred, float %a, float %b, f ret float %w } -define float @repeated_div_fast(i1 %pred, float %a, float %b, float %divisor) #0 { +define float @repeated_div_fast(i1 %pred, float %a, float %b, float %divisor) { ; CHECK-LABEL: repeated_div_fast( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; @@ -444,14 +404,14 @@ define float @repeated_div_fast(i1 %pred, float %a, float %b, float %divisor) #0 ; CHECK-NEXT: selp.f32 %r8, %r7, %r6, %p1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-NEXT: ret; - %x = fdiv float %a, %divisor - %y = fdiv float %b, %divisor - %z = fmul float %x, %y + %x = fdiv afn arcp float %a, %divisor + %y = fdiv afn arcp contract float %b, %divisor + %z = fmul contract float %x, %y %w = select i1 %pred, float %z, float %y ret float %w } -define float @repeated_div_fast_sel(i1 %pred, float %a, float %b, float %divisor) #0 { +define float @repeated_div_fast_sel(i1 %pred, float %a, float %b, float %divisor) { ; CHECK-LABEL: repeated_div_fast_sel( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; @@ -469,13 +429,13 @@ define float @repeated_div_fast_sel(i1 %pred, float %a, float %b, float %divisor ; CHECK-NEXT: div.approx.f32 %r5, %r3, %r4; ; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; - %x = fdiv float %a, %divisor - %y = fdiv float %b, %divisor + %x = fdiv afn float %a, %divisor + %y = fdiv afn float %b, %divisor %w = select i1 %pred, float %x, float %y ret float %w } -define float @repeated_div_fast_ftz(i1 %pred, float %a, float %b, float %divisor) #0 #1 { +define float @repeated_div_fast_ftz(i1 %pred, float %a, float %b, float %divisor) #1 { ; CHECK-LABEL: repeated_div_fast_ftz( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; @@ -496,14 +456,14 @@ define float @repeated_div_fast_ftz(i1 %pred, float %a, float %b, float %divisor ; CHECK-NEXT: selp.f32 %r8, %r7, %r6, %p1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-NEXT: ret; - %x = fdiv float %a, %divisor - %y = fdiv float %b, %divisor - %z = fmul float %x, %y + %x = fdiv afn arcp float %a, %divisor + %y = fdiv afn arcp contract float %b, %divisor + %z = fmul contract float %x, %y %w = select i1 %pred, float %z, float %y ret float %w } -define float @repeated_div_fast_ftz_sel(i1 %pred, float %a, float %b, float %divisor) #0 #1 { +define float @repeated_div_fast_ftz_sel(i1 %pred, float %a, float %b, float %divisor) #1 { ; CHECK-LABEL: repeated_div_fast_ftz_sel( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; @@ -521,13 +481,13 @@ define float @repeated_div_fast_ftz_sel(i1 %pred, float %a, float %b, float %div ; CHECK-NEXT: div.approx.ftz.f32 %r5, %r3, %r4; ; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; 
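; NOTE (editorial sketch, not part of this patch): the rewrite visible across
; fast-math.ll here is representative of the whole diff: the blanket
; "unsafe-fp-math" = "true" function attribute (attribute group #0, whose
; definition is deleted at the bottom of this file) is replaced by
; per-instruction fast-math flags (afn, arcp, contract, ninf, reassoc, nnan,
; nsz), so each test states exactly which relaxation the checked lowering
; depends on. A minimal hypothetical example of the pattern (function and
; value names are placeholders, not taken from the tests above):
;
;   ; before: the approximate lowering was licensed only by the attribute
;   define float @f(float %a, float %b) #0 {
;     %q = fdiv float %a, %b        ; selected div.approx.f32 via #0
;     ret float %q
;   }
;   attributes #0 = { "unsafe-fp-math" = "true" }
;
;   ; after: the same license carried by flags on the instruction itself
;   define float @f(float %a, float %b) {
;     %q = fdiv afn float %a, %b    ; afn alone permits div.approx.f32,
;     ret float %q                  ; matching repeated_div_fast_sel above
;   }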
- %x = fdiv float %a, %divisor - %y = fdiv float %b, %divisor + %x = fdiv afn float %a, %divisor + %y = fdiv afn float %b, %divisor %w = select i1 %pred, float %x, float %y ret float %w } -define float @frem(float %a, float %b) #0 { +define float @frem(float %a, float %b) { ; CHECK-LABEL: frem( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<7>; @@ -541,11 +501,11 @@ define float @frem(float %a, float %b) #0 { ; CHECK-NEXT: fma.rn.f32 %r6, %r5, %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; - %rem = frem float %a, %b + %rem = frem afn arcp contract ninf float %a, %b ret float %rem } -define float @frem_ftz(float %a, float %b) #0 #1 { +define float @frem_ftz(float %a, float %b) #1 { ; CHECK-LABEL: frem_ftz( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<7>; @@ -559,11 +519,11 @@ define float @frem_ftz(float %a, float %b) #0 #1 { ; CHECK-NEXT: fma.rn.ftz.f32 %r6, %r5, %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; - %rem = frem float %a, %b + %rem = frem afn contract ninf float %a, %b ret float %rem } -define double @frem_f64(double %a, double %b) #0 { +define double @frem_f64(double %a, double %b) { ; CHECK-LABEL: frem_f64( ; CHECK: { ; CHECK-NEXT: .reg .b64 %rd<7>; @@ -577,9 +537,8 @@ define double @frem_f64(double %a, double %b) #0 { ; CHECK-NEXT: fma.rn.f64 %rd6, %rd5, %rd2, %rd1; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd6; ; CHECK-NEXT: ret; - %rem = frem double %a, %b + %rem = frem ninf double %a, %b ret double %rem } -attributes #0 = { "unsafe-fp-math" = "true" } attributes #1 = { "denormal-fp-math-f32" = "preserve-sign" } diff --git a/llvm/test/CodeGen/NVPTX/fence-cluster.ll b/llvm/test/CodeGen/NVPTX/fence-cluster.ll index 1683ec1..edaf8de 100644 --- a/llvm/test/CodeGen/NVPTX/fence-cluster.ll +++ b/llvm/test/CodeGen/NVPTX/fence-cluster.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.7 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} define void @fence_acquire_cluster() { ; SM90-LABEL: fence_acquire_cluster( diff --git a/llvm/test/CodeGen/NVPTX/fence-nocluster.ll b/llvm/test/CodeGen/NVPTX/fence-nocluster.ll index 1c6c174..20f1df4 100644 --- a/llvm/test/CodeGen/NVPTX/fence-nocluster.ll +++ b/llvm/test/CodeGen/NVPTX/fence-nocluster.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -mattr=+ptx50 | FileCheck %s --check-prefix=SM30 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_35 -mattr=+ptx50 | %ptxas-verify -arch=sm_35 %} +; RUN: %if ptxas-sm_35 && ptxas-isa-5.0 %{ llc < %s -march=nvptx64 -mcpu=sm_35 -mattr=+ptx50 | %ptxas-verify -arch=sm_35 %} ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70 -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.0 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %} ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} +; RUN: %if 
ptxas-sm_90 && ptxas-isa-8.7 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} define void @fence_acquire_sys() { ; SM30-LABEL: fence_acquire_sys( diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll index dde983d..636280d 100644 --- a/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | %ptxas-verify -arch=sm_90 %} ; CHECK-LABEL: test_fence_proxy_tensormap_generic_release define void @test_fence_proxy_tensormap_generic_release() { diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll index 391aa45..d9e82cc 100644 --- a/llvm/test/CodeGen/NVPTX/fexp2.ll +++ b/llvm/test/CodeGen/NVPTX/fexp2.ll @@ -2,9 +2,9 @@ ; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s ; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s ; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK-BF16 %s -; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %} -; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} -; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %} +; RUN: %if ptxas-sm_75 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" ; --- f32 --- diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll index acac5a8..4aafc98 100644 --- a/llvm/test/CodeGen/NVPTX/flog2.ll +++ b/llvm/test/CodeGen/NVPTX/flog2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 -nvptx-approx-log2f32 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 -nvptx-approx-log2f32 | %ptxas-verify -arch=sm_50 %} +; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 -nvptx-approx-log2f32 | %ptxas-verify -arch=sm_50 %} target triple = "nvptx64-nvidia-cuda" ; CHECK-LABEL: log2_test diff --git a/llvm/test/CodeGen/NVPTX/fma-assoc.ll b/llvm/test/CodeGen/NVPTX/fma-assoc.ll index 1034c3ee..6693c90 100644 --- a/llvm/test/CodeGen/NVPTX/fma-assoc.ll +++ b/llvm/test/CodeGen/NVPTX/fma-assoc.ll @@ -20,10 +20,10 @@ define ptx_device float @t1_f32(float %x, float %y, float %z, ; CHECK-UNSAFE-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-UNSAFE-NEXT: ret; float %u, float %v) { - %a = fmul float %x, %y - %b = fmul float %u, %v - %c = fadd float %a, %b - %d = fadd float %c, %z + %a = fmul reassoc float %x, %y + %b = fmul reassoc float %u, %v + %c = fadd reassoc float %a, %b + %d = fadd reassoc float %c, %z ret float %d } @@ -43,10 +43,10 @@ define ptx_device double @t1_f64(double %x, double %y, double %z, ; CHECK-UNSAFE-NEXT: st.param.b64 [func_retval0], %rd7; ; CHECK-UNSAFE-NEXT: ret; double %u, double 
%v) { - %a = fmul double %x, %y - %b = fmul double %u, %v - %c = fadd double %a, %b - %d = fadd double %c, %z + %a = fmul reassoc double %x, %y + %b = fmul reassoc double %u, %v + %c = fadd reassoc double %a, %b + %d = fadd reassoc double %c, %z ret double %d } diff --git a/llvm/test/CodeGen/NVPTX/fma-disable.ll b/llvm/test/CodeGen/NVPTX/fma-disable.ll index 0038b4b..e94192b 100644 --- a/llvm/test/CodeGen/NVPTX/fma-disable.ll +++ b/llvm/test/CodeGen/NVPTX/fma-disable.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | FileCheck %s -check-prefix=MUL ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -nvptx-fma-level=1 | FileCheck %s -check-prefix=FMA ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -nvptx-fma-level=0 | FileCheck %s -check-prefix=MUL -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -nvptx-fma-level=1 | %ptxas-verify %} -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -nvptx-fma-level=1 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -nvptx-fma-level=1 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -nvptx-fma-level=0 | %ptxas-verify %} diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll index 2f1d7d6..6d983ba 100644 --- a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll +++ b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll @@ -9,7 +9,7 @@ ; SM < 80 or (which needs PTX version >= 70) should not emit fma{.ftz}.relu ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 | FileCheck %s --check-prefixes=CHECK-SM70 -define half @fma_f16_no_nans(half %a, half %b, half %c) #0 { +define half @fma_f16_no_nans(half %a, half %b, half %c) { ; CHECK-LABEL: fma_f16_no_nans( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<5>; @@ -49,14 +49,14 @@ define half @fma_f16_no_nans(half %a, half %b, half %c) #0 { ; CHECK-SM70-NEXT: selp.b16 %rs6, %rs4, 0x0000, %p1; ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs6; ; CHECK-SM70-NEXT: ret; - %1 = call half @llvm.fma.f16(half %a, half %b, half %c) + %1 = call nnan half @llvm.fma.f16(half %a, half %b, half %c) %2 = fcmp ogt half %1, 0.0 - %3 = select i1 %2, half %1, half 0.0 + %3 = select nsz i1 %2, half %1, half 0.0 ret half %3 } ; FMA relu shouldn't be selected if the FMA operation has multiple uses -define half @fma_f16_no_nans_multiple_uses_of_fma(half %a, half %b, half %c) #0 { +define half @fma_f16_no_nans_multiple_uses_of_fma(half %a, half %b, half %c) { ; CHECK-LABEL: fma_f16_no_nans_multiple_uses_of_fma( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<8>; @@ -103,13 +103,13 @@ define half @fma_f16_no_nans_multiple_uses_of_fma(half %a, half %b, half %c) #0 ; CHECK-SM70-NEXT: ret; %1 = call half @llvm.fma.f16(half %a, half %b, half %c) %2 = fcmp ogt half %1, 0.0 - %3 = select i1 %2, half %1, half 0.0 - %4 = fadd half %1, 7.0 - %5 = fadd half %4, %1 + %3 = select i1 %2, half %1, half 0.0 + %4 = fadd contract half %1, 7.0 + %5 = fadd contract half %4, %1 ret half %5 } -define half @fma_f16_maxnum_no_nans(half %a, half %b, half %c) #0 { +define half @fma_f16_maxnum_no_nans(half %a, half %b, half %c) { ; CHECK-LABEL: fma_f16_maxnum_no_nans( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<5>; @@ -149,12 +149,12 @@ define half @fma_f16_maxnum_no_nans(half %a, half %b, half %c) #0 { ; 
CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs5, %r2; ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs5; ; CHECK-SM70-NEXT: ret; - %1 = call half @llvm.fma.f16(half %a, half %b, half %c) - %2 = call half @llvm.maxnum.f16(half %1, half 0.0) + %1 = call nnan half @llvm.fma.f16(half %a, half %b, half %c) + %2 = call nsz half @llvm.maxnum.f16(half %1, half 0.0) ret half %2 } -define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { +define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) { ; CHECK-LABEL: fma_bf16_no_nans( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<5>; @@ -205,14 +205,14 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2; ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2; ; CHECK-SM70-NEXT: ret; - %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) + %1 = call nnan bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) %2 = fcmp ogt bfloat %1, 0.0 - %3 = select i1 %2, bfloat %1, bfloat 0.0 + %3 = select nsz i1 %2, bfloat %1, bfloat 0.0 ret bfloat %3 } ; FMA_relu shouldn't be selected if the FMA operation has multiple uses -define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) #0 { +define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) { ; CHECK-LABEL: fma_bf16_no_nans_multiple_uses_of_fma( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<9>; @@ -291,12 +291,12 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) %2 = fcmp ogt bfloat %1, 0.0 %3 = select i1 %2, bfloat %1, bfloat 0.0 - %4 = fadd bfloat %1, 7.0 - %5 = fadd bfloat %4, %1 + %4 = fadd contract bfloat %1, 7.0 + %5 = fadd contract bfloat %4, %1 ret bfloat %5 } -define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { +define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) { ; CHECK-LABEL: fma_bf16_maxnum_no_nans( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<5>; @@ -351,12 +351,12 @@ define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-SM70-NEXT: shr.u32 %r20, %r19, 16; ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %r20; ; CHECK-SM70-NEXT: ret; - %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) - %2 = call bfloat @llvm.maxnum.bf16(bfloat %1, bfloat 0.0) + %1 = call nnan bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) + %2 = call nsz bfloat @llvm.maxnum.bf16(bfloat %1, bfloat 0.0) ret bfloat %2 } -define <2 x half> @fma_f16x2_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { +define <2 x half> @fma_f16x2_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) { ; CHECK-LABEL: fma_f16x2_no_nans( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<5>; @@ -399,14 +399,14 @@ define <2 x half> @fma_f16x2_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c ; CHECK-SM70-NEXT: selp.b16 %rs4, %rs1, 0x0000, %p1; ; CHECK-SM70-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; ; CHECK-SM70-NEXT: ret; - %1 = call <2 x half> @llvm.fma.f16x2(<2 x half> %a, <2 x half> %b, <2 x half> %c) + %1 = call nnan <2 x half> @llvm.fma.f16x2(<2 x half> %a, <2 x half> %b, <2 x half> %c) %2 = fcmp ogt <2 x half> %1, <half 0.0, half 0.0> - %3 = select <2 x i1> %2, <2 x half> %1, <2 x half> <half 0.0, half 0.0> + %3 = select nsz <2 x i1> %2, <2 x half> %1, <2 x half> <half 0.0, half 0.0> ret <2 x half> %3 } ; FMA relu shouldn't be selected if the FMA operation has multiple uses -define <2 x half> 
@fma_f16x2_no_nans_multiple_uses_of_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { +define <2 x half> @fma_f16x2_no_nans_multiple_uses_of_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) { ; CHECK-LABEL: fma_f16x2_no_nans_multiple_uses_of_fma( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<8>; @@ -454,12 +454,12 @@ define <2 x half> @fma_f16x2_no_nans_multiple_uses_of_fma(<2 x half> %a, <2 x ha %1 = call <2 x half> @llvm.fma.f16x2(<2 x half> %a, <2 x half> %b, <2 x half> %c) %2 = fcmp ogt <2 x half> %1, <half 0.0, half 0.0> %3 = select <2 x i1> %2, <2 x half> %1, <2 x half> <half 0.0, half 0.0> - %4 = fadd <2 x half> %1, <half 7.0, half 7.0> - %5 = fadd <2 x half> %4, %1 + %4 = fadd contract <2 x half> %1, <half 7.0, half 7.0> + %5 = fadd contract <2 x half> %4, %1 ret <2 x half> %5 } -define <2 x half> @fma_f16x2_maxnum_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { +define <2 x half> @fma_f16x2_maxnum_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) { ; CHECK-LABEL: fma_f16x2_maxnum_no_nans( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<5>; @@ -504,12 +504,12 @@ define <2 x half> @fma_f16x2_maxnum_no_nans(<2 x half> %a, <2 x half> %b, <2 x h ; CHECK-SM70-NEXT: mov.b32 %r9, {%rs4, %rs3}; ; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r9; ; CHECK-SM70-NEXT: ret; - %1 = call <2 x half> @llvm.fma.f16x2(<2 x half> %a, <2 x half> %b, <2 x half> %c) - %2 = call <2 x half> @llvm.maxnum.f16x2(<2 x half> %1, <2 x half> <half 0.0, half 0.0>) + %1 = call nnan <2 x half> @llvm.fma.f16x2(<2 x half> %a, <2 x half> %b, <2 x half> %c) + %2 = call nsz <2 x half> @llvm.maxnum.f16x2(<2 x half> %1, <2 x half> <half 0.0, half 0.0>) ret <2 x half> %2 } -define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 { +define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { ; CHECK-LABEL: fma_bf16x2_no_nans( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<5>; @@ -580,14 +580,14 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b ; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3; ; CHECK-SM70-NEXT: st.param.v2.b16 [func_retval0], {%rs10, %rs9}; ; CHECK-SM70-NEXT: ret; - %1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) + %1 = call nnan <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) %2 = fcmp ogt <2 x bfloat> %1, <bfloat 0.0, bfloat 0.0> - %3 = select <2 x i1> %2, <2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0> + %3 = select nsz <2 x i1> %2, <2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0> ret <2 x bfloat> %3 } ; FMA_relu shouldn't be selected if the FMA operation has multiple uses -define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 { +define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { ; CHECK-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<9>; @@ -707,12 +707,12 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 %1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) %2 = fcmp ogt <2 x bfloat> %1, <bfloat 0.0, bfloat 0.0> %3 = select <2 x i1> %2, <2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0> - %4 = fadd <2 x bfloat> %1, <bfloat 7.0, bfloat 7.0> - %5 = fadd <2 x bfloat> %4, %1 + %4 = fadd contract <2 x bfloat> %1, <bfloat 7.0, bfloat 7.0> + %5 = fadd contract <2 x bfloat> %4, %1 ret <2 x 
bfloat> %5 } -define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 { +define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { ; CHECK-LABEL: fma_bf16x2_maxnum_no_nans( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<5>; @@ -792,10 +792,7 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, ; CHECK-SM70-NEXT: prmt.b32 %r39, %r38, %r31, 0x7632U; ; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r39; ; CHECK-SM70-NEXT: ret; - %1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) - %2 = call <2 x bfloat> @llvm.maxnum.bf16x2(<2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0>) + %1 = call nnan <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) + %2 = call nsz <2 x bfloat> @llvm.maxnum.bf16x2(<2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0>) ret <2 x bfloat> %2 } - -attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" } -attributes #1 = { "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/NVPTX/fma.ll b/llvm/test/CodeGen/NVPTX/fma.ll index 87274aa..ba4bb76 100644 --- a/llvm/test/CodeGen/NVPTX/fma.ll +++ b/llvm/test/CodeGen/NVPTX/fma.ll @@ -25,7 +25,7 @@ define ptx_device float @t1_f32(float %x, float %y, float %z) { define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) { ; CHECK-LABEL: t2_f32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [t2_f32_param_0]; @@ -72,7 +72,7 @@ define ptx_device double @t1_f64(double %x, double %y, double %z) { define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) { ; CHECK-LABEL: t2_f64( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-NEXT: .reg .b64 %rd<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [t2_f64_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll b/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll index a182152..96cdb76 100644 --- a/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s --check-prefixes=CHECK ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %} -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} ; ---- minimum ---- diff --git a/llvm/test/CodeGen/NVPTX/fns.ll b/llvm/test/CodeGen/NVPTX/fns.ll index b153e29..f003bc1 100644 --- a/llvm/test/CodeGen/NVPTX/fns.ll +++ b/llvm/test/CodeGen/NVPTX/fns.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} declare i32 @llvm.nvvm.fns(i32, i32, i32) diff --git a/llvm/test/CodeGen/NVPTX/fold-movs.ll b/llvm/test/CodeGen/NVPTX/fold-movs.ll index 6ee0fb2..10e31f5 100644 --- a/llvm/test/CodeGen/NVPTX/fold-movs.ll +++ b/llvm/test/CodeGen/NVPTX/fold-movs.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O3 -disable-post-ra \ ; RUN: -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck %s 
--check-prefixes=CHECK-F32X2 -; RUN: %if ptxas-12.7 %{ \ +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ \ ; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O3 -disable-post-ra \ ; RUN: -frame-pointer=all -verify-machineinstrs | %ptxas-verify -arch=sm_100 \ ; RUN: %} diff --git a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll index 636e12b..4f1454d 100644 --- a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll +++ b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll @@ -7,7 +7,6 @@ define i32 @test_ld_param_const(ptr byval(i32) %a) { ; CHECK-LABEL: test_ld_param_const( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_ld_param_const_param_0+4]; @@ -61,7 +60,6 @@ define void @test_ld_param_byval(ptr byval(i32) %a) { ; CHECK-LABEL: test_ld_param_byval( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 1, 0 @@ -98,8 +96,7 @@ define i32 @test_multi_block(ptr byval([10 x i32]) %a, i1 %p) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<3>; ; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b8 %rs1, [test_multi_block_param_1]; @@ -108,12 +105,12 @@ define i32 @test_multi_block(ptr byval([10 x i32]) %a, i1 %p) { ; CHECK-NEXT: not.pred %p2, %p1; ; CHECK-NEXT: @%p2 bra $L__BB5_2; ; CHECK-NEXT: // %bb.1: // %if -; CHECK-NEXT: ld.param.b32 %r4, [test_multi_block_param_0+4]; +; CHECK-NEXT: ld.param.b32 %r1, [test_multi_block_param_0+4]; ; CHECK-NEXT: bra.uni $L__BB5_3; ; CHECK-NEXT: $L__BB5_2: // %else -; CHECK-NEXT: ld.param.b32 %r4, [test_multi_block_param_0+8]; +; CHECK-NEXT: ld.param.b32 %r1, [test_multi_block_param_0+8]; ; CHECK-NEXT: $L__BB5_3: // %end -; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; br i1 %p, label %if, label %else if: diff --git a/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll b/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll index dc0ec0f..c4d4dfc 100644 --- a/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll +++ b/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,FAST ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s --check-prefixes=CHECK,DEFAULT -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | %ptxas-verify -arch sm_100 %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 | %ptxas-verify -arch sm_100 %} +; RUN: %if ptxas-sm_100 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | %ptxas-verify -arch sm_100 %} +; RUN: %if ptxas-sm_100 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 | %ptxas-verify -arch sm_100 %} target triple = "nvptx64-unknown-cuda" diff --git a/llvm/test/CodeGen/NVPTX/frameindex-lifetime.ll b/llvm/test/CodeGen/NVPTX/frameindex-lifetime.ll index 4265553..9c564ff 100644 --- a/llvm/test/CodeGen/NVPTX/frameindex-lifetime.ll +++ b/llvm/test/CodeGen/NVPTX/frameindex-lifetime.ll @@ -44,8 +44,8 @@ declare void @bar(ptr) define void @foo() { %p = alloca i32 - call void @llvm.lifetime.start(i64 4, ptr %p) + call void @llvm.lifetime.start(ptr %p) call void @bar(ptr %p) - call void 
@llvm.lifetime.end(i64 4, ptr %p) + call void @llvm.lifetime.end(ptr %p) ret void } diff --git a/llvm/test/CodeGen/NVPTX/frem.ll b/llvm/test/CodeGen/NVPTX/frem.ll index 5805aed..d30c72c 100644 --- a/llvm/test/CodeGen/NVPTX/frem.ll +++ b/llvm/test/CodeGen/NVPTX/frem.ll @@ -1,313 +1,316 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s --enable-unsafe-fp-math -mcpu=sm_60 | FileCheck %s --check-prefixes=FAST -; RUN: llc < %s -mcpu=sm_60 | FileCheck %s --check-prefixes=NORMAL +; RUN: llc < %s -mcpu=sm_60 | FileCheck %s target triple = "nvptx64-unknown-cuda" define half @frem_f16(half %a, half %b) { -; FAST-LABEL: frem_f16( -; FAST: { -; FAST-NEXT: .reg .b16 %rs<4>; -; FAST-NEXT: .reg .b32 %r<7>; -; FAST-EMPTY: -; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.b16 %rs1, [frem_f16_param_0]; -; FAST-NEXT: ld.param.b16 %rs2, [frem_f16_param_1]; -; FAST-NEXT: cvt.f32.f16 %r1, %rs2; -; FAST-NEXT: cvt.f32.f16 %r2, %rs1; -; FAST-NEXT: div.approx.f32 %r3, %r2, %r1; -; FAST-NEXT: cvt.rzi.f32.f32 %r4, %r3; -; FAST-NEXT: neg.f32 %r5, %r4; -; FAST-NEXT: fma.rn.f32 %r6, %r5, %r1, %r2; -; FAST-NEXT: cvt.rn.f16.f32 %rs3, %r6; -; FAST-NEXT: st.param.b16 [func_retval0], %rs3; -; FAST-NEXT: ret; -; -; NORMAL-LABEL: frem_f16( -; NORMAL: { -; NORMAL-NEXT: .reg .pred %p<2>; -; NORMAL-NEXT: .reg .b16 %rs<4>; -; NORMAL-NEXT: .reg .b32 %r<8>; -; NORMAL-EMPTY: -; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.b16 %rs1, [frem_f16_param_0]; -; NORMAL-NEXT: ld.param.b16 %rs2, [frem_f16_param_1]; -; NORMAL-NEXT: cvt.f32.f16 %r1, %rs2; -; NORMAL-NEXT: cvt.f32.f16 %r2, %rs1; -; NORMAL-NEXT: div.rn.f32 %r3, %r2, %r1; -; NORMAL-NEXT: cvt.rzi.f32.f32 %r4, %r3; -; NORMAL-NEXT: neg.f32 %r5, %r4; -; NORMAL-NEXT: fma.rn.f32 %r6, %r5, %r1, %r2; -; NORMAL-NEXT: testp.infinite.f32 %p1, %r1; -; NORMAL-NEXT: selp.f32 %r7, %r2, %r6, %p1; -; NORMAL-NEXT: cvt.rn.f16.f32 %rs3, %r7; -; NORMAL-NEXT: st.param.b16 [func_retval0], %rs3; -; NORMAL-NEXT: ret; +; CHECK-LABEL: frem_f16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [frem_f16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [frem_f16_param_1]; +; CHECK-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-NEXT: div.rn.f32 %r3, %r2, %r1; +; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r3; +; CHECK-NEXT: neg.f32 %r5, %r4; +; CHECK-NEXT: fma.rn.f32 %r6, %r5, %r1, %r2; +; CHECK-NEXT: testp.infinite.f32 %p1, %r1; +; CHECK-NEXT: selp.f32 %r7, %r2, %r6, %p1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r7; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: ret; %r = frem half %a, %b ret half %r } +define half @frem_f16_fast(half %a, half %b) { +; CHECK-LABEL: frem_f16_fast( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [frem_f16_fast_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [frem_f16_fast_param_1]; +; CHECK-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-NEXT: div.approx.f32 %r3, %r2, %r1; +; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r3; +; CHECK-NEXT: neg.f32 %r5, %r4; +; CHECK-NEXT: fma.rn.f32 %r6, %r5, %r1, %r2; +; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r6; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: ret; + %r = frem afn ninf half %a, %b + ret half %r +} + define float @frem_f32(float %a, float %b) { -; FAST-LABEL: frem_f32( -; 
FAST: { -; FAST-NEXT: .reg .b32 %r<7>; -; FAST-EMPTY: -; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.b32 %r1, [frem_f32_param_0]; -; FAST-NEXT: ld.param.b32 %r2, [frem_f32_param_1]; -; FAST-NEXT: div.approx.f32 %r3, %r1, %r2; -; FAST-NEXT: cvt.rzi.f32.f32 %r4, %r3; -; FAST-NEXT: neg.f32 %r5, %r4; -; FAST-NEXT: fma.rn.f32 %r6, %r5, %r2, %r1; -; FAST-NEXT: st.param.b32 [func_retval0], %r6; -; FAST-NEXT: ret; -; -; NORMAL-LABEL: frem_f32( -; NORMAL: { -; NORMAL-NEXT: .reg .pred %p<2>; -; NORMAL-NEXT: .reg .b32 %r<8>; -; NORMAL-EMPTY: -; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.b32 %r1, [frem_f32_param_0]; -; NORMAL-NEXT: ld.param.b32 %r2, [frem_f32_param_1]; -; NORMAL-NEXT: div.rn.f32 %r3, %r1, %r2; -; NORMAL-NEXT: cvt.rzi.f32.f32 %r4, %r3; -; NORMAL-NEXT: neg.f32 %r5, %r4; -; NORMAL-NEXT: fma.rn.f32 %r6, %r5, %r2, %r1; -; NORMAL-NEXT: testp.infinite.f32 %p1, %r2; -; NORMAL-NEXT: selp.f32 %r7, %r1, %r6, %p1; -; NORMAL-NEXT: st.param.b32 [func_retval0], %r7; -; NORMAL-NEXT: ret; +; CHECK-LABEL: frem_f32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [frem_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [frem_f32_param_1]; +; CHECK-NEXT: div.rn.f32 %r3, %r1, %r2; +; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r3; +; CHECK-NEXT: neg.f32 %r5, %r4; +; CHECK-NEXT: fma.rn.f32 %r6, %r5, %r2, %r1; +; CHECK-NEXT: testp.infinite.f32 %p1, %r2; +; CHECK-NEXT: selp.f32 %r7, %r1, %r6, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-NEXT: ret; %r = frem float %a, %b ret float %r } +define float @frem_f32_fast(float %a, float %b) { +; CHECK-LABEL: frem_f32_fast( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [frem_f32_fast_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [frem_f32_fast_param_1]; +; CHECK-NEXT: div.approx.f32 %r3, %r1, %r2; +; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r3; +; CHECK-NEXT: neg.f32 %r5, %r4; +; CHECK-NEXT: fma.rn.f32 %r6, %r5, %r2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %r = frem afn ninf float %a, %b + ret float %r +} + define double @frem_f64(double %a, double %b) { -; FAST-LABEL: frem_f64( -; FAST: { -; FAST-NEXT: .reg .b64 %rd<7>; -; FAST-EMPTY: -; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.b64 %rd1, [frem_f64_param_0]; -; FAST-NEXT: ld.param.b64 %rd2, [frem_f64_param_1]; -; FAST-NEXT: div.rn.f64 %rd3, %rd1, %rd2; -; FAST-NEXT: cvt.rzi.f64.f64 %rd4, %rd3; -; FAST-NEXT: neg.f64 %rd5, %rd4; -; FAST-NEXT: fma.rn.f64 %rd6, %rd5, %rd2, %rd1; -; FAST-NEXT: st.param.b64 [func_retval0], %rd6; -; FAST-NEXT: ret; -; -; NORMAL-LABEL: frem_f64( -; NORMAL: { -; NORMAL-NEXT: .reg .pred %p<2>; -; NORMAL-NEXT: .reg .b64 %rd<8>; -; NORMAL-EMPTY: -; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.b64 %rd1, [frem_f64_param_0]; -; NORMAL-NEXT: ld.param.b64 %rd2, [frem_f64_param_1]; -; NORMAL-NEXT: div.rn.f64 %rd3, %rd1, %rd2; -; NORMAL-NEXT: cvt.rzi.f64.f64 %rd4, %rd3; -; NORMAL-NEXT: neg.f64 %rd5, %rd4; -; NORMAL-NEXT: fma.rn.f64 %rd6, %rd5, %rd2, %rd1; -; NORMAL-NEXT: testp.infinite.f64 %p1, %rd2; -; NORMAL-NEXT: selp.f64 %rd7, %rd1, %rd6, %p1; -; NORMAL-NEXT: st.param.b64 [func_retval0], %rd7; -; NORMAL-NEXT: ret; +; CHECK-LABEL: frem_f64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [frem_f64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [frem_f64_param_1]; +; CHECK-NEXT: 
div.rn.f64 %rd3, %rd1, %rd2; +; CHECK-NEXT: cvt.rzi.f64.f64 %rd4, %rd3; +; CHECK-NEXT: neg.f64 %rd5, %rd4; +; CHECK-NEXT: fma.rn.f64 %rd6, %rd5, %rd2, %rd1; +; CHECK-NEXT: testp.infinite.f64 %p1, %rd2; +; CHECK-NEXT: selp.f64 %rd7, %rd1, %rd6, %p1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd7; +; CHECK-NEXT: ret; %r = frem double %a, %b ret double %r } +define double @frem_f64_fast(double %a, double %b) { +; CHECK-LABEL: frem_f64_fast( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [frem_f64_fast_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [frem_f64_fast_param_1]; +; CHECK-NEXT: div.rn.f64 %rd3, %rd1, %rd2; +; CHECK-NEXT: cvt.rzi.f64.f64 %rd4, %rd3; +; CHECK-NEXT: neg.f64 %rd5, %rd4; +; CHECK-NEXT: fma.rn.f64 %rd6, %rd5, %rd2, %rd1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd6; +; CHECK-NEXT: ret; + %r = frem afn ninf double %a, %b + ret double %r +} + define half @frem_f16_ninf(half %a, half %b) { -; FAST-LABEL: frem_f16_ninf( -; FAST: { -; FAST-NEXT: .reg .b16 %rs<4>; -; FAST-NEXT: .reg .b32 %r<7>; -; FAST-EMPTY: -; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.b16 %rs1, [frem_f16_ninf_param_0]; -; FAST-NEXT: ld.param.b16 %rs2, [frem_f16_ninf_param_1]; -; FAST-NEXT: cvt.f32.f16 %r1, %rs2; -; FAST-NEXT: cvt.f32.f16 %r2, %rs1; -; FAST-NEXT: div.approx.f32 %r3, %r2, %r1; -; FAST-NEXT: cvt.rzi.f32.f32 %r4, %r3; -; FAST-NEXT: neg.f32 %r5, %r4; -; FAST-NEXT: fma.rn.f32 %r6, %r5, %r1, %r2; -; FAST-NEXT: cvt.rn.f16.f32 %rs3, %r6; -; FAST-NEXT: st.param.b16 [func_retval0], %rs3; -; FAST-NEXT: ret; -; -; NORMAL-LABEL: frem_f16_ninf( -; NORMAL: { -; NORMAL-NEXT: .reg .b16 %rs<4>; -; NORMAL-NEXT: .reg .b32 %r<7>; -; NORMAL-EMPTY: -; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.b16 %rs1, [frem_f16_ninf_param_0]; -; NORMAL-NEXT: ld.param.b16 %rs2, [frem_f16_ninf_param_1]; -; NORMAL-NEXT: cvt.f32.f16 %r1, %rs2; -; NORMAL-NEXT: cvt.f32.f16 %r2, %rs1; -; NORMAL-NEXT: div.rn.f32 %r3, %r2, %r1; -; NORMAL-NEXT: cvt.rzi.f32.f32 %r4, %r3; -; NORMAL-NEXT: neg.f32 %r5, %r4; -; NORMAL-NEXT: fma.rn.f32 %r6, %r5, %r1, %r2; -; NORMAL-NEXT: cvt.rn.f16.f32 %rs3, %r6; -; NORMAL-NEXT: st.param.b16 [func_retval0], %rs3; -; NORMAL-NEXT: ret; +; CHECK-LABEL: frem_f16_ninf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [frem_f16_ninf_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [frem_f16_ninf_param_1]; +; CHECK-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-NEXT: div.rn.f32 %r3, %r2, %r1; +; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r3; +; CHECK-NEXT: neg.f32 %r5, %r4; +; CHECK-NEXT: fma.rn.f32 %r6, %r5, %r1, %r2; +; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r6; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: ret; %r = frem ninf half %a, %b ret half %r } +define half @frem_f16_ninf_fast(half %a, half %b) { +; CHECK-LABEL: frem_f16_ninf_fast( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [frem_f16_ninf_fast_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [frem_f16_ninf_fast_param_1]; +; CHECK-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-NEXT: div.approx.f32 %r3, %r2, %r1; +; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r3; +; CHECK-NEXT: neg.f32 %r5, %r4; +; CHECK-NEXT: fma.rn.f32 %r6, %r5, %r1, %r2; +; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r6; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; 
+; CHECK-NEXT: ret; + %r = frem afn ninf half %a, %b + ret half %r +} + define float @frem_f32_ninf(float %a, float %b) { -; FAST-LABEL: frem_f32_ninf( -; FAST: { -; FAST-NEXT: .reg .b32 %r<7>; -; FAST-EMPTY: -; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.b32 %r1, [frem_f32_ninf_param_0]; -; FAST-NEXT: ld.param.b32 %r2, [frem_f32_ninf_param_1]; -; FAST-NEXT: div.approx.f32 %r3, %r1, %r2; -; FAST-NEXT: cvt.rzi.f32.f32 %r4, %r3; -; FAST-NEXT: neg.f32 %r5, %r4; -; FAST-NEXT: fma.rn.f32 %r6, %r5, %r2, %r1; -; FAST-NEXT: st.param.b32 [func_retval0], %r6; -; FAST-NEXT: ret; -; -; NORMAL-LABEL: frem_f32_ninf( -; NORMAL: { -; NORMAL-NEXT: .reg .b32 %r<7>; -; NORMAL-EMPTY: -; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.b32 %r1, [frem_f32_ninf_param_0]; -; NORMAL-NEXT: ld.param.b32 %r2, [frem_f32_ninf_param_1]; -; NORMAL-NEXT: div.rn.f32 %r3, %r1, %r2; -; NORMAL-NEXT: cvt.rzi.f32.f32 %r4, %r3; -; NORMAL-NEXT: neg.f32 %r5, %r4; -; NORMAL-NEXT: fma.rn.f32 %r6, %r5, %r2, %r1; -; NORMAL-NEXT: st.param.b32 [func_retval0], %r6; -; NORMAL-NEXT: ret; +; CHECK-LABEL: frem_f32_ninf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [frem_f32_ninf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [frem_f32_ninf_param_1]; +; CHECK-NEXT: div.rn.f32 %r3, %r1, %r2; +; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r3; +; CHECK-NEXT: neg.f32 %r5, %r4; +; CHECK-NEXT: fma.rn.f32 %r6, %r5, %r2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; %r = frem ninf float %a, %b ret float %r } +define float @frem_f32_ninf_fast(float %a, float %b) { +; CHECK-LABEL: frem_f32_ninf_fast( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [frem_f32_ninf_fast_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [frem_f32_ninf_fast_param_1]; +; CHECK-NEXT: div.approx.f32 %r3, %r1, %r2; +; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r3; +; CHECK-NEXT: neg.f32 %r5, %r4; +; CHECK-NEXT: fma.rn.f32 %r6, %r5, %r2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %r = frem afn ninf float %a, %b + ret float %r +} + define double @frem_f64_ninf(double %a, double %b) { -; FAST-LABEL: frem_f64_ninf( -; FAST: { -; FAST-NEXT: .reg .b64 %rd<7>; -; FAST-EMPTY: -; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.b64 %rd1, [frem_f64_ninf_param_0]; -; FAST-NEXT: ld.param.b64 %rd2, [frem_f64_ninf_param_1]; -; FAST-NEXT: div.rn.f64 %rd3, %rd1, %rd2; -; FAST-NEXT: cvt.rzi.f64.f64 %rd4, %rd3; -; FAST-NEXT: neg.f64 %rd5, %rd4; -; FAST-NEXT: fma.rn.f64 %rd6, %rd5, %rd2, %rd1; -; FAST-NEXT: st.param.b64 [func_retval0], %rd6; -; FAST-NEXT: ret; -; -; NORMAL-LABEL: frem_f64_ninf( -; NORMAL: { -; NORMAL-NEXT: .reg .b64 %rd<7>; -; NORMAL-EMPTY: -; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.b64 %rd1, [frem_f64_ninf_param_0]; -; NORMAL-NEXT: ld.param.b64 %rd2, [frem_f64_ninf_param_1]; -; NORMAL-NEXT: div.rn.f64 %rd3, %rd1, %rd2; -; NORMAL-NEXT: cvt.rzi.f64.f64 %rd4, %rd3; -; NORMAL-NEXT: neg.f64 %rd5, %rd4; -; NORMAL-NEXT: fma.rn.f64 %rd6, %rd5, %rd2, %rd1; -; NORMAL-NEXT: st.param.b64 [func_retval0], %rd6; -; NORMAL-NEXT: ret; +; CHECK-LABEL: frem_f64_ninf( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [frem_f64_ninf_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [frem_f64_ninf_param_1]; +; CHECK-NEXT: div.rn.f64 %rd3, %rd1, %rd2; +; CHECK-NEXT: cvt.rzi.f64.f64 %rd4, %rd3; +; CHECK-NEXT: neg.f64 %rd5, %rd4; +; CHECK-NEXT: fma.rn.f64 %rd6, %rd5, 
%rd2, %rd1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd6; +; CHECK-NEXT: ret; %r = frem ninf double %a, %b ret double %r } +define double @frem_f64_ninf_fast(double %a, double %b) { +; CHECK-LABEL: frem_f64_ninf_fast( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [frem_f64_ninf_fast_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [frem_f64_ninf_fast_param_1]; +; CHECK-NEXT: div.rn.f64 %rd3, %rd1, %rd2; +; CHECK-NEXT: cvt.rzi.f64.f64 %rd4, %rd3; +; CHECK-NEXT: neg.f64 %rd5, %rd4; +; CHECK-NEXT: fma.rn.f64 %rd6, %rd5, %rd2, %rd1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd6; +; CHECK-NEXT: ret; + %r = frem afn ninf double %a, %b + ret double %r +} + define float @frem_f32_imm1_fast(float %a) { -; FAST-LABEL: frem_f32_imm1_fast( -; FAST: { -; FAST-NEXT: .reg .b32 %r<5>; -; FAST-EMPTY: -; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.b32 %r1, [frem_f32_imm1_fast_param_0]; -; FAST-NEXT: mul.f32 %r2, %r1, 0f3E124925; -; FAST-NEXT: cvt.rzi.f32.f32 %r3, %r2; -; FAST-NEXT: fma.rn.f32 %r4, %r3, 0fC0E00000, %r1; -; FAST-NEXT: st.param.b32 [func_retval0], %r4; -; FAST-NEXT: ret; -; -; NORMAL-LABEL: frem_f32_imm1_fast( -; NORMAL: { -; NORMAL-NEXT: .reg .b32 %r<5>; -; NORMAL-EMPTY: -; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.b32 %r1, [frem_f32_imm1_fast_param_0]; -; NORMAL-NEXT: mul.rn.f32 %r2, %r1, 0f3E124925; -; NORMAL-NEXT: cvt.rzi.f32.f32 %r3, %r2; -; NORMAL-NEXT: fma.rn.f32 %r4, %r3, 0fC0E00000, %r1; -; NORMAL-NEXT: st.param.b32 [func_retval0], %r4; -; NORMAL-NEXT: ret; +; CHECK-LABEL: frem_f32_imm1_fast( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [frem_f32_imm1_fast_param_0]; +; CHECK-NEXT: mul.rn.f32 %r2, %r1, 0f3E124925; +; CHECK-NEXT: cvt.rzi.f32.f32 %r3, %r2; +; CHECK-NEXT: fma.rn.f32 %r4, %r3, 0fC0E00000, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; %r = frem arcp float %a, 7.0 ret float %r } define float @frem_f32_imm1_normal(float %a) { -; FAST-LABEL: frem_f32_imm1_normal( -; FAST: { -; FAST-NEXT: .reg .b32 %r<5>; -; FAST-EMPTY: -; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.b32 %r1, [frem_f32_imm1_normal_param_0]; -; FAST-NEXT: div.approx.f32 %r2, %r1, 0f40E00000; -; FAST-NEXT: cvt.rzi.f32.f32 %r3, %r2; -; FAST-NEXT: fma.rn.f32 %r4, %r3, 0fC0E00000, %r1; -; FAST-NEXT: st.param.b32 [func_retval0], %r4; -; FAST-NEXT: ret; -; -; NORMAL-LABEL: frem_f32_imm1_normal( -; NORMAL: { -; NORMAL-NEXT: .reg .b32 %r<5>; -; NORMAL-EMPTY: -; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.b32 %r1, [frem_f32_imm1_normal_param_0]; -; NORMAL-NEXT: div.rn.f32 %r2, %r1, 0f40E00000; -; NORMAL-NEXT: cvt.rzi.f32.f32 %r3, %r2; -; NORMAL-NEXT: fma.rn.f32 %r4, %r3, 0fC0E00000, %r1; -; NORMAL-NEXT: st.param.b32 [func_retval0], %r4; -; NORMAL-NEXT: ret; +; CHECK-LABEL: frem_f32_imm1_normal( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [frem_f32_imm1_normal_param_0]; +; CHECK-NEXT: div.rn.f32 %r2, %r1, 0f40E00000; +; CHECK-NEXT: cvt.rzi.f32.f32 %r3, %r2; +; CHECK-NEXT: fma.rn.f32 %r4, %r3, 0fC0E00000, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; %r = frem float %a, 7.0 ret float %r } define float @frem_f32_imm2(float %a) { -; FAST-LABEL: frem_f32_imm2( -; FAST: { -; FAST-NEXT: .reg .b32 %r<7>; -; FAST-EMPTY: -; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.b32 %r1, [frem_f32_imm2_param_0]; -; FAST-NEXT: mov.b32 %r2, 0f40E00000; -; 
FAST-NEXT: div.approx.f32 %r3, %r2, %r1; -; FAST-NEXT: cvt.rzi.f32.f32 %r4, %r3; -; FAST-NEXT: neg.f32 %r5, %r4; -; FAST-NEXT: fma.rn.f32 %r6, %r5, %r1, 0f40E00000; -; FAST-NEXT: st.param.b32 [func_retval0], %r6; -; FAST-NEXT: ret; -; -; NORMAL-LABEL: frem_f32_imm2( -; NORMAL: { -; NORMAL-NEXT: .reg .pred %p<2>; -; NORMAL-NEXT: .reg .b32 %r<8>; -; NORMAL-EMPTY: -; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.b32 %r1, [frem_f32_imm2_param_0]; -; NORMAL-NEXT: mov.b32 %r2, 0f40E00000; -; NORMAL-NEXT: div.rn.f32 %r3, %r2, %r1; -; NORMAL-NEXT: cvt.rzi.f32.f32 %r4, %r3; -; NORMAL-NEXT: neg.f32 %r5, %r4; -; NORMAL-NEXT: fma.rn.f32 %r6, %r5, %r1, 0f40E00000; -; NORMAL-NEXT: testp.infinite.f32 %p1, %r1; -; NORMAL-NEXT: selp.f32 %r7, 0f40E00000, %r6, %p1; -; NORMAL-NEXT: st.param.b32 [func_retval0], %r7; -; NORMAL-NEXT: ret; +; CHECK-LABEL: frem_f32_imm2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [frem_f32_imm2_param_0]; +; CHECK-NEXT: mov.b32 %r2, 0f40E00000; +; CHECK-NEXT: div.rn.f32 %r3, %r2, %r1; +; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r3; +; CHECK-NEXT: neg.f32 %r5, %r4; +; CHECK-NEXT: fma.rn.f32 %r6, %r5, %r1, 0f40E00000; +; CHECK-NEXT: testp.infinite.f32 %p1, %r1; +; CHECK-NEXT: selp.f32 %r7, 0f40E00000, %r6, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-NEXT: ret; %r = frem float 7.0, %a ret float %r } + +define float @frem_f32_imm2_fast(float %a) { +; CHECK-LABEL: frem_f32_imm2_fast( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [frem_f32_imm2_fast_param_0]; +; CHECK-NEXT: mov.b32 %r2, 0f40E00000; +; CHECK-NEXT: div.approx.f32 %r3, %r2, %r1; +; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r3; +; CHECK-NEXT: neg.f32 %r5, %r4; +; CHECK-NEXT: fma.rn.f32 %r6, %r5, %r1, 0f40E00000; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %r = frem afn ninf float 7.0, %a + ret float %r +} diff --git a/llvm/test/CodeGen/NVPTX/global-addrspace.ll b/llvm/test/CodeGen/NVPTX/global-addrspace.ll index 3f9d321..23f8747 100644 --- a/llvm/test/CodeGen/NVPTX/global-addrspace.ll +++ b/llvm/test/CodeGen/NVPTX/global-addrspace.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; PTX32: .visible .global .align 4 .u32 i; diff --git a/llvm/test/CodeGen/NVPTX/global-ordering.ll b/llvm/test/CodeGen/NVPTX/global-ordering.ll index 2815cff..5f59828 100644 --- a/llvm/test/CodeGen/NVPTX/global-ordering.ll +++ b/llvm/test/CodeGen/NVPTX/global-ordering.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; Make sure we emit these globals in def-use order diff --git a/llvm/test/CodeGen/NVPTX/griddepcontrol.ll b/llvm/test/CodeGen/NVPTX/griddepcontrol.ll index 0bf9196..5b28d42 100644 --- 
a/llvm/test/CodeGen/NVPTX/griddepcontrol.ll +++ b/llvm/test/CodeGen/NVPTX/griddepcontrol.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_90 -march=nvptx64 | FileCheck %s -; RUN: %if ptxas-11.8 %{ llc < %s -mcpu=sm_90 -march=nvptx64 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 %{ llc < %s -mcpu=sm_90 -march=nvptx64 | %ptxas-verify -arch=sm_90 %} define void @griddepcontrol() { ; CHECK-LABEL: griddepcontrol( diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll index df32e2a..264f380 100644 --- a/llvm/test/CodeGen/NVPTX/i1-select.ll +++ b/llvm/test/CodeGen/NVPTX/i1-select.ll @@ -66,22 +66,22 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals ; CHECK-LABEL: test_select_i1_basic( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<4>; -; CHECK-NEXT: .reg .b32 %r<12>; +; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_param_0]; ; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_basic_param_1]; -; CHECK-NEXT: or.b32 %r4, %r1, %r2; +; CHECK-NEXT: or.b32 %r3, %r1, %r2; ; CHECK-NEXT: setp.ne.b32 %p1, %r1, 0; -; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_param_2]; -; CHECK-NEXT: setp.eq.b32 %p2, %r5, 0; -; CHECK-NEXT: ld.param.b32 %r7, [test_select_i1_basic_param_3]; -; CHECK-NEXT: setp.eq.b32 %p3, %r4, 0; -; CHECK-NEXT: ld.param.b32 %r8, [test_select_i1_basic_param_4]; -; CHECK-NEXT: selp.b32 %r9, %r7, %r8, %p2; -; CHECK-NEXT: selp.b32 %r10, %r9, %r8, %p1; -; CHECK-NEXT: selp.b32 %r11, %r7, %r10, %p3; -; CHECK-NEXT: st.param.b32 [func_retval0], %r11; +; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_param_2]; +; CHECK-NEXT: setp.eq.b32 %p2, %r4, 0; +; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_param_3]; +; CHECK-NEXT: setp.eq.b32 %p3, %r3, 0; +; CHECK-NEXT: ld.param.b32 %r6, [test_select_i1_basic_param_4]; +; CHECK-NEXT: selp.b32 %r7, %r5, %r6, %p2; +; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1; +; CHECK-NEXT: selp.b32 %r9, %r5, %r8, %p3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r9; ; CHECK-NEXT: ret; %b1 = icmp eq i32 %v1, 0 %b2 = icmp eq i32 %v2, 0 @@ -94,7 +94,7 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) { ; CHECK-LABEL: test_select_i1_basic_folding( ; CHECK: { -; CHECK-NEXT: .reg .pred %p<13>; +; CHECK-NEXT: .reg .pred %p<11>; ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: @@ -106,14 +106,14 @@ define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i ; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_2]; ; CHECK-NEXT: setp.eq.b32 %p4, %r3, 0; ; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_folding_param_3]; -; CHECK-NEXT: xor.pred %p6, %p1, %p3; +; CHECK-NEXT: xor.pred %p5, %p1, %p3; ; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_4]; -; CHECK-NEXT: and.pred %p8, %p6, %p4; -; CHECK-NEXT: and.pred %p9, %p2, %p4; -; CHECK-NEXT: and.pred %p10, %p3, %p8; -; CHECK-NEXT: or.pred %p11, %p10, %p9; -; CHECK-NEXT: xor.pred %p12, %p11, %p3; -; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p12; +; CHECK-NEXT: and.pred %p6, %p5, %p4; +; CHECK-NEXT: and.pred %p7, %p2, %p4; +; CHECK-NEXT: and.pred %p8, %p3, %p6; +; CHECK-NEXT: or.pred %p9, %p8, %p7; +; CHECK-NEXT: xor.pred %p10, %p9, %p3; +; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p10; ; CHECK-NEXT: 
st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; %b1 = icmp eq i32 %v1, 0 diff --git a/llvm/test/CodeGen/NVPTX/i128-array.ll b/llvm/test/CodeGen/NVPTX/i128-array.ll index 3bb9c6a..7bd8a00 100644 --- a/llvm/test/CodeGen/NVPTX/i128-array.ll +++ b/llvm/test/CodeGen/NVPTX/i128-array.ll @@ -27,13 +27,13 @@ define [2 x i128] @foo(i64 %a, i32 %b) { define [2 x i128] @foo2(ptr byval([2 x i128]) %a) { ; CHECK-LABEL: foo2( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [foo2_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [foo2_param_0+16]; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd4}; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd5, %rd6}; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [foo2_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [foo2_param_0+16]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4}; ; CHECK-NEXT: ret; %ptr0 = getelementptr [2 x i128], ptr %a, i64 0, i32 0 %1 = load i128, i128* %ptr0 diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index 44d8558..cdbbabe 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -7,137 +7,137 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<20>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<127>; +; CHECK-NEXT: .reg .b64 %rd<79>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases -; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [srem_i128_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd49, %rd50}, [srem_i128_param_1]; -; CHECK-NEXT: shr.s64 %rd2, %rd46, 63; -; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45; -; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46; -; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; -; CHECK-NEXT: selp.b64 %rd4, %rd52, %rd46, %p1; -; CHECK-NEXT: selp.b64 %rd3, %rd51, %rd45, %p1; -; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49; -; CHECK-NEXT: subc.cc.s64 %rd54, 0, %rd50; -; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0; -; CHECK-NEXT: selp.b64 %rd6, %rd54, %rd50, %p2; -; CHECK-NEXT: selp.b64 %rd5, %rd53, %rd49, %p2; -; CHECK-NEXT: or.b64 %rd55, %rd5, %rd6; -; CHECK-NEXT: setp.eq.b64 %p3, %rd55, 0; -; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4; -; CHECK-NEXT: setp.eq.b64 %p4, %rd56, 0; +; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [srem_i128_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd10, %rd11}, [srem_i128_param_1]; +; CHECK-NEXT: shr.s64 %rd1, %rd9, 63; +; CHECK-NEXT: sub.cc.s64 %rd12, 0, %rd8; +; CHECK-NEXT: subc.cc.s64 %rd13, 0, %rd9; +; CHECK-NEXT: setp.lt.s64 %p1, %rd9, 0; +; CHECK-NEXT: selp.b64 %rd3, %rd13, %rd9, %p1; +; CHECK-NEXT: selp.b64 %rd2, %rd12, %rd8, %p1; +; CHECK-NEXT: sub.cc.s64 %rd14, 0, %rd10; +; CHECK-NEXT: subc.cc.s64 %rd15, 0, %rd11; +; CHECK-NEXT: setp.lt.s64 %p2, %rd11, 0; +; CHECK-NEXT: selp.b64 %rd5, %rd15, %rd11, %p2; +; CHECK-NEXT: selp.b64 %rd4, %rd14, %rd10, %p2; +; CHECK-NEXT: or.b64 %rd16, %rd4, %rd5; +; CHECK-NEXT: setp.eq.b64 %p3, %rd16, 0; +; CHECK-NEXT: or.b64 %rd17, %rd2, %rd3; +; CHECK-NEXT: setp.eq.b64 %p4, %rd17, 0; ; CHECK-NEXT: or.pred %p5, %p3, %p4; -; CHECK-NEXT: setp.ne.b64 %p6, %rd6, 0; -; CHECK-NEXT: clz.b64 %r1, %rd6; -; CHECK-NEXT: cvt.u64.u32 %rd57, %r1; -; CHECK-NEXT: clz.b64 %r2, %rd5; -; CHECK-NEXT: cvt.u64.u32 %rd58, %r2; -; CHECK-NEXT: add.s64 %rd59, %rd58, 64; -; CHECK-NEXT: selp.b64 %rd60, %rd57, %rd59, %p6; -; CHECK-NEXT: setp.ne.b64 %p7, %rd4, 0; -; CHECK-NEXT: 
clz.b64 %r3, %rd4; -; CHECK-NEXT: cvt.u64.u32 %rd61, %r3; -; CHECK-NEXT: clz.b64 %r4, %rd3; -; CHECK-NEXT: cvt.u64.u32 %rd62, %r4; -; CHECK-NEXT: add.s64 %rd63, %rd62, 64; -; CHECK-NEXT: selp.b64 %rd64, %rd61, %rd63, %p7; -; CHECK-NEXT: mov.b64 %rd117, 0; -; CHECK-NEXT: sub.cc.s64 %rd66, %rd60, %rd64; -; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0; -; CHECK-NEXT: setp.gt.u64 %p8, %rd66, 127; -; CHECK-NEXT: setp.eq.b64 %p9, %rd67, 0; +; CHECK-NEXT: setp.ne.b64 %p6, %rd5, 0; +; CHECK-NEXT: clz.b64 %r1, %rd5; +; CHECK-NEXT: cvt.u64.u32 %rd18, %r1; +; CHECK-NEXT: clz.b64 %r2, %rd4; +; CHECK-NEXT: cvt.u64.u32 %rd19, %r2; +; CHECK-NEXT: add.s64 %rd20, %rd19, 64; +; CHECK-NEXT: selp.b64 %rd21, %rd18, %rd20, %p6; +; CHECK-NEXT: setp.ne.b64 %p7, %rd3, 0; +; CHECK-NEXT: clz.b64 %r3, %rd3; +; CHECK-NEXT: cvt.u64.u32 %rd22, %r3; +; CHECK-NEXT: clz.b64 %r4, %rd2; +; CHECK-NEXT: cvt.u64.u32 %rd23, %r4; +; CHECK-NEXT: add.s64 %rd24, %rd23, 64; +; CHECK-NEXT: selp.b64 %rd25, %rd22, %rd24, %p7; +; CHECK-NEXT: mov.b64 %rd70, 0; +; CHECK-NEXT: sub.cc.s64 %rd26, %rd21, %rd25; +; CHECK-NEXT: subc.cc.s64 %rd27, %rd70, 0; +; CHECK-NEXT: setp.gt.u64 %p8, %rd26, 127; +; CHECK-NEXT: setp.eq.b64 %p9, %rd27, 0; ; CHECK-NEXT: and.pred %p10, %p9, %p8; -; CHECK-NEXT: setp.ne.b64 %p11, %rd67, 0; +; CHECK-NEXT: setp.ne.b64 %p11, %rd27, 0; ; CHECK-NEXT: or.pred %p12, %p10, %p11; ; CHECK-NEXT: or.pred %p13, %p5, %p12; -; CHECK-NEXT: xor.b64 %rd68, %rd66, 127; -; CHECK-NEXT: or.b64 %rd69, %rd68, %rd67; -; CHECK-NEXT: setp.eq.b64 %p14, %rd69, 0; -; CHECK-NEXT: selp.b64 %rd126, 0, %rd4, %p13; -; CHECK-NEXT: selp.b64 %rd125, 0, %rd3, %p13; +; CHECK-NEXT: xor.b64 %rd28, %rd26, 127; +; CHECK-NEXT: or.b64 %rd29, %rd28, %rd27; +; CHECK-NEXT: setp.eq.b64 %p14, %rd29, 0; +; CHECK-NEXT: selp.b64 %rd78, 0, %rd3, %p13; +; CHECK-NEXT: selp.b64 %rd77, 0, %rd2, %p13; ; CHECK-NEXT: or.pred %p15, %p13, %p14; ; CHECK-NEXT: @%p15 bra $L__BB0_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd119, %rd66, 1; -; CHECK-NEXT: addc.cc.s64 %rd120, %rd67, 0; -; CHECK-NEXT: or.b64 %rd72, %rd119, %rd120; -; CHECK-NEXT: setp.eq.b64 %p16, %rd72, 0; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd66; +; CHECK-NEXT: add.cc.s64 %rd71, %rd26, 1; +; CHECK-NEXT: addc.cc.s64 %rd72, %rd27, 0; +; CHECK-NEXT: or.b64 %rd30, %rd71, %rd72; +; CHECK-NEXT: setp.eq.b64 %p16, %rd30, 0; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd26; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd73, %rd4, %r6; +; CHECK-NEXT: shl.b64 %rd31, %rd3, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd74, %rd3, %r7; -; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74; +; CHECK-NEXT: shr.u64 %rd32, %rd2, %r7; +; CHECK-NEXT: or.b64 %rd33, %rd31, %rd32; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd76, %rd3, %r8; +; CHECK-NEXT: shl.b64 %rd34, %rd2, %r8; ; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63; -; CHECK-NEXT: selp.b64 %rd124, %rd76, %rd75, %p17; -; CHECK-NEXT: shl.b64 %rd123, %rd3, %r6; -; CHECK-NEXT: mov.b64 %rd114, %rd117; +; CHECK-NEXT: selp.b64 %rd76, %rd34, %rd33, %p17; +; CHECK-NEXT: shl.b64 %rd75, %rd2, %r6; +; CHECK-NEXT: mov.b64 %rd69, %rd70; ; CHECK-NEXT: @%p16 bra $L__BB0_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd119; -; CHECK-NEXT: shr.u64 %rd79, %rd3, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd71; +; CHECK-NEXT: shr.u64 %rd35, %rd2, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd80, %rd4, %r10; -; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80; +; CHECK-NEXT: shl.b64 %rd36, %rd3, %r10; +; CHECK-NEXT: or.b64 
%rd37, %rd35, %rd36; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd82, %rd4, %r11; +; CHECK-NEXT: shr.u64 %rd38, %rd3, %r11; ; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63; -; CHECK-NEXT: selp.b64 %rd121, %rd82, %rd81, %p18; -; CHECK-NEXT: shr.u64 %rd122, %rd4, %r9; -; CHECK-NEXT: add.cc.s64 %rd35, %rd5, -1; -; CHECK-NEXT: addc.cc.s64 %rd36, %rd6, -1; -; CHECK-NEXT: mov.b64 %rd114, 0; -; CHECK-NEXT: mov.b64 %rd117, %rd114; +; CHECK-NEXT: selp.b64 %rd73, %rd38, %rd37, %p18; +; CHECK-NEXT: shr.u64 %rd74, %rd3, %r9; +; CHECK-NEXT: add.cc.s64 %rd6, %rd4, -1; +; CHECK-NEXT: addc.cc.s64 %rd7, %rd5, -1; +; CHECK-NEXT: mov.b64 %rd69, 0; +; CHECK-NEXT: mov.b64 %rd70, %rd69; ; CHECK-NEXT: $L__BB0_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd83, %rd121, 63; -; CHECK-NEXT: shl.b64 %rd84, %rd122, 1; -; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83; -; CHECK-NEXT: shl.b64 %rd86, %rd121, 1; -; CHECK-NEXT: shr.u64 %rd87, %rd124, 63; -; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87; -; CHECK-NEXT: shr.u64 %rd89, %rd123, 63; -; CHECK-NEXT: shl.b64 %rd90, %rd124, 1; -; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; -; CHECK-NEXT: shl.b64 %rd92, %rd123, 1; -; CHECK-NEXT: or.b64 %rd123, %rd117, %rd92; -; CHECK-NEXT: or.b64 %rd124, %rd114, %rd91; -; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88; -; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85; -; CHECK-NEXT: shr.s64 %rd95, %rd94, 63; -; CHECK-NEXT: and.b64 %rd117, %rd95, 1; -; CHECK-NEXT: and.b64 %rd96, %rd95, %rd5; -; CHECK-NEXT: and.b64 %rd97, %rd95, %rd6; -; CHECK-NEXT: sub.cc.s64 %rd121, %rd88, %rd96; -; CHECK-NEXT: subc.cc.s64 %rd122, %rd85, %rd97; -; CHECK-NEXT: add.cc.s64 %rd119, %rd119, -1; -; CHECK-NEXT: addc.cc.s64 %rd120, %rd120, -1; -; CHECK-NEXT: or.b64 %rd98, %rd119, %rd120; -; CHECK-NEXT: setp.eq.b64 %p19, %rd98, 0; +; CHECK-NEXT: shr.u64 %rd39, %rd73, 63; +; CHECK-NEXT: shl.b64 %rd40, %rd74, 1; +; CHECK-NEXT: or.b64 %rd41, %rd40, %rd39; +; CHECK-NEXT: shl.b64 %rd42, %rd73, 1; +; CHECK-NEXT: shr.u64 %rd43, %rd76, 63; +; CHECK-NEXT: or.b64 %rd44, %rd42, %rd43; +; CHECK-NEXT: shr.u64 %rd45, %rd75, 63; +; CHECK-NEXT: shl.b64 %rd46, %rd76, 1; +; CHECK-NEXT: or.b64 %rd47, %rd46, %rd45; +; CHECK-NEXT: shl.b64 %rd48, %rd75, 1; +; CHECK-NEXT: or.b64 %rd75, %rd70, %rd48; +; CHECK-NEXT: or.b64 %rd76, %rd69, %rd47; +; CHECK-NEXT: sub.cc.s64 %rd49, %rd6, %rd44; +; CHECK-NEXT: subc.cc.s64 %rd50, %rd7, %rd41; +; CHECK-NEXT: shr.s64 %rd51, %rd50, 63; +; CHECK-NEXT: and.b64 %rd70, %rd51, 1; +; CHECK-NEXT: and.b64 %rd52, %rd51, %rd4; +; CHECK-NEXT: and.b64 %rd53, %rd51, %rd5; +; CHECK-NEXT: sub.cc.s64 %rd73, %rd44, %rd52; +; CHECK-NEXT: subc.cc.s64 %rd74, %rd41, %rd53; +; CHECK-NEXT: add.cc.s64 %rd71, %rd71, -1; +; CHECK-NEXT: addc.cc.s64 %rd72, %rd72, -1; +; CHECK-NEXT: or.b64 %rd54, %rd71, %rd72; +; CHECK-NEXT: setp.eq.b64 %p19, %rd54, 0; ; CHECK-NEXT: @%p19 bra $L__BB0_4; ; CHECK-NEXT: bra.uni $L__BB0_2; ; CHECK-NEXT: $L__BB0_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd99, %rd123, 63; -; CHECK-NEXT: shl.b64 %rd100, %rd124, 1; -; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99; -; CHECK-NEXT: shl.b64 %rd102, %rd123, 1; -; CHECK-NEXT: or.b64 %rd125, %rd117, %rd102; -; CHECK-NEXT: or.b64 %rd126, %rd114, %rd101; +; CHECK-NEXT: shr.u64 %rd55, %rd75, 63; +; CHECK-NEXT: shl.b64 %rd56, %rd76, 1; +; CHECK-NEXT: or.b64 %rd57, %rd56, %rd55; +; CHECK-NEXT: shl.b64 %rd58, %rd75, 1; +; CHECK-NEXT: or.b64 %rd77, %rd70, %rd58; +; CHECK-NEXT: or.b64 %rd78, %rd69, %rd57; ; CHECK-NEXT: $L__BB0_5: // %udiv-end -; CHECK-NEXT: mul.hi.u64 %rd103, 
%rd5, %rd125; -; CHECK-NEXT: mad.lo.s64 %rd104, %rd5, %rd126, %rd103; -; CHECK-NEXT: mad.lo.s64 %rd105, %rd6, %rd125, %rd104; -; CHECK-NEXT: mul.lo.s64 %rd106, %rd5, %rd125; -; CHECK-NEXT: sub.cc.s64 %rd107, %rd3, %rd106; -; CHECK-NEXT: subc.cc.s64 %rd108, %rd4, %rd105; -; CHECK-NEXT: xor.b64 %rd109, %rd107, %rd2; -; CHECK-NEXT: xor.b64 %rd110, %rd108, %rd2; -; CHECK-NEXT: sub.cc.s64 %rd111, %rd109, %rd2; -; CHECK-NEXT: subc.cc.s64 %rd112, %rd110, %rd2; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd111, %rd112}; +; CHECK-NEXT: mul.hi.u64 %rd59, %rd4, %rd77; +; CHECK-NEXT: mad.lo.s64 %rd60, %rd4, %rd78, %rd59; +; CHECK-NEXT: mad.lo.s64 %rd61, %rd5, %rd77, %rd60; +; CHECK-NEXT: mul.lo.s64 %rd62, %rd4, %rd77; +; CHECK-NEXT: sub.cc.s64 %rd63, %rd2, %rd62; +; CHECK-NEXT: subc.cc.s64 %rd64, %rd3, %rd61; +; CHECK-NEXT: xor.b64 %rd65, %rd63, %rd1; +; CHECK-NEXT: xor.b64 %rd66, %rd64, %rd1; +; CHECK-NEXT: sub.cc.s64 %rd67, %rd65, %rd1; +; CHECK-NEXT: subc.cc.s64 %rd68, %rd66, %rd1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd67, %rd68}; ; CHECK-NEXT: ret; %div = srem i128 %lhs, %rhs ret i128 %div @@ -148,122 +148,122 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<18>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<113>; +; CHECK-NEXT: .reg .b64 %rd<66>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases -; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [urem_i128_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [urem_i128_param_1]; -; CHECK-NEXT: or.b64 %rd45, %rd3, %rd4; -; CHECK-NEXT: setp.eq.b64 %p1, %rd45, 0; -; CHECK-NEXT: or.b64 %rd46, %rd41, %rd42; -; CHECK-NEXT: setp.eq.b64 %p2, %rd46, 0; +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [urem_i128_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [urem_i128_param_1]; +; CHECK-NEXT: or.b64 %rd7, %rd1, %rd2; +; CHECK-NEXT: setp.eq.b64 %p1, %rd7, 0; +; CHECK-NEXT: or.b64 %rd8, %rd5, %rd6; +; CHECK-NEXT: setp.eq.b64 %p2, %rd8, 0; ; CHECK-NEXT: or.pred %p3, %p1, %p2; -; CHECK-NEXT: setp.ne.b64 %p4, %rd4, 0; -; CHECK-NEXT: clz.b64 %r1, %rd4; -; CHECK-NEXT: cvt.u64.u32 %rd47, %r1; -; CHECK-NEXT: clz.b64 %r2, %rd3; -; CHECK-NEXT: cvt.u64.u32 %rd48, %r2; -; CHECK-NEXT: add.s64 %rd49, %rd48, 64; -; CHECK-NEXT: selp.b64 %rd50, %rd47, %rd49, %p4; -; CHECK-NEXT: setp.ne.b64 %p5, %rd42, 0; -; CHECK-NEXT: clz.b64 %r3, %rd42; -; CHECK-NEXT: cvt.u64.u32 %rd51, %r3; -; CHECK-NEXT: clz.b64 %r4, %rd41; -; CHECK-NEXT: cvt.u64.u32 %rd52, %r4; -; CHECK-NEXT: add.s64 %rd53, %rd52, 64; -; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5; -; CHECK-NEXT: mov.b64 %rd103, 0; -; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54; -; CHECK-NEXT: subc.cc.s64 %rd57, %rd103, 0; -; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127; -; CHECK-NEXT: setp.eq.b64 %p7, %rd57, 0; +; CHECK-NEXT: setp.ne.b64 %p4, %rd2, 0; +; CHECK-NEXT: clz.b64 %r1, %rd2; +; CHECK-NEXT: cvt.u64.u32 %rd9, %r1; +; CHECK-NEXT: clz.b64 %r2, %rd1; +; CHECK-NEXT: cvt.u64.u32 %rd10, %r2; +; CHECK-NEXT: add.s64 %rd11, %rd10, 64; +; CHECK-NEXT: selp.b64 %rd12, %rd9, %rd11, %p4; +; CHECK-NEXT: setp.ne.b64 %p5, %rd6, 0; +; CHECK-NEXT: clz.b64 %r3, %rd6; +; CHECK-NEXT: cvt.u64.u32 %rd13, %r3; +; CHECK-NEXT: clz.b64 %r4, %rd5; +; CHECK-NEXT: cvt.u64.u32 %rd14, %r4; +; CHECK-NEXT: add.s64 %rd15, %rd14, 64; +; CHECK-NEXT: selp.b64 %rd16, %rd13, %rd15, %p5; +; CHECK-NEXT: mov.b64 %rd57, 0; +; CHECK-NEXT: sub.cc.s64 %rd17, %rd12, %rd16; +; CHECK-NEXT: subc.cc.s64 %rd18, %rd57, 0; +; CHECK-NEXT: setp.gt.u64 %p6, %rd17, 127; +; CHECK-NEXT: setp.eq.b64 %p7, 
%rd18, 0; ; CHECK-NEXT: and.pred %p8, %p7, %p6; -; CHECK-NEXT: setp.ne.b64 %p9, %rd57, 0; +; CHECK-NEXT: setp.ne.b64 %p9, %rd18, 0; ; CHECK-NEXT: or.pred %p10, %p8, %p9; ; CHECK-NEXT: or.pred %p11, %p3, %p10; -; CHECK-NEXT: xor.b64 %rd58, %rd56, 127; -; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57; -; CHECK-NEXT: setp.eq.b64 %p12, %rd59, 0; -; CHECK-NEXT: selp.b64 %rd112, 0, %rd42, %p11; -; CHECK-NEXT: selp.b64 %rd111, 0, %rd41, %p11; +; CHECK-NEXT: xor.b64 %rd19, %rd17, 127; +; CHECK-NEXT: or.b64 %rd20, %rd19, %rd18; +; CHECK-NEXT: setp.eq.b64 %p12, %rd20, 0; +; CHECK-NEXT: selp.b64 %rd65, 0, %rd6, %p11; +; CHECK-NEXT: selp.b64 %rd64, 0, %rd5, %p11; ; CHECK-NEXT: or.pred %p13, %p11, %p12; ; CHECK-NEXT: @%p13 bra $L__BB1_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd105, %rd56, 1; -; CHECK-NEXT: addc.cc.s64 %rd106, %rd57, 0; -; CHECK-NEXT: or.b64 %rd62, %rd105, %rd106; -; CHECK-NEXT: setp.eq.b64 %p14, %rd62, 0; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd56; +; CHECK-NEXT: add.cc.s64 %rd58, %rd17, 1; +; CHECK-NEXT: addc.cc.s64 %rd59, %rd18, 0; +; CHECK-NEXT: or.b64 %rd21, %rd58, %rd59; +; CHECK-NEXT: setp.eq.b64 %p14, %rd21, 0; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd17; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6; +; CHECK-NEXT: shl.b64 %rd22, %rd6, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7; -; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64; +; CHECK-NEXT: shr.u64 %rd23, %rd5, %r7; +; CHECK-NEXT: or.b64 %rd24, %rd22, %rd23; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8; +; CHECK-NEXT: shl.b64 %rd25, %rd5, %r8; ; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63; -; CHECK-NEXT: selp.b64 %rd110, %rd66, %rd65, %p15; -; CHECK-NEXT: shl.b64 %rd109, %rd41, %r6; -; CHECK-NEXT: mov.b64 %rd100, %rd103; +; CHECK-NEXT: selp.b64 %rd63, %rd25, %rd24, %p15; +; CHECK-NEXT: shl.b64 %rd62, %rd5, %r6; +; CHECK-NEXT: mov.b64 %rd56, %rd57; ; CHECK-NEXT: @%p14 bra $L__BB1_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd105; -; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd58; +; CHECK-NEXT: shr.u64 %rd26, %rd5, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10; -; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70; +; CHECK-NEXT: shl.b64 %rd27, %rd6, %r10; +; CHECK-NEXT: or.b64 %rd28, %rd26, %rd27; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11; +; CHECK-NEXT: shr.u64 %rd29, %rd6, %r11; ; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63; -; CHECK-NEXT: selp.b64 %rd107, %rd72, %rd71, %p16; -; CHECK-NEXT: shr.u64 %rd108, %rd42, %r9; -; CHECK-NEXT: add.cc.s64 %rd33, %rd3, -1; -; CHECK-NEXT: addc.cc.s64 %rd34, %rd4, -1; -; CHECK-NEXT: mov.b64 %rd100, 0; -; CHECK-NEXT: mov.b64 %rd103, %rd100; +; CHECK-NEXT: selp.b64 %rd60, %rd29, %rd28, %p16; +; CHECK-NEXT: shr.u64 %rd61, %rd6, %r9; +; CHECK-NEXT: add.cc.s64 %rd3, %rd1, -1; +; CHECK-NEXT: addc.cc.s64 %rd4, %rd2, -1; +; CHECK-NEXT: mov.b64 %rd56, 0; +; CHECK-NEXT: mov.b64 %rd57, %rd56; ; CHECK-NEXT: $L__BB1_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd73, %rd107, 63; -; CHECK-NEXT: shl.b64 %rd74, %rd108, 1; -; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73; -; CHECK-NEXT: shl.b64 %rd76, %rd107, 1; -; CHECK-NEXT: shr.u64 %rd77, %rd110, 63; -; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77; -; CHECK-NEXT: shr.u64 %rd79, %rd109, 63; -; CHECK-NEXT: shl.b64 %rd80, %rd110, 1; -; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79; -; CHECK-NEXT: shl.b64 
%rd82, %rd109, 1; -; CHECK-NEXT: or.b64 %rd109, %rd103, %rd82; -; CHECK-NEXT: or.b64 %rd110, %rd100, %rd81; -; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78; -; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75; -; CHECK-NEXT: shr.s64 %rd85, %rd84, 63; -; CHECK-NEXT: and.b64 %rd103, %rd85, 1; -; CHECK-NEXT: and.b64 %rd86, %rd85, %rd3; -; CHECK-NEXT: and.b64 %rd87, %rd85, %rd4; -; CHECK-NEXT: sub.cc.s64 %rd107, %rd78, %rd86; -; CHECK-NEXT: subc.cc.s64 %rd108, %rd75, %rd87; -; CHECK-NEXT: add.cc.s64 %rd105, %rd105, -1; -; CHECK-NEXT: addc.cc.s64 %rd106, %rd106, -1; -; CHECK-NEXT: or.b64 %rd88, %rd105, %rd106; -; CHECK-NEXT: setp.eq.b64 %p17, %rd88, 0; +; CHECK-NEXT: shr.u64 %rd30, %rd60, 63; +; CHECK-NEXT: shl.b64 %rd31, %rd61, 1; +; CHECK-NEXT: or.b64 %rd32, %rd31, %rd30; +; CHECK-NEXT: shl.b64 %rd33, %rd60, 1; +; CHECK-NEXT: shr.u64 %rd34, %rd63, 63; +; CHECK-NEXT: or.b64 %rd35, %rd33, %rd34; +; CHECK-NEXT: shr.u64 %rd36, %rd62, 63; +; CHECK-NEXT: shl.b64 %rd37, %rd63, 1; +; CHECK-NEXT: or.b64 %rd38, %rd37, %rd36; +; CHECK-NEXT: shl.b64 %rd39, %rd62, 1; +; CHECK-NEXT: or.b64 %rd62, %rd57, %rd39; +; CHECK-NEXT: or.b64 %rd63, %rd56, %rd38; +; CHECK-NEXT: sub.cc.s64 %rd40, %rd3, %rd35; +; CHECK-NEXT: subc.cc.s64 %rd41, %rd4, %rd32; +; CHECK-NEXT: shr.s64 %rd42, %rd41, 63; +; CHECK-NEXT: and.b64 %rd57, %rd42, 1; +; CHECK-NEXT: and.b64 %rd43, %rd42, %rd1; +; CHECK-NEXT: and.b64 %rd44, %rd42, %rd2; +; CHECK-NEXT: sub.cc.s64 %rd60, %rd35, %rd43; +; CHECK-NEXT: subc.cc.s64 %rd61, %rd32, %rd44; +; CHECK-NEXT: add.cc.s64 %rd58, %rd58, -1; +; CHECK-NEXT: addc.cc.s64 %rd59, %rd59, -1; +; CHECK-NEXT: or.b64 %rd45, %rd58, %rd59; +; CHECK-NEXT: setp.eq.b64 %p17, %rd45, 0; ; CHECK-NEXT: @%p17 bra $L__BB1_4; ; CHECK-NEXT: bra.uni $L__BB1_2; ; CHECK-NEXT: $L__BB1_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd89, %rd109, 63; -; CHECK-NEXT: shl.b64 %rd90, %rd110, 1; -; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; -; CHECK-NEXT: shl.b64 %rd92, %rd109, 1; -; CHECK-NEXT: or.b64 %rd111, %rd103, %rd92; -; CHECK-NEXT: or.b64 %rd112, %rd100, %rd91; +; CHECK-NEXT: shr.u64 %rd46, %rd62, 63; +; CHECK-NEXT: shl.b64 %rd47, %rd63, 1; +; CHECK-NEXT: or.b64 %rd48, %rd47, %rd46; +; CHECK-NEXT: shl.b64 %rd49, %rd62, 1; +; CHECK-NEXT: or.b64 %rd64, %rd57, %rd49; +; CHECK-NEXT: or.b64 %rd65, %rd56, %rd48; ; CHECK-NEXT: $L__BB1_5: // %udiv-end -; CHECK-NEXT: mul.hi.u64 %rd93, %rd3, %rd111; -; CHECK-NEXT: mad.lo.s64 %rd94, %rd3, %rd112, %rd93; -; CHECK-NEXT: mad.lo.s64 %rd95, %rd4, %rd111, %rd94; -; CHECK-NEXT: mul.lo.s64 %rd96, %rd3, %rd111; -; CHECK-NEXT: sub.cc.s64 %rd97, %rd41, %rd96; -; CHECK-NEXT: subc.cc.s64 %rd98, %rd42, %rd95; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd97, %rd98}; +; CHECK-NEXT: mul.hi.u64 %rd50, %rd1, %rd64; +; CHECK-NEXT: mad.lo.s64 %rd51, %rd1, %rd65, %rd50; +; CHECK-NEXT: mad.lo.s64 %rd52, %rd2, %rd64, %rd51; +; CHECK-NEXT: mul.lo.s64 %rd53, %rd1, %rd64; +; CHECK-NEXT: sub.cc.s64 %rd54, %rd5, %rd53; +; CHECK-NEXT: subc.cc.s64 %rd55, %rd6, %rd52; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd54, %rd55}; ; CHECK-NEXT: ret; %div = urem i128 %lhs, %rhs ret i128 %div @@ -308,132 +308,132 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<20>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<122>; +; CHECK-NEXT: .reg .b64 %rd<74>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases -; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [sdiv_i128_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd49, %rd50}, [sdiv_i128_param_1]; -; CHECK-NEXT: sub.cc.s64 
%rd51, 0, %rd45; -; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46; -; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; -; CHECK-NEXT: selp.b64 %rd2, %rd52, %rd46, %p1; -; CHECK-NEXT: selp.b64 %rd1, %rd51, %rd45, %p1; -; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49; -; CHECK-NEXT: subc.cc.s64 %rd54, 0, %rd50; -; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0; -; CHECK-NEXT: selp.b64 %rd4, %rd54, %rd50, %p2; -; CHECK-NEXT: selp.b64 %rd3, %rd53, %rd49, %p2; -; CHECK-NEXT: xor.b64 %rd55, %rd50, %rd46; -; CHECK-NEXT: shr.s64 %rd5, %rd55, 63; -; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4; -; CHECK-NEXT: setp.eq.b64 %p3, %rd56, 0; -; CHECK-NEXT: or.b64 %rd57, %rd1, %rd2; -; CHECK-NEXT: setp.eq.b64 %p4, %rd57, 0; +; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [sdiv_i128_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd10, %rd11}, [sdiv_i128_param_1]; +; CHECK-NEXT: sub.cc.s64 %rd12, 0, %rd8; +; CHECK-NEXT: subc.cc.s64 %rd13, 0, %rd9; +; CHECK-NEXT: setp.lt.s64 %p1, %rd9, 0; +; CHECK-NEXT: selp.b64 %rd2, %rd13, %rd9, %p1; +; CHECK-NEXT: selp.b64 %rd1, %rd12, %rd8, %p1; +; CHECK-NEXT: sub.cc.s64 %rd14, 0, %rd10; +; CHECK-NEXT: subc.cc.s64 %rd15, 0, %rd11; +; CHECK-NEXT: setp.lt.s64 %p2, %rd11, 0; +; CHECK-NEXT: selp.b64 %rd4, %rd15, %rd11, %p2; +; CHECK-NEXT: selp.b64 %rd3, %rd14, %rd10, %p2; +; CHECK-NEXT: xor.b64 %rd16, %rd11, %rd9; +; CHECK-NEXT: shr.s64 %rd5, %rd16, 63; +; CHECK-NEXT: or.b64 %rd17, %rd3, %rd4; +; CHECK-NEXT: setp.eq.b64 %p3, %rd17, 0; +; CHECK-NEXT: or.b64 %rd18, %rd1, %rd2; +; CHECK-NEXT: setp.eq.b64 %p4, %rd18, 0; ; CHECK-NEXT: or.pred %p5, %p3, %p4; ; CHECK-NEXT: setp.ne.b64 %p6, %rd4, 0; ; CHECK-NEXT: clz.b64 %r1, %rd4; -; CHECK-NEXT: cvt.u64.u32 %rd58, %r1; +; CHECK-NEXT: cvt.u64.u32 %rd19, %r1; ; CHECK-NEXT: clz.b64 %r2, %rd3; -; CHECK-NEXT: cvt.u64.u32 %rd59, %r2; -; CHECK-NEXT: add.s64 %rd60, %rd59, 64; -; CHECK-NEXT: selp.b64 %rd61, %rd58, %rd60, %p6; +; CHECK-NEXT: cvt.u64.u32 %rd20, %r2; +; CHECK-NEXT: add.s64 %rd21, %rd20, 64; +; CHECK-NEXT: selp.b64 %rd22, %rd19, %rd21, %p6; ; CHECK-NEXT: setp.ne.b64 %p7, %rd2, 0; ; CHECK-NEXT: clz.b64 %r3, %rd2; -; CHECK-NEXT: cvt.u64.u32 %rd62, %r3; +; CHECK-NEXT: cvt.u64.u32 %rd23, %r3; ; CHECK-NEXT: clz.b64 %r4, %rd1; -; CHECK-NEXT: cvt.u64.u32 %rd63, %r4; -; CHECK-NEXT: add.s64 %rd64, %rd63, 64; -; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7; -; CHECK-NEXT: mov.b64 %rd112, 0; -; CHECK-NEXT: sub.cc.s64 %rd67, %rd61, %rd65; -; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0; -; CHECK-NEXT: setp.gt.u64 %p8, %rd67, 127; -; CHECK-NEXT: setp.eq.b64 %p9, %rd68, 0; +; CHECK-NEXT: cvt.u64.u32 %rd24, %r4; +; CHECK-NEXT: add.s64 %rd25, %rd24, 64; +; CHECK-NEXT: selp.b64 %rd26, %rd23, %rd25, %p7; +; CHECK-NEXT: mov.b64 %rd65, 0; +; CHECK-NEXT: sub.cc.s64 %rd27, %rd22, %rd26; +; CHECK-NEXT: subc.cc.s64 %rd28, %rd65, 0; +; CHECK-NEXT: setp.gt.u64 %p8, %rd27, 127; +; CHECK-NEXT: setp.eq.b64 %p9, %rd28, 0; ; CHECK-NEXT: and.pred %p10, %p9, %p8; -; CHECK-NEXT: setp.ne.b64 %p11, %rd68, 0; +; CHECK-NEXT: setp.ne.b64 %p11, %rd28, 0; ; CHECK-NEXT: or.pred %p12, %p10, %p11; ; CHECK-NEXT: or.pred %p13, %p5, %p12; -; CHECK-NEXT: xor.b64 %rd69, %rd67, 127; -; CHECK-NEXT: or.b64 %rd70, %rd69, %rd68; -; CHECK-NEXT: setp.eq.b64 %p14, %rd70, 0; -; CHECK-NEXT: selp.b64 %rd121, 0, %rd2, %p13; -; CHECK-NEXT: selp.b64 %rd120, 0, %rd1, %p13; +; CHECK-NEXT: xor.b64 %rd29, %rd27, 127; +; CHECK-NEXT: or.b64 %rd30, %rd29, %rd28; +; CHECK-NEXT: setp.eq.b64 %p14, %rd30, 0; +; CHECK-NEXT: selp.b64 %rd73, 0, %rd2, %p13; +; CHECK-NEXT: selp.b64 %rd72, 0, %rd1, %p13; ; CHECK-NEXT: or.pred %p15, %p13, %p14; ; 
CHECK-NEXT: @%p15 bra $L__BB4_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd114, %rd67, 1; -; CHECK-NEXT: addc.cc.s64 %rd115, %rd68, 0; -; CHECK-NEXT: or.b64 %rd73, %rd114, %rd115; -; CHECK-NEXT: setp.eq.b64 %p16, %rd73, 0; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd67; +; CHECK-NEXT: add.cc.s64 %rd66, %rd27, 1; +; CHECK-NEXT: addc.cc.s64 %rd67, %rd28, 0; +; CHECK-NEXT: or.b64 %rd31, %rd66, %rd67; +; CHECK-NEXT: setp.eq.b64 %p16, %rd31, 0; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd27; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd74, %rd2, %r6; +; CHECK-NEXT: shl.b64 %rd32, %rd2, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd75, %rd1, %r7; -; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75; +; CHECK-NEXT: shr.u64 %rd33, %rd1, %r7; +; CHECK-NEXT: or.b64 %rd34, %rd32, %rd33; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd77, %rd1, %r8; +; CHECK-NEXT: shl.b64 %rd35, %rd1, %r8; ; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63; -; CHECK-NEXT: selp.b64 %rd119, %rd77, %rd76, %p17; -; CHECK-NEXT: shl.b64 %rd118, %rd1, %r6; -; CHECK-NEXT: mov.b64 %rd109, %rd112; +; CHECK-NEXT: selp.b64 %rd71, %rd35, %rd34, %p17; +; CHECK-NEXT: shl.b64 %rd70, %rd1, %r6; +; CHECK-NEXT: mov.b64 %rd64, %rd65; ; CHECK-NEXT: @%p16 bra $L__BB4_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd114; -; CHECK-NEXT: shr.u64 %rd80, %rd1, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd66; +; CHECK-NEXT: shr.u64 %rd36, %rd1, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd81, %rd2, %r10; -; CHECK-NEXT: or.b64 %rd82, %rd80, %rd81; +; CHECK-NEXT: shl.b64 %rd37, %rd2, %r10; +; CHECK-NEXT: or.b64 %rd38, %rd36, %rd37; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd83, %rd2, %r11; +; CHECK-NEXT: shr.u64 %rd39, %rd2, %r11; ; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63; -; CHECK-NEXT: selp.b64 %rd116, %rd83, %rd82, %p18; -; CHECK-NEXT: shr.u64 %rd117, %rd2, %r9; -; CHECK-NEXT: add.cc.s64 %rd35, %rd3, -1; -; CHECK-NEXT: addc.cc.s64 %rd36, %rd4, -1; -; CHECK-NEXT: mov.b64 %rd109, 0; -; CHECK-NEXT: mov.b64 %rd112, %rd109; +; CHECK-NEXT: selp.b64 %rd68, %rd39, %rd38, %p18; +; CHECK-NEXT: shr.u64 %rd69, %rd2, %r9; +; CHECK-NEXT: add.cc.s64 %rd6, %rd3, -1; +; CHECK-NEXT: addc.cc.s64 %rd7, %rd4, -1; +; CHECK-NEXT: mov.b64 %rd64, 0; +; CHECK-NEXT: mov.b64 %rd65, %rd64; ; CHECK-NEXT: $L__BB4_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd84, %rd116, 63; -; CHECK-NEXT: shl.b64 %rd85, %rd117, 1; -; CHECK-NEXT: or.b64 %rd86, %rd85, %rd84; -; CHECK-NEXT: shl.b64 %rd87, %rd116, 1; -; CHECK-NEXT: shr.u64 %rd88, %rd119, 63; -; CHECK-NEXT: or.b64 %rd89, %rd87, %rd88; -; CHECK-NEXT: shr.u64 %rd90, %rd118, 63; -; CHECK-NEXT: shl.b64 %rd91, %rd119, 1; -; CHECK-NEXT: or.b64 %rd92, %rd91, %rd90; -; CHECK-NEXT: shl.b64 %rd93, %rd118, 1; -; CHECK-NEXT: or.b64 %rd118, %rd112, %rd93; -; CHECK-NEXT: or.b64 %rd119, %rd109, %rd92; -; CHECK-NEXT: sub.cc.s64 %rd94, %rd35, %rd89; -; CHECK-NEXT: subc.cc.s64 %rd95, %rd36, %rd86; -; CHECK-NEXT: shr.s64 %rd96, %rd95, 63; -; CHECK-NEXT: and.b64 %rd112, %rd96, 1; -; CHECK-NEXT: and.b64 %rd97, %rd96, %rd3; -; CHECK-NEXT: and.b64 %rd98, %rd96, %rd4; -; CHECK-NEXT: sub.cc.s64 %rd116, %rd89, %rd97; -; CHECK-NEXT: subc.cc.s64 %rd117, %rd86, %rd98; -; CHECK-NEXT: add.cc.s64 %rd114, %rd114, -1; -; CHECK-NEXT: addc.cc.s64 %rd115, %rd115, -1; -; CHECK-NEXT: or.b64 %rd99, %rd114, %rd115; -; CHECK-NEXT: setp.eq.b64 %p19, %rd99, 0; +; CHECK-NEXT: shr.u64 %rd40, %rd68, 63; +; CHECK-NEXT: 
shl.b64 %rd41, %rd69, 1; +; CHECK-NEXT: or.b64 %rd42, %rd41, %rd40; +; CHECK-NEXT: shl.b64 %rd43, %rd68, 1; +; CHECK-NEXT: shr.u64 %rd44, %rd71, 63; +; CHECK-NEXT: or.b64 %rd45, %rd43, %rd44; +; CHECK-NEXT: shr.u64 %rd46, %rd70, 63; +; CHECK-NEXT: shl.b64 %rd47, %rd71, 1; +; CHECK-NEXT: or.b64 %rd48, %rd47, %rd46; +; CHECK-NEXT: shl.b64 %rd49, %rd70, 1; +; CHECK-NEXT: or.b64 %rd70, %rd65, %rd49; +; CHECK-NEXT: or.b64 %rd71, %rd64, %rd48; +; CHECK-NEXT: sub.cc.s64 %rd50, %rd6, %rd45; +; CHECK-NEXT: subc.cc.s64 %rd51, %rd7, %rd42; +; CHECK-NEXT: shr.s64 %rd52, %rd51, 63; +; CHECK-NEXT: and.b64 %rd65, %rd52, 1; +; CHECK-NEXT: and.b64 %rd53, %rd52, %rd3; +; CHECK-NEXT: and.b64 %rd54, %rd52, %rd4; +; CHECK-NEXT: sub.cc.s64 %rd68, %rd45, %rd53; +; CHECK-NEXT: subc.cc.s64 %rd69, %rd42, %rd54; +; CHECK-NEXT: add.cc.s64 %rd66, %rd66, -1; +; CHECK-NEXT: addc.cc.s64 %rd67, %rd67, -1; +; CHECK-NEXT: or.b64 %rd55, %rd66, %rd67; +; CHECK-NEXT: setp.eq.b64 %p19, %rd55, 0; ; CHECK-NEXT: @%p19 bra $L__BB4_4; ; CHECK-NEXT: bra.uni $L__BB4_2; ; CHECK-NEXT: $L__BB4_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd100, %rd118, 63; -; CHECK-NEXT: shl.b64 %rd101, %rd119, 1; -; CHECK-NEXT: or.b64 %rd102, %rd101, %rd100; -; CHECK-NEXT: shl.b64 %rd103, %rd118, 1; -; CHECK-NEXT: or.b64 %rd120, %rd112, %rd103; -; CHECK-NEXT: or.b64 %rd121, %rd109, %rd102; +; CHECK-NEXT: shr.u64 %rd56, %rd70, 63; +; CHECK-NEXT: shl.b64 %rd57, %rd71, 1; +; CHECK-NEXT: or.b64 %rd58, %rd57, %rd56; +; CHECK-NEXT: shl.b64 %rd59, %rd70, 1; +; CHECK-NEXT: or.b64 %rd72, %rd65, %rd59; +; CHECK-NEXT: or.b64 %rd73, %rd64, %rd58; ; CHECK-NEXT: $L__BB4_5: // %udiv-end -; CHECK-NEXT: xor.b64 %rd104, %rd120, %rd5; -; CHECK-NEXT: xor.b64 %rd105, %rd121, %rd5; -; CHECK-NEXT: sub.cc.s64 %rd106, %rd104, %rd5; -; CHECK-NEXT: subc.cc.s64 %rd107, %rd105, %rd5; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd106, %rd107}; +; CHECK-NEXT: xor.b64 %rd60, %rd72, %rd5; +; CHECK-NEXT: xor.b64 %rd61, %rd73, %rd5; +; CHECK-NEXT: sub.cc.s64 %rd62, %rd60, %rd5; +; CHECK-NEXT: subc.cc.s64 %rd63, %rd61, %rd5; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd62, %rd63}; ; CHECK-NEXT: ret; %div = sdiv i128 %lhs, %rhs ret i128 %div @@ -444,116 +444,116 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<18>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<107>; +; CHECK-NEXT: .reg .b64 %rd<60>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases -; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [udiv_i128_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd43, %rd44}, [udiv_i128_param_1]; -; CHECK-NEXT: or.b64 %rd45, %rd43, %rd44; -; CHECK-NEXT: setp.eq.b64 %p1, %rd45, 0; -; CHECK-NEXT: or.b64 %rd46, %rd41, %rd42; -; CHECK-NEXT: setp.eq.b64 %p2, %rd46, 0; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [udiv_i128_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [udiv_i128_param_1]; +; CHECK-NEXT: or.b64 %rd7, %rd5, %rd6; +; CHECK-NEXT: setp.eq.b64 %p1, %rd7, 0; +; CHECK-NEXT: or.b64 %rd8, %rd3, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd8, 0; ; CHECK-NEXT: or.pred %p3, %p1, %p2; -; CHECK-NEXT: setp.ne.b64 %p4, %rd44, 0; -; CHECK-NEXT: clz.b64 %r1, %rd44; -; CHECK-NEXT: cvt.u64.u32 %rd47, %r1; -; CHECK-NEXT: clz.b64 %r2, %rd43; -; CHECK-NEXT: cvt.u64.u32 %rd48, %r2; -; CHECK-NEXT: add.s64 %rd49, %rd48, 64; -; CHECK-NEXT: selp.b64 %rd50, %rd47, %rd49, %p4; -; CHECK-NEXT: setp.ne.b64 %p5, %rd42, 0; -; CHECK-NEXT: clz.b64 %r3, %rd42; -; CHECK-NEXT: cvt.u64.u32 %rd51, %r3; -; CHECK-NEXT: clz.b64 %r4, %rd41; -; CHECK-NEXT: 
cvt.u64.u32 %rd52, %r4; -; CHECK-NEXT: add.s64 %rd53, %rd52, 64; -; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5; -; CHECK-NEXT: mov.b64 %rd97, 0; -; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54; -; CHECK-NEXT: subc.cc.s64 %rd57, %rd97, 0; -; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127; -; CHECK-NEXT: setp.eq.b64 %p7, %rd57, 0; +; CHECK-NEXT: setp.ne.b64 %p4, %rd6, 0; +; CHECK-NEXT: clz.b64 %r1, %rd6; +; CHECK-NEXT: cvt.u64.u32 %rd9, %r1; +; CHECK-NEXT: clz.b64 %r2, %rd5; +; CHECK-NEXT: cvt.u64.u32 %rd10, %r2; +; CHECK-NEXT: add.s64 %rd11, %rd10, 64; +; CHECK-NEXT: selp.b64 %rd12, %rd9, %rd11, %p4; +; CHECK-NEXT: setp.ne.b64 %p5, %rd4, 0; +; CHECK-NEXT: clz.b64 %r3, %rd4; +; CHECK-NEXT: cvt.u64.u32 %rd13, %r3; +; CHECK-NEXT: clz.b64 %r4, %rd3; +; CHECK-NEXT: cvt.u64.u32 %rd14, %r4; +; CHECK-NEXT: add.s64 %rd15, %rd14, 64; +; CHECK-NEXT: selp.b64 %rd16, %rd13, %rd15, %p5; +; CHECK-NEXT: mov.b64 %rd51, 0; +; CHECK-NEXT: sub.cc.s64 %rd17, %rd12, %rd16; +; CHECK-NEXT: subc.cc.s64 %rd18, %rd51, 0; +; CHECK-NEXT: setp.gt.u64 %p6, %rd17, 127; +; CHECK-NEXT: setp.eq.b64 %p7, %rd18, 0; ; CHECK-NEXT: and.pred %p8, %p7, %p6; -; CHECK-NEXT: setp.ne.b64 %p9, %rd57, 0; +; CHECK-NEXT: setp.ne.b64 %p9, %rd18, 0; ; CHECK-NEXT: or.pred %p10, %p8, %p9; ; CHECK-NEXT: or.pred %p11, %p3, %p10; -; CHECK-NEXT: xor.b64 %rd58, %rd56, 127; -; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57; -; CHECK-NEXT: setp.eq.b64 %p12, %rd59, 0; -; CHECK-NEXT: selp.b64 %rd106, 0, %rd42, %p11; -; CHECK-NEXT: selp.b64 %rd105, 0, %rd41, %p11; +; CHECK-NEXT: xor.b64 %rd19, %rd17, 127; +; CHECK-NEXT: or.b64 %rd20, %rd19, %rd18; +; CHECK-NEXT: setp.eq.b64 %p12, %rd20, 0; +; CHECK-NEXT: selp.b64 %rd59, 0, %rd4, %p11; +; CHECK-NEXT: selp.b64 %rd58, 0, %rd3, %p11; ; CHECK-NEXT: or.pred %p13, %p11, %p12; ; CHECK-NEXT: @%p13 bra $L__BB5_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd99, %rd56, 1; -; CHECK-NEXT: addc.cc.s64 %rd100, %rd57, 0; -; CHECK-NEXT: or.b64 %rd62, %rd99, %rd100; -; CHECK-NEXT: setp.eq.b64 %p14, %rd62, 0; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd56; +; CHECK-NEXT: add.cc.s64 %rd52, %rd17, 1; +; CHECK-NEXT: addc.cc.s64 %rd53, %rd18, 0; +; CHECK-NEXT: or.b64 %rd21, %rd52, %rd53; +; CHECK-NEXT: setp.eq.b64 %p14, %rd21, 0; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd17; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6; +; CHECK-NEXT: shl.b64 %rd22, %rd4, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7; -; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64; +; CHECK-NEXT: shr.u64 %rd23, %rd3, %r7; +; CHECK-NEXT: or.b64 %rd24, %rd22, %rd23; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8; +; CHECK-NEXT: shl.b64 %rd25, %rd3, %r8; ; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63; -; CHECK-NEXT: selp.b64 %rd104, %rd66, %rd65, %p15; -; CHECK-NEXT: shl.b64 %rd103, %rd41, %r6; -; CHECK-NEXT: mov.b64 %rd94, %rd97; +; CHECK-NEXT: selp.b64 %rd57, %rd25, %rd24, %p15; +; CHECK-NEXT: shl.b64 %rd56, %rd3, %r6; +; CHECK-NEXT: mov.b64 %rd50, %rd51; ; CHECK-NEXT: @%p14 bra $L__BB5_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd99; -; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd52; +; CHECK-NEXT: shr.u64 %rd26, %rd3, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10; -; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70; +; CHECK-NEXT: shl.b64 %rd27, %rd4, %r10; +; CHECK-NEXT: or.b64 %rd28, %rd26, %rd27; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11; +; 
CHECK-NEXT: shr.u64 %rd29, %rd4, %r11; ; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63; -; CHECK-NEXT: selp.b64 %rd101, %rd72, %rd71, %p16; -; CHECK-NEXT: shr.u64 %rd102, %rd42, %r9; -; CHECK-NEXT: add.cc.s64 %rd33, %rd43, -1; -; CHECK-NEXT: addc.cc.s64 %rd34, %rd44, -1; -; CHECK-NEXT: mov.b64 %rd94, 0; -; CHECK-NEXT: mov.b64 %rd97, %rd94; +; CHECK-NEXT: selp.b64 %rd54, %rd29, %rd28, %p16; +; CHECK-NEXT: shr.u64 %rd55, %rd4, %r9; +; CHECK-NEXT: add.cc.s64 %rd1, %rd5, -1; +; CHECK-NEXT: addc.cc.s64 %rd2, %rd6, -1; +; CHECK-NEXT: mov.b64 %rd50, 0; +; CHECK-NEXT: mov.b64 %rd51, %rd50; ; CHECK-NEXT: $L__BB5_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd73, %rd101, 63; -; CHECK-NEXT: shl.b64 %rd74, %rd102, 1; -; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73; -; CHECK-NEXT: shl.b64 %rd76, %rd101, 1; -; CHECK-NEXT: shr.u64 %rd77, %rd104, 63; -; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77; -; CHECK-NEXT: shr.u64 %rd79, %rd103, 63; -; CHECK-NEXT: shl.b64 %rd80, %rd104, 1; -; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79; -; CHECK-NEXT: shl.b64 %rd82, %rd103, 1; -; CHECK-NEXT: or.b64 %rd103, %rd97, %rd82; -; CHECK-NEXT: or.b64 %rd104, %rd94, %rd81; -; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78; -; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75; -; CHECK-NEXT: shr.s64 %rd85, %rd84, 63; -; CHECK-NEXT: and.b64 %rd97, %rd85, 1; -; CHECK-NEXT: and.b64 %rd86, %rd85, %rd43; -; CHECK-NEXT: and.b64 %rd87, %rd85, %rd44; -; CHECK-NEXT: sub.cc.s64 %rd101, %rd78, %rd86; -; CHECK-NEXT: subc.cc.s64 %rd102, %rd75, %rd87; -; CHECK-NEXT: add.cc.s64 %rd99, %rd99, -1; -; CHECK-NEXT: addc.cc.s64 %rd100, %rd100, -1; -; CHECK-NEXT: or.b64 %rd88, %rd99, %rd100; -; CHECK-NEXT: setp.eq.b64 %p17, %rd88, 0; +; CHECK-NEXT: shr.u64 %rd30, %rd54, 63; +; CHECK-NEXT: shl.b64 %rd31, %rd55, 1; +; CHECK-NEXT: or.b64 %rd32, %rd31, %rd30; +; CHECK-NEXT: shl.b64 %rd33, %rd54, 1; +; CHECK-NEXT: shr.u64 %rd34, %rd57, 63; +; CHECK-NEXT: or.b64 %rd35, %rd33, %rd34; +; CHECK-NEXT: shr.u64 %rd36, %rd56, 63; +; CHECK-NEXT: shl.b64 %rd37, %rd57, 1; +; CHECK-NEXT: or.b64 %rd38, %rd37, %rd36; +; CHECK-NEXT: shl.b64 %rd39, %rd56, 1; +; CHECK-NEXT: or.b64 %rd56, %rd51, %rd39; +; CHECK-NEXT: or.b64 %rd57, %rd50, %rd38; +; CHECK-NEXT: sub.cc.s64 %rd40, %rd1, %rd35; +; CHECK-NEXT: subc.cc.s64 %rd41, %rd2, %rd32; +; CHECK-NEXT: shr.s64 %rd42, %rd41, 63; +; CHECK-NEXT: and.b64 %rd51, %rd42, 1; +; CHECK-NEXT: and.b64 %rd43, %rd42, %rd5; +; CHECK-NEXT: and.b64 %rd44, %rd42, %rd6; +; CHECK-NEXT: sub.cc.s64 %rd54, %rd35, %rd43; +; CHECK-NEXT: subc.cc.s64 %rd55, %rd32, %rd44; +; CHECK-NEXT: add.cc.s64 %rd52, %rd52, -1; +; CHECK-NEXT: addc.cc.s64 %rd53, %rd53, -1; +; CHECK-NEXT: or.b64 %rd45, %rd52, %rd53; +; CHECK-NEXT: setp.eq.b64 %p17, %rd45, 0; ; CHECK-NEXT: @%p17 bra $L__BB5_4; ; CHECK-NEXT: bra.uni $L__BB5_2; ; CHECK-NEXT: $L__BB5_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd89, %rd103, 63; -; CHECK-NEXT: shl.b64 %rd90, %rd104, 1; -; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; -; CHECK-NEXT: shl.b64 %rd92, %rd103, 1; -; CHECK-NEXT: or.b64 %rd105, %rd97, %rd92; -; CHECK-NEXT: or.b64 %rd106, %rd94, %rd91; +; CHECK-NEXT: shr.u64 %rd46, %rd56, 63; +; CHECK-NEXT: shl.b64 %rd47, %rd57, 1; +; CHECK-NEXT: or.b64 %rd48, %rd47, %rd46; +; CHECK-NEXT: shl.b64 %rd49, %rd56, 1; +; CHECK-NEXT: or.b64 %rd58, %rd51, %rd49; +; CHECK-NEXT: or.b64 %rd59, %rd50, %rd48; ; CHECK-NEXT: $L__BB5_5: // %udiv-end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd105, %rd106}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd58, %rd59}; ; CHECK-NEXT: ret; 
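; The regenerated checks for urem_i128, sdiv_i128, and udiv_i128 above do not
; change the shape of the expansion; they renumber the virtual registers to
; match the smaller register budget (.reg .b64 %rd<60> instead of %rd<107>
; for udiv_i128). The common shape is a shift-and-subtract long-division loop
; in the %udiv-do-while block, entered once %_udiv-special-cases filters out
; the degenerate inputs. As a sketch of the algorithm those CHECK lines
; encode (illustrative C, all names invented here, not the emitted PTX):
;
;   unsigned __int128 q = 0, r = 0;
;   for (int i = 127; i >= 0; --i) {
;     int carry = (int)(r >> 127);        // bit shifted out of r; the PTX
;     r = (r << 1) | ((n >> i) & 1);      // loop tracks it via sub.cc/subc.cc
;     if (carry || r >= d) { r -= d; q |= (unsigned __int128)1 << i; }
;   }
;
; udiv_i128 returns q; urem_i128 multiplies q back and subtracts it from the
; dividend (the mul.hi/mad.lo/sub.cc tail above); sdiv_i128 negates negative
; operands up front and fixes the result sign at the end (the xor/sub by the
; sign mask %rd5).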
%div = udiv i128 %lhs, %rhs ret i128 %div diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll index 74136bb..5d40192 100644 --- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes COMMON,I16x2 %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_90 %{ \ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_90 \ @@ -12,7 +12,7 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes COMMON,NO-I16x2 %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_53 %{ \ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_53 \ @@ -635,7 +635,7 @@ declare <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b) #0 define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-LABEL: test_call( ; COMMON: { -; COMMON-NEXT: .reg .b32 %r<5>; +; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.b32 %r2, [test_call_param_1]; @@ -658,7 +658,7 @@ define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 { define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-LABEL: test_call_flipped( ; COMMON: { -; COMMON-NEXT: .reg .b32 %r<5>; +; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; @@ -681,7 +681,7 @@ define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 { define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-LABEL: test_tailcall_flipped( ; COMMON: { -; COMMON-NEXT: .reg .b32 %r<5>; +; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; diff --git a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll index 98f94bb..db19495 100644 --- a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll @@ -69,7 +69,7 @@ define <2 x i8> @test_bitcast_i16_2xi8(i16 %a) { define <2 x i8> @test_call_2xi8(<2 x i8> %a) { ; O0-LABEL: test_call_2xi8( ; O0: { -; O0-NEXT: .reg .b16 %rs<7>; +; O0-NEXT: .reg .b16 %rs<5>; ; O0-NEXT: .reg .b32 %r<2>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: @@ -87,7 +87,7 @@ define <2 x i8> @test_call_2xi8(<2 x i8> %a) { ; ; O3-LABEL: test_call_2xi8( ; O3: { -; O3-NEXT: .reg .b16 %rs<7>; +; O3-NEXT: .reg .b16 %rs<5>; ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_call_2xi8_param_0]; @@ -103,5 +103,149 @@ define <2 x i8> @test_call_2xi8(<2 x i8> %a) { %res = call <2 x i8> @test_call_2xi8(<2 x i8> %a) ret <2 x i8> %res } + +define <2 x float> @test_uitofp_2xi8(<2 x i8> %a) { +; O0-LABEL: test_uitofp_2xi8( +; O0: { +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_uitofp_2xi8_param_0]; +; O0-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; O0-NEXT: cvt.rn.f32.u16 %r2, %rs2; +; O0-NEXT: 
cvt.rn.f32.u16 %r3, %rs1; +; O0-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; +; O0-NEXT: ret; +; +; O3-LABEL: test_uitofp_2xi8( +; O3: { +; O3-NEXT: .reg .b16 %rs<3>; +; O3-NEXT: .reg .b32 %r<3>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_uitofp_2xi8_param_0]; +; O3-NEXT: cvt.rn.f32.u16 %r1, %rs2; +; O3-NEXT: cvt.rn.f32.u16 %r2, %rs1; +; O3-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; O3-NEXT: ret; + %1 = uitofp <2 x i8> %a to <2 x float> + ret <2 x float> %1 +} + +define void @test_store_i8x2_unaligned(ptr %ptr, <2 x i8> %a) { +; O0-LABEL: test_store_i8x2_unaligned( +; O0: { +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b32 %r<2>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [test_store_i8x2_unaligned_param_0]; +; O0-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_store_i8x2_unaligned_param_1]; +; O0-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; O0-NEXT: st.b8 [%rd1+1], %rs2; +; O0-NEXT: st.b8 [%rd1], %rs1; +; O0-NEXT: ret; +; +; O3-LABEL: test_store_i8x2_unaligned( +; O3: { +; O3-NEXT: .reg .b16 %rs<3>; +; O3-NEXT: .reg .b64 %rd<2>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_store_i8x2_unaligned_param_0]; +; O3-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_store_i8x2_unaligned_param_1]; +; O3-NEXT: st.b8 [%rd1+1], %rs2; +; O3-NEXT: st.b8 [%rd1], %rs1; +; O3-NEXT: ret; + store <2 x i8> %a, ptr %ptr, align 1 + ret void +} + +define void @test_store_i8x2_unaligned_immediate(ptr %ptr) { +; O0-LABEL: test_store_i8x2_unaligned_immediate( +; O0: { +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [test_store_i8x2_unaligned_immediate_param_0]; +; O0-NEXT: st.b8 [%rd1+1], 2; +; O0-NEXT: st.b8 [%rd1], 1; +; O0-NEXT: ret; +; +; O3-LABEL: test_store_i8x2_unaligned_immediate( +; O3: { +; O3-NEXT: .reg .b64 %rd<2>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_store_i8x2_unaligned_immediate_param_0]; +; O3-NEXT: st.b8 [%rd1+1], 2; +; O3-NEXT: st.b8 [%rd1], 1; +; O3-NEXT: ret; + store <2 x i8> <i8 1, i8 2>, ptr %ptr, align 1 + ret void +} + +define i32 @test_zext_load_i8x2_unaligned(ptr %ptr) { +; O0-LABEL: test_zext_load_i8x2_unaligned( +; O0: { +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [test_zext_load_i8x2_unaligned_param_0]; +; O0-NEXT: ld.b8 %rs1, [%rd1+1]; +; O0-NEXT: ld.b8 %rs2, [%rd1]; +; O0-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; O0-NEXT: ret; +; +; O3-LABEL: test_zext_load_i8x2_unaligned( +; O3: { +; O3-NEXT: .reg .b16 %rs<3>; +; O3-NEXT: .reg .b64 %rd<2>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_zext_load_i8x2_unaligned_param_0]; +; O3-NEXT: ld.b8 %rs1, [%rd1+1]; +; O3-NEXT: ld.b8 %rs2, [%rd1]; +; O3-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; O3-NEXT: ret; + %a = load <2 x i8>, ptr %ptr, align 1 + %b = zext <2 x i8> %a to <2 x i16> + %c = bitcast <2 x i16> %b to i32 + ret i32 %c +} + +define i32 @test_sext_load_i8x2_unaligned(ptr %ptr) { +; O0-LABEL: test_sext_load_i8x2_unaligned( +; O0: { +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [test_sext_load_i8x2_unaligned_param_0]; +; O0-NEXT: ld.s8 %rs1, [%rd1+1]; +; O0-NEXT: ld.s8 %rs2, [%rd1]; +; O0-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; O0-NEXT: ret; +; +; O3-LABEL: test_sext_load_i8x2_unaligned( +; O3: { +; O3-NEXT: .reg .b16 
%rs<3>; +; O3-NEXT: .reg .b64 %rd<2>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_sext_load_i8x2_unaligned_param_0]; +; O3-NEXT: ld.s8 %rs1, [%rd1+1]; +; O3-NEXT: ld.s8 %rs2, [%rd1]; +; O3-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; O3-NEXT: ret; + %a = load <2 x i8>, ptr %ptr, align 1 + %b = sext <2 x i8> %a to <2 x i16> + %c = bitcast <2 x i16> %b to i32 + ret i32 %c +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; COMMON: {{.*}} diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 26336b8..40d6a07 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -1298,7 +1298,7 @@ declare <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b) #0 define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { ; O0-LABEL: test_call( ; O0: { -; O0-NEXT: .reg .b32 %r<5>; +; O0-NEXT: .reg .b32 %r<4>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: ; O0-NEXT: ld.param.b32 %r2, [test_call_param_1]; @@ -1317,7 +1317,7 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { ; ; O3-LABEL: test_call( ; O3: { -; O3-NEXT: .reg .b32 %r<5>; +; O3-NEXT: .reg .b32 %r<4>; ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.b32 %r1, [test_call_param_0]; @@ -1340,7 +1340,7 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; O0-LABEL: test_call_flipped( ; O0: { -; O0-NEXT: .reg .b32 %r<5>; +; O0-NEXT: .reg .b32 %r<4>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: ; O0-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; @@ -1359,7 +1359,7 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; ; O3-LABEL: test_call_flipped( ; O3: { -; O3-NEXT: .reg .b32 %r<5>; +; O3-NEXT: .reg .b32 %r<4>; ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0]; @@ -1382,7 +1382,7 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; O0-LABEL: test_tailcall_flipped( ; O0: { -; O0-NEXT: .reg .b32 %r<5>; +; O0-NEXT: .reg .b32 %r<4>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: ; O0-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; @@ -1401,7 +1401,7 @@ define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; ; O3-LABEL: test_tailcall_flipped( ; O3: { -; O3-NEXT: .reg .b32 %r<5>; +; O3-NEXT: .reg .b32 %r<4>; ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/idioms.ll b/llvm/test/CodeGen/NVPTX/idioms.ll index a3bf892..87c5ab2 100644 --- a/llvm/test/CodeGen/NVPTX/idioms.ll +++ b/llvm/test/CodeGen/NVPTX/idioms.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} %struct.S16 = type { i16, i16 } diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll index 782e672..e1fecdb 100644 --- a/llvm/test/CodeGen/NVPTX/indirect_byval.ll +++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < 
%s -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %} target triple = "nvptx64-nvidia-cuda" @@ -16,8 +16,8 @@ define internal i32 @foo() { ; CHECK-NEXT: .reg .b64 %SP; ; CHECK-NEXT: .reg .b64 %SPL; ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: mov.b64 %SPL, __local_depot0; @@ -29,8 +29,8 @@ define internal i32 @foo() { ; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: add.u64 %rd2, %SP, 0; ; CHECK-NEXT: st.param.b64 [param1], %rd2; -; CHECK-NEXT: add.u64 %rd4, %SPL, 1; -; CHECK-NEXT: ld.local.b8 %rs1, [%rd4]; +; CHECK-NEXT: add.u64 %rd3, %SPL, 1; +; CHECK-NEXT: ld.local.b8 %rs1, [%rd3]; ; CHECK-NEXT: st.param.b8 [param0], %rs1; ; CHECK-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _); ; CHECK-NEXT: call (retval0), %rd1, (param0, param1), prototype_0; @@ -53,8 +53,8 @@ define internal i32 @bar() { ; CHECK-NEXT: .local .align 8 .b8 __local_depot1[16]; ; CHECK-NEXT: .reg .b64 %SP; ; CHECK-NEXT: .reg .b64 %SPL; -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: mov.b64 %SPL, __local_depot1; @@ -66,9 +66,9 @@ define internal i32 @bar() { ; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: add.u64 %rd2, %SP, 0; ; CHECK-NEXT: st.param.b64 [param1], %rd2; -; CHECK-NEXT: add.u64 %rd4, %SPL, 8; -; CHECK-NEXT: ld.local.b64 %rd5, [%rd4]; -; CHECK-NEXT: st.param.b64 [param0], %rd5; +; CHECK-NEXT: add.u64 %rd3, %SPL, 8; +; CHECK-NEXT: ld.local.b64 %rd4, [%rd3]; +; CHECK-NEXT: st.param.b64 [param0], %rd4; ; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _); ; CHECK-NEXT: call (retval0), %rd1, (param0, param1), prototype_1; ; CHECK-NEXT: ld.param.b32 %r1, [retval0]; diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll index 307e2c8..fd8aeff 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-8.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll index 52bd51b..e4ca0cb7 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-8.3 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll index 037d7df..02a75d5 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-8.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-nvidia-cuda" @@ -11,32 +11,32 @@ define void @test_b128_in_loop() { ; CHECK-LABEL: test_b128_in_loop( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b64 %rd<15>; -; CHECK-NEXT: .reg .b128 %rq<3>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b128 %rq<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.global.s32 %rd1, [size]; ; CHECK-NEXT: setp.eq.b64 %p1, %rd1, 0; ; CHECK-NEXT: @%p1 bra $L__BB0_3; ; CHECK-NEXT: // %bb.1: // %BB1 -; CHECK-NEXT: ld.global.v2.b64 {%rd12, %rd13}, [x]; -; CHECK-NEXT: mov.b64 %rd14, 0; +; CHECK-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [x]; +; CHECK-NEXT: mov.b64 %rd4, 0; ; CHECK-NEXT: $L__BB0_2: // %BB2 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov.b128 %rq1, {%rd12, %rd13}; +; CHECK-NEXT: mov.b128 %rq1, {%rd2, %rd3}; ; CHECK-NEXT: // begin inline asm ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b64 lo; ; CHECK-NEXT: .reg .b64 hi; ; CHECK-NEXT: mov.b128 {lo, hi}, %rq1; -; CHECK-NEXT: add.cc.u64 lo, lo, %rd14; +; CHECK-NEXT: add.cc.u64 lo, lo, %rd4; ; CHECK-NEXT: mov.b128 %rq1, {lo, hi}; ; CHECK-NEXT: } ; CHECK-NEXT: // end inline asm -; CHECK-NEXT: mov.b128 {%rd12, %rd13}, %rq1; -; CHECK-NEXT: st.global.v2.b64 [x], {%rd12, %rd13}; -; CHECK-NEXT: add.s64 %rd14, %rd14, 1; -; CHECK-NEXT: setp.ne.b64 %p2, %rd1, %rd14; +; CHECK-NEXT: mov.b128 {%rd2, %rd3}, %rq1; +; CHECK-NEXT: st.global.v2.b64 [x], {%rd2, %rd3}; +; CHECK-NEXT: add.s64 %rd4, %rd4, 1; +; CHECK-NEXT: setp.ne.b64 %p2, %rd1, %rd4; ; CHECK-NEXT: @%p2 bra $L__BB0_2; ; CHECK-NEXT: $L__BB0_3: // %BB3 ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll index f595df83..01cdacb 100644 --- a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} define ptx_device i32 @test_tid_x() { diff --git a/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll b/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll index a7ab358..e2a01dc 100644 --- a/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} +; RUN: %if 
ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} ; CHECK-LABEL: test_isspacep define i1 @test_isspacep_shared_cluster(ptr %p) { diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll index 4ed5063..00eb8e2 100644 --- a/llvm/test/CodeGen/NVPTX/intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 | FileCheck %s --check-prefixes=CHECK,CHECK32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s --check-prefixes=CHECK,CHECK64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify %} +; RUN: %if ptxas-sm_60 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas-sm_60 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} define float @test_fabsf(float %f) { ; CHECK-LABEL: test_fabsf( @@ -267,6 +267,23 @@ define i64 @test_globaltimer() { ret i64 %ret } +define i32 @test_globaltimer_lo(){ +; CHECK-LABEL: test_globaltimer_lo( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.u32 %r1, %globaltimer_lo; +; CHECK-NEXT: mov.u32 %r2, %globaltimer_lo; +; CHECK-NEXT: add.s32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %a = tail call i32 @llvm.nvvm.read.ptx.sreg.globaltimer.lo() + %b = tail call i32 @llvm.nvvm.read.ptx.sreg.globaltimer.lo() + %ret = add i32 %a, %b + ret i32 %ret +} + define i64 @test_cyclecounter() { ; CHECK-LABEL: test_cyclecounter( ; CHECK: { diff --git a/llvm/test/CodeGen/NVPTX/jump-table.ll b/llvm/test/CodeGen/NVPTX/jump-table.ll index a623835..4620c5e 100644 --- a/llvm/test/CodeGen/NVPTX/jump-table.ll +++ b/llvm/test/CodeGen/NVPTX/jump-table.ll @@ -10,11 +10,11 @@ define void @foo(i32 %i) { ; CHECK-LABEL: foo( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; -; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.b32 %r2, [foo_param_0]; -; CHECK-NEXT: setp.gt.u32 %p1, %r2, 3; +; CHECK-NEXT: ld.param.b32 %r1, [foo_param_0]; +; CHECK-NEXT: setp.gt.u32 %p1, %r1, 3; ; CHECK-NEXT: @%p1 bra $L__BB0_6; ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: $L_brx_0: .branchtargets @@ -22,7 +22,7 @@ define void @foo(i32 %i) { ; CHECK-NEXT: $L__BB0_3, ; CHECK-NEXT: $L__BB0_4, ; CHECK-NEXT: $L__BB0_5; -; CHECK-NEXT: brx.idx %r2, $L_brx_0; +; CHECK-NEXT: brx.idx %r1, $L_brx_0; ; CHECK-NEXT: $L__BB0_2: // %case0 ; CHECK-NEXT: st.global.b32 [out], 0; ; CHECK-NEXT: bra.uni $L__BB0_6; diff --git a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll index a56b85d..b66b843 100644 --- a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll +++ b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas -arch=sm_60 - %} +; RUN: %if ptxas-sm_60 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas -arch=sm_60 - %} %struct.Large = type { [16 x double] } diff --git a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll index 24071b4..c3fd288 100644 --- a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll +++ 
b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G32,LS32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G64,LS64 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s --check-prefixes=G64,LS32 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %} diff --git a/llvm/test/CodeGen/NVPTX/ld-generic.ll b/llvm/test/CodeGen/NVPTX/ld-generic.ll index ee304ca..628fb49 100644 --- a/llvm/test/CodeGen/NVPTX/ld-generic.ll +++ b/llvm/test/CodeGen/NVPTX/ld-generic.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} diff --git a/llvm/test/CodeGen/NVPTX/ld-param-sink.ll b/llvm/test/CodeGen/NVPTX/ld-param-sink.ll index 03523a3..dfb0e80 100644 --- a/llvm/test/CodeGen/NVPTX/ld-param-sink.ll +++ b/llvm/test/CodeGen/NVPTX/ld-param-sink.ll @@ -12,7 +12,7 @@ define ptr @foo(i1 %cond) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; ; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.b8 %rs1, [foo_param_0]; @@ -21,14 +21,14 @@ define ptr @foo(i1 %cond) { ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .b64 retval0; ; CHECK-NEXT: call.uni (retval0), baz, (); -; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-NEXT: ld.param.b64 %rd1, [retval0]; ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: @%p1 bra $L__BB0_2; ; CHECK-NEXT: // %bb.1: // %bb ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .b64 param0; ; CHECK-NEXT: .param .b64 retval0; -; CHECK-NEXT: st.param.b64 [param0], %rd2; +; CHECK-NEXT: st.param.b64 [param0], %rd1; ; CHECK-NEXT: call.uni (retval0), bar, (param0); ; CHECK-NEXT: } // callseq 1 ; CHECK-NEXT: $L__BB0_2: // %common.ret diff --git a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py index 2fa4c89..4b566b2 100644 --- a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py +++ b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py @@ -4,7 +4,7 @@ # RUN: %python %s > %t.ll # RUN: llc < %t.ll -mtriple=nvptx -mcpu=sm_30 | FileCheck -check-prefixes=CHECK,CHECK_P32 %t.ll # RUN: llc < %t.ll -mtriple=nvptx64 -mcpu=sm_30 | FileCheck -check-prefixes=CHECK,CHECK_P64 %t.ll -# RUN: %if ptxas && !ptxas-12.0 %{ llc < %t.ll -mtriple=nvptx -mcpu=sm_30 | %ptxas-verify %} +# RUN: %if ptxas-ptr32 %{ llc < %t.ll -mtriple=nvptx -mcpu=sm_30 | %ptxas-verify %} # RUN: %if ptxas %{ llc < %t.ll -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %} from __future__ import print_function diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll index 6e42e00..d219493 100644 --- a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll +++ 
b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 -verify-machineinstrs | FileCheck %s -check-prefixes=SM90 -; RUN: %if ptxas-12.9 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 -verify-machineinstrs | FileCheck %s -check-prefixes=SM100 -; RUN: %if ptxas-12.9 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} ; For 256-bit vectors, check that invariant loads from the ; global addrspace are lowered to ld.global.nc. diff --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll index efa2666..cb1d126 100644 --- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll +++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll @@ -7,16 +7,17 @@ declare <4 x float> @bar() define void @foo(ptr %ptr) { ; CHECK-LABEL: foo( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 16 .b8 retval0[16]; ; CHECK-NEXT: call.uni (retval0), bar, (); -; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [retval0]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [retval0]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: st.v2.b64 [%rd1], {%rd2, %rd3}; +; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; %val = tail call <4 x float> @bar() store <4 x float> %val, ptr %ptr diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll index 187ccc9..12e3287 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=PTX -; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} ; In this test, we check that all the addressing modes are lowered correctly ; for 256-bit invariant loads, which get lowered to ld.global.nc diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll index a17df1e..b7fa1dd 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=PTX -; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 
%} ; In this test, we check that all the addressing modes are lowered correctly, ; addr can be any of the following: diff --git a/llvm/test/CodeGen/NVPTX/load-store-atomic.err.ll b/llvm/test/CodeGen/NVPTX/load-store-atomic.err.ll new file mode 100644 index 0000000..31889e2 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/load-store-atomic.err.ll @@ -0,0 +1,10 @@ +; RUN: not llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 2>&1 | FileCheck %s + +; CHECK: error: unsupported atomic store +; CHECK: error: unsupported atomic load + +define void @test_i256_global_atomic(ptr addrspace(1) %a, ptr addrspace(1) %b) { + %a.load = load atomic i256, ptr addrspace(1) %a seq_cst, align 32 + store atomic i256 %a.load, ptr addrspace(1) %b seq_cst, align 32 + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/load-store-scalars.ll b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll index bac59be5..09c18b6 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-scalars.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck -check-prefixes=CHECK,SM60 %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70 -; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-8.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} ; TODO: generate PTX that preserves Concurrent Forward Progress ; for atomic operations to local statespace diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll index 2ffefd0..7373b50 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-8.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} ; TODO: fix "atomic load volatile acquire": generates "ld.acquire.sys;" ; but should generate "ld.mmio.relaxed.sys; fence.acq_rel.sys;" diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll index ed170e9..5e85e98 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s -; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} ; TODO: fix "atomic load volatile acquire": generates "ld.acquire.sys;" ; but should generate "ld.mmio.relaxed.sys; fence.acq_rel.sys;" diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll index 68c53cd..e8b43ad 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck -check-prefixes=CHECK,SM90 %s -; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.7 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK,SM100 -; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} ; This test is based on load-store-vectors.ll, ; and contains testing for lowering 256-bit vector loads/stores @@ -137,18 +137,32 @@ define void @generic_4xi64(ptr %a, ptr %b) { } define void @generic_8xfloat(ptr %a, ptr %b) { -; CHECK-LABEL: generic_8xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<7>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0]; -; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd6, [generic_8xfloat_param_1]; -; CHECK-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3}; -; CHECK-NEXT: ret; +; SM90-LABEL: generic_8xfloat( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0]; +; SM90-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [generic_8xfloat_param_1]; +; SM90-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: generic_8xfloat( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0]; +; SM100-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [generic_8xfloat_param_1]; +; SM100-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load <8 x float>, ptr %a store <8 x float> %a.load, ptr %b ret void @@ -288,18 +302,32 @@ define void @generic_volatile_4xi64(ptr %a, ptr %b) { } define void @generic_volatile_8xfloat(ptr %a, ptr %b) { -; CHECK-LABEL: generic_volatile_8xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<7>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0]; -; CHECK-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd6, [generic_volatile_8xfloat_param_1]; -; CHECK-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3}; -; CHECK-NEXT: ret; +; SM90-LABEL: generic_volatile_8xfloat( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0]; +; SM90-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [generic_volatile_8xfloat_param_1]; +; SM90-NEXT: st.volatile.v4.b32 
[%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: generic_volatile_8xfloat( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0]; +; SM100-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [generic_volatile_8xfloat_param_1]; +; SM100-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load volatile <8 x float>, ptr %a store volatile <8 x float> %a.load, ptr %b ret void @@ -514,15 +542,16 @@ define void @global_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) { define void @global_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-LABEL: global_8xfloat( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<7>; +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b64 %rd1, [global_8xfloat_param_0]; -; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1]; -; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16]; -; SM90-NEXT: ld.param.b64 %rd6, [global_8xfloat_param_1]; -; SM90-NEXT: st.global.v2.b64 [%rd6+16], {%rd4, %rd5}; -; SM90-NEXT: st.global.v2.b64 [%rd6], {%rd2, %rd3}; +; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [global_8xfloat_param_1]; +; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_8xfloat( @@ -758,15 +787,16 @@ define void @global_volatile_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) { define void @global_volatile_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-LABEL: global_volatile_8xfloat( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<7>; +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_8xfloat_param_0]; -; SM90-NEXT: ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1]; -; SM90-NEXT: ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16]; -; SM90-NEXT: ld.param.b64 %rd6, [global_volatile_8xfloat_param_1]; -; SM90-NEXT: st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5}; -; SM90-NEXT: st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3}; +; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_8xfloat_param_1]; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_volatile_8xfloat( @@ -931,18 +961,32 @@ define void @shared_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) { } define void @shared_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { -; CHECK-LABEL: shared_8xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<7>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0]; -; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd6, [shared_8xfloat_param_1]; -; CHECK-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3}; 
-; CHECK-NEXT: ret; +; SM90-LABEL: shared_8xfloat( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0]; +; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [shared_8xfloat_param_1]; +; SM90-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: shared_8xfloat( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0]; +; SM100-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [shared_8xfloat_param_1]; +; SM100-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load <8 x float>, ptr addrspace(3) %a store <8 x float> %a.load, ptr addrspace(3) %b ret void @@ -1082,18 +1126,32 @@ define void @shared_volatile_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) { } define void @shared_volatile_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { -; CHECK-LABEL: shared_volatile_8xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<7>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0]; -; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd6, [shared_volatile_8xfloat_param_1]; -; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3}; -; CHECK-NEXT: ret; +; SM90-LABEL: shared_volatile_8xfloat( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0]; +; SM90-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [shared_volatile_8xfloat_param_1]; +; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: shared_volatile_8xfloat( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0]; +; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [shared_volatile_8xfloat_param_1]; +; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load volatile <8 x float>, ptr addrspace(3) %a store volatile <8 x float> %a.load, ptr addrspace(3) %b ret void @@ -1235,18 +1293,32 @@ define void @local_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) { } define void @local_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { -; CHECK-LABEL: local_8xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<7>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0]; -; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.local.v2.b64 
-; CHECK-NEXT: ld.param.b64 %rd6, [local_8xfloat_param_1];
-; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
-; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
-; CHECK-NEXT: ret;
+; SM90-LABEL: local_8xfloat(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0];
+; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [local_8xfloat_param_1];
+; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: local_8xfloat(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0];
+; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM100-NEXT: ld.param.b64 %rd6, [local_8xfloat_param_1];
+; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
+; SM100-NEXT: ret;
 %a.load = load <8 x float>, ptr addrspace(5) %a
 store <8 x float> %a.load, ptr addrspace(5) %b
 ret void
@@ -1386,18 +1458,32 @@ define void @local_volatile_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 }
 
 define void @local_volatile_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
-; CHECK-LABEL: local_volatile_8xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0];
-; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
-; CHECK-NEXT: ld.param.b64 %rd6, [local_volatile_8xfloat_param_1];
-; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
-; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
-; CHECK-NEXT: ret;
+; SM90-LABEL: local_volatile_8xfloat(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0];
+; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [local_volatile_8xfloat_param_1];
+; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: local_volatile_8xfloat(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0];
+; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM100-NEXT: ld.param.b64 %rd6, [local_volatile_8xfloat_param_1];
+; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
+; SM100-NEXT: ret;
 %a.load = load volatile <8 x float>, ptr addrspace(5) %a
 store volatile <8 x float> %a.load, ptr addrspace(5) %b
 ret void
@@ -1420,3 +1506,98 @@ define void @local_volatile_4xdouble(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 store volatile <4 x double> %a.load, ptr addrspace(5) %b
 ret void
 }
+
+define void @test_i256_global(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; SM90-LABEL: test_i256_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<7>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [test_i256_global_param_0];
+; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd6, [test_i256_global_param_1];
+; SM90-NEXT: st.global.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM90-NEXT: st.global.v2.b64 [%rd6], {%rd2, %rd3};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: test_i256_global(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [test_i256_global_param_0];
+; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; SM100-NEXT: ld.param.b64 %rd6, [test_i256_global_param_1];
+; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
+; SM100-NEXT: ret;
+ %a.load = load i256, ptr addrspace(1) %a, align 32
+ store i256 %a.load, ptr addrspace(1) %b, align 32
+ ret void
+}
+
+
+define void @test_i256_global_unaligned(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; CHECK-LABEL: test_i256_global_unaligned(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_i256_global_unaligned_param_0];
+; CHECK-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; CHECK-NEXT: ld.param.b64 %rd6, [test_i256_global_unaligned_param_1];
+; CHECK-NEXT: st.global.v2.b64 [%rd6+16], {%rd4, %rd5};
+; CHECK-NEXT: st.global.v2.b64 [%rd6], {%rd2, %rd3};
+; CHECK-NEXT: ret;
+ %a.load = load i256, ptr addrspace(1) %a, align 16
+ store i256 %a.load, ptr addrspace(1) %b, align 16
+ ret void
+}
+
+define void @test_i256_generic(ptr %a, ptr %b) {
+; CHECK-LABEL: test_i256_generic(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_i256_generic_param_0];
+; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; CHECK-NEXT: ld.param.b64 %rd6, [test_i256_generic_param_1];
+; CHECK-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5};
+; CHECK-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3};
+; CHECK-NEXT: ret;
+ %a.load = load i256, ptr %a, align 32
+ store i256 %a.load, ptr %b, align 32
+ ret void
+}
+
+define void @test_i256_global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; SM90-LABEL: test_i256_global_volatile(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<7>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [test_i256_global_volatile_param_0];
+; SM90-NEXT: ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM90-NEXT: ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd6, [test_i256_global_volatile_param_1];
+; SM90-NEXT: st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM90-NEXT: st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: test_i256_global_volatile(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [test_i256_global_volatile_param_0];
+; SM100-NEXT: ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; SM100-NEXT: ld.param.b64 %rd6, [test_i256_global_volatile_param_1];
+; SM100-NEXT: st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
+; SM100-NEXT: ret;
+ %a.load = load volatile i256, ptr addrspace(1) %a, align 32
+ store volatile i256 %a.load, ptr addrspace(1) %b, align 32
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index ec8dd0c..7553c72 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -333,28 +333,30 @@ define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) {
 define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) {
 ; SM20-LABEL: foo11(
 ; SM20: {
-; SM20-NEXT: .reg .b64 %rd<6>;
+; SM20-NEXT: .reg .b32 %r<3>;
+; SM20-NEXT: .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT: // %bb.0:
 ; SM20-NEXT: ld.param.b64 %rd1, [foo11_param_0];
 ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
 ; SM20-NEXT: ld.param.b64 %rd3, [foo11_param_1];
 ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
-; SM20-NEXT: ld.global.b64 %rd5, [%rd2];
-; SM20-NEXT: st.global.b64 [%rd4], %rd5;
+; SM20-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd2];
+; SM20-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2};
 ; SM20-NEXT: ret;
 ;
 ; SM35-LABEL: foo11(
 ; SM35: {
-; SM35-NEXT: .reg .b64 %rd<6>;
+; SM35-NEXT: .reg .b32 %r<3>;
+; SM35-NEXT: .reg .b64 %rd<5>;
 ; SM35-EMPTY:
 ; SM35-NEXT: // %bb.0:
 ; SM35-NEXT: ld.param.b64 %rd1, [foo11_param_0];
 ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
 ; SM35-NEXT: ld.param.b64 %rd3, [foo11_param_1];
 ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
-; SM35-NEXT: ld.global.nc.b64 %rd5, [%rd2];
-; SM35-NEXT: st.global.b64 [%rd4], %rd5;
+; SM35-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd2];
+; SM35-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2};
 ; SM35-NEXT: ret;
 %1 = load <2 x float>, ptr %from
 store <2 x float> %1, ptr %to
@@ -494,28 +496,30 @@ define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) {
 define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) {
 ; SM20-LABEL: foo16(
 ; SM20: {
-; SM20-NEXT: .reg .b64 %rd<7>;
+; SM20-NEXT: .reg .b32 %r<5>;
+; SM20-NEXT: .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT: // %bb.0:
 ; SM20-NEXT: ld.param.b64 %rd1, [foo16_param_0];
 ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
 ; SM20-NEXT: ld.param.b64 %rd3, [foo16_param_1];
 ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
-; SM20-NEXT: ld.global.v2.b64 {%rd5, %rd6}, [%rd2];
-; SM20-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
+; SM20-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
+; SM20-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4};
 ; SM20-NEXT: ret;
 ;
 ; SM35-LABEL: foo16(
 ; SM35: {
-; SM35-NEXT: .reg .b64 %rd<7>;
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<5>;
 ; SM35-EMPTY:
 ; SM35-NEXT: // %bb.0:
 ; SM35-NEXT: ld.param.b64 %rd1, [foo16_param_0];
 ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
 ; SM35-NEXT: ld.param.b64 %rd3, [foo16_param_1];
 ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
-; SM35-NEXT: ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2];
-; SM35-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
+; SM35-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
+; SM35-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4};
 ; SM35-NEXT: ret;
 %1 = load <4 x float>, ptr %from
 store <4 x float> %1, ptr %to
@@ -593,51 +597,51 @@ define ptx_kernel void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) {
 ; SM20-LABEL: foo19(
 ; SM20: {
 ; SM20-NEXT: .reg .pred %p<2>;
-; SM20-NEXT: .reg .b32 %r<10>;
-; SM20-NEXT: .reg .b64 %rd<8>;
+; SM20-NEXT: .reg .b32 %r<4>;
+; SM20-NEXT: .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT: // %bb.0: // %entry
-; SM20-NEXT: ld.param.b32 %r8, [foo19_param_2];
-; SM20-NEXT: ld.param.b64 %rd5, [foo19_param_0];
-; SM20-NEXT: cvta.to.global.u64 %rd7, %rd5;
-; SM20-NEXT: ld.param.b64 %rd6, [foo19_param_1];
-; SM20-NEXT: cvta.to.global.u64 %rd2, %rd6;
-; SM20-NEXT: mov.b32 %r9, 0f00000000;
+; SM20-NEXT: ld.param.b32 %r2, [foo19_param_2];
+; SM20-NEXT: ld.param.b64 %rd2, [foo19_param_0];
+; SM20-NEXT: cvta.to.global.u64 %rd4, %rd2;
+; SM20-NEXT: ld.param.b64 %rd3, [foo19_param_1];
+; SM20-NEXT: cvta.to.global.u64 %rd1, %rd3;
+; SM20-NEXT: mov.b32 %r3, 0f00000000;
 ; SM20-NEXT: $L__BB18_1: // %loop
 ; SM20-NEXT: // =>This Inner Loop Header: Depth=1
-; SM20-NEXT: ld.global.b32 %r7, [%rd7];
-; SM20-NEXT: add.rn.f32 %r9, %r7, %r9;
-; SM20-NEXT: add.s64 %rd7, %rd7, 4;
-; SM20-NEXT: add.s32 %r8, %r8, -1;
-; SM20-NEXT: setp.ne.b32 %p1, %r8, 0;
+; SM20-NEXT: ld.global.b32 %r1, [%rd4];
+; SM20-NEXT: add.rn.f32 %r3, %r1, %r3;
+; SM20-NEXT: add.s64 %rd4, %rd4, 4;
+; SM20-NEXT: add.s32 %r2, %r2, -1;
+; SM20-NEXT: setp.ne.b32 %p1, %r2, 0;
 ; SM20-NEXT: @%p1 bra $L__BB18_1;
 ; SM20-NEXT: // %bb.2: // %exit
-; SM20-NEXT: st.global.b32 [%rd2], %r9;
+; SM20-NEXT: st.global.b32 [%rd1], %r3;
 ; SM20-NEXT: ret;
 ;
 ; SM35-LABEL: foo19(
 ; SM35: {
 ; SM35-NEXT: .reg .pred %p<2>;
-; SM35-NEXT: .reg .b32 %r<10>;
-; SM35-NEXT: .reg .b64 %rd<8>;
+; SM35-NEXT: .reg .b32 %r<4>;
+; SM35-NEXT: .reg .b64 %rd<5>;
 ; SM35-EMPTY:
 ; SM35-NEXT: // %bb.0: // %entry
-; SM35-NEXT: ld.param.b32 %r8, [foo19_param_2];
-; SM35-NEXT: ld.param.b64 %rd5, [foo19_param_0];
-; SM35-NEXT: cvta.to.global.u64 %rd7, %rd5;
-; SM35-NEXT: ld.param.b64 %rd6, [foo19_param_1];
-; SM35-NEXT: cvta.to.global.u64 %rd2, %rd6;
-; SM35-NEXT: mov.b32 %r9, 0f00000000;
+; SM35-NEXT: ld.param.b32 %r2, [foo19_param_2];
+; SM35-NEXT: ld.param.b64 %rd2, [foo19_param_0];
+; SM35-NEXT: cvta.to.global.u64 %rd4, %rd2;
+; SM35-NEXT: ld.param.b64 %rd3, [foo19_param_1];
+; SM35-NEXT: cvta.to.global.u64 %rd1, %rd3;
+; SM35-NEXT: mov.b32 %r3, 0f00000000;
 ; SM35-NEXT: $L__BB18_1: // %loop
 ; SM35-NEXT: // =>This Inner Loop Header: Depth=1
-; SM35-NEXT: ld.global.nc.b32 %r7, [%rd7];
-; SM35-NEXT: add.rn.f32 %r9, %r7, %r9;
-; SM35-NEXT: add.s64 %rd7, %rd7, 4;
-; SM35-NEXT: add.s32 %r8, %r8, -1;
-; SM35-NEXT: setp.ne.b32 %p1, %r8, 0;
+; SM35-NEXT: ld.global.nc.b32 %r1, [%rd4];
+; SM35-NEXT: add.rn.f32 %r3, %r1, %r3;
+; SM35-NEXT: add.s64 %rd4, %rd4, 4;
+; SM35-NEXT: add.s32 %r2, %r2, -1;
+; SM35-NEXT: setp.ne.b32 %p1, %r2, 0;
 ; SM35-NEXT: @%p1 bra $L__BB18_1;
 ; SM35-NEXT: // %bb.2: // %exit
-; SM35-NEXT: st.global.b32 [%rd2], %r9;
+; SM35-NEXT: st.global.b32 [%rd1], %r3;
 ; SM35-NEXT: ret;
 entry:
 br label %loop
diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
index ae069cf..9dac46c 100644
--- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX32
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX64
-; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 ; Ensure we access the local stack properly
@@ -12,13 +12,13 @@ define void @foo(i32 %a) {
 ; PTX32-NEXT: .local .align 4 .b8 __local_depot0[4];
 ; PTX32-NEXT: .reg .b32 %SP;
 ; PTX32-NEXT: .reg .b32 %SPL;
-; PTX32-NEXT: .reg .b32 %r<4>;
+; PTX32-NEXT: .reg .b32 %r<3>;
 ; PTX32-EMPTY:
 ; PTX32-NEXT: // %bb.0:
 ; PTX32-NEXT: mov.b32 %SPL, __local_depot0;
 ; PTX32-NEXT: ld.param.b32 %r1, [foo_param_0];
-; PTX32-NEXT: add.u32 %r3, %SPL, 0;
-; PTX32-NEXT: st.local.b32 [%r3], %r1;
+; PTX32-NEXT: add.u32 %r2, %SPL, 0;
+; PTX32-NEXT: st.local.b32 [%r2], %r1;
 ; PTX32-NEXT: ret;
 ;
 ; PTX64-LABEL: foo(
@@ -27,13 +27,13 @@ define void @foo(i32 %a) {
 ; PTX64-NEXT: .reg .b64 %SP;
 ; PTX64-NEXT: .reg .b64 %SPL;
 ; PTX64-NEXT: .reg .b32 %r<2>;
-; PTX64-NEXT: .reg .b64 %rd<3>;
+; PTX64-NEXT: .reg .b64 %rd<2>;
 ; PTX64-EMPTY:
 ; PTX64-NEXT: // %bb.0:
 ; PTX64-NEXT: mov.b64 %SPL, __local_depot0;
 ; PTX64-NEXT: ld.param.b32 %r1, [foo_param_0];
-; PTX64-NEXT: add.u64 %rd2, %SPL, 0;
-; PTX64-NEXT: st.local.b32 [%rd2], %r1;
+; PTX64-NEXT: add.u64 %rd1, %SPL, 0;
+; PTX64-NEXT: st.local.b32 [%rd1], %r1;
 ; PTX64-NEXT: ret;
 %local = alloca i32, align 4
 store volatile i32 %a, ptr %local
@@ -97,15 +97,15 @@ define void @foo3(i32 %a) {
 ; PTX32-NEXT: .local .align 4 .b8 __local_depot2[12];
 ; PTX32-NEXT: .reg .b32 %SP;
 ; PTX32-NEXT: .reg .b32 %SPL;
-; PTX32-NEXT: .reg .b32 %r<6>;
+; PTX32-NEXT: .reg .b32 %r<5>;
 ; PTX32-EMPTY:
 ; PTX32-NEXT: // %bb.0:
 ; PTX32-NEXT: mov.b32 %SPL, __local_depot2;
 ; PTX32-NEXT: ld.param.b32 %r1, [foo3_param_0];
-; PTX32-NEXT: add.u32 %r3, %SPL, 0;
-; PTX32-NEXT: shl.b32 %r4, %r1, 2;
-; PTX32-NEXT: add.s32 %r5, %r3, %r4;
-; PTX32-NEXT: st.local.b32 [%r5], %r1;
+; PTX32-NEXT: add.u32 %r2, %SPL, 0;
+; PTX32-NEXT: shl.b32 %r3, %r1, 2;
+; PTX32-NEXT: add.s32 %r4, %r2, %r3;
+; PTX32-NEXT: st.local.b32 [%r4], %r1;
 ; PTX32-NEXT: ret;
 ;
 ; PTX64-LABEL: foo3(
@@ -119,8 +119,9 @@ define void @foo3(i32 %a) {
 ; PTX64-NEXT: // %bb.0:
 ; PTX64-NEXT: mov.b64 %SPL, __local_depot2;
 ; PTX64-NEXT: ld.param.b32 %r1, [foo3_param_0];
-; PTX64-NEXT: add.u64 %rd2, %SPL, 0;
-; PTX64-NEXT: mad.wide.s32 %rd3, %r1, 4, %rd2;
+; PTX64-NEXT: add.u64 %rd1, %SPL, 0;
+; PTX64-NEXT: mul.wide.s32 %rd2, %r1, 4;
+; PTX64-NEXT: add.s64 %rd3, %rd1, %rd2;
 ; PTX64-NEXT: st.local.b32 [%rd3], %r1;
 ; PTX64-NEXT: ret;
 %local = alloca [3 x i32], align 4
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-alignment.ll b/llvm/test/CodeGen/NVPTX/lower-args-alignment.ll
new file mode 100644
index 0000000..2051f63
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/lower-args-alignment.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=nvptx-lower-args,infer-alignment -S | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+; ------------------------------------------------------------------------------
+; Test that alignment can be inferred through llvm.nvvm.internal.addrspace.wrap.p101.p0 intrinsics
+; thanks to the alignment attribute on the intrinsic
+; ------------------------------------------------------------------------------
+
+%struct.S1 = type { i32, i32, i32, i32 }
+define ptx_kernel i32 @test_align8(ptr noundef readonly byval(%struct.S1) align 8 captures(none) %params) {
+; CHECK-LABEL: define ptx_kernel i32 @test_align8(
+; CHECK-SAME: ptr noundef readonly byval([[STRUCT_S1:%.*]]) align 8 captures(none) [[PARAMS:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call align 8 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[PARAMS]])
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(101) [[TMP0]], align 8
+; CHECK-NEXT: ret i32 [[LOAD]]
+;
+entry:
+ %load = load i32, ptr %params, align 4
+ ret i32 %load
+}
+
+define ptx_kernel i32 @test_align1(ptr noundef readonly byval(%struct.S1) align 1 captures(none) %params) {
+; CHECK-LABEL: define ptx_kernel i32 @test_align1(
+; CHECK-SAME: ptr noundef readonly byval([[STRUCT_S1:%.*]]) align 4 captures(none) [[PARAMS:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call align 1 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[PARAMS]])
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(101) [[TMP0]], align 4
+; CHECK-NEXT: ret i32 [[LOAD]]
+;
+entry:
+ %load = load i32, ptr %params, align 4
+ ret i32 %load
+}
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 045704b..01ab471 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -49,14 +49,14 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
 ; PTX-NEXT: st.param.b32 [func_retval0], %r10;
 ; PTX-NEXT: ret;
 entry:
- %a. = select i1 %b, ptr %a, ptr addrspacecast (ptr addrspace(1) @gi to ptr), !dbg !17
- %idx.ext = sext i32 %c to i64, !dbg !18
- %add.ptr = getelementptr inbounds i8, ptr %a., i64 %idx.ext, !dbg !18
- %0 = load i32, ptr %add.ptr, align 1, !dbg !19
- ret i32 %0, !dbg !23
+ %a. = select i1 %b, ptr %a, ptr addrspacecast (ptr addrspace(1) @gi to ptr)
+ %idx.ext = sext i32 %c to i64
+ %add.ptr = getelementptr inbounds i8, ptr %a., i64 %idx.ext
+ %0 = load i32, ptr %add.ptr, align 1
+ ret i32 %0
 }
 
-define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
+define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 "nvvm.grid_constant" %input1, i32 %input2, ptr %out, i32 %n) {
 ; PTX-LABEL: grid_const_int(
 ; PTX: {
 ; PTX-NEXT: .reg .b32 %r<4>;
@@ -71,8 +71,8 @@ define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %inpu
 ; PTX-NEXT: st.global.b32 [%rd2], %r3;
 ; PTX-NEXT: ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_int(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT11:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; OPT-SAME: ptr byval(i32) align 4 "nvvm.grid_constant" [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INPUT11:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
 ; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
 ; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4
@@ -85,7 +85,7 @@ define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %inpu
 
 %struct.s = type { i32, i32 }
 
-define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
+define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input, ptr %out){
 ; PTX-LABEL: grid_const_struct(
 ; PTX: {
 ; PTX-NEXT: .reg .b32 %r<4>;
@@ -100,8 +100,8 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p
 ; PTX-NEXT: st.global.b32 [%rd2], %r3;
 ; PTX-NEXT: ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_struct(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INPUT1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
 ; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
 ; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
 ; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
@@ -118,7 +118,7 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p
 ret void
 }
 
-define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
+define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input) {
 ; PTX-LABEL: grid_const_escape(
 ; PTX: {
 ; PTX-NEXT: .reg .b64 %rd<4>;
@@ -136,8 +136,8 @@ define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
 ; PTX-NEXT: } // callseq 0
 ; PTX-NEXT: ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
 ; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
 ; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
 ; OPT-NEXT: ret void
@@ -145,7 +145,7 @@ define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
 ret void
 }
 
-define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
+define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input, i32 %a, ptr byval(i32) align 4 "nvvm.grid_constant" %b) {
 ; PTX-LABEL: multiple_grid_const_escape(
 ; PTX: {
 ; PTX-NEXT: .local .align 4 .b8 __local_depot4[4];
@@ -179,10 +179,10 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
 ; PTX-NEXT: } // callseq 1
 ; PTX-NEXT: ret;
 ; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[B]])
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 "nvvm.grid_constant" [[B:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[B]])
 ; OPT-NEXT: [[B_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
-; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
 ; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
 ; OPT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
 ; OPT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
@@ -194,7 +194,7 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
 ret void
 }
 
-define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
+define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input, ptr %addr) {
 ; PTX-LABEL: grid_const_memory_escape(
 ; PTX: {
 ; PTX-NEXT: .reg .b64 %rd<5>;
@@ -207,8 +207,8 @@ define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %i
 ; PTX-NEXT: st.global.b64 [%rd3], %rd4;
 ; PTX-NEXT: ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
 ; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
 ; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR]], align 8
 ; OPT-NEXT: ret void
@@ -216,7 +216,7 @@ define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %i
 ret void
 }
 
-define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
+define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input, ptr %result) {
 ; PTX-LABEL: grid_const_inlineasm_escape(
 ; PTX: {
 ; PTX-NEXT: .reg .b64 %rd<7>;
@@ -234,8 +234,8 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4
 ; PTX-NEXT: ret;
 ; PTX-NOT .local
 ; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
 ; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
 ; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
 ; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
@@ -249,7 +249,7 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4
 ret void
 }
 
-define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
+define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) "nvvm.grid_constant" %input, ptr %output) {
 ; PTX-LABEL: grid_const_partial_escape(
 ; PTX: {
 ; PTX-NEXT: .reg .b32 %r<3>;
@@ -273,7 +273,7 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou
 ; PTX-NEXT: } // callseq 2
 ; PTX-NEXT: ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape(
-; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
+; OPT-SAME: ptr byval(i32) "nvvm.grid_constant" [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
 ; OPT-NEXT: [[INPUT1_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
 ; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4
@@ -288,7 +288,7 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou
 ret void
 }
 
-define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
+define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) "nvvm.grid_constant" %input, ptr %output) {
"nvvm.grid_constant" %input, ptr %output) { ; PTX-LABEL: grid_const_partial_escapemem( ; PTX: { ; PTX-NEXT: .reg .b32 %r<4>; @@ -314,7 +314,7 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ; PTX-NEXT: st.param.b32 [func_retval0], %r3; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) "nvvm.grid_constant" [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) ; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 @@ -335,29 +335,29 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ret i32 %add } -define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) { +define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input1, ptr %inout) { ; PTX-LABEL: grid_const_phi( ; PTX: { ; PTX-NEXT: .reg .pred %p<2>; ; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<7>; +; PTX-NEXT: .reg .b64 %rd<4>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: mov.b64 %rd6, grid_const_phi_param_0; -; PTX-NEXT: ld.param.b64 %rd5, [grid_const_phi_param_1]; -; PTX-NEXT: cvta.to.global.u64 %rd1, %rd5; +; PTX-NEXT: mov.b64 %rd3, grid_const_phi_param_0; +; PTX-NEXT: ld.param.b64 %rd2, [grid_const_phi_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd1, %rd2; ; PTX-NEXT: ld.global.b32 %r1, [%rd1]; ; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; ; PTX-NEXT: @%p1 bra $L__BB9_2; ; PTX-NEXT: // %bb.1: // %second -; PTX-NEXT: add.s64 %rd6, %rd6, 4; +; PTX-NEXT: add.s64 %rd3, %rd3, 4; ; PTX-NEXT: $L__BB9_2: // %merge -; PTX-NEXT: ld.param.b32 %r2, [%rd6]; +; PTX-NEXT: ld.param.b32 %r2, [%rd3]; ; PTX-NEXT: st.global.b32 [%rd1], %r2; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_phi( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) ; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4 ; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 @@ -391,32 +391,32 @@ merge: } ; NOTE: %input2 is *not* grid_constant -define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) { +define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input1, ptr byval(%struct.s) %input2, ptr %inout) { ; PTX-LABEL: grid_const_phi_ngc( ; PTX: { ; PTX-NEXT: .reg .pred %p<2>; ; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<8>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: mov.b64 %rd7, grid_const_phi_ngc_param_0; -; PTX-NEXT: ld.param.b64 %rd6, [grid_const_phi_ngc_param_2]; -; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6; +; PTX-NEXT: mov.b64 %rd4, grid_const_phi_ngc_param_0; +; PTX-NEXT: ld.param.b64 %rd3, [grid_const_phi_ngc_param_2]; +; 
 ; PTX-NEXT: ld.global.b32 %r1, [%rd1];
 ; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
 ; PTX-NEXT: @%p1 bra $L__BB10_2;
 ; PTX-NEXT: // %bb.1: // %second
 ; PTX-NEXT: mov.b64 %rd2, grid_const_phi_ngc_param_1;
-; PTX-NEXT: add.s64 %rd7, %rd2, 4;
+; PTX-NEXT: add.s64 %rd4, %rd2, 4;
 ; PTX-NEXT: $L__BB10_2: // %merge
-; PTX-NEXT: ld.param.b32 %r2, [%rd7];
+; PTX-NEXT: ld.param.b32 %r2, [%rd4];
 ; PTX-NEXT: st.global.b32 [%rd1], %r2;
 ; PTX-NEXT: ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
-; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT: [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
 ; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
 ; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
@@ -449,7 +449,7 @@ merge:
 }
 
 ; NOTE: %input2 is *not* grid_constant
-define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
+define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 "nvvm.grid_constant" %input1, ptr byval(i32) %input2, ptr %inout) {
 ; PTX-LABEL: grid_const_select(
 ; PTX: {
 ; PTX-NEXT: .reg .pred %p<2>;
@@ -468,10 +468,10 @@ define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr by
 ; PTX-NEXT: st.global.b32 [%rd3], %r2;
 ; PTX-NEXT: ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_select(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
+; OPT-SAME: ptr byval(i32) align 4 "nvvm.grid_constant" [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
-; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT: [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
 ; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
 ; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
@@ -487,7 +487,7 @@ define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr by
 ret void
 }
 
-define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
+define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) "nvvm.grid_constant" %input) {
 ; PTX-LABEL: grid_const_ptrtoint(
 ; PTX: {
 ; PTX-NEXT: .reg .b32 %r<4>;
@@ -502,7 +502,7 @@ define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
 ; PTX-NEXT: st.param.b32 [func_retval0], %r3;
 ; PTX-NEXT: ret;
 ; OPT-LABEL: define ptx_kernel i32 @grid_const_ptrtoint(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
+; OPT-SAME: ptr byval(i32) align 4 "nvvm.grid_constant" [[INPUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT: [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
 ; OPT-NEXT: [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4
 ; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[INPUT2]] to ptr
@@ -517,10 +517,10 @@ define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
 
 declare void @device_func(ptr byval(i32) align 4)
 
-define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
+define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 "nvvm.grid_constant" %input) {
 ; OPT-LABEL: define ptx_kernel void @test_forward_byval_arg(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT_PARAM:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-SAME: ptr byval(i32) align 4 "nvvm.grid_constant" [[INPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
 ; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[INPUT_PARAM]] to ptr
 ; OPT-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT_PARAM_GEN]])
 ; OPT-NEXT: ret void
@@ -545,45 +545,3 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
 declare dso_local void @dummy() local_unnamed_addr
 declare dso_local ptr @escape(ptr) local_unnamed_addr
 declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
-
-!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24}
-
-!0 = !{ptr @grid_const_int, !"grid_constant", !1}
-!1 = !{i32 1}
-
-!2 = !{ptr @grid_const_struct, !"grid_constant", !3}
-!3 = !{i32 1}
-
-!4 = !{ptr @grid_const_escape, !"grid_constant", !5}
-!5 = !{i32 1}
-
-!6 = !{ptr @multiple_grid_const_escape, !"grid_constant", !7}
-!7 = !{i32 1, i32 3}
-
-!8 = !{ptr @grid_const_memory_escape, !"grid_constant", !9}
-!9 = !{i32 1}
-
-!10 = !{ptr @grid_const_inlineasm_escape, !"grid_constant", !11}
-!11 = !{i32 1}
-
-!12 = !{ptr @grid_const_partial_escape, !"grid_constant", !13}
-!13 = !{i32 1}
-
-!14 = !{ptr @grid_const_partial_escapemem, !"grid_constant", !15}
-!15 = !{i32 1}
-
-!16 = !{ptr @grid_const_phi, !"grid_constant", !17}
-!17 = !{i32 1}
-
-!18 = !{ptr @grid_const_phi_ngc, !"grid_constant", !19}
-!19 = !{i32 1}
-
-!20 = !{ptr @grid_const_select, !"grid_constant", !21}
-!21 = !{i32 1}
-
-!22 = !{ptr @grid_const_ptrtoint, !"grid_constant", !23}
-!23 = !{i32 1}
-
-!24 = !{ptr @test_forward_byval_arg, !"grid_constant", !25}
-!25 = !{i32 1}
-
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index 7c029ab..b4a5103 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -200,7 +200,7 @@ define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
 define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
 ; IRC-LABEL: define ptx_kernel void @ptr_as_int_aggr(
 ; IRC-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
-; IRC-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; IRC-NEXT: [[S3:%.*]] = call align 8 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; IRC-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S3]], align 8
 ; IRC-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
 ; IRC-NEXT: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
@@ -210,7 +210,7 @@ define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%st
 ;
 ; IRO-LABEL: define ptx_kernel void @ptr_as_int_aggr(
 ; IRO-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
-; IRO-NEXT: [[S1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; IRO-NEXT: [[S1:%.*]] = call align 8 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; IRO-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S1]], align 8
 ; IRO-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
 ; IRO-NEXT: store i32 [[V]], ptr [[P]], align 4
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 20a3519..21257e2 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -32,7 +32,7 @@ define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out
 ; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only(
 ; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; LOWER-ARGS-NEXT: [[S3:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
 ; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4
 ; LOWER-ARGS-NEXT: ret void
@@ -66,7 +66,7 @@ define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly
 ; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep(
 ; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; LOWER-ARGS-NEXT: [[S3:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
 ; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
 ; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4
@@ -128,7 +128,7 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out
 ; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; COMMON-NEXT: [[ENTRY:.*:]]
 ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
 ; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR6:[0-9]+]]
 ; COMMON-NEXT: ret void
@@ -167,7 +167,7 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone
 ; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COMMON-NEXT: [[ENTRY:.*:]]
 ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
 ; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
 ; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR6]]
@@ -209,7 +209,7 @@ define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeon
 ; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COMMON-NEXT: [[ENTRY:.*:]]
 ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
 ; COMMON-NEXT: store ptr [[S1]], ptr [[OUT]], align 8
 ; COMMON-NEXT: ret void
@@ -246,7 +246,7 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri
 ; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COMMON-NEXT: [[ENTRY:.*:]]
 ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
 ; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
 ; COMMON-NEXT: store ptr [[B]], ptr [[OUT]], align 8
@@ -286,7 +286,7 @@ define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonl
 ; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COMMON-NEXT: [[ENTRY:.*:]]
 ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
 ; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S1]] to i64
 ; COMMON-NEXT: store i64 [[I]], ptr [[OUT]], align 8
@@ -324,7 +324,7 @@ define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeo
 ; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
 ; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; LOWER-ARGS-NEXT: [[S3:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true)
 ; LOWER-ARGS-NEXT: ret void
 ;
@@ -445,7 +445,7 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly
 ; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COMMON-NEXT: [[ENTRY:.*:]]
 ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
 ; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true)
 ; COMMON-NEXT: ret void
@@ -456,63 +456,63 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly
 ; PTX-NEXT: .reg .b64 %SP;
 ; PTX-NEXT: .reg .b64 %SPL;
 ; PTX-NEXT: .reg .b32 %r<3>;
-; PTX-NEXT: .reg .b64 %rd<48>;
+; PTX-NEXT: .reg .b64 %rd<47>;
 ; PTX-EMPTY:
 ; PTX-NEXT: // %bb.0: // %entry
 ; PTX-NEXT: mov.b64 %SPL, __local_depot9;
 ; PTX-NEXT: cvta.local.u64 %SP, %SPL;
 ; PTX-NEXT: ld.param.b64 %rd1, [memcpy_to_param_param_0];
-; PTX-NEXT: add.u64 %rd3, %SPL, 0;
+; PTX-NEXT: add.u64 %rd2, %SPL, 0;
 ; PTX-NEXT: ld.param.b32 %r1, [memcpy_to_param_param_1+4];
-; PTX-NEXT: st.local.b32 [%rd3+4], %r1;
+; PTX-NEXT: st.local.b32 [%rd2+4], %r1;
 ; PTX-NEXT: ld.param.b32 %r2, [memcpy_to_param_param_1];
-; PTX-NEXT: st.local.b32 [%rd3], %r2;
-; PTX-NEXT: ld.volatile.b8 %rd4, [%rd1];
-; PTX-NEXT: ld.volatile.b8 %rd5, [%rd1+1];
-; PTX-NEXT: shl.b64 %rd6, %rd5, 8;
-; PTX-NEXT: or.b64 %rd7, %rd6, %rd4;
-; PTX-NEXT: ld.volatile.b8 %rd8, [%rd1+2];
-; PTX-NEXT: shl.b64 %rd9, %rd8, 16;
-; PTX-NEXT: ld.volatile.b8 %rd10, [%rd1+3];
-; PTX-NEXT: shl.b64 %rd11, %rd10, 24;
-; PTX-NEXT: or.b64 %rd12, %rd11, %rd9;
-; PTX-NEXT: or.b64 %rd13, %rd12, %rd7;
-; PTX-NEXT: ld.volatile.b8 %rd14, [%rd1+4];
-; PTX-NEXT: ld.volatile.b8 %rd15, [%rd1+5];
-; PTX-NEXT: shl.b64 %rd16, %rd15, 8;
-; PTX-NEXT: or.b64 %rd17, %rd16, %rd14;
-; PTX-NEXT: ld.volatile.b8 %rd18, [%rd1+6];
-; PTX-NEXT: shl.b64 %rd19, %rd18, 16;
-; PTX-NEXT: ld.volatile.b8 %rd20, [%rd1+7];
-; PTX-NEXT: shl.b64 %rd21, %rd20, 24;
-; PTX-NEXT: or.b64 %rd22, %rd21, %rd19;
-; PTX-NEXT: or.b64 %rd23, %rd22, %rd17;
-; PTX-NEXT: shl.b64 %rd24, %rd23, 32;
-; PTX-NEXT: or.b64 %rd25, %rd24, %rd13;
-; PTX-NEXT: st.volatile.b64 [%SP], %rd25;
-; PTX-NEXT: ld.volatile.b8 %rd26, [%rd1+8];
-; PTX-NEXT: ld.volatile.b8 %rd27, [%rd1+9];
-; PTX-NEXT: shl.b64 %rd28, %rd27, 8;
-; PTX-NEXT: or.b64 %rd29, %rd28, %rd26;
-; PTX-NEXT: ld.volatile.b8 %rd30, [%rd1+10];
-; PTX-NEXT: shl.b64 %rd31, %rd30, 16;
-; PTX-NEXT: ld.volatile.b8 %rd32, [%rd1+11];
-; PTX-NEXT: shl.b64 %rd33, %rd32, 24;
-; PTX-NEXT: or.b64 %rd34, %rd33, %rd31;
-; PTX-NEXT: or.b64 %rd35, %rd34, %rd29;
-; PTX-NEXT: ld.volatile.b8 %rd36, [%rd1+12];
-; PTX-NEXT: ld.volatile.b8 %rd37, [%rd1+13];
-; PTX-NEXT: shl.b64 %rd38, %rd37, 8;
-; PTX-NEXT: or.b64 %rd39, %rd38, %rd36;
-; PTX-NEXT: ld.volatile.b8 %rd40, [%rd1+14];
-; PTX-NEXT: shl.b64 %rd41, %rd40, 16;
-; PTX-NEXT: ld.volatile.b8 %rd42, [%rd1+15];
-; PTX-NEXT: shl.b64 %rd43, %rd42, 24;
-; PTX-NEXT: or.b64 %rd44, %rd43, %rd41;
-; PTX-NEXT: or.b64 %rd45, %rd44, %rd39;
-; PTX-NEXT: shl.b64 %rd46, %rd45, 32;
-; PTX-NEXT: or.b64 %rd47, %rd46, %rd35;
-; PTX-NEXT: st.volatile.b64 [%SP+8], %rd47;
+; PTX-NEXT: st.local.b32 [%rd2], %r2;
+; PTX-NEXT: ld.volatile.b8 %rd3, [%rd1];
+; PTX-NEXT: ld.volatile.b8 %rd4, [%rd1+1];
+; PTX-NEXT: shl.b64 %rd5, %rd4, 8;
+; PTX-NEXT: or.b64 %rd6, %rd5, %rd3;
+; PTX-NEXT: ld.volatile.b8 %rd7, [%rd1+2];
+; PTX-NEXT: shl.b64 %rd8, %rd7, 16;
+; PTX-NEXT: ld.volatile.b8 %rd9, [%rd1+3];
+; PTX-NEXT: shl.b64 %rd10, %rd9, 24;
+; PTX-NEXT: or.b64 %rd11, %rd10, %rd8;
+; PTX-NEXT: or.b64 %rd12, %rd11, %rd6;
+; PTX-NEXT: ld.volatile.b8 %rd13, [%rd1+4];
+; PTX-NEXT: ld.volatile.b8 %rd14, [%rd1+5];
+; PTX-NEXT: shl.b64 %rd15, %rd14, 8;
+; PTX-NEXT: or.b64 %rd16, %rd15, %rd13;
+; PTX-NEXT: ld.volatile.b8 %rd17, [%rd1+6];
+; PTX-NEXT: shl.b64 %rd18, %rd17, 16;
+; PTX-NEXT: ld.volatile.b8 %rd19, [%rd1+7];
+; PTX-NEXT: shl.b64 %rd20, %rd19, 24;
+; PTX-NEXT: or.b64 %rd21, %rd20, %rd18;
+; PTX-NEXT: or.b64 %rd22, %rd21, %rd16;
+; PTX-NEXT: shl.b64 %rd23, %rd22, 32;
+; PTX-NEXT: or.b64 %rd24, %rd23, %rd12;
+; PTX-NEXT: st.volatile.b64 [%SP], %rd24;
+; PTX-NEXT: ld.volatile.b8 %rd25, [%rd1+8];
+; PTX-NEXT: ld.volatile.b8 %rd26, [%rd1+9];
+; PTX-NEXT: shl.b64 %rd27, %rd26, 8;
+; PTX-NEXT: or.b64 %rd28, %rd27, %rd25;
+; PTX-NEXT: ld.volatile.b8 %rd29, [%rd1+10];
+; PTX-NEXT: shl.b64 %rd30, %rd29, 16;
+; PTX-NEXT: ld.volatile.b8 %rd31, [%rd1+11];
+; PTX-NEXT: shl.b64 %rd32, %rd31, 24;
+; PTX-NEXT: or.b64 %rd33, %rd32, %rd30;
+; PTX-NEXT: or.b64 %rd34, %rd33, %rd28;
+; PTX-NEXT: ld.volatile.b8 %rd35, [%rd1+12];
+; PTX-NEXT: ld.volatile.b8 %rd36, [%rd1+13];
+; PTX-NEXT: shl.b64 %rd37, %rd36, 8;
+; PTX-NEXT: or.b64 %rd38, %rd37, %rd35;
+; PTX-NEXT: ld.volatile.b8 %rd39, [%rd1+14];
+; PTX-NEXT: shl.b64 %rd40, %rd39, 16;
+; PTX-NEXT: ld.volatile.b8 %rd41, [%rd1+15];
+; PTX-NEXT: shl.b64 %rd42, %rd41, 24;
+; PTX-NEXT: or.b64 %rd43, %rd42, %rd40;
+; PTX-NEXT: or.b64 %rd44, %rd43, %rd38;
+; PTX-NEXT: shl.b64 %rd45, %rd44, 32;
+; PTX-NEXT: or.b64 %rd46, %rd45, %rd34;
+; PTX-NEXT: st.volatile.b64 [%SP+8], %rd46;
 ; PTX-NEXT: ret;
 entry:
 tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true)
@@ -525,7 +525,7 @@ define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly %
 ; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COMMON-NEXT: [[BB:.*:]]
 ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
 ; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN]], align 4
 ; COMMON-NEXT: store i32 [[I]], ptr [[S1]], align 4
@@ -551,7 +551,7 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
 ; SM_60-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false)
 ; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; SM_60-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; SM_60-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
 ; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
 ; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@@ -563,7 +563,7 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
 ; SM_70-NEXT: [[BB:.*:]]
 ; SM_70-NEXT: [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr
-; SM_70-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; SM_70-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
 ; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
 ; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@@ -577,7 +577,7 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
 ; COPY-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
 ; COPY-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; COPY-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; COPY-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
 ; COPY-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
 ; COPY-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@@ -637,7 +637,7 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by
 ; COMMON-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
 ; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; COMMON-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; COMMON-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
 ; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
 ; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4
@@ -651,7 +651,7 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by
 ; PTX-NEXT: .reg .pred %p<2>;
 ; PTX-NEXT: .reg .b16 %rs<3>;
 ; PTX-NEXT: .reg .b32 %r<3>;
-; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-NEXT: .reg .b64 %rd<4>;
 ; PTX-EMPTY:
 ; PTX-NEXT: // %bb.0: // %bb
 ; PTX-NEXT: mov.b64 %SPL, __local_depot12;
@@ -663,10 +663,10 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by
 ; PTX-NEXT: st.b32 [%SP], %r1;
 ; PTX-NEXT: ld.param.b32 %r2, [test_select_write_param_0];
 ; PTX-NEXT: st.b32 [%SP+4], %r2;
-; PTX-NEXT: add.u64 %rd2, %SPL, 4;
-; PTX-NEXT: add.u64 %rd4, %SPL, 0;
-; PTX-NEXT: selp.b64 %rd5, %rd2, %rd4, %p1;
-; PTX-NEXT: st.local.b32 [%rd5], 1;
+; PTX-NEXT: add.u64 %rd1, %SPL, 4;
+; PTX-NEXT: add.u64 %rd2, %SPL, 0;
+; PTX-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1;
+; PTX-NEXT: st.local.b32 [%rd3], 1;
 ; PTX-NEXT: ret;
 bb:
 %ptrnew = select i1 %cond, ptr %input1, ptr %input2
@@ -682,7 +682,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
 ; SM_60-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
 ; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; SM_60-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
 ; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
 ; SM_60: [[FIRST]]:
@@ -702,7 +702,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
 ; SM_70-NEXT: [[BB:.*:]]
 ; SM_70-NEXT: [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr
-; SM_70-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; SM_70-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
 ; SM_70-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
 ; SM_70: [[FIRST]]:
@@ -724,7 +724,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
 ; COPY-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT23]], ptr addrspace(101) align 8 [[INPUT24]], i64 8, i1 false)
 ; COPY-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; COPY-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
 ; COPY-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
 ; COPY: [[FIRST]]:
@@ -743,7 +743,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
 ; PTX_60: {
 ; PTX_60-NEXT: .reg .pred %p<2>;
 ; PTX_60-NEXT: .reg .b16 %rs<3>;
-; PTX_60-NEXT: .reg .b32 %r<5>;
+; PTX_60-NEXT: .reg .b32 %r<2>;
 ; PTX_60-NEXT: .reg .b64 %rd<3>;
 ; PTX_60-EMPTY:
 ; PTX_60-NEXT: // %bb.0: // %bb
@@ -752,12 +752,12 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
 ; PTX_60-NEXT: setp.ne.b16 %p1, %rs2, 0;
 ; PTX_60-NEXT: ld.param.b64 %rd2, [test_phi_param_2];
[test_phi_param_2]; ; PTX_60-NEXT: cvta.to.global.u64 %rd1, %rd2; -; PTX_60-NEXT: ld.param.b32 %r4, [test_phi_param_0]; +; PTX_60-NEXT: ld.param.b32 %r1, [test_phi_param_0]; ; PTX_60-NEXT: @%p1 bra $L__BB13_2; ; PTX_60-NEXT: // %bb.1: // %second -; PTX_60-NEXT: ld.param.b32 %r4, [test_phi_param_1+4]; +; PTX_60-NEXT: ld.param.b32 %r1, [test_phi_param_1+4]; ; PTX_60-NEXT: $L__BB13_2: // %merge -; PTX_60-NEXT: st.global.b32 [%rd1], %r4; +; PTX_60-NEXT: st.global.b32 [%rd1], %r1; ; PTX_60-NEXT: ret; ; ; PTX_70-LABEL: test_phi( @@ -765,21 +765,21 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; PTX_70-NEXT: .reg .pred %p<2>; ; PTX_70-NEXT: .reg .b16 %rs<3>; ; PTX_70-NEXT: .reg .b32 %r<2>; -; PTX_70-NEXT: .reg .b64 %rd<8>; +; PTX_70-NEXT: .reg .b64 %rd<5>; ; PTX_70-EMPTY: ; PTX_70-NEXT: // %bb.0: // %bb ; PTX_70-NEXT: ld.param.b8 %rs1, [test_phi_param_3]; ; PTX_70-NEXT: and.b16 %rs2, %rs1, 1; ; PTX_70-NEXT: setp.ne.b16 %p1, %rs2, 0; -; PTX_70-NEXT: mov.b64 %rd7, test_phi_param_0; -; PTX_70-NEXT: ld.param.b64 %rd6, [test_phi_param_2]; -; PTX_70-NEXT: cvta.to.global.u64 %rd1, %rd6; +; PTX_70-NEXT: mov.b64 %rd4, test_phi_param_0; +; PTX_70-NEXT: ld.param.b64 %rd3, [test_phi_param_2]; +; PTX_70-NEXT: cvta.to.global.u64 %rd1, %rd3; ; PTX_70-NEXT: @%p1 bra $L__BB13_2; ; PTX_70-NEXT: // %bb.1: // %second ; PTX_70-NEXT: mov.b64 %rd2, test_phi_param_1; -; PTX_70-NEXT: add.s64 %rd7, %rd2, 4; +; PTX_70-NEXT: add.s64 %rd4, %rd2, 4; ; PTX_70-NEXT: $L__BB13_2: // %merge -; PTX_70-NEXT: ld.param.b32 %r1, [%rd7]; +; PTX_70-NEXT: ld.param.b32 %r1, [%rd4]; ; PTX_70-NEXT: st.global.b32 [%rd1], %r1; ; PTX_70-NEXT: ret; bb: @@ -808,7 +808,7 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr ; COMMON-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]]) ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false) ; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 -; COMMON-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) +; COMMON-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false) ; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; COMMON: [[FIRST]]: @@ -830,7 +830,7 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr ; PTX-NEXT: .reg .pred %p<2>; ; PTX-NEXT: .reg .b16 %rs<3>; ; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<7>; +; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %bb ; PTX-NEXT: mov.b64 %SPL, __local_depot14; @@ -841,14 +841,14 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr ; PTX-NEXT: add.u64 %rd1, %SPL, 0; ; PTX-NEXT: ld.param.b32 %r1, [test_phi_write_param_1+4]; ; PTX-NEXT: st.b32 [%SP], %r1; -; PTX-NEXT: add.u64 %rd6, %SPL, 4; +; PTX-NEXT: add.u64 %rd2, %SPL, 4; ; PTX-NEXT: ld.param.b32 %r2, [test_phi_write_param_0]; ; PTX-NEXT: st.b32 [%SP+4], %r2; ; PTX-NEXT: @%p1 bra $L__BB14_2; ; PTX-NEXT: // %bb.1: // %second -; PTX-NEXT: mov.b64 %rd6, %rd1; +; PTX-NEXT: mov.b64 %rd2, %rd1; ; PTX-NEXT: $L__BB14_2: // %merge -; PTX-NEXT: st.local.b32 [%rd6], 1; +; PTX-NEXT: st.local.b32 [%rd2], 1; ; PTX-NEXT: ret; bb: br i1 %cond, label %first, 
label %second @@ -871,7 +871,7 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) { ; COMMON-LABEL: define ptx_kernel void @test_forward_byval_arg( ; COMMON-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] { ; COMMON-NEXT: [[INPUT1:%.*]] = alloca i32, align 4 -; COMMON-NEXT: [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) +; COMMON-NEXT: [[INPUT2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT1]], ptr addrspace(101) align 4 [[INPUT2]], i64 4, i1 false) ; COMMON-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT1]]) ; COMMON-NEXT: ret void @@ -882,13 +882,13 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) { ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<2>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<2>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %SPL, __local_depot15; -; PTX-NEXT: add.u64 %rd2, %SPL, 0; +; PTX-NEXT: add.u64 %rd1, %SPL, 0; ; PTX-NEXT: ld.param.b32 %r1, [test_forward_byval_arg_param_0]; -; PTX-NEXT: st.local.b32 [%rd2], %r1; +; PTX-NEXT: st.local.b32 [%rd1], %r1; ; PTX-NEXT: { // callseq 2, 0 ; PTX-NEXT: .param .align 4 .b8 param0[4]; ; PTX-NEXT: st.param.b32 [param0], %r1; @@ -908,7 +908,6 @@ define void @device_func(ptr byval(i32) align 4 %input) { ; PTX-LABEL: device_func( ; PTX: { ; PTX-NEXT: .reg .b32 %r<2>; -; PTX-NEXT: .reg .b64 %rd<2>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: { // callseq 3, 0 diff --git a/llvm/test/CodeGen/NVPTX/managed.ll b/llvm/test/CodeGen/NVPTX/managed.ll index 0b94843..931c17d 100644 --- a/llvm/test/CodeGen/NVPTX/managed.ll +++ b/llvm/test/CodeGen/NVPTX/managed.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx40 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx40 | %ptxas-verify %} +; RUN: %if ptxas-isa-4.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx40 | %ptxas-verify %} ; RUN: not --crash llc < %s -mtriple=nvptx64 -mcpu=sm_20 2>&1 | FileCheck %s --check-prefix ERROR ; ERROR: LLVM ERROR: .attribute(.managed) requires PTX version >= 4.0 and sm_30 diff --git a/llvm/test/CodeGen/NVPTX/match.ll b/llvm/test/CodeGen/NVPTX/match.ll index ae01b0d..0b459a1 100644 --- a/llvm/test/CodeGen/NVPTX/match.ll +++ b/llvm/test/CodeGen/NVPTX/match.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %} declare i32 @llvm.nvvm.match.any.sync.i32(i32, i32) declare i32 @llvm.nvvm.match.any.sync.i64(i32, i64) diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll index 236bf67..ff0cf3e 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | %ptxas-verify -arch=sm_53 %} +; RUN: %if ptxas-sm_53 && ptxas-isa-4.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | %ptxas-verify -arch=sm_53 %} declare half @llvm.nvvm.fma.rn.f16(half, half, 
half) declare half @llvm.nvvm.fma.rn.ftz.f16(half, half, half) diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll index c04fd07..7b5bfed 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare bfloat @llvm.nvvm.abs.bf16(bfloat) declare <2 x bfloat> @llvm.nvvm.abs.bf16x2(<2 x bfloat>) diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll index 79b7f42..fe2cb16 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare bfloat @llvm.nvvm.abs.bf16(bfloat) declare <2 x bfloat> @llvm.nvvm.abs.bf16x2(<2 x bfloat>) diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72-autoupgrade.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72-autoupgrade.ll index 5d9b8fe3d..0ebbd13 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72-autoupgrade.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72-autoupgrade.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | FileCheck %s -; RUN: %if ptxas-11.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | %ptxas-verify -arch=sm_86 %} +; RUN: %if ptxas-sm_86 && ptxas-isa-7.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | %ptxas-verify -arch=sm_86 %} ; CHECK-LABEL: fmin_xorsign_abs_f16 define half @fmin_xorsign_abs_f16(half %0, half %1) { diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll index 2ca9d07..0e3ac82 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | FileCheck %s -; RUN: %if ptxas-11.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | %ptxas-verify -arch=sm_86 %} +; RUN: %if ptxas-sm_86 && ptxas-isa-7.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | %ptxas-verify -arch=sm_86 %} declare half @llvm.nvvm.fmin.xorsign.abs.f16(half, half) declare half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half, half) diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll index e9635e9..5a55fa9 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll @@ -3,8 +3,8 @@ ; RUN: llc < %s -mcpu=sm_80 -mattr +ptx70 | FileCheck %s --check-prefixes=CHECK,CHECK-F16 ; RUN: llc < %s -mcpu=sm_80 -mattr +ptx70 --nvptx-no-f16-math | FileCheck %s --check-prefixes=CHECK,CHECK-SM80-NOF16 ; RUN: %if ptxas %{ llc < %s | %ptxas-verify %} -; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} -; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 --nvptx-no-f16-math | 
%ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 %{ llc < %s -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 %{ llc < %s -mcpu=sm_80 --nvptx-no-f16-math | %ptxas-verify -arch=sm_80 %} target triple = "nvptx64-nvidia-cuda" @@ -42,6 +42,14 @@ declare half @llvm.maximum.f16(half, half) #0 declare float @llvm.maximum.f32(float, float) #0 declare double @llvm.maximum.f64(double, double) #0 declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>) #0 +declare half @llvm.minimumnum.f16(half, half) #0 +declare float @llvm.minimumnum.f32(float, float) #0 +declare double @llvm.minimumnum.f64(double, double) #0 +declare <2 x half> @llvm.minimumnum.v2f16(<2 x half>, <2 x half>) #0 +declare half @llvm.maximumnum.f16(half, half) #0 +declare float @llvm.maximumnum.f32(float, float) #0 +declare double @llvm.maximumnum.f64(double, double) #0 +declare <2 x half> @llvm.maximumnum.v2f16(<2 x half>, <2 x half>) #0 declare float @llvm.fma.f32(float, float, float) #0 declare double @llvm.fma.f64(double, double, double) #0 @@ -1486,6 +1494,410 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) { ret <2 x half> %x } +; ---- minimumnum ---- + +define half @minimumnum_half(half %a, half %b) { +; CHECK-NOF16-LABEL: minimumnum_half( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<4>; +; CHECK-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [minimumnum_half_param_0]; +; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [minimumnum_half_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-NOF16-NEXT: min.f32 %r3, %r2, %r1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; +; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NOF16-NEXT: ret; +; +; CHECK-F16-LABEL: minimumnum_half( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b16 %rs<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b16 %rs1, [minimumnum_half_param_0]; +; CHECK-F16-NEXT: ld.param.b16 %rs2, [minimumnum_half_param_1]; +; CHECK-F16-NEXT: min.f16 %rs3, %rs1, %rs2; +; CHECK-F16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-F16-NEXT: ret; +; +; CHECK-SM80-NOF16-LABEL: minimumnum_half( +; CHECK-SM80-NOF16: { +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<4>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-SM80-NOF16-EMPTY: +; CHECK-SM80-NOF16-NEXT: // %bb.0: +; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [minimumnum_half_param_0]; +; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [minimumnum_half_param_1]; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-SM80-NOF16-NEXT: min.f32 %r3, %r2, %r1; +; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; +; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-SM80-NOF16-NEXT: ret; + %x = call half @llvm.minimumnum.f16(half %a, half %b) + ret half %x +} + +define float @minimumnum_float(float %a, float %b) { +; CHECK-LABEL: minimumnum_float( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [minimumnum_float_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [minimumnum_float_param_1]; +; CHECK-NEXT: min.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %x = call float @llvm.minimumnum.f32(float %a, float %b) + ret float %x +} + +define float @minimumnum_float_ftz(float %a, float %b) #1 { +; CHECK-LABEL: minimumnum_float_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 
%r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [minimumnum_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [minimumnum_float_ftz_param_1]; +; CHECK-NEXT: min.ftz.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %x = call float @llvm.minimumnum.f32(float %a, float %b) + ret float %x +} + +define double @minimumnum_double(double %a, double %b) { +; CHECK-LABEL: minimumnum_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [minimumnum_double_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [minimumnum_double_param_1]; +; CHECK-NEXT: min.f64 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %x = call double @llvm.minimumnum.f64(double %a, double %b) + ret double %x +} + +; TODO Improve the "Expand" path for minimumnum vectors on targets where +; f16 is not supported. Ideally it should use two f32 minimumnums first instead of +; fully expanding the minimumnum instruction into compare/select instructions. +define <2 x half> @minimumnum_v2half(<2 x half> %a, <2 x half> %b) { +; CHECK-NOF16-LABEL: minimumnum_v2half( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<13>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1]; +; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; +; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; +; CHECK-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4; +; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; +; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768; +; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; +; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768; +; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; +; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; +; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; +; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; +; CHECK-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9; +; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; +; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768; +; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; +; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768; +; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; +; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; +; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-NOF16-NEXT: ret; +; +; CHECK-F16-LABEL: minimumnum_v2half( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 
%r1, [minimumnum_v2half_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %r2, [minimumnum_v2half_param_1]; +; CHECK-F16-NEXT: min.f16x2 %r3, %r1, %r2; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F16-NEXT: ret; +; +; CHECK-SM80-NOF16-LABEL: minimumnum_v2half( +; CHECK-SM80-NOF16: { +; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-NOF16-EMPTY: +; CHECK-SM80-NOF16-NEXT: // %bb.0: +; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0]; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1]; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; +; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; +; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; +; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-SM80-NOF16-NEXT: ret; + %x = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %x +} + +; ---- maximumnum ---- + +define half @maximumnum_half(half %a, half %b) { +; CHECK-NOF16-LABEL: maximumnum_half( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<4>; +; CHECK-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [maximumnum_half_param_0]; +; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [maximumnum_half_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-NOF16-NEXT: max.f32 %r3, %r2, %r1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; +; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NOF16-NEXT: ret; +; +; CHECK-F16-LABEL: maximumnum_half( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b16 %rs<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; 
CHECK-F16-NEXT: ld.param.b16 %rs1, [maximumnum_half_param_0]; +; CHECK-F16-NEXT: ld.param.b16 %rs2, [maximumnum_half_param_1]; +; CHECK-F16-NEXT: max.f16 %rs3, %rs1, %rs2; +; CHECK-F16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-F16-NEXT: ret; +; +; CHECK-SM80-NOF16-LABEL: maximumnum_half( +; CHECK-SM80-NOF16: { +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<4>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-SM80-NOF16-EMPTY: +; CHECK-SM80-NOF16-NEXT: // %bb.0: +; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [maximumnum_half_param_0]; +; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [maximumnum_half_param_1]; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-SM80-NOF16-NEXT: max.f32 %r3, %r2, %r1; +; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; +; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-SM80-NOF16-NEXT: ret; + %x = call half @llvm.maximumnum.f16(half %a, half %b) + ret half %x +} + +define float @maximumnum_float(float %a, float %b) { +; CHECK-LABEL: maximumnum_float( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [maximumnum_float_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [maximumnum_float_param_1]; +; CHECK-NEXT: max.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %x = call float @llvm.maximumnum.f32(float %a, float %b) + ret float %x +} + +define float @maximumnum_float_ftz(float %a, float %b) #1 { +; CHECK-LABEL: maximumnum_float_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [maximumnum_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [maximumnum_float_ftz_param_1]; +; CHECK-NEXT: max.ftz.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %x = call float @llvm.maximumnum.f32(float %a, float %b) + ret float %x +} + +define double @maximumnum_double(double %a, double %b) { +; CHECK-LABEL: maximumnum_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [maximumnum_double_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [maximumnum_double_param_1]; +; CHECK-NEXT: max.f64 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %x = call double @llvm.maximumnum.f64(double %a, double %b) + ret double %x +} + +; TODO Improve the "Expand" path for maximumnum vectors on targets where +; f16 is not supported. Ideally it should use two f32 maximumnums first instead of +; fully expanding the maximumnum instruction into compare/select instructions. 
+define <2 x half> @maximumnum_v2half(<2 x half> %a, <2 x half> %b) { +; CHECK-NOF16-LABEL: maximumnum_v2half( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<13>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1]; +; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; +; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; +; CHECK-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4; +; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; +; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; +; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; +; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; +; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; +; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; +; CHECK-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9; +; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; +; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; +; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; +; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; +; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-NOF16-NEXT: ret; +; +; CHECK-F16-LABEL: maximumnum_v2half( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r1, [maximumnum_v2half_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %r2, [maximumnum_v2half_param_1]; +; CHECK-F16-NEXT: max.f16x2 %r3, %r1, %r2; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F16-NEXT: ret; +; +; CHECK-SM80-NOF16-LABEL: maximumnum_v2half( +; CHECK-SM80-NOF16: { +; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-NOF16-EMPTY: +; CHECK-SM80-NOF16-NEXT: // %bb.0: +; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0]; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1]; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; +; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p3, %r2, 
%r4; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; +; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; +; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-SM80-NOF16-NEXT: ret; + %x = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %x +} + ; ---- fma ---- define float @fma_float(float %a, float %b, float %c) { diff --git a/llvm/test/CodeGen/NVPTX/mbarrier.ll b/llvm/test/CodeGen/NVPTX/mbarrier.ll index 87a73aa..78edc0a 100644 --- a/llvm/test/CodeGen/NVPTX/mbarrier.ll +++ b/llvm/test/CodeGen/NVPTX/mbarrier.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX64 -; RUN: %if ptxas-11.0 && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} declare void @llvm.nvvm.mbarrier.init(ptr %a, i32 %b) declare void @llvm.nvvm.mbarrier.init.shared(ptr addrspace(3) %a, i32 %b) diff --git a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll index dfdb338..be6d158 100644 --- a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll +++ b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll @@ -8,55 +8,52 @@ target triple = "nvptx64-nvidia-cuda" define <4 x float> @t1(ptr %p1) { ; CHECK-LABEL: t1( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<46>; +; CHECK-NEXT: .reg .b32 %r<41>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [t1_param_0]; -; CHECK-NEXT: ld.b8 %rd2, [%rd1+8]; -; CHECK-NEXT: ld.b8 %rd3, [%rd1+9]; -; CHECK-NEXT: shl.b64 %rd4, %rd3, 8; -; CHECK-NEXT: or.b64 %rd5, %rd4, %rd2; -; CHECK-NEXT: ld.b8 %rd6, [%rd1+10]; -; CHECK-NEXT: shl.b64 %rd7, %rd6, 16; -; CHECK-NEXT: ld.b8 %rd8, [%rd1+11]; -; CHECK-NEXT: shl.b64 %rd9, %rd8, 24; -; CHECK-NEXT: or.b64 %rd10, %rd9, %rd7; -; CHECK-NEXT: or.b64 %rd11, %rd10, %rd5; -; 
CHECK-NEXT: ld.b8 %rd12, [%rd1+12]; -; CHECK-NEXT: ld.b8 %rd13, [%rd1+13]; -; CHECK-NEXT: shl.b64 %rd14, %rd13, 8; -; CHECK-NEXT: or.b64 %rd15, %rd14, %rd12; -; CHECK-NEXT: ld.b8 %rd16, [%rd1+14]; -; CHECK-NEXT: shl.b64 %rd17, %rd16, 16; -; CHECK-NEXT: ld.b8 %rd18, [%rd1+15]; -; CHECK-NEXT: shl.b64 %rd19, %rd18, 24; -; CHECK-NEXT: or.b64 %rd20, %rd19, %rd17; -; CHECK-NEXT: or.b64 %rd21, %rd20, %rd15; -; CHECK-NEXT: shl.b64 %rd22, %rd21, 32; -; CHECK-NEXT: or.b64 %rd23, %rd22, %rd11; -; CHECK-NEXT: ld.b8 %rd24, [%rd1]; -; CHECK-NEXT: ld.b8 %rd25, [%rd1+1]; -; CHECK-NEXT: shl.b64 %rd26, %rd25, 8; -; CHECK-NEXT: or.b64 %rd27, %rd26, %rd24; -; CHECK-NEXT: ld.b8 %rd28, [%rd1+2]; -; CHECK-NEXT: shl.b64 %rd29, %rd28, 16; -; CHECK-NEXT: ld.b8 %rd30, [%rd1+3]; -; CHECK-NEXT: shl.b64 %rd31, %rd30, 24; -; CHECK-NEXT: or.b64 %rd32, %rd31, %rd29; -; CHECK-NEXT: or.b64 %rd33, %rd32, %rd27; -; CHECK-NEXT: ld.b8 %rd34, [%rd1+4]; -; CHECK-NEXT: ld.b8 %rd35, [%rd1+5]; -; CHECK-NEXT: shl.b64 %rd36, %rd35, 8; -; CHECK-NEXT: or.b64 %rd37, %rd36, %rd34; -; CHECK-NEXT: ld.b8 %rd38, [%rd1+6]; -; CHECK-NEXT: shl.b64 %rd39, %rd38, 16; -; CHECK-NEXT: ld.b8 %rd40, [%rd1+7]; -; CHECK-NEXT: shl.b64 %rd41, %rd40, 24; -; CHECK-NEXT: or.b64 %rd42, %rd41, %rd39; -; CHECK-NEXT: or.b64 %rd43, %rd42, %rd37; -; CHECK-NEXT: shl.b64 %rd44, %rd43, 32; -; CHECK-NEXT: or.b64 %rd45, %rd44, %rd33; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd45, %rd23}; +; CHECK-NEXT: ld.b8 %r1, [%rd1+12]; +; CHECK-NEXT: ld.b8 %r2, [%rd1+13]; +; CHECK-NEXT: shl.b32 %r3, %r2, 8; +; CHECK-NEXT: or.b32 %r4, %r3, %r1; +; CHECK-NEXT: ld.b8 %r5, [%rd1+14]; +; CHECK-NEXT: shl.b32 %r6, %r5, 16; +; CHECK-NEXT: ld.b8 %r7, [%rd1+15]; +; CHECK-NEXT: shl.b32 %r8, %r7, 24; +; CHECK-NEXT: or.b32 %r9, %r8, %r6; +; CHECK-NEXT: or.b32 %r10, %r9, %r4; +; CHECK-NEXT: ld.b8 %r11, [%rd1+8]; +; CHECK-NEXT: ld.b8 %r12, [%rd1+9]; +; CHECK-NEXT: shl.b32 %r13, %r12, 8; +; CHECK-NEXT: or.b32 %r14, %r13, %r11; +; CHECK-NEXT: ld.b8 %r15, [%rd1+10]; +; CHECK-NEXT: shl.b32 %r16, %r15, 16; +; CHECK-NEXT: ld.b8 %r17, [%rd1+11]; +; CHECK-NEXT: shl.b32 %r18, %r17, 24; +; CHECK-NEXT: or.b32 %r19, %r18, %r16; +; CHECK-NEXT: or.b32 %r20, %r19, %r14; +; CHECK-NEXT: ld.b8 %r21, [%rd1+4]; +; CHECK-NEXT: ld.b8 %r22, [%rd1+5]; +; CHECK-NEXT: shl.b32 %r23, %r22, 8; +; CHECK-NEXT: or.b32 %r24, %r23, %r21; +; CHECK-NEXT: ld.b8 %r25, [%rd1+6]; +; CHECK-NEXT: shl.b32 %r26, %r25, 16; +; CHECK-NEXT: ld.b8 %r27, [%rd1+7]; +; CHECK-NEXT: shl.b32 %r28, %r27, 24; +; CHECK-NEXT: or.b32 %r29, %r28, %r26; +; CHECK-NEXT: or.b32 %r30, %r29, %r24; +; CHECK-NEXT: ld.b8 %r31, [%rd1]; +; CHECK-NEXT: ld.b8 %r32, [%rd1+1]; +; CHECK-NEXT: shl.b32 %r33, %r32, 8; +; CHECK-NEXT: or.b32 %r34, %r33, %r31; +; CHECK-NEXT: ld.b8 %r35, [%rd1+2]; +; CHECK-NEXT: shl.b32 %r36, %r35, 16; +; CHECK-NEXT: ld.b8 %r37, [%rd1+3]; +; CHECK-NEXT: shl.b32 %r38, %r37, 24; +; CHECK-NEXT: or.b32 %r39, %r38, %r36; +; CHECK-NEXT: or.b32 %r40, %r39, %r34; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r40, %r30, %r20, %r10}; ; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 1 ret <4 x float> %r @@ -65,19 +62,16 @@ define <4 x float> @t1(ptr %p1) { define <4 x float> @t2(ptr %p1) { ; CHECK-LABEL: t2( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<10>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [t2_param_0]; -; CHECK-NEXT: ld.b32 %rd2, [%rd1+8]; -; CHECK-NEXT: ld.b32 %rd3, [%rd1+12]; -; CHECK-NEXT: shl.b64 %rd4, %rd3, 32; -; CHECK-NEXT: or.b64 
%rd5, %rd4, %rd2; -; CHECK-NEXT: ld.b32 %rd6, [%rd1]; -; CHECK-NEXT: ld.b32 %rd7, [%rd1+4]; -; CHECK-NEXT: shl.b64 %rd8, %rd7, 32; -; CHECK-NEXT: or.b64 %rd9, %rd8, %rd6; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd9, %rd5}; +; CHECK-NEXT: ld.b32 %r1, [%rd1+12]; +; CHECK-NEXT: ld.b32 %r2, [%rd1+8]; +; CHECK-NEXT: ld.b32 %r3, [%rd1+4]; +; CHECK-NEXT: ld.b32 %r4, [%rd1]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1}; ; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 4 ret <4 x float> %r @@ -86,13 +80,14 @@ define <4 x float> @t2(ptr %p1) { define <4 x float> @t3(ptr %p1) { ; CHECK-LABEL: t3( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [t3_param_0]; -; CHECK-NEXT: ld.b64 %rd2, [%rd1+8]; -; CHECK-NEXT: ld.b64 %rd3, [%rd1]; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1+8]; +; CHECK-NEXT: ld.v2.b32 {%r3, %r4}, [%rd1]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r3, %r4, %r1, %r2}; ; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 8 ret <4 x float> %r @@ -101,12 +96,13 @@ define <4 x float> @t3(ptr %p1) { define <4 x float> @t4(ptr %p1) { ; CHECK-LABEL: t4( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [t4_param_0]; -; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd3}; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 16 ret <4 x float> %r @@ -189,41 +185,40 @@ define void @test_v4halfp0a1(ptr noalias readonly %from, ptr %to) { define void @s1(ptr %p1, <4 x float> %v) { ; CHECK-LABEL: s1( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<18>; +; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [s1_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s1_param_1]; -; CHECK-NEXT: st.b8 [%rd1+8], %rd3; -; CHECK-NEXT: st.b8 [%rd1], %rd2; -; CHECK-NEXT: shr.u64 %rd4, %rd3, 56; -; CHECK-NEXT: st.b8 [%rd1+15], %rd4; -; CHECK-NEXT: shr.u64 %rd5, %rd3, 48; -; CHECK-NEXT: st.b8 [%rd1+14], %rd5; -; CHECK-NEXT: shr.u64 %rd6, %rd3, 40; -; CHECK-NEXT: st.b8 [%rd1+13], %rd6; -; CHECK-NEXT: shr.u64 %rd7, %rd3, 32; -; CHECK-NEXT: st.b8 [%rd1+12], %rd7; -; CHECK-NEXT: shr.u64 %rd8, %rd3, 24; -; CHECK-NEXT: st.b8 [%rd1+11], %rd8; -; CHECK-NEXT: shr.u64 %rd9, %rd3, 16; -; CHECK-NEXT: st.b8 [%rd1+10], %rd9; -; CHECK-NEXT: shr.u64 %rd10, %rd3, 8; -; CHECK-NEXT: st.b8 [%rd1+9], %rd10; -; CHECK-NEXT: shr.u64 %rd11, %rd2, 56; -; CHECK-NEXT: st.b8 [%rd1+7], %rd11; -; CHECK-NEXT: shr.u64 %rd12, %rd2, 48; -; CHECK-NEXT: st.b8 [%rd1+6], %rd12; -; CHECK-NEXT: shr.u64 %rd13, %rd2, 40; -; CHECK-NEXT: st.b8 [%rd1+5], %rd13; -; CHECK-NEXT: shr.u64 %rd14, %rd2, 32; -; CHECK-NEXT: st.b8 [%rd1+4], %rd14; -; CHECK-NEXT: shr.u64 %rd15, %rd2, 24; -; CHECK-NEXT: st.b8 [%rd1+3], %rd15; -; CHECK-NEXT: shr.u64 %rd16, %rd2, 16; -; CHECK-NEXT: st.b8 [%rd1+2], %rd16; -; CHECK-NEXT: shr.u64 %rd17, %rd2, 8; -; CHECK-NEXT: st.b8 [%rd1+1], %rd17; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s1_param_1]; +; CHECK-NEXT: st.b8 [%rd1+12], %r4; +; CHECK-NEXT: st.b8 [%rd1+8], %r3; +; CHECK-NEXT: st.b8 [%rd1+4], 
%r2; +; CHECK-NEXT: st.b8 [%rd1], %r1; +; CHECK-NEXT: shr.u32 %r5, %r4, 24; +; CHECK-NEXT: st.b8 [%rd1+15], %r5; +; CHECK-NEXT: shr.u32 %r6, %r4, 16; +; CHECK-NEXT: st.b8 [%rd1+14], %r6; +; CHECK-NEXT: shr.u32 %r7, %r4, 8; +; CHECK-NEXT: st.b8 [%rd1+13], %r7; +; CHECK-NEXT: shr.u32 %r8, %r3, 24; +; CHECK-NEXT: st.b8 [%rd1+11], %r8; +; CHECK-NEXT: shr.u32 %r9, %r3, 16; +; CHECK-NEXT: st.b8 [%rd1+10], %r9; +; CHECK-NEXT: shr.u32 %r10, %r3, 8; +; CHECK-NEXT: st.b8 [%rd1+9], %r10; +; CHECK-NEXT: shr.u32 %r11, %r2, 24; +; CHECK-NEXT: st.b8 [%rd1+7], %r11; +; CHECK-NEXT: shr.u32 %r12, %r2, 16; +; CHECK-NEXT: st.b8 [%rd1+6], %r12; +; CHECK-NEXT: shr.u32 %r13, %r2, 8; +; CHECK-NEXT: st.b8 [%rd1+5], %r13; +; CHECK-NEXT: shr.u32 %r14, %r1, 24; +; CHECK-NEXT: st.b8 [%rd1+3], %r14; +; CHECK-NEXT: shr.u32 %r15, %r1, 16; +; CHECK-NEXT: st.b8 [%rd1+2], %r15; +; CHECK-NEXT: shr.u32 %r16, %r1, 8; +; CHECK-NEXT: st.b8 [%rd1+1], %r16; ; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 1 ret void @@ -232,17 +227,16 @@ define void @s1(ptr %p1, <4 x float> %v) { define void @s2(ptr %p1, <4 x float> %v) { ; CHECK-LABEL: s2( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [s2_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s2_param_1]; -; CHECK-NEXT: st.b32 [%rd1+8], %rd3; -; CHECK-NEXT: st.b32 [%rd1], %rd2; -; CHECK-NEXT: shr.u64 %rd4, %rd3, 32; -; CHECK-NEXT: st.b32 [%rd1+12], %rd4; -; CHECK-NEXT: shr.u64 %rd5, %rd2, 32; -; CHECK-NEXT: st.b32 [%rd1+4], %rd5; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s2_param_1]; +; CHECK-NEXT: st.b32 [%rd1+12], %r4; +; CHECK-NEXT: st.b32 [%rd1+8], %r3; +; CHECK-NEXT: st.b32 [%rd1+4], %r2; +; CHECK-NEXT: st.b32 [%rd1], %r1; ; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 4 ret void @@ -251,13 +245,14 @@ define void @s2(ptr %p1, <4 x float> %v) { define void @s3(ptr %p1, <4 x float> %v) { ; CHECK-LABEL: s3( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [s3_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s3_param_1]; -; CHECK-NEXT: st.b64 [%rd1+8], %rd3; -; CHECK-NEXT: st.b64 [%rd1], %rd2; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s3_param_1]; +; CHECK-NEXT: st.v2.b32 [%rd1+8], {%r3, %r4}; +; CHECK-NEXT: st.v2.b32 [%rd1], {%r1, %r2}; ; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 8 ret void @@ -266,12 +261,13 @@ define void @s3(ptr %p1, <4 x float> %v) { define void @s4(ptr %p1, <4 x float> %v) { ; CHECK-LABEL: s4( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [s4_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s4_param_1]; -; CHECK-NEXT: st.v2.b64 [%rd1], {%rd2, %rd3}; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s4_param_1]; +; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 16 ret void diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll index b2994c0..62f99e9 100644 --- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll +++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll @@ -7,14 +7,14 @@ target triple = "nvptx64-nvidia-cuda" define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; 
CHECK-LABEL: wombat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<11>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %bb -; CHECK-NEXT: ld.param.b32 %r4, [wombat_param_2]; -; CHECK-NEXT: ld.param.b32 %r3, [wombat_param_1]; -; CHECK-NEXT: ld.param.b32 %r2, [wombat_param_0]; -; CHECK-NEXT: mov.b32 %r10, 0; +; CHECK-NEXT: ld.param.b32 %r3, [wombat_param_2]; +; CHECK-NEXT: ld.param.b32 %r2, [wombat_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [wombat_param_0]; +; CHECK-NEXT: mov.b32 %r7, 0; ; CHECK-NEXT: $L__BB0_1: // %bb3 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: { // callseq 0, 0 @@ -23,15 +23,15 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; CHECK-NEXT: st.param.b64 [param0], 0; ; CHECK-NEXT: call.uni (retval0), quux, (param0); ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: mul.lo.s32 %r7, %r10, %r3; -; CHECK-NEXT: or.b32 %r8, %r4, %r7; -; CHECK-NEXT: mul.lo.s32 %r9, %r2, %r8; -; CHECK-NEXT: cvt.rn.f64.s32 %rd1, %r9; -; CHECK-NEXT: cvt.rn.f64.u32 %rd2, %r10; +; CHECK-NEXT: mul.lo.s32 %r4, %r7, %r2; +; CHECK-NEXT: or.b32 %r5, %r3, %r4; +; CHECK-NEXT: mul.lo.s32 %r6, %r1, %r5; +; CHECK-NEXT: cvt.rn.f64.s32 %rd1, %r6; +; CHECK-NEXT: cvt.rn.f64.u32 %rd2, %r7; ; CHECK-NEXT: add.rn.f64 %rd3, %rd2, %rd1; ; CHECK-NEXT: mov.b64 %rd4, 0; ; CHECK-NEXT: st.global.b64 [%rd4], %rd3; -; CHECK-NEXT: mov.b32 %r10, 1; +; CHECK-NEXT: mov.b32 %r7, 1; ; CHECK-NEXT: bra.uni $L__BB0_1; bb: br label %bb3 diff --git a/llvm/test/CodeGen/NVPTX/mulwide.ll b/llvm/test/CodeGen/NVPTX/mulwide.ll index 666c7a1..bde57fb 100644 --- a/llvm/test/CodeGen/NVPTX/mulwide.ll +++ b/llvm/test/CodeGen/NVPTX/mulwide.ll @@ -118,17 +118,15 @@ define i32 @mulwideu8(i8 %a, i8 %b) { ; NOOPT-LABEL: mulwideu8( ; NOOPT: { ; NOOPT-NEXT: .reg .b16 %rs<3>; -; NOOPT-NEXT: .reg .b32 %r<6>; +; NOOPT-NEXT: .reg .b32 %r<4>; ; NOOPT-EMPTY: ; NOOPT-NEXT: // %bb.0: ; NOOPT-NEXT: ld.param.b8 %rs2, [mulwideu8_param_1]; ; NOOPT-NEXT: ld.param.b8 %rs1, [mulwideu8_param_0]; ; NOOPT-NEXT: cvt.u32.u16 %r1, %rs1; -; NOOPT-NEXT: and.b32 %r2, %r1, 255; -; NOOPT-NEXT: cvt.u32.u16 %r3, %rs2; -; NOOPT-NEXT: and.b32 %r4, %r3, 255; -; NOOPT-NEXT: mul.lo.s32 %r5, %r2, %r4; -; NOOPT-NEXT: st.param.b32 [func_retval0], %r5; +; NOOPT-NEXT: cvt.u32.u16 %r2, %rs2; +; NOOPT-NEXT: mul.lo.s32 %r3, %r1, %r2; +; NOOPT-NEXT: st.param.b32 [func_retval0], %r3; ; NOOPT-NEXT: ret; %val0 = zext i8 %a to i32 %val1 = zext i8 %b to i32 @@ -203,27 +201,35 @@ define i64 @mulwideu32(i32 %a, i32 %b) { define i64 @mulwideu7(i7 %a, i7 %b) { ; OPT-LABEL: mulwideu7( ; OPT: { -; OPT-NEXT: .reg .b32 %r<3>; +; OPT-NEXT: .reg .b32 %r<5>; ; OPT-NEXT: .reg .b64 %rd<2>; ; OPT-EMPTY: ; OPT-NEXT: // %bb.0: -; OPT-NEXT: ld.param.b8 %r1, [mulwideu7_param_0]; -; OPT-NEXT: ld.param.b8 %r2, [mulwideu7_param_1]; -; OPT-NEXT: mul.wide.u32 %rd1, %r1, %r2; +; OPT-NEXT: ld.param.b8 %r1, [mulwideu7_param_1]; +; OPT-NEXT: and.b32 %r2, %r1, 127; +; OPT-NEXT: ld.param.b8 %r3, [mulwideu7_param_0]; +; OPT-NEXT: and.b32 %r4, %r3, 127; +; OPT-NEXT: mul.wide.u32 %rd1, %r4, %r2; ; OPT-NEXT: st.param.b64 [func_retval0], %rd1; ; OPT-NEXT: ret; ; ; NOOPT-LABEL: mulwideu7( ; NOOPT: { -; NOOPT-NEXT: .reg .b16 %rs<3>; +; NOOPT-NEXT: .reg .b16 %rs<9>; ; NOOPT-NEXT: .reg .b64 %rd<6>; ; NOOPT-EMPTY: ; NOOPT-NEXT: // %bb.0: -; NOOPT-NEXT: ld.param.b8 %rs2, [mulwideu7_param_1]; -; NOOPT-NEXT: ld.param.b8 %rs1, [mulwideu7_param_0]; -; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs1; +; NOOPT-NEXT: ld.param.b8 %rs3, [mulwideu7_param_0+1]; +; NOOPT-NEXT: 
shl.b16 %rs4, %rs3, 8; +; NOOPT-NEXT: ld.param.b8 %rs5, [mulwideu7_param_0]; +; NOOPT-NEXT: or.b16 %rs1, %rs4, %rs5; +; NOOPT-NEXT: ld.param.b8 %rs6, [mulwideu7_param_1+1]; +; NOOPT-NEXT: shl.b16 %rs7, %rs6, 8; +; NOOPT-NEXT: ld.param.b8 %rs8, [mulwideu7_param_1]; +; NOOPT-NEXT: or.b16 %rs2, %rs7, %rs8; +; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs5; ; NOOPT-NEXT: and.b64 %rd2, %rd1, 127; -; NOOPT-NEXT: cvt.u64.u16 %rd3, %rs2; +; NOOPT-NEXT: cvt.u64.u16 %rd3, %rs8; ; NOOPT-NEXT: and.b64 %rd4, %rd3, 127; ; NOOPT-NEXT: mul.lo.s64 %rd5, %rd2, %rd4; ; NOOPT-NEXT: st.param.b64 [func_retval0], %rd5; @@ -242,26 +248,32 @@ define i64 @mulwides7(i7 %a, i7 %b) { ; OPT-EMPTY: ; OPT-NEXT: // %bb.0: ; OPT-NEXT: ld.param.b8 %r1, [mulwides7_param_0]; -; OPT-NEXT: bfe.s32 %r2, %r1, 0, 7; -; OPT-NEXT: ld.param.b8 %r3, [mulwides7_param_1]; -; OPT-NEXT: bfe.s32 %r4, %r3, 0, 7; -; OPT-NEXT: mul.wide.s32 %rd1, %r2, %r4; +; OPT-NEXT: ld.param.b8 %r2, [mulwides7_param_1]; +; OPT-NEXT: bfe.s32 %r3, %r2, 0, 7; +; OPT-NEXT: bfe.s32 %r4, %r1, 0, 7; +; OPT-NEXT: mul.wide.s32 %rd1, %r4, %r3; ; OPT-NEXT: st.param.b64 [func_retval0], %rd1; ; OPT-NEXT: ret; ; ; NOOPT-LABEL: mulwides7( ; NOOPT: { -; NOOPT-NEXT: .reg .b16 %rs<3>; +; NOOPT-NEXT: .reg .b16 %rs<9>; ; NOOPT-NEXT: .reg .b64 %rd<6>; ; NOOPT-EMPTY: ; NOOPT-NEXT: // %bb.0: -; NOOPT-NEXT: ld.param.b8 %rs2, [mulwides7_param_1]; -; NOOPT-NEXT: ld.param.b8 %rs1, [mulwides7_param_0]; -; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs1; -; NOOPT-NEXT: bfe.s64 %rd2, %rd1, 0, 7; -; NOOPT-NEXT: cvt.u64.u16 %rd3, %rs2; -; NOOPT-NEXT: bfe.s64 %rd4, %rd3, 0, 7; -; NOOPT-NEXT: mul.lo.s64 %rd5, %rd2, %rd4; +; NOOPT-NEXT: ld.param.b8 %rs3, [mulwides7_param_0+1]; +; NOOPT-NEXT: shl.b16 %rs4, %rs3, 8; +; NOOPT-NEXT: ld.param.b8 %rs5, [mulwides7_param_0]; +; NOOPT-NEXT: or.b16 %rs1, %rs4, %rs5; +; NOOPT-NEXT: ld.param.b8 %rs6, [mulwides7_param_1]; +; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs6; +; NOOPT-NEXT: cvt.u64.u16 %rd2, %rs5; +; NOOPT-NEXT: ld.param.b8 %rs7, [mulwides7_param_1+1]; +; NOOPT-NEXT: shl.b16 %rs8, %rs7, 8; +; NOOPT-NEXT: or.b16 %rs2, %rs8, %rs6; +; NOOPT-NEXT: bfe.s64 %rd3, %rd2, 0, 7; +; NOOPT-NEXT: bfe.s64 %rd4, %rd1, 0, 7; +; NOOPT-NEXT: mul.lo.s64 %rd5, %rd3, %rd4; ; NOOPT-NEXT: st.param.b64 [func_retval0], %rd5; ; NOOPT-NEXT: ret; %val0 = sext i7 %a to i64 diff --git a/llvm/test/CodeGen/NVPTX/nanosleep.ll b/llvm/test/CodeGen/NVPTX/nanosleep.ll index de08c9f..48bf8bc 100644 --- a/llvm/test/CodeGen/NVPTX/nanosleep.ll +++ b/llvm/test/CodeGen/NVPTX/nanosleep.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -O2 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} declare void @llvm.nvvm.nanosleep(i32) diff --git a/llvm/test/CodeGen/NVPTX/no-f32x2.ll b/llvm/test/CodeGen/NVPTX/no-f32x2.ll new file mode 100644 index 0000000..b2b9091 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/no-f32x2.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_100 | FileCheck %s --check-prefix=F32X2 +; RUN: llc < %s -mcpu=sm_90 | FileCheck %s --check-prefix=NOF32X2 +; RUN: llc < %s -mcpu=sm_100 -nvptx-no-f32x2 | FileCheck %s --check-prefix=NOF32X2 + +target triple = "nvptx64-nvidia-cuda" + +define <2 x float> @test(<2 x float> %a, <2 x float> %b) { +; F32X2-LABEL: test( +; F32X2: { +; F32X2-NEXT: .reg .b64 
%rd<4>; +; F32X2-EMPTY: +; F32X2-NEXT: // %bb.0: +; F32X2-NEXT: ld.param.b64 %rd1, [test_param_0]; +; F32X2-NEXT: ld.param.b64 %rd2, [test_param_1]; +; F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; F32X2-NEXT: ret; +; +; NOF32X2-LABEL: test( +; NOF32X2: { +; NOF32X2-NEXT: .reg .b32 %r<7>; +; NOF32X2-EMPTY: +; NOF32X2-NEXT: // %bb.0: +; NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_param_0]; +; NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_param_1]; +; NOF32X2-NEXT: add.rn.f32 %r5, %r2, %r4; +; NOF32X2-NEXT: add.rn.f32 %r6, %r1, %r3; +; NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; NOF32X2-NEXT: ret; + %c = fadd <2 x float> %a, %b + ret <2 x float> %c +} diff --git a/llvm/test/CodeGen/NVPTX/nofunc.ll b/llvm/test/CodeGen/NVPTX/nofunc.ll index a8ce20e..d07d222 100644 --- a/llvm/test/CodeGen/NVPTX/nofunc.ll +++ b/llvm/test/CodeGen/NVPTX/nofunc.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; Test that we don't crash if we're compiling a module with function references, diff --git a/llvm/test/CodeGen/NVPTX/noreturn.ll b/llvm/test/CodeGen/NVPTX/noreturn.ll index 6c11d0a..0062e62 100644 --- a/llvm/test/CodeGen/NVPTX/noreturn.ll +++ b/llvm/test/CodeGen/NVPTX/noreturn.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx64 -mcpu=sm_30 | FileCheck %s -; RUN: %if ptxas %{llc < %s -mtriple=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 %{llc < %s -mtriple=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} @function_pointer = addrspace(1) global ptr null diff --git a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll index 9a78d31..8527d3d 100644 --- a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll +++ b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-nvcl -mcpu=sm_60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-nvcl -mcpu=sm_60 | %ptxas-verify %} +; RUN: %if ptxas-sm_60 %{ llc < %s -mtriple=nvptx64-nvidia-nvcl -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} target triple = "nvptx-unknown-nvcl" diff --git a/llvm/test/CodeGen/NVPTX/packed-aggr.ll b/llvm/test/CodeGen/NVPTX/packed-aggr.ll index 602bef2..353f1cb 100644 --- a/llvm/test/CodeGen/NVPTX/packed-aggr.ll +++ b/llvm/test/CodeGen/NVPTX/packed-aggr.ll @@ -5,8 +5,8 @@ ; RUN: FileCheck %s --check-prefixes=CHECK,CHECK32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | \ ; RUN: FileCheck %s --check-prefixes=CHECK,CHECK64 -; RUN: %if ptxas-11.1 && !ptxas-12.0%{ llc < %s -mtriple=nvptx -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} -; RUN: %if ptxas-11.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} +; RUN: %if ptxas-isa-7.1 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} +; RUN: %if ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} ;; Test that packed structs with symbol references are represented using the ;; mask() operator. 
diff --git a/llvm/test/CodeGen/NVPTX/param-add.ll b/llvm/test/CodeGen/NVPTX/param-add.ll index c5ea9f8..06d7384 100644 --- a/llvm/test/CodeGen/NVPTX/param-add.ll +++ b/llvm/test/CodeGen/NVPTX/param-add.ll @@ -14,7 +14,7 @@ declare i32 @callee(%struct.1float %a) define i32 @test(%struct.1float alignstack(32) %data) { ; CHECK-LABEL: test( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll index db3fbbc..90c8b92 100644 --- a/llvm/test/CodeGen/NVPTX/param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll @@ -523,8 +523,7 @@ define <9 x half> @test_v9f16(<9 x half> %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i19( ; CHECK-NEXT: .param .b32 test_i19_param_0 -; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i19_param_0]; -; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i19_param_0+2]; +; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i19_param_0]; ; CHECK: .param .b32 param0; ; CHECK: .param .b32 retval0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; @@ -540,8 +539,7 @@ define i19 @test_i19(i19 %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i23( ; CHECK-NEXT: .param .b32 test_i23_param_0 -; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i23_param_0]; -; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i23_param_0+2]; +; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i23_param_0]; ; CHECK: .param .b32 param0; ; CHECK: .param .b32 retval0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; @@ -557,8 +555,7 @@ define i23 @test_i23(i23 %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i24( ; CHECK-NEXT: .param .b32 test_i24_param_0 -; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i24_param_0+2]; -; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i24_param_0]; +; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i24_param_0]; ; CHECK: .param .b32 param0; ; CHECK: .param .b32 retval0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; @@ -678,8 +675,7 @@ define float @test_f32(float %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i40( ; CHECK-NEXT: .param .b64 test_i40_param_0 -; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i40_param_0+4]; -; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i40_param_0]; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i40_param_0]; ; CHECK: .param .b64 param0; ; CHECK: .param .b64 retval0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; @@ -695,8 +691,7 @@ define i40 @test_i40(i40 %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i47( ; CHECK-NEXT: .param .b64 test_i47_param_0 -; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i47_param_0+4]; -; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i47_param_0]; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i47_param_0]; ; CHECK: .param .b64 param0; ; CHECK: .param .b64 retval0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; @@ -712,8 +707,7 @@ define i47 @test_i47(i47 %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i48( ; CHECK-NEXT: .param .b64 test_i48_param_0 -; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i48_param_0+4]; -; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i48_param_0]; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i48_param_0]; ; CHECK: .param .b64 param0; ; CHECK: .param .b64 retval0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; @@ -729,9 +723,7 @@ define i48 @test_i48(i48 %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: 
test_i51( ; CHECK-NEXT: .param .b64 test_i51_param_0 -; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i51_param_0+6]; -; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i51_param_0+4]; -; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i51_param_0]; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i51_param_0]; ; CHECK: .param .b64 param0; ; CHECK: .param .b64 retval0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; @@ -747,9 +739,7 @@ define i51 @test_i51(i51 %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i56( ; CHECK-NEXT: .param .b64 test_i56_param_0 -; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i56_param_0+6]; -; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i56_param_0+4]; -; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i56_param_0]; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i56_param_0]; ; CHECK: .param .b64 param0; ; CHECK: .param .b64 retval0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; diff --git a/llvm/test/CodeGen/NVPTX/param-overalign.ll b/llvm/test/CodeGen/NVPTX/param-overalign.ll index 2155fb4..2ee749f 100644 --- a/llvm/test/CodeGen/NVPTX/param-overalign.ll +++ b/llvm/test/CodeGen/NVPTX/param-overalign.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -verify-machineinstrs | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -verify-machineinstrs | %ptxas-verify %} target triple = "nvptx64-nvidia-cuda" @@ -21,7 +21,7 @@ target triple = "nvptx64-nvidia-cuda" define float @caller_md(float %a, float %b) { ; CHECK-LABEL: caller_md( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [caller_md_param_0]; @@ -62,7 +62,7 @@ define float @callee_md(%struct.float2 alignstack(8) %a) { define float @caller(float %a, float %b) { ; CHECK-LABEL: caller( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [caller_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/pr126337.ll b/llvm/test/CodeGen/NVPTX/pr126337.ll index 95258f7..525da1f 100644 --- a/llvm/test/CodeGen/NVPTX/pr126337.ll +++ b/llvm/test/CodeGen/NVPTX/pr126337.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 | %ptxas -arch=sm_70 -c - %} +; RUN: %if ptxas-sm_70 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 | %ptxas-verify -arch=sm_70 %} ; This IR should compile without triggering assertions in LICM ; when the CopyToReg from %0 in the first BB gets eliminated @@ -17,17 +17,16 @@ define ptx_kernel void @Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel(<2 x float> %0) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %.preheader15 -; CHECK-NEXT: ld.param.b64 %rd1, [Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel_param_0]; -; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; } +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel_param_0]; ; CHECK-NEXT: setp.eq.f32 %p1, %r1, 0f00000000; ; CHECK-NEXT: selp.b16 %rs1, 1, 0, %p1; ; CHECK-NEXT: $L__BB0_1: // =>This Inner 
Loop Header: Depth=1 -; CHECK-NEXT: mov.b64 %rd2, 0; -; CHECK-NEXT: st.b8 [%rd2], %rs1; +; CHECK-NEXT: mov.b64 %rd1, 0; +; CHECK-NEXT: st.b8 [%rd1], %rs1; ; CHECK-NEXT: bra.uni $L__BB0_1; .preheader15: br label %1 diff --git a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll index cd2505c..5120550 100644 --- a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll +++ b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} define ptx_kernel void @t1(ptr %a) { diff --git a/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll b/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll new file mode 100644 index 0000000..bc67471 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll @@ -0,0 +1,80 @@ +; RUN: opt < %s -S -passes=infer-address-spaces | FileCheck %s --check-prefix=INFER
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s --check-prefix=PTX
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
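+
+; The INFER lines below document the expected rewrite: a call to the
+; generic llvm.nvvm.prefetch.tensormap.p0 is replaced with the p4 (const)
+; or p101 (param) overload whenever the pointer's original address space
+; is visible through the addrspacecast chain.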
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+@constant_tensormap = addrspace(4) global [64 x i8] zeroinitializer, align 64
+
+; Inference from const address space
+define void @test_infer_const_from_cast() {
+; INFER-LABEL: @test_infer_const_from_cast
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
+; PTX-LABEL: .visible .func test_infer_const_from_cast(
+; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap;
+; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
+; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
+entry:
+ %casted = addrspacecast ptr addrspace(4) @constant_tensormap to ptr
+ call void @llvm.nvvm.prefetch.tensormap.p0(ptr %casted)
+ ret void
+}
+
+; Cast from const space to generic
+define void @test_const_to_generic_cast(ptr addrspace(4) %const_ptr) {
+; INFER-LABEL: @test_const_to_generic_cast
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+; PTX-LABEL: .visible .func test_const_to_generic_cast(
+; PTX: prefetch.const.tensormap [%rd{{[0-9]+}}];
+entry:
+ %cast = addrspacecast ptr addrspace(4) %const_ptr to ptr
+ call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast)
+ ret void
+}
+
+; No inference possible
+define void @test_no_inference_possible(ptr %generic_ptr) {
+; INFER-LABEL: @test_no_inference_possible
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p0(ptr %generic_ptr)
+; PTX-LABEL: .visible .func test_no_inference_possible(
+; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
+entry:
+ call void @llvm.nvvm.prefetch.tensormap.p0(ptr %generic_ptr)
+ ret void
+}
+
+; Cast from param space to generic
+define void @test_param_to_generic_cast(ptr addrspace(101) %param_ptr) {
+; INFER-LABEL: @test_param_to_generic_cast
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+; PTX-LABEL: .visible .func test_param_to_generic_cast(
+; PTX: prefetch.param.tensormap [%rd{{[0-9]+}}];
+entry:
+ %cast = addrspacecast ptr addrspace(101) %param_ptr to ptr
+ call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast)
+ ret void
+}
+
+; Multiple casts in sequence
+define void @test_infer_through_multiple_casts() {
+; INFER-LABEL: @test_infer_through_multiple_casts
+; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
+; PTX-LABEL: .visible .func test_infer_through_multiple_casts(
+; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap;
+; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
+; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
+entry:
+ %cast1 = addrspacecast ptr addrspace(4) @constant_tensormap to ptr
+ %cast2 = addrspacecast ptr %cast1 to ptr addrspace(4)
+ %cast3 = addrspacecast ptr addrspace(4) %cast2 to ptr
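+ ; The generic round-trip cancels out, so the pass folds the chain back
+ ; to the addrspace(4) global and selects the p4 overload (see INFER above).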
+ call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast3)
+ ret void
+}
+
+declare void @llvm.nvvm.prefetch.tensormap.p0(ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4))
+declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101))
diff --git a/llvm/test/CodeGen/NVPTX/prefetch.ll b/llvm/test/CodeGen/NVPTX/prefetch.ll
index a64e4fe..a1c5ec8 100644
--- a/llvm/test/CodeGen/NVPTX/prefetch.ll
+++ b/llvm/test/CodeGen/NVPTX/prefetch.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s
-; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
target triple = "nvptx64-nvidia-cuda"
@@ -12,6 +12,10 @@ declare void @llvm.nvvm.prefetch.local.L2(ptr addrspace(5) %local_ptr)
declare void @llvm.nvvm.prefetch.L1(ptr %ptr)
declare void @llvm.nvvm.prefetch.L2(ptr %ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+
declare void @llvm.nvvm.prefetch.global.L2.evict.normal(ptr addrspace(1) %global_ptr)
declare void @llvm.nvvm.prefetch.global.L2.evict.last(ptr addrspace(1) %global_ptr)
@@ -78,4 +82,43 @@ define void @prefetchu_l1(ptr %ptr) { ; CHECK-PTX64-NEXT: ret;
tail call void @llvm.nvvm.prefetchu.L1(ptr %ptr)
ret void
+}
+
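+; Direct lowering of the tensormap prefetch intrinsics: a generic pointer
+; uses the unqualified PTX form, while const and param pointers (below)
+; lower to the prefetch.const/prefetch.param qualified forms.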
+define void @prefetch_tensormap(ptr %ptr) {
+; CHECK-PTX64-LABEL: prefetch_tensormap(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_tensormap_param_0];
+; CHECK-PTX64-NEXT: prefetch.tensormap [%rd1];
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
+ ret void
+}
+
+define void @prefetch_const_tensormap(ptr addrspace(4) %const_ptr) {
+; CHECK-PTX64-LABEL: prefetch_const_tensormap(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_const_tensormap_param_0];
+; CHECK-PTX64-NEXT: prefetch.const.tensormap [%rd1];
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+ ret void
+}
+
+define void @prefetch_param_tensormap(ptr addrspace(101) %param_ptr) {
+; CHECK-PTX64-LABEL: prefetch_param_tensormap(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_param_tensormap_param_0];
+; CHECK-PTX64-NEXT: prefetch.param.tensormap [%rd1];
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+ ret void
}
\ No newline at end of file diff --git a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir index 4a53152..dfc8417 100644 --- a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir +++ b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir @@ -66,8 +66,8 @@ frameInfo: hasTailCall: false isCalleeSavedInfoValid: false localFrameSize: 0 - savePoint: '' - restorePoint: '' + savePoint: [] + restorePoint: [] fixedStack: [] stack: [] entry_values: [] diff --git a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll index c78fcdd..153d677 100644 --- a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll +++ b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll @@ -25,11 +25,11 @@ define float @test_gv_float() { define <2 x float> @test_gv_float2() { ; CHECK-LABEL: test_gv_float2( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.global.nc.b64 %rd1, [gv_float2]; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [gv_float2]; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2}; ; CHECK-NEXT: ret; %v = load <2 x float>, ptr @gv_float2 ret <2 x float> %v @@ -38,11 +38,11 @@ define <2 x float> @test_gv_float2() { define <4 x float> @test_gv_float4() { ; CHECK-LABEL: test_gv_float4( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.global.nc.v2.b64 {%rd1, %rd2}, [gv_float4]; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [gv_float4]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; %v = load <4 x float>, ptr @gv_float4 ret <4 x float> %v diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index 94c2637..f871e403 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -2,13 +2,13 @@ ; RUN: llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM80 %s -; RUN: %if ptxas-12.9 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \ +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_80 %} ; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM100 %s -; RUN: %if ptxas-12.9 %{ llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \ +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_100 %} target triple = "nvptx64-nvidia-cuda" @@ -86,28 +86,46 @@ define half @reduce_fadd_half_reassoc_nonpow2(<7 x half> %in) { } define float @reduce_fadd_float(<8 x float> %in) { -; CHECK-LABEL: reduce_fadd_float( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<17>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_param_0+16]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_param_0]; -; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2; -; CHECK-NEXT: mov.b64 
{%r7, %r8}, %rd1; -; CHECK-NEXT: add.rn.f32 %r9, %r7, 0f00000000; -; CHECK-NEXT: add.rn.f32 %r10, %r9, %r8; -; CHECK-NEXT: add.rn.f32 %r11, %r10, %r5; -; CHECK-NEXT: add.rn.f32 %r12, %r11, %r6; -; CHECK-NEXT: add.rn.f32 %r13, %r12, %r3; -; CHECK-NEXT: add.rn.f32 %r14, %r13, %r4; -; CHECK-NEXT: add.rn.f32 %r15, %r14, %r1; -; CHECK-NEXT: add.rn.f32 %r16, %r15, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r16; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fadd_float( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<17>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0]; +; CHECK-SM80-NEXT: add.rn.f32 %r9, %r1, 0f00000000; +; CHECK-SM80-NEXT: add.rn.f32 %r10, %r9, %r2; +; CHECK-SM80-NEXT: add.rn.f32 %r11, %r10, %r3; +; CHECK-SM80-NEXT: add.rn.f32 %r12, %r11, %r4; +; CHECK-SM80-NEXT: add.rn.f32 %r13, %r12, %r5; +; CHECK-SM80-NEXT: add.rn.f32 %r14, %r13, %r6; +; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r7; +; CHECK-SM80-NEXT: add.rn.f32 %r16, %r15, %r8; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r16; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fadd_float( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<17>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd2; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd1; +; CHECK-SM100-NEXT: add.rn.f32 %r9, %r7, 0f00000000; +; CHECK-SM100-NEXT: add.rn.f32 %r10, %r9, %r8; +; CHECK-SM100-NEXT: add.rn.f32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: add.rn.f32 %r12, %r11, %r6; +; CHECK-SM100-NEXT: add.rn.f32 %r13, %r12, %r3; +; CHECK-SM100-NEXT: add.rn.f32 %r14, %r13, %r4; +; CHECK-SM100-NEXT: add.rn.f32 %r15, %r14, %r1; +; CHECK-SM100-NEXT: add.rn.f32 %r16, %r15, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r16; +; CHECK-SM100-NEXT: ret; %res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in) ret float %res } @@ -116,20 +134,15 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fadd_float_reassoc( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<17>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: add.rn.f32 %r5, %r4, %r2; -; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1; -; CHECK-SM80-NEXT: add.rn.f32 %r10, %r9, %r7; -; CHECK-SM80-NEXT: add.rn.f32 %r11, %r10, %r5; -; CHECK-SM80-NEXT: add.rn.f32 %r12, %r3, %r1; -; CHECK-SM80-NEXT: add.rn.f32 %r13, %r8, %r6; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0]; +; CHECK-SM80-NEXT: add.rn.f32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: add.rn.f32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: add.rn.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: 
add.rn.f32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: add.rn.f32 %r13, %r1, %r5; ; CHECK-SM80-NEXT: add.rn.f32 %r14, %r13, %r12; ; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r11; ; CHECK-SM80-NEXT: add.rn.f32 %r16, %r15, 0f00000000; @@ -272,27 +285,44 @@ define half @reduce_fmul_half_reassoc_nonpow2(<7 x half> %in) { } define float @reduce_fmul_float(<8 x float> %in) { -; CHECK-LABEL: reduce_fmul_float( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_param_0+16]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_param_0]; -; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2; -; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1; -; CHECK-NEXT: mul.rn.f32 %r9, %r7, %r8; -; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r5; -; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r6; -; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r3; -; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r4; -; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r1; -; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmul_float( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0]; +; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r1, %r2; +; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r9, %r3; +; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r10, %r4; +; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r11, %r5; +; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r12, %r6; +; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r13, %r7; +; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r8; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmul_float( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd2; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd1; +; CHECK-SM100-NEXT: mul.rn.f32 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mul.rn.f32 %r10, %r9, %r5; +; CHECK-SM100-NEXT: mul.rn.f32 %r11, %r10, %r6; +; CHECK-SM100-NEXT: mul.rn.f32 %r12, %r11, %r3; +; CHECK-SM100-NEXT: mul.rn.f32 %r13, %r12, %r4; +; CHECK-SM100-NEXT: mul.rn.f32 %r14, %r13, %r1; +; CHECK-SM100-NEXT: mul.rn.f32 %r15, %r14, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in) ret float %res } @@ -301,20 +331,15 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmul_float_reassoc( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: 
mul.rn.f32 %r5, %r4, %r2; -; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1; -; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r9, %r7; -; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r10, %r5; -; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r3, %r1; -; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r8, %r6; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0]; +; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r1, %r5; ; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r13, %r12; ; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r11; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; @@ -495,15 +520,10 @@ define float @reduce_fmax_float(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmax_float( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0]; ; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9; @@ -540,15 +560,10 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmax_float_reassoc( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0]; ; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9; @@ -620,15 +635,10 @@ define float @reduce_fmax_float_nnan(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmax_float_nnan( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_nnan_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_nnan_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_nnan_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_nnan_param_0]; ; 
CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9; @@ -809,15 +819,10 @@ define float @reduce_fmin_float(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmin_float( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0]; ; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9; @@ -854,15 +859,10 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmin_float_reassoc( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0]; ; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9; @@ -934,15 +934,10 @@ define float @reduce_fmin_float_nnan(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmin_float_nnan( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_nnan_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_nnan_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_nnan_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_nnan_param_0]; ; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9; @@ -1078,15 +1073,10 @@ define float @reduce_fmaximum_float(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmaximum_float( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: 
ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0]; ; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r9; @@ -1123,15 +1113,10 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmaximum_float_reassoc( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0]; ; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r9; @@ -1267,15 +1252,10 @@ define float @reduce_fminimum_float(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fminimum_float( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0]; ; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r9; @@ -1312,15 +1292,10 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fminimum_float_reassoc( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0]; ; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r7, %r8; ; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r5, %r6; ; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r9; diff --git a/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll b/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll index 7c9487b..38c9234 100644 --- a/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll +++ b/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: 
--version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} declare float @llvm.nvvm.redux.sync.fmin(float, i32) define float @redux_sync_fmin(float %src, i32 %mask) { diff --git a/llvm/test/CodeGen/NVPTX/redux-sync.ll b/llvm/test/CodeGen/NVPTX/redux-sync.ll index bd1c7f5..90b2308 100644 --- a/llvm/test/CodeGen/NVPTX/redux-sync.ll +++ b/llvm/test/CodeGen/NVPTX/redux-sync.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare i32 @llvm.nvvm.redux.sync.umin(i32, i32) ; CHECK-LABEL: .func{{.*}}redux_sync_min_u32 diff --git a/llvm/test/CodeGen/NVPTX/reg-types.ll b/llvm/test/CodeGen/NVPTX/reg-types.ll index ea45bfd..f9b4f6b 100644 --- a/llvm/test/CodeGen/NVPTX/reg-types.ll +++ b/llvm/test/CodeGen/NVPTX/reg-types.ll @@ -3,7 +3,7 @@ ; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=NO8BIT ; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=NO8BIT -; RUN: %if ptxas && !ptxas-12.0 %{ llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; CHECK-LABEL: .visible .func func( diff --git a/llvm/test/CodeGen/NVPTX/setmaxnreg-sm100a.ll b/llvm/test/CodeGen/NVPTX/setmaxnreg-sm100a.ll index fecc286..cb62314 100644 --- a/llvm/test/CodeGen/NVPTX/setmaxnreg-sm100a.ll +++ b/llvm/test/CodeGen/NVPTX/setmaxnreg-sm100a.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; CHECK-LABEL: test_set_maxn_reg_sm100a define void @test_set_maxn_reg_sm100a() { diff --git a/llvm/test/CodeGen/NVPTX/setmaxnreg.ll b/llvm/test/CodeGen/NVPTX/setmaxnreg.ll index 5b266e8..cca603a 100644 --- a/llvm/test/CodeGen/NVPTX/setmaxnreg.ll +++ b/llvm/test/CodeGen/NVPTX/setmaxnreg.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90a -mattr=+ptx80| FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90a -mattr=+ptx80| %ptxas-verify -arch=sm_90a %} +; RUN: %if ptxas-sm_90a && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90a -mattr=+ptx80| %ptxas-verify -arch=sm_90a %} declare void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 %reg_count) declare void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 %reg_count) diff --git a/llvm/test/CodeGen/NVPTX/sext-setcc.ll b/llvm/test/CodeGen/NVPTX/sext-setcc.ll index 97918a6..9c028c2 100644 --- a/llvm/test/CodeGen/NVPTX/sext-setcc.ll +++ b/llvm/test/CodeGen/NVPTX/sext-setcc.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s 
-mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} define <2 x i16> @sext_setcc_v2i1_to_v2i16(ptr %p) { ; CHECK-LABEL: sext_setcc_v2i1_to_v2i16( diff --git a/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll b/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll index 9cf3a1d..dfc6e96 100644 --- a/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll +++ b/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} declare {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32, i32, i32, i32) declare {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32, float, i32, i32) diff --git a/llvm/test/CodeGen/NVPTX/shfl-sync.ll b/llvm/test/CodeGen/NVPTX/shfl-sync.ll index 0c826d2..139c1e6 100644 --- a/llvm/test/CodeGen/NVPTX/shfl-sync.ll +++ b/llvm/test/CodeGen/NVPTX/shfl-sync.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32) declare float @llvm.nvvm.shfl.sync.down.f32(float, i32, i32, i32) diff --git a/llvm/test/CodeGen/NVPTX/shift-opt.ll b/llvm/test/CodeGen/NVPTX/shift-opt.ll index e7866b0..e0d22c6 100644 --- a/llvm/test/CodeGen/NVPTX/shift-opt.ll +++ b/llvm/test/CodeGen/NVPTX/shift-opt.ll @@ -71,18 +71,17 @@ define <2 x i16> @test_vec(<2 x i16> %x, <2 x i8> %y) { ; CHECK-LABEL: test_vec( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<7>; -; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_vec_param_0]; ; CHECK-NEXT: ld.param.v2.b8 {%rs3, %rs4}, [test_vec_param_1]; ; CHECK-NEXT: mov.b32 %r1, {%rs3, %rs4}; -; CHECK-NEXT: and.b32 %r2, %r1, 16711935; ; CHECK-NEXT: shr.u16 %rs5, %rs2, 5; ; CHECK-NEXT: shr.u16 %rs6, %rs1, 5; -; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; CHECK-NEXT: or.b32 %r4, %r3, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: mov.b32 %r2, {%rs6, %rs5}; +; CHECK-NEXT: or.b32 %r3, %r2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %ext = zext <2 x i8> %y to <2 x i16> %shl = shl <2 x i16> %ext, splat(i16 5) diff --git a/llvm/test/CodeGen/NVPTX/short-ptr.ll b/llvm/test/CodeGen/NVPTX/short-ptr.ll index eb05895..7cf7ff7 100644 --- a/llvm/test/CodeGen/NVPTX/short-ptr.ll +++ b/llvm/test/CodeGen/NVPTX/short-ptr.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix CHECK-DEFAULT-32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -nvptx-short-ptr | FileCheck %s --check-prefixes CHECK-SHORT-SHARED,CHECK-SHORT-CONST,CHECK-SHORT-LOCAL -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -nvptx-short-ptr | %ptxas-verify %} diff --git 
a/llvm/test/CodeGen/NVPTX/simple-call.ll b/llvm/test/CodeGen/NVPTX/simple-call.ll index 991ae04..ddc430e 100644 --- a/llvm/test/CodeGen/NVPTX/simple-call.ll +++ b/llvm/test/CodeGen/NVPTX/simple-call.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} ; CHECK: .func ({{.*}}) device_func diff --git a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll index 3989c8e3..7e4e701 100644 --- a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll +++ b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll @@ -13,7 +13,7 @@ declare double @llvm.sqrt.f64(double) ; -- reciprocal sqrt -- -define float @test_rsqrt32(float %a) #0 { +define float @test_rsqrt32(float %a) { ; CHECK-LABEL: test_rsqrt32( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; @@ -28,7 +28,7 @@ define float @test_rsqrt32(float %a) #0 { ret float %ret } -define float @test_rsqrt_ftz(float %a) #0 #1 { +define float @test_rsqrt_ftz(float %a) #1 { ; CHECK-LABEL: test_rsqrt_ftz( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; @@ -76,7 +76,7 @@ define double @test_rsqrt64_ftz(double %a) #1 { ; -- sqrt -- -define float @test_sqrt32(float %a) #0 { +define float @test_sqrt32(float %a) { ; CHECK-LABEL: test_sqrt32( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; @@ -90,7 +90,7 @@ define float @test_sqrt32(float %a) #0 { ret float %ret } -define float @test_sqrt32_ninf(float %a) #0 { +define float @test_sqrt32_ninf(float %a) { ; CHECK-LABEL: test_sqrt32_ninf( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; @@ -108,7 +108,7 @@ define float @test_sqrt32_ninf(float %a) #0 { ret float %ret } -define float @test_sqrt_ftz(float %a) #0 #1 { +define float @test_sqrt_ftz(float %a) #1 { ; CHECK-LABEL: test_sqrt_ftz( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; @@ -122,7 +122,7 @@ define float @test_sqrt_ftz(float %a) #0 #1 { ret float %ret } -define float @test_sqrt_ftz_ninf(float %a) #0 #1 { +define float @test_sqrt_ftz_ninf(float %a) #1 { ; CHECK-LABEL: test_sqrt_ftz_ninf( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; @@ -139,7 +139,7 @@ define float @test_sqrt_ftz_ninf(float %a) #0 #1 { ret float %ret } -define double @test_sqrt64(double %a) #0 { +define double @test_sqrt64(double %a) { ; CHECK-LABEL: test_sqrt64( ; CHECK: { ; CHECK-NEXT: .reg .b64 %rd<3>; @@ -156,7 +156,7 @@ define double @test_sqrt64(double %a) #0 { ; There's no sqrt.approx.f64 instruction; we emit ; reciprocal(rsqrt.approx.f64(x)). There's no non-ftz approximate reciprocal, ; so we just use the ftz version. -define double @test_sqrt64_ninf(double %a) #0 { +define double @test_sqrt64_ninf(double %a) { ; CHECK-LABEL: test_sqrt64_ninf( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; @@ -175,7 +175,7 @@ define double @test_sqrt64_ninf(double %a) #0 { ret double %ret } -define double @test_sqrt64_ftz(double %a) #0 #1 { +define double @test_sqrt64_ftz(double %a) #1 { ; CHECK-LABEL: test_sqrt64_ftz( ; CHECK: { ; CHECK-NEXT: .reg .b64 %rd<3>; @@ -190,7 +190,7 @@ define double @test_sqrt64_ftz(double %a) #0 #1 { } ; There's no sqrt.approx.ftz.f64 instruction; we just use the non-ftz version. 
-define double @test_sqrt64_ftz_ninf(double %a) #0 #1 { +define double @test_sqrt64_ftz_ninf(double %a) #1 { ; CHECK-LABEL: test_sqrt64_ftz_ninf( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; @@ -214,7 +214,7 @@ define double @test_sqrt64_ftz_ninf(double %a) #0 #1 { ; The sqrt and rsqrt refinement algorithms both emit an rsqrt.approx, followed ; by some math. -define float @test_rsqrt32_refined(float %a) #0 #2 { +define float @test_rsqrt32_refined(float %a) #2 { ; CHECK-LABEL: test_rsqrt32_refined( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<7>; @@ -229,11 +229,11 @@ define float @test_rsqrt32_refined(float %a) #0 #2 { ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; %val = tail call float @llvm.sqrt.f32(float %a) - %ret = fdiv arcp float 1.0, %val + %ret = fdiv arcp contract float 1.0, %val ret float %ret } -define float @test_sqrt32_refined(float %a) #0 #2 { +define float @test_sqrt32_refined(float %a) #2 { ; CHECK-LABEL: test_sqrt32_refined( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; @@ -247,7 +247,7 @@ define float @test_sqrt32_refined(float %a) #0 #2 { ret float %ret } -define float @test_sqrt32_refined_ninf(float %a) #0 #2 { +define float @test_sqrt32_refined_ninf(float %a) #2 { ; CHECK-LABEL: test_sqrt32_refined_ninf( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; @@ -265,11 +265,11 @@ define float @test_sqrt32_refined_ninf(float %a) #0 #2 { ; CHECK-NEXT: selp.f32 %r8, 0f00000000, %r6, %p1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-NEXT: ret; - %ret = tail call ninf afn float @llvm.sqrt.f32(float %a) + %ret = tail call ninf afn contract float @llvm.sqrt.f32(float %a) ret float %ret } -define double @test_rsqrt64_refined(double %a) #0 #2 { +define double @test_rsqrt64_refined(double %a) #2 { ; CHECK-LABEL: test_rsqrt64_refined( ; CHECK: { ; CHECK-NEXT: .reg .b64 %rd<7>; @@ -284,11 +284,11 @@ define double @test_rsqrt64_refined(double %a) #0 #2 { ; CHECK-NEXT: st.param.b64 [func_retval0], %rd6; ; CHECK-NEXT: ret; %val = tail call double @llvm.sqrt.f64(double %a) - %ret = fdiv arcp double 1.0, %val + %ret = fdiv arcp contract double 1.0, %val ret double %ret } -define double @test_sqrt64_refined(double %a) #0 #2 { +define double @test_sqrt64_refined(double %a) #2 { ; CHECK-LABEL: test_sqrt64_refined( ; CHECK: { ; CHECK-NEXT: .reg .b64 %rd<3>; @@ -302,7 +302,7 @@ define double @test_sqrt64_refined(double %a) #0 #2 { ret double %ret } -define double @test_sqrt64_refined_ninf(double %a) #0 #2 { +define double @test_sqrt64_refined_ninf(double %a) #2 { ; CHECK-LABEL: test_sqrt64_refined_ninf( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; @@ -320,13 +320,13 @@ define double @test_sqrt64_refined_ninf(double %a) #0 #2 { ; CHECK-NEXT: selp.f64 %rd8, 0d0000000000000000, %rd6, %p1; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd8; ; CHECK-NEXT: ret; - %ret = tail call ninf afn double @llvm.sqrt.f64(double %a) + %ret = tail call ninf afn contract double @llvm.sqrt.f64(double %a) ret double %ret } ; -- refined sqrt and rsqrt with ftz enabled -- -define float @test_rsqrt32_refined_ftz(float %a) #0 #1 #2 { +define float @test_rsqrt32_refined_ftz(float %a) #1 #2 { ; CHECK-LABEL: test_rsqrt32_refined_ftz( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<7>; @@ -341,11 +341,11 @@ define float @test_rsqrt32_refined_ftz(float %a) #0 #1 #2 { ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; %val = tail call float @llvm.sqrt.f32(float %a) - %ret = fdiv arcp float 1.0, %val + %ret = fdiv arcp contract float 1.0, %val ret float %ret } -define float @test_sqrt32_refined_ftz(float %a) #0 #1 
#2 { +define float @test_sqrt32_refined_ftz(float %a) #1 #2 { ; CHECK-LABEL: test_sqrt32_refined_ftz( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; @@ -359,7 +359,7 @@ define float @test_sqrt32_refined_ftz(float %a) #0 #1 #2 { ret float %ret } -define float @test_sqrt32_refined_ftz_ninf(float %a) #0 #1 #2 { +define float @test_sqrt32_refined_ftz_ninf(float %a) #1 #2 { ; CHECK-LABEL: test_sqrt32_refined_ftz_ninf( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; @@ -376,12 +376,12 @@ define float @test_sqrt32_refined_ftz_ninf(float %a) #0 #1 #2 { ; CHECK-NEXT: selp.f32 %r7, 0f00000000, %r6, %p1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NEXT: ret; - %ret = tail call ninf afn float @llvm.sqrt.f32(float %a) + %ret = tail call ninf afn contract float @llvm.sqrt.f32(float %a) ret float %ret } ; There's no rsqrt.approx.ftz.f64, so we just use the non-ftz version. -define double @test_rsqrt64_refined_ftz(double %a) #0 #1 #2 { +define double @test_rsqrt64_refined_ftz(double %a) #1 #2 { ; CHECK-LABEL: test_rsqrt64_refined_ftz( ; CHECK: { ; CHECK-NEXT: .reg .b64 %rd<7>; @@ -396,11 +396,11 @@ define double @test_rsqrt64_refined_ftz(double %a) #0 #1 #2 { ; CHECK-NEXT: st.param.b64 [func_retval0], %rd6; ; CHECK-NEXT: ret; %val = tail call double @llvm.sqrt.f64(double %a) - %ret = fdiv arcp double 1.0, %val + %ret = fdiv arcp contract double 1.0, %val ret double %ret } -define double @test_sqrt64_refined_ftz(double %a) #0 #1 #2 { +define double @test_sqrt64_refined_ftz(double %a) #1 #2 { ; CHECK-LABEL: test_sqrt64_refined_ftz( ; CHECK: { ; CHECK-NEXT: .reg .b64 %rd<3>; @@ -414,7 +414,7 @@ define double @test_sqrt64_refined_ftz(double %a) #0 #1 #2 { ret double %ret } -define double @test_sqrt64_refined_ftz_ninf(double %a) #0 #1 #2 { +define double @test_sqrt64_refined_ftz_ninf(double %a) #1 #2 { ; CHECK-LABEL: test_sqrt64_refined_ftz_ninf( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; @@ -432,10 +432,9 @@ define double @test_sqrt64_refined_ftz_ninf(double %a) #0 #1 #2 { ; CHECK-NEXT: selp.f64 %rd8, 0d0000000000000000, %rd6, %p1; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd8; ; CHECK-NEXT: ret; - %ret = tail call ninf afn double @llvm.sqrt.f64(double %a) + %ret = tail call ninf afn contract double @llvm.sqrt.f64(double %a) ret double %ret } -attributes #0 = { "unsafe-fp-math" = "true" } attributes #1 = { "denormal-fp-math-f32" = "preserve-sign,preserve-sign" } attributes #2 = { "reciprocal-estimates" = "rsqrtf:1,rsqrtd:1,sqrtf:1,sqrtd:1" } diff --git a/llvm/test/CodeGen/NVPTX/st-addrspace.ll b/llvm/test/CodeGen/NVPTX/st-addrspace.ll index 1e0e75a..a229389 100644 --- a/llvm/test/CodeGen/NVPTX/st-addrspace.ll +++ b/llvm/test/CodeGen/NVPTX/st-addrspace.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G32,LS32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G64,LS64 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s --check-prefixes=G64,LS32 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %} diff --git a/llvm/test/CodeGen/NVPTX/st-generic.ll b/llvm/test/CodeGen/NVPTX/st-generic.ll index 950da93..a7aa092 100644 --- a/llvm/test/CodeGen/NVPTX/st-generic.ll +++ b/llvm/test/CodeGen/NVPTX/st-generic.ll @@ -1,6 +1,6 @@ 
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ;; i8 diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll index f90435a..a07e1d5 100644 --- a/llvm/test/CodeGen/NVPTX/st-param-imm.ll +++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -verify-machineinstrs | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -verify-machineinstrs | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -verify-machineinstrs | %ptxas-verify %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/st_bulk.ll b/llvm/test/CodeGen/NVPTX/st_bulk.ll index 944f221..5c4b5ba 100644 --- a/llvm/test/CodeGen/NVPTX/st_bulk.ll +++ b/llvm/test/CodeGen/NVPTX/st_bulk.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK,CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100 %} declare void @llvm.nvvm.st.bulk(ptr, i64, i64) define void @st_bulk(ptr %dest_addr, i64 %size) { diff --git a/llvm/test/CodeGen/NVPTX/stacksaverestore.ll b/llvm/test/CodeGen/NVPTX/stacksaverestore.ll index 802ae26..a32f88c 100644 --- a/llvm/test/CodeGen/NVPTX/stacksaverestore.ll +++ b/llvm/test/CodeGen/NVPTX/stacksaverestore.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 -mattr=+ptx73 | FileCheck %s --check-prefix=CHECK-32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 -mattr=+ptx73 | FileCheck %s --check-prefix=CHECK-64 ; RUN: llc < %s -mtriple=nvptx64 -nvptx-short-ptr -mcpu=sm_60 -mattr=+ptx73 | FileCheck %s --check-prefix=CHECK-MIXED -; RUN: %if ptxas && ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 -mattr=+ptx73 | %ptxas-verify %} +; RUN: %if ptxas-sm_60 && ptxas-isa-7.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 -mattr=+ptx73 | %ptxas-verify -arch=sm_60 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll index 8056855..d443aeb 100644 --- a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll +++ b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll @@ -37,7 +37,7 @@ define ptx_kernel void @bar(ptr %red, i32 %idx) { ; CHECK-LABEL: bar( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-NEXT: .reg .b64 %rd<4>; 
+; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [bar_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/surf-tex.py b/llvm/test/CodeGen/NVPTX/surf-tex.py index 15b220c..799ef8c 100644 --- a/llvm/test/CodeGen/NVPTX/surf-tex.py +++ b/llvm/test/CodeGen/NVPTX/surf-tex.py @@ -1,6 +1,6 @@ # RUN: %python %s --target=cuda --tests=suld,sust,tex,tld4 --gen-list=%t.list > %t-cuda.ll # RUN: llc -mcpu=sm_60 -mattr=+ptx43 %t-cuda.ll -verify-machineinstrs -o - | FileCheck %t-cuda.ll -# RUN: %if ptxas %{ llc -mcpu=sm_60 -mattr=+ptx43 %t-cuda.ll -verify-machineinstrs -o - | %ptxas-verify %} +# RUN: %if ptxas-sm_60 && ptxas-isa-4.3 %{ llc -mcpu=sm_60 -mattr=+ptx43 %t-cuda.ll -verify-machineinstrs -o - | %ptxas-verify -arch=sm_60 %} # We only need to run this second time for texture tests, because # there is a difference between unified and non-unified intrinsics. diff --git a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll index abc2ea8..c0ced65 100644 --- a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll +++ b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll @@ -34,7 +34,6 @@ define ptx_kernel void @bar(i32 %val, i32 %idx) { ; CHECK-LABEL: bar( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [bar_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/symbol-naming.ll b/llvm/test/CodeGen/NVPTX/symbol-naming.ll index 941378f..8053b22 100644 --- a/llvm/test/CodeGen/NVPTX/symbol-naming.ll +++ b/llvm/test/CodeGen/NVPTX/symbol-naming.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx -mattr=+ptx60 -mcpu=sm_30 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} ; Verify that the NVPTX target removes invalid symbol names prior to emitting ; PTX. 
diff --git a/llvm/test/CodeGen/NVPTX/szext.ll b/llvm/test/CodeGen/NVPTX/szext.ll
index 5a4fe4e..a245279 100644
--- a/llvm/test/CodeGen/NVPTX/szext.ll
+++ b/llvm/test/CodeGen/NVPTX/szext.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %}
+; RUN: %if ptxas-sm_70 && ptxas-isa-7.6 %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %}

 target triple = "nvptx64-unknown-cuda"

diff --git a/llvm/test/CodeGen/NVPTX/tanhf.ll b/llvm/test/CodeGen/NVPTX/tanhf.ll
index 6f4eb22..94ed44c 100644
--- a/llvm/test/CodeGen/NVPTX/tanhf.ll
+++ b/llvm/test/CodeGen/NVPTX/tanhf.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck %s
-; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
+; RUN: %if ptxas-sm_75 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}

 target triple = "nvptx64-nvidia-cuda"

diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
index 9c60af9..308e7e49 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s
-; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
-; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}

 declare void @llvm.nvvm.tcgen05.alloc.cg1(ptr %addr, i32 %ncols)
 declare void @llvm.nvvm.tcgen05.alloc.cg2(ptr %addr, i32 %ncols)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
index cc3b359..ec73b34 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s
-; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
-; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}

 declare void @llvm.nvvm.tcgen05.commit.cg1(ptr %bar_addr)
 declare void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
index 780116c..14a7892 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
-; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}

 ; CHECK-LABEL: test_tcgen05_cp_64x128_v1
 define void @test_tcgen05_cp_64x128_v1(ptr addrspace(6) %addr, i64 %sdesc) {
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll
index 07c6267..fe4719c 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
-; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}

 declare void @llvm.nvvm.tcgen05.fence.before.thread.sync()
 declare void @llvm.nvvm.tcgen05.fence.after.thread.sync()
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
index 7e65338..16710b4 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
 ; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
-; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %}
-; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %}
+; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %}

 ; CHECK-LABEL: nvvm_tcgen05_ld_16x64b
 define void @nvvm_tcgen05_ld_16x64b(ptr addrspace(6) %taddr) {
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
index 590d755..a5b87f3 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
-; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}

 declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr)
 declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
index c323a54..a33ec85 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
 ; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
-; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
-; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %}
+; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %}

 ; CHECK-LABEL: nvvm_tcgen05_st_16x64b
 define void @nvvm_tcgen05_st_16x64b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i32> %stv2, <4 x i32> %stv4, <8 x i32> %stv8, <16 x i32> %stv16, <32 x i32> %stv32, <64 x i32> %stv64, <128 x i32> %stv128) {
diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
index 3138d7c..20f6e2e 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -37,7 +37,7 @@ define ptx_kernel void @bar(ptr %red, i32 %idx) {
 ; CHECK-LABEL: bar(
 ; CHECK: {
 ; CHECK-NEXT: .reg .b32 %r<6>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.b64 %rd1, [bar_param_0];
@@ -58,7 +58,7 @@ declare float @texfunc(i64)
 define ptx_kernel void @baz(ptr %red, i32 %idx) {
 ; CHECK-LABEL: baz(
 ; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b32 %r<8>;
 ; CHECK-NEXT: .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
@@ -74,8 +74,8 @@ define ptx_kernel void @baz(ptr %red, i32 %idx) {
 ; CHECK-NEXT: call.uni (retval0), texfunc, (param0);
 ; CHECK-NEXT: ld.param.b32 %r6, [retval0];
 ; CHECK-NEXT: } // callseq 0
-; CHECK-NEXT: add.rn.f32 %r8, %r2, %r6;
-; CHECK-NEXT: st.global.b32 [%rd2], %r8;
+; CHECK-NEXT: add.rn.f32 %r7, %r2, %r6;
+; CHECK-NEXT: st.global.b32 [%rd2], %r7;
 ; CHECK-NEXT: ret;
 %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @tex0)
 %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx)
diff --git a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll
index 4edbec4..c5299046 100644
--- a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll
+++ b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll
@@ -35,7 +35,6 @@ define i32 @t1() {
 ; CHECK-LABEL: t1(
 ; CHECK: {
 ; CHECK-NEXT: .reg .b32 %r<2>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: txq.width.b32 %r1, [tex0];
@@ -66,7 +65,6 @@ define i32 @t3() {
 ; CHECK-LABEL: t3(
 ; CHECK: {
 ; CHECK-NEXT: .reg .b32 %r<2>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: txq.height.b32 %r1, [tex0];
@@ -97,7 +95,6 @@ define i32 @s1() {
 ; CHECK-LABEL: s1(
 ; CHECK: {
 ; CHECK-NEXT: .reg .b32 %r<2>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: suq.width.b32 %r1, [surf0];
@@ -128,7 +125,6 @@ define i32 @s3() {
 ; CHECK-LABEL: s3(
 ; CHECK: {
 ; CHECK-NEXT: .reg .b32 %r<2>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: suq.height.b32 %r1, [surf0];
diff --git a/llvm/test/CodeGen/NVPTX/trunc-setcc.ll b/llvm/test/CodeGen/NVPTX/trunc-setcc.ll
index f22e37e..f6a1c6b 100644
--- a/llvm/test/CodeGen/NVPTX/trunc-setcc.ll
+++ b/llvm/test/CodeGen/NVPTX/trunc-setcc.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mcpu=sm_50 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mcpu=sm_50 | %ptxas-verify -arch=sm_50 %}
+; RUN: %if ptxas-sm_50 %{ llc < %s -mcpu=sm_50 | %ptxas-verify -arch=sm_50 %}

 target triple = "nvptx64-nvidia-cuda"

diff --git a/llvm/test/CodeGen/NVPTX/trunc-tofp.ll b/llvm/test/CodeGen/NVPTX/trunc-tofp.ll
index 12502b6..99a1e8a 100644
--- a/llvm/test/CodeGen/NVPTX/trunc-tofp.ll
+++ b/llvm/test/CodeGen/NVPTX/trunc-tofp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mcpu=sm_50 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mcpu=sm_50 | %ptxas-verify -arch=sm_50 %}
+; RUN: %if ptxas-sm_50 %{ llc < %s -mcpu=sm_50 | %ptxas-verify -arch=sm_50 %}

 target triple = "nvptx64-nvidia-cuda"

diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
index 697eb90..5263552 100644
--- a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
@@ -24,9 +24,9 @@
 define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
 ; CHECK-LABEL: test_s_i8i16p(
 ; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<13>;
+; CHECK-NEXT: .reg .b16 %rs<9>;
 ; CHECK-NEXT: .reg .b32 %r<2>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8i16p_param_0];
@@ -45,14 +45,14 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
 ; CHECK-NEXT: ld.param.b8 %rs4, [retval0+4];
 ; CHECK-NEXT: ld.param.b8 %rs5, [retval0+3];
 ; CHECK-NEXT: } // callseq 0
-; CHECK-NEXT: shl.b16 %rs8, %rs4, 8;
-; CHECK-NEXT: or.b16 %rs9, %rs8, %rs5;
+; CHECK-NEXT: shl.b16 %rs6, %rs4, 8;
+; CHECK-NEXT: or.b16 %rs7, %rs6, %rs5;
 ; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs5;
 ; CHECK-NEXT: st.param.b64 [func_retval0+8], %rd2;
 ; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs2;
 ; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
-; CHECK-NEXT: shr.u16 %rs12, %rs9, 8;
-; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs12;
+; CHECK-NEXT: shr.u16 %rs8, %rs7, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs8;
 ; CHECK-NEXT: ret;
 %r = tail call %s_i8i16p @test_s_i8i16p(%s_i8i16p %a)
 ret %s_i8i16p %r
@@ -62,9 +62,9 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
 define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
 ; CHECK-LABEL: test_s_i8i32p(
 ; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<24>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8i32p_param_0];
@@ -91,22 +91,22 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
 ; CHECK-NEXT: ld.param.b8 %r8, [retval0+6];
 ; CHECK-NEXT: ld.param.b8 %r9, [retval0+5];
 ; CHECK-NEXT: } // callseq 1
-; CHECK-NEXT: shl.b32 %r12, %r8, 8;
-; CHECK-NEXT: or.b32 %r13, %r12, %r9;
-; CHECK-NEXT: shl.b32 %r15, %r7, 16;
-; CHECK-NEXT: shl.b32 %r17, %r6, 24;
-; CHECK-NEXT: or.b32 %r18, %r17, %r15;
-; CHECK-NEXT: or.b32 %r19, %r18, %r13;
+; CHECK-NEXT: shl.b32 %r10, %r8, 8;
+; CHECK-NEXT: or.b32 %r11, %r10, %r9;
+; CHECK-NEXT: shl.b32 %r12, %r7, 16;
+; CHECK-NEXT: shl.b32 %r13, %r6, 24;
+; CHECK-NEXT: or.b32 %r14, %r13, %r12;
+; CHECK-NEXT: or.b32 %r15, %r14, %r11;
 ; CHECK-NEXT: st.param.b8 [func_retval0+5], %r9;
 ; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2;
 ; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
 ; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
-; CHECK-NEXT: shr.u32 %r21, %r19, 24;
-; CHECK-NEXT: st.param.b8 [func_retval0+8], %r21;
-; CHECK-NEXT: shr.u32 %r22, %r19, 16;
-; CHECK-NEXT: st.param.b8 [func_retval0+7], %r22;
-; CHECK-NEXT: shr.u32 %r23, %r19, 8;
-; CHECK-NEXT: st.param.b8 [func_retval0+6], %r23;
+; CHECK-NEXT: shr.u32 %r16, %r15, 24;
+; CHECK-NEXT: st.param.b8 [func_retval0+8], %r16;
+; CHECK-NEXT: shr.u32 %r17, %r15, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17;
+; CHECK-NEXT: shr.u32 %r18, %r15, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+6], %r18;
 ; CHECK-NEXT: ret;
 %r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a)
 ret %s_i8i32p %r
@@ -116,8 +116,8 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
 define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
 ; CHECK-LABEL: test_s_i8i64p(
 ; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<46>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b64 %rd<36>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8i64p_param_0];
@@ -144,38 +144,38 @@ define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
 ; CHECK-NEXT: ld.param.b8 %rd13, [retval0+10];
 ; CHECK-NEXT: ld.param.b8 %rd14, [retval0+9];
 ; CHECK-NEXT: } // callseq 2
-; CHECK-NEXT: shl.b64 %rd17, %rd13, 8;
-; CHECK-NEXT: or.b64 %rd18, %rd17, %rd14;
-; CHECK-NEXT: shl.b64 %rd20, %rd12, 16;
-; CHECK-NEXT: shl.b64 %rd22, %rd11, 24;
-; CHECK-NEXT: or.b64 %rd23, %rd22, %rd20;
-; CHECK-NEXT: or.b64 %rd24, %rd23, %rd18;
-; CHECK-NEXT: shl.b64 %rd27, %rd9, 8;
-; CHECK-NEXT: or.b64 %rd28, %rd27, %rd10;
-; CHECK-NEXT: shl.b64 %rd30, %rd8, 16;
-; CHECK-NEXT: shl.b64 %rd32, %rd7, 24;
-; CHECK-NEXT: or.b64 %rd33, %rd32, %rd30;
-; CHECK-NEXT: or.b64 %rd34, %rd33, %rd28;
-; CHECK-NEXT: shl.b64 %rd35, %rd34, 32;
-; CHECK-NEXT: or.b64 %rd36, %rd35, %rd24;
+; CHECK-NEXT: shl.b64 %rd15, %rd13, 8;
+; CHECK-NEXT: or.b64 %rd16, %rd15, %rd14;
+; CHECK-NEXT: shl.b64 %rd17, %rd12, 16;
+; CHECK-NEXT: shl.b64 %rd18, %rd11, 24;
+; CHECK-NEXT: or.b64 %rd19, %rd18, %rd17;
+; CHECK-NEXT: or.b64 %rd20, %rd19, %rd16;
+; CHECK-NEXT: shl.b64 %rd21, %rd9, 8;
+; CHECK-NEXT: or.b64 %rd22, %rd21, %rd10;
+; CHECK-NEXT: shl.b64 %rd23, %rd8, 16;
+; CHECK-NEXT: shl.b64 %rd24, %rd7, 24;
+; CHECK-NEXT: or.b64 %rd25, %rd24, %rd23;
+; CHECK-NEXT: or.b64 %rd26, %rd25, %rd22;
+; CHECK-NEXT: shl.b64 %rd27, %rd26, 32;
+; CHECK-NEXT: or.b64 %rd28, %rd27, %rd20;
 ; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd14;
 ; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd5;
 ; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs1;
 ; CHECK-NEXT: st.param.b64 [func_retval0], %rd6;
-; CHECK-NEXT: shr.u64 %rd39, %rd36, 56;
-; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd39;
-; CHECK-NEXT: shr.u64 %rd40, %rd36, 48;
-; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd40;
-; CHECK-NEXT: shr.u64 %rd41, %rd36, 40;
-; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd41;
-; CHECK-NEXT: shr.u64 %rd42, %rd36, 32;
-; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd42;
-; CHECK-NEXT: shr.u64 %rd43, %rd36, 24;
-; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd43;
-; CHECK-NEXT: shr.u64 %rd44, %rd36, 16;
-; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd44;
-; CHECK-NEXT: shr.u64 %rd45, %rd36, 8;
-; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd45;
+; CHECK-NEXT: shr.u64 %rd29, %rd28, 56;
+; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd29;
+; CHECK-NEXT: shr.u64 %rd30, %rd28, 48;
+; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd30;
+; CHECK-NEXT: shr.u64 %rd31, %rd28, 40;
+; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd31;
+; CHECK-NEXT: shr.u64 %rd32, %rd28, 32;
+; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd32;
+; CHECK-NEXT: shr.u64 %rd33, %rd28, 24;
+; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd33;
+; CHECK-NEXT: shr.u64 %rd34, %rd28, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd34;
+; CHECK-NEXT: shr.u64 %rd35, %rd28, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd35;
 ; CHECK-NEXT: ret;
 %r = tail call %s_i8i64p @test_s_i8i64p(%s_i8i64p %a)
 ret %s_i8i64p %r
@@ -185,8 +185,8 @@ define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
 define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
 ; CHECK-LABEL: test_s_i8f16p(
 ; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<15>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b16 %rs<11>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8f16p_param_0];
@@ -207,14 +207,14 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
 ; CHECK-NEXT: ld.param.b8 %rs6, [retval0+4];
 ; CHECK-NEXT: ld.param.b8 %rs7, [retval0+3];
 ; CHECK-NEXT: } // callseq 3
-; CHECK-NEXT: shl.b16 %rs10, %rs6, 8;
-; CHECK-NEXT: or.b16 %rs11, %rs10, %rs7;
+; CHECK-NEXT: shl.b16 %rs8, %rs6, 8;
+; CHECK-NEXT: or.b16 %rs9, %rs8, %rs7;
 ; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs7;
 ; CHECK-NEXT: st.param.b64 [func_retval0+8], %rd2;
 ; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs4;
 ; CHECK-NEXT: st.param.b16 [func_retval0], %rs5;
-; CHECK-NEXT: shr.u16 %rs14, %rs11, 8;
-; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs14;
+; CHECK-NEXT: shr.u16 %rs10, %rs9, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs10;
 ; CHECK-NEXT: ret;
 %r = tail call %s_i8f16p @test_s_i8f16p(%s_i8f16p %a)
 ret %s_i8f16p %r
@@ -224,9 +224,9 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
 define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
 ; CHECK-LABEL: test_s_i8f16x2p(
 ; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<24>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8f16x2p_param_0];
@@ -253,22 +253,22 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
 ; CHECK-NEXT: ld.param.b8 %r8, [retval0+6];
 ; CHECK-NEXT: ld.param.b8 %r9, [retval0+5];
 ; CHECK-NEXT: } // callseq 4
-; CHECK-NEXT: shl.b32 %r12, %r8, 8;
-; CHECK-NEXT: or.b32 %r13, %r12, %r9;
-; CHECK-NEXT: shl.b32 %r15, %r7, 16;
-; CHECK-NEXT: shl.b32 %r17, %r6, 24;
-; CHECK-NEXT: or.b32 %r18, %r17, %r15;
-; CHECK-NEXT: or.b32 %r19, %r18, %r13;
+; CHECK-NEXT: shl.b32 %r10, %r8, 8;
+; CHECK-NEXT: or.b32 %r11, %r10, %r9;
+; CHECK-NEXT: shl.b32 %r12, %r7, 16;
+; CHECK-NEXT: shl.b32 %r13, %r6, 24;
+; CHECK-NEXT: or.b32 %r14, %r13, %r12;
+; CHECK-NEXT: or.b32 %r15, %r14, %r11;
 ; CHECK-NEXT: st.param.b8 [func_retval0+5], %r9;
 ; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2;
 ; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
 ; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
-; CHECK-NEXT: shr.u32 %r21, %r19, 24;
-; CHECK-NEXT: st.param.b8 [func_retval0+8], %r21;
-; CHECK-NEXT: shr.u32 %r22, %r19, 16;
-; CHECK-NEXT: st.param.b8 [func_retval0+7], %r22;
-; CHECK-NEXT: shr.u32 %r23, %r19, 8;
-; CHECK-NEXT: st.param.b8 [func_retval0+6], %r23;
+; CHECK-NEXT: shr.u32 %r16, %r15, 24;
+; CHECK-NEXT: st.param.b8 [func_retval0+8], %r16;
+; CHECK-NEXT: shr.u32 %r17, %r15, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17;
+; CHECK-NEXT: shr.u32 %r18, %r15, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+6], %r18;
 ; CHECK-NEXT: ret;
 %r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a)
 ret %s_i8f16x2p %r
@@ -278,9 +278,9 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
 define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 ; CHECK-LABEL: test_s_i8f32p(
 ; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<24>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8f32p_param_0];
@@ -307,22 +307,22 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 ; CHECK-NEXT: ld.param.b8 %r8, [retval0+6];
 ; CHECK-NEXT: ld.param.b8 %r9, [retval0+5];
 ; CHECK-NEXT: } // callseq 5
-; CHECK-NEXT: shl.b32 %r12, %r8, 8;
-; CHECK-NEXT: or.b32 %r13, %r12, %r9;
-; CHECK-NEXT: shl.b32 %r15, %r7, 16;
-; CHECK-NEXT: shl.b32 %r17, %r6, 24;
-; CHECK-NEXT: or.b32 %r18, %r17, %r15;
-; CHECK-NEXT: or.b32 %r19, %r18, %r13;
+; CHECK-NEXT: shl.b32 %r10, %r8, 8;
+; CHECK-NEXT: or.b32 %r11, %r10, %r9;
+; CHECK-NEXT: shl.b32 %r12, %r7, 16;
+; CHECK-NEXT: shl.b32 %r13, %r6, 24;
+; CHECK-NEXT: or.b32 %r14, %r13, %r12;
+; CHECK-NEXT: or.b32 %r15, %r14, %r11;
 ; CHECK-NEXT: st.param.b8 [func_retval0+5], %r9;
 ; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2;
 ; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
 ; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
-; CHECK-NEXT: shr.u32 %r21, %r19, 24;
-; CHECK-NEXT: st.param.b8 [func_retval0+8], %r21;
-; CHECK-NEXT: shr.u32 %r22, %r19, 16;
-; CHECK-NEXT: st.param.b8 [func_retval0+7], %r22;
-; CHECK-NEXT: shr.u32 %r23, %r19, 8;
-; CHECK-NEXT: st.param.b8 [func_retval0+6], %r23;
+; CHECK-NEXT: shr.u32 %r16, %r15, 24;
+; CHECK-NEXT: st.param.b8 [func_retval0+8], %r16;
+; CHECK-NEXT: shr.u32 %r17, %r15, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17;
+; CHECK-NEXT: shr.u32 %r18, %r15, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+6], %r18;
 ; CHECK-NEXT: ret;
 %r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a)
 ret %s_i8f32p %r
@@ -332,8 +332,8 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) {
 ; CHECK-LABEL: test_s_i8f64p(
 ; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<46>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b64 %rd<36>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f64p_param_0];
@@ -360,38 +360,38 @@ define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) {
 ; CHECK-NEXT: ld.param.b8 %rd13, [retval0+10];
 ; CHECK-NEXT: ld.param.b8 %rd14, [retval0+9];
 ; CHECK-NEXT: } // callseq 6
-; CHECK-NEXT: shl.b64 %rd17, %rd13, 8;
-; CHECK-NEXT: or.b64 %rd18, %rd17, %rd14;
-; CHECK-NEXT: shl.b64 %rd20, %rd12, 16;
-; CHECK-NEXT: shl.b64 %rd22, %rd11, 24;
-; CHECK-NEXT: or.b64 %rd23, %rd22, %rd20;
-; CHECK-NEXT: or.b64 %rd24, %rd23, %rd18;
-; CHECK-NEXT: shl.b64 %rd27, %rd9, 8;
-; CHECK-NEXT: or.b64 %rd28, %rd27, %rd10;
-; CHECK-NEXT: shl.b64 %rd30, %rd8, 16;
-; CHECK-NEXT: shl.b64 %rd32, %rd7, 24;
-; CHECK-NEXT: or.b64 %rd33, %rd32, %rd30;
-; CHECK-NEXT: or.b64 %rd34, %rd33, %rd28;
-; CHECK-NEXT: shl.b64 %rd35, %rd34, 32;
-; CHECK-NEXT: or.b64 %rd36, %rd35, %rd24;
+; CHECK-NEXT: shl.b64 %rd15, %rd13, 8;
+; CHECK-NEXT: or.b64 %rd16, %rd15, %rd14;
+; CHECK-NEXT: shl.b64 %rd17, %rd12, 16;
+; CHECK-NEXT: shl.b64 %rd18, %rd11, 24;
+; CHECK-NEXT: or.b64 %rd19, %rd18, %rd17;
+; CHECK-NEXT: or.b64 %rd20, %rd19, %rd16;
+; CHECK-NEXT: shl.b64 %rd21, %rd9, 8;
+; CHECK-NEXT: or.b64 %rd22, %rd21, %rd10;
+; CHECK-NEXT: shl.b64 %rd23, %rd8, 16;
+; CHECK-NEXT: shl.b64 %rd24, %rd7, 24;
+; CHECK-NEXT: or.b64 %rd25, %rd24, %rd23;
+; CHECK-NEXT: or.b64 %rd26, %rd25, %rd22;
+; CHECK-NEXT: shl.b64 %rd27, %rd26, 32;
+; CHECK-NEXT: or.b64 %rd28, %rd27, %rd20;
 ; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd14;
 ; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd5;
 ; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs1;
 ; CHECK-NEXT: st.param.b64 [func_retval0], %rd6;
-; CHECK-NEXT: shr.u64 %rd39, %rd36, 56;
-; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd39;
-; CHECK-NEXT: shr.u64 %rd40, %rd36, 48;
-; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd40;
-; CHECK-NEXT: shr.u64 %rd41, %rd36, 40;
-; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd41;
-; CHECK-NEXT: shr.u64 %rd42, %rd36, 32;
-; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd42;
-; CHECK-NEXT: shr.u64 %rd43, %rd36, 24;
-; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd43;
-; CHECK-NEXT: shr.u64 %rd44, %rd36, 16;
-; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd44;
-; CHECK-NEXT: shr.u64 %rd45, %rd36, 8;
-; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd45;
+; CHECK-NEXT: shr.u64 %rd29, %rd28, 56;
+; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd29;
+; CHECK-NEXT: shr.u64 %rd30, %rd28, 48;
+; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd30;
+; CHECK-NEXT: shr.u64 %rd31, %rd28, 40;
+; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd31;
+; CHECK-NEXT: shr.u64 %rd32, %rd28, 32;
+; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd32;
+; CHECK-NEXT: shr.u64 %rd33, %rd28, 24;
+; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd33;
+; CHECK-NEXT: shr.u64 %rd34, %rd28, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd34;
+; CHECK-NEXT: shr.u64 %rd35, %rd28, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd35;
 ; CHECK-NEXT: ret;
 %r = tail call %s_i8f64p @test_s_i8f64p(%s_i8f64p %a)
 ret %s_i8f64p %r
diff --git a/llvm/test/CodeGen/NVPTX/unreachable.ll b/llvm/test/CodeGen/NVPTX/unreachable.ll
index 618c7ed..0b65ef8 100644
--- a/llvm/test/CodeGen/NVPTX/unreachable.ll
+++ b/llvm/test/CodeGen/NVPTX/unreachable.ll
@@ -13,7 +13,7 @@
 ; RUN: | FileCheck %s --check-prefixes=CHECK,TRAP
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs -trap-unreachable -mattr=+ptx83 \
 ; RUN: | FileCheck %s --check-prefixes=BUG-FIXED
-; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}

 target triple = "nvptx-unknown-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
index 84c7a12..80fd47f8 100644
--- a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
+++ b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
@@ -96,7 +96,15 @@ define void @test_cluster_dim() {
 ret void
 }

-!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12}
+define void @test_grid_constant(ptr byval(i32) %input1, i32 %input2, ptr byval(i32) %input3) {
+; CHECK-LABEL: define void @test_grid_constant(
+; CHECK-SAME: ptr byval(i32) "nvvm.grid_constant" [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr byval(i32) "nvvm.grid_constant" [[INPUT3:%.*]]) {
+; CHECK-NEXT: ret void
+;
+  ret void
+}
+
+!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13}

 !0 = !{ptr @test_align, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020010}
 !1 = !{null, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020008}
@@ -111,7 +119,8 @@ define void @test_cluster_dim() {
 !10 = !{ptr @test_maxntid_4, !"maxntidz", i32 100}
 !11 = !{ptr @test_reqntid, !"reqntidx", i32 31, !"reqntidy", i32 32, !"reqntidz", i32 33}
 !12 = !{ptr @test_cluster_dim, !"cluster_dim_x", i32 101, !"cluster_dim_y", i32 102, !"cluster_dim_z", i32 103}
-
+!13 = !{ptr @test_grid_constant, !"grid_constant", !14}
+!14 = !{i32 1, i32 3}
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="2" }
 ; CHECK: attributes #[[ATTR1]] = { "nvvm.maxclusterrank"="3" }
diff --git a/llvm/test/CodeGen/NVPTX/vaargs.ll b/llvm/test/CodeGen/NVPTX/vaargs.ll
index 9e312a2..a6b1bdd 100644
--- a/llvm/test/CodeGen/NVPTX/vaargs.ll
+++ b/llvm/test/CodeGen/NVPTX/vaargs.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -O0 -march=nvptx -mattr=+ptx60 -mcpu=sm_30 | FileCheck %s --check-prefixes=CHECK,CHECK32
 ; RUN: llc < %s -O0 -march=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | FileCheck %s --check-prefixes=CHECK,CHECK64
-; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -O0 -march=nvptx -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %}
+; RUN: %if ptxas-isa-6.0 && ptxas-ptr32 %{ llc < %s -O0 -march=nvptx -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %}
+; RUN: %if ptxas-isa-6.0 %{ llc < %s -O0 -march=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %}

 ; CHECK: .address_size [[BITS:32|64]]

diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index a9b3675..61ff806 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64-- -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 < %s | FileCheck %s --check-prefix=CHECK-PTX
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-- -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %}
+; RUN: %if ptxas-isa-6.4 %{ llc < %s -mtriple=nvptx64-- -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %}

 %struct.S1 = type { i32, i8, i64 }
 %struct.S2 = type { i64, i64 }
@@ -104,7 +104,7 @@ define dso_local i32 @foo() {
 ; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot1[40];
 ; CHECK-PTX-NEXT: .reg .b64 %SP;
 ; CHECK-PTX-NEXT: .reg .b64 %SPL;
-; CHECK-PTX-NEXT: .reg .b32 %r<3>;
+; CHECK-PTX-NEXT: .reg .b32 %r<2>;
 ; CHECK-PTX-NEXT: .reg .b64 %rd<2>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT: // %bb.0: // %entry
@@ -143,29 +143,29 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT: .reg .b64 %SPL;
 ; CHECK-PTX-NEXT: .reg .b16 %rs<4>;
 ; CHECK-PTX-NEXT: .reg .b32 %r<6>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<9>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<8>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT: // %bb.0: // %entry
 ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot2;
 ; CHECK-PTX-NEXT: ld.param.b32 %r1, [variadics2_param_0];
 ; CHECK-PTX-NEXT: ld.param.b64 %rd1, [variadics2_param_1];
-; CHECK-PTX-NEXT: add.u64 %rd3, %SPL, 0;
-; CHECK-PTX-NEXT: add.s64 %rd4, %rd1, 7;
-; CHECK-PTX-NEXT: and.b64 %rd5, %rd4, -8;
-; CHECK-PTX-NEXT: ld.b32 %r2, [%rd5];
-; CHECK-PTX-NEXT: ld.s8 %r3, [%rd5+4];
-; CHECK-PTX-NEXT: ld.b8 %rs1, [%rd5+7];
-; CHECK-PTX-NEXT: st.local.b8 [%rd3+2], %rs1;
-; CHECK-PTX-NEXT: ld.b8 %rs2, [%rd5+6];
-; CHECK-PTX-NEXT: st.local.b8 [%rd3+1], %rs2;
-; CHECK-PTX-NEXT: ld.b8 %rs3, [%rd5+5];
-; CHECK-PTX-NEXT: st.local.b8 [%rd3], %rs3;
-; CHECK-PTX-NEXT: ld.b64 %rd6, [%rd5+8];
+; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0;
+; CHECK-PTX-NEXT: add.s64 %rd3, %rd1, 7;
+; CHECK-PTX-NEXT: and.b64 %rd4, %rd3, -8;
+; CHECK-PTX-NEXT: ld.b32 %r2, [%rd4];
+; CHECK-PTX-NEXT: ld.s8 %r3, [%rd4+4];
+; CHECK-PTX-NEXT: ld.b8 %rs1, [%rd4+7];
+; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs1;
+; CHECK-PTX-NEXT: ld.b8 %rs2, [%rd4+6];
+; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs2;
+; CHECK-PTX-NEXT: ld.b8 %rs3, [%rd4+5];
+; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3;
+; CHECK-PTX-NEXT: ld.b64 %rd5, [%rd4+8];
 ; CHECK-PTX-NEXT: add.s32 %r4, %r1, %r2;
 ; CHECK-PTX-NEXT: add.s32 %r5, %r4, %r3;
-; CHECK-PTX-NEXT: cvt.u64.u32 %rd7, %r5;
-; CHECK-PTX-NEXT: add.s64 %rd8, %rd7, %rd6;
-; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %rd8;
+; CHECK-PTX-NEXT: cvt.u64.u32 %rd6, %r5;
+; CHECK-PTX-NEXT: add.s64 %rd7, %rd6, %rd5;
+; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %rd7;
 ; CHECK-PTX-NEXT: ret;
 entry:
 %vlist = alloca ptr, align 8
@@ -202,19 +202,19 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT: .reg .b64 %SP;
 ; CHECK-PTX-NEXT: .reg .b64 %SPL;
 ; CHECK-PTX-NEXT: .reg .b16 %rs<4>;
-; CHECK-PTX-NEXT: .reg .b32 %r<3>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
+; CHECK-PTX-NEXT: .reg .b32 %r<2>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<3>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT: // %bb.0: // %entry
 ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot3;
 ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0;
+; CHECK-PTX-NEXT: add.u64 %rd1, %SPL, 0;
 ; CHECK-PTX-NEXT: ld.global.nc.b8 %rs1, [__const_$_bar_$_s1+7];
-; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs1;
+; CHECK-PTX-NEXT: st.local.b8 [%rd1+2], %rs1;
 ; CHECK-PTX-NEXT: ld.global.nc.b8 %rs2, [__const_$_bar_$_s1+6];
-; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs2;
+; CHECK-PTX-NEXT: st.local.b8 [%rd1+1], %rs2;
 ; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5];
-; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3;
+; CHECK-PTX-NEXT: st.local.b8 [%rd1], %rs3;
 ; CHECK-PTX-NEXT: st.b32 [%SP+8], 1;
 ; CHECK-PTX-NEXT: st.b8 [%SP+12], 1;
 ; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
@@ -222,8 +222,8 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT: .param .b32 param0;
 ; CHECK-PTX-NEXT: .param .b64 param1;
 ; CHECK-PTX-NEXT: .param .b32 retval0;
-; CHECK-PTX-NEXT: add.u64 %rd3, %SP, 8;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd3;
+; CHECK-PTX-NEXT: add.u64 %rd2, %SP, 8;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd2;
 ; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT: call.uni (retval0), variadics2, (param0, param1);
 ; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
@@ -282,7 +282,7 @@ define dso_local i32 @baz() {
 ; CHECK-PTX-NEXT: .local .align 16 .b8 __local_depot5[16];
 ; CHECK-PTX-NEXT: .reg .b64 %SP;
 ; CHECK-PTX-NEXT: .reg .b64 %SPL;
-; CHECK-PTX-NEXT: .reg .b32 %r<3>;
+; CHECK-PTX-NEXT: .reg .b32 %r<2>;
 ; CHECK-PTX-NEXT: .reg .b64 %rd<2>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT: // %bb.0: // %entry
@@ -309,18 +309,18 @@ entry:
 define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, ...) {
 ; CHECK-PTX-LABEL: variadics4(
 ; CHECK-PTX: {
-; CHECK-PTX-NEXT: .reg .b64 %rd<10>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<9>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT: // %bb.0: // %entry
-; CHECK-PTX-NEXT: ld.param.b64 %rd2, [variadics4_param_1];
-; CHECK-PTX-NEXT: add.s64 %rd3, %rd2, 7;
-; CHECK-PTX-NEXT: and.b64 %rd4, %rd3, -8;
-; CHECK-PTX-NEXT: ld.b64 %rd5, [%rd4];
-; CHECK-PTX-NEXT: ld.param.b64 %rd6, [variadics4_param_0];
-; CHECK-PTX-NEXT: ld.param.b64 %rd7, [variadics4_param_0+8];
-; CHECK-PTX-NEXT: add.s64 %rd8, %rd6, %rd7;
-; CHECK-PTX-NEXT: add.s64 %rd9, %rd8, %rd5;
-; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %rd9;
+; CHECK-PTX-NEXT: ld.param.b64 %rd1, [variadics4_param_1];
+; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 7;
+; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8;
+; CHECK-PTX-NEXT: ld.b64 %rd4, [%rd3];
+; CHECK-PTX-NEXT: ld.param.b64 %rd5, [variadics4_param_0];
+; CHECK-PTX-NEXT: ld.param.b64 %rd6, [variadics4_param_0+8];
+; CHECK-PTX-NEXT: add.s64 %rd7, %rd5, %rd6;
+; CHECK-PTX-NEXT: add.s64 %rd8, %rd7, %rd4;
+; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %rd8;
 ; CHECK-PTX-NEXT: ret;
 entry:
 %vlist = alloca ptr, align 8
@@ -348,27 +348,27 @@ define dso_local void @qux() {
 ; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot7[24];
 ; CHECK-PTX-NEXT: .reg .b64 %SP;
 ; CHECK-PTX-NEXT: .reg .b64 %SPL;
-; CHECK-PTX-NEXT: .reg .b64 %rd<8>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<7>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT: // %bb.0: // %entry
 ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot7;
 ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0;
-; CHECK-PTX-NEXT: ld.global.nc.b64 %rd3, [__const_$_qux_$_s+8];
-; CHECK-PTX-NEXT: st.local.b64 [%rd2+8], %rd3;
-; CHECK-PTX-NEXT: ld.global.nc.b64 %rd4, [__const_$_qux_$_s];
-; CHECK-PTX-NEXT: st.local.b64 [%rd2], %rd4;
+; CHECK-PTX-NEXT: add.u64 %rd1, %SPL, 0;
+; CHECK-PTX-NEXT: ld.global.nc.b64 %rd2, [__const_$_qux_$_s+8];
+; CHECK-PTX-NEXT: st.local.b64 [%rd1+8], %rd2;
+; CHECK-PTX-NEXT: ld.global.nc.b64 %rd3, [__const_$_qux_$_s];
+; CHECK-PTX-NEXT: st.local.b64 [%rd1], %rd3;
 ; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
 ; CHECK-PTX-NEXT: { // callseq 3, 0
 ; CHECK-PTX-NEXT: .param .align 8 .b8 param0[16];
 ; CHECK-PTX-NEXT: .param .b64 param1;
 ; CHECK-PTX-NEXT: .param .b32 retval0;
-; CHECK-PTX-NEXT: add.u64 %rd5, %SP, 16;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd5;
-; CHECK-PTX-NEXT: ld.local.b64 %rd6, [%rd2+8];
-; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd6;
-; CHECK-PTX-NEXT: ld.local.b64 %rd7, [%rd2];
-; CHECK-PTX-NEXT: st.param.b64 [param0], %rd7;
+; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 16;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd4;
+; CHECK-PTX-NEXT: ld.local.b64 %rd5, [%rd1+8];
+; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd5;
+; CHECK-PTX-NEXT: ld.local.b64 %rd6, [%rd1];
+; CHECK-PTX-NEXT: st.param.b64 [param0], %rd6;
 ; CHECK-PTX-NEXT: call.uni (retval0), variadics4, (param0, param1);
 ; CHECK-PTX-NEXT: } // callseq 3
 ; CHECK-PTX-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/variadics-lowering.ll b/llvm/test/CodeGen/NVPTX/variadics-lowering.ll
index 5502980..1d69f8d 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-lowering.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-lowering.ll
@@ -119,7 +119,7 @@ define dso_local i32 @foo() {
 ; CHECK-NEXT: [[CONV:%.*]] = sext i8 1 to i32
 ; CHECK-NEXT: [[CONV1:%.*]] = sext i16 1 to i32
 ; CHECK-NEXT: [[CONV2:%.*]] = fpext float 1.000000e+00 to double
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 40, ptr [[VARARG_BUFFER]])
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[VARARG_BUFFER]])
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[FOO_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0
 ; CHECK-NEXT: store i32 [[CONV]], ptr [[TMP0]], align 4
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[FOO_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1
@@ -133,7 +133,7 @@ define dso_local i32 @foo() {
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[FOO_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 6
 ; CHECK-NEXT: store double 1.000000e+00, ptr [[TMP5]], align 8
 ; CHECK-NEXT: [[CALL:%.*]] = call i32 @variadics1(i32 noundef 1, ptr [[VARARG_BUFFER]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 40, ptr [[VARARG_BUFFER]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[VARARG_BUFFER]])
 ; CHECK-NEXT: ret i32 [[CALL]]
 ;
 entry:
@@ -208,7 +208,7 @@ define dso_local i32 @bar() {
 ; CHECK-NEXT: [[S1_SROA_2_0_COPYLOAD:%.*]] = load i8, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 4), align 4
 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[S1_SROA_3]], ptr align 1 getelementptr inbounds (i8, ptr @__const.bar.s1, i64 5), i64 3, i1 false)
 ; CHECK-NEXT: [[S1_SROA_31_0_COPYLOAD:%.*]] = load i64, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 8), align 8
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[VARARG_BUFFER]])
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[VARARG_BUFFER]])
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[BAR_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0
 ; CHECK-NEXT: store i32 [[S1_SROA_0_0_COPYLOAD]], ptr [[TMP0]], align 4
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[BAR_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1
@@ -216,7 +216,7 @@ define dso_local i32 @bar() {
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[BAR_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 3
 ; CHECK-NEXT: store i64 [[S1_SROA_31_0_COPYLOAD]], ptr [[TMP2]], align 8
 ; CHECK-NEXT: [[CALL:%.*]] = call i32 @variadics2(i32 noundef 1, ptr [[VARARG_BUFFER]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[VARARG_BUFFER]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[VARARG_BUFFER]])
 ; CHECK-NEXT: ret i32 [[CALL]]
 ;
 entry:
@@ -274,11 +274,11 @@ define dso_local i32 @baz() {
 ; CHECK-LABEL: define dso_local i32 @baz() {
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[BAZ_VARARG:%.*]], align 16
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[VARARG_BUFFER]])
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[VARARG_BUFFER]])
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[BAZ_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0
 ; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP0]], align 16
 ; CHECK-NEXT: [[CALL:%.*]] = call i32 @variadics3(i32 noundef 1, ptr [[VARARG_BUFFER]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[VARARG_BUFFER]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[VARARG_BUFFER]])
 ; CHECK-NEXT: ret i32 [[CALL]]
 ;
 entry:
@@ -333,11 +333,11 @@ define dso_local void @qux() {
 ; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_S2:%.*]], align 8
 ; CHECK-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[QUX_VARARG:%.*]], align 8
 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[S]], ptr align 8 @__const.qux.s, i64 16, i1 false)
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[VARARG_BUFFER]])
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[VARARG_BUFFER]])
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[QUX_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0
 ; CHECK-NEXT: store i64 1, ptr [[TMP0]], align 8
 ; CHECK-NEXT: [[CALL:%.*]] = call i32 @variadics4(ptr noundef byval([[STRUCT_S2]]) align 8 [[S]], ptr [[VARARG_BUFFER]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[VARARG_BUFFER]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[VARARG_BUFFER]])
 ; CHECK-NEXT: ret void
 ;
 entry:
diff --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll
index 29939e3..3c424c9 100644
--- a/llvm/test/CodeGen/NVPTX/vec-param-load.ll
+++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll
@@ -7,17 +7,17 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define <16 x float> @test_v16f32(<16 x float> %a) {
 ; CHECK-LABEL: test_v16f32(
 ; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-NEXT: .reg .b32 %r<17>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v16f32_param_0];
-; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v16f32_param_0+16];
-; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_v16f32_param_0+32];
-; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [test_v16f32_param_0+48];
-; CHECK-NEXT: st.param.v2.b64 [func_retval0+48], {%rd7, %rd8};
-; CHECK-NEXT: st.param.v2.b64 [func_retval0+32], {%rd5, %rd6};
-; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4};
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v16f32_param_0];
+; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_v16f32_param_0+16];
+; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [test_v16f32_param_0+32];
+; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [test_v16f32_param_0+48];
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+48], {%r13, %r14, %r15, %r16};
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+32], {%r9, %r10, %r11, %r12};
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+16], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4};
 ; CHECK-NEXT: ret;
 ret <16 x float> %a
 }
@@ -25,13 +25,13 @@ define <16 x float> @test_v16f32(<16 x float> %a) {
 define <8 x float> @test_v8f32(<8 x float> %a) {
 ; CHECK-LABEL: test_v8f32(
 ; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-NEXT: .reg .b32 %r<9>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v8f32_param_0];
-; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v8f32_param_0+16];
-; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4};
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v8f32_param_0];
+; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_v8f32_param_0+16];
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+16], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4};
 ; CHECK-NEXT: ret;
 ret <8 x float> %a
 }
@@ -39,11 +39,11 @@ define <8 x float> @test_v8f32(<8 x float> %a) {
 define <4 x float> @test_v4f32(<4 x float> %a) {
 ; CHECK-LABEL: test_v4f32(
 ; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v4f32_param_0];
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v4f32_param_0];
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4};
 ; CHECK-NEXT: ret;
 ret <4 x float> %a
 }
@@ -51,11 +51,11 @@ define <4 x float> @test_v4f32(<4 x float> %a) {
 define <2 x float> @test_v2f32(<2 x float> %a) {
 ; CHECK-LABEL: test_v2f32(
 ; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0];
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v2f32_param_0];
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
 ; CHECK-NEXT: ret;
 ret <2 x float> %a
 }
@@ -64,14 +64,13 @@ define <2 x float> @test_v2f32(<2 x float> %a) {
 define <3 x float> @test_v3f32(<3 x float> %a) {
 ; CHECK-LABEL: test_v3f32(
 ; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<2>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [test_v3f32_param_0+8];
-; CHECK-NEXT: st.param.b32 [func_retval0+8], %r1;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v3f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r3, [test_v3f32_param_0+8];
+; CHECK-NEXT: st.param.b32 [func_retval0+8], %r3;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
 ; CHECK-NEXT: ret;
 ret <3 x float> %a
 }
diff --git a/llvm/test/CodeGen/NVPTX/vector-compare.ll b/llvm/test/CodeGen/NVPTX/vector-compare.ll
index 0e63ee9..d5569b5 100644
--- a/llvm/test/CodeGen/NVPTX/vector-compare.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-compare.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify -m32 %}
+; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify -m32 %}
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}

 ; This test makes sure that the result of vector compares are properly
diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll
index 6f0dff7..1ae6f6b 100644
--- a/llvm/test/CodeGen/NVPTX/vector-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -154,7 +154,7 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177
 ; CHECK: {
 ; CHECK-NEXT: .reg .b16 %rs<4>;
 ; CHECK-NEXT: .reg .b32 %r<8>;
-; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-NEXT: .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.b64 %rd1, [foo_complex_param_0];
@@ -166,11 +166,12 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177
 ; CHECK-NEXT: shl.b32 %r6, %r1, 1;
 ; CHECK-NEXT: or.b32 %r7, %r5, %r6;
 ; CHECK-NEXT: cvt.u64.u32 %rd2, %r7;
-; CHECK-NEXT: mad.wide.u32 %rd3, %r3, 131072, %rd1;
-; CHECK-NEXT: add.s64 %rd4, %rd3, %rd2;
-; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd4+128];
+; CHECK-NEXT: mul.wide.u32 %rd3, %r3, 131072;
+; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3;
+; CHECK-NEXT: add.s64 %rd5, %rd4, %rd2;
+; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd5+128];
 ; CHECK-NEXT: max.u16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: st.b8 [%rd4+129], %rs3;
+; CHECK-NEXT: st.b8 [%rd5+129], %rs3;
 ; CHECK-NEXT: ret;
 %t0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !1
 %t1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
@@ -206,18 +207,18 @@ define void @extv8f16_global_a16(ptr addrspace(1) noalias readonly align 16 %dst
 ; CHECK-NEXT: ld.param.b64 %rd1, [extv8f16_global_a16_param_0];
 ; CHECK-NEXT: ld.param.b64 %rd2, [extv8f16_global_a16_param_1];
 ; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NEXT: cvt.f32.f16 %r5, %rs2;
-; CHECK-NEXT: cvt.f32.f16 %r6, %rs1;
-; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NEXT: cvt.f32.f16 %r7, %rs4;
-; CHECK-NEXT: cvt.f32.f16 %r8, %rs3;
-; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r4;
-; CHECK-NEXT: cvt.f32.f16 %r9, %rs6;
-; CHECK-NEXT: cvt.f32.f16 %r10, %rs5;
-; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3;
-; CHECK-NEXT: cvt.f32.f16 %r11, %rs8;
-; CHECK-NEXT: cvt.f32.f16 %r12, %rs7;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r4;
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r2;
+; CHECK-NEXT: cvt.f32.f16 %r5, %rs8;
+; CHECK-NEXT: cvt.f32.f16 %r6, %rs7;
+; CHECK-NEXT: cvt.f32.f16 %r7, %rs6;
+; CHECK-NEXT: cvt.f32.f16 %r8, %rs5;
+; CHECK-NEXT: cvt.f32.f16 %r9, %rs4;
+; CHECK-NEXT: cvt.f32.f16 %r10, %rs3;
+; CHECK-NEXT: cvt.f32.f16 %r11, %rs2;
+; CHECK-NEXT: cvt.f32.f16 %r12, %rs1;
 ; CHECK-NEXT: st.global.v4.b32 [%rd1+16], {%r12, %r11, %r10, %r9};
 ; CHECK-NEXT: st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT: ret;
@@ -270,18 +271,18 @@ define void @extv8f16_generic_a16(ptr noalias readonly align 16 %dst, ptr noalia
 ; CHECK-NEXT: ld.param.b64 %rd1, [extv8f16_generic_a16_param_0];
 ; CHECK-NEXT: ld.param.b64 %rd2, [extv8f16_generic_a16_param_1];
 ; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NEXT: cvt.f32.f16 %r5, %rs2;
-; CHECK-NEXT: cvt.f32.f16 %r6, %rs1;
-; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NEXT: cvt.f32.f16 %r7, %rs4;
-; CHECK-NEXT: cvt.f32.f16 %r8, %rs3;
-; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r4;
-; CHECK-NEXT: cvt.f32.f16 %r9, %rs6;
-; CHECK-NEXT: cvt.f32.f16 %r10, %rs5;
-; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3;
-; CHECK-NEXT: cvt.f32.f16 %r11, %rs8;
-; CHECK-NEXT: cvt.f32.f16 %r12, %rs7;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r4;
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r2;
+; CHECK-NEXT: cvt.f32.f16 %r5, %rs8;
+; CHECK-NEXT: cvt.f32.f16 %r6, %rs7;
+; CHECK-NEXT: cvt.f32.f16 %r7, %rs6;
+; CHECK-NEXT: cvt.f32.f16 %r8, %rs5;
+; CHECK-NEXT: cvt.f32.f16 %r9, %rs4;
+; CHECK-NEXT: cvt.f32.f16 %r10, %rs3;
+; CHECK-NEXT: cvt.f32.f16 %r11, %rs2;
+; CHECK-NEXT: cvt.f32.f16 %r12, %rs1;
 ; CHECK-NEXT: st.v4.b32 [%rd1+16], {%r12, %r11, %r10, %r9};
 ; CHECK-NEXT: st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/vector-select.ll b/llvm/test/CodeGen/NVPTX/vector-select.ll
index 569da5e..96b2a0c 100644
--- a/llvm/test/CodeGen/NVPTX/vector-select.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-select.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}

 ; This test makes sure that vector selects are scalarized by the type legalizer.
diff --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll
index d07c740..b9bb417 100644
--- a/llvm/test/CodeGen/NVPTX/vector-stores.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll
@@ -5,12 +5,13 @@
 define void @foo1(<2 x float> %val, ptr %ptr) {
 ; CHECK-LABEL: foo1(
 ; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [foo1_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [foo1_param_1];
-; CHECK-NEXT: st.b64 [%rd2], %rd1;
+; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [foo1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [foo1_param_1];
+; CHECK-NEXT: st.v2.b32 [%rd1], {%r1, %r2};
 ; CHECK-NEXT: ret;
 store <2 x float> %val, ptr %ptr
 ret void
@@ -19,12 +20,13 @@ define void @foo1(<2 x float> %val, ptr %ptr) {
 define void @foo2(<4 x float> %val, ptr %ptr) {
 ; CHECK-LABEL: foo2(
 ; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [foo2_param_0];
-; CHECK-NEXT: ld.param.b64 %rd3, [foo2_param_1];
-; CHECK-NEXT: st.v2.b64 [%rd3], {%rd1, %rd2};
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [foo2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [foo2_param_1];
+; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};
 ; CHECK-NEXT: ret;
 store <4 x float> %val, ptr %ptr
 ret void
diff --git a/llvm/test/CodeGen/NVPTX/vote.ll b/llvm/test/CodeGen/NVPTX/vote.ll
index 6e760ce..d8aa0b1 100644
--- a/llvm/test/CodeGen/NVPTX/vote.ll
+++ b/llvm/test/CodeGen/NVPTX/vote.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %}
+; RUN: %if ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %}

 declare i1 @llvm.nvvm.vote.all(i1)
 ; CHECK-LABEL: .func{{.*}}vote_all
diff --git a/llvm/test/CodeGen/NVPTX/weak-global.ll b/llvm/test/CodeGen/NVPTX/weak-global.ll
index 43fc9b0e..06c2cd8 100644
--- a/llvm/test/CodeGen/NVPTX/weak-global.ll
+++ b/llvm/test/CodeGen/NVPTX/weak-global.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx43 | FileCheck %s --check-prefix PTX43
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx50 | FileCheck %s --check-prefix PTX50
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx43 | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx50 | %ptxas-verify %}
+; RUN: %if ptxas-isa-4.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx43 | %ptxas-verify %}
+; RUN: %if ptxas-isa-5.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx50 | %ptxas-verify %}

 ; PTX43: .weak .global .align 4 .u32 g
 ; PTX50: .common .global .align 4 .u32 g
diff --git a/llvm/test/CodeGen/NVPTX/wgmma-sm90a-fence.ll b/llvm/test/CodeGen/NVPTX/wgmma-sm90a-fence.ll
index 59fe57b..531a204 100644
--- a/llvm/test/CodeGen/NVPTX/wgmma-sm90a-fence.ll
+++ b/llvm/test/CodeGen/NVPTX/wgmma-sm90a-fence.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80 | FileCheck %s
-; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80 | %ptxas-verify -arch=sm_90a %}
+; RUN: %if ptxas-sm_90a && ptxas-isa-8.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80 | %ptxas-verify -arch=sm_90a %}

 target triple = "nvptx64-nvidia-cuda"

diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx60-sm70.py b/llvm/test/CodeGen/NVPTX/wmma-ptx60-sm70.py
index bc441bfa..ca6f788 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx60-sm70.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx60-sm70.py
@@ -6,7 +6,7 @@
 # RUN: --check-prefixes=INTRINSICS,NOEXTGEOM,NOINT,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
 # RUN: llc < %t-ptx60-sm_70.ll -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \
 # RUN: | FileCheck %t-ptx60-sm_70.ll
-# RUN: %if ptxas %{ \
+# RUN: %if ptxas-sm_70 && ptxas-isa-6.0 %{ \
 # RUN: llc < %t-ptx60-sm_70.ll -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \
 # RUN: | %ptxas-verify -arch=sm_70 \
 # RUN: %}
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx61-sm70.py b/llvm/test/CodeGen/NVPTX/wmma-ptx61-sm70.py
index 7cfee46..25b2421 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx61-sm70.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx61-sm70.py
@@ -6,7 +6,7 @@
 # RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
 # RUN: llc < %t-ptx61-sm_70.ll -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \
 # RUN: | FileCheck %t-ptx61-sm_70.ll
-# RUN: %if ptxas-9.1 %{ \
+# RUN: %if ptxas-sm_70 && ptxas-isa-6.1 %{ \
 # RUN: llc < %t-ptx61-sm_70.ll -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \
 # RUN: | %ptxas-verify -arch=sm_70 \
 # RUN: %}
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm72.py b/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm72.py
index 6168df2..4c0fd48 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm72.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm72.py
@@ -6,7 +6,7 @@
 # RUN: --check-prefixes=INTRINSICS,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
 # RUN: llc < %t-ptx63-sm_72.ll -mtriple=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \
 # RUN: | FileCheck %t-ptx63-sm_72.ll
-# RUN: %if ptxas-10.0 %{ \
+# RUN: %if ptxas-sm_72 && ptxas-isa-6.3 %{ \
 # RUN: llc < %t-ptx63-sm_72.ll -mtriple=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \
 # RUN: | %ptxas-verify -arch=sm_72 \
 # RUN: %}
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm75.py b/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm75.py
index 507760e..944d284 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm75.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm75.py
@@ -6,7 +6,7 @@
 # RUN: --check-prefixes=INTRINSICS,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
 # RUN: llc < %t-ptx63-sm_75.ll -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \
 # RUN: | FileCheck %t-ptx63-sm_75.ll
-# RUN: %if ptxas-10.0 %{ \
+# RUN: %if ptxas-sm_75 && ptxas-isa-6.3 %{ \
 # RUN: llc < %t-ptx63-sm_75.ll -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \
 # RUN: | %ptxas-verify -arch=sm_75 \
 # RUN: %}
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx64-sm70.py b/llvm/test/CodeGen/NVPTX/wmma-ptx64-sm70.py
index 0f0d1c9..a7960454 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx64-sm70.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx64-sm70.py
@@ -6,7 +6,7 @@
 # RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NODOUBLE,NOALTFLOAT,NOLDMATRIX
 # RUN: llc < %t-ptx64-sm_70.ll -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \
 # RUN: | FileCheck %t-ptx64-sm_70.ll
-# RUN: %if ptxas-10.1 %{ \
+# RUN: %if ptxas-sm_70 && ptxas-isa-6.4 %{ \
 # RUN: llc < %t-ptx64-sm_70.ll -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \
 # RUN: | %ptxas-verify -arch=sm_70 \
 # RUN: %}
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx65-sm75.py b/llvm/test/CodeGen/NVPTX/wmma-ptx65-sm75.py
index 2b919db..ea9d0ba 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx65-sm75.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx65-sm75.py
@@ -6,7 +6,7 @@
 # RUN: --check-prefixes=INTRINSICS
 # RUN: llc < %t-ptx65-sm_75.ll -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx65 \
 # RUN: | FileCheck %t-ptx65-sm_75.ll
-# RUN: %if ptxas-10.2 %{ \
+# RUN: %if ptxas-sm_75 && ptxas-isa-6.5 %{ \
 # RUN: llc < %t-ptx65-sm_75.ll -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx65 \
 # RUN: | %ptxas-verify -arch=sm_75 \
 # RUN: %}
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx71-sm80.py b/llvm/test/CodeGen/NVPTX/wmma-ptx71-sm80.py
index 2985c1b..03d46b8 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx71-sm80.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx71-sm80.py
@@ -6,7 +6,7 @@
 # RUN: --check-prefixes=INTRINSICS
 # RUN: llc < %t-ptx71-sm_80.ll -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 \
 # RUN: | FileCheck %t-ptx71-sm_80.ll
-# RUN: %if ptxas-11.1 %{ \
+# RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ \
 # RUN: llc < %t-ptx71-sm_80.ll -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 \
 # RUN: | %ptxas-verify -arch=sm_80 \
 # RUN: %}
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py b/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py
index 8f50206..8a5ae22 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py
@@ -4,7 +4,7 @@
 # RUN: --check-prefixes=PTX78STMATRIX-DAG
 # RUN: llc < %t-ptx78-sm_90.ll -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 \
 # RUN: | FileCheck %t-ptx78-sm_90.ll
-# RUN: %if ptxas-12.7 %{ \
+# RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ \
 # RUN: llc < %t-ptx78-sm_90.ll -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 \
 # RUN: | %ptxas-verify -arch=sm_90 \
 # RUN: %}
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py
index 5c14a54..12b1980 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py
@@ -4,7 +4,7 @@
 # RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG
 # RUN: llc < %t-ptx86-sm_100a.ll -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 \
 # RUN: | FileCheck %t-ptx86-sm_100a.ll
-# RUN: %if ptxas-12.7 %{ \
+# RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ \
 # RUN: llc < %t-ptx86-sm_100a.ll -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 \
 # RUN: | %ptxas-verify -arch=sm_100a \
 # RUN: %}
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py
index a77f9ad..f0e9723 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py
@@ -4,7 +4,7 @@
 # RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG
 # RUN: llc < %t-ptx86-sm_101a.ll -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 \
 # RUN: | FileCheck %t-ptx86-sm_101a.ll
-# RUN: %if ptxas-12.7 %{ \
+# RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ \
 # RUN: llc < %t-ptx86-sm_101a.ll -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 \
 # RUN: | %ptxas-verify -arch=sm_101a \
 # RUN: %}
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py
index 8126e64..570372c 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py
@@ -4,7 +4,7 @@
 # RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG
 # RUN: llc < %t-ptx86-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 \
 # RUN: | FileCheck %t-ptx86-sm_120a.ll
-# RUN: %if ptxas-12.7 %{ \
+# RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{ \
 # RUN: llc < %t-ptx86-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 \
 # RUN: | %ptxas-verify -arch=sm_120a \
 # RUN: %}
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx87-sm120a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx87-sm120a.py
new file mode 100644
index 0000000..ae781df
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx87-sm120a.py
@@ -0,0 +1,12 @@
+# Check all variants of instructions supported by PTX87 on SM120a
+# RUN: %python %s --ptx=87 --gpu-arch=120 --aa > %t-ptx87-sm_120a.ll
+# RUN: llc < %t-ptx87-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx87 \
+# RUN: | FileCheck %t-ptx87-sm_120a.ll
+# RUN: %if ptxas-12.7 %{ \
+# RUN: llc < %t-ptx87-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx87 \
+# RUN: | %ptxas-verify -arch=sm_120a \
+# RUN: %}
+
+import wmma
+
+wmma.main()
diff --git a/llvm/test/CodeGen/NVPTX/wmma.py b/llvm/test/CodeGen/NVPTX/wmma.py
index 2eb3c3d..6d73bce 100644
--- a/llvm/test/CodeGen/NVPTX/wmma.py
+++ b/llvm/test/CodeGen/NVPTX/wmma.py
@@ -15,6 +15,11 @@ class MMAType:
     def __init__(self, ptx_type):
         self.ptx_type = ptx_type
         self.llvm_type = {
+            "e4m3": "i32",
+            "e5m2": "i32",
+            "e3m2": "i32",
+            "e2m3": "i32",
+            "e2m1": "i32",
             "f16": "<2 x half>",
             "f32": "float",
             "f64": "double",
@@ -43,7 +48,7 @@ class MMAType:


 class MMAFrag:
-    def __init__(self, geom, frag, ptx_elt_type):
+    def __init__(self, geom, frag, ptx_elt_type, is_mma_sparse=False):
         self.geom = geom
         self.frag = frag
         self.mma_type = MMAType(ptx_elt_type)
@@ -79,12 +84,53 @@ class MMAFrag:
             "m16n8k16:b:s8": 1,
             "m16n8k16:c:s32": 4,
             "m16n8k16:d:s32": 4,
-            "m16n8k32:a:u8": 4,
-            "m16n8k32:a:s8": 4,
+            "m16n8k32:a:u8": 2 if is_mma_sparse else 4,
+            "m16n8k32:a:s8": 2 if is_mma_sparse else 4,
             "m16n8k32:b:u8": 2,
             "m16n8k32:b:s8": 2,
             "m16n8k32:c:s32": 4,
             "m16n8k32:d:s32": 4,
+            # mma sp
+            "m16n8k32:a:bf16": 4,
+            "m16n8k32:a:f16": 4,
+            "m16n8k32:b:bf16": 4,
+            "m16n8k32:b:f16": 4,
+            "m16n8k32:c:f16": 2,
+            "m16n8k32:c:f32": 4,
+            "m16n8k32:d:f16": 2,
+            "m16n8k32:d:f32": 4,
+            "m16n8k16:a:tf32": 4,
+            "m16n8k16:b:tf32": 4,
+            "m16n8k16:c:tf32": 4,
+            "m16n8k16:d:tf32": 4,
+            "m16n8k64:a:u8": 4,
+            "m16n8k64:a:s8": 4,
+            "m16n8k64:a:e4m3": 4,
+            "m16n8k64:a:e5m2": 4,
+            "m16n8k64:a:e3m2": 4,
+            "m16n8k64:a:e2m3": 4,
+            "m16n8k64:a:e2m1": 4,
+            "m16n8k64:b:u8": 4,
+            "m16n8k64:b:s8": 4,
+            "m16n8k64:b:e4m3": 4,
+            "m16n8k64:b:e5m2": 4,
+            "m16n8k64:b:e3m2": 4,
+            "m16n8k64:b:e2m3": 4,
+            "m16n8k64:b:e2m1": 4,
+            "m16n8k64:c:f16": 2,
+            "m16n8k64:c:f32": 4,
+            "m16n8k64:d:f16": 2,
+            "m16n8k64:d:f32": 4,
+            "m16n8k128:a:u4": 4,
+            "m16n8k128:a:s4": 4,
+            "m16n8k128:a:e2m1": 4,
+            "m16n8k128:b:u4": 4,
+            "m16n8k128:b:s4": 4,
+            "m16n8k128:b:e2m1": 4,
+            "m16n8k128:c:s32": 4,
+            "m16n8k128:c:f32": 4,
+            "m16n8k128:d:s32": 4,
+            "m16n8k128:d:f32": 4,
             # u4/s4 -> s32 @ m8n8k32 (u4/s4)
             "m8n8k32:a:u4": 1,
             "m8n8k32:a:s4": 1,
@@ -98,8 +144,8 @@ class MMAFrag:
             "m16n8k32:b:s4": 1,
             "m16n8k32:c:s32": 4,
             "m16n8k32:d:s32": 4,
-            "m16n8k64:a:u4": 4,
-            "m16n8k64:a:s4": 4,
+            "m16n8k64:a:u4": 2 if is_mma_sparse else 4,
+            "m16n8k64:a:s4": 2 if is_mma_sparse else 4,
             "m16n8k64:b:u4": 2,
             "m16n8k64:b:s4": 2,
             "m16n8k64:c:s32": 4,
@@ -124,7 +170,7 @@ class MMAFrag:
             "m8n32k16:b:bf16": 8,
             "m32n8k16:a:bf16": 8,
             "m32n8k16:b:bf16": 2,
-            "m16n8k16:a:bf16": 4,
+            "m16n8k16:a:bf16": 2 if is_mma_sparse else 4,
             "m16n8k16:b:bf16": 2,
             "m16n8k16:c:f32": 4,
             "m16n8k16:d:f32": 4,
@@ -143,7 +189,7 @@ class MMAFrag:
             "m16n8k4:b:tf32": 1,
             "m16n8k4:c:f32": 4,
             "m16n8k4:d:f32": 4,
-            "m16n8k8:a:tf32": 4,
+            "m16n8k8:a:tf32": 2 if is_mma_sparse else 4,
             "m16n8k8:b:tf32": 2,
             "m16n8k8:c:f32": 4,
             "m16n8k8:d:f32": 4,
@@ -155,7 +201,7 @@ class MMAFrag:
             "m16n8k8:d:f16": 2,
             "m16n8k8:c:f32": 4,
             "m16n8k8:d:f32": 4,
-            "m16n8k16:a:f16": 4,
+            "m16n8k16:a:f16": 2 if is_mma_sparse else 4,
             "m16n8k16:b:f16": 2,
             "m16n8k16:c:f16": 2,
             "m16n8k16:d:f16": 2,
@@ -218,7 +264,7 @@ class MMAOp:
return "{A:%s, B:%s, C:%s, D:%s}" % (self.a, self.b, self.c, self.d) -def make_mma_ops(geoms, types_a, types_b, types_c, types_d): +def make_mma_ops(geoms, types_a, types_b, types_c, types_d, is_mma_sparse=False): ops = [] for geom, type_a, type_c in product(geoms, types_a, types_c): for type_b, type_d in product( @@ -226,10 +272,10 @@ def make_mma_ops(geoms, types_a, types_b, types_c, types_d): ): ops.append( MMAOp( - MMAFrag(geom, "a", type_a), - MMAFrag(geom, "b", type_b), - MMAFrag(geom, "c", type_c), - MMAFrag(geom, "d", type_d), + MMAFrag(geom, "a", type_a, is_mma_sparse), + MMAFrag(geom, "b", type_b, is_mma_sparse), + MMAFrag(geom, "c", type_c, is_mma_sparse), + MMAFrag(geom, "d", type_d, is_mma_sparse), ) ) return ops @@ -416,6 +462,10 @@ def is_type_supported(ptx_type): return ptx_version >= 65 and gpu_arch >= 75 if ptx_type in ["bf16", "tf32", "f64"]: return ptx_version >= 70 + if ptx_type in ["e4m3", "e5m2"]: + return ptx_version >= 84 and gpu_arch >= 89 + if ptx_type in ["e3m2", "e2m3", "e2m1"]: + return ptx_version >= 87 and gpu_arch >= 120 and aa return ptx_version >= 60 and gpu_arch >= 70 @@ -448,7 +498,7 @@ def is_mma_variant_supported(op, layout_a, layout_b, satf): ): return False - if satf and not op.a.mma_type.ptx_type in ["s8", "u8", "s4", "u4"]: + if satf and op.a.mma_type.ptx_type not in ["s8", "u8", "s4", "u4"]: return False # If the type of C is f32 then so must the type of D @@ -825,7 +875,15 @@ define void @test_${function}_o(i8 ${as}* %dst, ${args}) { return generated_items def mma_signature(op): - if op.a.mma_type.ptx_type == "f16": + if op.a.mma_type.ptx_type in ["e4m3", "e5m2", "e3m2", "e2m3", "e2m1"]: + # FP8/F8F6F4 ops identified by inputs, accumulator & result types. + return "%s.%s.%s.%s" % ( + op.d.mma_type.ptx_type, + op.a.mma_type.ptx_type, + op.b.mma_type.ptx_type, + op.c.mma_type.ptx_type, + ) + elif op.a.mma_type.ptx_type == "f16": # FP16 ops identified by accumulator & result type. return "%s.%s" % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type) elif op.a.mma_type.ptx_type != op.b.mma_type.ptx_type: @@ -980,6 +1038,230 @@ def gen_mma_tests(): return generated_items +def get_mma_sp_ops(): + return ( + make_mma_ops(["m16n8k16", "m16n8k32"], ["bf16"], [], ["f32"], [], True) + + make_mma_ops(["m16n8k8", "m16n8k16"], ["tf32"], [], ["f32"], [], True) + + make_mma_ops( + ["m16n8k16", "m16n8k32"], + ["f16"], + [], + ["f16", "f32"], + ["f16", "f32"], + True, + ) + + make_mma_ops( + ["m16n8k64", "m16n8k128"], ["s4", "u4"], ["s4", "u4"], ["s32"], [], True + ) + + make_mma_ops( + ["m16n8k32", "m16n8k64"], ["s8", "u8"], ["s8", "u8"], ["s32"], [], True + ) + + make_mma_ops( + ["m16n8k64"], + ["e4m3", "e5m2", "e3m2", "e2m3", "e2m1"], + ["e4m3", "e5m2", "e3m2", "e2m3", "e2m1"], + ["f16", "f32"], + ["f16", "f32"], + True, + ) + ) + + +def is_mma_sp_geom_supported(geom): + # geometries for FP and ints. 
+def is_mma_sp_geom_supported(geom):
+    # geometries for FP and ints.
+    if geom in [
+        "m16n8k16",
+        "m16n8k32",
+        "m16n8k8",
+        "m16n8k64",
+        "m16n8k128",
+    ]:
+        return ptx_version >= 71
+    raise ValueError(f"Unexpected sparse MMA geometry: {geom}")
+
+
+def is_mma_sp_variant_supported(op, metadata, kind, satf):
+    if metadata != "sp" and (ptx_version < 85 or gpu_arch < 80):
+        return False
+
+    if kind != "" and (ptx_version < 87 or gpu_arch < 120 or not aa):
+        return False
+
+    if not (
+        is_type_supported(op.a.mma_type.ptx_type)
+        and is_mma_sp_geom_supported(op.a.geom)
+    ):
+        return False
+
+    is_int = op.a.mma_type.ptx_type in ["s8", "u8", "s4", "u4"]
+
+    if satf and not is_int:
+        return False
+
+    # A and B type must be the same
+    if (
+        op.a.mma_type.ptx_type in ["f16", "bf16", "tf32"]
+        and op.a.mma_type.ptx_type != op.b.mma_type.ptx_type
+    ):
+        return False
+
+    # C and D type must be the same for m16n8k16/m16n8k32
+    if (
+        op.a.geom in ["m16n8k16", "m16n8k32"]
+        and op.c.mma_type.ptx_type != op.d.mma_type.ptx_type
+    ):
+        return False
+
+    if kind == "" and (
+        op.a.mma_type.ptx_type in ["e3m2", "e2m3", "e2m1"]
+        or op.b.mma_type.ptx_type in ["e3m2", "e2m3", "e2m1"]
+    ):
+        return False
+
+    if (
+        kind == ""
+        and op.a.geom == "m16n8k64"
+        and (op.c.mma_type.ptx_type == "f16" or op.d.mma_type.ptx_type == "f16")
+    ):
+        return False
+
+    if kind != "" and (metadata == "sp" or op.a.geom != "m16n8k64" or is_int):
+        return False
+
+    return True
+
+
+def sp_selector_gen(op):
+    # (geom, type) -> allowed selector range
+    range_01 = {
+        ("m16n8k32", "bf16"),
+        ("m16n8k32", "f16"),
+        ("m16n8k16", "tf32"),
+        ("m16n8k32", "u8"),
+        ("m16n8k32", "s8"),
+        ("m16n8k64", "u4"),
+        ("m16n8k64", "s4"),
+    }
+
+    if (op.a.geom, op.a.mma_type.ptx_type) in range_01:
+        return range(2)
+    if op.a.geom == "m16n8k64" and op.a.mma_type.ptx_type in [
+        "u8",
+        "s8",
+        "e4m3",
+        "e5m2",
+        "e3m2",
+        "e2m3",
+        "e2m1",
+    ]:
+        return range(1)
+    if op.a.geom == "m16n8k128" and op.a.mma_type.ptx_type in [
+        "u4",
+        "s4",
+    ]:
+        return range(1)
+    return range(4)
+
+
+def common_mma_sp_test_gen(params, op, intrinsic_template, instruction_template):
+    mma_sp_decl_template = """
+declare ${ret_ty} @${intrinsic}(
+        ${args});
+"""
+
+    mma_sp_test_template = """
+; CHECK-LABEL: .func {{.*}}test_${function}_${selector}(
+define ${ret_ty} @test_${function}_${selector}(
+        ${args}) {
+; CHECK: ${instruction}
+; CHECK-NEXT: ${check_d}
+; CHECK-NEXT: ${check_a}
+; CHECK-NEXT: ${check_b}
+; CHECK-NEXT: ${check_c}
+; CHECK-NEXT: ${check_metadata}
+; CHECK-NEXT: ${check_selector}
+  %r = call ${ret_ty} @${intrinsic}(
+          ${call_args});
+  ret ${ret_ty} %r;
+}
+"""
+
+    test_params = params
+    test_params["intrinsic"] = (
+        Template(intrinsic_template)
+        .substitute(params)
+        .replace("::", ".")
+        .replace("_", ".")
+    )
+    test_params["function"] = test_params["intrinsic"].replace(".", "_")
+    test_params["instruction"] = Template(instruction_template).substitute(params)
+    test_params["ret_ty"] = make_wmma_ld_ret_ty(op.d)
+    test_params["check_a"] = check_pattern(op.a)
+    test_params["check_b"] = check_pattern(op.b)
+    test_params["check_c"] = check_pattern(op.c)
+    test_params["check_d"] = check_pattern(op.d)
+    test_params["check_metadata"] = "{{%r[0-9]+}}"
+    args = ",\n ".join(
+        list(make_wmma_slice_args(frag) for frag in (op.a, op.b, op.c))
+        + ["i32 %metadata", "i32 %selector"]
+    )
+    test_params["args"] = args
+
+    print(Template(mma_sp_decl_template).substitute(test_params))
+
+    for selector in [str(r) for r in sp_selector_gen(op)]:
+        test_params["selector"] = selector
test_params["selector"] + "}}" + test_params["call_args"] = test_params["args"].replace( + "%selector", test_params["selector"] + ) + + print(Template(mma_sp_test_template).substitute(test_params)) + + return (test_params["intrinsic"], test_params["instruction"]) + + +def gen_mma_sp_tests(): + if ptx_version < 71 or gpu_arch < 80: + return [] + + mma_sp_intrinsic_template = ( + "llvm.nvvm.mma.${metadata}.${geom}.row.col${kind}${satf}.${intrinsic_signature}" + ) + mma_sp_instruction_template = ( + "mma.${metadata}.sync.aligned.${geom}.row.col${kind}${satf}.${ptx_signature}" + ) + + generated_items = [] + + for op, metadata, kind, satf in product( + get_mma_sp_ops(), + ["sp::ordered_metadata", "sp"], + ["", ".kind::f8f6f4"], + [".satfinite", ""], + ): + if not is_mma_sp_variant_supported(op, metadata, kind, satf): + continue + + params = { + "intrinsic_signature": mma_signature(op), + "ptx_signature": mma_ptx_signature(op), + "satf": satf, + "geom": op.a.geom, + "metadata": metadata, + "kind": kind, + } + + intrinsic_template = mma_sp_intrinsic_template + instruction_template = mma_sp_instruction_template + + generated_items.append( + common_mma_sp_test_gen(params, op, intrinsic_template, instruction_template) + ) + + return generated_items + + # Append complete list of intrinsics and instructions we've generated tests for. # Generate set of checks to verify that that we did generate sensible set of # tests for the given combination of PTX and SM variants. @@ -1170,6 +1452,7 @@ def gen_tests(): items += gen_stmatrix_tests() items += gen_wmma_mma_tests() items += gen_mma_tests() + items += gen_mma_sp_tests() gen_check_unsupported_ops(items) |