; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx78 | FileCheck %s
; RUN: %if ptxas-sm_75 && ptxas-isa-7.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx78 | %ptxas-verify -arch=sm_75 %}

declare i32 @llvm.nvvm.movmatrix.sync.aligned.m8n8.trans.b16(i32)

; Test that a single call to the movmatrix intrinsic is lowered to one
; movmatrix.sync.aligned.m8n8.trans.b16 instruction operating on the
; function's i32 parameter, and that its result is returned.
define i32 @test_movmatrix(i32 %a) {
; CHECK-LABEL: test_movmatrix(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_movmatrix_param_0];
; CHECK-NEXT:    movmatrix.sync.aligned.m8n8.trans.b16 %r2, %r1;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
; CHECK-NEXT:    ret;
  %d = call i32 @llvm.nvvm.movmatrix.sync.aligned.m8n8.trans.b16(i32 %a)
  ret i32 %d
}

; Test that LLVM does not CSE two movmatrix calls with the same input,
; as the result depends on values from other threads in the warp.
; Both calls must survive as separate instructions feeding the add.
define i32 @test_movmatrix_cse(i32 %a) {
; CHECK-LABEL: test_movmatrix_cse(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_movmatrix_cse_param_0];
; CHECK-NEXT:    movmatrix.sync.aligned.m8n8.trans.b16 %r2, %r1;
; CHECK-NEXT:    movmatrix.sync.aligned.m8n8.trans.b16 %r3, %r1;
; CHECK-NEXT:    add.s32 %r4, %r2, %r3;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
; CHECK-NEXT:    ret;
  %d1 = call i32 @llvm.nvvm.movmatrix.sync.aligned.m8n8.trans.b16(i32 %a)
  %d2 = call i32 @llvm.nvvm.movmatrix.sync.aligned.m8n8.trans.b16(i32 %a)
  %sum = add i32 %d1, %d2
  ret i32 %sum
}