Diffstat (limited to 'llvm/test/CodeGen/NVPTX')
25 files changed, 3996 insertions, 1148 deletions
diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index 23832a9..dd9a472 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -181,32 +181,32 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
 ; ENABLED-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
 ; ENABLED-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U;
 ; ENABLED-NEXT: prmt.b32 %r7, %r4, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r8, %r4, 0, 0x7770U;
-; ENABLED-NEXT: prmt.b32 %r9, %r3, 0, 0x7773U;
-; ENABLED-NEXT: prmt.b32 %r10, %r3, 0, 0x7772U;
-; ENABLED-NEXT: prmt.b32 %r11, %r3, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r12, %r3, 0, 0x7770U;
-; ENABLED-NEXT: prmt.b32 %r13, %r2, 0, 0x7773U;
-; ENABLED-NEXT: prmt.b32 %r14, %r2, 0, 0x7772U;
-; ENABLED-NEXT: prmt.b32 %r15, %r2, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r16, %r2, 0, 0x7770U;
-; ENABLED-NEXT: prmt.b32 %r17, %r1, 0, 0x7773U;
-; ENABLED-NEXT: prmt.b32 %r18, %r1, 0, 0x7772U;
-; ENABLED-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r20, %r1, 0, 0x7770U;
+; ENABLED-NEXT: prmt.b32 %r8, %r3, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r9, %r3, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r10, %r3, 0, 0x7771U;
+; ENABLED-NEXT: prmt.b32 %r11, %r2, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r12, %r2, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r13, %r2, 0, 0x7771U;
+; ENABLED-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r15, %r1, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r16, %r1, 0, 0x7771U;
 ; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_param_1];
-; ENABLED-NEXT: add.s32 %r21, %r20, %r19;
-; ENABLED-NEXT: add.s32 %r22, %r21, %r18;
-; ENABLED-NEXT: add.s32 %r23, %r22, %r17;
-; ENABLED-NEXT: add.s32 %r24, %r23, %r16;
-; ENABLED-NEXT: add.s32 %r25, %r24, %r15;
-; ENABLED-NEXT: add.s32 %r26, %r25, %r14;
-; ENABLED-NEXT: add.s32 %r27, %r26, %r13;
-; ENABLED-NEXT: add.s32 %r28, %r27, %r12;
-; ENABLED-NEXT: add.s32 %r29, %r28, %r11;
-; ENABLED-NEXT: add.s32 %r30, %r29, %r10;
-; ENABLED-NEXT: add.s32 %r31, %r30, %r9;
-; ENABLED-NEXT: add.s32 %r32, %r31, %r8;
+; ENABLED-NEXT: and.b32 %r17, %r1, 255;
+; ENABLED-NEXT: and.b32 %r18, %r2, 255;
+; ENABLED-NEXT: and.b32 %r19, %r3, 255;
+; ENABLED-NEXT: and.b32 %r20, %r4, 255;
+; ENABLED-NEXT: add.s32 %r21, %r17, %r16;
+; ENABLED-NEXT: add.s32 %r22, %r21, %r15;
+; ENABLED-NEXT: add.s32 %r23, %r22, %r14;
+; ENABLED-NEXT: add.s32 %r24, %r23, %r18;
+; ENABLED-NEXT: add.s32 %r25, %r24, %r13;
+; ENABLED-NEXT: add.s32 %r26, %r25, %r12;
+; ENABLED-NEXT: add.s32 %r27, %r26, %r11;
+; ENABLED-NEXT: add.s32 %r28, %r27, %r19;
+; ENABLED-NEXT: add.s32 %r29, %r28, %r10;
+; ENABLED-NEXT: add.s32 %r30, %r29, %r9;
+; ENABLED-NEXT: add.s32 %r31, %r30, %r8;
+; ENABLED-NEXT: add.s32 %r32, %r31, %r20;
 ; ENABLED-NEXT: add.s32 %r33, %r32, %r7;
 ; ENABLED-NEXT: add.s32 %r34, %r33, %r6;
 ; ENABLED-NEXT: add.s32 %r35, %r34, %r5;
@@ -332,36 +332,36 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
 ; ENABLED-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
 ; ENABLED-NEXT: prmt.b32 %r4, %r2, 0, 0x7772U;
 ; ENABLED-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r6, %r2, 0, 0x7770U;
-; ENABLED-NEXT: prmt.b32 %r7, %r1, 0, 0x7773U;
-; ENABLED-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
-; ENABLED-NEXT: prmt.b32 %r9, %r1, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r10, %r1, 0, 0x7770U;
+; ENABLED-NEXT: prmt.b32 %r6, %r1, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r8, %r1, 0, 0x7771U;
 ; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_unaligned_param_1];
-; ENABLED-NEXT: ld.v2.b32 {%r11, %r12}, [%rd1+8];
-; ENABLED-NEXT: prmt.b32 %r13, %r12, 0, 0x7773U;
-; ENABLED-NEXT: prmt.b32 %r14, %r12, 0, 0x7772U;
-; ENABLED-NEXT: prmt.b32 %r15, %r12, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r16, %r12, 0, 0x7770U;
-; ENABLED-NEXT: prmt.b32 %r17, %r11, 0, 0x7773U;
-; ENABLED-NEXT: prmt.b32 %r18, %r11, 0, 0x7772U;
-; ENABLED-NEXT: prmt.b32 %r19, %r11, 0, 0x7771U;
-; ENABLED-NEXT: prmt.b32 %r20, %r11, 0, 0x7770U;
-; ENABLED-NEXT: add.s32 %r21, %r10, %r9;
-; ENABLED-NEXT: add.s32 %r22, %r21, %r8;
-; ENABLED-NEXT: add.s32 %r23, %r22, %r7;
-; ENABLED-NEXT: add.s32 %r24, %r23, %r6;
+; ENABLED-NEXT: ld.v2.b32 {%r9, %r10}, [%rd1+8];
+; ENABLED-NEXT: prmt.b32 %r11, %r10, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r12, %r10, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r13, %r10, 0, 0x7771U;
+; ENABLED-NEXT: prmt.b32 %r14, %r9, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r15, %r9, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r16, %r9, 0, 0x7771U;
+; ENABLED-NEXT: and.b32 %r17, %r1, 255;
+; ENABLED-NEXT: and.b32 %r18, %r2, 255;
+; ENABLED-NEXT: and.b32 %r19, %r9, 255;
+; ENABLED-NEXT: and.b32 %r20, %r10, 255;
+; ENABLED-NEXT: add.s32 %r21, %r17, %r8;
+; ENABLED-NEXT: add.s32 %r22, %r21, %r7;
+; ENABLED-NEXT: add.s32 %r23, %r22, %r6;
+; ENABLED-NEXT: add.s32 %r24, %r23, %r18;
 ; ENABLED-NEXT: add.s32 %r25, %r24, %r5;
 ; ENABLED-NEXT: add.s32 %r26, %r25, %r4;
 ; ENABLED-NEXT: add.s32 %r27, %r26, %r3;
-; ENABLED-NEXT: add.s32 %r28, %r27, %r20;
-; ENABLED-NEXT: add.s32 %r29, %r28, %r19;
-; ENABLED-NEXT: add.s32 %r30, %r29, %r18;
-; ENABLED-NEXT: add.s32 %r31, %r30, %r17;
-; ENABLED-NEXT: add.s32 %r32, %r31, %r16;
-; ENABLED-NEXT: add.s32 %r33, %r32, %r15;
-; ENABLED-NEXT: add.s32 %r34, %r33, %r14;
-; ENABLED-NEXT: add.s32 %r35, %r34, %r13;
+; ENABLED-NEXT: add.s32 %r28, %r27, %r19;
+; ENABLED-NEXT: add.s32 %r29, %r28, %r16;
+; ENABLED-NEXT: add.s32 %r30, %r29, %r15;
+; ENABLED-NEXT: add.s32 %r31, %r30, %r14;
+; ENABLED-NEXT: add.s32 %r32, %r31, %r20;
+; ENABLED-NEXT: add.s32 %r33, %r32, %r13;
+; ENABLED-NEXT: add.s32 %r34, %r33, %r12;
+; ENABLED-NEXT: add.s32 %r35, %r34, %r11;
 ; ENABLED-NEXT: st.b32 [%rd2], %r35;
 ; ENABLED-NEXT: ret;
 ;
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index e2a914d..ba5813c 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -359,11 +359,12 @@ define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b,
 define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
 ; CHECK-LABEL: test_fptrunc_2xfloat(
 ; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0];
-; CHECK-NEXT: st.param.b32 [func_retval0], %rd1;
+; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0];
+; CHECK-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT: ret;
 %r = fptrunc <2 x float> %a to <2 x bfloat>
 ret <2 x bfloat> %r
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll
new file mode 100644
index 0000000..843446a
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll
@@ -0,0 +1,195 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %}
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
+
+define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX64-NEXT: .reg .b32 %r<4>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_3];
+; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_4];
+; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_5];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_6];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_7];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_8];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2};
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_8];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2};
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
+  ret void
+}
+
+define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX64-NEXT: .reg .b32 %r<5>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_3];
+; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_4];
+; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_5];
+; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_6];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_7];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_8];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_9];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_9];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
+  ret void
+}
+
+define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_3];
+; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_4];
+; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_5];
+; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_6];
+; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_7];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_8];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_9];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_10];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2};
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_10];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2};
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
+  ret void
+}
+
+define void @test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_3];
+; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_4];
+; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_5];
+; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_6];
+; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_7];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_8];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_6];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_7];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_8];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll
new file mode 100644
index 0000000..9b485803
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll
@@ -0,0 +1,152 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
+
+define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX64-NEXT: .reg .b32 %r<4>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_3];
+; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_4];
+; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_5];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_6];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_7];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_8];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2};
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_8];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2};
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
+  ret void
+}
+
+define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX64-NEXT: .reg .b32 %r<5>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_3];
+; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_4];
+; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_5];
+; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_6];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_7];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_8];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_9];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_9];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
+  ret void
+}
+
+define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_3];
+; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_4];
+; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_5];
+; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_6];
+; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_7];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_8];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_9];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_10];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2};
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_10];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2};
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll
new file mode 100644
index 0000000..4325405
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll
@@ -0,0 +1,353 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i64 %ch, i1 %f1);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i64 %ch, i1 %f1);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %f1);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %f1);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %f1);
+
+define void @cp_async_bulk_tensor_g2s_cta_tile_1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_1d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_3];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_4];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2];
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_1d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_4];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 %ch, i1 0)
+  ret void
+}
+
+define void @cp_async_bulk_tensor_g2s_cta_tile_2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_2d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b32 %r<3>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_3];
+; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_4];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_5];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2];
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_2d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_5];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 0)
+  ret void
+}
+
+define void @cp_async_bulk_tensor_g2s_cta_tile_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_3d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b32 %r<4>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_3];
+; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_4];
+; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_5];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_6];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2];
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_3d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 0)
+  ret void
+}
+
+define void @cp_async_bulk_tensor_g2s_cta_tile_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_4d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b32 %r<5>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_3];
+; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_4];
+; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_5];
+; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_6];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_7];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2];
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_4d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 0)
+  ret void
+}
+
+define void @cp_async_bulk_tensor_g2s_cta_tile_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_5d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_3];
+; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_4];
+; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_5];
+; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_6];
+; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_7];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_8];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_5d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 0)
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %f1);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 %f1);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 %f1);
+
+define void @test_cp_async_bulk_tensor_g2s_cta_im2col_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_3d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT: .reg .b32 %r<4>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_3];
+; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_4];
+; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_5];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_6];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_7];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1};
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_3d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1};
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 0)
+  ret void
+}
+
+define void @test_cp_async_bulk_tensor_g2s_cta_im2col_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_4d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX64-NEXT: .reg .b32 %r<5>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_3];
+; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_4];
+; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_5];
+; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_6];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_7];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_8];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_9];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_4d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_9];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 0)
+  ret void
+}
+
+define void @test_cp_async_bulk_tensor_g2s_cta_im2col_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_5d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
+; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_3];
+; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_4];
+; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_5];
+; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_6];
+; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_7];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_8];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_9];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_10];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_11];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4;
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_5d(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_10];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_11];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
+; CHECK-PTX-SHARED32-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll
new file mode 100644
index 0000000..ef4a8fb
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d
+define void @test_cp_async_bulk_tensor_g2s_tile_gather4_2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, 
%r7}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1 +define void @test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], 
[%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2 +define void @test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 
%rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll new file mode 100644 index 0000000..112dab1 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll @@ -0,0 +1,524 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d +define void @cp_async_bulk_tensor_g2s_im2colw_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, 
[cp_async_bulk_tensor_g2s_im2colw_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_3d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, 
%r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_3d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_0]; +; 
CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_3d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 
%r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 
1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d +define void @cp_async_bulk_tensor_g2s_im2colw_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, 
[cp_async_bulk_tensor_g2s_im2colw_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_4d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_6]; +; 
CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr 
%tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_4d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 
%r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d +define void @cp_async_bulk_tensor_g2s_im2colw_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_3]; +; 
CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_5d_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; 
CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_5d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], 
{%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_5d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 
%wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, 
[cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll new file mode 100644 index 0000000..54e861e --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll @@ -0,0 +1,524 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); + +; 
CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d +define void @cp_async_bulk_tensor_g2s_im2colw_128_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_9]; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; 
CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2 +define void 
@cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_9]; +; 
CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d +define void @cp_async_bulk_tensor_g2s_im2colw_128_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) 
%bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, 
[cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_7]; +; CHECK-PTX64-NEXT: 
ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, 
i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d +define void @cp_async_bulk_tensor_g2s_im2colw_128_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, 
[cp_async_bulk_tensor_g2s_im2colw_128_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_0]; +; 
CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_11]; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_11]; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void 
@llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll new file mode 100644 index 0000000..6bf8f03 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %f1); + +define void @test_cp_async_bulk_tensor_prefetch_3d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_prefetch_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_3d_param_5]; +; CHECK-PTX64-NEXT: 
ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_3d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_prefetch_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + ret void +} + +define void @test_cp_async_bulk_tensor_prefetch_4d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_prefetch_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, 
[test_cp_async_bulk_tensor_prefetch_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_4d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_prefetch_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + ret void +} + +define void @test_cp_async_bulk_tensor_prefetch_5d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_prefetch_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; 
CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_prefetch_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_5d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_prefetch_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_prefetch_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 
%wHalo, i16 %wOffset, i64 %ch, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + ret void +} + +define void @test_cp_async_bulk_tensor_prefetch_tile_gather4_2d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) { + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll new file mode 100644 index 0000000..2ef44ff --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag); + +; CHECK-LABEL: cp_async_bulk_tensor_s2g_tile_scatter4_2d +define void @cp_async_bulk_tensor_s2g_tile_scatter4_2d(i32 %flag, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_s2g_tile_scatter4_2d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_s2g_tile_scatter4_2d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg 
.b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll index 80980ef..d61a63c 100644 --- a/llvm/test/CodeGen/NVPTX/extractelement.ll +++ b/llvm/test/CodeGen/NVPTX/extractelement.ll @@ -56,23 +56,22 @@ define i16 @test_v4i8(i32 %a) { ; CHECK-LABEL: test_v4i8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<8>; -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_param_0]; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x8880U; -; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x9991U; -; CHECK-NEXT: cvt.u16.u32 %rs2, %r3; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U; -; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xbbb3U; -; CHECK-NEXT: cvt.u16.u32 %rs4, %r5; +; CHECK-NEXT: cvt.s8.s32 %rs1, %r1; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x9991U; +; CHECK-NEXT: cvt.u16.u32 %rs2, %r2; +; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0xaaa2U; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r3; +; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xbbb3U; +; CHECK-NEXT: cvt.u16.u32 %rs4, %r4; ; CHECK-NEXT: add.s16 %rs5, %rs1, %rs2; ; CHECK-NEXT: add.s16 %rs6, %rs3, %rs4; ; CHECK-NEXT: add.s16 %rs7, %rs5, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r6, %rs7; -; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs7; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; %v = bitcast i32 %a to <4 x i8> %r0 = extractelement <4 x i8> %v, i64 0 @@ -96,7 +95,7 @@ define i32 @test_v4i8_s32(i32 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_s32_param_0]; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x8880U; +; CHECK-NEXT: cvt.s32.s8 %r2, %r1; ; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x9991U; ; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U; ; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xbbb3U; @@ -127,12 +126,12 @@ define i32 @test_v4i8_u32(i32 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 
%r1, [test_v4i8_u32_param_0]; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7773U; -; CHECK-NEXT: add.s32 %r6, %r2, %r3; -; CHECK-NEXT: add.s32 %r7, %r4, %r5; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7771U; +; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x7772U; +; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; +; CHECK-NEXT: and.b32 %r5, %r1, 255; +; CHECK-NEXT: add.s32 %r6, %r5, %r2; +; CHECK-NEXT: add.s32 %r7, %r3, %r4; ; CHECK-NEXT: add.s32 %r8, %r6, %r7; ; CHECK-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-NEXT: ret; @@ -157,26 +156,24 @@ define i16 @test_v8i8(i64 %a) { ; CHECK-LABEL: test_v8i8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<16>; -; CHECK-NEXT: .reg .b32 %r<12>; +; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v8i8_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x8880U; -; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x9991U; -; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; -; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xaaa2U; -; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; -; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0xbbb3U; -; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x8880U; -; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x9991U; -; CHECK-NEXT: cvt.u16.u32 %rs6, %r8; -; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0xaaa2U; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; -; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0xbbb3U; -; CHECK-NEXT: cvt.u16.u32 %rs8, %r10; +; CHECK-NEXT: cvt.s8.s32 %rs1, %r1; +; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x9991U; +; CHECK-NEXT: cvt.u16.u32 %rs2, %r3; +; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xbbb3U; +; CHECK-NEXT: cvt.u16.u32 %rs4, %r5; +; CHECK-NEXT: cvt.s8.s32 %rs5, %r2; +; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x9991U; +; CHECK-NEXT: cvt.u16.u32 %rs6, %r6; +; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r7; +; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0xbbb3U; +; CHECK-NEXT: cvt.u16.u32 %rs8, %r8; ; CHECK-NEXT: add.s16 %rs9, %rs1, %rs2; ; CHECK-NEXT: add.s16 %rs10, %rs3, %rs4; ; CHECK-NEXT: add.s16 %rs11, %rs5, %rs6; @@ -184,8 +181,8 @@ define i16 @test_v8i8(i64 %a) { ; CHECK-NEXT: add.s16 %rs13, %rs9, %rs10; ; CHECK-NEXT: add.s16 %rs14, %rs11, %rs12; ; CHECK-NEXT: add.s16 %rs15, %rs13, %rs14; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs15; -; CHECK-NEXT: st.param.b32 [func_retval0], %r11; +; CHECK-NEXT: cvt.u32.u16 %r9, %rs15; +; CHECK-NEXT: st.param.b32 [func_retval0], %r9; ; CHECK-NEXT: ret; %v = bitcast i64 %a to <8 x i8> %r0 = extractelement <8 x i8> %v, i64 0 diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index d0e2c18..8918fbd 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -45,11 +45,12 @@ define <2 x half> @test_ret_const() #0 { define half @test_extract_0(<2 x half> %a) #0 { ; CHECK-LABEL: test_extract_0( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_0_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; } ; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; %e = 
extractelement <2 x half> %a, i32 0 @@ -59,12 +60,13 @@ define half @test_extract_0(<2 x half> %a) #0 { define half @test_extract_1(<2 x half> %a) #0 { ; CHECK-LABEL: test_extract_1( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_1_param_0]; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ld.param.b32 %r1, [test_extract_1_param_0]; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; } +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; %e = extractelement <2 x half> %a, i32 1 ret half %e @@ -80,8 +82,9 @@ define half @test_extract_i(<2 x half> %a, i64 %idx) #0 { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_i_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; ; CHECK-NEXT: setp.eq.b64 %p1, %rd1, 0; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; ; CHECK-NEXT: ret; @@ -107,14 +110,16 @@ define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<10>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fadd_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fadd_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fadd_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: add.rn.f32 %r5, %r4, %r3; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; ; CHECK-NOF16-NEXT: add.rn.f32 %r8, %r7, %r6; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NOF16-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -143,7 +148,8 @@ define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_imm_0_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fadd_imm_0_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: add.rn.f32 %r3, %r2, 0f40000000; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -175,7 +181,8 @@ define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_imm_1_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fadd_imm_1_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: add.rn.f32 %r3, %r2, 0f40000000; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -207,14 +214,16 @@ define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<10>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsub_param_0]; -; 
CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsub_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fsub_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fsub_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: sub.rn.f32 %r5, %r4, %r3; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; ; CHECK-NOF16-NEXT: sub.rn.f32 %r8, %r7, %r6; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NOF16-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -242,7 +251,8 @@ define <2 x half> @test_fneg(<2 x half> %a) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fneg_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fneg_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: mov.b32 %r3, 0f00000000; ; CHECK-NOF16-NEXT: sub.rn.f32 %r4, %r3, %r2; @@ -275,14 +285,16 @@ define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<10>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmul_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmul_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fmul_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fmul_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: mul.rn.f32 %r5, %r4, %r3; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; ; CHECK-NOF16-NEXT: mul.rn.f32 %r8, %r7, %r6; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NOF16-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -299,14 +311,16 @@ define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; -; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NEXT: ld.param.b32 %r2, [test_fdiv_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fdiv_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NEXT: div.rn.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs3; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs3; ; CHECK-NEXT: div.rn.f32 %r8, %r7, %r6; ; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -331,10 +345,12 @@ define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 
%r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_frem_param_0]; -; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_frem_param_1]; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NEXT: ld.param.b32 %r2, [test_frem_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_frem_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NEXT: div.rn.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5; ; CHECK-NEXT: neg.f32 %r7, %r6; @@ -342,8 +358,8 @@ define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: testp.infinite.f32 %p1, %r3; ; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r9; -; CHECK-NEXT: cvt.f32.f16 %r10, %rs3; -; CHECK-NEXT: cvt.f32.f16 %r11, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r10, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r11, %rs3; ; CHECK-NEXT: div.rn.f32 %r12, %r11, %r10; ; CHECK-NEXT: cvt.rzi.f32.f32 %r13, %r12; ; CHECK-NEXT: neg.f32 %r14, %r13; @@ -535,11 +551,13 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; CHECK-F16-NEXT: // %bb.0: ; CHECK-F16-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; ; CHECK-F16-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; -; CHECK-F16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; ; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r3, %r4; -; CHECK-F16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_1]; -; CHECK-F16-NEXT: selp.b16 %rs5, %rs2, %rs4, %p2; -; CHECK-F16-NEXT: selp.b16 %rs6, %rs1, %rs3, %p1; +; CHECK-F16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-F16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-F16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; +; CHECK-F16-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; ; CHECK-F16-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; CHECK-F16-NEXT: ret; ; @@ -550,18 +568,22 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; CHECK-NOF16-NEXT: .reg .b32 %r<9>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_3]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_select_cc_param_2]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs5; +; CHECK-NOF16-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; +; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r6, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs6; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs4; ; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r8, %r7; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs7, %rs8}, [test_select_cc_param_1]; -; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs2, %rs8, %p2; -; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs1, %rs7, %p1; +; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r2; +; 
CHECK-NOF16-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs6, %p2; +; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs7, %rs5, %p1; ; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs10, %rs9}; ; CHECK-NOF16-NEXT: ret; %cc = fcmp une <2 x half> %c, %d @@ -579,11 +601,13 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, ; CHECK-F16-NEXT: // %bb.0: ; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3]; ; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2]; -; CHECK-F16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_0]; +; CHECK-F16-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1]; +; CHECK-F16-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0]; ; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2; -; CHECK-F16-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f32_f16_param_1]; -; CHECK-F16-NEXT: selp.f32 %r7, %r4, %r6, %p2; -; CHECK-F16-NEXT: selp.f32 %r8, %r3, %r5, %p1; +; CHECK-F16-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-F16-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-F16-NEXT: selp.f32 %r7, %r6, %r4, %p2; +; CHECK-F16-NEXT: selp.f32 %r8, %r5, %r3, %p1; ; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-F16-NEXT: ret; ; @@ -595,18 +619,22 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, ; CHECK-NOF16-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_f32_f16_param_3]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_f32_f16_param_2]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; -; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r6, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs4; -; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r8, %r7; -; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r9, %r10}, [test_select_cc_f32_f16_param_1]; -; CHECK-NOF16-NEXT: selp.f32 %r11, %r4, %r10, %p2; -; CHECK-NOF16-NEXT: selp.f32 %r12, %r3, %r9, %p1; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2]; +; CHECK-NOF16-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1]; +; CHECK-NOF16-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs1; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3; +; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r4, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs4; +; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r6, %r5; +; CHECK-NOF16-NEXT: mov.b64 {%r7, %r8}, %rd2; +; CHECK-NOF16-NEXT: mov.b64 {%r9, %r10}, %rd1; +; CHECK-NOF16-NEXT: selp.f32 %r11, %r10, %r8, %p2; +; CHECK-NOF16-NEXT: selp.f32 %r12, %r9, %r7, %p1; ; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r11}; ; CHECK-NOF16-NEXT: ret; <2 x half> %c, <2 x half> %d) #0 { @@ -624,14 +652,18 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b, ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_f16_f32_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f16_f32_param_2]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f16_f32_param_3]; 
-; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r5; -; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r6; -; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_f16_f32_param_1]; -; CHECK-NEXT: selp.b16 %rs5, %rs2, %rs4, %p2; -; CHECK-NEXT: selp.b16 %rs6, %rs1, %rs3, %p1; +; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f16_f32_param_3]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f16_f32_param_2]; +; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_f16_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_f16_f32_param_0]; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NEXT: setp.neu.f32 %p1, %r5, %r3; +; CHECK-NEXT: setp.neu.f32 %p2, %r6, %r4; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; +; CHECK-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; CHECK-NEXT: ret; <2 x float> %c, <2 x float> %d) #0 { @@ -664,13 +696,15 @@ define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_une_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_une_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_une_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_une_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -705,13 +739,15 @@ define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ueq_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ueq_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ueq_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ueq_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.equ.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.equ.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -746,13 +782,15 @@ define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ugt_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ugt_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 
%r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ugt_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ugt_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.gtu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.gtu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -787,13 +825,15 @@ define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_uge_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_uge_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_uge_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_uge_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.geu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.geu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -828,13 +868,15 @@ define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ult_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ult_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ult_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ult_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.ltu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.ltu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -869,13 +911,15 @@ define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ule_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ule_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ule_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ule_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: 
cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.leu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.leu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -911,13 +955,15 @@ define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_uno_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_uno_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_uno_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_uno_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -952,13 +998,15 @@ define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_one_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_one_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_one_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_one_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.ne.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.ne.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -993,13 +1041,15 @@ define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_oeq_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_oeq_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_oeq_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_oeq_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.eq.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; 
CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.eq.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1034,13 +1084,15 @@ define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ogt_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ogt_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ogt_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ogt_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.gt.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.gt.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1075,13 +1127,15 @@ define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_oge_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_oge_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_oge_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_oge_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.ge.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.ge.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1116,13 +1170,15 @@ define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_olt_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_olt_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_olt_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_olt_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.lt.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.lt.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1157,13 
+1213,15 @@ define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ole_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ole_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ole_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ole_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.le.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.le.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1198,13 +1256,15 @@ define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ord_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ord_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ord_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ord_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.num.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.num.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1222,7 +1282,8 @@ define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptosi_i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fptosi_i32_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rzi.s32.f16 %r2, %rs2; ; CHECK-NEXT: cvt.rzi.s32.f16 %r3, %rs1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; @@ -1239,7 +1300,8 @@ define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptosi_i64_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fptosi_i64_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rzi.s64.f16 %rd1, %rs2; ; CHECK-NEXT: cvt.rzi.s64.f16 %rd2, %rs1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -1255,7 +1317,8 @@ define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptoui_2xi32_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fptoui_2xi32_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rzi.u32.f16 %r2, %rs2; ; CHECK-NEXT: cvt.rzi.u32.f16 %r3, %rs1; ; CHECK-NEXT: st.param.v2.b32 
[func_retval0], {%r3, %r2}; @@ -1272,7 +1335,8 @@ define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptoui_2xi64_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fptoui_2xi64_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rzi.u64.f16 %rd1, %rs2; ; CHECK-NEXT: cvt.rzi.u64.f16 %rd2, %rs1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -1369,16 +1433,17 @@ define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1]; ; CHECK-NOF16-NEXT: cvt.rn.f16.u32 %rs1, %r1; ; CHECK-NOF16-NEXT: cvt.rn.f16.u32 %rs2, %r2; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_uitofp_2xi32_fadd_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs2; -; CHECK-NOF16-NEXT: add.rn.f32 %r6, %r4, %r5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; +; CHECK-NOF16-NEXT: add.rn.f32 %r6, %r5, %r4; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; -; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r7, %r8; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r8, %r7; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r9; ; CHECK-NOF16-NEXT: mov.b32 %r10, {%rs6, %rs5}; ; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r10; @@ -1411,16 +1476,17 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1]; ; CHECK-NOF16-NEXT: cvt.rn.f16.s32 %rs1, %r1; ; CHECK-NOF16-NEXT: cvt.rn.f16.s32 %rs2, %r2; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_sitofp_2xi32_fadd_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs2; -; CHECK-NOF16-NEXT: add.rn.f32 %r6, %r4, %r5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; +; CHECK-NOF16-NEXT: add.rn.f32 %r6, %r5, %r4; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; -; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r7, %r8; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r8, %r7; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r9; ; CHECK-NOF16-NEXT: mov.b32 %r10, {%rs6, %rs5}; ; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r10; @@ -1433,11 +1499,17 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { ; CHECK-LABEL: test_fptrunc_2xfloat( ; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0]; -; CHECK-NEXT: st.param.b32 [func_retval0], %rd1; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: 
cvt.rn.f16.f32 %rs1, %r2; +; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r1; +; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %r = fptrunc <2 x float> %a to <2 x half> ret <2 x half> %r @@ -1468,7 +1540,8 @@ define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fpext_2xfloat_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fpext_2xfloat_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: cvt.f32.f16 %r3, %rs1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; @@ -1485,7 +1558,8 @@ define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fpext_2xdouble_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fpext_2xdouble_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f64.f16 %rd1, %rs2; ; CHECK-NEXT: cvt.f64.f16 %rd2, %rs1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -1578,7 +1652,8 @@ define <2 x half> @test_sqrt(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sqrt_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_sqrt_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: sqrt.rn.f32 %r3, %r2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1606,7 +1681,8 @@ define <2 x half> @test_sin(<2 x half> %a) #0 #1 { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sin_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_sin_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: sin.approx.f32 %r3, %r2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1627,7 +1703,8 @@ define <2 x half> @test_cos(<2 x half> %a) #0 #1 { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_cos_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_cos_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: cos.approx.f32 %r3, %r2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1703,17 +1780,20 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<13>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fma_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fma_param_2]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_fma_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fma_param_2]; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fma_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fma_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; +; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs6; ; CHECK-NOF16-NEXT: fma.rn.f32 %r7, %r6, %r5, %r4; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs7, %r7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, 
%rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs5; ; CHECK-NOF16-NEXT: fma.rn.f32 %r11, %r10, %r9, %r8; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %r11; ; CHECK-NOF16-NEXT: mov.b32 %r12, {%rs8, %rs7}; @@ -1740,7 +1820,8 @@ define <2 x half> @test_fabs(<2 x half> %a) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fabs_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fabs_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: abs.f32 %r3, %r2; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1761,14 +1842,16 @@ define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_minnum_param_0]; -; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_minnum_param_1]; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NEXT: ld.param.b32 %r2, [test_minnum_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_minnum_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NEXT: min.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs3; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs3; ; CHECK-NEXT: min.f32 %r8, %r7, %r6; ; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -1785,14 +1868,16 @@ define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_maxnum_param_0]; -; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_maxnum_param_1]; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NEXT: ld.param.b32 %r2, [test_maxnum_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_maxnum_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NEXT: max.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs3; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs3; ; CHECK-NEXT: max.f32 %r8, %r7, %r6; ; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -1822,13 +1907,15 @@ define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<3>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_copysign_param_1]; -; CHECK-NOF16-NEXT: and.b16 %rs5, %rs4, -32768; -; CHECK-NOF16-NEXT: and.b16 %rs6, %rs2, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs5; -; CHECK-NOF16-NEXT: and.b16 %rs8, %rs3, -32768; -; CHECK-NOF16-NEXT: and.b16 %rs9, %rs1, 32767; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; +; CHECK-NOF16-NEXT: mov.b32 
{%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, -32768; +; CHECK-NOF16-NEXT: mov.b32 {%rs4, %rs5}, %r1; +; CHECK-NOF16-NEXT: and.b16 %rs6, %rs5, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs3; +; CHECK-NOF16-NEXT: and.b16 %rs8, %rs1, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs9, %rs4, 32767; ; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8; ; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs10, %rs7}; ; CHECK-NOF16-NEXT: ret; @@ -1844,8 +1931,9 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-F16-NEXT: .reg .b64 %rd<2>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b64 %rd1, [test_copysign_f32_param_1]; ; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; -; CHECK-F16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; +; CHECK-F16-NEXT: mov.b64 {%r2, %r3}, %rd1; ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs1, %r3; ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs2, %r2; ; CHECK-F16-NEXT: mov.b32 %r4, {%rs2, %rs1}; @@ -1862,8 +1950,10 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_f32_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; +; CHECK-NOF16-NEXT: ld.param.b64 %rd1, [test_copysign_f32_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; +; CHECK-NOF16-NEXT: mov.b64 {%r2, %r3}, %rd1; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767; ; CHECK-NOF16-NEXT: and.b32 %r4, %r3, -2147483648; ; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r4; } @@ -1906,7 +1996,8 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_f64_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f64_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767; ; CHECK-NOF16-NEXT: and.b64 %rd3, %rd2, -9223372036854775808; ; CHECK-NOF16-NEXT: shr.u64 %rd4, %rd3, 48; @@ -1948,13 +2039,15 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<5>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_extended_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_copysign_extended_param_1]; -; CHECK-NOF16-NEXT: and.b16 %rs5, %rs3, -32768; -; CHECK-NOF16-NEXT: and.b16 %rs6, %rs1, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs5; -; CHECK-NOF16-NEXT: and.b16 %rs8, %rs4, -32768; -; CHECK-NOF16-NEXT: and.b16 %rs9, %rs2, 32767; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_extended_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: and.b16 %rs3, %rs1, -32768; +; CHECK-NOF16-NEXT: mov.b32 {%rs4, %rs5}, %r1; +; CHECK-NOF16-NEXT: and.b16 %rs6, %rs4, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs3; +; CHECK-NOF16-NEXT: and.b16 %rs8, %rs2, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs9, %rs5, 32767; ; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs10; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs7; @@ -1972,7 
+2065,8 @@ define <2 x half> @test_floor(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_floor_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_floor_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rmi.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rmi.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -1988,7 +2082,8 @@ define <2 x half> @test_ceil(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_ceil_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_ceil_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rpi.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rpi.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2004,7 +2099,8 @@ define <2 x half> @test_trunc(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_trunc_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_trunc_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rzi.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2020,7 +2116,8 @@ define <2 x half> @test_rint(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_rint_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_rint_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rni.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rni.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2036,7 +2133,8 @@ define <2 x half> @test_nearbyint(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_nearbyint_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_nearbyint_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rni.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rni.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2052,7 +2150,8 @@ define <2 x half> @test_roundeven(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_roundeven_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_roundeven_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rni.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rni.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2070,7 +2169,8 @@ define <2 x half> @test_round(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<21>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_round_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_round_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: and.b32 %r3, %r2, -2147483648; ; CHECK-NEXT: or.b32 %r4, %r3, 1056964608; @@ -2121,17 +2221,20 @@ define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 ; CHECK-NOF16-NEXT: .reg .b32 %r<13>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmuladd_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmuladd_param_2]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; -; CHECK-NOF16-NEXT: 
ld.param.v2.b16 {%rs5, %rs6}, [test_fmuladd_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fmuladd_param_2]; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fmuladd_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fmuladd_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; +; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs6; ; CHECK-NOF16-NEXT: fma.rn.f32 %r7, %r6, %r5, %r4; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs7, %r7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs5; ; CHECK-NOF16-NEXT: fma.rn.f32 %r11, %r10, %r9, %r8; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %r11; ; CHECK-NOF16-NEXT: mov.b32 %r12, {%rs8, %rs7}; @@ -2148,7 +2251,8 @@ define <2 x half> @test_shufflevector(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_shufflevector_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; ; CHECK-NEXT: ret; %s = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> <i32 1, i32 0> @@ -2158,12 +2262,13 @@ define <2 x half> @test_shufflevector(<2 x half> %a) #0 { define <2 x half> @test_insertelement(<2 x half> %a, half %x) #0 { ; CHECK-LABEL: test_insertelement( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; -; CHECK-NEXT: ld.param.v2.b16 {%rs2, %rs3}, [test_insertelement_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; } ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; ; CHECK-NEXT: ret; %i = insertelement <2 x half> %a, half %x, i64 1 @@ -2177,7 +2282,8 @@ define <2 x half> @test_sitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sitofp_2xi16_to_2xhalf_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_sitofp_2xi16_to_2xhalf_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rn.f16.s16 %rs3, %rs2; ; CHECK-NEXT: cvt.rn.f16.s16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2193,7 +2299,8 @@ define <2 x half> @test_uitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_uitofp_2xi16_to_2xhalf_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_uitofp_2xi16_to_2xhalf_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rn.f16.u16 %rs3, %rs2; ; CHECK-NEXT: cvt.rn.f16.u16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll index af3cb63..30afd69 100644 --- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll +++ 
b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll @@ -28,29 +28,53 @@ define <2 x float> @test_ret_const() #0 { } define float @test_extract_0(<2 x float> %a) #0 { -; CHECK-LABEL: test_extract_0( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_0_param_0]; -; CHECK-NEXT: st.param.b32 [func_retval0], %r1; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_extract_0( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<2>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_extract_0_param_0]; +; CHECK-NOF32X2-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; } +; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_extract_0( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<2>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_extract_0_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, _}, %rd1; +; CHECK-F32X2-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-F32X2-NEXT: ret; %e = extractelement <2 x float> %a, i32 0 ret float %e } define float @test_extract_1(<2 x float> %a) #0 { -; CHECK-LABEL: test_extract_1( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_1_param_0]; -; CHECK-NEXT: st.param.b32 [func_retval0], %r2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_extract_1( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<2>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_extract_1_param_0]; +; CHECK-NOF32X2-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; } +; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_extract_1( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<2>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_extract_1_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {_, %r1}, %rd1; +; CHECK-F32X2-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-F32X2-NEXT: ret; %e = extractelement <2 x float> %a, i32 1 ret float %e } @@ -70,10 +94,12 @@ define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_param_1]; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r2, %r4; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fadd_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -98,7 +124,8 @@ define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 
{%r1, %r2}, [test_fadd_imm_0_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40000000; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f3F800000; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -128,7 +155,8 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40000000; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f3F800000; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -158,13 +186,17 @@ define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_param_1]; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r9, %r4, %r8; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r10, %r3, %r7; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r11, %r2, %r6; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r12, %r1, %r5; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9}; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, %r1; +; CHECK-NOF32X2-NEXT: mov.b64 {%r7, %r8}, %rd3; +; CHECK-NOF32X2-NEXT: mov.b64 {%r9, %r10}, %rd1; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r11, %r10, %r8; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r12, %r9, %r7; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_v4( @@ -189,12 +221,14 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_param_0]; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, 0f40800000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, 0f40400000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r2, 0f40000000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r1, 0f3F800000; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f40400000; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_imm_0_v4( @@ -225,12 +259,14 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, 
%r3, %r4}, [test_fadd_imm_1_v4_param_0]; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, 0f40800000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, 0f40400000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r2, 0f40000000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r1, 0f3F800000; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f40400000; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_imm_1_v4( @@ -261,10 +297,12 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fsub_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_param_1]; -; CHECK-NOF32X2-NEXT: sub.rn.f32 %r5, %r2, %r4; -; CHECK-NOF32X2-NEXT: sub.rn.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fsub_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fsub_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NOF32X2-NEXT: sub.rn.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: sub.rn.f32 %r6, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -289,7 +327,8 @@ define <2 x float> @test_fneg(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fneg_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: neg.f32 %r3, %r2; ; CHECK-NEXT: neg.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -305,10 +344,12 @@ define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_param_1]; -; CHECK-NOF32X2-NEXT: mul.rn.f32 %r5, %r2, %r4; -; CHECK-NOF32X2-NEXT: mul.rn.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fmul_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fmul_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NOF32X2-NEXT: mul.rn.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: mul.rn.f32 %r6, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -333,10 +374,12 @@ define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_param_1]; -; CHECK-NEXT: div.rn.f32 %r5, %r2, %r4; -; CHECK-NEXT: div.rn.f32 %r6, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fdiv_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, 
%r4}, %rd1; +; CHECK-NEXT: div.rn.f32 %r5, %r4, %r2; +; CHECK-NEXT: div.rn.f32 %r6, %r3, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NEXT: ret; %r = fdiv <2 x float> %a, %b @@ -351,20 +394,22 @@ define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_param_1]; -; CHECK-NEXT: div.rn.f32 %r5, %r2, %r4; +; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_frem_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: div.rn.f32 %r5, %r4, %r2; ; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5; ; CHECK-NEXT: neg.f32 %r7, %r6; -; CHECK-NEXT: fma.rn.f32 %r8, %r7, %r4, %r2; -; CHECK-NEXT: testp.infinite.f32 %p1, %r4; -; CHECK-NEXT: selp.f32 %r9, %r2, %r8, %p1; -; CHECK-NEXT: div.rn.f32 %r10, %r1, %r3; +; CHECK-NEXT: fma.rn.f32 %r8, %r7, %r2, %r4; +; CHECK-NEXT: testp.infinite.f32 %p1, %r2; +; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1; +; CHECK-NEXT: div.rn.f32 %r10, %r3, %r1; ; CHECK-NEXT: cvt.rzi.f32.f32 %r11, %r10; ; CHECK-NEXT: neg.f32 %r12, %r11; -; CHECK-NEXT: fma.rn.f32 %r13, %r12, %r3, %r1; -; CHECK-NEXT: testp.infinite.f32 %p2, %r3; -; CHECK-NEXT: selp.f32 %r14, %r1, %r13, %p2; +; CHECK-NEXT: fma.rn.f32 %r13, %r12, %r1, %r3; +; CHECK-NEXT: testp.infinite.f32 %p2, %r1; +; CHECK-NEXT: selp.f32 %r14, %r3, %r13, %p2; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; ; CHECK-NEXT: ret; %r = frem <2 x float> %a, %b @@ -378,10 +423,12 @@ define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_ftz_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_ftz_param_1]; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r2, %r4; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fadd_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -406,7 +453,8 @@ define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_ftz_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40000000; ; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f3F800000; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -436,7 +484,8 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_ftz_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40000000; ; CHECK-NOF32X2-NEXT: 
add.rn.ftz.f32 %r4, %r1, 0f3F800000; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -466,13 +515,17 @@ define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_ftz_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_ftz_param_1]; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r9, %r4, %r8; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r10, %r3, %r7; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r11, %r2, %r6; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r12, %r1, %r5; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9}; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, %r1; +; CHECK-NOF32X2-NEXT: mov.b64 {%r7, %r8}, %rd3; +; CHECK-NOF32X2-NEXT: mov.b64 {%r9, %r10}, %rd1; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r11, %r10, %r8; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r12, %r9, %r7; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_v4_ftz( @@ -497,12 +550,14 @@ define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_ftz_param_0]; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, 0f40800000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, 0f40400000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r2, 0f40000000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r1, 0f3F800000; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f40400000; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r6, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r5, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_imm_0_v4_ftz( @@ -533,12 +588,14 @@ define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_1_v4_ftz_param_0]; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, 0f40800000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, 0f40400000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r2, 0f40000000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r1, 0f3F800000; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f40400000; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; 
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r6, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r5, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_imm_1_v4_ftz( @@ -569,10 +626,12 @@ define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fsub_ftz_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_ftz_param_1]; -; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r5, %r2, %r4; -; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fsub_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fsub_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r6, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -597,7 +656,8 @@ define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_ftz_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fneg_ftz_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: neg.ftz.f32 %r3, %r2; ; CHECK-NEXT: neg.ftz.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -613,10 +673,12 @@ define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_ftz_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_ftz_param_1]; -; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r5, %r2, %r4; -; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fmul_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fmul_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r6, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -641,11 +703,14 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c) ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_ftz_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_ftz_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_ftz_param_2]; -; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r7, %r2, %r4, %r6; -; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r8, %r1, %r3, %r5; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd3, [test_fma_ftz_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fma_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fma_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd3; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r7, %r6, %r4, %r2; +; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r8, %r5, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; 
CHECK-NOF32X2-NEXT: ret; ; @@ -671,10 +736,12 @@ define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_ftz_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_ftz_param_1]; -; CHECK-NEXT: div.rn.ftz.f32 %r5, %r2, %r4; -; CHECK-NEXT: div.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_ftz_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fdiv_ftz_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: div.rn.ftz.f32 %r5, %r4, %r2; +; CHECK-NEXT: div.rn.ftz.f32 %r6, %r3, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NEXT: ret; %r = fdiv <2 x float> %a, %b @@ -689,20 +756,22 @@ define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_ftz_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_ftz_param_1]; -; CHECK-NEXT: div.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_ftz_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_frem_ftz_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: div.rn.ftz.f32 %r5, %r4, %r2; ; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5; ; CHECK-NEXT: neg.ftz.f32 %r7, %r6; -; CHECK-NEXT: fma.rn.ftz.f32 %r8, %r7, %r4, %r2; -; CHECK-NEXT: testp.infinite.f32 %p1, %r4; -; CHECK-NEXT: selp.f32 %r9, %r2, %r8, %p1; -; CHECK-NEXT: div.rn.ftz.f32 %r10, %r1, %r3; +; CHECK-NEXT: fma.rn.ftz.f32 %r8, %r7, %r2, %r4; +; CHECK-NEXT: testp.infinite.f32 %p1, %r2; +; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1; +; CHECK-NEXT: div.rn.ftz.f32 %r10, %r3, %r1; ; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10; ; CHECK-NEXT: neg.ftz.f32 %r12, %r11; -; CHECK-NEXT: fma.rn.ftz.f32 %r13, %r12, %r3, %r1; -; CHECK-NEXT: testp.infinite.f32 %p2, %r3; -; CHECK-NEXT: selp.f32 %r14, %r1, %r13, %p2; +; CHECK-NEXT: fma.rn.ftz.f32 %r13, %r12, %r1, %r3; +; CHECK-NEXT: testp.infinite.f32 %p2, %r1; +; CHECK-NEXT: selp.f32 %r14, %r3, %r13, %p2; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; ; CHECK-NEXT: ret; %r = frem <2 x float> %a, %b @@ -877,14 +946,18 @@ define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> % ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_param_2]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_param_3]; -; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r5; -; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r6; -; CHECK-NEXT: ld.param.v2.b32 {%r7, %r8}, [test_select_cc_param_1]; -; CHECK-NEXT: selp.f32 %r9, %r2, %r8, %p2; -; CHECK-NEXT: selp.f32 %r10, %r1, %r7, %p1; +; CHECK-NEXT: ld.param.b64 %rd4, [test_select_cc_param_3]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_select_cc_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r1; +; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2; +; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1; +; CHECK-NEXT: selp.f32 %r9, %r8, %r6, %p2; +; CHECK-NEXT: selp.f32 %r10, %r7, %r5, %p1; ; CHECK-NEXT: 
st.param.v2.b32 [func_retval0], {%r10, %r9}; ; CHECK-NEXT: ret; %cc = fcmp une <2 x float> %c, %d @@ -902,10 +975,12 @@ define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2 ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1]; ; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f64_f32_param_2]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f64_f32_param_3]; -; CHECK-NEXT: setp.neu.f32 %p1, %r1, %r3; -; CHECK-NEXT: setp.neu.f32 %p2, %r2, %r4; +; CHECK-NEXT: ld.param.b64 %rd6, [test_select_cc_f64_f32_param_3]; +; CHECK-NEXT: ld.param.b64 %rd5, [test_select_cc_f64_f32_param_2]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd6; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd5; +; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r1; +; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r2; ; CHECK-NEXT: selp.f64 %rd7, %rd2, %rd4, %p2; ; CHECK-NEXT: selp.f64 %rd8, %rd1, %rd3, %p1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd7}; @@ -925,12 +1000,14 @@ define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_select_cc_f32_f64_param_3]; ; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_2]; -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f64_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f64_param_0]; ; CHECK-NEXT: setp.neu.f64 %p1, %rd3, %rd5; ; CHECK-NEXT: setp.neu.f64 %p2, %rd4, %rd6; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f64_param_1]; -; CHECK-NEXT: selp.f32 %r5, %r2, %r4, %p2; -; CHECK-NEXT: selp.f32 %r6, %r1, %r3, %p1; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: selp.f32 %r5, %r4, %r2, %p2; +; CHECK-NEXT: selp.f32 %r6, %r3, %r1, %p1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NEXT: ret; %cc = fcmp une <2 x double> %c, %d @@ -947,10 +1024,12 @@ define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_une_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_une_param_1]; -; CHECK-NEXT: setp.neu.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.neu.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_une_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_une_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.neu.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.neu.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -969,10 +1048,12 @@ define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ueq_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ueq_param_1]; -; CHECK-NEXT: setp.equ.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.equ.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ueq_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ueq_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.equ.f32 %p1, %r4, %r2; +; CHECK-NEXT: 
setp.equ.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -991,10 +1072,12 @@ define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ugt_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ugt_param_1]; -; CHECK-NEXT: setp.gtu.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.gtu.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ugt_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ugt_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.gtu.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.gtu.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1013,10 +1096,12 @@ define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uge_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uge_param_1]; -; CHECK-NEXT: setp.geu.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.geu.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uge_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_uge_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.geu.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.geu.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1035,10 +1120,12 @@ define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ult_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ult_param_1]; -; CHECK-NEXT: setp.ltu.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.ltu.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ult_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ult_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.ltu.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.ltu.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1057,10 +1144,12 @@ define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ule_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ule_param_1]; -; CHECK-NEXT: setp.leu.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.leu.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ule_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ule_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.leu.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.leu.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1079,10 +1168,12 @@ define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 
{%r1, %r2}, [test_fcmp_uno_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uno_param_1]; -; CHECK-NEXT: setp.nan.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.nan.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uno_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_uno_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.nan.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.nan.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1101,10 +1192,12 @@ define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_one_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_one_param_1]; -; CHECK-NEXT: setp.ne.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.ne.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_one_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_one_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.ne.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.ne.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1123,10 +1216,12 @@ define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oeq_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oeq_param_1]; -; CHECK-NEXT: setp.eq.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.eq.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oeq_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_oeq_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.eq.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.eq.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1145,10 +1240,12 @@ define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ogt_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ogt_param_1]; -; CHECK-NEXT: setp.gt.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.gt.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ogt_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ogt_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.gt.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.gt.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1167,10 +1264,12 @@ define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oge_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oge_param_1]; -; CHECK-NEXT: setp.ge.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.ge.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oge_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_oge_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: 
mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.ge.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.ge.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1189,10 +1288,12 @@ define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_olt_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_olt_param_1]; -; CHECK-NEXT: setp.lt.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.lt.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_olt_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_olt_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.lt.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.lt.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1211,10 +1312,12 @@ define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ole_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ole_param_1]; -; CHECK-NEXT: setp.le.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.le.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ole_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ole_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.le.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.le.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1233,10 +1336,12 @@ define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ord_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ord_param_1]; -; CHECK-NEXT: setp.num.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.num.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ord_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ord_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.num.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.num.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1253,7 +1358,8 @@ define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i32_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rzi.s32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rzi.s32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1269,7 +1375,8 @@ define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i64_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rzi.s64.f32 %rd2, %r2; ; CHECK-NEXT: cvt.rzi.s64.f32 %rd3, %r1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; 
@@ -1285,7 +1392,8 @@ define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi32_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rzi.u32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rzi.u32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1301,7 +1409,8 @@ define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi64_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rzi.u64.f32 %rd2, %r2; ; CHECK-NEXT: cvt.rzi.u64.f32 %rd3, %r1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; @@ -1380,9 +1489,10 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 { ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: ; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_1]; ; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r3, %r1; ; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r4, %r2; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, %r4; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, %r3; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; @@ -1431,7 +1541,8 @@ define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fpext_2xdouble_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fpext_2xdouble_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.f64.f32 %rd2, %r2; ; CHECK-NEXT: cvt.f64.f32 %rd3, %r1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; @@ -1499,7 +1610,8 @@ define <2 x float> @test_sqrt(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sqrt_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_sqrt_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: sqrt.rn.f32 %r3, %r2; ; CHECK-NEXT: sqrt.rn.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1522,7 +1634,8 @@ define <2 x float> @test_sin(<2 x float> %a) #0 #1 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sin_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_sin_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: sin.approx.f32 %r3, %r2; ; CHECK-NEXT: sin.approx.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1538,7 +1651,8 @@ define <2 x float> @test_cos(<2 x float> %a) #0 #1 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_cos_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_cos_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cos.approx.f32 %r3, %r2; ; CHECK-NEXT: cos.approx.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1597,11 +1711,14 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 ; CHECK-NOF32X2-NEXT: .reg 
.b64 %rd<4>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_param_2]; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r2, %r4, %r6; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r1, %r3, %r5; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd3, [test_fma_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fma_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fma_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd3; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r5, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -1627,7 +1744,8 @@ define <2 x float> @test_fabs(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fabs_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fabs_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: abs.f32 %r3, %r2; ; CHECK-NEXT: abs.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1643,10 +1761,12 @@ define <2 x float> @test_minnum(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_minnum_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_minnum_param_1]; -; CHECK-NEXT: min.f32 %r5, %r2, %r4; -; CHECK-NEXT: min.f32 %r6, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_minnum_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_minnum_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: min.f32 %r5, %r4, %r2; +; CHECK-NEXT: min.f32 %r6, %r3, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NEXT: ret; %r = call <2 x float> @llvm.minnum(<2 x float> %a, <2 x float> %b) @@ -1660,10 +1780,12 @@ define <2 x float> @test_maxnum(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_maxnum_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_maxnum_param_1]; -; CHECK-NEXT: max.f32 %r5, %r2, %r4; -; CHECK-NEXT: max.f32 %r6, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_maxnum_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_maxnum_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: max.f32 %r5, %r4, %r2; +; CHECK-NEXT: max.f32 %r6, %r3, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NEXT: ret; %r = call <2 x float> @llvm.maxnum(<2 x float> %a, <2 x float> %b) @@ -1677,8 +1799,10 @@ define <2 x float> @test_copysign(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_copysign_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_copysign_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; ; CHECK-NEXT: copysign.f32 %r5, %r4, %r2; ; CHECK-NEXT: copysign.f32 %r6, %r3, %r1; ; CHECK-NEXT: st.param.v2.b32 
[func_retval0], {%r6, %r5}; @@ -1696,18 +1820,19 @@ define <2 x float> @test_copysign_f64(<2 x float> %a, <2 x double> %b) #0 { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_copysign_f64_param_1]; -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_f64_param_0]; -; CHECK-NEXT: abs.f32 %r3, %r2; -; CHECK-NEXT: neg.f32 %r4, %r3; +; CHECK-NEXT: ld.param.b64 %rd1, [test_copysign_f64_param_0]; ; CHECK-NEXT: shr.u64 %rd4, %rd3, 63; ; CHECK-NEXT: and.b64 %rd5, %rd4, 1; ; CHECK-NEXT: setp.ne.b64 %p1, %rd5, 0; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: abs.f32 %r3, %r2; +; CHECK-NEXT: neg.f32 %r4, %r3; ; CHECK-NEXT: selp.f32 %r5, %r4, %r3, %p1; -; CHECK-NEXT: abs.f32 %r6, %r1; -; CHECK-NEXT: neg.f32 %r7, %r6; ; CHECK-NEXT: shr.u64 %rd6, %rd2, 63; ; CHECK-NEXT: and.b64 %rd7, %rd6, 1; ; CHECK-NEXT: setp.ne.b64 %p2, %rd7, 0; +; CHECK-NEXT: abs.f32 %r6, %r1; +; CHECK-NEXT: neg.f32 %r7, %r6; ; CHECK-NEXT: selp.f32 %r8, %r7, %r6, %p2; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5}; ; CHECK-NEXT: ret; @@ -1723,8 +1848,10 @@ define <2 x double> @test_copysign_extended(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_extended_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_extended_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_copysign_extended_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_copysign_extended_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; ; CHECK-NEXT: copysign.f32 %r5, %r3, %r1; ; CHECK-NEXT: copysign.f32 %r6, %r4, %r2; ; CHECK-NEXT: cvt.f64.f32 %rd3, %r6; @@ -1743,7 +1870,8 @@ define <2 x float> @test_floor(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_floor_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_floor_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rmi.f32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rmi.f32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1759,7 +1887,8 @@ define <2 x float> @test_ceil(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_ceil_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ceil_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rpi.f32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rpi.f32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1775,7 +1904,8 @@ define <2 x float> @test_trunc(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_trunc_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rzi.f32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1791,7 +1921,8 @@ define <2 x float> @test_rint(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_rint_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_rint_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1807,7 +1938,8 @@ define <2 x 
float> @test_nearbyint(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_nearbyint_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_nearbyint_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1823,7 +1955,8 @@ define <2 x float> @test_roundeven(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_roundeven_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_roundeven_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1841,7 +1974,8 @@ define <2 x float> @test_round(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_round_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_round_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: and.b32 %r3, %r2, -2147483648; ; CHECK-NEXT: or.b32 %r4, %r3, 1056964608; ; CHECK-NEXT: add.rn.f32 %r5, %r2, %r4; @@ -1875,11 +2009,14 @@ define <2 x float> @test_fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmuladd_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmuladd_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fmuladd_param_2]; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r2, %r4, %r6; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r1, %r3, %r5; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd3, [test_fmuladd_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fmuladd_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fmuladd_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd3; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r5, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -1905,7 +2042,8 @@ define <2 x float> @test_shufflevector(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_shufflevector_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_shufflevector_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; ; CHECK-NEXT: ret; %s = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> <i32 1, i32 0> @@ -1913,16 +2051,29 @@ define <2 x float> @test_shufflevector(<2 x float> %a) #0 { } define <2 x float> @test_insertelement(<2 x float> %a, float %x) #0 { -; CHECK-LABEL: test_insertelement( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_insertelement_param_1]; -; CHECK-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_insertelement_param_0]; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_insertelement( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; 
+; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.b32 %r1, [test_insertelement_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_insertelement_param_0]; +; CHECK-NOF32X2-NEXT: { .reg .b32 tmp; mov.b64 {%r2, tmp}, %rd1; } +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_insertelement( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b32 %r1, [test_insertelement_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_insertelement_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r2, _}, %rd1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-F32X2-NEXT: ret; %i = insertelement <2 x float> %a, float %x, i64 1 ret <2 x float> %i } @@ -1957,6 +2108,43 @@ define <2 x float> @test_uitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { ret <2 x float> %r } +define void @test_trunc_to_v2bf16(<2 x float> %a, ptr %p) { +; CHECK-LABEL: test_trunc_to_v2bf16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2bf16_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2bf16_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; +; CHECK-NEXT: st.b32 [%rd2], %r3; +; CHECK-NEXT: ret; + %trunc = fptrunc <2 x float> %a to <2 x bfloat> + store <2 x bfloat> %trunc, ptr %p + ret void +} + +define void @test_trunc_to_v2f16(<2 x float> %a, ptr %p) { +; CHECK-LABEL: test_trunc_to_v2f16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2f16_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2f16_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: cvt.rn.f16x2.f32 %r3, %r2, %r1; +; CHECK-NEXT: st.b32 [%rd2], %r3; +; CHECK-NEXT: ret; + %trunc = fptrunc <2 x float> %a to <2 x half> + store <2 x half> %trunc, ptr %p + ret void +} + + attributes #0 = { nounwind } attributes #1 = { "unsafe-fp-math" = "true" } attributes #2 = { "denormal-fp-math"="preserve-sign" } diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll index f1adc34..9a051b3 100644 --- a/llvm/test/CodeGen/NVPTX/i1-select.ll +++ b/llvm/test/CodeGen/NVPTX/i1-select.ll @@ -94,27 +94,27 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) { ; CHECK-LABEL: test_select_i1_basic_folding( ; CHECK: { -; CHECK-NEXT: .reg .pred %p<12>; -; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .pred %p<13>; +; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_folding_param_0]; ; CHECK-NEXT: setp.eq.b32 %p1, %r1, 0; -; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_1]; -; CHECK-NEXT: setp.ne.b32 %p2, %r3, 0; -; CHECK-NEXT: setp.eq.b32 %p3, %r3, 0; -; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_2]; -; CHECK-NEXT: setp.eq.b32 %p4, %r5, 0; -; CHECK-NEXT: ld.param.b32 %r6, [test_select_i1_basic_folding_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_basic_folding_param_1]; +; CHECK-NEXT: setp.ne.b32 %p2, %r2, 0; +; CHECK-NEXT: setp.eq.b32 %p3, %r2, 
0; +; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_2]; +; CHECK-NEXT: setp.eq.b32 %p4, %r3, 0; +; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_folding_param_3]; ; CHECK-NEXT: xor.pred %p6, %p1, %p3; -; CHECK-NEXT: ld.param.b32 %r7, [test_select_i1_basic_folding_param_4]; +; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_4]; ; CHECK-NEXT: and.pred %p7, %p6, %p4; -; CHECK-NEXT: and.pred %p8, %p2, %p4; -; CHECK-NEXT: and.pred %p9, %p3, %p7; -; CHECK-NEXT: or.pred %p10, %p9, %p8; -; CHECK-NEXT: xor.pred %p11, %p10, %p3; -; CHECK-NEXT: selp.b32 %r8, %r6, %r7, %p11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: and.pred %p9, %p2, %p4; +; CHECK-NEXT: and.pred %p10, %p3, %p7; +; CHECK-NEXT: or.pred %p11, %p10, %p9; +; CHECK-NEXT: xor.pred %p12, %p11, %p3; +; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p12; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; %b1 = icmp eq i32 %v1, 0 %b2 = icmp eq i32 %v2, 0 diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index f2211eb..44d8558 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -5,9 +5,9 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-LABEL: srem_i128( ; CHECK: { -; CHECK-NEXT: .reg .pred %p<22>; +; CHECK-NEXT: .reg .pred %p<20>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<126>; +; CHECK-NEXT: .reg .b64 %rd<127>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [srem_i128_param_0]; @@ -42,103 +42,102 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd62, %r4; ; CHECK-NEXT: add.s64 %rd63, %rd62, 64; ; CHECK-NEXT: selp.b64 %rd64, %rd61, %rd63, %p7; -; CHECK-NEXT: mov.b64 %rd116, 0; +; CHECK-NEXT: mov.b64 %rd117, 0; ; CHECK-NEXT: sub.cc.s64 %rd66, %rd60, %rd64; -; CHECK-NEXT: subc.cc.s64 %rd8, %rd116, 0; -; CHECK-NEXT: setp.ne.b64 %p8, %rd8, 0; -; CHECK-NEXT: and.pred %p10, %p8, %p8; -; CHECK-NEXT: setp.eq.b64 %p11, %rd8, 0; -; CHECK-NEXT: setp.gt.u64 %p12, %rd66, 127; -; CHECK-NEXT: and.pred %p13, %p11, %p12; -; CHECK-NEXT: or.pred %p14, %p13, %p10; -; CHECK-NEXT: or.pred %p15, %p5, %p14; -; CHECK-NEXT: xor.b64 %rd67, %rd66, 127; -; CHECK-NEXT: or.b64 %rd68, %rd67, %rd8; -; CHECK-NEXT: setp.eq.b64 %p16, %rd68, 0; -; CHECK-NEXT: selp.b64 %rd125, 0, %rd4, %p15; -; CHECK-NEXT: selp.b64 %rd124, 0, %rd3, %p15; -; CHECK-NEXT: or.pred %p17, %p15, %p16; -; CHECK-NEXT: @%p17 bra $L__BB0_5; +; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0; +; CHECK-NEXT: setp.gt.u64 %p8, %rd66, 127; +; CHECK-NEXT: setp.eq.b64 %p9, %rd67, 0; +; CHECK-NEXT: and.pred %p10, %p9, %p8; +; CHECK-NEXT: setp.ne.b64 %p11, %rd67, 0; +; CHECK-NEXT: or.pred %p12, %p10, %p11; +; CHECK-NEXT: or.pred %p13, %p5, %p12; +; CHECK-NEXT: xor.b64 %rd68, %rd66, 127; +; CHECK-NEXT: or.b64 %rd69, %rd68, %rd67; +; CHECK-NEXT: setp.eq.b64 %p14, %rd69, 0; +; CHECK-NEXT: selp.b64 %rd126, 0, %rd4, %p13; +; CHECK-NEXT: selp.b64 %rd125, 0, %rd3, %p13; +; CHECK-NEXT: or.pred %p15, %p13, %p14; +; CHECK-NEXT: @%p15 bra $L__BB0_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd118, %rd66, 1; -; CHECK-NEXT: addc.cc.s64 %rd119, %rd8, 0; -; CHECK-NEXT: or.b64 %rd71, %rd118, %rd119; -; CHECK-NEXT: setp.eq.b64 %p18, %rd71, 0; +; CHECK-NEXT: add.cc.s64 %rd119, %rd66, 1; +; CHECK-NEXT: addc.cc.s64 %rd120, %rd67, 0; +; CHECK-NEXT: or.b64 %rd72, %rd119, %rd120; +; CHECK-NEXT: setp.eq.b64 %p16, %rd72, 0; ; CHECK-NEXT: cvt.u32.u64 %r5, %rd66; ; CHECK-NEXT: 
sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd72, %rd4, %r6; +; CHECK-NEXT: shl.b64 %rd73, %rd4, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd73, %rd3, %r7; -; CHECK-NEXT: or.b64 %rd74, %rd72, %rd73; +; CHECK-NEXT: shr.u64 %rd74, %rd3, %r7; +; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd75, %rd3, %r8; -; CHECK-NEXT: setp.gt.s32 %p19, %r6, 63; -; CHECK-NEXT: selp.b64 %rd123, %rd75, %rd74, %p19; -; CHECK-NEXT: shl.b64 %rd122, %rd3, %r6; -; CHECK-NEXT: mov.b64 %rd113, %rd116; -; CHECK-NEXT: @%p18 bra $L__BB0_4; +; CHECK-NEXT: shl.b64 %rd76, %rd3, %r8; +; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63; +; CHECK-NEXT: selp.b64 %rd124, %rd76, %rd75, %p17; +; CHECK-NEXT: shl.b64 %rd123, %rd3, %r6; +; CHECK-NEXT: mov.b64 %rd114, %rd117; +; CHECK-NEXT: @%p16 bra $L__BB0_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd118; -; CHECK-NEXT: shr.u64 %rd78, %rd3, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd119; +; CHECK-NEXT: shr.u64 %rd79, %rd3, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd79, %rd4, %r10; -; CHECK-NEXT: or.b64 %rd80, %rd78, %rd79; +; CHECK-NEXT: shl.b64 %rd80, %rd4, %r10; +; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd81, %rd4, %r11; -; CHECK-NEXT: setp.gt.s32 %p20, %r9, 63; -; CHECK-NEXT: selp.b64 %rd120, %rd81, %rd80, %p20; -; CHECK-NEXT: shr.u64 %rd121, %rd4, %r9; +; CHECK-NEXT: shr.u64 %rd82, %rd4, %r11; +; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63; +; CHECK-NEXT: selp.b64 %rd121, %rd82, %rd81, %p18; +; CHECK-NEXT: shr.u64 %rd122, %rd4, %r9; ; CHECK-NEXT: add.cc.s64 %rd35, %rd5, -1; ; CHECK-NEXT: addc.cc.s64 %rd36, %rd6, -1; -; CHECK-NEXT: mov.b64 %rd113, 0; -; CHECK-NEXT: mov.b64 %rd116, %rd113; +; CHECK-NEXT: mov.b64 %rd114, 0; +; CHECK-NEXT: mov.b64 %rd117, %rd114; ; CHECK-NEXT: $L__BB0_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd82, %rd120, 63; -; CHECK-NEXT: shl.b64 %rd83, %rd121, 1; -; CHECK-NEXT: or.b64 %rd84, %rd83, %rd82; -; CHECK-NEXT: shl.b64 %rd85, %rd120, 1; -; CHECK-NEXT: shr.u64 %rd86, %rd123, 63; -; CHECK-NEXT: or.b64 %rd87, %rd85, %rd86; -; CHECK-NEXT: shr.u64 %rd88, %rd122, 63; -; CHECK-NEXT: shl.b64 %rd89, %rd123, 1; -; CHECK-NEXT: or.b64 %rd90, %rd89, %rd88; -; CHECK-NEXT: shl.b64 %rd91, %rd122, 1; -; CHECK-NEXT: or.b64 %rd122, %rd116, %rd91; -; CHECK-NEXT: or.b64 %rd123, %rd113, %rd90; -; CHECK-NEXT: sub.cc.s64 %rd92, %rd35, %rd87; -; CHECK-NEXT: subc.cc.s64 %rd93, %rd36, %rd84; -; CHECK-NEXT: shr.s64 %rd94, %rd93, 63; -; CHECK-NEXT: and.b64 %rd116, %rd94, 1; -; CHECK-NEXT: and.b64 %rd95, %rd94, %rd5; -; CHECK-NEXT: and.b64 %rd96, %rd94, %rd6; -; CHECK-NEXT: sub.cc.s64 %rd120, %rd87, %rd95; -; CHECK-NEXT: subc.cc.s64 %rd121, %rd84, %rd96; -; CHECK-NEXT: add.cc.s64 %rd118, %rd118, -1; -; CHECK-NEXT: addc.cc.s64 %rd119, %rd119, -1; -; CHECK-NEXT: or.b64 %rd97, %rd118, %rd119; -; CHECK-NEXT: setp.eq.b64 %p21, %rd97, 0; -; CHECK-NEXT: @%p21 bra $L__BB0_4; +; CHECK-NEXT: shr.u64 %rd83, %rd121, 63; +; CHECK-NEXT: shl.b64 %rd84, %rd122, 1; +; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83; +; CHECK-NEXT: shl.b64 %rd86, %rd121, 1; +; CHECK-NEXT: shr.u64 %rd87, %rd124, 63; +; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87; +; CHECK-NEXT: shr.u64 %rd89, %rd123, 63; +; CHECK-NEXT: shl.b64 %rd90, %rd124, 1; +; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; +; CHECK-NEXT: shl.b64 %rd92, %rd123, 1; +; CHECK-NEXT: or.b64 %rd123, %rd117, %rd92; +; CHECK-NEXT: or.b64 
%rd124, %rd114, %rd91; +; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88; +; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85; +; CHECK-NEXT: shr.s64 %rd95, %rd94, 63; +; CHECK-NEXT: and.b64 %rd117, %rd95, 1; +; CHECK-NEXT: and.b64 %rd96, %rd95, %rd5; +; CHECK-NEXT: and.b64 %rd97, %rd95, %rd6; +; CHECK-NEXT: sub.cc.s64 %rd121, %rd88, %rd96; +; CHECK-NEXT: subc.cc.s64 %rd122, %rd85, %rd97; +; CHECK-NEXT: add.cc.s64 %rd119, %rd119, -1; +; CHECK-NEXT: addc.cc.s64 %rd120, %rd120, -1; +; CHECK-NEXT: or.b64 %rd98, %rd119, %rd120; +; CHECK-NEXT: setp.eq.b64 %p19, %rd98, 0; +; CHECK-NEXT: @%p19 bra $L__BB0_4; ; CHECK-NEXT: bra.uni $L__BB0_2; ; CHECK-NEXT: $L__BB0_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd98, %rd122, 63; -; CHECK-NEXT: shl.b64 %rd99, %rd123, 1; -; CHECK-NEXT: or.b64 %rd100, %rd99, %rd98; -; CHECK-NEXT: shl.b64 %rd101, %rd122, 1; -; CHECK-NEXT: or.b64 %rd124, %rd116, %rd101; -; CHECK-NEXT: or.b64 %rd125, %rd113, %rd100; +; CHECK-NEXT: shr.u64 %rd99, %rd123, 63; +; CHECK-NEXT: shl.b64 %rd100, %rd124, 1; +; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99; +; CHECK-NEXT: shl.b64 %rd102, %rd123, 1; +; CHECK-NEXT: or.b64 %rd125, %rd117, %rd102; +; CHECK-NEXT: or.b64 %rd126, %rd114, %rd101; ; CHECK-NEXT: $L__BB0_5: // %udiv-end -; CHECK-NEXT: mul.hi.u64 %rd102, %rd5, %rd124; -; CHECK-NEXT: mad.lo.s64 %rd103, %rd5, %rd125, %rd102; -; CHECK-NEXT: mad.lo.s64 %rd104, %rd6, %rd124, %rd103; -; CHECK-NEXT: mul.lo.s64 %rd105, %rd5, %rd124; -; CHECK-NEXT: sub.cc.s64 %rd106, %rd3, %rd105; -; CHECK-NEXT: subc.cc.s64 %rd107, %rd4, %rd104; -; CHECK-NEXT: xor.b64 %rd108, %rd106, %rd2; +; CHECK-NEXT: mul.hi.u64 %rd103, %rd5, %rd125; +; CHECK-NEXT: mad.lo.s64 %rd104, %rd5, %rd126, %rd103; +; CHECK-NEXT: mad.lo.s64 %rd105, %rd6, %rd125, %rd104; +; CHECK-NEXT: mul.lo.s64 %rd106, %rd5, %rd125; +; CHECK-NEXT: sub.cc.s64 %rd107, %rd3, %rd106; +; CHECK-NEXT: subc.cc.s64 %rd108, %rd4, %rd105; ; CHECK-NEXT: xor.b64 %rd109, %rd107, %rd2; -; CHECK-NEXT: sub.cc.s64 %rd110, %rd108, %rd2; -; CHECK-NEXT: subc.cc.s64 %rd111, %rd109, %rd2; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd110, %rd111}; +; CHECK-NEXT: xor.b64 %rd110, %rd108, %rd2; +; CHECK-NEXT: sub.cc.s64 %rd111, %rd109, %rd2; +; CHECK-NEXT: subc.cc.s64 %rd112, %rd110, %rd2; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd111, %rd112}; ; CHECK-NEXT: ret; %div = srem i128 %lhs, %rhs ret i128 %div @@ -149,7 +148,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<18>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<111>; +; CHECK-NEXT: .reg .b64 %rd<113>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [urem_i128_param_0]; @@ -173,98 +172,98 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd52, %r4; ; CHECK-NEXT: add.s64 %rd53, %rd52, 64; ; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5; -; CHECK-NEXT: mov.b64 %rd101, 0; -; CHECK-NEXT: sub.cc.s64 %rd5, %rd50, %rd54; -; CHECK-NEXT: subc.cc.s64 %rd6, %rd101, 0; -; CHECK-NEXT: setp.gt.u64 %p6, %rd5, 127; -; CHECK-NEXT: setp.eq.b64 %p7, %rd6, 0; +; CHECK-NEXT: mov.b64 %rd103, 0; +; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54; +; CHECK-NEXT: subc.cc.s64 %rd57, %rd103, 0; +; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127; +; CHECK-NEXT: setp.eq.b64 %p7, %rd57, 0; ; CHECK-NEXT: and.pred %p8, %p7, %p6; -; CHECK-NEXT: setp.ne.b64 %p9, %rd6, 0; +; CHECK-NEXT: setp.ne.b64 %p9, %rd57, 0; ; CHECK-NEXT: or.pred %p10, %p8, %p9; ; CHECK-NEXT: or.pred %p11, %p3, %p10; -; CHECK-NEXT: xor.b64 %rd56, 
%rd5, 127; -; CHECK-NEXT: or.b64 %rd57, %rd56, %rd6; -; CHECK-NEXT: setp.eq.b64 %p12, %rd57, 0; -; CHECK-NEXT: selp.b64 %rd110, 0, %rd42, %p11; -; CHECK-NEXT: selp.b64 %rd109, 0, %rd41, %p11; +; CHECK-NEXT: xor.b64 %rd58, %rd56, 127; +; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57; +; CHECK-NEXT: setp.eq.b64 %p12, %rd59, 0; +; CHECK-NEXT: selp.b64 %rd112, 0, %rd42, %p11; +; CHECK-NEXT: selp.b64 %rd111, 0, %rd41, %p11; ; CHECK-NEXT: or.pred %p13, %p11, %p12; ; CHECK-NEXT: @%p13 bra $L__BB1_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd103, %rd5, 1; -; CHECK-NEXT: addc.cc.s64 %rd104, %rd6, 0; -; CHECK-NEXT: or.b64 %rd60, %rd103, %rd104; -; CHECK-NEXT: setp.eq.b64 %p14, %rd60, 0; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd5; +; CHECK-NEXT: add.cc.s64 %rd105, %rd56, 1; +; CHECK-NEXT: addc.cc.s64 %rd106, %rd57, 0; +; CHECK-NEXT: or.b64 %rd62, %rd105, %rd106; +; CHECK-NEXT: setp.eq.b64 %p14, %rd62, 0; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd56; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd61, %rd42, %r6; +; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd62, %rd41, %r7; -; CHECK-NEXT: or.b64 %rd63, %rd61, %rd62; +; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7; +; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd64, %rd41, %r8; +; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8; ; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63; -; CHECK-NEXT: selp.b64 %rd108, %rd64, %rd63, %p15; -; CHECK-NEXT: shl.b64 %rd107, %rd41, %r6; -; CHECK-NEXT: mov.b64 %rd98, %rd101; +; CHECK-NEXT: selp.b64 %rd110, %rd66, %rd65, %p15; +; CHECK-NEXT: shl.b64 %rd109, %rd41, %r6; +; CHECK-NEXT: mov.b64 %rd100, %rd103; ; CHECK-NEXT: @%p14 bra $L__BB1_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd103; -; CHECK-NEXT: shr.u64 %rd67, %rd41, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd105; +; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd68, %rd42, %r10; -; CHECK-NEXT: or.b64 %rd69, %rd67, %rd68; +; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10; +; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd70, %rd42, %r11; +; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11; ; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63; -; CHECK-NEXT: selp.b64 %rd105, %rd70, %rd69, %p16; -; CHECK-NEXT: shr.u64 %rd106, %rd42, %r9; +; CHECK-NEXT: selp.b64 %rd107, %rd72, %rd71, %p16; +; CHECK-NEXT: shr.u64 %rd108, %rd42, %r9; ; CHECK-NEXT: add.cc.s64 %rd33, %rd3, -1; ; CHECK-NEXT: addc.cc.s64 %rd34, %rd4, -1; -; CHECK-NEXT: mov.b64 %rd98, 0; -; CHECK-NEXT: mov.b64 %rd101, %rd98; +; CHECK-NEXT: mov.b64 %rd100, 0; +; CHECK-NEXT: mov.b64 %rd103, %rd100; ; CHECK-NEXT: $L__BB1_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd71, %rd105, 63; -; CHECK-NEXT: shl.b64 %rd72, %rd106, 1; -; CHECK-NEXT: or.b64 %rd73, %rd72, %rd71; -; CHECK-NEXT: shl.b64 %rd74, %rd105, 1; -; CHECK-NEXT: shr.u64 %rd75, %rd108, 63; -; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75; -; CHECK-NEXT: shr.u64 %rd77, %rd107, 63; -; CHECK-NEXT: shl.b64 %rd78, %rd108, 1; -; CHECK-NEXT: or.b64 %rd79, %rd78, %rd77; -; CHECK-NEXT: shl.b64 %rd80, %rd107, 1; -; CHECK-NEXT: or.b64 %rd107, %rd101, %rd80; -; CHECK-NEXT: or.b64 %rd108, %rd98, %rd79; -; CHECK-NEXT: sub.cc.s64 %rd81, %rd33, %rd76; -; CHECK-NEXT: subc.cc.s64 %rd82, %rd34, %rd73; -; CHECK-NEXT: shr.s64 %rd83, %rd82, 63; -; CHECK-NEXT: and.b64 %rd101, %rd83, 1; -; CHECK-NEXT: and.b64 
%rd84, %rd83, %rd3; -; CHECK-NEXT: and.b64 %rd85, %rd83, %rd4; -; CHECK-NEXT: sub.cc.s64 %rd105, %rd76, %rd84; -; CHECK-NEXT: subc.cc.s64 %rd106, %rd73, %rd85; -; CHECK-NEXT: add.cc.s64 %rd103, %rd103, -1; -; CHECK-NEXT: addc.cc.s64 %rd104, %rd104, -1; -; CHECK-NEXT: or.b64 %rd86, %rd103, %rd104; -; CHECK-NEXT: setp.eq.b64 %p17, %rd86, 0; +; CHECK-NEXT: shr.u64 %rd73, %rd107, 63; +; CHECK-NEXT: shl.b64 %rd74, %rd108, 1; +; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73; +; CHECK-NEXT: shl.b64 %rd76, %rd107, 1; +; CHECK-NEXT: shr.u64 %rd77, %rd110, 63; +; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77; +; CHECK-NEXT: shr.u64 %rd79, %rd109, 63; +; CHECK-NEXT: shl.b64 %rd80, %rd110, 1; +; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79; +; CHECK-NEXT: shl.b64 %rd82, %rd109, 1; +; CHECK-NEXT: or.b64 %rd109, %rd103, %rd82; +; CHECK-NEXT: or.b64 %rd110, %rd100, %rd81; +; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78; +; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75; +; CHECK-NEXT: shr.s64 %rd85, %rd84, 63; +; CHECK-NEXT: and.b64 %rd103, %rd85, 1; +; CHECK-NEXT: and.b64 %rd86, %rd85, %rd3; +; CHECK-NEXT: and.b64 %rd87, %rd85, %rd4; +; CHECK-NEXT: sub.cc.s64 %rd107, %rd78, %rd86; +; CHECK-NEXT: subc.cc.s64 %rd108, %rd75, %rd87; +; CHECK-NEXT: add.cc.s64 %rd105, %rd105, -1; +; CHECK-NEXT: addc.cc.s64 %rd106, %rd106, -1; +; CHECK-NEXT: or.b64 %rd88, %rd105, %rd106; +; CHECK-NEXT: setp.eq.b64 %p17, %rd88, 0; ; CHECK-NEXT: @%p17 bra $L__BB1_4; ; CHECK-NEXT: bra.uni $L__BB1_2; ; CHECK-NEXT: $L__BB1_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd87, %rd107, 63; -; CHECK-NEXT: shl.b64 %rd88, %rd108, 1; -; CHECK-NEXT: or.b64 %rd89, %rd88, %rd87; -; CHECK-NEXT: shl.b64 %rd90, %rd107, 1; -; CHECK-NEXT: or.b64 %rd109, %rd101, %rd90; -; CHECK-NEXT: or.b64 %rd110, %rd98, %rd89; +; CHECK-NEXT: shr.u64 %rd89, %rd109, 63; +; CHECK-NEXT: shl.b64 %rd90, %rd110, 1; +; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; +; CHECK-NEXT: shl.b64 %rd92, %rd109, 1; +; CHECK-NEXT: or.b64 %rd111, %rd103, %rd92; +; CHECK-NEXT: or.b64 %rd112, %rd100, %rd91; ; CHECK-NEXT: $L__BB1_5: // %udiv-end -; CHECK-NEXT: mul.hi.u64 %rd91, %rd3, %rd109; -; CHECK-NEXT: mad.lo.s64 %rd92, %rd3, %rd110, %rd91; -; CHECK-NEXT: mad.lo.s64 %rd93, %rd4, %rd109, %rd92; -; CHECK-NEXT: mul.lo.s64 %rd94, %rd3, %rd109; -; CHECK-NEXT: sub.cc.s64 %rd95, %rd41, %rd94; -; CHECK-NEXT: subc.cc.s64 %rd96, %rd42, %rd93; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd95, %rd96}; +; CHECK-NEXT: mul.hi.u64 %rd93, %rd3, %rd111; +; CHECK-NEXT: mad.lo.s64 %rd94, %rd3, %rd112, %rd93; +; CHECK-NEXT: mad.lo.s64 %rd95, %rd4, %rd111, %rd94; +; CHECK-NEXT: mul.lo.s64 %rd96, %rd3, %rd111; +; CHECK-NEXT: sub.cc.s64 %rd97, %rd41, %rd96; +; CHECK-NEXT: subc.cc.s64 %rd98, %rd42, %rd95; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd97, %rd98}; ; CHECK-NEXT: ret; %div = urem i128 %lhs, %rhs ret i128 %div @@ -307,9 +306,9 @@ define i128 @urem_i128_pow2k(i128 %lhs) { define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-LABEL: sdiv_i128( ; CHECK: { -; CHECK-NEXT: .reg .pred %p<22>; +; CHECK-NEXT: .reg .pred %p<20>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<121>; +; CHECK-NEXT: .reg .b64 %rd<122>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [sdiv_i128_param_0]; @@ -345,97 +344,96 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd63, %r4; ; CHECK-NEXT: add.s64 %rd64, %rd63, 64; ; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7; -; CHECK-NEXT: mov.b64 %rd111, 0; +; CHECK-NEXT: mov.b64 %rd112, 0; ; 
CHECK-NEXT: sub.cc.s64 %rd67, %rd61, %rd65; -; CHECK-NEXT: subc.cc.s64 %rd8, %rd111, 0; -; CHECK-NEXT: setp.ne.b64 %p8, %rd8, 0; -; CHECK-NEXT: and.pred %p10, %p8, %p8; -; CHECK-NEXT: setp.eq.b64 %p11, %rd8, 0; -; CHECK-NEXT: setp.gt.u64 %p12, %rd67, 127; -; CHECK-NEXT: and.pred %p13, %p11, %p12; -; CHECK-NEXT: or.pred %p14, %p13, %p10; -; CHECK-NEXT: or.pred %p15, %p5, %p14; -; CHECK-NEXT: xor.b64 %rd68, %rd67, 127; -; CHECK-NEXT: or.b64 %rd69, %rd68, %rd8; -; CHECK-NEXT: setp.eq.b64 %p16, %rd69, 0; -; CHECK-NEXT: selp.b64 %rd120, 0, %rd2, %p15; -; CHECK-NEXT: selp.b64 %rd119, 0, %rd1, %p15; -; CHECK-NEXT: or.pred %p17, %p15, %p16; -; CHECK-NEXT: @%p17 bra $L__BB4_5; +; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0; +; CHECK-NEXT: setp.gt.u64 %p8, %rd67, 127; +; CHECK-NEXT: setp.eq.b64 %p9, %rd68, 0; +; CHECK-NEXT: and.pred %p10, %p9, %p8; +; CHECK-NEXT: setp.ne.b64 %p11, %rd68, 0; +; CHECK-NEXT: or.pred %p12, %p10, %p11; +; CHECK-NEXT: or.pred %p13, %p5, %p12; +; CHECK-NEXT: xor.b64 %rd69, %rd67, 127; +; CHECK-NEXT: or.b64 %rd70, %rd69, %rd68; +; CHECK-NEXT: setp.eq.b64 %p14, %rd70, 0; +; CHECK-NEXT: selp.b64 %rd121, 0, %rd2, %p13; +; CHECK-NEXT: selp.b64 %rd120, 0, %rd1, %p13; +; CHECK-NEXT: or.pred %p15, %p13, %p14; +; CHECK-NEXT: @%p15 bra $L__BB4_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd113, %rd67, 1; -; CHECK-NEXT: addc.cc.s64 %rd114, %rd8, 0; -; CHECK-NEXT: or.b64 %rd72, %rd113, %rd114; -; CHECK-NEXT: setp.eq.b64 %p18, %rd72, 0; +; CHECK-NEXT: add.cc.s64 %rd114, %rd67, 1; +; CHECK-NEXT: addc.cc.s64 %rd115, %rd68, 0; +; CHECK-NEXT: or.b64 %rd73, %rd114, %rd115; +; CHECK-NEXT: setp.eq.b64 %p16, %rd73, 0; ; CHECK-NEXT: cvt.u32.u64 %r5, %rd67; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd73, %rd2, %r6; +; CHECK-NEXT: shl.b64 %rd74, %rd2, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd74, %rd1, %r7; -; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74; +; CHECK-NEXT: shr.u64 %rd75, %rd1, %r7; +; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd76, %rd1, %r8; -; CHECK-NEXT: setp.gt.s32 %p19, %r6, 63; -; CHECK-NEXT: selp.b64 %rd118, %rd76, %rd75, %p19; -; CHECK-NEXT: shl.b64 %rd117, %rd1, %r6; -; CHECK-NEXT: mov.b64 %rd108, %rd111; -; CHECK-NEXT: @%p18 bra $L__BB4_4; +; CHECK-NEXT: shl.b64 %rd77, %rd1, %r8; +; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63; +; CHECK-NEXT: selp.b64 %rd119, %rd77, %rd76, %p17; +; CHECK-NEXT: shl.b64 %rd118, %rd1, %r6; +; CHECK-NEXT: mov.b64 %rd109, %rd112; +; CHECK-NEXT: @%p16 bra $L__BB4_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd113; -; CHECK-NEXT: shr.u64 %rd79, %rd1, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd114; +; CHECK-NEXT: shr.u64 %rd80, %rd1, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd80, %rd2, %r10; -; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80; +; CHECK-NEXT: shl.b64 %rd81, %rd2, %r10; +; CHECK-NEXT: or.b64 %rd82, %rd80, %rd81; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd82, %rd2, %r11; -; CHECK-NEXT: setp.gt.s32 %p20, %r9, 63; -; CHECK-NEXT: selp.b64 %rd115, %rd82, %rd81, %p20; -; CHECK-NEXT: shr.u64 %rd116, %rd2, %r9; +; CHECK-NEXT: shr.u64 %rd83, %rd2, %r11; +; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63; +; CHECK-NEXT: selp.b64 %rd116, %rd83, %rd82, %p18; +; CHECK-NEXT: shr.u64 %rd117, %rd2, %r9; ; CHECK-NEXT: add.cc.s64 %rd35, %rd3, -1; ; CHECK-NEXT: addc.cc.s64 %rd36, %rd4, -1; -; CHECK-NEXT: mov.b64 %rd108, 0; -; CHECK-NEXT: mov.b64 %rd111, %rd108; +; CHECK-NEXT: mov.b64 
%rd109, 0; +; CHECK-NEXT: mov.b64 %rd112, %rd109; ; CHECK-NEXT: $L__BB4_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd83, %rd115, 63; -; CHECK-NEXT: shl.b64 %rd84, %rd116, 1; -; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83; -; CHECK-NEXT: shl.b64 %rd86, %rd115, 1; -; CHECK-NEXT: shr.u64 %rd87, %rd118, 63; -; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87; -; CHECK-NEXT: shr.u64 %rd89, %rd117, 63; -; CHECK-NEXT: shl.b64 %rd90, %rd118, 1; -; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; -; CHECK-NEXT: shl.b64 %rd92, %rd117, 1; -; CHECK-NEXT: or.b64 %rd117, %rd111, %rd92; -; CHECK-NEXT: or.b64 %rd118, %rd108, %rd91; -; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88; -; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85; -; CHECK-NEXT: shr.s64 %rd95, %rd94, 63; -; CHECK-NEXT: and.b64 %rd111, %rd95, 1; -; CHECK-NEXT: and.b64 %rd96, %rd95, %rd3; -; CHECK-NEXT: and.b64 %rd97, %rd95, %rd4; -; CHECK-NEXT: sub.cc.s64 %rd115, %rd88, %rd96; -; CHECK-NEXT: subc.cc.s64 %rd116, %rd85, %rd97; -; CHECK-NEXT: add.cc.s64 %rd113, %rd113, -1; -; CHECK-NEXT: addc.cc.s64 %rd114, %rd114, -1; -; CHECK-NEXT: or.b64 %rd98, %rd113, %rd114; -; CHECK-NEXT: setp.eq.b64 %p21, %rd98, 0; -; CHECK-NEXT: @%p21 bra $L__BB4_4; +; CHECK-NEXT: shr.u64 %rd84, %rd116, 63; +; CHECK-NEXT: shl.b64 %rd85, %rd117, 1; +; CHECK-NEXT: or.b64 %rd86, %rd85, %rd84; +; CHECK-NEXT: shl.b64 %rd87, %rd116, 1; +; CHECK-NEXT: shr.u64 %rd88, %rd119, 63; +; CHECK-NEXT: or.b64 %rd89, %rd87, %rd88; +; CHECK-NEXT: shr.u64 %rd90, %rd118, 63; +; CHECK-NEXT: shl.b64 %rd91, %rd119, 1; +; CHECK-NEXT: or.b64 %rd92, %rd91, %rd90; +; CHECK-NEXT: shl.b64 %rd93, %rd118, 1; +; CHECK-NEXT: or.b64 %rd118, %rd112, %rd93; +; CHECK-NEXT: or.b64 %rd119, %rd109, %rd92; +; CHECK-NEXT: sub.cc.s64 %rd94, %rd35, %rd89; +; CHECK-NEXT: subc.cc.s64 %rd95, %rd36, %rd86; +; CHECK-NEXT: shr.s64 %rd96, %rd95, 63; +; CHECK-NEXT: and.b64 %rd112, %rd96, 1; +; CHECK-NEXT: and.b64 %rd97, %rd96, %rd3; +; CHECK-NEXT: and.b64 %rd98, %rd96, %rd4; +; CHECK-NEXT: sub.cc.s64 %rd116, %rd89, %rd97; +; CHECK-NEXT: subc.cc.s64 %rd117, %rd86, %rd98; +; CHECK-NEXT: add.cc.s64 %rd114, %rd114, -1; +; CHECK-NEXT: addc.cc.s64 %rd115, %rd115, -1; +; CHECK-NEXT: or.b64 %rd99, %rd114, %rd115; +; CHECK-NEXT: setp.eq.b64 %p19, %rd99, 0; +; CHECK-NEXT: @%p19 bra $L__BB4_4; ; CHECK-NEXT: bra.uni $L__BB4_2; ; CHECK-NEXT: $L__BB4_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd99, %rd117, 63; -; CHECK-NEXT: shl.b64 %rd100, %rd118, 1; -; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99; -; CHECK-NEXT: shl.b64 %rd102, %rd117, 1; -; CHECK-NEXT: or.b64 %rd119, %rd111, %rd102; -; CHECK-NEXT: or.b64 %rd120, %rd108, %rd101; +; CHECK-NEXT: shr.u64 %rd100, %rd118, 63; +; CHECK-NEXT: shl.b64 %rd101, %rd119, 1; +; CHECK-NEXT: or.b64 %rd102, %rd101, %rd100; +; CHECK-NEXT: shl.b64 %rd103, %rd118, 1; +; CHECK-NEXT: or.b64 %rd120, %rd112, %rd103; +; CHECK-NEXT: or.b64 %rd121, %rd109, %rd102; ; CHECK-NEXT: $L__BB4_5: // %udiv-end -; CHECK-NEXT: xor.b64 %rd103, %rd119, %rd5; ; CHECK-NEXT: xor.b64 %rd104, %rd120, %rd5; -; CHECK-NEXT: sub.cc.s64 %rd105, %rd103, %rd5; -; CHECK-NEXT: subc.cc.s64 %rd106, %rd104, %rd5; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd105, %rd106}; +; CHECK-NEXT: xor.b64 %rd105, %rd121, %rd5; +; CHECK-NEXT: sub.cc.s64 %rd106, %rd104, %rd5; +; CHECK-NEXT: subc.cc.s64 %rd107, %rd105, %rd5; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd106, %rd107}; ; CHECK-NEXT: ret; %div = sdiv i128 %lhs, %rhs ret i128 %div @@ -446,7 +444,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK: 
{ ; CHECK-NEXT: .reg .pred %p<18>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<105>; +; CHECK-NEXT: .reg .b64 %rd<107>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [udiv_i128_param_0]; @@ -470,92 +468,92 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd52, %r4; ; CHECK-NEXT: add.s64 %rd53, %rd52, 64; ; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5; -; CHECK-NEXT: mov.b64 %rd95, 0; -; CHECK-NEXT: sub.cc.s64 %rd5, %rd50, %rd54; -; CHECK-NEXT: subc.cc.s64 %rd6, %rd95, 0; -; CHECK-NEXT: setp.gt.u64 %p6, %rd5, 127; -; CHECK-NEXT: setp.eq.b64 %p7, %rd6, 0; +; CHECK-NEXT: mov.b64 %rd97, 0; +; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54; +; CHECK-NEXT: subc.cc.s64 %rd57, %rd97, 0; +; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127; +; CHECK-NEXT: setp.eq.b64 %p7, %rd57, 0; ; CHECK-NEXT: and.pred %p8, %p7, %p6; -; CHECK-NEXT: setp.ne.b64 %p9, %rd6, 0; +; CHECK-NEXT: setp.ne.b64 %p9, %rd57, 0; ; CHECK-NEXT: or.pred %p10, %p8, %p9; ; CHECK-NEXT: or.pred %p11, %p3, %p10; -; CHECK-NEXT: xor.b64 %rd56, %rd5, 127; -; CHECK-NEXT: or.b64 %rd57, %rd56, %rd6; -; CHECK-NEXT: setp.eq.b64 %p12, %rd57, 0; -; CHECK-NEXT: selp.b64 %rd104, 0, %rd42, %p11; -; CHECK-NEXT: selp.b64 %rd103, 0, %rd41, %p11; +; CHECK-NEXT: xor.b64 %rd58, %rd56, 127; +; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57; +; CHECK-NEXT: setp.eq.b64 %p12, %rd59, 0; +; CHECK-NEXT: selp.b64 %rd106, 0, %rd42, %p11; +; CHECK-NEXT: selp.b64 %rd105, 0, %rd41, %p11; ; CHECK-NEXT: or.pred %p13, %p11, %p12; ; CHECK-NEXT: @%p13 bra $L__BB5_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd97, %rd5, 1; -; CHECK-NEXT: addc.cc.s64 %rd98, %rd6, 0; -; CHECK-NEXT: or.b64 %rd60, %rd97, %rd98; -; CHECK-NEXT: setp.eq.b64 %p14, %rd60, 0; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd5; +; CHECK-NEXT: add.cc.s64 %rd99, %rd56, 1; +; CHECK-NEXT: addc.cc.s64 %rd100, %rd57, 0; +; CHECK-NEXT: or.b64 %rd62, %rd99, %rd100; +; CHECK-NEXT: setp.eq.b64 %p14, %rd62, 0; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd56; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd61, %rd42, %r6; +; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd62, %rd41, %r7; -; CHECK-NEXT: or.b64 %rd63, %rd61, %rd62; +; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7; +; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd64, %rd41, %r8; +; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8; ; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63; -; CHECK-NEXT: selp.b64 %rd102, %rd64, %rd63, %p15; -; CHECK-NEXT: shl.b64 %rd101, %rd41, %r6; -; CHECK-NEXT: mov.b64 %rd92, %rd95; +; CHECK-NEXT: selp.b64 %rd104, %rd66, %rd65, %p15; +; CHECK-NEXT: shl.b64 %rd103, %rd41, %r6; +; CHECK-NEXT: mov.b64 %rd94, %rd97; ; CHECK-NEXT: @%p14 bra $L__BB5_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd97; -; CHECK-NEXT: shr.u64 %rd67, %rd41, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd99; +; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd68, %rd42, %r10; -; CHECK-NEXT: or.b64 %rd69, %rd67, %rd68; +; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10; +; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd70, %rd42, %r11; +; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11; ; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63; -; CHECK-NEXT: selp.b64 %rd99, %rd70, %rd69, %p16; -; CHECK-NEXT: shr.u64 %rd100, %rd42, %r9; +; CHECK-NEXT: selp.b64 %rd101, 
%rd72, %rd71, %p16; +; CHECK-NEXT: shr.u64 %rd102, %rd42, %r9; ; CHECK-NEXT: add.cc.s64 %rd33, %rd43, -1; ; CHECK-NEXT: addc.cc.s64 %rd34, %rd44, -1; -; CHECK-NEXT: mov.b64 %rd92, 0; -; CHECK-NEXT: mov.b64 %rd95, %rd92; +; CHECK-NEXT: mov.b64 %rd94, 0; +; CHECK-NEXT: mov.b64 %rd97, %rd94; ; CHECK-NEXT: $L__BB5_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd71, %rd99, 63; -; CHECK-NEXT: shl.b64 %rd72, %rd100, 1; -; CHECK-NEXT: or.b64 %rd73, %rd72, %rd71; -; CHECK-NEXT: shl.b64 %rd74, %rd99, 1; -; CHECK-NEXT: shr.u64 %rd75, %rd102, 63; -; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75; -; CHECK-NEXT: shr.u64 %rd77, %rd101, 63; -; CHECK-NEXT: shl.b64 %rd78, %rd102, 1; -; CHECK-NEXT: or.b64 %rd79, %rd78, %rd77; -; CHECK-NEXT: shl.b64 %rd80, %rd101, 1; -; CHECK-NEXT: or.b64 %rd101, %rd95, %rd80; -; CHECK-NEXT: or.b64 %rd102, %rd92, %rd79; -; CHECK-NEXT: sub.cc.s64 %rd81, %rd33, %rd76; -; CHECK-NEXT: subc.cc.s64 %rd82, %rd34, %rd73; -; CHECK-NEXT: shr.s64 %rd83, %rd82, 63; -; CHECK-NEXT: and.b64 %rd95, %rd83, 1; -; CHECK-NEXT: and.b64 %rd84, %rd83, %rd43; -; CHECK-NEXT: and.b64 %rd85, %rd83, %rd44; -; CHECK-NEXT: sub.cc.s64 %rd99, %rd76, %rd84; -; CHECK-NEXT: subc.cc.s64 %rd100, %rd73, %rd85; -; CHECK-NEXT: add.cc.s64 %rd97, %rd97, -1; -; CHECK-NEXT: addc.cc.s64 %rd98, %rd98, -1; -; CHECK-NEXT: or.b64 %rd86, %rd97, %rd98; -; CHECK-NEXT: setp.eq.b64 %p17, %rd86, 0; +; CHECK-NEXT: shr.u64 %rd73, %rd101, 63; +; CHECK-NEXT: shl.b64 %rd74, %rd102, 1; +; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73; +; CHECK-NEXT: shl.b64 %rd76, %rd101, 1; +; CHECK-NEXT: shr.u64 %rd77, %rd104, 63; +; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77; +; CHECK-NEXT: shr.u64 %rd79, %rd103, 63; +; CHECK-NEXT: shl.b64 %rd80, %rd104, 1; +; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79; +; CHECK-NEXT: shl.b64 %rd82, %rd103, 1; +; CHECK-NEXT: or.b64 %rd103, %rd97, %rd82; +; CHECK-NEXT: or.b64 %rd104, %rd94, %rd81; +; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78; +; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75; +; CHECK-NEXT: shr.s64 %rd85, %rd84, 63; +; CHECK-NEXT: and.b64 %rd97, %rd85, 1; +; CHECK-NEXT: and.b64 %rd86, %rd85, %rd43; +; CHECK-NEXT: and.b64 %rd87, %rd85, %rd44; +; CHECK-NEXT: sub.cc.s64 %rd101, %rd78, %rd86; +; CHECK-NEXT: subc.cc.s64 %rd102, %rd75, %rd87; +; CHECK-NEXT: add.cc.s64 %rd99, %rd99, -1; +; CHECK-NEXT: addc.cc.s64 %rd100, %rd100, -1; +; CHECK-NEXT: or.b64 %rd88, %rd99, %rd100; +; CHECK-NEXT: setp.eq.b64 %p17, %rd88, 0; ; CHECK-NEXT: @%p17 bra $L__BB5_4; ; CHECK-NEXT: bra.uni $L__BB5_2; ; CHECK-NEXT: $L__BB5_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd87, %rd101, 63; -; CHECK-NEXT: shl.b64 %rd88, %rd102, 1; -; CHECK-NEXT: or.b64 %rd89, %rd88, %rd87; -; CHECK-NEXT: shl.b64 %rd90, %rd101, 1; -; CHECK-NEXT: or.b64 %rd103, %rd95, %rd90; -; CHECK-NEXT: or.b64 %rd104, %rd92, %rd89; +; CHECK-NEXT: shr.u64 %rd89, %rd103, 63; +; CHECK-NEXT: shl.b64 %rd90, %rd104, 1; +; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; +; CHECK-NEXT: shl.b64 %rd92, %rd103, 1; +; CHECK-NEXT: or.b64 %rd105, %rd97, %rd92; +; CHECK-NEXT: or.b64 %rd106, %rd94, %rd91; ; CHECK-NEXT: $L__BB5_5: // %udiv-end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd103, %rd104}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd105, %rd106}; ; CHECK-NEXT: ret; %div = udiv i128 %lhs, %rhs ret i128 %div diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll index 1a61498..2b7a06c 100644 --- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll +++ 
b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll @@ -32,31 +32,57 @@ define <2 x i16> @test_ret_const() #0 { } define i16 @test_extract_0(<2 x i16> %a) #0 { -; COMMON-LABEL: test_extract_0( -; COMMON: { -; COMMON-NEXT: .reg .b16 %rs<3>; -; COMMON-NEXT: .reg .b32 %r<3>; -; COMMON-EMPTY: -; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_0_param_0]; -; COMMON-NEXT: cvt.u32.u16 %r2, %rs1; -; COMMON-NEXT: st.param.b32 [func_retval0], %r2; -; COMMON-NEXT: ret; +; I16x2-LABEL: test_extract_0( +; I16x2: { +; I16x2-NEXT: .reg .b16 %rs<2>; +; I16x2-NEXT: .reg .b32 %r<3>; +; I16x2-EMPTY: +; I16x2-NEXT: // %bb.0: +; I16x2-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; +; I16x2-NEXT: mov.b32 {%rs1, _}, %r1; +; I16x2-NEXT: cvt.u32.u16 %r2, %rs1; +; I16x2-NEXT: st.param.b32 [func_retval0], %r2; +; I16x2-NEXT: ret; +; +; NO-I16x2-LABEL: test_extract_0( +; NO-I16x2: { +; NO-I16x2-NEXT: .reg .b16 %rs<2>; +; NO-I16x2-NEXT: .reg .b32 %r<3>; +; NO-I16x2-EMPTY: +; NO-I16x2-NEXT: // %bb.0: +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; +; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; } +; NO-I16x2-NEXT: cvt.u32.u16 %r2, %rs1; +; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r2; +; NO-I16x2-NEXT: ret; %e = extractelement <2 x i16> %a, i32 0 ret i16 %e } define i16 @test_extract_1(<2 x i16> %a) #0 { -; COMMON-LABEL: test_extract_1( -; COMMON: { -; COMMON-NEXT: .reg .b16 %rs<3>; -; COMMON-NEXT: .reg .b32 %r<3>; -; COMMON-EMPTY: -; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_1_param_0]; -; COMMON-NEXT: cvt.u32.u16 %r2, %rs2; -; COMMON-NEXT: st.param.b32 [func_retval0], %r2; -; COMMON-NEXT: ret; +; I16x2-LABEL: test_extract_1( +; I16x2: { +; I16x2-NEXT: .reg .b16 %rs<2>; +; I16x2-NEXT: .reg .b32 %r<3>; +; I16x2-EMPTY: +; I16x2-NEXT: // %bb.0: +; I16x2-NEXT: ld.param.b32 %r1, [test_extract_1_param_0]; +; I16x2-NEXT: mov.b32 {_, %rs1}, %r1; +; I16x2-NEXT: cvt.u32.u16 %r2, %rs1; +; I16x2-NEXT: st.param.b32 [func_retval0], %r2; +; I16x2-NEXT: ret; +; +; NO-I16x2-LABEL: test_extract_1( +; NO-I16x2: { +; NO-I16x2-NEXT: .reg .b16 %rs<2>; +; NO-I16x2-NEXT: .reg .b32 %r<3>; +; NO-I16x2-EMPTY: +; NO-I16x2-NEXT: // %bb.0: +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_extract_1_param_0]; +; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; } +; NO-I16x2-NEXT: cvt.u32.u16 %r2, %rs1; +; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r2; +; NO-I16x2-NEXT: ret; %e = extractelement <2 x i16> %a, i32 1 ret i16 %e } @@ -71,8 +97,9 @@ define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 { ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_i_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; ; COMMON-NEXT: setp.eq.b64 %p1, %rd1, 0; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; ; COMMON-NEXT: cvt.u32.u16 %r2, %rs3; ; COMMON-NEXT: st.param.b32 [func_retval0], %r2; @@ -99,10 +126,12 @@ define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_add_param_0]; -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_add_param_1]; -; NO-I16x2-NEXT: add.s16 %rs5, %rs2, %rs4; -; NO-I16x2-NEXT: add.s16 %rs6, %rs1, %rs3; +; NO-I16x2-NEXT: ld.param.b32 %r2, [test_add_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_param_0]; +; 
NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; NO-I16x2-NEXT: add.s16 %rs5, %rs4, %rs2; +; NO-I16x2-NEXT: add.s16 %rs6, %rs3, %rs1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %r = add <2 x i16> %a, %b @@ -128,7 +157,8 @@ define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<2>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_add_imm_0_param_0]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_0_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; NO-I16x2-NEXT: add.s16 %rs3, %rs2, 2; ; NO-I16x2-NEXT: add.s16 %rs4, %rs1, 1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -155,7 +185,8 @@ define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<2>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_add_imm_1_param_0]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_1_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; NO-I16x2-NEXT: add.s16 %rs3, %rs2, 2; ; NO-I16x2-NEXT: add.s16 %rs4, %rs1, 1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -171,10 +202,12 @@ define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sub_param_0]; -; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_sub_param_1]; -; COMMON-NEXT: sub.s16 %rs5, %rs2, %rs4; -; COMMON-NEXT: sub.s16 %rs6, %rs1, %rs3; +; COMMON-NEXT: ld.param.b32 %r2, [test_sub_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_sub_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; COMMON-NEXT: sub.s16 %rs5, %rs4, %rs2; +; COMMON-NEXT: sub.s16 %rs6, %rs3, %rs1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; COMMON-NEXT: ret; %r = sub <2 x i16> %a, %b @@ -199,10 +232,12 @@ define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_smax_param_0]; -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_smax_param_1]; -; NO-I16x2-NEXT: max.s16 %rs5, %rs2, %rs4; -; NO-I16x2-NEXT: max.s16 %rs6, %rs1, %rs3; +; NO-I16x2-NEXT: ld.param.b32 %r2, [test_smax_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_smax_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; NO-I16x2-NEXT: max.s16 %rs5, %rs4, %rs2; +; NO-I16x2-NEXT: max.s16 %rs6, %rs3, %rs1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %cmp = icmp sgt <2 x i16> %a, %b @@ -228,10 +263,12 @@ define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_umax_param_0]; -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_umax_param_1]; -; NO-I16x2-NEXT: max.u16 %rs5, %rs2, %rs4; -; NO-I16x2-NEXT: max.u16 %rs6, %rs1, %rs3; +; NO-I16x2-NEXT: ld.param.b32 %r2, [test_umax_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_umax_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; NO-I16x2-NEXT: max.u16 %rs5, %rs4, %rs2; +; NO-I16x2-NEXT: max.u16 %rs6, %rs3, %rs1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; 
%cmp = icmp ugt <2 x i16> %a, %b @@ -257,10 +294,12 @@ define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_smin_param_0]; -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_smin_param_1]; -; NO-I16x2-NEXT: min.s16 %rs5, %rs2, %rs4; -; NO-I16x2-NEXT: min.s16 %rs6, %rs1, %rs3; +; NO-I16x2-NEXT: ld.param.b32 %r2, [test_smin_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_smin_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; NO-I16x2-NEXT: min.s16 %rs5, %rs4, %rs2; +; NO-I16x2-NEXT: min.s16 %rs6, %rs3, %rs1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %cmp = icmp sle <2 x i16> %a, %b @@ -286,10 +325,12 @@ define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_umin_param_0]; -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_umin_param_1]; -; NO-I16x2-NEXT: min.u16 %rs5, %rs2, %rs4; -; NO-I16x2-NEXT: min.u16 %rs6, %rs1, %rs3; +; NO-I16x2-NEXT: ld.param.b32 %r2, [test_umin_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_umin_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; NO-I16x2-NEXT: min.u16 %rs5, %rs4, %rs2; +; NO-I16x2-NEXT: min.u16 %rs6, %rs3, %rs1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %cmp = icmp ule <2 x i16> %a, %b @@ -304,10 +345,12 @@ define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_mul_param_0]; -; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_mul_param_1]; -; COMMON-NEXT: mul.lo.s16 %rs5, %rs2, %rs4; -; COMMON-NEXT: mul.lo.s16 %rs6, %rs1, %rs3; +; COMMON-NEXT: ld.param.b32 %r2, [test_mul_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_mul_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; COMMON-NEXT: mul.lo.s16 %rs5, %rs4, %rs2; +; COMMON-NEXT: mul.lo.s16 %rs6, %rs3, %rs1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; COMMON-NEXT: ret; %r = mul <2 x i16> %a, %b @@ -686,14 +729,18 @@ define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x ; COMMON-NEXT: .reg .b32 %r<5>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0]; -; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_2]; -; COMMON-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_select_cc_param_3]; -; COMMON-NEXT: setp.ne.b16 %p1, %rs3, %rs5; -; COMMON-NEXT: setp.ne.b16 %p2, %rs4, %rs6; -; COMMON-NEXT: ld.param.v2.b16 {%rs7, %rs8}, [test_select_cc_param_1]; -; COMMON-NEXT: selp.b16 %rs9, %rs2, %rs8, %p2; -; COMMON-NEXT: selp.b16 %rs10, %rs1, %rs7, %p1; +; COMMON-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; +; COMMON-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; +; COMMON-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; COMMON-NEXT: setp.ne.b16 %p1, %rs3, %rs1; +; COMMON-NEXT: setp.ne.b16 %p2, %rs4, %rs2; +; COMMON-NEXT: mov.b32 {%rs5, %rs6}, %r2; +; COMMON-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; 
COMMON-NEXT: selp.b16 %rs9, %rs8, %rs6, %p2; +; COMMON-NEXT: selp.b16 %rs10, %rs7, %rs5, %p1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs10, %rs9}; ; COMMON-NEXT: ret; %cc = icmp ne <2 x i16> %c, %d @@ -711,10 +758,12 @@ define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b, ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i32_i16_param_1]; ; COMMON-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_i32_i16_param_0]; -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_i32_i16_param_2]; -; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_i32_i16_param_3]; -; COMMON-NEXT: setp.ne.b16 %p1, %rs1, %rs3; -; COMMON-NEXT: setp.ne.b16 %p2, %rs2, %rs4; +; COMMON-NEXT: ld.param.b32 %r6, [test_select_cc_i32_i16_param_3]; +; COMMON-NEXT: ld.param.b32 %r5, [test_select_cc_i32_i16_param_2]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r6; +; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r5; +; COMMON-NEXT: setp.ne.b16 %p1, %rs3, %rs1; +; COMMON-NEXT: setp.ne.b16 %p2, %rs4, %rs2; ; COMMON-NEXT: selp.b32 %r7, %r2, %r4, %p2; ; COMMON-NEXT: selp.b32 %r8, %r1, %r3, %p1; ; COMMON-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; @@ -735,12 +784,14 @@ define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b, ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_i16_i32_param_3]; ; COMMON-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i16_i32_param_2]; -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_i16_i32_param_0]; +; COMMON-NEXT: ld.param.b32 %r2, [test_select_cc_i16_i32_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_select_cc_i16_i32_param_0]; ; COMMON-NEXT: setp.ne.b32 %p1, %r3, %r5; ; COMMON-NEXT: setp.ne.b32 %p2, %r4, %r6; -; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_i16_i32_param_1]; -; COMMON-NEXT: selp.b16 %rs5, %rs2, %rs4, %p2; -; COMMON-NEXT: selp.b16 %rs6, %rs1, %rs3, %p1; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; COMMON-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; +; COMMON-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; COMMON-NEXT: ret; <2 x i32> %c, <2 x i32> %d) #0 { @@ -851,7 +902,8 @@ define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_zext_2xi32_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_zext_2xi32_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: cvt.u32.u16 %r2, %rs2; ; COMMON-NEXT: cvt.u32.u16 %r3, %rs1; ; COMMON-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; @@ -868,7 +920,8 @@ define <2 x i64> @test_zext_2xi64(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b64 %rd<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_zext_2xi64_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_zext_2xi64_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: cvt.u64.u16 %rd1, %rs2; ; COMMON-NEXT: cvt.u64.u16 %rd2, %rs1; ; COMMON-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -926,7 +979,8 @@ define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_shufflevector_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; ; 
COMMON-NEXT: ret; %s = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0> @@ -934,16 +988,29 @@ define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 { } define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 { -; COMMON-LABEL: test_insertelement( -; COMMON: { -; COMMON-NEXT: .reg .b16 %rs<4>; -; COMMON-NEXT: .reg .b32 %r<2>; -; COMMON-EMPTY: -; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; -; COMMON-NEXT: ld.param.v2.b16 {%rs2, %rs3}, [test_insertelement_param_0]; -; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; -; COMMON-NEXT: ret; +; I16x2-LABEL: test_insertelement( +; I16x2: { +; I16x2-NEXT: .reg .b16 %rs<3>; +; I16x2-NEXT: .reg .b32 %r<2>; +; I16x2-EMPTY: +; I16x2-NEXT: // %bb.0: +; I16x2-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; +; I16x2-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; +; I16x2-NEXT: mov.b32 {%rs2, _}, %r1; +; I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; I16x2-NEXT: ret; +; +; NO-I16x2-LABEL: test_insertelement( +; NO-I16x2: { +; NO-I16x2-NEXT: .reg .b16 %rs<3>; +; NO-I16x2-NEXT: .reg .b32 %r<2>; +; NO-I16x2-EMPTY: +; NO-I16x2-NEXT: // %bb.0: +; NO-I16x2-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; +; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; } +; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; NO-I16x2-NEXT: ret; %i = insertelement <2 x i16> %a, i16 %x, i64 1 ret <2 x i16> %i } @@ -955,7 +1022,8 @@ define <2 x i16> @test_fptosi_2xhalf_to_2xi16(<2 x half> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptosi_2xhalf_to_2xi16_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_fptosi_2xhalf_to_2xi16_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; ; COMMON-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -971,7 +1039,8 @@ define <2 x i16> @test_fptoui_2xhalf_to_2xi16(<2 x half> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptoui_2xhalf_to_2xi16_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_fptoui_2xhalf_to_2xi16_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; ; COMMON-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index cbc9f70..da99cec 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -1935,16 +1935,18 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 { ; O0-NEXT: .reg .b32 %r<12>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: -; O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptosi_4xhalf_to_4xi8_param_0]; -; O0-NEXT: cvt.rzi.s16.f16 %rs5, %rs4; -; O0-NEXT: cvt.rzi.s16.f16 %rs6, %rs3; -; O0-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; O0-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; O0-NEXT: cvt.u32.u16 %r4, %rs8; -; O0-NEXT: cvt.u32.u16 %r5, %rs7; +; O0-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_4xhalf_to_4xi8_param_0]; +; O0-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; O0-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; +; O0-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; +; O0-NEXT: mov.b32 %r3, {%rs4, %rs3}; +; O0-NEXT: mov.b32 {%rs5, %rs6}, %r3; +; 
O0-NEXT: cvt.u32.u16 %r4, %rs6; +; O0-NEXT: cvt.u32.u16 %r5, %rs5; ; O0-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; -; O0-NEXT: cvt.rzi.s16.f16 %rs9, %rs2; -; O0-NEXT: cvt.rzi.s16.f16 %rs10, %rs1; +; O0-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; O0-NEXT: cvt.rzi.s16.f16 %rs9, %rs8; +; O0-NEXT: cvt.rzi.s16.f16 %rs10, %rs7; ; O0-NEXT: mov.b32 %r7, {%rs10, %rs9}; ; O0-NEXT: mov.b32 {%rs11, %rs12}, %r7; ; O0-NEXT: cvt.u32.u16 %r8, %rs12; @@ -1989,16 +1991,18 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 { ; O0-NEXT: .reg .b32 %r<12>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: -; O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptoui_4xhalf_to_4xi8_param_0]; -; O0-NEXT: cvt.rzi.u16.f16 %rs5, %rs4; -; O0-NEXT: cvt.rzi.u16.f16 %rs6, %rs3; -; O0-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; O0-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; O0-NEXT: cvt.u32.u16 %r4, %rs8; -; O0-NEXT: cvt.u32.u16 %r5, %rs7; +; O0-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_4xhalf_to_4xi8_param_0]; +; O0-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; O0-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; +; O0-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; +; O0-NEXT: mov.b32 %r3, {%rs4, %rs3}; +; O0-NEXT: mov.b32 {%rs5, %rs6}, %r3; +; O0-NEXT: cvt.u32.u16 %r4, %rs6; +; O0-NEXT: cvt.u32.u16 %r5, %rs5; ; O0-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; -; O0-NEXT: cvt.rzi.u16.f16 %rs9, %rs2; -; O0-NEXT: cvt.rzi.u16.f16 %rs10, %rs1; +; O0-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; O0-NEXT: cvt.rzi.u16.f16 %rs9, %rs8; +; O0-NEXT: cvt.rzi.u16.f16 %rs10, %rs7; ; O0-NEXT: mov.b32 %r7, {%rs10, %rs9}; ; O0-NEXT: mov.b32 {%rs11, %rs12}, %r7; ; O0-NEXT: cvt.u32.u16 %r8, %rs12; @@ -2040,7 +2044,7 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) { ; O0-LABEL: test_srem_v4i8( ; O0: { ; O0-NEXT: .reg .b16 %rs<13>; -; O0-NEXT: .reg .b32 %r<18>; +; O0-NEXT: .reg .b32 %r<16>; ; O0-NEXT: .reg .b64 %rd<4>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: // %entry @@ -2062,27 +2066,25 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) { ; O0-NEXT: rem.s16 %rs6, %rs5, %rs4; ; O0-NEXT: cvt.u32.u16 %r8, %rs6; ; O0-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; -; O0-NEXT: prmt.b32 %r10, %r2, 0, 0x9991U; -; O0-NEXT: cvt.u16.u32 %rs7, %r10; -; O0-NEXT: prmt.b32 %r11, %r1, 0, 0x9991U; -; O0-NEXT: cvt.u16.u32 %rs8, %r11; +; O0-NEXT: cvt.s8.s32 %rs7, %r2; +; O0-NEXT: cvt.s8.s32 %rs8, %r1; ; O0-NEXT: rem.s16 %rs9, %rs8, %rs7; -; O0-NEXT: cvt.u32.u16 %r12, %rs9; -; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x8880U; -; O0-NEXT: cvt.u16.u32 %rs10, %r13; -; O0-NEXT: prmt.b32 %r14, %r1, 0, 0x8880U; -; O0-NEXT: cvt.u16.u32 %rs11, %r14; +; O0-NEXT: cvt.u32.u16 %r10, %rs9; +; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x9991U; +; O0-NEXT: cvt.u16.u32 %rs10, %r11; +; O0-NEXT: prmt.b32 %r12, %r1, 0, 0x9991U; +; O0-NEXT: cvt.u16.u32 %rs11, %r12; ; O0-NEXT: rem.s16 %rs12, %rs11, %rs10; -; O0-NEXT: cvt.u32.u16 %r15, %rs12; -; O0-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; -; O0-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; -; O0-NEXT: st.b32 [%rd3], %r17; +; O0-NEXT: cvt.u32.u16 %r13, %rs12; +; O0-NEXT: prmt.b32 %r14, %r10, %r13, 0x3340U; +; O0-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; +; O0-NEXT: st.b32 [%rd3], %r15; ; O0-NEXT: ret; ; ; O3-LABEL: test_srem_v4i8( ; O3: { ; O3-NEXT: .reg .b16 %rs<13>; -; O3-NEXT: .reg .b32 %r<18>; +; O3-NEXT: .reg .b32 %r<16>; ; O3-NEXT: .reg .b64 %rd<4>; ; O3-EMPTY: ; O3-NEXT: // %bb.0: // %entry @@ -2104,21 +2106,19 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) { ; O3-NEXT: rem.s16 %rs6, %rs5, %rs4; ; O3-NEXT: cvt.u32.u16 %r8, %rs6; ; O3-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; -; O3-NEXT: prmt.b32 %r10, %r2, 0, 0x9991U; -; 
O3-NEXT: cvt.u16.u32 %rs7, %r10; -; O3-NEXT: prmt.b32 %r11, %r1, 0, 0x9991U; -; O3-NEXT: cvt.u16.u32 %rs8, %r11; +; O3-NEXT: cvt.s8.s32 %rs7, %r2; +; O3-NEXT: cvt.s8.s32 %rs8, %r1; ; O3-NEXT: rem.s16 %rs9, %rs8, %rs7; -; O3-NEXT: cvt.u32.u16 %r12, %rs9; -; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x8880U; -; O3-NEXT: cvt.u16.u32 %rs10, %r13; -; O3-NEXT: prmt.b32 %r14, %r1, 0, 0x8880U; -; O3-NEXT: cvt.u16.u32 %rs11, %r14; +; O3-NEXT: cvt.u32.u16 %r10, %rs9; +; O3-NEXT: prmt.b32 %r11, %r2, 0, 0x9991U; +; O3-NEXT: cvt.u16.u32 %rs10, %r11; +; O3-NEXT: prmt.b32 %r12, %r1, 0, 0x9991U; +; O3-NEXT: cvt.u16.u32 %rs11, %r12; ; O3-NEXT: rem.s16 %rs12, %rs11, %rs10; -; O3-NEXT: cvt.u32.u16 %r15, %rs12; -; O3-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; -; O3-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; -; O3-NEXT: st.b32 [%rd3], %r17; +; O3-NEXT: cvt.u32.u16 %r13, %rs12; +; O3-NEXT: prmt.b32 %r14, %r10, %r13, 0x3340U; +; O3-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; +; O3-NEXT: st.b32 [%rd3], %r15; ; O3-NEXT: ret; entry: %t57 = load <4 x i8>, ptr %a, align 4 @@ -2138,7 +2138,7 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; O0-LABEL: test_srem_v3i8( ; O0: { ; O0-NEXT: .reg .b16 %rs<20>; -; O0-NEXT: .reg .b32 %r<14>; +; O0-NEXT: .reg .b32 %r<8>; ; O0-NEXT: .reg .b64 %rd<4>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: // %entry @@ -2157,25 +2157,19 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; O0-NEXT: or.b16 %rs9, %rs8, %rs6; ; O0-NEXT: cvt.u32.u16 %r2, %rs9; ; O0-NEXT: ld.s8 %rs10, [%rd2+2]; -; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x9991U; -; O0-NEXT: cvt.u16.u32 %rs11, %r3; -; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x9991U; -; O0-NEXT: cvt.u16.u32 %rs12, %r4; +; O0-NEXT: cvt.s16.s8 %rs11, %rs9; +; O0-NEXT: cvt.s16.s8 %rs12, %rs4; ; O0-NEXT: rem.s16 %rs13, %rs12, %rs11; -; O0-NEXT: cvt.u32.u16 %r5, %rs13; -; O0-NEXT: prmt.b32 %r6, %r2, 0, 0x8880U; -; O0-NEXT: cvt.u16.u32 %rs14, %r6; -; O0-NEXT: prmt.b32 %r7, %r1, 0, 0x8880U; -; O0-NEXT: cvt.u16.u32 %rs15, %r7; +; O0-NEXT: cvt.u32.u16 %r3, %rs13; +; O0-NEXT: prmt.b32 %r4, %r2, 0, 0x9991U; +; O0-NEXT: cvt.u16.u32 %rs14, %r4; +; O0-NEXT: prmt.b32 %r5, %r1, 0, 0x9991U; +; O0-NEXT: cvt.u16.u32 %rs15, %r5; ; O0-NEXT: rem.s16 %rs16, %rs15, %rs14; -; O0-NEXT: cvt.u32.u16 %r8, %rs16; -; O0-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; -; O0-NEXT: // implicit-def: %r11 -; O0-NEXT: // implicit-def: %r12 -; O0-NEXT: prmt.b32 %r10, %r11, %r12, 0x3340U; -; O0-NEXT: prmt.b32 %r13, %r9, %r10, 0x5410U; +; O0-NEXT: cvt.u32.u16 %r6, %rs16; +; O0-NEXT: prmt.b32 %r7, %r3, %r6, 0x3340U; ; O0-NEXT: rem.s16 %rs17, %rs5, %rs10; -; O0-NEXT: cvt.u16.u32 %rs18, %r13; +; O0-NEXT: cvt.u16.u32 %rs18, %r7; ; O0-NEXT: st.b8 [%rd3], %rs18; ; O0-NEXT: shr.u16 %rs19, %rs18, 8; ; O0-NEXT: st.b8 [%rd3+1], %rs19; @@ -2185,7 +2179,7 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; O3-LABEL: test_srem_v3i8( ; O3: { ; O3-NEXT: .reg .b16 %rs<20>; -; O3-NEXT: .reg .b32 %r<14>; +; O3-NEXT: .reg .b32 %r<8>; ; O3-NEXT: .reg .b64 %rd<4>; ; O3-EMPTY: ; O3-NEXT: // %bb.0: // %entry @@ -2204,24 +2198,20 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; O3-NEXT: cvt.u32.u16 %r2, %rs9; ; O3-NEXT: ld.s8 %rs10, [%rd2+2]; ; O3-NEXT: ld.param.b64 %rd3, [test_srem_v3i8_param_2]; -; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x9991U; -; O3-NEXT: cvt.u16.u32 %rs11, %r3; -; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x9991U; -; O3-NEXT: cvt.u16.u32 %rs12, %r4; +; O3-NEXT: cvt.s16.s8 %rs11, %rs9; +; O3-NEXT: cvt.s16.s8 %rs12, %rs4; ; O3-NEXT: rem.s16 %rs13, %rs12, %rs11; -; O3-NEXT: cvt.u32.u16 %r5, %rs13; -; O3-NEXT: prmt.b32 %r6, %r2, 0, 
0x8880U; -; O3-NEXT: cvt.u16.u32 %rs14, %r6; -; O3-NEXT: prmt.b32 %r7, %r1, 0, 0x8880U; -; O3-NEXT: cvt.u16.u32 %rs15, %r7; +; O3-NEXT: cvt.u32.u16 %r3, %rs13; +; O3-NEXT: prmt.b32 %r4, %r2, 0, 0x9991U; +; O3-NEXT: cvt.u16.u32 %rs14, %r4; +; O3-NEXT: prmt.b32 %r5, %r1, 0, 0x9991U; +; O3-NEXT: cvt.u16.u32 %rs15, %r5; ; O3-NEXT: rem.s16 %rs16, %rs15, %rs14; -; O3-NEXT: cvt.u32.u16 %r8, %rs16; -; O3-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; -; O3-NEXT: prmt.b32 %r10, %r11, %r12, 0x3340U; -; O3-NEXT: prmt.b32 %r13, %r9, %r10, 0x5410U; +; O3-NEXT: cvt.u32.u16 %r6, %rs16; +; O3-NEXT: prmt.b32 %r7, %r3, %r6, 0x3340U; ; O3-NEXT: rem.s16 %rs17, %rs5, %rs10; ; O3-NEXT: st.b8 [%rd3+2], %rs17; -; O3-NEXT: cvt.u16.u32 %rs18, %r13; +; O3-NEXT: cvt.u16.u32 %rs18, %r7; ; O3-NEXT: st.b8 [%rd3], %rs18; ; O3-NEXT: shr.u16 %rs19, %rs18, 8; ; O3-NEXT: st.b8 [%rd3+1], %rs19; @@ -2311,4 +2301,50 @@ entry: ret void } +define <4 x float> @test_uitofp_v4i8(<4 x i8> %a) { +; CHECK-LABEL: test_uitofp_v4i8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<10>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_uitofp_v4i8_param_0]; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U; +; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U; +; CHECK-NEXT: cvt.rn.f32.u32 %r5, %r4; +; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; CHECK-NEXT: cvt.rn.f32.u32 %r7, %r6; +; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7770U; +; CHECK-NEXT: cvt.rn.f32.u32 %r9, %r8; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r9, %r7, %r5, %r3}; +; CHECK-NEXT: ret; + %r = uitofp <4 x i8> %a to <4 x float> + ret <4 x float> %r +} + +define <4 x float> @test_sitofp_v4i8(<4 x i8> %a) { +; CHECK-LABEL: test_sitofp_v4i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_sitofp_v4i8_param_0]; +; CHECK-NEXT: cvt.s8.s32 %rs1, %r1; +; CHECK-NEXT: cvt.rn.f32.s16 %r2, %rs1; +; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0xbbb3U; +; CHECK-NEXT: cvt.u16.u32 %rs2, %r3; +; CHECK-NEXT: cvt.rn.f32.s16 %r4, %rs2; +; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xaaa2U; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; +; CHECK-NEXT: cvt.rn.f32.s16 %r6, %rs3; +; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x9991U; +; CHECK-NEXT: cvt.u16.u32 %rs4, %r7; +; CHECK-NEXT: cvt.rn.f32.s16 %r8, %rs4; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r2, %r8, %r6, %r4}; +; CHECK-NEXT: ret; + %r = sitofp <4 x i8> %a to <4 x float> + ret <4 x float> %r +} + attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/NVPTX/pr126337.ll b/llvm/test/CodeGen/NVPTX/pr126337.ll new file mode 100644 index 0000000..95258f7 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/pr126337.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 | %ptxas -arch=sm_70 -c - %} + +; This IR should compile without triggering assertions in LICM +; when the CopyToReg from %0 in the first BB gets eliminated +; but we still use its result in the second BB. +; Technically the problem happens in MIR, but there are multiple +; passes involved, so testing with the IR reproducer is more convenient. 
+; https://github.com/llvm/llvm-project/pull/126337#issuecomment-3081431594 + +target datalayout = "e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define ptx_kernel void @Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel(<2 x float> %0) { +; CHECK-LABEL: Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %.preheader15 +; CHECK-NEXT: ld.param.b64 %rd1, [Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel_param_0]; +; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; } +; CHECK-NEXT: setp.eq.f32 %p1, %r1, 0f00000000; +; CHECK-NEXT: selp.b16 %rs1, 1, 0, %p1; +; CHECK-NEXT: $L__BB0_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: mov.b64 %rd2, 0; +; CHECK-NEXT: st.b8 [%rd2], %rs1; +; CHECK-NEXT: bra.uni $L__BB0_1; +.preheader15: + br label %1 + +1: ; preds = %1, %.preheader15 + %2 = fcmp oeq <2 x float> %0, zeroinitializer + %3 = extractelement <2 x i1> %2, i64 0 + store i1 %3, ptr null, align 4 + br label %1 +} + diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index 87f965c..92cb51b 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -117,16 +117,20 @@ define float @reduce_fadd_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0]; -; CHECK-NEXT: add.rn.f32 %r9, %r5, 0f00000000; -; CHECK-NEXT: add.rn.f32 %r10, %r9, %r6; -; CHECK-NEXT: add.rn.f32 %r11, %r10, %r7; -; CHECK-NEXT: add.rn.f32 %r12, %r11, %r8; -; CHECK-NEXT: add.rn.f32 %r13, %r12, %r1; -; CHECK-NEXT: add.rn.f32 %r14, %r13, %r2; -; CHECK-NEXT: add.rn.f32 %r15, %r14, %r3; -; CHECK-NEXT: add.rn.f32 %r16, %r15, %r4; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_param_0+16]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_param_0]; +; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2; +; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1; +; CHECK-NEXT: add.rn.f32 %r9, %r7, 0f00000000; +; CHECK-NEXT: add.rn.f32 %r10, %r9, %r8; +; CHECK-NEXT: add.rn.f32 %r11, %r10, %r5; +; CHECK-NEXT: add.rn.f32 %r12, %r11, %r6; +; CHECK-NEXT: add.rn.f32 %r13, %r12, %r3; +; CHECK-NEXT: add.rn.f32 %r14, %r13, %r4; +; CHECK-NEXT: add.rn.f32 %r15, %r14, %r1; +; CHECK-NEXT: add.rn.f32 %r16, %r15, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r16; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in) @@ -140,14 +144,18 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) { ; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0]; -; CHECK-SM80-NEXT: add.rn.f32 %r9, %r7, %r3; -; CHECK-SM80-NEXT: add.rn.f32 %r10, %r5, %r1; -; CHECK-SM80-NEXT: add.rn.f32 %r11, %r8, %r4; -; CHECK-SM80-NEXT: add.rn.f32 %r12, %r6, %r2; +; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, 
[reduce_fadd_float_reassoc_param_0]; +; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM80-NEXT: add.rn.f32 %r5, %r3, %r1; +; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM80-NEXT: add.rn.f32 %r10, %r8, %r6; +; CHECK-SM80-NEXT: add.rn.f32 %r11, %r4, %r2; +; CHECK-SM80-NEXT: add.rn.f32 %r12, %r9, %r7; ; CHECK-SM80-NEXT: add.rn.f32 %r13, %r12, %r11; -; CHECK-SM80-NEXT: add.rn.f32 %r14, %r10, %r9; +; CHECK-SM80-NEXT: add.rn.f32 %r14, %r10, %r5; ; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r13; ; CHECK-SM80-NEXT: add.rn.f32 %r16, %r15, 0f00000000; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r16; @@ -321,15 +329,19 @@ define float @reduce_fmul_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0]; -; CHECK-NEXT: mul.rn.f32 %r9, %r5, %r6; -; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r7; -; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r8; -; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r1; -; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r2; -; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r3; -; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r4; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_param_0+16]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_param_0]; +; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2; +; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1; +; CHECK-NEXT: mul.rn.f32 %r9, %r7, %r8; +; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r5; +; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r6; +; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r3; +; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r4; +; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r1; +; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in) @@ -343,14 +355,18 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) { ; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r7, %r3; -; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r5, %r1; -; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r8, %r4; -; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r6, %r2; +; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_reassoc_param_0]; +; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM80-NEXT: mul.rn.f32 %r5, %r3, %r1; +; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r8, %r6; +; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r4, %r2; +; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r9, %r7; ; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r12, %r11; -; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r10, %r9; +; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r10, %r5; ; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r13; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-SM80-NEXT: ret; @@ -494,13 +510,17 @@ define float @reduce_fmax_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: 
; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0]; -; CHECK-NEXT: max.f32 %r9, %r8, %r4; -; CHECK-NEXT: max.f32 %r10, %r6, %r2; -; CHECK-NEXT: max.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.f32 %r12, %r7, %r3; -; CHECK-NEXT: max.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: max.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: max.f32 %r10, %r9, %r7; +; CHECK-NEXT: max.f32 %r11, %r10, %r5; +; CHECK-NEXT: max.f32 %r12, %r3, %r1; +; CHECK-NEXT: max.f32 %r13, %r8, %r6; ; CHECK-NEXT: max.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -517,13 +537,17 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0]; -; CHECK-NEXT: max.f32 %r9, %r8, %r4; -; CHECK-NEXT: max.f32 %r10, %r6, %r2; -; CHECK-NEXT: max.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.f32 %r12, %r7, %r3; -; CHECK-NEXT: max.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_reassoc_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: max.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: max.f32 %r10, %r9, %r7; +; CHECK-NEXT: max.f32 %r11, %r10, %r5; +; CHECK-NEXT: max.f32 %r12, %r3, %r1; +; CHECK-NEXT: max.f32 %r13, %r8, %r6; ; CHECK-NEXT: max.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -628,13 +652,17 @@ define float @reduce_fmin_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0]; -; CHECK-NEXT: min.f32 %r9, %r8, %r4; -; CHECK-NEXT: min.f32 %r10, %r6, %r2; -; CHECK-NEXT: min.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.f32 %r12, %r7, %r3; -; CHECK-NEXT: min.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: min.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: min.f32 %r10, %r9, %r7; +; CHECK-NEXT: min.f32 %r11, %r10, %r5; +; CHECK-NEXT: min.f32 %r12, %r3, %r1; +; CHECK-NEXT: min.f32 %r13, %r8, %r6; ; CHECK-NEXT: min.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -651,13 +679,17 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, 
%r4}, [reduce_fmin_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0]; -; CHECK-NEXT: min.f32 %r9, %r8, %r4; -; CHECK-NEXT: min.f32 %r10, %r6, %r2; -; CHECK-NEXT: min.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.f32 %r12, %r7, %r3; -; CHECK-NEXT: min.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_reassoc_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: min.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: min.f32 %r10, %r9, %r7; +; CHECK-NEXT: min.f32 %r11, %r10, %r5; +; CHECK-NEXT: min.f32 %r12, %r3, %r1; +; CHECK-NEXT: min.f32 %r13, %r8, %r6; ; CHECK-NEXT: min.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -762,13 +794,17 @@ define float @reduce_fmaximum_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0]; -; CHECK-NEXT: max.NaN.f32 %r9, %r8, %r4; -; CHECK-NEXT: max.NaN.f32 %r10, %r6, %r2; -; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.NaN.f32 %r12, %r7, %r3; -; CHECK-NEXT: max.NaN.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: max.NaN.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: max.NaN.f32 %r10, %r9, %r7; +; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r5; +; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r1; +; CHECK-NEXT: max.NaN.f32 %r13, %r8, %r6; ; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -785,13 +821,17 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0]; -; CHECK-NEXT: max.NaN.f32 %r9, %r8, %r4; -; CHECK-NEXT: max.NaN.f32 %r10, %r6, %r2; -; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.NaN.f32 %r12, %r7, %r3; -; CHECK-NEXT: max.NaN.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_reassoc_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: max.NaN.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: max.NaN.f32 %r10, %r9, %r7; +; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r5; +; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r1; +; CHECK-NEXT: max.NaN.f32 %r13, %r8, %r6; ; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -896,13 +936,17 @@ define float @reduce_fminimum_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 
%rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0]; -; CHECK-NEXT: min.NaN.f32 %r9, %r8, %r4; -; CHECK-NEXT: min.NaN.f32 %r10, %r6, %r2; -; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.NaN.f32 %r12, %r7, %r3; -; CHECK-NEXT: min.NaN.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: min.NaN.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: min.NaN.f32 %r10, %r9, %r7; +; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r5; +; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r1; +; CHECK-NEXT: min.NaN.f32 %r13, %r8, %r6; ; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -919,13 +963,17 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0]; -; CHECK-NEXT: min.NaN.f32 %r9, %r8, %r4; -; CHECK-NEXT: min.NaN.f32 %r10, %r6, %r2; -; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.NaN.f32 %r12, %r7, %r3; -; CHECK-NEXT: min.NaN.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_reassoc_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: min.NaN.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: min.NaN.f32 %r10, %r9, %r7; +; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r5; +; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r1; +; CHECK-NEXT: min.NaN.f32 %r13, %r8, %r6; ; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; diff --git a/llvm/test/CodeGen/NVPTX/trunc-tofp.ll b/llvm/test/CodeGen/NVPTX/trunc-tofp.ll new file mode 100644 index 0000000..12502b6 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/trunc-tofp.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_50 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mcpu=sm_50 | %ptxas-verify -arch=sm_50 %} + +target triple = "nvptx64-nvidia-cuda" + +define float @uitofp_trunc_nuw(i32 %x, i32 %y) { +; CHECK-LABEL: uitofp_trunc_nuw( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [uitofp_trunc_nuw_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [uitofp_trunc_nuw_param_1]; +; CHECK-NEXT: add.s32 %r3, %r1, %r2; +; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %v = add i32 %x, %y + %t = trunc nuw i32 %v to i16 + %f = uitofp i16 %t to float + ret float %f +} + +define float @sitofp_trunc_nsw(i32 %x, i32 %y) { +; CHECK-LABEL: sitofp_trunc_nsw( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; 
CHECK-NEXT: ld.param.b32 %r1, [sitofp_trunc_nsw_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [sitofp_trunc_nsw_param_1]; +; CHECK-NEXT: add.s32 %r3, %r1, %r2; +; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %v = add i32 %x, %y + %t = trunc nsw i32 %v to i16 + %f = sitofp i16 %t to float + ret float %f +} + +define float @uitofp_trunc_nsw(i32 %x, i32 %y) { +; CHECK-LABEL: uitofp_trunc_nsw( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [uitofp_trunc_nsw_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [uitofp_trunc_nsw_param_1]; +; CHECK-NEXT: add.s32 %r3, %r1, %r2; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; +; CHECK-NEXT: cvt.rn.f32.u16 %r4, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %v = add i32 %x, %y + %t = trunc nsw i32 %v to i16 + %f = uitofp i16 %t to float + ret float %f +} + +define float @sitofp_trunc_nuw(i32 %x, i32 %y) { +; CHECK-LABEL: sitofp_trunc_nuw( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [sitofp_trunc_nuw_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [sitofp_trunc_nuw_param_1]; +; CHECK-NEXT: add.s32 %r3, %r1, %r2; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; +; CHECK-NEXT: cvt.rn.f32.s16 %r4, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %v = add i32 %x, %y + %t = trunc nuw i32 %v to i16 + %f = sitofp i16 %t to float + ret float %f +} diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py b/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py new file mode 100644 index 0000000..8f50206 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py @@ -0,0 +1,14 @@ +# Check all variants of instructions supported by PTX78 on SM90 +# RUN: %python %s --ptx=78 --gpu-arch=90 --aa > %t-ptx78-sm_90.ll +# RUN: FileCheck %t-ptx78-sm_90.ll < %t-ptx78-sm_90.ll \ +# RUN: --check-prefixes=PTX78STMATRIX-DAG +# RUN: llc < %t-ptx78-sm_90.ll -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 \ +# RUN: | FileCheck %t-ptx78-sm_90.ll +# RUN: %if ptxas-12.7 %{ \ +# RUN: llc < %t-ptx78-sm_90.ll -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 \ +# RUN: | %ptxas-verify -arch=sm_90 \ +# RUN: %} + +import wmma + +wmma.main() diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py index 6ad0a2a..5c14a54 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py @@ -1,9 +1,7 @@ # Check all variants of instructions supported by PTX86 on SM100a # RUN: %python %s --ptx=86 --gpu-arch=100 --aa > %t-ptx86-sm_100a.ll # RUN: FileCheck %t-ptx86-sm_100a.ll < %t-ptx86-sm_100a.ll \ -# RUN: --check-prefixes=PTX86LDMATRIX-DAG -# RUN: FileCheck %t-ptx86-sm_100a.ll < %t-ptx86-sm_100a.ll \ -# RUN: --check-prefixes=PTX86LDMATRIX-DAG +# RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG # RUN: llc < %t-ptx86-sm_100a.ll -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 \ # RUN: | FileCheck %t-ptx86-sm_100a.ll # RUN: %if ptxas-12.7 %{ \ diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py index 7d99534..a77f9ad 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py @@ -1,9 +1,7 @@ # Check all variants of instructions supported by PTX86 on SM101a # RUN: %python %s --ptx=86 --gpu-arch=101 --aa > %t-ptx86-sm_101a.ll # RUN: 
FileCheck %t-ptx86-sm_101a.ll < %t-ptx86-sm_101a.ll \ -# RUN: --check-prefixes=PTX86LDMATRIX-DAG -# RUN: FileCheck %t-ptx86-sm_101a.ll < %t-ptx86-sm_101a.ll \ -# RUN: --check-prefixes=PTX86LDMATRIX-DAG +# RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG # RUN: llc < %t-ptx86-sm_101a.ll -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 \ # RUN: | FileCheck %t-ptx86-sm_101a.ll # RUN: %if ptxas-12.7 %{ \ diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py index 7bddf0b..8126e64 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py @@ -1,9 +1,7 @@ # Check all variants of instructions supported by PTX86 on SM120a # RUN: %python %s --ptx=86 --gpu-arch=120 --aa > %t-ptx86-sm_120a.ll # RUN: FileCheck %t-ptx86-sm_120a.ll < %t-ptx86-sm_120a.ll \ -# RUN: --check-prefixes=PTX86LDMATRIX-DAG -# RUN: FileCheck %t-ptx86-sm_120a.ll < %t-ptx86-sm_120a.ll \ -# RUN: --check-prefixes=PTX86LDMATRIX-DAG +# RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG # RUN: llc < %t-ptx86-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 \ # RUN: | FileCheck %t-ptx86-sm_120a.ll # RUN: %if ptxas-12.7 %{ \ diff --git a/llvm/test/CodeGen/NVPTX/wmma.py b/llvm/test/CodeGen/NVPTX/wmma.py index 2ee4896..2eb3c3d 100644 --- a/llvm/test/CodeGen/NVPTX/wmma.py +++ b/llvm/test/CodeGen/NVPTX/wmma.py @@ -10,6 +10,7 @@ import argparse from itertools import product from string import Template + class MMAType: def __init__(self, ptx_type): self.ptx_type = ptx_type @@ -176,6 +177,13 @@ class MMAFrag: "m8n16:x1:b8x16.b4x16_p64": 1, "m8n16:x2:b8x16.b4x16_p64": 2, "m8n16:x4:b8x16.b4x16_p64": 4, + # stmatrix + "m8n8:x1:b16": 1, + "m8n8:x2:b16": 2, + "m8n8:x4:b16": 4, + "m16n8:x1:b8": 1, + "m16n8:x2:b8": 2, + "m16n8:x4:b8": 4, }.get( "%s:%s:%s" % (geom, frag, ptx_elt_type), { @@ -241,6 +249,13 @@ def make_ldmatrix_ops(geoms, frags, types): ] +def make_stmatrix_ops(geoms, frags, types): + return [ + MMAFrag(geom, frag, ptx_type) + for (geom, frag, ptx_type) in product(geoms, frags, types) + ] + + def get_wmma_ops(): return ( make_mma_ops(["m16n16k8"], ["tf32"], [], ["f32"], []) @@ -315,6 +330,12 @@ def get_ldmatrix_ops(): ) +def get_stmatrix_ops(): + return make_stmatrix_ops(["m8n8"], ["x1", "x2", "x4"], ["b16"]) + make_stmatrix_ops( + ["m16n8"], ["x1", "x2", "x4"], ["b8"] + ) + + def is_wmma_geom_supported(geom): # geometries for FP and ints. if geom in ["m8n32k16", "m32n8k16"]: @@ -360,6 +381,14 @@ def is_ldmatrix_geom_supported(geom): assert False # Unexpected geometry. +def is_stmatrix_geom_supported(geom): + if geom in ["m8n8"]: + return ptx_version >= 78 and gpu_arch >= 90 + elif geom in ["m16n8"]: + return ptx_version >= 86 and gpu_arch >= 100 and aa + assert False # Unexpected geometry. + + def is_ldmatrix_trans_supported(geom, trans): if geom in ["m8n8"]: return True @@ -369,6 +398,15 @@ def is_ldmatrix_trans_supported(geom, trans): return trans == "" assert False # Unexpected geometry. + +def is_stmatrix_trans_supported(geom, trans): + if geom in ["m8n8"]: + return True + elif geom in ["m16n8"]: + return trans == ".trans" + assert False # Unexpected geometry. 
+
+
 def is_type_supported(ptx_type):
     if ptx_type in ["s8", "u8", "s32"]:
         return ptx_version >= 63 and gpu_arch >= 72
@@ -463,6 +501,16 @@ def is_ldmatrix_variant_supported(frag, trans):
     return frag.frag in ["x1", "x2", "x4"]
 
 
+def is_stmatrix_variant_supported(frag, trans):
+    if not (
+        is_type_supported(frag.mma_type.ptx_type)
+        and is_stmatrix_geom_supported(frag.geom)
+        and is_stmatrix_trans_supported(frag.geom, trans)
+    ):
+        return False
+    return frag.frag in ["x1", "x2", "x4"]
+
+
 def make_wmma_slice_ty(frag):
     return [frag.mma_type.llvm_type] * frag.nregs
 
@@ -717,6 +765,65 @@ define ${ret_ty} @test_${function}_o(i8 ${as}* %src) {
     return generated_items
 
 
+def gen_stmatrix_tests():
+    stmatrix_template = """
+declare void @${intrinsic}(i8 ${as}* %dst, ${args});
+
+; CHECK-LABEL: .func {{.*}}test_${function}(
+define void @test_${function}(i8 ${as}* %dst, ${args}) {
+; CHECK: ${instruction} {{.*}}[%rd{{[0-9]+}}]
+; CHECK: {${check_args}}
+  call void @${intrinsic}(i8 ${as}* %dst, ${args});
+  ret void
+}
+
+; CHECK-LABEL: .func{{.*}}test_${function}_o(
+define void @test_${function}_o(i8 ${as}* %dst, ${args}) {
+; CHECK: ${instruction} {{.*}}[%rd{{[0-9]+}}+128],
+; CHECK: {${check_args}}
+  %dst1 = getelementptr i8, i8 ${as}* %dst, i32 128;
+  call void @${intrinsic}(i8 ${as}* %dst1, ${args});
+  ret void
+}
+"""
+    intrinsic_template = (
+        "llvm.nvvm.stmatrix.sync.aligned.${geom}.${frag}${trans}.${itype}.${pspace}"
+    )
+    instruction_template = (
+        "stmatrix.sync.aligned.${geom}.${frag}${trans}${space}.${itype}"
+    )
+    generated_items = []
+
+    for frag, space, trans in product(
+        get_stmatrix_ops(),
+        ["", ".shared"],
+        ["", ".trans"],
+    ):
+        if not is_stmatrix_variant_supported(frag, trans):
+            continue
+
+        params = {
+            "frag": frag.frag,
+            "space": space,
+            "trans": trans,
+            "itype": frag.mma_type.ptx_type,
+            "pspace": get_pspace(space),
+            "as": "addrspace(%d)" % get_aspace(space),
+            "geom": frag.geom,
+        }
+
+        test_params = params
+        test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
+        test_params["function"] = test_params["intrinsic"].replace(".", "_")
+        test_params["instruction"] = Template(instruction_template).substitute(params)
+        test_params["args"] = make_wmma_slice_args(frag)
+        test_params["check_args"] = check_pattern(frag)
+
+        print(Template(stmatrix_template).substitute(test_params))
+        generated_items.append((test_params["intrinsic"], test_params["instruction"]))
+
+    return generated_items
+
 def mma_signature(op):
     if op.a.mma_type.ptx_type == "f16":
         # FP16 ops identified by accumulator & result type.
@@ -893,6 +1000,7 @@ def gen_check_unsupported_ops(items):
 ; NOALTFLOAT-NOT: .{{bf16|tf32}}
 ; NODOUBLE-NOT: .f64
 ; NOLDMATRIX-NOT: ldmatrix.sync.aligned
+; NOSTMATRIX-NOT: stmatrix.sync.aligned
 
 ; M16N16-DAG: m16n16k16.load.{{[ab].*}}.f16.p
 ; M16N16-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
@@ -994,6 +1102,26 @@ def gen_check_unsupported_ops(items):
 ; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m8n16.x4.b8x16.b6x16_p32
 ; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m8n16.x4.b8x16.b4x16_p64
 
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.trans.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.trans.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.trans.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.shared.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.shared.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.shared.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.trans.shared.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.trans.shared.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.trans.shared.b16
+
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x1.trans.b8
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x2.trans.b8
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x4.trans.b8
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x1.trans.shared.b8
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x2.trans.shared.b8
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x4.trans.shared.b8
+
 ; PTX71MMA-DAG: mma.m8n8k4.row.col.f64
 ; PTX71MMA-DAG: mma.m16n8k4.row.col.tf32
 ; PTX71MMA-DAG: mma.m16n8k8.row.col.tf32
@@ -1039,6 +1167,7 @@ def gen_tests():
     items = gen_wmma_load_tests()
     items += gen_wmma_store_tests()
     items += gen_ldmatrix_tests()
+    items += gen_stmatrix_tests()
     items += gen_wmma_mma_tests()
     items += gen_mma_tests()
     gen_check_unsupported_ops(items)
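
For a concrete picture of what gen_stmatrix_tests() prints, the sketch below hand-expands the stmatrix template for the m8n8.x1.b16 fragment stored to shared memory. The get_pspace, get_aspace, make_wmma_slice_args, and check_pattern helpers are outside this diff, so the expansion rests on assumptions drawn from the ldmatrix handling: that get_pspace(".shared") yields "p3", that get_aspace(".shared") yields address space 3, and that an x1 b16 fragment is a single i32 register checked as a %r operand. Treat it as illustrative, not verbatim generator output:

; Hypothetical expansion of stmatrix_template for geom=m8n8, frag=x1,
; trans="", itype=b16, space=".shared" (pspace "p3" / addrspace(3) assumed).
declare void @llvm.nvvm.stmatrix.sync.aligned.m8n8.x1.b16.p3(i8 addrspace(3)* %dst, i32 %x0);

; CHECK-LABEL: .func {{.*}}test_llvm_nvvm_stmatrix_sync_aligned_m8n8_x1_b16_p3(
define void @test_llvm_nvvm_stmatrix_sync_aligned_m8n8_x1_b16_p3(i8 addrspace(3)* %dst, i32 %x0) {
; CHECK: stmatrix.sync.aligned.m8n8.x1.shared.b16 {{.*}}[%rd{{[0-9]+}}]
; CHECK: {%r{{[0-9]+}}}
  call void @llvm.nvvm.stmatrix.sync.aligned.m8n8.x1.b16.p3(i8 addrspace(3)* %dst, i32 %x0);
  ret void
}

The x2 and x4 fragments differ only in passing two or four i32 values, and is_stmatrix_trans_supported admits m16n8.b8 only in .trans form, which is why the PTX86STMATRIX-DAG lines above list transposed variants exclusively.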