diff options
Diffstat (limited to 'llvm/test/CodeGen')
13 files changed, 3131 insertions, 380 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index bd2d8c09..5c164bf 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -71,12 +71,13 @@ # DEBUG-NEXT: .. the first uncovered imm index: 0, OK # # DEBUG-NEXT: G_ABDS (opcode 65): 1 type index, 0 imm indices -# DEBUG-NEXT:.. type index coverage check SKIPPED: no rules defined -# DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # -# DEBUG-NEXT:G_ABDU (opcode 66): 1 type index, 0 imm indices -# DEBUG-NEXT:.. type index coverage check SKIPPED: no rules defined -# DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ABDU (opcode 66): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # # DEBUG-NEXT: G_IMPLICIT_DEF (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: {{[0-9]+}}, OK diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll index bd28d13..256ff94 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll @@ -1,5 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for test_vmull_p8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_high_p8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_p64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_high_p64 declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5 @@ -101,11 +107,18 @@ entry: } define <8 x i16> @test_vaddl_a8(<8 x i8> %a, <8 x i8> %b) { -; CHECK-LABEL: test_vaddl_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddl_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddl_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %vmovl.i.i = zext <8 x i8> %a to <8 x i16> %vmovl.i2.i = zext <8 x i8> %b to <8 x i16> @@ -229,11 +242,18 @@ entry: } define <8 x i16> @test_vaddl_high_a8(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vaddl_high_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl2 v0.8h, v0.16b, v1.16b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddl_high_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl2 v0.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddl_high_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: uaddl2 v0.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> @@ -345,11 +365,18 @@ entry: } define <8 x i16> @test_vaddw_a8(<8 x i16> %a, <8 x i8> %b) { -; CHECK-LABEL: test_vaddw_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddw_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddw_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %vmovl.i.i = zext <8 x i8> %b to <8 x i16> %add.i = add <8 x i16> %vmovl.i.i, %a @@ -458,11 +485,18 @@ entry: } define <8 x i16> @test_vaddw_high_a8(<8 x i16> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vaddw_high_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddw2 v0.8h, v0.8h, v1.16b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddw_high_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddw2 v0.8h, v0.8h, v1.16b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddw_high_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: uaddw2 v0.8h, v0.8h, v1.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> @@ -574,11 +608,18 @@ entry: } define <8 x i16> @test_vsubl_a8(<8 x i8> %a, <8 x i8> %b) { -; CHECK-LABEL: test_vsubl_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubl_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubl_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %vmovl.i.i = zext <8 x i8> %a to <8 x i16> %vmovl.i2.i = zext <8 x i8> %b to <8 x i16> @@ -702,11 +743,18 @@ entry: } define <8 x i16> @test_vsubl_high_a8(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsubl_high_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: usubl2 v0.8h, v0.16b, v1.16b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubl_high_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubl2 v0.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubl_high_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: usubl2 v0.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> @@ -818,11 +866,18 @@ entry: } define <8 x i16> @test_vsubw_a8(<8 x i16> %a, <8 x i8> %b) { -; CHECK-LABEL: test_vsubw_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: usubw v0.8h, v0.8h, v1.8b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubw_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubw v0.8h, v0.8h, v1.8b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubw_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: usubw v0.8h, v0.8h, v1.8b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %vmovl.i.i = zext <8 x i8> %b to <8 x i16> %sub.i = sub <8 x i16> %a, %vmovl.i.i @@ -931,11 +986,18 @@ entry: } define <8 x i16> @test_vsubw_high_a8(<8 x i16> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsubw_high_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: usubw2 v0.8h, v0.8h, v1.16b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubw_high_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubw2 v0.8h, v0.8h, v1.16b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubw_high_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: usubw2 v0.8h, v0.8h, v1.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> @@ -975,10 +1037,16 @@ entry: } define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vaddhn_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.8b, v0.8h, v1.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.8b, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <8 x i16> %a, %b %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -987,10 +1055,16 @@ entry: } define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vaddhn_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <4 x i32> %a, %b %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16> @@ -999,10 +1073,16 @@ entry: } define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vaddhn_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.2s, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.2s, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <2 x i64> %a, %b %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32> @@ -1011,10 +1091,16 @@ entry: } define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vaddhn_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.8b, v0.8h, v1.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.8b, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <8 x i16> %a, %b %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1023,10 +1109,16 @@ entry: } define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vaddhn_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <4 x i32> %a, %b %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16> @@ -1035,10 +1127,16 @@ entry: } define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vaddhn_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.2s, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.2s, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <2 x i64> %a, %b %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32> @@ -1047,11 +1145,20 @@ entry: } define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vaddhn_high_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <8 x i16> %a, %b %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1064,11 +1171,20 @@ entry: } define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vaddhn_high_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <4 x i32> %a, %b %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16> @@ -1081,11 +1197,20 @@ entry: } define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vaddhn_high_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <2 x i64> %a, %b %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32> @@ -1098,11 +1223,20 @@ entry: } define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vaddhn_high_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <8 x i16> %a, %b %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1115,11 +1249,20 @@ entry: } define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vaddhn_high_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <4 x i32> %a, %b %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16> @@ -1132,11 +1275,20 @@ entry: } define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vaddhn_high_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <2 x i64> %a, %b %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32> @@ -1209,11 +1361,19 @@ entry: } define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vraddhn_high_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.8b, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) %0 = bitcast <8 x i8> %r to <1 x i64> @@ -1224,11 +1384,19 @@ entry: } define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vraddhn_high_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.4h, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) %0 = bitcast <4 x i16> %r to <1 x i64> @@ -1239,11 +1407,19 @@ entry: } define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vraddhn_high_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.2s, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) %0 = bitcast <2 x i32> %r to <1 x i64> @@ -1254,11 +1430,19 @@ entry: } define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vraddhn_high_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.8b, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) %0 = bitcast <8 x i8> %r to <1 x i64> @@ -1269,11 +1453,19 @@ entry: } define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vraddhn_high_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.4h, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) %0 = bitcast <4 x i16> %r to <1 x i64> @@ -1284,11 +1476,19 @@ entry: } define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vraddhn_high_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.2s, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) %0 = bitcast <2 x i32> %r to <1 x i64> @@ -1299,10 +1499,16 @@ entry: } define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vsubhn_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.8b, v0.8h, v1.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.8b, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <8 x i16> %a, %b %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1311,10 +1517,16 @@ entry: } define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsubhn_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.4h, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.4h, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <4 x i32> %a, %b %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16> @@ -1323,10 +1535,16 @@ entry: } define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vsubhn_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.2s, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.2s, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <2 x i64> %a, %b %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32> @@ -1335,10 +1553,16 @@ entry: } define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vsubhn_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.8b, v0.8h, v1.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.8b, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <8 x i16> %a, %b %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1347,10 +1571,16 @@ entry: } define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsubhn_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.4h, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.4h, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <4 x i32> %a, %b %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16> @@ -1359,10 +1589,16 @@ entry: } define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vsubhn_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.2s, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.2s, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <2 x i64> %a, %b %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32> @@ -1371,11 +1607,20 @@ entry: } define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vsubhn_high_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <8 x i16> %a, %b %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1388,11 +1633,20 @@ entry: } define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsubhn_high_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <4 x i32> %a, %b %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16> @@ -1405,11 +1659,20 @@ entry: } define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vsubhn_high_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <2 x i64> %a, %b %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32> @@ -1422,11 +1685,20 @@ entry: } define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vsubhn_high_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <8 x i16> %a, %b %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1439,11 +1711,20 @@ entry: } define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsubhn_high_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <4 x i32> %a, %b %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16> @@ -1456,11 +1737,20 @@ entry: } define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vsubhn_high_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <2 x i64> %a, %b %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32> @@ -1533,11 +1823,19 @@ entry: } define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vrsubhn_high_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.8b, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) %0 = bitcast <8 x i8> %r to <1 x i64> @@ -1548,11 +1846,19 @@ entry: } define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vrsubhn_high_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.4h, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) %0 = bitcast <4 x i16> %r to <1 x i64> @@ -1563,11 +1869,19 @@ entry: } define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vrsubhn_high_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.2s, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) %0 = bitcast <2 x i32> %r to <1 x i64> @@ -1578,11 +1892,19 @@ entry: } define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vrsubhn_high_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.8b, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) %0 = bitcast <8 x i8> %r to <1 x i64> @@ -1593,11 +1915,19 @@ entry: } define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vrsubhn_high_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.4h, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) %0 = bitcast <4 x i16> %r to <1 x i64> @@ -1608,11 +1938,19 @@ entry: } define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vrsubhn_high_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.2s, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) %0 = bitcast <2 x i32> %r to <1 x i64> @@ -2535,21 +2873,40 @@ entry: } define <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %scale.coerce) { -; CHECK-LABEL: cmplx_mul_combined_re_im: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: lsr x8, x0, #16 -; CHECK-NEXT: movi v1.2d, #0xffff0000ffff0000 -; CHECK-NEXT: rev32 v4.8h, v0.8h -; CHECK-NEXT: dup v2.8h, w8 -; CHECK-NEXT: sqneg v3.8h, v2.8h -; CHECK-NEXT: bsl v1.16b, v2.16b, v3.16b -; CHECK-NEXT: fmov d3, x0 -; CHECK-NEXT: sqdmull v2.4s, v4.4h, v1.4h -; CHECK-NEXT: sqdmull2 v1.4s, v4.8h, v1.8h -; CHECK-NEXT: sqdmlal v2.4s, v0.4h, v3.h[0] -; CHECK-NEXT: sqdmlal2 v1.4s, v0.8h, v3.h[0] -; CHECK-NEXT: uzp2 v0.8h, v2.8h, v1.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: cmplx_mul_combined_re_im: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: lsr x8, x0, #16 +; CHECK-SD-NEXT: movi v1.2d, #0xffff0000ffff0000 +; CHECK-SD-NEXT: rev32 v4.8h, v0.8h +; CHECK-SD-NEXT: dup v2.8h, w8 +; CHECK-SD-NEXT: sqneg v3.8h, v2.8h +; CHECK-SD-NEXT: bsl v1.16b, v2.16b, v3.16b +; CHECK-SD-NEXT: fmov d3, x0 +; CHECK-SD-NEXT: sqdmull v2.4s, v4.4h, v1.4h +; CHECK-SD-NEXT: sqdmull2 v1.4s, v4.8h, v1.8h +; CHECK-SD-NEXT: sqdmlal v2.4s, v0.4h, v3.h[0] +; CHECK-SD-NEXT: sqdmlal2 v1.4s, v0.8h, v3.h[0] +; CHECK-SD-NEXT: uzp2 v0.8h, v2.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: cmplx_mul_combined_re_im: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: lsr x9, x0, #16 +; CHECK-GI-NEXT: adrp x8, .LCPI196_0 +; CHECK-GI-NEXT: rev32 v4.8h, v0.8h +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI196_0] +; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: dup v2.8h, v1.h[0] +; CHECK-GI-NEXT: sqneg v1.8h, v2.8h +; CHECK-GI-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: fmov d3, x0 +; CHECK-GI-NEXT: sqdmull v2.4s, v2.4h, v3.h[0] +; CHECK-GI-NEXT: sqdmull v5.4s, v4.4h, v1.4h +; CHECK-GI-NEXT: sqdmlal v5.4s, v0.4h, v3.h[0] +; CHECK-GI-NEXT: sqdmlal2 v2.4s, v4.8h, v1.8h +; CHECK-GI-NEXT: uzp2 v0.8h, v5.8h, v2.8h +; CHECK-GI-NEXT: ret entry: %scale.sroa.2.0.extract.shift23 = lshr i64 %scale.coerce, 16 %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll index cc9732b..6c7ddd9 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s +; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=arm64-none-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) @@ -197,11 +198,20 @@ define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { } define <2 x i32> @test_sabd_v2i32_const() { -; CHECK-LABEL: test_sabd_v2i32_const: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI19_0 -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI19_0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_sabd_v2i32_const: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI19_0 +; CHECK-SD-NEXT: ldr d0, [x8, :lo12:.LCPI19_0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_sabd_v2i32_const: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI19_1 +; CHECK-GI-NEXT: adrp x9, .LCPI19_0 +; CHECK-GI-NEXT: ldr d0, [x8, :lo12:.LCPI19_1] +; CHECK-GI-NEXT: ldr d1, [x9, :lo12:.LCPI19_0] +; CHECK-GI-NEXT: sabd v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: ret %1 = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32( <2 x i32> <i32 -2147483648, i32 2147450880>, <2 x i32> <i32 -65536, i32 65535>) @@ -293,15 +303,26 @@ define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) { } define <8 x i16> @test_uabd_knownbits_vec8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK-LABEL: test_uabd_knownbits_vec8i16: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.8h, #15 -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: uabd v0.8h, v0.8h, v1.8h -; CHECK-NEXT: rev64 v0.8h, v0.8h -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_uabd_knownbits_vec8i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.8h, #15 +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: uabd v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: rev64 v0.8h, v0.8h +; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_uabd_knownbits_vec8i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.8h, #15 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: uabd v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: rev64 v0.8h, v0.8h +; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret %and1 = and <8 x i16> %lhs, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> %and2 = and <8 x i16> %rhs, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> %uabd = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %and1, <8 x i16> %and2) @@ -311,11 +332,22 @@ define <8 x i16> @test_uabd_knownbits_vec8i16(<8 x i16> %lhs, <8 x i16> %rhs) { } define <4 x i32> @knownbits_uabd_mask_and_shuffle_lshr(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: knownbits_uabd_mask_and_shuffle_lshr: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ushr v0.4s, v0.4s, #17 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: knownbits_uabd_mask_and_shuffle_lshr: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ushr v0.4s, v0.4s, #17 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: knownbits_uabd_mask_and_shuffle_lshr: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: uabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: rev64 v0.4s, v0.4s +; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #17 +; CHECK-GI-NEXT: ret %1 = and <4 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535> %2 = and <4 x i32> %a1, <i32 65535, i32 65535, i32 65535, i32 65535> %3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %1, <4 x i32> %2) @@ -325,10 +357,19 @@ define <4 x i32> @knownbits_uabd_mask_and_shuffle_lshr(<4 x i32> %a0, <4 x i32> } define <4 x i32> @knownbits_mask_and_shuffle_lshr(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: knownbits_mask_and_shuffle_lshr: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: knownbits_mask_and_shuffle_lshr: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: knownbits_mask_and_shuffle_lshr: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: uabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #17 +; CHECK-GI-NEXT: ret %1 = and <4 x i32> %a0, <i32 32767, i32 32767, i32 32767, i32 32767> %2 = and <4 x i32> %a1, <i32 32767, i32 32767, i32 32767, i32 32767> %3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %1, <4 x i32> %2) @@ -338,20 +379,36 @@ define <4 x i32> @knownbits_mask_and_shuffle_lshr(<4 x i32> %a0, <4 x i32> %a1) } define <4 x i32> @test_sabd_knownbits_vec4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK-LABEL: test_sabd_knownbits_vec4i32: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI31_0 -; CHECK-NEXT: adrp x9, .LCPI31_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI31_0] -; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI31_1] -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-NEXT: sabd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: movi v1.2d, #0x0000ff000000ff -; CHECK-NEXT: mov v0.s[1], v0.s[0] -; CHECK-NEXT: trn2 v0.4s, v0.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_sabd_knownbits_vec4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI31_0 +; CHECK-SD-NEXT: adrp x9, .LCPI31_1 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI31_0] +; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI31_1] +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: movi v1.2d, #0x0000ff000000ff +; CHECK-SD-NEXT: mov v0.s[1], v0.s[0] +; CHECK-SD-NEXT: trn2 v0.4s, v0.4s, v0.4s +; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_sabd_knownbits_vec4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI31_2 +; CHECK-GI-NEXT: adrp x9, .LCPI31_1 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI31_2] +; CHECK-GI-NEXT: ldr q3, [x9, :lo12:.LCPI31_1] +; CHECK-GI-NEXT: adrp x8, .LCPI31_0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI31_0] +; CHECK-GI-NEXT: movi v3.2d, #0x0000ff000000ff +; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-GI-NEXT: ret %and1 = and <4 x i32> %lhs, <i32 255, i32 -1, i32 -1, i32 255> %and2 = and <4 x i32> %rhs, <i32 255, i32 255, i32 -1, i32 -1> %abd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %and1, <4 x i32> %and2) @@ -361,15 +418,27 @@ define <4 x i32> @test_sabd_knownbits_vec4i32(<4 x i32> %lhs, <4 x i32> %rhs) { } define <4 x i32> @knownbits_sabd_and_mask(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: knownbits_sabd_and_mask: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI32_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI32_0] -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: sabd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: zip2 v0.4s, v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: knownbits_sabd_and_mask: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI32_0 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI32_0] +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: zip2 v0.4s, v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: knownbits_sabd_and_mask: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI32_1 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI32_1] +; CHECK-GI-NEXT: adrp x8, .LCPI32_0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI32_0] +; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: ret %1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085> %2 = and <4 x i32> %a1, <i32 -1, i32 -1, i32 255, i32 4085> %3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %1, <4 x i32> %2) @@ -378,10 +447,25 @@ define <4 x i32> @knownbits_sabd_and_mask(<4 x i32> %a0, <4 x i32> %a1) { } define <4 x i32> @knownbits_sabd_and_or_mask(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: knownbits_sabd_and_or_mask: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: knownbits_sabd_and_or_mask: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: knownbits_sabd_and_or_mask: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI33_1 +; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_1] +; CHECK-GI-NEXT: adrp x8, .LCPI33_0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_0] +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: uabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: ret %1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085> %2 = or <4 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535> %3 = and <4 x i32> %a1, <i32 -1, i32 -1, i32 255, i32 4085> @@ -392,18 +476,33 @@ define <4 x i32> @knownbits_sabd_and_or_mask(<4 x i32> %a0, <4 x i32> %a1) { } define <4 x i32> @knownbits_sabd_and_xor_mask(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: knownbits_sabd_and_xor_mask: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI34_0 -; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI34_0] -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b -; CHECK-NEXT: eor v1.16b, v1.16b, v3.16b -; CHECK-NEXT: sabd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: zip2 v0.4s, v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: knownbits_sabd_and_xor_mask: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI34_0 +; CHECK-SD-NEXT: movi v3.2d, #0x00ffff0000ffff +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI34_0] +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: eor v0.16b, v0.16b, v3.16b +; CHECK-SD-NEXT: eor v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: zip2 v0.4s, v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: knownbits_sabd_and_xor_mask: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI34_1 +; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI34_1] +; CHECK-GI-NEXT: adrp x8, .LCPI34_0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI34_0] +; CHECK-GI-NEXT: eor v0.16b, v0.16b, v3.16b +; CHECK-GI-NEXT: eor v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: ret %1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085> %2 = xor <4 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535> %3 = and <4 x i32> %a1, <i32 -1, i32 -1, i32 255, i32 4085> @@ -414,10 +513,24 @@ define <4 x i32> @knownbits_sabd_and_xor_mask(<4 x i32> %a0, <4 x i32> %a1) { } define <4 x i32> @knownbits_sabd_and_shl_mask(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: knownbits_sabd_and_shl_mask: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: knownbits_sabd_and_shl_mask: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: knownbits_sabd_and_shl_mask: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI35_1 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_1] +; CHECK-GI-NEXT: adrp x8, .LCPI35_0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_0] +; CHECK-GI-NEXT: shl v0.4s, v0.4s, #17 +; CHECK-GI-NEXT: shl v1.4s, v1.4s, #17 +; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: ret %1 = and <4 x i32> %a0, <i32 -65536, i32 -7, i32 -7, i32 -65536> %2 = shl <4 x i32> %1, <i32 17, i32 17, i32 17, i32 17> %3 = and <4 x i32> %a1, <i32 -65536, i32 -7, i32 -7, i32 -65536> @@ -428,18 +541,32 @@ define <4 x i32> @knownbits_sabd_and_shl_mask(<4 x i32> %a0, <4 x i32> %a1) { } define <4 x i32> @knownbits_sabd_and_mul_mask(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: knownbits_sabd_and_mul_mask: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI36_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI36_0] -; CHECK-NEXT: and v3.16b, v0.16b, v2.16b -; CHECK-NEXT: and v2.16b, v1.16b, v2.16b -; CHECK-NEXT: mul v0.4s, v0.4s, v3.4s -; CHECK-NEXT: mul v1.4s, v1.4s, v2.4s -; CHECK-NEXT: sabd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mov v0.s[1], v0.s[0] -; CHECK-NEXT: trn2 v0.4s, v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: knownbits_sabd_and_mul_mask: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI36_0 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI36_0] +; CHECK-SD-NEXT: and v3.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: and v2.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: mul v0.4s, v0.4s, v3.4s +; CHECK-SD-NEXT: mul v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: mov v0.s[1], v0.s[0] +; CHECK-SD-NEXT: trn2 v0.4s, v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: knownbits_sabd_and_mul_mask: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI36_1 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_1] +; CHECK-GI-NEXT: adrp x8, .LCPI36_0 +; CHECK-GI-NEXT: and v3.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v2.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: mul v0.4s, v0.4s, v3.4s +; CHECK-GI-NEXT: mul v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_0] +; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: ret %1 = and <4 x i32> %a0, <i32 -65536, i32 -7, i32 -7, i32 -65536> %2 = mul <4 x i32> %a0, %1 %3 = and <4 x i32> %a1, <i32 -65536, i32 -7, i32 -7, i32 -65536> diff --git a/llvm/test/CodeGen/AArch64/highextractbitcast.ll b/llvm/test/CodeGen/AArch64/highextractbitcast.ll index f82d1ed..df4889b 100644 --- a/llvm/test/CodeGen/AArch64/highextractbitcast.ll +++ b/llvm/test/CodeGen/AArch64/highextractbitcast.ll @@ -1,6 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-LE +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes CHECK,CHECK-LE ; RUN: llc -mtriple=aarch64_be-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-BE +; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for test_pmull_high_p8_128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_pmull_high_p8_64 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) @@ -12,10 +16,10 @@ declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %s1, <2 x i32> %s2) define <4 x i32> @test_smull_high_s16_base(<8 x i16> %a, <8 x i16> %b) #0 { -; CHECK-LE-LABEL: test_smull_high_s16_base: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_smull_high_s16_base: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_smull_high_s16_base: ; CHECK-BE: // %bb.0: // %entry @@ -35,10 +39,10 @@ entry: } define <4 x i32> @test_smull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 { -; CHECK-LE-LABEL: test_smull_high_s16_bitcasta1: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_smull_high_s16_bitcasta1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_smull_high_s16_bitcasta1: ; CHECK-BE: // %bb.0: // %entry @@ -59,10 +63,10 @@ entry: } define <4 x i32> @test_smull_high_s16_bitcastb1(<8 x i16> %a, <16 x i8> %bb) #0 { -; CHECK-LE-LABEL: test_smull_high_s16_bitcastb1: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_smull_high_s16_bitcastb1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_smull_high_s16_bitcastb1: ; CHECK-BE: // %bb.0: // %entry @@ -83,10 +87,10 @@ entry: } define <4 x i32> @test_smull_high_s16_bitcasta2(<2 x i64> %a, <8 x i16> %b) #0 { -; CHECK-LE-LABEL: test_smull_high_s16_bitcasta2: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_smull_high_s16_bitcasta2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_smull_high_s16_bitcasta2: ; CHECK-BE: // %bb.0: // %entry @@ -109,10 +113,10 @@ entry: } define <4 x i32> @test_smull_high_s16_bitcastb2(<8 x i16> %a, <16 x i8> %b) #0 { -; CHECK-LE-LABEL: test_smull_high_s16_bitcastb2: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_smull_high_s16_bitcastb2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_smull_high_s16_bitcastb2: ; CHECK-BE: // %bb.0: // %entry @@ -157,6 +161,13 @@ define <4 x i32> @test_smull_high_s16_bitcasta1_wrongindex(<2 x i64> %aa, <8 x i ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_bitcasta1_wrongindex: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #4 +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ret entry: %a = bitcast <2 x i64> %aa to <8 x i16> %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -186,6 +197,13 @@ define <4 x i32> @test_smull_high_s16_bitcastb1_wrongindex(<8 x i16> %a, <16 x i ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_bitcastb1_wrongindex: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: ext v1.16b, v1.16b, v0.16b, #6 +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ret entry: %b = bitcast <16 x i8> %bb to <8 x i16> %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -215,6 +233,13 @@ define <4 x i32> @test_smull_high_s16_bitcasta2_wrongindex(<4 x i32> %a, <8 x i1 ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_bitcasta2_wrongindex: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #4 +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ret entry: %s1a = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 1, i32 2> %s1 = bitcast <2 x i32> %s1a to <4 x i16> @@ -244,6 +269,13 @@ define <4 x i32> @test_smull_high_s16_bitcastb2_wrongindex(<8 x i16> %a, <16 x i ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_bitcastb2_wrongindex: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: ext v1.16b, v1.16b, v0.16b, #4 +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ret entry: %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -269,6 +301,12 @@ define <4 x i32> @test_smull_high_s16_splata1(<2 x i64> %aa, <8 x i16> %b) #0 { ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_splata1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: smull v0.4s, v1.4h, v0.h[3] +; CHECK-GI-NEXT: ret entry: %a = bitcast <2 x i64> %aa to <8 x i16> %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> @@ -293,6 +331,12 @@ define <4 x i32> @test_smull_high_s16_splatb1(<8 x i16> %a, <16 x i8> %bb) #0 { ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_splatb1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.h[3] +; CHECK-GI-NEXT: ret entry: %b = bitcast <16 x i8> %bb to <8 x i16> %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -322,6 +366,13 @@ define <4 x i32> @test_smull_high_s16_splata2(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_splata2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: dup v0.2s, v0.s[3] +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ret entry: %s1a = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 3, i32 3> %s1 = bitcast <2 x i32> %s1a to <4 x i16> @@ -351,6 +402,13 @@ define <4 x i32> @test_smull_high_s16_splatb2(<8 x i16> %a, <16 x i8> %b) #0 { ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_smull_high_s16_splatb2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: dup v1.8b, v1.b[3] +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ret entry: %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> @@ -362,10 +420,10 @@ entry: define <4 x i32> @test_umull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 { -; CHECK-LE-LABEL: test_umull_high_s16_bitcasta1: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: umull2 v0.4s, v0.8h, v1.8h -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_umull_high_s16_bitcasta1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_umull_high_s16_bitcasta1: ; CHECK-BE: // %bb.0: // %entry @@ -386,10 +444,10 @@ entry: } define <8 x i16> @test_vabdl_high_u82(<16 x i8> %a, <8 x i16> %bb) { -; CHECK-LE-LABEL: test_vabdl_high_u82: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: uabdl2 v0.8h, v0.16b, v1.16b -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_vabdl_high_u82: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uabdl2 v0.8h, v0.16b, v1.16b +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_vabdl_high_u82: ; CHECK-BE: // %bb.0: // %entry @@ -411,10 +469,10 @@ entry: } define <8 x i16> @test_vabdl_high_s82(<16 x i8> %a, <8 x i16> %bb) { -; CHECK-LE-LABEL: test_vabdl_high_s82: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: sabdl2 v0.8h, v0.16b, v1.16b -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_vabdl_high_s82: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sabdl2 v0.8h, v0.16b, v1.16b +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_vabdl_high_s82: ; CHECK-BE: // %bb.0: // %entry @@ -436,10 +494,10 @@ entry: } define <4 x i32> @test_vqdmlal_high_s16_bitcast(<4 x i32> %a, <8 x i16> %b, <16 x i8> %cc) { -; CHECK-LE-LABEL: test_vqdmlal_high_s16_bitcast: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_vqdmlal_high_s16_bitcast: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_vqdmlal_high_s16_bitcast: ; CHECK-BE: // %bb.0: // %entry @@ -463,12 +521,12 @@ entry: } define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) { -; CHECK-LE-LABEL: test_pmull_high_p8_128: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: fmov d0, x3 -; CHECK-LE-NEXT: fmov d1, x1 -; CHECK-LE-NEXT: pmull v0.8h, v1.8b, v0.8b -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_pmull_high_p8_128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d0, x3 +; CHECK-NEXT: fmov d1, x1 +; CHECK-NEXT: pmull v0.8h, v1.8b, v0.8b +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_pmull_high_p8_128: ; CHECK-BE: // %bb.0: // %entry @@ -490,10 +548,10 @@ entry: } define <8 x i16> @test_pmull_high_p8_64(<2 x i64> %aa, <2 x i64> %bb) { -; CHECK-LE-LABEL: test_pmull_high_p8_64: -; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: pmull2 v0.8h, v0.16b, v1.16b -; CHECK-LE-NEXT: ret +; CHECK-LABEL: test_pmull_high_p8_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: pmull2 v0.8h, v0.16b, v1.16b +; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_pmull_high_p8_64: ; CHECK-BE: // %bb.0: // %entry @@ -532,6 +590,14 @@ define <8 x i16> @foov8i16(<16 x i8> %a1, <2 x i64> %b1) { ; CHECK-BE-NEXT: rev64 v0.8h, v0.8h ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: foov8i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #5 +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #5 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret %a0 = bitcast <16 x i8> %a1 to <4 x i32> %b0 = bitcast <2 x i64> %b1 to <4 x i32> %vshrn_low_shift = lshr <4 x i32> %a0, <i32 5, i32 5, i32 5, i32 5> @@ -558,6 +624,12 @@ define <2 x i64> @hadd32_zext_asr(<16 x i8> %src1a) { ; CHECK-BE-NEXT: ushll2 v0.2d, v0.4s, #1 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: hadd32_zext_asr: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #1 +; CHECK-GI-NEXT: ret %src1 = bitcast <16 x i8> %src1a to <4 x i32> %s1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %zextsrc1 = zext <2 x i32> %s1 to <2 x i64> @@ -580,6 +652,12 @@ define <2 x i64> @test_umull_high_s16_splata1(<2 x i64> %aa, <4 x i32> %b) #0 { ; CHECK-BE-NEXT: umull2 v0.2d, v1.4s, v0.s[1] ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_umull_high_s16_splata1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: umull v0.2d, v1.2s, v0.s[1] +; CHECK-GI-NEXT: ret entry: %a = bitcast <2 x i64> %aa to <4 x i32> %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 1, i32 1> diff --git a/llvm/test/CodeGen/AArch64/neon-saba.ll b/llvm/test/CodeGen/AArch64/neon-saba.ll index 78ccc89..19967bd 100644 --- a/llvm/test/CodeGen/AArch64/neon-saba.ll +++ b/llvm/test/CodeGen/AArch64/neon-saba.ll @@ -1,13 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple aarch64-unknown-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple aarch64-unknown-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; SABA from ADD(ABS(SUB NSW)) define <4 x i32> @saba_abs_4s(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { -; CHECK-LABEL: saba_abs_4s: -; CHECK: // %bb.0: -; CHECK-NEXT: saba v0.4s, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: saba v0.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: abs v1.4s, v1.4s +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ret %sub = sub nsw <4 x i32> %b, %c %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true) %add = add <4 x i32> %a, %abs @@ -15,10 +23,17 @@ define <4 x i32> @saba_abs_4s(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { } define <2 x i32> @saba_abs_2s(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { -; CHECK-LABEL: saba_abs_2s: -; CHECK: // %bb.0: -; CHECK-NEXT: saba v0.2s, v1.2s, v2.2s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_2s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: saba v0.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_2s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: abs v1.2s, v1.2s +; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: ret %sub = sub nsw <2 x i32> %b, %c %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %sub, i1 true) %add = add <2 x i32> %a, %abs @@ -26,10 +41,17 @@ define <2 x i32> @saba_abs_2s(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { } define <8 x i16> @saba_abs_8h(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { -; CHECK-LABEL: saba_abs_8h: -; CHECK: // %bb.0: -; CHECK-NEXT: saba v0.8h, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_8h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: saba v0.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_8h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: abs v1.8h, v1.8h +; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: ret %sub = sub nsw <8 x i16> %b, %c %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true) %add = add <8 x i16> %a, %abs @@ -37,10 +59,17 @@ define <8 x i16> @saba_abs_8h(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { } define <4 x i16> @saba_abs_4h(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { -; CHECK-LABEL: saba_abs_4h: -; CHECK: // %bb.0: -; CHECK-NEXT: saba v0.4h, v1.4h, v2.4h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_4h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: saba v0.4h, v1.4h, v2.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_4h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: abs v1.4h, v1.4h +; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: ret %sub = sub nsw <4 x i16> %b, %c %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %sub, i1 true) %add = add <4 x i16> %a, %abs @@ -48,10 +77,17 @@ define <4 x i16> @saba_abs_4h(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { } define <16 x i8> @saba_abs_16b(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { -; CHECK-LABEL: saba_abs_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: saba v0.16b, v1.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: saba v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: abs v1.16b, v1.16b +; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret %sub = sub nsw <16 x i8> %b, %c %abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %sub, i1 true) %add = add <16 x i8> %a, %abs @@ -59,10 +95,17 @@ define <16 x i8> @saba_abs_16b(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { } define <8 x i8> @saba_abs_8b(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { -; CHECK-LABEL: saba_abs_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: saba v0.8b, v1.8b, v2.8b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: saba v0.8b, v1.8b, v2.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: abs v1.8b, v1.8b +; CHECK-GI-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: ret %sub = sub nsw <8 x i8> %b, %c %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %sub, i1 true) %add = add <8 x i8> %a, %abs diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll new file mode 100644 index 0000000..843446a --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll @@ -0,0 +1,195 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + ret void +} + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + ret void +} + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + ret void +} + +define void @test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 0) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll new file mode 100644 index 0000000..9b485803 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll @@ -0,0 +1,152 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + ret void +} + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + ret void +} + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll new file mode 100644 index 0000000..4325405 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll @@ -0,0 +1,353 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %f1); + +define void @cp_async_bulk_tensor_g2s_cta_tile_1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_1d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_4]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_1d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_4]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 %ch, i1 0) + ret void +} + +define void @cp_async_bulk_tensor_g2s_cta_tile_2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_2d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<3>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_2d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 0) + ret void +} + +define void @cp_async_bulk_tensor_g2s_cta_tile_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 0) + ret void +} + +define void @cp_async_bulk_tensor_g2s_cta_tile_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 0) + ret void +} + +define void @cp_async_bulk_tensor_g2s_cta_tile_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 0) + ret void +} + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 %f1); + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 0) + ret void +} + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 0) + ret void +} + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 0) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll new file mode 100644 index 0000000..ef4a8fb --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll @@ -0,0 +1,174 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d +define void @test_cp_async_bulk_tensor_g2s_tile_gather4_2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1 +define void @test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2 +define void @test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll new file mode 100644 index 0000000..112dab1 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll @@ -0,0 +1,524 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d +define void @cp_async_bulk_tensor_g2s_im2colw_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_3d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_3d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_3d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d +define void @cp_async_bulk_tensor_g2s_im2colw_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_4d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_4d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d +define void @cp_async_bulk_tensor_g2s_im2colw_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_5d_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_5d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_5d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll new file mode 100644 index 0000000..54e861e --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll @@ -0,0 +1,524 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d +define void @cp_async_bulk_tensor_g2s_im2colw_128_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d +define void @cp_async_bulk_tensor_g2s_im2colw_128_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d +define void @cp_async_bulk_tensor_g2s_im2colw_128_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll new file mode 100644 index 0000000..6bf8f03 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %f1); + +define void @test_cp_async_bulk_tensor_prefetch_3d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_prefetch_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_3d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_prefetch_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + ret void +} + +define void @test_cp_async_bulk_tensor_prefetch_4d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_prefetch_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_4d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_prefetch_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + ret void +} + +define void @test_cp_async_bulk_tensor_prefetch_5d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_prefetch_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_prefetch_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_5d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_prefetch_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_prefetch_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + ret void +} + +define void @test_cp_async_bulk_tensor_prefetch_tile_gather4_2d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) { + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll new file mode 100644 index 0000000..2ef44ff --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag); + +; CHECK-LABEL: cp_async_bulk_tensor_s2g_tile_scatter4_2d +define void @cp_async_bulk_tensor_s2g_tile_scatter4_2d(i32 %flag, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_s2g_tile_scatter4_2d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_s2g_tile_scatter4_2d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) + + ret void +} |