Diffstat (limited to 'llvm/test/CodeGen/NVPTX')
45 files changed, 2925 insertions, 308 deletions
diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll index dd9a472..19ec257 100644 --- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll +++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll @@ -45,29 +45,31 @@ define half @fh(ptr %p) { ; ENABLED-LABEL: fh( ; ENABLED: { ; ENABLED-NEXT: .reg .b16 %rs<10>; -; ENABLED-NEXT: .reg .b32 %r<13>; +; ENABLED-NEXT: .reg .b32 %r<17>; ; ENABLED-NEXT: .reg .b64 %rd<2>; ; ENABLED-EMPTY: ; ENABLED-NEXT: // %bb.0: ; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0]; -; ENABLED-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; -; ENABLED-NEXT: ld.b16 %rs5, [%rd1+8]; -; ENABLED-NEXT: cvt.f32.f16 %r1, %rs2; -; ENABLED-NEXT: cvt.f32.f16 %r2, %rs1; -; ENABLED-NEXT: add.rn.f32 %r3, %r2, %r1; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r3; -; ENABLED-NEXT: cvt.f32.f16 %r4, %rs4; -; ENABLED-NEXT: cvt.f32.f16 %r5, %rs3; -; ENABLED-NEXT: add.rn.f32 %r6, %r5, %r4; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r6; -; ENABLED-NEXT: cvt.f32.f16 %r7, %rs7; -; ENABLED-NEXT: cvt.f32.f16 %r8, %rs6; -; ENABLED-NEXT: add.rn.f32 %r9, %r8, %r7; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r9; -; ENABLED-NEXT: cvt.f32.f16 %r10, %rs8; -; ENABLED-NEXT: cvt.f32.f16 %r11, %rs5; -; ENABLED-NEXT: add.rn.f32 %r12, %r10, %r11; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r12; +; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; ENABLED-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; } +; ENABLED-NEXT: mov.b32 {%rs2, %rs3}, %r2; +; ENABLED-NEXT: mov.b32 {%rs4, %rs5}, %r1; +; ENABLED-NEXT: cvt.f32.f16 %r5, %rs5; +; ENABLED-NEXT: cvt.f32.f16 %r6, %rs4; +; ENABLED-NEXT: add.rn.f32 %r7, %r6, %r5; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r7; +; ENABLED-NEXT: cvt.f32.f16 %r8, %rs3; +; ENABLED-NEXT: cvt.f32.f16 %r9, %rs2; +; ENABLED-NEXT: add.rn.f32 %r10, %r9, %r8; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r10; +; ENABLED-NEXT: cvt.f32.f16 %r11, %rs7; +; ENABLED-NEXT: cvt.f32.f16 %r12, %rs6; +; ENABLED-NEXT: add.rn.f32 %r13, %r12, %r11; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r13; +; ENABLED-NEXT: cvt.f32.f16 %r14, %rs8; +; ENABLED-NEXT: cvt.f32.f16 %r15, %rs1; +; ENABLED-NEXT: add.rn.f32 %r16, %r14, %r15; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r16; ; ENABLED-NEXT: st.param.b16 [func_retval0], %rs9; ; ENABLED-NEXT: ret; ; diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index 4d930cd..41f77b5 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM70 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM90-FTZ %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s ; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %} @@ -55,13 +56,24 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, 
[test_fadd_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fadd( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; +; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fadd( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -118,13 +130,24 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fsub( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; +; SM90-FTZ-NEXT: sub.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fsub( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -195,16 +218,27 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_faddx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_faddx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_faddx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1]; +; SM90-FTZ-NEXT: add.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_faddx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -275,16 +309,27 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsubx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsubx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, 
%rs1; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fsubx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1]; +; SM90-FTZ-NEXT: sub.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fsubx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -355,16 +400,27 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmulx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmulx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fmulx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1]; +; SM90-FTZ-NEXT: mul.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fmulx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -441,16 +497,34 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fdiv( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<5>; +; SM90-FTZ-NEXT: .reg .b32 %r<8>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; +; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM90-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM90-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4; +; SM90-FTZ-NEXT: cvt.rn.bf16x2.f32 
%r7, %r6, %r3; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r7; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fdiv( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<5>; @@ -527,10 +601,21 @@ define float @test_fpext_float(bfloat %a) #0 { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fpext_float( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fpext_float( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -585,6 +670,17 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptrunc_float( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fptrunc_float_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptrunc_float( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -637,12 +733,23 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r2, %r1, 0f3F800000; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fadd_imm_1( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; +; SM90-FTZ-NEXT: mov.b16 %rs2, 0x3F80; +; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fadd_imm_1( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -661,17 +768,18 @@ define bfloat @test_select_cc_bf16_f64(double %a, double %b, bfloat %c, bfloat % ; CHECK-LABEL: test_select_cc_bf16_f64( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_bf16_f64_param_0]; ; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_bf16_f64_param_1]; ; CHECK-NEXT: setp.lt.f64 %p1, %rd1, %rd2; -; CHECK-NEXT: ld.param.b16 %rs1, [test_select_cc_bf16_f64_param_2]; -; CHECK-NEXT: ld.param.b16 %rs2, [test_select_cc_bf16_f64_param_3]; -; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: mov.b64 %rd3, test_select_cc_bf16_f64_param_3; +; CHECK-NEXT: mov.b64 %rd4, test_select_cc_bf16_f64_param_2; +; CHECK-NEXT: selp.b64 %rd5, %rd4, %rd3, %p1; +; CHECK-NEXT: ld.param.b16 %rs1, [%rd5]; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; %cc = fcmp olt double %a, %b %r = select i1 %cc, bfloat %c, bfloat %d @@ -750,18 +858,43 @@ define <8 x float> 
@test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; ; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; ; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs8; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r6, %rs7; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r7, %rs6; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r8, %rs5; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r9, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r10, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r11, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r12, %rs1; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_extload_bf16x8( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<9>; +; SM90-FTZ-NEXT: .reg .b32 %r<13>; +; SM90-FTZ-NEXT: .reg .b64 %rd<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; +; SM90-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; SM90-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; SM90-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM90-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; +; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_extload_bf16x8( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<9>; @@ -825,12 +958,24 @@ define i16 @test_fptosi_i16(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.s16.f32 %rs2, %r1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptosi_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rzi.s16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptosi_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -880,12 +1025,24 @@ define i16 @test_fptoui_i16(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.u16.f32 %rs2, %r1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: 
test_fptoui_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rzi.u16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptoui_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -945,6 +1102,16 @@ define bfloat @test_sitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_sitofp_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.s16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_sitofp_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1002,6 +1169,16 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i8( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i8( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1070,6 +1247,21 @@ define bfloat @test_uitofp_i1(i1 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i1( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .pred %p<2>; +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0]; +; SM90-FTZ-NEXT: and.b16 %rs2, %rs1, 1; +; SM90-FTZ-NEXT: setp.ne.b16 %p1, %rs2, 0; +; SM90-FTZ-NEXT: selp.b32 %r1, 1, 0, %p1; +; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs3, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i1( ; SM90: { ; SM90-NEXT: .reg .pred %p<2>; @@ -1132,6 +1324,16 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1188,6 +1390,17 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i32( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs1, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i32( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1248,6 +1461,17 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i64( +; SM90-FTZ: { +; SM90-FTZ-NEXT: 
.reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b64 %rd<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u64 %rs1, %rd1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i64( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1302,12 +1526,22 @@ define bfloat @test_roundeven(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rni.ftz.f32.f32 %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_roundeven( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0]; +; SM90-FTZ-NEXT: cvt.rni.bf16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_roundeven( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1372,6 +1606,17 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maximum( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maximum_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maximum_param_1]; +; SM90-FTZ-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maximum( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -1430,6 +1675,17 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maxnum( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1]; +; SM90-FTZ-NEXT: max.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maxnum( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -1511,6 +1767,17 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maximum_v2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_1]; +; SM90-FTZ-NEXT: max.NaN.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maximum_v2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -1583,6 +1850,17 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maxnum_v2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_1]; +; SM90-FTZ-NEXT: max.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; 
SM90-LABEL: test_maxnum_v2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index 3c6fb4b..c19e665 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -229,16 +229,18 @@ define <2 x bfloat> @test_select(<2 x bfloat> %a, <2 x bfloat> %b, i1 zeroext %c ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; ; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2]; ; CHECK-NEXT: and.b16 %rs2, %rs1, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-NEXT: ld.param.b32 %r1, [test_select_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [test_select_param_1]; -; CHECK-NEXT: selp.b32 %r3, %r1, %r2, %p1; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: mov.b64 %rd1, test_select_param_1; +; CHECK-NEXT: mov.b64 %rd2, test_select_param_0; +; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1; +; CHECK-NEXT: ld.param.b32 %r1, [%rd3]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %r = select i1 %c, <2 x bfloat> %a, <2 x bfloat> %b ret <2 x bfloat> %r diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll index e3d1c80..8050c6f 100644 --- a/llvm/test/CodeGen/NVPTX/bswap.ll +++ b/llvm/test/CodeGen/NVPTX/bswap.ll @@ -1,25 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | FileCheck -check-prefixes CHECK,PTX70 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} -; RUN: %if ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | %ptxas-verify %} -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | FileCheck -check-prefixes CHECK,PTX71 %s -; RUN: %if ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} target triple = "nvptx64-nvidia-cuda" define i16 @bswap16(i16 %a) { ; CHECK-LABEL: bswap16( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b16 %rs1, [bswap16_param_0]; -; CHECK-NEXT: shr.u16 %rs2, %rs1, 8; -; CHECK-NEXT: shl.b16 %rs3, %rs1, 8; -; CHECK-NEXT: or.b16 %rs4, %rs3, %rs2; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; -; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ld.param.b16 %r1, [bswap16_param_0]; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7701U; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %b = tail call i16 @llvm.bswap.i16(i16 %a) ret i16 %b @@ -56,40 +49,39 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 { } define i64 @bswap64(i64 %a) { -; PTX70-LABEL: bswap64( -; PTX70: { -; PTX70-NEXT: .reg .b32 %r<5>; -; PTX70-NEXT: .reg .b64 %rd<3>; -; PTX70-EMPTY: -; PTX70-NEXT: // %bb.0: -; PTX70-NEXT: ld.param.b64 %rd1, [bswap64_param_0]; -; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; } -; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 0x123U; -; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; } -; PTX70-NEXT: prmt.b32 %r4, %r3, 0, 0x123U; -; PTX70-NEXT: mov.b64 %rd2, {%r4, %r2}; -; PTX70-NEXT: st.param.b64 [func_retval0], %rd2; -; PTX70-NEXT: ret; -; -; PTX71-LABEL: bswap64( -; PTX71: { -; PTX71-NEXT: .reg .b32 %r<5>; -; 
PTX71-NEXT: .reg .b64 %rd<3>; -; PTX71-EMPTY: -; PTX71-NEXT: // %bb.0: -; PTX71-NEXT: ld.param.b64 %rd1, [bswap64_param_0]; -; PTX71-NEXT: mov.b64 {%r1, _}, %rd1; -; PTX71-NEXT: prmt.b32 %r2, %r1, 0, 0x123U; -; PTX71-NEXT: mov.b64 {_, %r3}, %rd1; -; PTX71-NEXT: prmt.b32 %r4, %r3, 0, 0x123U; -; PTX71-NEXT: mov.b64 %rd2, {%r4, %r2}; -; PTX71-NEXT: st.param.b64 [func_retval0], %rd2; -; PTX71-NEXT: ret; +; CHECK-LABEL: bswap64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [bswap64_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x123U; +; CHECK-NEXT: prmt.b32 %r4, %r2, 0, 0x123U; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; %b = tail call i64 @llvm.bswap.i64(i64 %a) ret i64 %b } +define <2 x i32> @bswapv2i32(<2 x i32> %a) { +; CHECK-LABEL: bswapv2i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [bswapv2i32_param_0]; +; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x123U; +; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x123U; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %b = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a) + ret <2 x i32> %b +} declare i16 @llvm.bswap.i16(i16) declare i32 @llvm.bswap.i32(i32) declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) declare i64 @llvm.bswap.i64(i64) +declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) diff --git a/llvm/test/CodeGen/NVPTX/bug22246.ll b/llvm/test/CodeGen/NVPTX/bug22246.ll index 198878c..1d7a396 100644 --- a/llvm/test/CodeGen/NVPTX/bug22246.ll +++ b/llvm/test/CodeGen/NVPTX/bug22246.ll @@ -9,19 +9,20 @@ define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, ptr noc ; CHECK-LABEL: _Z3foobbbPb( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; -; CHECK-NEXT: .reg .b16 %rs<7>; -; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.b8 %rs1, [_Z3foobbbPb_param_0]; ; CHECK-NEXT: and.b16 %rs2, %rs1, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-NEXT: ld.param.b8 %rs3, [_Z3foobbbPb_param_1]; -; CHECK-NEXT: ld.param.b8 %rs4, [_Z3foobbbPb_param_2]; -; CHECK-NEXT: selp.b16 %rs5, %rs3, %rs4, %p1; -; CHECK-NEXT: and.b16 %rs6, %rs5, 1; -; CHECK-NEXT: ld.param.b64 %rd1, [_Z3foobbbPb_param_3]; -; CHECK-NEXT: st.b8 [%rd1], %rs6; +; CHECK-NEXT: mov.b64 %rd1, _Z3foobbbPb_param_2; +; CHECK-NEXT: mov.b64 %rd2, _Z3foobbbPb_param_1; +; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1; +; CHECK-NEXT: ld.param.b8 %rs3, [%rd3]; +; CHECK-NEXT: and.b16 %rs4, %rs3, 1; +; CHECK-NEXT: ld.param.b64 %rd4, [_Z3foobbbPb_param_3]; +; CHECK-NEXT: st.b8 [%rd4], %rs4; ; CHECK-NEXT: ret; entry: %.sink.v = select i1 %p1, i1 %p2, i1 %p3 diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll new file mode 100644 index 0000000..4d81fdc --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll @@ -0,0 +1,11 @@ +; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s -o /dev/null 2>&1 | FileCheck %s + +; Test that we get a clear error message when using an unsupported syncscope. 
+ +; CHECK: NVPTX backend does not support syncscope "agent" +; CHECK: Supported syncscopes are: singlethread, <empty string>, block, cluster, device +define i32 @cmpxchg_unsupported_syncscope_agent(ptr %addr, i32 %cmp, i32 %new) { + %result = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("agent") monotonic monotonic + %value = extractvalue { i32, i1 } %result, 0 + ret i32 %value +} diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll b/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll new file mode 100644 index 0000000..b773c8d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll @@ -0,0 +1,278 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx81 | FileCheck %s +; RUN: %if ptxas-sm_80 && ptxas-isa-8.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx81 | %ptxas-verify -arch=sm_80 %} + +; CHECK-LABEL: cvt_rna_satfinite_tf32_f32 +define i32 @cvt_rna_satfinite_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rna_satfinite_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rna_satfinite_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rna.satfinite.tf32.f32 %r2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rna.satfinite(float %f1) + ret i32 %val +} + +define <2 x bfloat> @cvt_rn_bf16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_bf16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_bf16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_bf16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rn.satfinite.bf16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float %f1, float %f2) + ret <2 x bfloat> %val +} + +define <2 x bfloat> @cvt_rn_relu_bf16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_relu_bf16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_bf16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_relu_bf16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rn.relu.satfinite.bf16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float %f1, float %f2) + ret <2 x bfloat> %val +} + +define <2 x bfloat> @cvt_rz_bf16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_bf16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_bf16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_bf16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rz.satfinite.bf16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float %f1, float %f2) + ret <2 x bfloat> %val +} + +define <2 x bfloat> @cvt_rz_relu_bf16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_relu_bf16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_bf16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_relu_bf16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rz.relu.satfinite.bf16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 
[func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float %f1, float %f2) + ret <2 x bfloat> %val +} + +declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float, float) +declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float, float) +declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float, float) +declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float, float) + +define <2 x half> @cvt_rn_f16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_f16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_f16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rn.satfinite.f16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.ff2f16x2.rn.satfinite(float %f1, float %f2) + ret <2 x half> %val +} + +define <2 x half> @cvt_rn_relu_f16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_relu_f16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_relu_f16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rn.relu.satfinite.f16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu.satfinite(float %f1, float %f2) + ret <2 x half> %val +} + +define <2 x half> @cvt_rz_f16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_f16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_f16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_f16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rz.satfinite.f16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.ff2f16x2.rz.satfinite(float %f1, float %f2) + ret <2 x half> %val +} + +define <2 x half> @cvt_rz_relu_f16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_relu_f16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_f16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_relu_f16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rz.relu.satfinite.f16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu.satfinite(float %f1, float %f2) + ret <2 x half> %val +} + +declare <2 x half> @llvm.nvvm.ff2f16x2.rn.satfinite(float, float) +declare <2 x half> @llvm.nvvm.ff2f16x2.rn.relu.satfinite(float, float) +declare <2 x half> @llvm.nvvm.ff2f16x2.rz.satfinite(float, float) +declare <2 x half> @llvm.nvvm.ff2f16x2.rz.relu.satfinite(float, float) + +define bfloat @cvt_rn_bf16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rn_bf16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_bf16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rn.satfinite.bf16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call bfloat @llvm.nvvm.f2bf16.rn.satfinite(float %f1) + ret bfloat %val +} + +define bfloat @cvt_rn_relu_bf16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rn_relu_bf16_f32_sf( +; CHECK: 
{ +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_bf16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rn.relu.satfinite.bf16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call bfloat @llvm.nvvm.f2bf16.rn.relu.satfinite(float %f1) + ret bfloat %val +} + +define bfloat @cvt_rz_bf16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rz_bf16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_bf16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rz.satfinite.bf16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call bfloat @llvm.nvvm.f2bf16.rz.satfinite(float %f1) + ret bfloat %val +} + +define bfloat @cvt_rz_relu_bf16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rz_relu_bf16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_bf16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rz.relu.satfinite.bf16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call bfloat @llvm.nvvm.f2bf16.rz.relu.satfinite(float %f1) + ret bfloat %val +} + +declare bfloat @llvm.nvvm.f2bf16.rn.satfinite(float) +declare bfloat @llvm.nvvm.f2bf16.rn.relu.satfinite(float) +declare bfloat @llvm.nvvm.f2bf16.rz.satfinite(float) +declare bfloat @llvm.nvvm.f2bf16.rz.relu.satfinite(float) + +define half @cvt_rn_f16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rn_f16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rn.satfinite.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rn.satfinite(float %f1) + ret half %val +} + +define half @cvt_rn_relu_f16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rn_relu_f16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rn.relu.satfinite.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rn.relu.satfinite(float %f1) + ret half %val +} + +define half @cvt_rz_f16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rz_f16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_f16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rz.satfinite.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rz.satfinite(float %f1) + ret half %val +} + +define half @cvt_rz_relu_f16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rz_relu_f16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_f16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rz.relu.satfinite.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rz.relu.satfinite(float %f1) + ret half %val +} + +declare half @llvm.nvvm.f2f16.rn.satfinite(float) +declare 
half @llvm.nvvm.f2f16.rn.relu.satfinite(float) +declare half @llvm.nvvm.f2f16.rz.satfinite(float) +declare half @llvm.nvvm.f2f16.rz.relu.satfinite(float) diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80.ll b/llvm/test/CodeGen/NVPTX/convert-sm80.ll index edf1739..a47bbab 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm80.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm80.ll @@ -198,6 +198,71 @@ declare bfloat @llvm.nvvm.f2bf16.rn.relu(float) declare bfloat @llvm.nvvm.f2bf16.rz(float) declare bfloat @llvm.nvvm.f2bf16.rz.relu(float) +define half @cvt_rn_f16_f32(float %f1) { +; CHECK-LABEL: cvt_rn_f16_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16_f32_param_0]; +; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rn(float %f1) + ret half %val +} + +define half @cvt_rn_relu_f16_f32(float %f1) { +; CHECK-LABEL: cvt_rn_relu_f16_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16_f32_param_0]; +; CHECK-NEXT: cvt.rn.relu.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rn.relu(float %f1) + ret half %val +} + +define half @cvt_rz_f16_f32(float %f1) { +; CHECK-LABEL: cvt_rz_f16_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_f16_f32_param_0]; +; CHECK-NEXT: cvt.rz.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rz(float %f1) + ret half %val +} + +define half @cvt_rz_relu_f16_f32(float %f1) { +; CHECK-LABEL: cvt_rz_relu_f16_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_f16_f32_param_0]; +; CHECK-NEXT: cvt.rz.relu.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rz.relu(float %f1) + ret half %val +} + +declare half @llvm.nvvm.f2f16.rn(float) +declare half @llvm.nvvm.f2f16.rn.relu(float) +declare half @llvm.nvvm.f2f16.rz(float) +declare half @llvm.nvvm.f2f16.rz.relu(float) + define i32 @cvt_rna_tf32_f32(float %f1) { ; CHECK-LABEL: cvt_rna_tf32_f32( ; CHECK: { diff --git a/llvm/test/CodeGen/NVPTX/convert-sm89.ll b/llvm/test/CodeGen/NVPTX/convert-sm89.ll index 616dcfa..170c120 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm89.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm89.ll @@ -84,10 +84,3 @@ define <2 x half> @cvt_rn_relu_f16x2_e5m2x2(i16 %in) { %val = call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 %in); ret <2 x half> %val } - -; CHECK-LABEL: cvt_rna_satfinite_tf32_f32 -define i32 @cvt_rna_satfinite_tf32_f32(float %f1) { -; CHECK: cvt.rna.satfinite.tf32.f32 - %val = call i32 @llvm.nvvm.f2tf32.rna.satfinite(float %f1) - ret i32 %val -} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll new file mode 100644 index 0000000..9872b2a --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 
-mattr=+ptx86| FileCheck --check-prefixes=CHECK,CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i64, i1) + +define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr addrspace(3) %dst, i32 %size, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_g2s( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_g2s_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_3]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_g2s_param_4]; +; CHECK-PTX64-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2]; +; CHECK-PTX64-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rd4; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_g2s( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_g2s_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_g2s_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_4]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%r2], [%rd1], %r3, [%r1]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll index b5c43fd2..d653895 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll index 57342dc..5de1ac8 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll index 6296d5a..2f5c1ef 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck 
--check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll index e5ae387..a2b2c2f 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll index 7d04ada..e4c48dd 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify 
-arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll index b0fe77c..727bb3b 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} ; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" @@ -29,10 +33,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -48,10 +52,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, 
[cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -79,10 +83,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -99,10 +103,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4}], [%r2], 
%rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -131,10 +135,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -152,10 +156,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -185,10 +189,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; -; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -207,10 +211,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -241,10 +245,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint 
[%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -264,10 +268,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -297,10 +301,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, 
%rd4; ; CHECK-PTX64-NEXT: ret; @@ -319,10 +323,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -354,10 +358,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -378,10 +382,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; -; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -415,10 +419,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; +; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -441,10 +445,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, 
[cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/f16-ex2.ll b/llvm/test/CodeGen/NVPTX/f16-ex2.ll index ee79f9d..af3fe67 100644 --- a/llvm/test/CodeGen/NVPTX/f16-ex2.ll +++ b/llvm/test/CodeGen/NVPTX/f16-ex2.ll @@ -1,12 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s -; RUN: %if ptxas-sm_75 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK-FP16 %s +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" declare half @llvm.nvvm.ex2.approx.f16(half) -declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>) +declare <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half>) +declare bfloat @llvm.nvvm.ex2.approx.ftz.bf16(bfloat) +declare <2 x bfloat> @llvm.nvvm.ex2.approx.ftz.v2bf16(<2 x bfloat>) -; CHECK-LABEL: ex2_half define half @ex2_half(half %0) { ; CHECK-FP16-LABEL: ex2_half( ; CHECK-FP16: { @@ -21,7 +22,6 @@ define half @ex2_half(half %0) { ret half %res } -; CHECK-LABEL: ex2_2xhalf define <2 x half> @ex2_2xhalf(<2 x half> %0) { ; CHECK-FP16-LABEL: ex2_2xhalf( ; CHECK-FP16: { @@ -32,6 +32,34 @@ define <2 x half> @ex2_2xhalf(<2 x half> %0) { ; CHECK-FP16-NEXT: ex2.approx.f16x2 %r2, %r1; ; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-FP16-NEXT: ret; - %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0) + %res = call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> %0) ret <2 x half> %res } + +define bfloat @ex2_bfloat(bfloat %0) { +; CHECK-FP16-LABEL: ex2_bfloat( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .b16 %rs<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: +; CHECK-FP16-NEXT: ld.param.b16 %rs1, [ex2_bfloat_param_0]; +; CHECK-FP16-NEXT: ex2.approx.ftz.bf16 %rs2, %rs1; +; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-FP16-NEXT: ret; + %res = call bfloat @llvm.nvvm.ex2.approx.ftz.bf16(bfloat %0) + ret bfloat %res +} + +define <2 x bfloat> @ex2_2xbfloat(<2 x bfloat> %0) { +; CHECK-FP16-LABEL: ex2_2xbfloat( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .b32 %r<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: +; CHECK-FP16-NEXT: ld.param.b32 %r1, [ex2_2xbfloat_param_0]; +; CHECK-FP16-NEXT: ex2.approx.ftz.bf16x2 %r2, %r1; +; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-FP16-NEXT: ret; + %res = call <2 x bfloat> 
@llvm.nvvm.ex2.approx.ftz.v2bf16(<2 x bfloat> %0) + ret <2 x bfloat> %res +} diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll index 796d80d..97b9d35 100644 --- a/llvm/test/CodeGen/NVPTX/f32-ex2.ll +++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll @@ -3,7 +3,8 @@ ; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %} target triple = "nvptx-nvidia-cuda" -declare float @llvm.nvvm.ex2.approx.f(float) +declare float @llvm.nvvm.ex2.approx.f32(float) +declare float @llvm.nvvm.ex2.approx.ftz.f32(float) ; CHECK-LABEL: ex2_float define float @ex2_float(float %0) { @@ -16,7 +17,7 @@ define float @ex2_float(float %0) { ; CHECK-NEXT: ex2.approx.f32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; - %res = call float @llvm.nvvm.ex2.approx.f(float %0) + %res = call float @llvm.nvvm.ex2.approx.f32(float %0) ret float %res } @@ -31,6 +32,6 @@ define float @ex2_float_ftz(float %0) { ; CHECK-NEXT: ex2.approx.ftz.f32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; - %res = call float @llvm.nvvm.ex2.approx.ftz.f(float %0) + %res = call float @llvm.nvvm.ex2.approx.ftz.f32(float %0) ret float %res } diff --git a/llvm/test/CodeGen/NVPTX/fast-math.ll b/llvm/test/CodeGen/NVPTX/fast-math.ll index 8561c60..7e778c4 100644 --- a/llvm/test/CodeGen/NVPTX/fast-math.ll +++ b/llvm/test/CodeGen/NVPTX/fast-math.ll @@ -312,18 +312,20 @@ define float @repeated_div_recip_allowed_sel(i1 %pred, float %a, float %b, float ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; ; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b8 %rs1, [repeated_div_recip_allowed_sel_param_0]; ; CHECK-NEXT: and.b16 %rs2, %rs1, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-NEXT: ld.param.b32 %r1, [repeated_div_recip_allowed_sel_param_1]; -; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_recip_allowed_sel_param_2]; -; CHECK-NEXT: selp.f32 %r3, %r1, %r2, %p1; -; CHECK-NEXT: ld.param.b32 %r4, [repeated_div_recip_allowed_sel_param_3]; -; CHECK-NEXT: div.rn.f32 %r5, %r3, %r4; -; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: mov.b64 %rd1, repeated_div_recip_allowed_sel_param_2; +; CHECK-NEXT: mov.b64 %rd2, repeated_div_recip_allowed_sel_param_1; +; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1; +; CHECK-NEXT: ld.param.b32 %r1, [%rd3]; +; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_recip_allowed_sel_param_3]; +; CHECK-NEXT: div.rn.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %x = fdiv arcp float %a, %divisor %y = fdiv arcp float %b, %divisor @@ -364,18 +366,20 @@ define float @repeated_div_recip_allowed_ftz_sel(i1 %pred, float %a, float %b, f ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; ; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b8 %rs1, [repeated_div_recip_allowed_ftz_sel_param_0]; ; CHECK-NEXT: and.b16 %rs2, %rs1, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-NEXT: ld.param.b32 %r1, [repeated_div_recip_allowed_ftz_sel_param_1]; -; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_recip_allowed_ftz_sel_param_2]; -; CHECK-NEXT: selp.f32 %r3, %r1, %r2, %p1; -; CHECK-NEXT: ld.param.b32 %r4, [repeated_div_recip_allowed_ftz_sel_param_3]; -; CHECK-NEXT: div.rn.ftz.f32 %r5, %r3, %r4; -; 
CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: mov.b64 %rd1, repeated_div_recip_allowed_ftz_sel_param_2; +; CHECK-NEXT: mov.b64 %rd2, repeated_div_recip_allowed_ftz_sel_param_1; +; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1; +; CHECK-NEXT: ld.param.b32 %r1, [%rd3]; +; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_recip_allowed_ftz_sel_param_3]; +; CHECK-NEXT: div.rn.ftz.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %x = fdiv arcp float %a, %divisor %y = fdiv arcp float %b, %divisor @@ -416,18 +420,20 @@ define float @repeated_div_fast_sel(i1 %pred, float %a, float %b, float %divisor ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; ; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b8 %rs1, [repeated_div_fast_sel_param_0]; ; CHECK-NEXT: and.b16 %rs2, %rs1, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-NEXT: ld.param.b32 %r1, [repeated_div_fast_sel_param_1]; -; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_fast_sel_param_2]; -; CHECK-NEXT: selp.f32 %r3, %r1, %r2, %p1; -; CHECK-NEXT: ld.param.b32 %r4, [repeated_div_fast_sel_param_3]; -; CHECK-NEXT: div.approx.f32 %r5, %r3, %r4; -; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: mov.b64 %rd1, repeated_div_fast_sel_param_2; +; CHECK-NEXT: mov.b64 %rd2, repeated_div_fast_sel_param_1; +; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1; +; CHECK-NEXT: ld.param.b32 %r1, [%rd3]; +; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_fast_sel_param_3]; +; CHECK-NEXT: div.approx.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %x = fdiv afn float %a, %divisor %y = fdiv afn float %b, %divisor @@ -468,18 +474,20 @@ define float @repeated_div_fast_ftz_sel(i1 %pred, float %a, float %b, float %div ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; ; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b8 %rs1, [repeated_div_fast_ftz_sel_param_0]; ; CHECK-NEXT: and.b16 %rs2, %rs1, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-NEXT: ld.param.b32 %r1, [repeated_div_fast_ftz_sel_param_1]; -; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_fast_ftz_sel_param_2]; -; CHECK-NEXT: selp.f32 %r3, %r1, %r2, %p1; -; CHECK-NEXT: ld.param.b32 %r4, [repeated_div_fast_ftz_sel_param_3]; -; CHECK-NEXT: div.approx.ftz.f32 %r5, %r3, %r4; -; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: mov.b64 %rd1, repeated_div_fast_ftz_sel_param_2; +; CHECK-NEXT: mov.b64 %rd2, repeated_div_fast_ftz_sel_param_1; +; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1; +; CHECK-NEXT: ld.param.b32 %r1, [%rd3]; +; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_fast_ftz_sel_param_3]; +; CHECK-NEXT: div.approx.ftz.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %x = fdiv afn float %a, %divisor %y = fdiv afn float %b, %divisor diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll new file mode 100644 index 0000000..d46408e --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ 
llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | %ptxas-verify -arch=sm_90 %} + +define void @test_nvvm_fence_proxy_async_generic_acquire_sync_restrict() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_generic_acquire_sync_restrict( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster() + ret void +} + +define void @test_nvvm_fence_proxy_async_generic_release_sync_restrict() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_generic_release_sync_restrict( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll new file mode 100644 index 0000000..896c624 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %} + +define void @test_nvvm_fence_proxy_async() { +; CHECK-LABEL: test_nvvm_fence_proxy_async( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async() + ret void +} + +define void @test_nvvm_fence_proxy_async_global() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_global( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async.global; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async.global() + ret void +} + +define void @test_nvvm_fence_proxy_async_shared_cluster() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_shared_cluster( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async.shared::cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async.shared_cluster() + ret void +} + +define void @test_nvvm_fence_proxy_async_shared_cta() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_shared_cta( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async.shared::cta; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async.shared_cta() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll new file mode 100644 index 0000000..ab35e4f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll @@ -0,0 +1,8 @@ +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx83 -o /dev/null 2>&1 | FileCheck %s + +define void @test_fence_proxy_tensormap_generic_acquire(ptr addrspace(0) %addr) { + ; CHECK: immarg value 130 out of range [128, 129) + call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr addrspace(0) %addr, i32 130); + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy.ll b/llvm/test/CodeGen/NVPTX/fence-proxy.ll new file mode 100644 index 0000000..cb5679e --- /dev/null +++ 
b/llvm/test/CodeGen/NVPTX/fence-proxy.ll @@ -0,0 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx75 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_70 && ptxas-isa-7.5 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx75 | %ptxas-verify -arch=sm_70 %} + +define void @test_nvvm_fence_proxy_alias() { +; CHECK-LABEL: test_nvvm_fence_proxy_alias( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.alias; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.alias() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll index 264f380..c91a3df 100644 --- a/llvm/test/CodeGen/NVPTX/i1-select.ll +++ b/llvm/test/CodeGen/NVPTX/i1-select.ll @@ -8,21 +8,24 @@ define i32 @test_select_i1_trunc(i32 %a, i32 %b, i32 %c, i32 %true, i32 %false) ; CHECK-LABEL: test_select_i1_trunc( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b32 %r<10>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_trunc_param_0]; ; CHECK-NEXT: and.b32 %r2, %r1, 1; ; CHECK-NEXT: setp.ne.b32 %p1, %r2, 0; -; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_trunc_param_1]; -; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_trunc_param_2]; -; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_trunc_param_3]; -; CHECK-NEXT: selp.b32 %r6, %r3, %r4, %p1; -; CHECK-NEXT: and.b32 %r7, %r6, 1; -; CHECK-NEXT: setp.ne.b32 %p2, %r7, 0; -; CHECK-NEXT: ld.param.b32 %r8, [test_select_i1_trunc_param_4]; -; CHECK-NEXT: selp.b32 %r9, %r5, %r8, %p2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r9; +; CHECK-NEXT: mov.b64 %rd1, test_select_i1_trunc_param_2; +; CHECK-NEXT: mov.b64 %rd2, test_select_i1_trunc_param_1; +; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [%rd3]; +; CHECK-NEXT: and.b32 %r4, %r3, 1; +; CHECK-NEXT: setp.ne.b32 %p2, %r4, 0; +; CHECK-NEXT: mov.b64 %rd4, test_select_i1_trunc_param_4; +; CHECK-NEXT: mov.b64 %rd5, test_select_i1_trunc_param_3; +; CHECK-NEXT: selp.b64 %rd6, %rd5, %rd4, %p2; +; CHECK-NEXT: ld.param.b32 %r5, [%rd6]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; %a_trunc = trunc i32 %a to i1 %b_trunc = trunc i32 %b to i1 @@ -36,23 +39,25 @@ define i32 @test_select_i1_trunc_2(i64 %a, i16 %b, i32 %c, i32 %true, i32 %false ; CHECK-LABEL: test_select_i1_trunc_2( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_select_i1_trunc_2_param_0]; ; CHECK-NEXT: and.b64 %rd2, %rd1, 1; ; CHECK-NEXT: setp.ne.b64 %p1, %rd2, 0; -; CHECK-NEXT: ld.param.b16 %rs1, [test_select_i1_trunc_2_param_1]; -; CHECK-NEXT: ld.param.b16 %rs2, [test_select_i1_trunc_2_param_2]; -; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_trunc_2_param_3]; -; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; -; CHECK-NEXT: and.b16 %rs4, %rs3, 1; -; CHECK-NEXT: setp.ne.b16 %p2, %rs4, 0; -; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_trunc_2_param_4]; -; CHECK-NEXT: selp.b32 %r3, %r1, %r2, %p2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: mov.b64 %rd3, test_select_i1_trunc_2_param_2; +; CHECK-NEXT: mov.b64 %rd4, 
test_select_i1_trunc_2_param_1; +; CHECK-NEXT: selp.b64 %rd5, %rd4, %rd3, %p1; +; CHECK-NEXT: ld.param.b16 %rs1, [%rd5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p2, %rs2, 0; +; CHECK-NEXT: mov.b64 %rd6, test_select_i1_trunc_2_param_4; +; CHECK-NEXT: mov.b64 %rd7, test_select_i1_trunc_2_param_3; +; CHECK-NEXT: selp.b64 %rd8, %rd7, %rd6, %p2; +; CHECK-NEXT: ld.param.b32 %r1, [%rd8]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %a_trunc = trunc i64 %a to i1 %b_trunc = trunc i16 %b to i1 @@ -66,7 +71,8 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals ; CHECK-LABEL: test_select_i1_basic( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<4>; -; CHECK-NEXT: .reg .b32 %r<10>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_param_0]; @@ -75,13 +81,14 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals ; CHECK-NEXT: setp.ne.b32 %p1, %r1, 0; ; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_param_2]; ; CHECK-NEXT: setp.eq.b32 %p2, %r4, 0; -; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_param_3]; ; CHECK-NEXT: setp.eq.b32 %p3, %r3, 0; -; CHECK-NEXT: ld.param.b32 %r6, [test_select_i1_basic_param_4]; -; CHECK-NEXT: selp.b32 %r7, %r5, %r6, %p2; -; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1; -; CHECK-NEXT: selp.b32 %r9, %r5, %r8, %p3; -; CHECK-NEXT: st.param.b32 [func_retval0], %r9; +; CHECK-NEXT: mov.b64 %rd1, test_select_i1_basic_param_4; +; CHECK-NEXT: mov.b64 %rd2, test_select_i1_basic_param_3; +; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p2; +; CHECK-NEXT: selp.b64 %rd4, %rd3, %rd1, %p1; +; CHECK-NEXT: selp.b64 %rd5, %rd2, %rd4, %p3; +; CHECK-NEXT: ld.param.b32 %r5, [%rd5]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; %b1 = icmp eq i32 %v1, 0 %b2 = icmp eq i32 %v2, 0 @@ -95,7 +102,8 @@ define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i ; CHECK-LABEL: test_select_i1_basic_folding( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<11>; -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_folding_param_0]; @@ -105,16 +113,17 @@ define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i ; CHECK-NEXT: setp.eq.b32 %p3, %r2, 0; ; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_2]; ; CHECK-NEXT: setp.eq.b32 %p4, %r3, 0; -; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_folding_param_3]; ; CHECK-NEXT: xor.pred %p5, %p1, %p3; -; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_4]; ; CHECK-NEXT: and.pred %p6, %p5, %p4; ; CHECK-NEXT: and.pred %p7, %p2, %p4; ; CHECK-NEXT: and.pred %p8, %p3, %p6; ; CHECK-NEXT: or.pred %p9, %p8, %p7; ; CHECK-NEXT: xor.pred %p10, %p9, %p3; -; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p10; -; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: mov.b64 %rd1, test_select_i1_basic_folding_param_4; +; CHECK-NEXT: mov.b64 %rd2, test_select_i1_basic_folding_param_3; +; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p10; +; CHECK-NEXT: ld.param.b32 %r4, [%rd3]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %b1 = icmp eq i32 %v1, 0 %b2 = icmp eq i32 %v2, 0 diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 40d6a07..bfac2b4 100644 --- 
a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -1442,16 +1442,18 @@ define <4 x i8> @test_select(<4 x i8> %a, <4 x i8> %b, i1 zeroext %c) #0 { ; O3: { ; O3-NEXT: .reg .pred %p<2>; ; O3-NEXT: .reg .b16 %rs<3>; -; O3-NEXT: .reg .b32 %r<4>; +; O3-NEXT: .reg .b32 %r<2>; +; O3-NEXT: .reg .b64 %rd<4>; ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.b8 %rs1, [test_select_param_2]; ; O3-NEXT: and.b16 %rs2, %rs1, 1; ; O3-NEXT: setp.ne.b16 %p1, %rs2, 0; -; O3-NEXT: ld.param.b32 %r1, [test_select_param_0]; -; O3-NEXT: ld.param.b32 %r2, [test_select_param_1]; -; O3-NEXT: selp.b32 %r3, %r1, %r2, %p1; -; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: mov.b64 %rd1, test_select_param_1; +; O3-NEXT: mov.b64 %rd2, test_select_param_0; +; O3-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1; +; O3-NEXT: ld.param.b32 %r1, [%rd3]; +; O3-NEXT: st.param.b32 [func_retval0], %r1; ; O3-NEXT: ret; %r = select i1 %c, <4 x i8> %a, <4 x i8> %b ret <4 x i8> %r diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll index 3fac29f..d219493 100644 --- a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll +++ b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll @@ -346,19 +346,15 @@ define i32 @ld_global_v8i32(ptr addrspace(1) %ptr) { ; SM100-LABEL: ld_global_v8i32( ; SM100: { ; SM100-NEXT: .reg .b32 %r<16>; -; SM100-NEXT: .reg .b64 %rd<6>; +; SM100-NEXT: .reg .b64 %rd<2>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v8i32_param_0]; -; SM100-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; -; SM100-NEXT: mov.b64 {%r1, %r2}, %rd5; -; SM100-NEXT: mov.b64 {%r3, %r4}, %rd4; -; SM100-NEXT: mov.b64 {%r5, %r6}, %rd3; -; SM100-NEXT: mov.b64 {%r7, %r8}, %rd2; -; SM100-NEXT: add.s32 %r9, %r7, %r8; -; SM100-NEXT: add.s32 %r10, %r5, %r6; -; SM100-NEXT: add.s32 %r11, %r3, %r4; -; SM100-NEXT: add.s32 %r12, %r1, %r2; +; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: add.s32 %r9, %r1, %r2; +; SM100-NEXT: add.s32 %r10, %r3, %r4; +; SM100-NEXT: add.s32 %r11, %r5, %r6; +; SM100-NEXT: add.s32 %r12, %r7, %r8; ; SM100-NEXT: add.s32 %r13, %r9, %r10; ; SM100-NEXT: add.s32 %r14, %r11, %r12; ; SM100-NEXT: add.s32 %r15, %r13, %r14; diff --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll index 297b2b9..ad78e0f 100644 --- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll +++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll @@ -20,19 +20,19 @@ entry: ; IR-LABEL: @memcpy_caller ; IR: entry: ; IR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0 -; IR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion +; IR: br i1 [[Cond]], label %dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion -; IR: loop-memcpy-expansion: -; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ] +; IR: dynamic-memcpy-expansion-main-body: +; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %dynamic-memcpy-expansion-main-body ] ; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, ptr %src, i64 %loop-index ; IR: [[Load:%[0-9]+]] = load i8, ptr [[SrcGep]] ; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 %loop-index ; IR: store i8 [[Load]], ptr [[DstGep]] ; IR: [[IndexInc]] = add i64 %loop-index, 1 ; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n -; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion +; IR: br 
i1 [[Cond2]], label %dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion -; IR-LABEL: post-loop-memcpy-expansion: +; IR-LABEL: dynamic-memcpy-post-expansion: ; IR: ret ptr %dst ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_caller @@ -53,19 +53,19 @@ entry: ; IR-LABEL: @memcpy_volatile_caller ; IR: entry: ; IR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0 -; IR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion +; IR: br i1 [[Cond]], label %dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion -; IR: loop-memcpy-expansion: -; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ] +; IR: dynamic-memcpy-expansion-main-body: +; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %dynamic-memcpy-expansion-main-body ] ; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, ptr %src, i64 %loop-index ; IR: [[Load:%[0-9]+]] = load volatile i8, ptr [[SrcGep]] ; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 %loop-index ; IR: store volatile i8 [[Load]], ptr [[DstGep]] ; IR: [[IndexInc]] = add i64 %loop-index, 1 ; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n -; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion +; IR: br i1 [[Cond2]], label %dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion -; IR-LABEL: post-loop-memcpy-expansion: +; IR-LABEL: dynamic-memcpy-post-expansion: ; IR: ret ptr %dst @@ -97,16 +97,16 @@ entry: ; Check that calls with compile-time constant size are handled correctly ; IR-LABEL: @memcpy_known_size ; IR: entry: -; IR: br label %load-store-loop -; IR: load-store-loop: -; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ] +; IR: br label %static-memcpy-expansion-main-body +; IR: static-memcpy-expansion-main-body: +; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %static-memcpy-expansion-main-body ] ; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, ptr %src, i64 %loop-index ; IR: [[Load:%[0-9]+]] = load i8, ptr [[SrcGep]] ; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 %loop-index ; IR: store i8 [[Load]], ptr [[DstGep]] ; IR: [[IndexInc]] = add i64 %loop-index, 1 ; IR: [[Cond:%[0-9]+]] = icmp ult i64 %3, 144 -; IR: br i1 [[Cond]], label %load-store-loop, label %memcpy-split +; IR: br i1 [[Cond]], label %static-memcpy-expansion-main-body, label %static-memcpy-post-expansion } define ptr @memset_caller(ptr %dst, i32 %c, i64 %n) #0 { diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index 21257e2..ca2914a 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -584,44 +584,25 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3 ; COPY-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4 ; COPY-NEXT: ret void ; -; PTX_60-LABEL: test_select( -; PTX_60: { -; PTX_60-NEXT: .reg .pred %p<2>; -; PTX_60-NEXT: .reg .b16 %rs<3>; -; PTX_60-NEXT: .reg .b32 %r<4>; -; PTX_60-NEXT: .reg .b64 %rd<3>; -; PTX_60-EMPTY: -; PTX_60-NEXT: // %bb.0: // %bb -; PTX_60-NEXT: ld.param.b8 %rs1, [test_select_param_3]; -; PTX_60-NEXT: and.b16 %rs2, %rs1, 1; -; PTX_60-NEXT: setp.ne.b16 %p1, %rs2, 0; -; PTX_60-NEXT: ld.param.b64 %rd1, [test_select_param_2]; -; PTX_60-NEXT: cvta.to.global.u64 %rd2, %rd1; -; PTX_60-NEXT: ld.param.b32 %r1, [test_select_param_1]; -; PTX_60-NEXT: ld.param.b32 %r2, 
[test_select_param_0]; -; PTX_60-NEXT: selp.b32 %r3, %r2, %r1, %p1; -; PTX_60-NEXT: st.global.b32 [%rd2], %r3; -; PTX_60-NEXT: ret; -; -; PTX_70-LABEL: test_select( -; PTX_70: { -; PTX_70-NEXT: .reg .pred %p<2>; -; PTX_70-NEXT: .reg .b16 %rs<3>; -; PTX_70-NEXT: .reg .b32 %r<2>; -; PTX_70-NEXT: .reg .b64 %rd<6>; -; PTX_70-EMPTY: -; PTX_70-NEXT: // %bb.0: // %bb -; PTX_70-NEXT: ld.param.b8 %rs1, [test_select_param_3]; -; PTX_70-NEXT: and.b16 %rs2, %rs1, 1; -; PTX_70-NEXT: setp.ne.b16 %p1, %rs2, 0; -; PTX_70-NEXT: mov.b64 %rd1, test_select_param_0; -; PTX_70-NEXT: ld.param.b64 %rd2, [test_select_param_2]; -; PTX_70-NEXT: cvta.to.global.u64 %rd3, %rd2; -; PTX_70-NEXT: mov.b64 %rd4, test_select_param_1; -; PTX_70-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1; -; PTX_70-NEXT: ld.param.b32 %r1, [%rd5]; -; PTX_70-NEXT: st.global.b32 [%rd3], %r1; -; PTX_70-NEXT: ret; +; PTX-LABEL: test_select( +; PTX: { +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b16 %rs<3>; +; PTX-NEXT: .reg .b32 %r<2>; +; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %bb +; PTX-NEXT: ld.param.b8 %rs1, [test_select_param_3]; +; PTX-NEXT: and.b16 %rs2, %rs1, 1; +; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0; +; PTX-NEXT: mov.b64 %rd1, test_select_param_0; +; PTX-NEXT: ld.param.b64 %rd2, [test_select_param_2]; +; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; +; PTX-NEXT: mov.b64 %rd4, test_select_param_1; +; PTX-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1; +; PTX-NEXT: ld.param.b32 %r1, [%rd5]; +; PTX-NEXT: st.global.b32 [%rd3], %r1; +; PTX-NEXT: ret; bb: %ptrnew = select i1 %cond, ptr %input1, ptr %input2 %valloaded = load i32, ptr %ptrnew, align 4 diff --git a/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir index 0b2d856..4be91df 100644 --- a/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir +++ b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir @@ -26,10 +26,10 @@ body: | ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.2(0x30000000), %bb.3(0x50000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[LD_i32_:%[0-9]+]]:b32 = LD_i32 0, 0, 101, 3, 32, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101) - ; CHECK-NEXT: [[LD_i64_:%[0-9]+]]:b64 = LD_i64 0, 0, 101, 3, 64, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101) + ; CHECK-NEXT: [[LD_i32_:%[0-9]+]]:b32 = LD_i32 0, 0, 101, 3, 32, -1, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101) + ; CHECK-NEXT: [[LD_i64_:%[0-9]+]]:b64 = LD_i64 0, 0, 101, 3, 64, -1, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101) ; CHECK-NEXT: [[ADD64ri:%[0-9]+]]:b64 = nuw ADD64ri killed [[LD_i64_]], 2 - ; CHECK-NEXT: [[LD_i32_1:%[0-9]+]]:b32 = LD_i32 0, 0, 1, 3, 32, [[ADD64ri]], 0 + ; CHECK-NEXT: [[LD_i32_1:%[0-9]+]]:b32 = LD_i32 0, 0, 1, 3, 32, -1, [[ADD64ri]], 0 ; CHECK-NEXT: [[SETP_i32ri:%[0-9]+]]:b1 = SETP_i32ri [[LD_i32_]], 0, 0 ; CHECK-NEXT: CBranch killed [[SETP_i32ri]], %bb.2 ; CHECK-NEXT: {{ $}} @@ -54,10 +54,10 @@ body: | bb.0.entry: successors: %bb.2(0x30000000), %bb.1(0x50000000) - %5:b32 = LD_i32 0, 0, 101, 3, 32, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101) - %6:b64 = LD_i64 0, 0, 101, 3, 64, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101) + %5:b32 = LD_i32 0, 0, 101, 3, 32, -1, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101) + %6:b64 = LD_i64 0, 0, 101, 3, 64, -1, &test_hoist_param_0, 0 :: (dereferenceable invariant load 
(s64), addrspace 101) %0:b64 = nuw ADD64ri killed %6, 2 - %1:b32 = LD_i32 0, 0, 1, 3, 32, %0, 0 + %1:b32 = LD_i32 0, 0, 1, 3, 32, -1, %0, 0 %7:b1 = SETP_i32ri %5, 0, 0 CBranch killed %7, %bb.2 GOTO %bb.1 diff --git a/llvm/test/CodeGen/NVPTX/masked-load-3xhalf.ll b/llvm/test/CodeGen/NVPTX/masked-load-3xhalf.ll new file mode 100644 index 0000000..bba240c --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/masked-load-3xhalf.ll @@ -0,0 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} + +; This is testing the lowering behavior of this case from LoadStoreVectorizer/NVPTX/4x2xhalf.ll +; where two 3xhalfs are chained together and extended to 8xhalf. +define void @halfx3_extend_chain(ptr align 16 captures(none) %rd0) { +; CHECK-LABEL: halfx3_extend_chain( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<7>; +; CHECK-NEXT: .reg .b32 %r<12>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [halfx3_extend_chain_param_0]; +; CHECK-NEXT: .pragma "used_bytes_mask 0xfff"; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NEXT: mov.b32 {_, %rs3}, %r2; +; CHECK-NEXT: mov.b32 %r5, {%rs3, %rs1}; +; CHECK-NEXT: mov.b32 %r6, {%rs2, %rs4}; +; CHECK-NEXT: mov.b32 %r7, 0; +; CHECK-NEXT: max.f16x2 %r8, %r2, %r7; +; CHECK-NEXT: max.f16x2 %r9, %r1, %r7; +; CHECK-NEXT: st.b32 [%rd1], %r9; +; CHECK-NEXT: mov.b32 {%rs5, _}, %r8; +; CHECK-NEXT: st.b16 [%rd1+4], %rs5; +; CHECK-NEXT: max.f16x2 %r10, %r6, %r7; +; CHECK-NEXT: max.f16x2 %r11, %r5, %r7; +; CHECK-NEXT: st.b32 [%rd1+6], %r11; +; CHECK-NEXT: mov.b32 {%rs6, _}, %r10; +; CHECK-NEXT: st.b16 [%rd1+10], %rs6; +; CHECK-NEXT: ret; + %load1 = load <3 x half>, ptr %rd0, align 16 + %p1 = fcmp ogt <3 x half> %load1, zeroinitializer + %s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer + store <3 x half> %s1, ptr %rd0, align 16 + %in2 = getelementptr half, ptr %rd0, i64 3 + %load2 = load <3 x half>, ptr %in2, align 4 + %p2 = fcmp ogt <3 x half> %load2, zeroinitializer + %s2 = select <3 x i1> %p2, <3 x half> %load2, <3 x half> zeroinitializer + store <3 x half> %s2, ptr %in2, align 4 + ret void +} + +; This disables the vectorization by reducing the alignment. 
+define void @halfx3_no_align(ptr align 4 captures(none) %rd0) { +; CHECK-LABEL: halfx3_no_align( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<7>; +; CHECK-NEXT: .reg .b32 %r<10>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [halfx3_no_align_param_0]; +; CHECK-NEXT: ld.b16 %rs1, [%rd1+4]; +; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; CHECK-NEXT: ld.b32 %r2, [%rd1]; +; CHECK-NEXT: mov.b32 %r3, 0; +; CHECK-NEXT: max.f16x2 %r4, %r1, %r3; +; CHECK-NEXT: max.f16x2 %r5, %r2, %r3; +; CHECK-NEXT: st.b32 [%rd1], %r5; +; CHECK-NEXT: mov.b32 {%rs3, _}, %r4; +; CHECK-NEXT: st.b16 [%rd1+4], %rs3; +; CHECK-NEXT: ld.b16 %rs4, [%rd1+10]; +; CHECK-NEXT: mov.b32 %r6, {%rs4, %rs5}; +; CHECK-NEXT: ld.b32 %r7, [%rd1+6]; +; CHECK-NEXT: max.f16x2 %r8, %r6, %r3; +; CHECK-NEXT: max.f16x2 %r9, %r7, %r3; +; CHECK-NEXT: st.b32 [%rd1+6], %r9; +; CHECK-NEXT: mov.b32 {%rs6, _}, %r8; +; CHECK-NEXT: st.b16 [%rd1+10], %rs6; +; CHECK-NEXT: ret; + %load1 = load <3 x half>, ptr %rd0, align 4 + %p1 = fcmp ogt <3 x half> %load1, zeroinitializer + %s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer + store <3 x half> %s1, ptr %rd0, align 4 + %in2 = getelementptr half, ptr %rd0, i64 3 + %load2 = load <3 x half>, ptr %in2, align 4 + %p2 = fcmp ogt <3 x half> %load2, zeroinitializer + %s2 = select <3 x i1> %p2, <3 x half> %load2, <3 x half> zeroinitializer + store <3 x half> %s2, ptr %in2, align 4 + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/masked-load-vectors.ll b/llvm/test/CodeGen/NVPTX/masked-load-vectors.ll new file mode 100644 index 0000000..3f72ffe --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/masked-load-vectors.ll @@ -0,0 +1,366 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK,SM90 +; RUN: %if ptxas-sm_90 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx88 | %ptxas-verify -arch=sm_90 %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK,SM100 +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} + + +; Different architectures are tested in this file for the following reasons: +; - SM90 does not have 256-bit load/store instructions +; - SM90 does not have masked store instructions +; - SM90 does not support packed f32x2 instructions + +define void @global_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0]; +; SM90-NEXT: .pragma "used_bytes_mask 0xf000"; +; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: .pragma "used_bytes_mask 0xf0f"; +; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1]; +; SM90-NEXT: ld.param.b64 %rd2, [global_8xi32_param_1]; +; SM90-NEXT: st.global.b32 [%rd2], %r5; +; SM90-NEXT: st.global.b32 [%rd2+8], %r7; +; SM90-NEXT: st.global.b32 [%rd2+28], %r4; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_8xi32( +; SM100: { +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0]; +; SM100-NEXT: .pragma "used_bytes_mask 0xf0000f0f"; +; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; 
SM100-NEXT: ld.param.b64 %rd2, [global_8xi32_param_1]; +; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, _, %r3, _, _, _, _, %r8}; +; SM100-NEXT: ret; + %a.load = tail call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 32 %a, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison) + tail call void @llvm.masked.store.v8i32.p1(<8 x i32> %a.load, ptr addrspace(1) align 32 %b, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>) + ret void +} + +; Masked stores are only supported for 32-bit element types, +; while masked loads are supported for all element types. +define void @global_16xi16(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_16xi16( +; SM90: { +; SM90-NEXT: .reg .b16 %rs<7>; +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [global_16xi16_param_0]; +; SM90-NEXT: .pragma "used_bytes_mask 0xf000"; +; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; SM90-NEXT: .pragma "used_bytes_mask 0xf0f"; +; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1]; +; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r7; +; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r5; +; SM90-NEXT: ld.param.b64 %rd2, [global_16xi16_param_1]; +; SM90-NEXT: st.global.b16 [%rd2], %rs5; +; SM90-NEXT: st.global.b16 [%rd2+2], %rs6; +; SM90-NEXT: st.global.b16 [%rd2+8], %rs3; +; SM90-NEXT: st.global.b16 [%rd2+10], %rs4; +; SM90-NEXT: st.global.b16 [%rd2+28], %rs1; +; SM90-NEXT: st.global.b16 [%rd2+30], %rs2; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_16xi16( +; SM100: { +; SM100-NEXT: .reg .b16 %rs<7>; +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [global_16xi16_param_0]; +; SM100-NEXT: .pragma "used_bytes_mask 0xf0000f0f"; +; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: mov.b32 {%rs1, %rs2}, %r8; +; SM100-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; SM100-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM100-NEXT: ld.param.b64 %rd2, [global_16xi16_param_1]; +; SM100-NEXT: st.global.b16 [%rd2], %rs5; +; SM100-NEXT: st.global.b16 [%rd2+2], %rs6; +; SM100-NEXT: st.global.b16 [%rd2+8], %rs3; +; SM100-NEXT: st.global.b16 [%rd2+10], %rs4; +; SM100-NEXT: st.global.b16 [%rd2+28], %rs1; +; SM100-NEXT: st.global.b16 [%rd2+30], %rs2; +; SM100-NEXT: ret; + %a.load = tail call <16 x i16> @llvm.masked.load.v16i16.p1(ptr addrspace(1) align 32 %a, <16 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true>, <16 x i16> poison) + tail call void @llvm.masked.store.v16i16.p1(<16 x i16> %a.load, ptr addrspace(1) align 32 %b, <16 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true>) + ret void +} + +define void @global_8xi32_no_align(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; CHECK-LABEL: global_8xi32_no_align( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [global_8xi32_no_align_param_0]; +; CHECK-NEXT: ld.global.b32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd2, [global_8xi32_no_align_param_1]; +; CHECK-NEXT: ld.global.b32 %r2, [%rd1+8]; +; CHECK-NEXT: ld.global.b32 %r3, 
[%rd1+28]; +; CHECK-NEXT: st.global.b32 [%rd2], %r1; +; CHECK-NEXT: st.global.b32 [%rd2+8], %r2; +; CHECK-NEXT: st.global.b32 [%rd2+28], %r3; +; CHECK-NEXT: ret; + %a.load = tail call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 16 %a, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison) + tail call void @llvm.masked.store.v8i32.p1(<8 x i32> %a.load, ptr addrspace(1) align 16 %b, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>) + ret void +} + + +define void @global_8xi32_invariant(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_8xi32_invariant( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [global_8xi32_invariant_param_0]; +; SM90-NEXT: .pragma "used_bytes_mask 0xf000"; +; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: .pragma "used_bytes_mask 0xf0f"; +; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1]; +; SM90-NEXT: ld.param.b64 %rd2, [global_8xi32_invariant_param_1]; +; SM90-NEXT: st.global.b32 [%rd2], %r5; +; SM90-NEXT: st.global.b32 [%rd2+8], %r7; +; SM90-NEXT: st.global.b32 [%rd2+28], %r4; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_8xi32_invariant( +; SM100: { +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [global_8xi32_invariant_param_0]; +; SM100-NEXT: .pragma "used_bytes_mask 0xf0000f0f"; +; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_8xi32_invariant_param_1]; +; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, _, %r3, _, _, _, _, %r8}; +; SM100-NEXT: ret; + %a.load = tail call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 32 %a, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison), !invariant.load !0 + tail call void @llvm.masked.store.v8i32.p1(<8 x i32> %a.load, ptr addrspace(1) align 32 %b, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>) + ret void +} + +define void @global_2xi16(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; CHECK-LABEL: global_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [global_2xi16_param_0]; +; CHECK-NEXT: .pragma "used_bytes_mask 0x3"; +; CHECK-NEXT: ld.global.b32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd2, [global_2xi16_param_1]; +; CHECK-NEXT: mov.b32 {%rs1, _}, %r1; +; CHECK-NEXT: st.global.b16 [%rd2], %rs1; +; CHECK-NEXT: ret; + %a.load = tail call <2 x i16> @llvm.masked.load.v2i16.p1(ptr addrspace(1) align 4 %a, <2 x i1> <i1 true, i1 false>, <2 x i16> poison) + tail call void @llvm.masked.store.v2i16.p1(<2 x i16> %a.load, ptr addrspace(1) align 4 %b, <2 x i1> <i1 true, i1 false>) + ret void +} + +define void @global_2xi16_invariant(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; CHECK-LABEL: global_2xi16_invariant( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [global_2xi16_invariant_param_0]; +; CHECK-NEXT: .pragma "used_bytes_mask 0x3"; +; CHECK-NEXT: ld.global.nc.b32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd2, 
[global_2xi16_invariant_param_1]; +; CHECK-NEXT: mov.b32 {%rs1, _}, %r1; +; CHECK-NEXT: st.global.b16 [%rd2], %rs1; +; CHECK-NEXT: ret; + %a.load = tail call <2 x i16> @llvm.masked.load.v2i16.p1(ptr addrspace(1) align 4 %a, <2 x i1> <i1 true, i1 false>, <2 x i16> poison), !invariant.load !0 + tail call void @llvm.masked.store.v2i16.p1(<2 x i16> %a.load, ptr addrspace(1) align 4 %b, <2 x i1> <i1 true, i1 false>) + ret void +} + +define void @global_2xi16_no_align(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; CHECK-LABEL: global_2xi16_no_align( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [global_2xi16_no_align_param_0]; +; CHECK-NEXT: ld.global.b16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd2, [global_2xi16_no_align_param_1]; +; CHECK-NEXT: st.global.b16 [%rd2], %rs1; +; CHECK-NEXT: ret; + %a.load = tail call <2 x i16> @llvm.masked.load.v2i16.p1(ptr addrspace(1) align 2 %a, <2 x i1> <i1 true, i1 false>, <2 x i16> poison) + tail call void @llvm.masked.store.v2i16.p1(<2 x i16> %a.load, ptr addrspace(1) align 4 %b, <2 x i1> <i1 true, i1 false>) + ret void +} + +define void @global_4xi8(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; CHECK-LABEL: global_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [global_4xi8_param_0]; +; CHECK-NEXT: .pragma "used_bytes_mask 0x5"; +; CHECK-NEXT: ld.global.b32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd2, [global_4xi8_param_1]; +; CHECK-NEXT: st.global.b8 [%rd2], %r1; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7772U; +; CHECK-NEXT: st.global.b8 [%rd2+2], %r2; +; CHECK-NEXT: ret; + %a.load = tail call <4 x i8> @llvm.masked.load.v4i8.p1(ptr addrspace(1) align 4 %a, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i8> poison) + tail call void @llvm.masked.store.v4i8.p1(<4 x i8> %a.load, ptr addrspace(1) align 4 %b, <4 x i1> <i1 true, i1 false, i1 true, i1 false>) + ret void +} + +define void @global_4xi8_invariant(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; CHECK-LABEL: global_4xi8_invariant( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [global_4xi8_invariant_param_0]; +; CHECK-NEXT: .pragma "used_bytes_mask 0x5"; +; CHECK-NEXT: ld.global.nc.b32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd2, [global_4xi8_invariant_param_1]; +; CHECK-NEXT: st.global.b8 [%rd2], %r1; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7772U; +; CHECK-NEXT: st.global.b8 [%rd2+2], %r2; +; CHECK-NEXT: ret; + %a.load = tail call <4 x i8> @llvm.masked.load.v4i8.p1(ptr addrspace(1) align 4 %a, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i8> poison), !invariant.load !0 + tail call void @llvm.masked.store.v4i8.p1(<4 x i8> %a.load, ptr addrspace(1) align 4 %b, <4 x i1> <i1 true, i1 false, i1 true, i1 false>) + ret void +} + +define void @global_4xi8_no_align(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; CHECK-LABEL: global_4xi8_no_align( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [global_4xi8_no_align_param_0]; +; CHECK-NEXT: ld.global.b8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd2, [global_4xi8_no_align_param_1]; +; CHECK-NEXT: ld.global.b8 %rs2, [%rd1+2]; +; CHECK-NEXT: st.global.b8 [%rd2], %rs1; +; CHECK-NEXT: st.global.b8 [%rd2+2], %rs2; +; 
CHECK-NEXT: ret; + %a.load = tail call <4 x i8> @llvm.masked.load.v4i8.p1(ptr addrspace(1) align 2 %a, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i8> poison) + tail call void @llvm.masked.store.v4i8.p1(<4 x i8> %a.load, ptr addrspace(1) align 4 %b, <4 x i1> <i1 true, i1 false, i1 true, i1 false>) + ret void +} + +; In sm100+, we pack 2xf32 loads into a single b64 load while lowering +define void @global_2xf32(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_2xf32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<3>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [global_2xf32_param_0]; +; SM90-NEXT: .pragma "used_bytes_mask 0xf"; +; SM90-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd1]; +; SM90-NEXT: ld.param.b64 %rd2, [global_2xf32_param_1]; +; SM90-NEXT: st.global.b32 [%rd2], %r1; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_2xf32( +; SM100: { +; SM100-NEXT: .reg .b32 %r<2>; +; SM100-NEXT: .reg .b64 %rd<4>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [global_2xf32_param_0]; +; SM100-NEXT: .pragma "used_bytes_mask 0xf"; +; SM100-NEXT: ld.global.b64 %rd2, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd3, [global_2xf32_param_1]; +; SM100-NEXT: mov.b64 {%r1, _}, %rd2; +; SM100-NEXT: st.global.b32 [%rd3], %r1; +; SM100-NEXT: ret; + %a.load = tail call <2 x float> @llvm.masked.load.v2f32.p1(ptr addrspace(1) align 8 %a, <2 x i1> <i1 true, i1 false>, <2 x float> poison) + tail call void @llvm.masked.store.v2f32.p1(<2 x float> %a.load, ptr addrspace(1) align 8 %b, <2 x i1> <i1 true, i1 false>) + ret void +} + +define void @global_2xf32_invariant(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_2xf32_invariant( +; SM90: { +; SM90-NEXT: .reg .b32 %r<3>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [global_2xf32_invariant_param_0]; +; SM90-NEXT: .pragma "used_bytes_mask 0xf"; +; SM90-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd1]; +; SM90-NEXT: ld.param.b64 %rd2, [global_2xf32_invariant_param_1]; +; SM90-NEXT: st.global.b32 [%rd2], %r1; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_2xf32_invariant( +; SM100: { +; SM100-NEXT: .reg .b32 %r<2>; +; SM100-NEXT: .reg .b64 %rd<4>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [global_2xf32_invariant_param_0]; +; SM100-NEXT: .pragma "used_bytes_mask 0xf"; +; SM100-NEXT: ld.global.nc.b64 %rd2, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd3, [global_2xf32_invariant_param_1]; +; SM100-NEXT: mov.b64 {%r1, _}, %rd2; +; SM100-NEXT: st.global.b32 [%rd3], %r1; +; SM100-NEXT: ret; + %a.load = tail call <2 x float> @llvm.masked.load.v2f32.p1(ptr addrspace(1) align 8 %a, <2 x i1> <i1 true, i1 false>, <2 x float> poison), !invariant.load !0 + tail call void @llvm.masked.store.v2f32.p1(<2 x float> %a.load, ptr addrspace(1) align 8 %b, <2 x i1> <i1 true, i1 false>) + ret void +} + +define void @global_2xf32_no_align(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; CHECK-LABEL: global_2xf32_no_align( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [global_2xf32_no_align_param_0]; +; CHECK-NEXT: ld.global.b32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd2, [global_2xf32_no_align_param_1]; +; CHECK-NEXT: st.global.b32 [%rd2], %r1; +; CHECK-NEXT: ret; + %a.load = tail call <2 x float> @llvm.masked.load.v2f32.p1(ptr addrspace(1) align 4 %a, <2 x i1> <i1 true, i1 false>, <2 x float> poison) + 
tail call void @llvm.masked.store.v2f32.p1(<2 x float> %a.load, ptr addrspace(1) align 8 %b, <2 x i1> <i1 true, i1 false>) + ret void +} + +declare <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1), <8 x i1>, <8 x i32>) +declare void @llvm.masked.store.v8i32.p1(<8 x i32>, ptr addrspace(1), <8 x i1>) +declare <16 x i16> @llvm.masked.load.v16i16.p1(ptr addrspace(1), <16 x i1>, <16 x i16>) +declare void @llvm.masked.store.v16i16.p1(<16 x i16>, ptr addrspace(1), <16 x i1>) +declare <2 x i16> @llvm.masked.load.v2i16.p1(ptr addrspace(1), <2 x i1>, <2 x i16>) +declare void @llvm.masked.store.v2i16.p1(<2 x i16>, ptr addrspace(1), <2 x i1>) +declare <4 x i8> @llvm.masked.load.v4i8.p1(ptr addrspace(1), <4 x i1>, <4 x i8>) +declare void @llvm.masked.store.v4i8.p1(<4 x i8>, ptr addrspace(1), <4 x i1>) +declare <2 x float> @llvm.masked.load.v2f32.p1(ptr addrspace(1), <2 x i1>, <2 x float>) +declare void @llvm.masked.store.v2f32.p1(<2 x float>, ptr addrspace(1), <2 x i1>) +!0 = !{} diff --git a/llvm/test/CodeGen/NVPTX/masked-store-variable-mask.ll b/llvm/test/CodeGen/NVPTX/masked-store-variable-mask.ll new file mode 100644 index 0000000..bbe2dbb --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/masked-store-variable-mask.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} + +; Confirm that a masked store with a variable mask is scalarized before lowering + +define void @global_variable_mask(ptr addrspace(1) %a, ptr addrspace(1) %b, <4 x i1> %mask) { +; CHECK-LABEL: global_variable_mask( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<9>; +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [global_variable_mask_param_2+3]; +; CHECK-NEXT: ld.param.b8 %rs3, [global_variable_mask_param_2+2]; +; CHECK-NEXT: and.b16 %rs4, %rs3, 1; +; CHECK-NEXT: ld.param.b8 %rs5, [global_variable_mask_param_2+1]; +; CHECK-NEXT: and.b16 %rs6, %rs5, 1; +; CHECK-NEXT: setp.ne.b16 %p2, %rs6, 0; +; CHECK-NEXT: ld.param.b8 %rs7, [global_variable_mask_param_2]; +; CHECK-NEXT: and.b16 %rs8, %rs7, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs8, 0; +; CHECK-NEXT: ld.param.b64 %rd5, [global_variable_mask_param_1]; +; CHECK-NEXT: ld.param.b64 %rd6, [global_variable_mask_param_0]; +; CHECK-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [%rd6]; +; CHECK-NEXT: not.pred %p5, %p1; +; CHECK-NEXT: @%p5 bra $L__BB0_2; +; CHECK-NEXT: // %bb.1: // %cond.store +; CHECK-NEXT: st.global.b64 [%rd5], %rd1; +; CHECK-NEXT: $L__BB0_2: // %else +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p3, %rs4, 0; +; CHECK-NEXT: not.pred %p6, %p2; +; CHECK-NEXT: @%p6 bra $L__BB0_4; +; CHECK-NEXT: // %bb.3: // %cond.store1 +; CHECK-NEXT: st.global.b64 [%rd5+8], %rd2; +; CHECK-NEXT: $L__BB0_4: // %else2 +; CHECK-NEXT: setp.ne.b16 %p4, %rs2, 0; +; CHECK-NEXT: not.pred %p7, %p3; +; CHECK-NEXT: @%p7 bra $L__BB0_6; +; CHECK-NEXT: // %bb.5: // %cond.store3 +; CHECK-NEXT: st.global.b64 [%rd5+16], %rd3; +; CHECK-NEXT: $L__BB0_6: // %else4 +; CHECK-NEXT: not.pred %p8, %p4; +; CHECK-NEXT: @%p8 bra $L__BB0_8; +; CHECK-NEXT: // %bb.7: // %cond.store5 +; CHECK-NEXT: st.global.b64 [%rd5+24], %rd4; +; CHECK-NEXT: $L__BB0_8: // %else6 +; CHECK-NEXT: ret; + %a.load = load <4 x i64>, ptr addrspace(1) %a + 
tail call void @llvm.masked.store.v4i64.p1(<4 x i64> %a.load, ptr addrspace(1) align 32 %b, <4 x i1> %mask) + ret void +} + +declare void @llvm.masked.store.v4i64.p1(<4 x i64>, ptr addrspace(1), <4 x i1>) diff --git a/llvm/test/CodeGen/NVPTX/masked-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/masked-store-vectors-256.ll new file mode 100644 index 0000000..44f3505 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/masked-store-vectors-256.ll @@ -0,0 +1,318 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK,SM90 +; RUN: %if ptxas-sm_90 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx88 | %ptxas-verify -arch=sm_90 %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK,SM100 +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} + +; This test is based on load-store-vectors.ll, +; and contains testing for lowering 256-bit masked vector stores + +; Types we are checking: i32, i64, f32, f64 + +; Address spaces we are checking: generic, global +; - Global is the only address space that currently supports masked stores. +; - The generic stores will get legalized before the backend via scalarization, +; this file tests that even though we won't be generating them in the LSV. + +; 256-bit vector loads/stores are only legal for blackwell+, so on sm_90, the vectors will be split + +; generic address space + +define void @generic_8xi32(ptr %a, ptr %b) { +; CHECK-LABEL: generic_8xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xi32_param_0]; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_8xi32_param_1]; +; CHECK-NEXT: st.b32 [%rd2], %r5; +; CHECK-NEXT: st.b32 [%rd2+8], %r7; +; CHECK-NEXT: st.b32 [%rd2+28], %r4; +; CHECK-NEXT: ret; + %a.load = load <8 x i32>, ptr %a + tail call void @llvm.masked.store.v8i32.p0(<8 x i32> %a.load, ptr align 32 %b, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>) + ret void +} + +define void @generic_4xi64(ptr %a, ptr %b) { +; CHECK-LABEL: generic_4xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [generic_4xi64_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1+16]; +; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd6, [generic_4xi64_param_1]; +; CHECK-NEXT: st.b64 [%rd6], %rd4; +; CHECK-NEXT: st.b64 [%rd6+16], %rd2; +; CHECK-NEXT: ret; + %a.load = load <4 x i64>, ptr %a + tail call void @llvm.masked.store.v4i64.p0(<4 x i64> %a.load, ptr align 32 %b, <4 x i1> <i1 true, i1 false, i1 true, i1 false>) + ret void +} + +define void @generic_8xfloat(ptr %a, ptr %b) { +; CHECK-LABEL: generic_8xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0]; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_8xfloat_param_1]; +; CHECK-NEXT: st.b32 [%rd2], %r5; +; CHECK-NEXT: st.b32 [%rd2+8], %r7; +; CHECK-NEXT: st.b32 
[%rd2+28], %r4; +; CHECK-NEXT: ret; + %a.load = load <8 x float>, ptr %a + tail call void @llvm.masked.store.v8f32.p0(<8 x float> %a.load, ptr align 32 %b, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>) + ret void +} + +define void @generic_4xdouble(ptr %a, ptr %b) { +; CHECK-LABEL: generic_4xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [generic_4xdouble_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1+16]; +; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd6, [generic_4xdouble_param_1]; +; CHECK-NEXT: st.b64 [%rd6], %rd4; +; CHECK-NEXT: st.b64 [%rd6+16], %rd2; +; CHECK-NEXT: ret; + %a.load = load <4 x double>, ptr %a + tail call void @llvm.masked.store.v4f64.p0(<4 x double> %a.load, ptr align 32 %b, <4 x i1> <i1 true, i1 false, i1 true, i1 false>) + ret void +} + +; global address space + +define void @global_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0]; +; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1]; +; SM90-NEXT: ld.param.b64 %rd2, [global_8xi32_param_1]; +; SM90-NEXT: st.global.b32 [%rd2], %r5; +; SM90-NEXT: st.global.b32 [%rd2+8], %r7; +; SM90-NEXT: st.global.b32 [%rd2+28], %r4; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_8xi32( +; SM100: { +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0]; +; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_8xi32_param_1]; +; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, _, %r3, _, _, _, _, %r8}; +; SM100-NEXT: ret; + %a.load = load <8 x i32>, ptr addrspace(1) %a + tail call void @llvm.masked.store.v8i32.p1(<8 x i32> %a.load, ptr addrspace(1) align 32 %b, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>) + ret void +} + +define void @global_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_4xi64( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<7>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [global_4xi64_param_0]; +; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1+16]; +; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1]; +; SM90-NEXT: ld.param.b64 %rd6, [global_4xi64_param_1]; +; SM90-NEXT: st.global.b64 [%rd6], %rd4; +; SM90-NEXT: st.global.b64 [%rd6+16], %rd2; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_4xi64( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [global_4xi64_param_0]; +; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd6, [global_4xi64_param_1]; +; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, _, %rd4, _}; +; SM100-NEXT: ret; + %a.load = load <4 x i64>, ptr addrspace(1) %a + tail call void @llvm.masked.store.v4i64.p1(<4 x i64> %a.load, ptr addrspace(1) align 32 %b, <4 x i1> <i1 true, i1 false, i1 true, i1 false>) + ret void +} + +define void @global_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_8xfloat( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // 
%bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [global_8xfloat_param_0]; +; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1]; +; SM90-NEXT: ld.param.b64 %rd2, [global_8xfloat_param_1]; +; SM90-NEXT: st.global.b32 [%rd2], %r5; +; SM90-NEXT: st.global.b32 [%rd2+8], %r7; +; SM90-NEXT: st.global.b32 [%rd2+28], %r4; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_8xfloat( +; SM100: { +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [global_8xfloat_param_0]; +; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_8xfloat_param_1]; +; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, _, %r3, _, _, _, _, %r8}; +; SM100-NEXT: ret; + %a.load = load <8 x float>, ptr addrspace(1) %a + tail call void @llvm.masked.store.v8f32.p1(<8 x float> %a.load, ptr addrspace(1) align 32 %b, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>) + ret void +} + +define void @global_4xdouble(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_4xdouble( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<7>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [global_4xdouble_param_0]; +; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1+16]; +; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1]; +; SM90-NEXT: ld.param.b64 %rd6, [global_4xdouble_param_1]; +; SM90-NEXT: st.global.b64 [%rd6], %rd4; +; SM90-NEXT: st.global.b64 [%rd6+16], %rd2; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_4xdouble( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [global_4xdouble_param_0]; +; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd6, [global_4xdouble_param_1]; +; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, _, %rd4, _}; +; SM100-NEXT: ret; + %a.load = load <4 x double>, ptr addrspace(1) %a + tail call void @llvm.masked.store.v4f64.p1(<4 x double> %a.load, ptr addrspace(1) align 32 %b, <4 x i1> <i1 true, i1 false, i1 true, i1 false>) + ret void +} + +; edge cases +define void @global_8xi32_all_mask_on(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_8xi32_all_mask_on( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [global_8xi32_all_mask_on_param_0]; +; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [global_8xi32_all_mask_on_param_1]; +; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_8xi32_all_mask_on( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [global_8xi32_all_mask_on_param_0]; +; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd6, [global_8xi32_all_mask_on_param_1]; +; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; +; SM100-NEXT: ret; + %a.load = load <8 x i32>, ptr addrspace(1) %a + tail call void @llvm.masked.store.v8i32.p1(<8 x i32> %a.load, ptr addrspace(1) align 32 %b, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret void +} + +define void 
@global_8xi32_all_mask_off(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; CHECK-LABEL: global_8xi32_all_mask_off( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ret; + %a.load = load <8 x i32>, ptr addrspace(1) %a + tail call void @llvm.masked.store.v8i32.p1(<8 x i32> %a.load, ptr addrspace(1) align 32 %b, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>) + ret void +} + +; This is an example pattern for the LSV's output of these masked stores +define void @vectorizerOutput(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; SM90-LABEL: vectorizerOutput( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [vectorizerOutput_param_0]; +; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1]; +; SM90-NEXT: ld.param.b64 %rd2, [vectorizerOutput_param_1]; +; SM90-NEXT: st.global.b32 [%rd2], %r5; +; SM90-NEXT: st.global.b32 [%rd2+4], %r6; +; SM90-NEXT: st.global.b32 [%rd2+12], %r8; +; SM90-NEXT: st.global.b32 [%rd2+16], %r1; +; SM90-NEXT: ret; +; +; SM100-LABEL: vectorizerOutput( +; SM100: { +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [vectorizerOutput_param_0]; +; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd2, [vectorizerOutput_param_1]; +; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, _, %r4, %r5, _, _, _}; +; SM100-NEXT: ret; + %1 = load <8 x i32>, ptr addrspace(1) %in, align 32 + %load05 = extractelement <8 x i32> %1, i32 0 + %load16 = extractelement <8 x i32> %1, i32 1 + %load38 = extractelement <8 x i32> %1, i32 3 + %load49 = extractelement <8 x i32> %1, i32 4 + %2 = insertelement <8 x i32> poison, i32 %load05, i32 0 + %3 = insertelement <8 x i32> %2, i32 %load16, i32 1 + %4 = insertelement <8 x i32> %3, i32 poison, i32 2 + %5 = insertelement <8 x i32> %4, i32 %load38, i32 3 + %6 = insertelement <8 x i32> %5, i32 %load49, i32 4 + %7 = insertelement <8 x i32> %6, i32 poison, i32 5 + %8 = insertelement <8 x i32> %7, i32 poison, i32 6 + %9 = insertelement <8 x i32> %8, i32 poison, i32 7 + call void @llvm.masked.store.v8i32.p1(<8 x i32> %9, ptr addrspace(1) align 32 %out, <8 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false>) + ret void +} + +declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, <8 x i1>) +declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, <4 x i1>) +declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, <8 x i1>) +declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, <4 x i1>) + +declare void @llvm.masked.store.v8i32.p1(<8 x i32>, ptr addrspace(1), <8 x i1>) +declare void @llvm.masked.store.v4i64.p1(<4 x i64>, ptr addrspace(1), <4 x i1>) +declare void @llvm.masked.store.v8f32.p1(<8 x float>, ptr addrspace(1), <8 x i1>) +declare void @llvm.masked.store.v4f64.p1(<4 x double>, ptr addrspace(1), <4 x i1>) diff --git a/llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll b/llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll new file mode 100644 index 0000000..6d9ad8d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll @@ -0,0 +1,247 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=nvptx-ir-peephole -S | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +; fsub(fmul(a, 
b), c) => fma(a, b, fneg(c)) +define float @test_fsub_fmul_c(float %a, float %b, float %c) { +; CHECK-LABEL: define float @test_fsub_fmul_c( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[TMP1]]) +; CHECK-NEXT: ret float [[TMP2]] +; + %mul = fmul contract float %a, %b + %sub = fsub contract float %mul, %c + ret float %sub +} + + +; fsub(c, fmul(a, b)) => fma(-a, b, c) +define float @test_fsub_c_fmul(float %a, float %b, float %c) { +; CHECK-LABEL: define float @test_fsub_c_fmul( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[A]] +; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[TMP1]], float [[B]], float [[C]]) +; CHECK-NEXT: ret float [[TMP2]] +; + %mul = fmul contract float %a, %b + %sub = fsub contract float %c, %mul + ret float %sub +} + + +; fsub(fmul(a, b), fmul(c, d)) => fma(a, b, fneg(fmul(c, d))) +define float @test_fsub_fmul_fmul(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: define float @test_fsub_fmul_fmul( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) { +; CHECK-NEXT: [[MUL2:%.*]] = fmul contract float [[C]], [[D]] +; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[MUL2]] +; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[TMP1]]) +; CHECK-NEXT: ret float [[TMP2]] +; + %mul1 = fmul contract float %a, %b + %mul2 = fmul contract float %c, %d + %sub = fsub contract float %mul1, %mul2 + ret float %sub +} + + +; fsub(fmul(a, b), fmul(c, d)) => fma(fneg(c), d, fmul(a, b))) +; fmul(a, b) has multiple uses. 
+define float @test_fsub_fmul_fmul_multiple_use(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: define float @test_fsub_fmul_fmul_multiple_use( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) { +; CHECK-NEXT: [[MUL1:%.*]] = fmul contract float [[A]], [[B]] +; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[TMP1]], float [[D]], float [[MUL1]]) +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP2]], [[MUL1]] +; CHECK-NEXT: ret float [[ADD]] +; + %mul1 = fmul contract float %a, %b + %mul2 = fmul contract float %c, %d + %sub = fsub contract float %mul1, %mul2 + %add = fadd float %sub, %mul1 + ret float %add +} + + +; fsub(fmul(a, b), c) => fma(a, b, fneg(c)) where fsub and fmul are in different BBs +define float @test_fsub_fmul_different_BB(float %a, float %b, float %c, i32 %n) { +; CHECK-LABEL: define float @test_fsub_fmul_different_BB( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[INIT:.*]]: +; CHECK-NEXT: [[CMP_ITER:%.*]] = icmp sgt i32 [[N]], 10 +; CHECK-NEXT: br i1 [[CMP_ITER]], label %[[ITERATION:.*]], label %[[EXIT:.*]] +; CHECK: [[ITERATION]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[INIT]] ], [ [[I_NEXT:%.*]], %[[ITERATION]] ] +; CHECK-NEXT: [[ACC:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT:%.*]], %[[ITERATION]] ] +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[ACC_NEXT]] = fadd contract float [[ACC]], 1.000000e+00 +; CHECK-NEXT: [[CMP_LOOP:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[CMP_LOOP]], label %[[ITERATION]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[C_PHI:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT]], %[[ITERATION]] ] +; CHECK-NEXT: [[TMP0:%.*]] = fneg contract float [[C_PHI]] +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[TMP0]]) +; CHECK-NEXT: ret float [[TMP1]] +; +init: + %mul = fmul contract float %a, %b + %cmp_iter = icmp sgt i32 %n, 10 + br i1 %cmp_iter, label %iteration, label %exit + +iteration: + %i = phi i32 [ 0, %init ], [ %i_next, %iteration ] + %acc = phi float [ %c, %init ], [ %acc_next, %iteration ] + %i_next = add i32 %i, 1 + %acc_next = fadd contract float %acc, 1.0 + %cmp_loop = icmp slt i32 %i_next, %n + br i1 %cmp_loop, label %iteration, label %exit + +exit: + %c_phi = phi float [ %c, %init ], [ %acc_next, %iteration ] + %sub = fsub contract float %mul, %c_phi + ret float %sub +} + + +; fadd(fmul(a, b), c) => fma(a, b, c) +define float @test_fadd_fmul_c(float %a, float %b, float %c) { +; CHECK-LABEL: define float @test_fadd_fmul_c( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[C]]) +; CHECK-NEXT: ret float [[TMP1]] +; + %mul = fmul contract float %a, %b + %add = fadd contract float %mul, %c + ret float %add +} + + +; fadd(c, fmul(a, b)) => fma(a, b, c) +define float @test_fadd_c_fmul(float %a, float %b, float %c) { +; CHECK-LABEL: define float @test_fadd_c_fmul( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[C]]) +; CHECK-NEXT: ret float [[TMP1]] +; + %mul = fmul contract float %a, %b + %add = fadd contract float %c, %mul + ret float %add +} + + +; fadd(fmul(a, b), fmul(c, d)) => fma(a, b, fmul(c, d)) +define float @test_fadd_fmul_fmul(float 
%a, float %b, float %c, float %d) { +; CHECK-LABEL: define float @test_fadd_fmul_fmul( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) { +; CHECK-NEXT: [[MUL2:%.*]] = fmul contract float [[C]], [[D]] +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[MUL2]]) +; CHECK-NEXT: ret float [[TMP1]] +; + %mul1 = fmul contract float %a, %b + %mul2 = fmul contract float %c, %d + %add = fadd contract float %mul1, %mul2 + ret float %add +} + + +; fadd(fmul(a, b), c) => fma(a, b, c) where fadd and fmul are in different BBs +define float @test_fadd_fmul_different_BB(float %a, float %b, float %c, i32 %n) { +; CHECK-LABEL: define float @test_fadd_fmul_different_BB( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[INIT:.*]]: +; CHECK-NEXT: [[CMP_ITER:%.*]] = icmp sgt i32 [[N]], 10 +; CHECK-NEXT: br i1 [[CMP_ITER]], label %[[ITERATION:.*]], label %[[EXIT:.*]] +; CHECK: [[ITERATION]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[INIT]] ], [ [[I_NEXT:%.*]], %[[ITERATION]] ] +; CHECK-NEXT: [[ACC:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT:%.*]], %[[ITERATION]] ] +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[ACC_NEXT]] = fadd contract float [[ACC]], 1.000000e+00 +; CHECK-NEXT: [[CMP_LOOP:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[CMP_LOOP]], label %[[ITERATION]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[C_PHI:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT]], %[[ITERATION]] ] +; CHECK-NEXT: [[TMP0:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[C_PHI]]) +; CHECK-NEXT: ret float [[TMP0]] +; +init: + %mul = fmul contract float %a, %b + %cmp_iter = icmp sgt i32 %n, 10 + br i1 %cmp_iter, label %iteration, label %exit + +iteration: + %i = phi i32 [ 0, %init ], [ %i_next, %iteration ] + %acc = phi float [ %c, %init ], [ %acc_next, %iteration ] + %i_next = add i32 %i, 1 + %acc_next = fadd contract float %acc, 1.0 + %cmp_loop = icmp slt i32 %i_next, %n + br i1 %cmp_loop, label %iteration, label %exit + +exit: + %c_phi = phi float [ %c, %init ], [ %acc_next, %iteration ] + %add = fadd contract float %mul, %c_phi + ret float %add +} + + +; These scenarios shouldn't work. 
+; fadd(fpext(fmul(a, b)), c) => fma(fpext(a), fpext(b), c) +define double @test_fadd_fpext_fmul_c(float %a, float %b, double %c) { +; CHECK-LABEL: define double @test_fadd_fpext_fmul_c( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], double [[C:%.*]]) { +; CHECK-NEXT: [[MUL:%.*]] = fmul contract float [[A]], [[B]] +; CHECK-NEXT: [[EXT:%.*]] = fpext float [[MUL]] to double +; CHECK-NEXT: [[ADD:%.*]] = fadd contract double [[EXT]], [[C]] +; CHECK-NEXT: ret double [[ADD]] +; + %mul = fmul contract float %a, %b + %ext = fpext float %mul to double + %add = fadd contract double %ext, %c + ret double %add +} + + +; fadd(c, fpext(fmul(a, b))) => fma(fpext(a), fpext(b), c) +define double @test_fadd_c_fpext_fmul(float %a, float %b, double %c) { +; CHECK-LABEL: define double @test_fadd_c_fpext_fmul( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], double [[C:%.*]]) { +; CHECK-NEXT: [[MUL:%.*]] = fmul contract float [[A]], [[B]] +; CHECK-NEXT: [[EXT:%.*]] = fpext float [[MUL]] to double +; CHECK-NEXT: [[ADD:%.*]] = fadd contract double [[C]], [[EXT]] +; CHECK-NEXT: ret double [[ADD]] +; + %mul = fmul contract float %a, %b + %ext = fpext float %mul to double + %add = fadd contract double %c, %ext + ret double %add +} + + +; Double precision tests +; fsub(fmul(a, b), c) => fma(a, b, fneg(c)) +define double @test_fsub_fmul_c_double(double %a, double %b, double %c) { +; CHECK-LABEL: define double @test_fsub_fmul_c_double( +; CHECK-SAME: double [[A:%.*]], double [[B:%.*]], double [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = fneg contract double [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = call contract double @llvm.fma.f64(double [[A]], double [[B]], double [[TMP1]]) +; CHECK-NEXT: ret double [[TMP2]] +; + %mul = fmul contract double %a, %b + %sub = fsub contract double %mul, %c + ret double %sub +} + + +; fadd(fmul(a, b), c) => fma(a, b, c) +define double @test_fadd_fmul_c_double(double %a, double %b, double %c) { +; CHECK-LABEL: define double @test_fadd_fmul_c_double( +; CHECK-SAME: double [[A:%.*]], double [[B:%.*]], double [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.fma.f64(double [[A]], double [[B]], double [[C]]) +; CHECK-NEXT: ret double [[TMP1]] +; + %mul = fmul contract double %a, %b + %add = fadd contract double %mul, %c + ret double %add +} diff --git a/llvm/test/CodeGen/NVPTX/op-fence.ll b/llvm/test/CodeGen/NVPTX/op-fence.ll new file mode 100644 index 0000000..629b702 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/op-fence.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %} + +; CHECK-LABEL: test_fence_mbarrier_init +define void @test_fence_mbarrier_init() { +; CHECK-LABEL: test_fence_mbarrier_init( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.mbarrier_init.release.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.mbarrier_init.release.cluster(); + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/param-add.ll b/llvm/test/CodeGen/NVPTX/param-add.ll index 06d7384..b220450 100644 --- a/llvm/test/CodeGen/NVPTX/param-add.ll +++ b/llvm/test/CodeGen/NVPTX/param-add.ll @@ -2,11 +2,6 @@ ; RUN: llc < %s -march=nvptx64 --debug-counter=dagcombine=0 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -march=nvptx64 --debug-counter=dagcombine=0 | %ptxas-verify 
%} -; REQUIRES: asserts -; asserts are required for --debug-counter=dagcombine=0 to have the intended -; effect of disabling DAG combines, which exposes the bug. When combines are -; enabled the bug does not occur. - %struct.1float = type <{ [1 x float] }> declare i32 @callee(%struct.1float %a) diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll index 51f6b00..4870050 100644 --- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll +++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll @@ -171,8 +171,7 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[12]) ; CHECK-LABEL: callee_St4x3( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12] - ; CHECK: ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0]; - ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8]; + ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], %{{.*}}}, [callee_St4x3_param_0]; ; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]}; ; CHECK-DAG: st.param.b32 [func_retval0+8], [[R3]]; ; CHECK-NEXT: ret; @@ -394,8 +393,7 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by ; CHECK-LABEL: callee_St4x7( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28] ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0]; - ; CHECK: ld.param.v2.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16]; - ; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24]; + ; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], %{{.*}}}, [callee_St4x7_param_0+16]; ; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK-DAG: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; ; CHECK-DAG: st.param.b32 [func_retval0+24], [[R7]]; diff --git a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir index dfc8417..a84b7fc 100644 --- a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir +++ b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir @@ -77,7 +77,7 @@ constants: [] machineFunctionInfo: {} body: | bb.0: - %0:b32, %1:b32, %2:b32, %3:b32 = LDV_i32_v4 0, 0, 101, 3, 32, &retval0, 0 :: (load (s128), addrspace 101) + %0:b32, %1:b32, %2:b32, %3:b32 = LDV_i32_v4 0, 0, 101, 3, 32, -1, &retval0, 0 :: (load (s128), addrspace 101) ; CHECK-NOT: ProxyReg %4:b32 = ProxyRegB32 killed %0 %5:b32 = ProxyRegB32 killed %1 @@ -86,7 +86,7 @@ body: | ; CHECK: STV_i32_v4 killed %0, killed %1, killed %2, killed %3 STV_i32_v4 killed %4, killed %5, killed %6, killed %7, 0, 0, 101, 32, &func_retval0, 0 :: (store (s128), addrspace 101) - %8:b32 = LD_i32 0, 0, 101, 3, 32, &retval0, 0 :: (load (s32), addrspace 101) + %8:b32 = LD_i32 0, 0, 101, 3, 32, -1, &retval0, 0 :: (load (s32), addrspace 101) ; CHECK-NOT: ProxyReg %9:b32 = ProxyRegB32 killed %8 %10:b32 = ProxyRegB32 killed %9 diff --git a/llvm/test/CodeGen/NVPTX/switch-loop-header.mir b/llvm/test/CodeGen/NVPTX/switch-loop-header.mir new file mode 100644 index 0000000..4d86bb8 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/switch-loop-header.mir @@ -0,0 +1,182 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -o - %s -passes="require<machine-loops>,require<live-vars>,phi-node-elimination" | FileCheck %s + +--- | + target datalayout = 
"e-p6:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + target triple = "nvptx64-unknown-nvidiacl" + + define void @func_26(i32 %BS_COND_16.0.BS_COND_16.0.BS_COND_16.0.BS_COND_16.0.) { + entry: + br label %for.cond + + for.cond: ; preds = %BS_LABEL_1, %BS_LABEL_1, %entry + %p_2218_0.1 = phi i32 [ 0, %entry ], [ %p_2218_0.3, %BS_LABEL_1 ], [ %p_2218_0.3, %BS_LABEL_1 ] + br label %BS_LABEL_1 + + BS_LABEL_2: ; preds = %BS_LABEL_1 + %sub = or i32 %p_2218_0.3, 1 + br label %for.cond4 + + for.cond4: ; preds = %BS_LABEL_1, %BS_LABEL_2 + %p_2218_0.2 = phi i32 [ %BS_COND_16.0.BS_COND_16.0.BS_COND_16.0.BS_COND_16.0., %BS_LABEL_1 ], [ %sub, %BS_LABEL_2 ] + br label %BS_LABEL_1 + + BS_LABEL_1: ; preds = %for.cond4, %for.cond + %p_2218_0.3 = phi i32 [ %p_2218_0.2, %for.cond4 ], [ %p_2218_0.1, %for.cond ] + switch i32 %BS_COND_16.0.BS_COND_16.0.BS_COND_16.0.BS_COND_16.0., label %unreachable [ + i32 0, label %for.cond4 + i32 4, label %BS_LABEL_2 + i32 1, label %for.cond + i32 6, label %for.cond + ] + + unreachable: ; preds = %BS_LABEL_1 + call void asm sideeffect "exit;", ""() + unreachable + } +... +--- +name: func_26 +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: false +isSSA: true +noVRegs: false +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHContTarget: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: b32, preferred-register: '', flags: [ ] } + - { id: 1, class: b32, preferred-register: '', flags: [ ] } + - { id: 2, class: b32, preferred-register: '', flags: [ ] } + - { id: 3, class: b32, preferred-register: '', flags: [ ] } + - { id: 4, class: b32, preferred-register: '', flags: [ ] } + - { id: 5, class: b32, preferred-register: '', flags: [ ] } + - { id: 6, class: b32, preferred-register: '', flags: [ ] } + - { id: 7, class: b1, preferred-register: '', flags: [ ] } + - { id: 8, class: b32, preferred-register: '', flags: [ ] } + - { id: 9, class: b1, preferred-register: '', flags: [ ] } + - { id: 10, class: b32, preferred-register: '', flags: [ ] } + - { id: 11, class: b1, preferred-register: '', flags: [ ] } +liveins: [] +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +jumpTable: + kind: inline + entries: + - id: 0 + blocks: [ '%bb.3', '%bb.1', '%bb.6', '%bb.6', '%bb.2', '%bb.6', + '%bb.1' ] +body: | + ; CHECK-LABEL: name: func_26 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead [[DEF:%[0-9]+]]:b32 = IMPLICIT_DEF + ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:b1 = IMPLICIT_DEF + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead [[DEF2:%[0-9]+]]:b32 = IMPLICIT_DEF + ; CHECK-NEXT: GOTO %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; 
CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.6(0x00000000), %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CBranch undef [[DEF1]], %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.3(0x3e000000), %bb.1(0x04000000), %bb.6(0x00000000), %bb.2(0x3e000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BRX_START 0 + ; CHECK-NEXT: BRX_ITEM %bb.3 + ; CHECK-NEXT: BRX_ITEM %bb.1 + ; CHECK-NEXT: BRX_ITEM %bb.6 + ; CHECK-NEXT: BRX_ITEM %bb.6 + ; CHECK-NEXT: BRX_ITEM %bb.2 + ; CHECK-NEXT: BRX_ITEM %bb.6 + ; CHECK-NEXT: BRX_END %bb.1, undef [[DEF]], 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6: + bb.0: + successors: %bb.1(0x80000000) + + %10:b32 = IMPLICIT_DEF + %11:b1 = IMPLICIT_DEF + + bb.1: + successors: %bb.4(0x80000000) + + %0:b32 = PHI undef %10, %bb.0, undef %0, %bb.5 + GOTO %bb.4 + + bb.2: + successors: %bb.3(0x80000000) + + bb.3: + successors: %bb.4(0x80000000) + + bb.4: + successors: %bb.6(0x00000000), %bb.5(0x80000000) + + CBranch undef %11, %bb.6 + + bb.5: + successors: %bb.3(0x3e000000), %bb.1(0x04000000), %bb.6(0x00000000), %bb.2(0x3e000000) + + BRX_START 0 + BRX_ITEM %bb.3 + BRX_ITEM %bb.1 + BRX_ITEM %bb.6 + BRX_ITEM %bb.6 + BRX_ITEM %bb.2 + BRX_ITEM %bb.6 + BRX_END %bb.1, undef %10, 0 + + bb.6: +... diff --git a/llvm/test/CodeGen/NVPTX/switch.ll b/llvm/test/CodeGen/NVPTX/switch.ll new file mode 100644 index 0000000..7fcfcfb --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/switch.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mcpu=sm_20 -verify-machineinstrs | FileCheck %s + +target triple = "nvptx64-unknown-nvidiacl" + +define void @pr170051(i32 %cond) { +; CHECK-LABEL: pr170051( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: mov.b32 %r2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [pr170051_param_0]; +; CHECK-NEXT: setp.gt.u32 %p1, %r1, 6; +; CHECK-NEXT: bra.uni $L__BB0_3; +; CHECK-NEXT: $L__BB0_1: // %BS_LABEL_2 +; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: or.b32 %r3, %r2, 1; +; CHECK-NEXT: $L__BB0_2: // %for.cond4 +; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: mov.b32 %r2, %r3; +; CHECK-NEXT: $L__BB0_3: // %BS_LABEL_1 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: @%p1 bra $L__BB0_5; +; CHECK-NEXT: // %bb.4: // %BS_LABEL_1 +; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: mov.b32 %r3, %r1; +; CHECK-NEXT: $L_brx_0: .branchtargets +; CHECK-NEXT: $L__BB0_2, +; CHECK-NEXT: $L__BB0_3, +; CHECK-NEXT: $L__BB0_5, +; CHECK-NEXT: $L__BB0_5, +; CHECK-NEXT: $L__BB0_1, +; CHECK-NEXT: $L__BB0_5, +; CHECK-NEXT: $L__BB0_3; +; CHECK-NEXT: brx.idx %r1, $L_brx_0; +; CHECK-NEXT: $L__BB0_5: // %unreachable +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: exit; +; CHECK-NEXT: // end inline asm +entry: + br label %for.cond + +for.cond: ; preds = %for.cond4.for.cond_crit_edge, %BS_LABEL_1, %BS_LABEL_1, %entry + %p_2218_0.1 = phi i32 [ 0, %entry ], [ %p_2218_0.3, %BS_LABEL_1 ], [ %p_2218_0.3, %BS_LABEL_1 ], [ poison, %for.cond4.for.cond_crit_edge ] + br label %BS_LABEL_1 + +BS_LABEL_2: ; preds = %BS_LABEL_1 + %sub = or i32 %p_2218_0.3, 1 + br label %for.cond4 + +for.cond4: ; preds = %BS_LABEL_1, %BS_LABEL_2 + %p_2218_0.2 = phi i32 [ 0, %BS_LABEL_1 ], [ %sub, %BS_LABEL_2 ] 
+ br i1 false, label %for.cond4.for.cond_crit_edge, label %BS_LABEL_1 + +for.cond4.for.cond_crit_edge: ; preds = %for.cond4 + br label %for.cond + +BS_LABEL_1: ; preds = %for.cond4, %for.cond + %p_2218_0.3 = phi i32 [ %p_2218_0.2, %for.cond4 ], [ %p_2218_0.1, %for.cond ] + switch i32 %cond, label %unreachable [ + i32 0, label %for.cond4 + i32 4, label %BS_LABEL_2 + i32 1, label %for.cond + i32 6, label %for.cond + ] + +unreachable: ; preds = %BS_LABEL_1 + unreachable +} + + diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll new file mode 100644 index 0000000..479de53 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; NOTE: This sample test demonstrates the pretty print feature for NVPTX intrinsics +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +define void @tcgen05_mma_fp16_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d) { + ; CHECK-LABEL: define void @tcgen05_mma_fp16_cta1( + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 0) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 1) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=fill */ i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 2) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=use */ i32 3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_f8f6f4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d) { + ; CHECK-LABEL: define void @tcgen05_mma_f8f6f4_cta2( + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=discard */ i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 0) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr 
addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 1) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=fill */ i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 2) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=use */ i32 3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 3) + + ret void +} + +; This test verifies that printImmArg is safe to call on all constant arguments, but only prints comments for arguments that have pretty printing configured. +define void @test_mixed_constants_edge_case(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor) { + ; CHECK-LABEL: define void @test_mixed_constants_edge_case( + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 42, i32 100, i1 true, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 42, i32 100, i1 true, i32 3, i32 1, i32 0) + + ret void +} + +declare void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6), ptr addrspace(6), i64, i32, i1, i32, i32, i32) diff --git a/llvm/test/CodeGen/NVPTX/thread-fence.ll b/llvm/test/CodeGen/NVPTX/thread-fence.ll new file mode 100644 index 0000000..185461b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/thread-fence.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | %ptxas-verify -arch=sm_90 %} + +; CHECK-LABEL: test_fence_acquire +define void @test_fence_acquire() { +; CHECK-LABEL: test_fence_acquire( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.acquire.sync_restrict::shared::cluster.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster(); + + ret void +} + +; CHECK-LABEL: test_fence_release +define void @test_fence_release() { +; CHECK-LABEL: test_fence_release( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.release.sync_restrict::shared::cta.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster(); + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/used-bytes-mask.ll b/llvm/test/CodeGen/NVPTX/used-bytes-mask.ll new file mode 100644 index 0000000..a888d99 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/used-bytes-mask.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx82 | FileCheck %s -check-prefixes=NOMASK +; RUN: %if ptxas-sm_90 && ptxas-isa-8.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx82 | %ptxas-verify -arch=sm_90 %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | FileCheck %s -check-prefixes=MASK +; RUN: 
%if ptxas-sm_90 && ptxas-isa-8.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | %ptxas-verify -arch=sm_90 %} + +; On older architectures and versions, we shouldn't be seeing a used bytes mask pragma. +; Specifically, the pragma is only supported on SM_50 or later, and PTX 8.3 or later. +; Architecture fixed at SM_90 for this test for stability, and we vary the PTX version to test the pragma. + +define i32 @global_8xi32(ptr %a, ptr %b) { +; NOMASK-LABEL: global_8xi32( +; NOMASK: { +; NOMASK-NEXT: .reg .b32 %r<5>; +; NOMASK-NEXT: .reg .b64 %rd<2>; +; NOMASK-EMPTY: +; NOMASK-NEXT: // %bb.0: +; NOMASK-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0]; +; NOMASK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; NOMASK-NEXT: st.param.b32 [func_retval0], %r1; +; NOMASK-NEXT: ret; +; +; MASK-LABEL: global_8xi32( +; MASK: { +; MASK-NEXT: .reg .b32 %r<5>; +; MASK-NEXT: .reg .b64 %rd<2>; +; MASK-EMPTY: +; MASK-NEXT: // %bb.0: +; MASK-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0]; +; MASK-NEXT: .pragma "used_bytes_mask 0xfff"; +; MASK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; MASK-NEXT: st.param.b32 [func_retval0], %r1; +; MASK-NEXT: ret; + %a.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 %a, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> poison) + %first = extractelement <4 x i32> %a.load, i32 0 + ret i32 %first +} +declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr , <4 x i1>, <4 x i32>) diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx88-sm120a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx88-sm120a.py new file mode 100644 index 0000000..f1666db --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx88-sm120a.py @@ -0,0 +1,12 @@ +# Check all variants of instructions supported by PTX88 on SM120a +# RUN: %python %s --ptx=88 --gpu-arch=120 --aa > %t-ptx88-sm_120a.ll +# RUN: llc < %t-ptx88-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx88 \ +# RUN: | FileCheck %t-ptx88-sm_120a.ll +# RUN: %if ptxas-sm_120a && ptxas-isa-8.8 %{ \ +# RUN: llc < %t-ptx88-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx88 \ +# RUN: | %ptxas-verify -arch=sm_120a \ +# RUN: %} + +import wmma + +wmma.main() diff --git a/llvm/test/CodeGen/NVPTX/wmma.py b/llvm/test/CodeGen/NVPTX/wmma.py index 8427ae4..817665a6 100644 --- a/llvm/test/CodeGen/NVPTX/wmma.py +++ b/llvm/test/CodeGen/NVPTX/wmma.py @@ -131,7 +131,7 @@ class MMAFrag: "m16n8k64:b:e5m2": 4, "m16n8k64:b:e3m2": 4, "m16n8k64:b:e2m3": 4, - "m16n8k64:b:e2m1": 4, + "m16n8k64:b:e2m1": 4 if is_mma_sparse else 2, "m16n8k64:c:f16": 2, "m16n8k64:c:f32": 4, "m16n8k64:d:f16": 2, @@ -1131,6 +1131,160 @@ def gen_mma_tests(): return generated_items +def get_mma_block_scale_ops(): + return make_mma_ops(["m16n8k64"], ["e2m1"], [], ["f32"], []) + make_mma_ops( + ["m16n8k32"], + ["e4m3", "e5m2", "e3m2", "e2m3", "e2m1"], + ["e4m3", "e5m2", "e3m2", "e2m3", "e2m1"], + ["f32"], + [], + ) + + +def is_mma_block_scale_geom_supported(geom): + # geometries for FP. 
+ if geom in [ + "m16n8k32", + "m16n8k64", + ]: + return True + raise ValueError(f"Unexpected MMA block scale geometry: {geom}") + + +def is_mma_block_scale_variant_supported(op, kind, scale_vec_size, stype): + if not ( + is_type_supported(op.a.mma_type.ptx_type) + and is_mma_block_scale_geom_supported(op.a.geom) + ): + return False + + if ( + op.a.geom == "m16n8k64" + and kind == "mxf4" + and stype == "ue8m0" + and scale_vec_size in ["", ".scale_vec::2X"] + ): + return True + + if ( + op.a.geom == "m16n8k64" + and kind == "mxf4nvf4" + and stype == "ue8m0" + and scale_vec_size == ".scale_vec::2X" + ): + return True + + if ( + op.a.geom == "m16n8k64" + and kind == "mxf4nvf4" + and stype == "ue4m3" + and scale_vec_size == ".scale_vec::4X" + ): + return True + + if ( + op.a.geom == "m16n8k32" + and kind == "mxf8f6f4" + and stype == "ue8m0" + and scale_vec_size in ["", ".scale_vec::1X"] + ): + return True + + return False + + +def common_mma_block_scale_test_gen( + params, op, intrinsic_template, instruction_template +): + mma_block_scale_template = """ +declare ${ret_ty} @${intrinsic}( + ${args}); + +; CHECK-LABEL: .func {{.*}}test_${function}( +define ${ret_ty} @test_${function}( + ${args}) { +; CHECK: ${instruction} +; CHECK-NEXT: ${check_d} +; CHECK-NEXT: ${check_a} +; CHECK-NEXT: ${check_b} +; CHECK-NEXT: ${check_c} +; CHECK-NEXT: ${check_scale_a_data} +; CHECK-NEXT: ${check_byte_id_a} +; CHECK-NEXT: ${check_thread_id_a} +; CHECK-NEXT: ${check_scale_b_data} +; CHECK-NEXT: ${check_byte_id_b} +; CHECK-NEXT: ${check_thread_id_b} + %r = call ${ret_ty} @${intrinsic}( + ${args}); + ret ${ret_ty} %r; +} +""" + + test_params = params + test_params["intrinsic"] = Template(intrinsic_template).substitute(params) + test_params["function"] = test_params["intrinsic"].replace(".", "_") + test_params["instruction"] = Template(instruction_template).substitute(params) + test_params["ret_ty"] = make_wmma_ld_ret_ty(op.d) + test_params["check_a"] = check_pattern(op.a) + test_params["check_b"] = check_pattern(op.b) + test_params["check_c"] = check_pattern(op.c) + test_params["check_d"] = check_pattern(op.d) + test_params["check_scale_a_data"] = "{{%r[0-9]+}}" + test_params["check_byte_id_a"] = "{{%r[0-9]+}}" + test_params["check_thread_id_a"] = "{{%r[0-9]+}}" + test_params["check_scale_b_data"] = "{{%r[0-9]+}}" + test_params["check_byte_id_b"] = "{{%r[0-9]+}}" + test_params["check_thread_id_b"] = "{{%r[0-9]+}}" + args = ",\n ".join( + list(make_wmma_slice_args(frag) for frag in (op.a, op.b, op.c)) + + ["i32 %scale_a_data", "i16 %byte_id_a, i16 %thread_id_a"] + + ["i32 %scale_b_data", "i16 %byte_id_b, i16 %thread_id_b"] + ) + test_params["args"] = args + print(Template(mma_block_scale_template).substitute(test_params)) + return (test_params["intrinsic"], test_params["instruction"]) + + +def gen_mma_block_scale_tests(): + if not (ptx_version >= 88 and gpu_arch >= 120 and aa): + return [] + + mma_block_scale_intrinsic_template = "llvm.nvvm.mma.block.scale.${geom}.row.col.${kind}${scale}.${intrinsic_signature}.${stype}" + mma_block_scale_instruction_template = "mma.sync.aligned.${geom}.row.col.kind::${kind}.block_scale${scale_vec_size}.${ptx_signature}.${stype}" + + generated_items = [] + + for op, kind, scale_vec_size, stype in product( + get_mma_block_scale_ops(), + ["mxf4", "mxf4nvf4", "mxf8f6f4"], + ["", ".scale_vec::1X", ".scale_vec::2X", ".scale_vec::4X"], + ["ue8m0", "ue4m3"], + ): + if not is_mma_block_scale_variant_supported(op, kind, scale_vec_size, stype): + continue + + params = { + 
"intrinsic_signature": mma_signature(op), + "ptx_signature": mma_ptx_signature(op), + "geom": op.a.geom, + "kind": kind, + "scale_vec_size": scale_vec_size, + "scale": scale_vec_size.replace("_vec::", ".").lower(), + "stype": stype, + } + + intrinsic_template = mma_block_scale_intrinsic_template + instruction_template = mma_block_scale_instruction_template + + generated_items.append( + common_mma_block_scale_test_gen( + params, op, intrinsic_template, instruction_template + ) + ) + + return generated_items + + def get_mma_sp_ops(): return ( make_mma_ops(["m16n8k16", "m16n8k32"], ["bf16"], [], ["f32"], [], True) @@ -1224,7 +1378,11 @@ def is_mma_sp_variant_supported(op, metadata, kind, satf): return True -def sp_selector_gen(op): +def sp_selector_gen(op, block_scale=False): + if block_scale: + # PTX ISA 9.0 has the sparsity selector equal to 0 only + return range(1) + # (geom, type) -> allowed selector range range_01 = { ("m16n8k32", "bf16"), @@ -1355,6 +1513,178 @@ def gen_mma_sp_tests(): return generated_items +def get_mma_sp_block_scale_ops(): + return make_mma_ops(["m16n8k128"], ["e2m1"], [], ["f32"], [], True) + make_mma_ops( + ["m16n8k64"], + ["e4m3", "e5m2", "e3m2", "e2m3", "e2m1"], + ["e4m3", "e5m2", "e3m2", "e2m3", "e2m1"], + ["f32"], + [], + True, + ) + + +def is_mma_sp_block_scale_geom_supported(geom): + # geometries for FP. + if geom in [ + "m16n8k64", + "m16n8k128", + ]: + return True + raise ValueError(f"Unexpected sparse MMA block scale geometry: {geom}") + + +def is_mma_sp_block_scale_variant_supported(op, kind, scale_vec_size, stype): + if not ( + is_type_supported(op.a.mma_type.ptx_type) + and is_mma_sp_block_scale_geom_supported(op.a.geom) + ): + return False + + if ( + op.a.geom == "m16n8k128" + and kind == "mxf4" + and stype == "ue8m0" + and scale_vec_size in ["", ".scale_vec::2X"] + ): + return True + + if ( + op.a.geom == "m16n8k128" + and kind == "mxf4nvf4" + and stype == "ue8m0" + and scale_vec_size == ".scale_vec::2X" + ): + return True + + if ( + op.a.geom == "m16n8k128" + and kind == "mxf4nvf4" + and stype == "ue4m3" + and scale_vec_size == ".scale_vec::4X" + ): + return True + + if ( + op.a.geom == "m16n8k64" + and kind == "mxf8f6f4" + and stype == "ue8m0" + and scale_vec_size in ["", ".scale_vec::1X"] + ): + return True + + return False + + +def common_mma_sp_block_scale_test_gen( + params, op, intrinsic_template, instruction_template +): + mma_sp_block_scale_decl_template = """ +declare ${ret_ty} @${intrinsic}( + ${args}); +""" + + mma_sp_block_scale_test_template = """ +; CHECK-LABEL: .func {{.*}}test_${function}_${selector}( +define ${ret_ty} @test_${function}_${selector}( + ${args}) { +; CHECK: ${instruction} +; CHECK-NEXT: ${check_d} +; CHECK-NEXT: ${check_a} +; CHECK-NEXT: ${check_b} +; CHECK-NEXT: ${check_c} +; CHECK-NEXT: ${check_metadata} +; CHECK-NEXT: ${check_selector} +; CHECK-NEXT: ${check_scale_a_data} +; CHECK-NEXT: ${check_byte_id_a} +; CHECK-NEXT: ${check_thread_id_a} +; CHECK-NEXT: ${check_scale_b_data} +; CHECK-NEXT: ${check_byte_id_b} +; CHECK-NEXT: ${check_thread_id_b} + %r = call ${ret_ty} @${intrinsic}( + ${call_args}); + ret ${ret_ty} %r; +} +""" + + test_params = params + test_params["intrinsic"] = Template(intrinsic_template).substitute(params) + test_params["function"] = test_params["intrinsic"].replace(".", "_") + test_params["instruction"] = Template(instruction_template).substitute(params) + test_params["ret_ty"] = make_wmma_ld_ret_ty(op.d) + test_params["check_a"] = check_pattern(op.a) + test_params["check_b"] = 
check_pattern(op.b) + test_params["check_c"] = check_pattern(op.c) + test_params["check_d"] = check_pattern(op.d) + test_params["check_metadata"] = "{{%r[0-9]+}}" + test_params["check_scale_a_data"] = "{{%r[0-9]+}}" + test_params["check_byte_id_a"] = "{{%r[0-9]+}}" + test_params["check_thread_id_a"] = "{{%r[0-9]+}}" + test_params["check_scale_b_data"] = "{{%r[0-9]+}}" + test_params["check_byte_id_b"] = "{{%r[0-9]+}}" + test_params["check_thread_id_b"] = "{{%r[0-9]+}}" + args = ",\n ".join( + list(make_wmma_slice_args(frag) for frag in (op.a, op.b, op.c)) + + ["i32 %metadata", "i32 %selector"] + + ["i32 %scale_a_data", "i16 %byte_id_a, i16 %thread_id_a"] + + ["i32 %scale_b_data", "i16 %byte_id_b, i16 %thread_id_b"] + ) + test_params["args"] = args + + print(Template(mma_sp_block_scale_decl_template).substitute(test_params)) + + for selector in [str(r) for r in sp_selector_gen(op, True)]: + test_params["selector"] = selector + test_params["check_selector"] = "{{" + test_params["selector"] + "}}" + test_params["call_args"] = test_params["args"].replace( + "%selector", test_params["selector"] + ) + + print(Template(mma_sp_block_scale_test_template).substitute(test_params)) + + return (test_params["intrinsic"], test_params["instruction"]) + + +def gen_mma_sp_block_scale_tests(): + if not (ptx_version >= 88 and gpu_arch >= 120 and aa): + return [] + + mma_sp_block_scale_intrinsic_template = "llvm.nvvm.mma.sp.ordered.metadata.block.scale.${geom}.row.col.${kind}${scale}.${intrinsic_signature}.${stype}" + mma_sp_block_scale_instruction_template = "mma.sp::ordered_metadata.sync.aligned.${geom}.row.col.kind::${kind}.block_scale${scale_vec_size}.${ptx_signature}.${stype}" + + generated_items = [] + + for op, kind, scale_vec_size, stype in product( + get_mma_sp_block_scale_ops(), + ["mxf4", "mxf4nvf4", "mxf8f6f4"], + ["", ".scale_vec::1X", ".scale_vec::2X", ".scale_vec::4X"], + ["ue8m0", "ue4m3"], + ): + if not is_mma_sp_block_scale_variant_supported(op, kind, scale_vec_size, stype): + continue + + params = { + "intrinsic_signature": mma_signature(op), + "ptx_signature": mma_ptx_signature(op), + "geom": op.a.geom, + "kind": kind, + "scale_vec_size": scale_vec_size, + "scale": scale_vec_size.replace("_vec::", ".").lower(), + "stype": stype, + } + + intrinsic_template = mma_sp_block_scale_intrinsic_template + instruction_template = mma_sp_block_scale_instruction_template + + generated_items.append( + common_mma_sp_block_scale_test_gen( + params, op, intrinsic_template, instruction_template + ) + ) + + return generated_items + + # Append complete list of intrinsics and instructions we've generated tests for. # Generate set of checks to verify that that we did generate sensible set of # tests for the given combination of PTX and SM variants. @@ -1545,7 +1875,9 @@ def gen_tests(): items += gen_stmatrix_tests() items += gen_wmma_mma_tests() items += gen_mma_tests() + items += gen_mma_block_scale_tests() items += gen_mma_sp_tests() + items += gen_mma_sp_block_scale_tests() gen_check_unsupported_ops(items) |
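For reference, the block-scale templates added to wmma.py above expand into paired intrinsic/instruction names. The sketch below substitutes one combination that the variant checks accept (geom m16n8k32, kind mxf8f6f4, the .scale_vec::1X form, stype ue8m0); the ${intrinsic_signature} and ${ptx_signature} fields are left as placeholders here, since their real values come from mma_signature(op) and mma_ptx_signature(op) for the chosen fragment types.

from string import Template

# The two templates exactly as introduced in gen_mma_block_scale_tests().
intrinsic_tmpl = Template(
    "llvm.nvvm.mma.block.scale.${geom}.row.col.${kind}${scale}.${intrinsic_signature}.${stype}"
)
instruction_tmpl = Template(
    "mma.sync.aligned.${geom}.row.col.kind::${kind}.block_scale${scale_vec_size}.${ptx_signature}.${stype}"
)

scale_vec_size = ".scale_vec::1X"
params = {
    "geom": "m16n8k32",
    "kind": "mxf8f6f4",
    "scale_vec_size": scale_vec_size,
    # Same rewrite the generator applies: ".scale_vec::1X" -> ".scale.1x"
    "scale": scale_vec_size.replace("_vec::", ".").lower(),
    "stype": "ue8m0",
    # Placeholder signatures; the generator derives these from the fragment types.
    "intrinsic_signature": "<sig>",
    "ptx_signature": "<sig>",
}

# Prints:
#   llvm.nvvm.mma.block.scale.m16n8k32.row.col.mxf8f6f4.scale.1x.<sig>.ue8m0
#   mma.sync.aligned.m16n8k32.row.col.kind::mxf8f6f4.block_scale.scale_vec::1X.<sig>.ue8m0
print(intrinsic_tmpl.substitute(params))
print(instruction_tmpl.substitute(params))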

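On the used-bytes-mask.ll test earlier in this change: the 0xfff in the MASK check line follows directly from the masked load's operands, since three of the four i32 lanes are enabled and each lane spans four bytes, so the low twelve bytes of the 16-byte vector load are marked used. A minimal sketch of that arithmetic, assuming one mask bit per byte with lane i covering bytes [4*i, 4*i+4); the helper below is illustrative only and not part of the backend.

# Illustrative helper: build a used_bytes_mask-style value from a per-lane
# mask, one bit per byte, with lane i occupying bytes [i*elt_bytes, (i+1)*elt_bytes).
def used_bytes_mask(lane_mask, elt_bytes):
    mask = 0
    for i, enabled in enumerate(lane_mask):
        if enabled:
            mask |= ((1 << elt_bytes) - 1) << (elt_bytes * i)
    return mask

# <4 x i1> <i1 true, i1 true, i1 true, i1 false> over i32 lanes -> 0xfff
print(hex(used_bytes_mask([True, True, True, False], 4)))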