Diffstat (limited to 'llvm/test/CodeGen/NVPTX')
-rw-r--r--  llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll | 40
-rw-r--r--  llvm/test/CodeGen/NVPTX/bf16-instructions.ll | 356
-rw-r--r--  llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll | 12
-rw-r--r--  llvm/test/CodeGen/NVPTX/bswap.ll | 74
-rw-r--r--  llvm/test/CodeGen/NVPTX/bug22246.ll | 17
-rw-r--r--  llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll | 11
-rw-r--r--  llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll | 278
-rw-r--r--  llvm/test/CodeGen/NVPTX/convert-sm80.ll | 65
-rw-r--r--  llvm/test/CodeGen/NVPTX/convert-sm89.ll | 7
-rw-r--r--  llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll | 46
-rw-r--r--  llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll | 4
-rw-r--r--  llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll | 4
-rw-r--r--  llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll | 4
-rw-r--r--  llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll | 4
-rw-r--r--  llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll | 4
-rw-r--r--  llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll | 68
-rw-r--r--  llvm/test/CodeGen/NVPTX/f16-ex2.ll | 40
-rw-r--r--  llvm/test/CodeGen/NVPTX/f32-ex2.ll | 7
-rw-r--r--  llvm/test/CodeGen/NVPTX/fast-math.ll | 64
-rw-r--r--  llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll | 27
-rw-r--r--  llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll | 51
-rw-r--r--  llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll | 8
-rw-r--r--  llvm/test/CodeGen/NVPTX/fence-proxy.ll | 15
-rw-r--r--  llvm/test/CodeGen/NVPTX/i1-select.ll | 77
-rw-r--r--  llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 12
-rw-r--r--  llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll | 16
-rw-r--r--  llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll | 28
-rw-r--r--  llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 57
-rw-r--r--  llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir | 12
-rw-r--r--  llvm/test/CodeGen/NVPTX/masked-load-3xhalf.ll | 84
-rw-r--r--  llvm/test/CodeGen/NVPTX/masked-load-vectors.ll | 366
-rw-r--r--  llvm/test/CodeGen/NVPTX/masked-store-variable-mask.ll | 56
-rw-r--r--  llvm/test/CodeGen/NVPTX/masked-store-vectors-256.ll | 318
-rw-r--r--  llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll | 247
-rw-r--r--  llvm/test/CodeGen/NVPTX/op-fence.ll | 17
-rw-r--r--  llvm/test/CodeGen/NVPTX/param-add.ll | 5
-rw-r--r--  llvm/test/CodeGen/NVPTX/param-vectorize-device.ll | 6
-rw-r--r--  llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir | 4
-rw-r--r--  llvm/test/CodeGen/NVPTX/switch-loop-header.mir | 182
-rw-r--r--  llvm/test/CodeGen/NVPTX/switch.ll | 73
-rw-r--r--  llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll | 50
-rw-r--r--  llvm/test/CodeGen/NVPTX/thread-fence.ll | 31
-rw-r--r--  llvm/test/CodeGen/NVPTX/used-bytes-mask.ll | 38
-rw-r--r--  llvm/test/CodeGen/NVPTX/wmma-ptx88-sm120a.py | 12
-rw-r--r--  llvm/test/CodeGen/NVPTX/wmma.py | 336
45 files changed, 2925 insertions(+), 308 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index dd9a472..19ec257 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -45,29 +45,31 @@ define half @fh(ptr %p) {
; ENABLED-LABEL: fh(
; ENABLED: {
; ENABLED-NEXT: .reg .b16 %rs<10>;
-; ENABLED-NEXT: .reg .b32 %r<13>;
+; ENABLED-NEXT: .reg .b32 %r<17>;
; ENABLED-NEXT: .reg .b64 %rd<2>;
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0];
-; ENABLED-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
-; ENABLED-NEXT: ld.b16 %rs5, [%rd1+8];
-; ENABLED-NEXT: cvt.f32.f16 %r1, %rs2;
-; ENABLED-NEXT: cvt.f32.f16 %r2, %rs1;
-; ENABLED-NEXT: add.rn.f32 %r3, %r2, %r1;
-; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r3;
-; ENABLED-NEXT: cvt.f32.f16 %r4, %rs4;
-; ENABLED-NEXT: cvt.f32.f16 %r5, %rs3;
-; ENABLED-NEXT: add.rn.f32 %r6, %r5, %r4;
-; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r6;
-; ENABLED-NEXT: cvt.f32.f16 %r7, %rs7;
-; ENABLED-NEXT: cvt.f32.f16 %r8, %rs6;
-; ENABLED-NEXT: add.rn.f32 %r9, %r8, %r7;
-; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r9;
-; ENABLED-NEXT: cvt.f32.f16 %r10, %rs8;
-; ENABLED-NEXT: cvt.f32.f16 %r11, %rs5;
-; ENABLED-NEXT: add.rn.f32 %r12, %r10, %r11;
-; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r12;
+; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; ENABLED-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
+; ENABLED-NEXT: mov.b32 {%rs2, %rs3}, %r2;
+; ENABLED-NEXT: mov.b32 {%rs4, %rs5}, %r1;
+; ENABLED-NEXT: cvt.f32.f16 %r5, %rs5;
+; ENABLED-NEXT: cvt.f32.f16 %r6, %rs4;
+; ENABLED-NEXT: add.rn.f32 %r7, %r6, %r5;
+; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r7;
+; ENABLED-NEXT: cvt.f32.f16 %r8, %rs3;
+; ENABLED-NEXT: cvt.f32.f16 %r9, %rs2;
+; ENABLED-NEXT: add.rn.f32 %r10, %r9, %r8;
+; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r10;
+; ENABLED-NEXT: cvt.f32.f16 %r11, %rs7;
+; ENABLED-NEXT: cvt.f32.f16 %r12, %rs6;
+; ENABLED-NEXT: add.rn.f32 %r13, %r12, %r11;
+; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r13;
+; ENABLED-NEXT: cvt.f32.f16 %r14, %rs8;
+; ENABLED-NEXT: cvt.f32.f16 %r15, %rs1;
+; ENABLED-NEXT: add.rn.f32 %r16, %r14, %r15;
+; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r16;
; ENABLED-NEXT: st.param.b16 [func_retval0], %rs9;
; ENABLED-NEXT: ret;
;
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index 4d930cd..41f77b5 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -2,6 +2,7 @@
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM70 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM90-FTZ %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s
; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %}
; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %}
@@ -55,13 +56,24 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0];
; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fadd(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0];
+; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1];
+; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fadd(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -118,13 +130,24 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0];
; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fsub(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0];
+; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1];
+; SM90-FTZ-NEXT: sub.rn.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fsub(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -195,16 +218,27 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_faddx2_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_faddx2_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_faddx2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
+; SM90-FTZ-NEXT: add.rn.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_faddx2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
@@ -275,16 +309,27 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsubx2_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsubx2_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fsubx2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
+; SM90-FTZ-NEXT: sub.rn.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fsubx2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
@@ -355,16 +400,27 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmulx2_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmulx2_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fmulx2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
+; SM90-FTZ-NEXT: mul.rn.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fmulx2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
@@ -441,16 +497,34 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fdiv(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<5>;
+; SM90-FTZ-NEXT: .reg .b32 %r<8>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0];
+; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1];
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM90-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM90-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4;
+; SM90-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fdiv(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<5>;
@@ -527,10 +601,21 @@ define float @test_fpext_float(bfloat %a) #0 {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fpext_float(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0];
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fpext_float(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -585,6 +670,17 @@ define bfloat @test_fptrunc_float(float %a) #0 {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fptrunc_float(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fptrunc_float_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %r1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fptrunc_float(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -637,12 +733,23 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r2, %r1, 0f3F800000;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fadd_imm_1(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
+; SM90-FTZ-NEXT: mov.b16 %rs2, 0x3F80;
+; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fadd_imm_1(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -661,17 +768,18 @@ define bfloat @test_select_cc_bf16_f64(double %a, double %b, bfloat %c, bfloat %
; CHECK-LABEL: test_select_cc_bf16_f64(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_bf16_f64_param_0];
; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_bf16_f64_param_1];
; CHECK-NEXT: setp.lt.f64 %p1, %rd1, %rd2;
-; CHECK-NEXT: ld.param.b16 %rs1, [test_select_cc_bf16_f64_param_2];
-; CHECK-NEXT: ld.param.b16 %rs2, [test_select_cc_bf16_f64_param_3];
-; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: mov.b64 %rd3, test_select_cc_bf16_f64_param_3;
+; CHECK-NEXT: mov.b64 %rd4, test_select_cc_bf16_f64_param_2;
+; CHECK-NEXT: selp.b64 %rd5, %rd4, %rd3, %p1;
+; CHECK-NEXT: ld.param.b16 %rs1, [%rd5];
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
; CHECK-NEXT: ret;
%cc = fcmp olt double %a, %b
%r = select i1 %cc, bfloat %c, bfloat %d
@@ -750,18 +858,43 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4;
; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1;
; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs8;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r6, %rs7;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r7, %rs6;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r8, %rs5;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r9, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r10, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r11, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r12, %rs1;
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9};
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_extload_bf16x8(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<9>;
+; SM90-FTZ-NEXT: .reg .b32 %r<13>;
+; SM90-FTZ-NEXT: .reg .b64 %rd<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
+; SM90-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; SM90-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4;
+; SM90-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; SM90-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1;
+; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9};
+; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_extload_bf16x8(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<9>;
@@ -825,12 +958,24 @@ define i16 @test_fptosi_i16(bfloat %a) {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: cvt.rzi.ftz.s16.f32 %rs2, %r1;
; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fptosi_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rzi.s16.bf16 %rs2, %rs1;
+; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fptosi_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -880,12 +1025,24 @@ define i16 @test_fptoui_i16(bfloat %a) {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: cvt.rzi.ftz.u16.f32 %rs2, %r1;
; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fptoui_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rzi.u16.bf16 %rs2, %rs1;
+; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fptoui_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -945,6 +1102,16 @@ define bfloat @test_sitofp_i16(i16 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_sitofp_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.s16 %rs2, %rs1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_sitofp_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -1002,6 +1169,16 @@ define bfloat @test_uitofp_i8(i8 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i8(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i8(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -1070,6 +1247,21 @@ define bfloat @test_uitofp_i1(i1 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i1(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .pred %p<2>;
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0];
+; SM90-FTZ-NEXT: and.b16 %rs2, %rs1, 1;
+; SM90-FTZ-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; SM90-FTZ-NEXT: selp.b32 %r1, 1, 0, %p1;
+; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs3, %r1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i1(
; SM90: {
; SM90-NEXT: .reg .pred %p<2>;
@@ -1132,6 +1324,16 @@ define bfloat @test_uitofp_i16(i16 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -1188,6 +1390,17 @@ define bfloat @test_uitofp_i32(i32 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i32(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs1, %r1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i32(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1248,6 +1461,17 @@ define bfloat @test_uitofp_i64(i64 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i64(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b64 %rd<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u64 %rs1, %rd1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i64(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1302,12 +1526,22 @@ define bfloat @test_roundeven(bfloat %a) {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: cvt.rni.ftz.f32.f32 %r2, %r1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_roundeven(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0];
+; SM90-FTZ-NEXT: cvt.rni.bf16.bf16 %rs2, %rs1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_roundeven(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -1372,6 +1606,17 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_maximum(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maximum_param_0];
+; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maximum_param_1];
+; SM90-FTZ-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_maximum(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -1430,6 +1675,17 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_maxnum(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0];
+; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1];
+; SM90-FTZ-NEXT: max.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_maxnum(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -1511,6 +1767,17 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_maximum_v2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_1];
+; SM90-FTZ-NEXT: max.NaN.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_maximum_v2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
@@ -1583,6 +1850,17 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_maxnum_v2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_1];
+; SM90-FTZ-NEXT: max.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_maxnum_v2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index 3c6fb4b..c19e665 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -229,16 +229,18 @@ define <2 x bfloat> @test_select(<2 x bfloat> %a, <2 x bfloat> %b, i1 zeroext %c
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT: ld.param.b32 %r1, [test_select_param_0];
-; CHECK-NEXT: ld.param.b32 %r2, [test_select_param_1];
-; CHECK-NEXT: selp.b32 %r3, %r1, %r2, %p1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: mov.b64 %rd1, test_select_param_1;
+; CHECK-NEXT: mov.b64 %rd2, test_select_param_0;
+; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1;
+; CHECK-NEXT: ld.param.b32 %r1, [%rd3];
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%r = select i1 %c, <2 x bfloat> %a, <2 x bfloat> %b
ret <2 x bfloat> %r
diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll
index e3d1c80..8050c6f 100644
--- a/llvm/test/CodeGen/NVPTX/bswap.ll
+++ b/llvm/test/CodeGen/NVPTX/bswap.ll
@@ -1,25 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | FileCheck -check-prefixes CHECK,PTX70 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; RUN: %if ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | %ptxas-verify %}
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | FileCheck -check-prefixes CHECK,PTX71 %s
-; RUN: %if ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %}
target triple = "nvptx64-nvidia-cuda"
define i16 @bswap16(i16 %a) {
; CHECK-LABEL: bswap16(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<5>;
-; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b16 %rs1, [bswap16_param_0];
-; CHECK-NEXT: shr.u16 %rs2, %rs1, 8;
-; CHECK-NEXT: shl.b16 %rs3, %rs1, 8;
-; CHECK-NEXT: or.b16 %rs4, %rs3, %rs2;
-; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ld.param.b16 %r1, [bswap16_param_0];
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7701U;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%b = tail call i16 @llvm.bswap.i16(i16 %a)
ret i16 %b
@@ -56,40 +49,39 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
}
define i64 @bswap64(i64 %a) {
-; PTX70-LABEL: bswap64(
-; PTX70: {
-; PTX70-NEXT: .reg .b32 %r<5>;
-; PTX70-NEXT: .reg .b64 %rd<3>;
-; PTX70-EMPTY:
-; PTX70-NEXT: // %bb.0:
-; PTX70-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
-; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
-; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 0x123U;
-; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
-; PTX70-NEXT: prmt.b32 %r4, %r3, 0, 0x123U;
-; PTX70-NEXT: mov.b64 %rd2, {%r4, %r2};
-; PTX70-NEXT: st.param.b64 [func_retval0], %rd2;
-; PTX70-NEXT: ret;
-;
-; PTX71-LABEL: bswap64(
-; PTX71: {
-; PTX71-NEXT: .reg .b32 %r<5>;
-; PTX71-NEXT: .reg .b64 %rd<3>;
-; PTX71-EMPTY:
-; PTX71-NEXT: // %bb.0:
-; PTX71-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
-; PTX71-NEXT: mov.b64 {%r1, _}, %rd1;
-; PTX71-NEXT: prmt.b32 %r2, %r1, 0, 0x123U;
-; PTX71-NEXT: mov.b64 {_, %r3}, %rd1;
-; PTX71-NEXT: prmt.b32 %r4, %r3, 0, 0x123U;
-; PTX71-NEXT: mov.b64 %rd2, {%r4, %r2};
-; PTX71-NEXT: st.param.b64 [func_retval0], %rd2;
-; PTX71-NEXT: ret;
+; CHECK-LABEL: bswap64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x123U;
+; CHECK-NEXT: prmt.b32 %r4, %r2, 0, 0x123U;
+; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%b = tail call i64 @llvm.bswap.i64(i64 %a)
ret i64 %b
}
+define <2 x i32> @bswapv2i32(<2 x i32> %a) {
+; CHECK-LABEL: bswapv2i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [bswapv2i32_param_0];
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x123U;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x123U;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NEXT: ret;
+ %b = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+ ret <2 x i32> %b
+}
declare i16 @llvm.bswap.i16(i16)
declare i32 @llvm.bswap.i32(i32)
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>)
declare i64 @llvm.bswap.i64(i64)
+declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)
diff --git a/llvm/test/CodeGen/NVPTX/bug22246.ll b/llvm/test/CodeGen/NVPTX/bug22246.ll
index 198878c..1d7a396 100644
--- a/llvm/test/CodeGen/NVPTX/bug22246.ll
+++ b/llvm/test/CodeGen/NVPTX/bug22246.ll
@@ -9,19 +9,20 @@ define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, ptr noc
; CHECK-LABEL: _Z3foobbbPb(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
-; CHECK-NEXT: .reg .b16 %rs<7>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b8 %rs1, [_Z3foobbbPb_param_0];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT: ld.param.b8 %rs3, [_Z3foobbbPb_param_1];
-; CHECK-NEXT: ld.param.b8 %rs4, [_Z3foobbbPb_param_2];
-; CHECK-NEXT: selp.b16 %rs5, %rs3, %rs4, %p1;
-; CHECK-NEXT: and.b16 %rs6, %rs5, 1;
-; CHECK-NEXT: ld.param.b64 %rd1, [_Z3foobbbPb_param_3];
-; CHECK-NEXT: st.b8 [%rd1], %rs6;
+; CHECK-NEXT: mov.b64 %rd1, _Z3foobbbPb_param_2;
+; CHECK-NEXT: mov.b64 %rd2, _Z3foobbbPb_param_1;
+; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1;
+; CHECK-NEXT: ld.param.b8 %rs3, [%rd3];
+; CHECK-NEXT: and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT: ld.param.b64 %rd4, [_Z3foobbbPb_param_3];
+; CHECK-NEXT: st.b8 [%rd4], %rs4;
; CHECK-NEXT: ret;
entry:
%.sink.v = select i1 %p1, i1 %p2, i1 %p3
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll
new file mode 100644
index 0000000..4d81fdc
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s -o /dev/null 2>&1 | FileCheck %s
+
+; Test that we get a clear error message when using an unsupported syncscope.
+
+; CHECK: NVPTX backend does not support syncscope "agent"
+; CHECK: Supported syncscopes are: singlethread, <empty string>, block, cluster, device
+define i32 @cmpxchg_unsupported_syncscope_agent(ptr %addr, i32 %cmp, i32 %new) {
+ %result = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("agent") monotonic monotonic
+ %value = extractvalue { i32, i1 } %result, 0
+ ret i32 %value
+}
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll b/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll
new file mode 100644
index 0000000..b773c8d
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx81 | FileCheck %s
+; RUN: %if ptxas-sm_80 && ptxas-isa-8.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx81 | %ptxas-verify -arch=sm_80 %}
+
+; CHECK-LABEL: cvt_rna_satfinite_tf32_f32
+define i32 @cvt_rna_satfinite_tf32_f32(float %f1) {
+; CHECK-LABEL: cvt_rna_satfinite_tf32_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rna_satfinite_tf32_f32_param_0];
+; CHECK-NEXT: cvt.rna.satfinite.tf32.f32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i32 @llvm.nvvm.f2tf32.rna.satfinite(float %f1)
+ ret i32 %val
+}
+
+define <2 x bfloat> @cvt_rn_bf16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rn_bf16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_bf16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_bf16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rn.satfinite.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float %f1, float %f2)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_rn_relu_bf16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rn_relu_bf16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_bf16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_relu_bf16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rn.relu.satfinite.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float %f1, float %f2)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_rz_bf16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rz_bf16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_bf16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_bf16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rz.satfinite.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float %f1, float %f2)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_rz_relu_bf16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rz_relu_bf16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_bf16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_relu_bf16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rz.relu.satfinite.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float %f1, float %f2)
+ ret <2 x bfloat> %val
+}
+
+declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float, float)
+declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float, float)
+declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float, float)
+declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float, float)
+
+define <2 x half> @cvt_rn_f16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rn_f16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_f16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rn.satfinite.f16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x half> @llvm.nvvm.ff2f16x2.rn.satfinite(float %f1, float %f2)
+ ret <2 x half> %val
+}
+
+define <2 x half> @cvt_rn_relu_f16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rn_relu_f16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_relu_f16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rn.relu.satfinite.f16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu.satfinite(float %f1, float %f2)
+ ret <2 x half> %val
+}
+
+define <2 x half> @cvt_rz_f16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rz_f16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_f16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_f16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rz.satfinite.f16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x half> @llvm.nvvm.ff2f16x2.rz.satfinite(float %f1, float %f2)
+ ret <2 x half> %val
+}
+
+define <2 x half> @cvt_rz_relu_f16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rz_relu_f16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_f16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_relu_f16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rz.relu.satfinite.f16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu.satfinite(float %f1, float %f2)
+ ret <2 x half> %val
+}
+
+declare <2 x half> @llvm.nvvm.ff2f16x2.rn.satfinite(float, float)
+declare <2 x half> @llvm.nvvm.ff2f16x2.rn.relu.satfinite(float, float)
+declare <2 x half> @llvm.nvvm.ff2f16x2.rz.satfinite(float, float)
+declare <2 x half> @llvm.nvvm.ff2f16x2.rz.relu.satfinite(float, float)
+
+define bfloat @cvt_rn_bf16_f32_sf(float %f1) {
+; CHECK-LABEL: cvt_rn_bf16_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_bf16_f32_sf_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.bf16.f32 %rs1, %r1;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+ %val = call bfloat @llvm.nvvm.f2bf16.rn.satfinite(float %f1)
+ ret bfloat %val
+}
+
+define bfloat @cvt_rn_relu_bf16_f32_sf(float %f1) {
+; CHECK-LABEL: cvt_rn_relu_bf16_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_bf16_f32_sf_param_0];
+; CHECK-NEXT: cvt.rn.relu.satfinite.bf16.f32 %rs1, %r1;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+ %val = call bfloat @llvm.nvvm.f2bf16.rn.relu.satfinite(float %f1)
+ ret bfloat %val
+}
+
+define bfloat @cvt_rz_bf16_f32_sf(float %f1) {
+; CHECK-LABEL: cvt_rz_bf16_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_bf16_f32_sf_param_0];
+; CHECK-NEXT: cvt.rz.satfinite.bf16.f32 %rs1, %r1;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+ %val = call bfloat @llvm.nvvm.f2bf16.rz.satfinite(float %f1)
+ ret bfloat %val
+}
+
+define bfloat @cvt_rz_relu_bf16_f32_sf(float %f1) {
+; CHECK-LABEL: cvt_rz_relu_bf16_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_bf16_f32_sf_param_0];
+; CHECK-NEXT: cvt.rz.relu.satfinite.bf16.f32 %rs1, %r1;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+ %val = call bfloat @llvm.nvvm.f2bf16.rz.relu.satfinite(float %f1)
+ ret bfloat %val
+}
+
+declare bfloat @llvm.nvvm.f2bf16.rn.satfinite(float)
+declare bfloat @llvm.nvvm.f2bf16.rn.relu.satfinite(float)
+declare bfloat @llvm.nvvm.f2bf16.rz.satfinite(float)
+declare bfloat @llvm.nvvm.f2bf16.rz.relu.satfinite(float)
+
+define half @cvt_rn_f16_f32_sf(float %f1) {
+; CHECK-LABEL: cvt_rn_f16_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16_f32_sf_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.f16.f32 %rs1, %r1;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+ %val = call half @llvm.nvvm.f2f16.rn.satfinite(float %f1)
+ ret half %val
+}
+
+define half @cvt_rn_relu_f16_f32_sf(float %f1) {
+; CHECK-LABEL: cvt_rn_relu_f16_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16_f32_sf_param_0];
+; CHECK-NEXT: cvt.rn.relu.satfinite.f16.f32 %rs1, %r1;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+ %val = call half @llvm.nvvm.f2f16.rn.relu.satfinite(float %f1)
+ ret half %val
+}
+
+define half @cvt_rz_f16_f32_sf(float %f1) {
+; CHECK-LABEL: cvt_rz_f16_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_f16_f32_sf_param_0];
+; CHECK-NEXT: cvt.rz.satfinite.f16.f32 %rs1, %r1;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+ %val = call half @llvm.nvvm.f2f16.rz.satfinite(float %f1)
+ ret half %val
+}
+
+define half @cvt_rz_relu_f16_f32_sf(float %f1) {
+; CHECK-LABEL: cvt_rz_relu_f16_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_f16_f32_sf_param_0];
+; CHECK-NEXT: cvt.rz.relu.satfinite.f16.f32 %rs1, %r1;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+ %val = call half @llvm.nvvm.f2f16.rz.relu.satfinite(float %f1)
+ ret half %val
+}
+
+declare half @llvm.nvvm.f2f16.rn.satfinite(float)
+declare half @llvm.nvvm.f2f16.rn.relu.satfinite(float)
+declare half @llvm.nvvm.f2f16.rz.satfinite(float)
+declare half @llvm.nvvm.f2f16.rz.relu.satfinite(float)
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80.ll b/llvm/test/CodeGen/NVPTX/convert-sm80.ll
index edf1739..a47bbab 100644
--- a/llvm/test/CodeGen/NVPTX/convert-sm80.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-sm80.ll
@@ -198,6 +198,71 @@ declare bfloat @llvm.nvvm.f2bf16.rn.relu(float)
declare bfloat @llvm.nvvm.f2bf16.rz(float)
declare bfloat @llvm.nvvm.f2bf16.rz.relu(float)
+define half @cvt_rn_f16_f32(float %f1) {
+; CHECK-LABEL: cvt_rn_f16_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16_f32_param_0];
+; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %r1;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+ %val = call half @llvm.nvvm.f2f16.rn(float %f1)
+ ret half %val
+}
+
+define half @cvt_rn_relu_f16_f32(float %f1) {
+; CHECK-LABEL: cvt_rn_relu_f16_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16_f32_param_0];
+; CHECK-NEXT: cvt.rn.relu.f16.f32 %rs1, %r1;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+ %val = call half @llvm.nvvm.f2f16.rn.relu(float %f1)
+ ret half %val
+}
+
+define half @cvt_rz_f16_f32(float %f1) {
+; CHECK-LABEL: cvt_rz_f16_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_f16_f32_param_0];
+; CHECK-NEXT: cvt.rz.f16.f32 %rs1, %r1;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+ %val = call half @llvm.nvvm.f2f16.rz(float %f1)
+ ret half %val
+}
+
+define half @cvt_rz_relu_f16_f32(float %f1) {
+; CHECK-LABEL: cvt_rz_relu_f16_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_f16_f32_param_0];
+; CHECK-NEXT: cvt.rz.relu.f16.f32 %rs1, %r1;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
+ %val = call half @llvm.nvvm.f2f16.rz.relu(float %f1)
+ ret half %val
+}
+
+declare half @llvm.nvvm.f2f16.rn(float)
+declare half @llvm.nvvm.f2f16.rn.relu(float)
+declare half @llvm.nvvm.f2f16.rz(float)
+declare half @llvm.nvvm.f2f16.rz.relu(float)
+
define i32 @cvt_rna_tf32_f32(float %f1) {
; CHECK-LABEL: cvt_rna_tf32_f32(
; CHECK: {
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm89.ll b/llvm/test/CodeGen/NVPTX/convert-sm89.ll
index 616dcfa..170c120 100644
--- a/llvm/test/CodeGen/NVPTX/convert-sm89.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-sm89.ll
@@ -84,10 +84,3 @@ define <2 x half> @cvt_rn_relu_f16x2_e5m2x2(i16 %in) {
%val = call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 %in);
ret <2 x half> %val
}
-
-; CHECK-LABEL: cvt_rna_satfinite_tf32_f32
-define i32 @cvt_rna_satfinite_tf32_f32(float %f1) {
-; CHECK: cvt.rna.satfinite.tf32.f32
- %val = call i32 @llvm.nvvm.f2tf32.rna.satfinite(float %f1)
- ret i32 %val
-}
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll
new file mode 100644
index 0000000..9872b2a
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| FileCheck --check-prefixes=CHECK,CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i64, i1)
+
+define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr addrspace(3) %dst, i32 %size, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_g2s(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_g2s_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_3];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_g2s_param_4];
+; CHECK-PTX64-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2];
+; CHECK-PTX64-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rd4;
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_g2s(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_g2s_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_g2s_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_4];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%r2], [%rd1], %r3, [%r1];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rd2;
+; CHECK-PTX-SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i64 %ch, i1 0)
+ tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1)
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
index b5c43fd2..d653895 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
index 57342dc..5de1ac8 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll
index 6296d5a..2f5c1ef 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll
index e5ae387..a2b2c2f 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll
index 7d04ada..e4c48dd 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
index b0fe77c..727bb3b 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
target triple = "nvptx64-nvidia-cuda"
@@ -29,10 +33,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1}], [%rd2], %rs1;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -48,10 +52,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3}], [%r2], %rs1;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
@@ -79,10 +83,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -99,10 +103,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_2d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
@@ -131,10 +135,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -152,10 +156,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
@@ -185,10 +189,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -207,10 +211,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_4d_param_6];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
@@ -241,10 +245,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6];
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -264,10 +268,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
@@ -297,10 +301,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1};
+; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1};
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -319,10 +323,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1};
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1};
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
@@ -354,10 +358,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
+; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -378,10 +382,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
@@ -415,10 +419,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
+; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -441,10 +445,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/f16-ex2.ll b/llvm/test/CodeGen/NVPTX/f16-ex2.ll
index ee79f9d..af3fe67 100644
--- a/llvm/test/CodeGen/NVPTX/f16-ex2.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-ex2.ll
@@ -1,12 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s
-; RUN: %if ptxas-sm_75 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK-FP16 %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
target triple = "nvptx64-nvidia-cuda"
declare half @llvm.nvvm.ex2.approx.f16(half)
-declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>)
+declare <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half>)
+declare bfloat @llvm.nvvm.ex2.approx.ftz.bf16(bfloat)
+declare <2 x bfloat> @llvm.nvvm.ex2.approx.ftz.v2bf16(<2 x bfloat>)
-; CHECK-LABEL: ex2_half
define half @ex2_half(half %0) {
; CHECK-FP16-LABEL: ex2_half(
; CHECK-FP16: {
@@ -21,7 +22,6 @@ define half @ex2_half(half %0) {
ret half %res
}
-; CHECK-LABEL: ex2_2xhalf
define <2 x half> @ex2_2xhalf(<2 x half> %0) {
; CHECK-FP16-LABEL: ex2_2xhalf(
; CHECK-FP16: {
@@ -32,6 +32,34 @@ define <2 x half> @ex2_2xhalf(<2 x half> %0) {
; CHECK-FP16-NEXT: ex2.approx.f16x2 %r2, %r1;
; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-FP16-NEXT: ret;
- %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0)
+ %res = call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> %0)
ret <2 x half> %res
}
+
+define bfloat @ex2_bfloat(bfloat %0) {
+; CHECK-FP16-LABEL: ex2_bfloat(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .b16 %rs<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0:
+; CHECK-FP16-NEXT: ld.param.b16 %rs1, [ex2_bfloat_param_0];
+; CHECK-FP16-NEXT: ex2.approx.ftz.bf16 %rs2, %rs1;
+; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-FP16-NEXT: ret;
+ %res = call bfloat @llvm.nvvm.ex2.approx.ftz.bf16(bfloat %0)
+ ret bfloat %res
+}
+
+define <2 x bfloat> @ex2_2xbfloat(<2 x bfloat> %0) {
+; CHECK-FP16-LABEL: ex2_2xbfloat(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .b32 %r<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0:
+; CHECK-FP16-NEXT: ld.param.b32 %r1, [ex2_2xbfloat_param_0];
+; CHECK-FP16-NEXT: ex2.approx.ftz.bf16x2 %r2, %r1;
+; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-FP16-NEXT: ret;
+ %res = call <2 x bfloat> @llvm.nvvm.ex2.approx.ftz.v2bf16(<2 x bfloat> %0)
+ ret <2 x bfloat> %res
+}
diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
index 796d80d..97b9d35 100644
--- a/llvm/test/CodeGen/NVPTX/f32-ex2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
@@ -3,7 +3,8 @@
; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %}
target triple = "nvptx-nvidia-cuda"
-declare float @llvm.nvvm.ex2.approx.f(float)
+declare float @llvm.nvvm.ex2.approx.f32(float)
+declare float @llvm.nvvm.ex2.approx.ftz.f32(float)
; CHECK-LABEL: ex2_float
define float @ex2_float(float %0) {
@@ -16,7 +17,7 @@ define float @ex2_float(float %0) {
; CHECK-NEXT: ex2.approx.f32 %r2, %r1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
- %res = call float @llvm.nvvm.ex2.approx.f(float %0)
+ %res = call float @llvm.nvvm.ex2.approx.f32(float %0)
ret float %res
}
@@ -31,6 +32,6 @@ define float @ex2_float_ftz(float %0) {
; CHECK-NEXT: ex2.approx.ftz.f32 %r2, %r1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
- %res = call float @llvm.nvvm.ex2.approx.ftz.f(float %0)
+ %res = call float @llvm.nvvm.ex2.approx.ftz.f32(float %0)
ret float %res
}
diff --git a/llvm/test/CodeGen/NVPTX/fast-math.ll b/llvm/test/CodeGen/NVPTX/fast-math.ll
index 8561c60..7e778c4 100644
--- a/llvm/test/CodeGen/NVPTX/fast-math.ll
+++ b/llvm/test/CodeGen/NVPTX/fast-math.ll
@@ -312,18 +312,20 @@ define float @repeated_div_recip_allowed_sel(i1 %pred, float %a, float %b, float
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [repeated_div_recip_allowed_sel_param_0];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT: ld.param.b32 %r1, [repeated_div_recip_allowed_sel_param_1];
-; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_recip_allowed_sel_param_2];
-; CHECK-NEXT: selp.f32 %r3, %r1, %r2, %p1;
-; CHECK-NEXT: ld.param.b32 %r4, [repeated_div_recip_allowed_sel_param_3];
-; CHECK-NEXT: div.rn.f32 %r5, %r3, %r4;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: mov.b64 %rd1, repeated_div_recip_allowed_sel_param_2;
+; CHECK-NEXT: mov.b64 %rd2, repeated_div_recip_allowed_sel_param_1;
+; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1;
+; CHECK-NEXT: ld.param.b32 %r1, [%rd3];
+; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_recip_allowed_sel_param_3];
+; CHECK-NEXT: div.rn.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
%x = fdiv arcp float %a, %divisor
%y = fdiv arcp float %b, %divisor
@@ -364,18 +366,20 @@ define float @repeated_div_recip_allowed_ftz_sel(i1 %pred, float %a, float %b, f
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [repeated_div_recip_allowed_ftz_sel_param_0];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT: ld.param.b32 %r1, [repeated_div_recip_allowed_ftz_sel_param_1];
-; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_recip_allowed_ftz_sel_param_2];
-; CHECK-NEXT: selp.f32 %r3, %r1, %r2, %p1;
-; CHECK-NEXT: ld.param.b32 %r4, [repeated_div_recip_allowed_ftz_sel_param_3];
-; CHECK-NEXT: div.rn.ftz.f32 %r5, %r3, %r4;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: mov.b64 %rd1, repeated_div_recip_allowed_ftz_sel_param_2;
+; CHECK-NEXT: mov.b64 %rd2, repeated_div_recip_allowed_ftz_sel_param_1;
+; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1;
+; CHECK-NEXT: ld.param.b32 %r1, [%rd3];
+; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_recip_allowed_ftz_sel_param_3];
+; CHECK-NEXT: div.rn.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
%x = fdiv arcp float %a, %divisor
%y = fdiv arcp float %b, %divisor
@@ -416,18 +420,20 @@ define float @repeated_div_fast_sel(i1 %pred, float %a, float %b, float %divisor
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [repeated_div_fast_sel_param_0];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT: ld.param.b32 %r1, [repeated_div_fast_sel_param_1];
-; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_fast_sel_param_2];
-; CHECK-NEXT: selp.f32 %r3, %r1, %r2, %p1;
-; CHECK-NEXT: ld.param.b32 %r4, [repeated_div_fast_sel_param_3];
-; CHECK-NEXT: div.approx.f32 %r5, %r3, %r4;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: mov.b64 %rd1, repeated_div_fast_sel_param_2;
+; CHECK-NEXT: mov.b64 %rd2, repeated_div_fast_sel_param_1;
+; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1;
+; CHECK-NEXT: ld.param.b32 %r1, [%rd3];
+; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_fast_sel_param_3];
+; CHECK-NEXT: div.approx.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
%x = fdiv afn float %a, %divisor
%y = fdiv afn float %b, %divisor
@@ -468,18 +474,20 @@ define float @repeated_div_fast_ftz_sel(i1 %pred, float %a, float %b, float %div
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [repeated_div_fast_ftz_sel_param_0];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT: ld.param.b32 %r1, [repeated_div_fast_ftz_sel_param_1];
-; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_fast_ftz_sel_param_2];
-; CHECK-NEXT: selp.f32 %r3, %r1, %r2, %p1;
-; CHECK-NEXT: ld.param.b32 %r4, [repeated_div_fast_ftz_sel_param_3];
-; CHECK-NEXT: div.approx.ftz.f32 %r5, %r3, %r4;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: mov.b64 %rd1, repeated_div_fast_ftz_sel_param_2;
+; CHECK-NEXT: mov.b64 %rd2, repeated_div_fast_ftz_sel_param_1;
+; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1;
+; CHECK-NEXT: ld.param.b32 %r1, [%rd3];
+; CHECK-NEXT: ld.param.b32 %r2, [repeated_div_fast_ftz_sel_param_3];
+; CHECK-NEXT: div.approx.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
%x = fdiv afn float %a, %divisor
%y = fdiv afn float %b, %divisor
diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll
new file mode 100644
index 0000000..d46408e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | %ptxas-verify -arch=sm_90 %}
+
+define void @test_nvvm_fence_proxy_async_generic_acquire_sync_restrict() {
+; CHECK-LABEL: test_nvvm_fence_proxy_async_generic_acquire_sync_restrict(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster()
+ ret void
+}
+
+define void @test_nvvm_fence_proxy_async_generic_release_sync_restrict() {
+; CHECK-LABEL: test_nvvm_fence_proxy_async_generic_release_sync_restrict(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster()
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll
new file mode 100644
index 0000000..896c624
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
+
+define void @test_nvvm_fence_proxy_async() {
+; CHECK-LABEL: test_nvvm_fence_proxy_async(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.proxy.async;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.fence.proxy.async()
+ ret void
+}
+
+define void @test_nvvm_fence_proxy_async_global() {
+; CHECK-LABEL: test_nvvm_fence_proxy_async_global(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.proxy.async.global;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.fence.proxy.async.global()
+ ret void
+}
+
+define void @test_nvvm_fence_proxy_async_shared_cluster() {
+; CHECK-LABEL: test_nvvm_fence_proxy_async_shared_cluster(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.proxy.async.shared::cluster;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.fence.proxy.async.shared_cluster()
+ ret void
+}
+
+define void @test_nvvm_fence_proxy_async_shared_cta() {
+; CHECK-LABEL: test_nvvm_fence_proxy_async_shared_cta(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.proxy.async.shared::cta;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.fence.proxy.async.shared_cta()
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll
new file mode 100644
index 0000000..ab35e4f
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll
@@ -0,0 +1,8 @@
+; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx83 -o /dev/null 2>&1 | FileCheck %s
+
+define void @test_fence_proxy_tensormap_generic_acquire(ptr addrspace(0) %addr) {
+ ; CHECK: immarg value 130 out of range [128, 129)
+ call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr addrspace(0) %addr, i32 130);
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy.ll b/llvm/test/CodeGen/NVPTX/fence-proxy.ll
new file mode 100644
index 0000000..cb5679e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-proxy.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx75 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_70 && ptxas-isa-7.5 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx75 | %ptxas-verify -arch=sm_70 %}
+
+define void @test_nvvm_fence_proxy_alias() {
+; CHECK-LABEL: test_nvvm_fence_proxy_alias(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.proxy.alias;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.fence.proxy.alias()
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll
index 264f380..c91a3df 100644
--- a/llvm/test/CodeGen/NVPTX/i1-select.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-select.ll
@@ -8,21 +8,24 @@ define i32 @test_select_i1_trunc(i32 %a, i32 %b, i32 %c, i32 %true, i32 %false)
; CHECK-LABEL: test_select_i1_trunc(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b32 %r<10>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_trunc_param_0];
; CHECK-NEXT: and.b32 %r2, %r1, 1;
; CHECK-NEXT: setp.ne.b32 %p1, %r2, 0;
-; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_trunc_param_1];
-; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_trunc_param_2];
-; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_trunc_param_3];
-; CHECK-NEXT: selp.b32 %r6, %r3, %r4, %p1;
-; CHECK-NEXT: and.b32 %r7, %r6, 1;
-; CHECK-NEXT: setp.ne.b32 %p2, %r7, 0;
-; CHECK-NEXT: ld.param.b32 %r8, [test_select_i1_trunc_param_4];
-; CHECK-NEXT: selp.b32 %r9, %r5, %r8, %p2;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
+; CHECK-NEXT: mov.b64 %rd1, test_select_i1_trunc_param_2;
+; CHECK-NEXT: mov.b64 %rd2, test_select_i1_trunc_param_1;
+; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1;
+; CHECK-NEXT: ld.param.b32 %r3, [%rd3];
+; CHECK-NEXT: and.b32 %r4, %r3, 1;
+; CHECK-NEXT: setp.ne.b32 %p2, %r4, 0;
+; CHECK-NEXT: mov.b64 %rd4, test_select_i1_trunc_param_4;
+; CHECK-NEXT: mov.b64 %rd5, test_select_i1_trunc_param_3;
+; CHECK-NEXT: selp.b64 %rd6, %rd5, %rd4, %p2;
+; CHECK-NEXT: ld.param.b32 %r5, [%rd6];
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NEXT: ret;
%a_trunc = trunc i32 %a to i1
%b_trunc = trunc i32 %b to i1
@@ -36,23 +39,25 @@ define i32 @test_select_i1_trunc_2(i64 %a, i16 %b, i32 %c, i32 %true, i32 %false
; CHECK-LABEL: test_select_i1_trunc_2(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<5>;
-; CHECK-NEXT: .reg .b32 %r<4>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_select_i1_trunc_2_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 1;
; CHECK-NEXT: setp.ne.b64 %p1, %rd2, 0;
-; CHECK-NEXT: ld.param.b16 %rs1, [test_select_i1_trunc_2_param_1];
-; CHECK-NEXT: ld.param.b16 %rs2, [test_select_i1_trunc_2_param_2];
-; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_trunc_2_param_3];
-; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
-; CHECK-NEXT: and.b16 %rs4, %rs3, 1;
-; CHECK-NEXT: setp.ne.b16 %p2, %rs4, 0;
-; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_trunc_2_param_4];
-; CHECK-NEXT: selp.b32 %r3, %r1, %r2, %p2;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: mov.b64 %rd3, test_select_i1_trunc_2_param_2;
+; CHECK-NEXT: mov.b64 %rd4, test_select_i1_trunc_2_param_1;
+; CHECK-NEXT: selp.b64 %rd5, %rd4, %rd3, %p1;
+; CHECK-NEXT: ld.param.b16 %rs1, [%rd5];
+; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT: setp.ne.b16 %p2, %rs2, 0;
+; CHECK-NEXT: mov.b64 %rd6, test_select_i1_trunc_2_param_4;
+; CHECK-NEXT: mov.b64 %rd7, test_select_i1_trunc_2_param_3;
+; CHECK-NEXT: selp.b64 %rd8, %rd7, %rd6, %p2;
+; CHECK-NEXT: ld.param.b32 %r1, [%rd8];
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%a_trunc = trunc i64 %a to i1
%b_trunc = trunc i16 %b to i1
@@ -66,7 +71,8 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
; CHECK-LABEL: test_select_i1_basic(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<4>;
-; CHECK-NEXT: .reg .b32 %r<10>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-NEXT: .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_param_0];
@@ -75,13 +81,14 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
; CHECK-NEXT: setp.ne.b32 %p1, %r1, 0;
; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_param_2];
; CHECK-NEXT: setp.eq.b32 %p2, %r4, 0;
-; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_param_3];
; CHECK-NEXT: setp.eq.b32 %p3, %r3, 0;
-; CHECK-NEXT: ld.param.b32 %r6, [test_select_i1_basic_param_4];
-; CHECK-NEXT: selp.b32 %r7, %r5, %r6, %p2;
-; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1;
-; CHECK-NEXT: selp.b32 %r9, %r5, %r8, %p3;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
+; CHECK-NEXT: mov.b64 %rd1, test_select_i1_basic_param_4;
+; CHECK-NEXT: mov.b64 %rd2, test_select_i1_basic_param_3;
+; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p2;
+; CHECK-NEXT: selp.b64 %rd4, %rd3, %rd1, %p1;
+; CHECK-NEXT: selp.b64 %rd5, %rd2, %rd4, %p3;
+; CHECK-NEXT: ld.param.b32 %r5, [%rd5];
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NEXT: ret;
%b1 = icmp eq i32 %v1, 0
%b2 = icmp eq i32 %v2, 0
@@ -95,7 +102,8 @@ define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i
; CHECK-LABEL: test_select_i1_basic_folding(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<11>;
-; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_folding_param_0];
@@ -105,16 +113,17 @@ define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i
; CHECK-NEXT: setp.eq.b32 %p3, %r2, 0;
; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_2];
; CHECK-NEXT: setp.eq.b32 %p4, %r3, 0;
-; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_folding_param_3];
; CHECK-NEXT: xor.pred %p5, %p1, %p3;
-; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_4];
; CHECK-NEXT: and.pred %p6, %p5, %p4;
; CHECK-NEXT: and.pred %p7, %p2, %p4;
; CHECK-NEXT: and.pred %p8, %p3, %p6;
; CHECK-NEXT: or.pred %p9, %p8, %p7;
; CHECK-NEXT: xor.pred %p10, %p9, %p3;
-; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p10;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: mov.b64 %rd1, test_select_i1_basic_folding_param_4;
+; CHECK-NEXT: mov.b64 %rd2, test_select_i1_basic_folding_param_3;
+; CHECK-NEXT: selp.b64 %rd3, %rd2, %rd1, %p10;
+; CHECK-NEXT: ld.param.b32 %r4, [%rd3];
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NEXT: ret;
%b1 = icmp eq i32 %v1, 0
%b2 = icmp eq i32 %v2, 0
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 40d6a07..bfac2b4 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -1442,16 +1442,18 @@ define <4 x i8> @test_select(<4 x i8> %a, <4 x i8> %b, i1 zeroext %c) #0 {
; O3: {
; O3-NEXT: .reg .pred %p<2>;
; O3-NEXT: .reg .b16 %rs<3>;
-; O3-NEXT: .reg .b32 %r<4>;
+; O3-NEXT: .reg .b32 %r<2>;
+; O3-NEXT: .reg .b64 %rd<4>;
; O3-EMPTY:
; O3-NEXT: // %bb.0:
; O3-NEXT: ld.param.b8 %rs1, [test_select_param_2];
; O3-NEXT: and.b16 %rs2, %rs1, 1;
; O3-NEXT: setp.ne.b16 %p1, %rs2, 0;
-; O3-NEXT: ld.param.b32 %r1, [test_select_param_0];
-; O3-NEXT: ld.param.b32 %r2, [test_select_param_1];
-; O3-NEXT: selp.b32 %r3, %r1, %r2, %p1;
-; O3-NEXT: st.param.b32 [func_retval0], %r3;
+; O3-NEXT: mov.b64 %rd1, test_select_param_1;
+; O3-NEXT: mov.b64 %rd2, test_select_param_0;
+; O3-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1;
+; O3-NEXT: ld.param.b32 %r1, [%rd3];
+; O3-NEXT: st.param.b32 [func_retval0], %r1;
; O3-NEXT: ret;
%r = select i1 %c, <4 x i8> %a, <4 x i8> %b
ret <4 x i8> %r
diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
index 3fac29f..d219493 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
@@ -346,19 +346,15 @@ define i32 @ld_global_v8i32(ptr addrspace(1) %ptr) {
; SM100-LABEL: ld_global_v8i32(
; SM100: {
; SM100-NEXT: .reg .b32 %r<16>;
-; SM100-NEXT: .reg .b64 %rd<6>;
+; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v8i32_param_0];
-; SM100-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
-; SM100-NEXT: mov.b64 {%r1, %r2}, %rd5;
-; SM100-NEXT: mov.b64 {%r3, %r4}, %rd4;
-; SM100-NEXT: mov.b64 {%r5, %r6}, %rd3;
-; SM100-NEXT: mov.b64 {%r7, %r8}, %rd2;
-; SM100-NEXT: add.s32 %r9, %r7, %r8;
-; SM100-NEXT: add.s32 %r10, %r5, %r6;
-; SM100-NEXT: add.s32 %r11, %r3, %r4;
-; SM100-NEXT: add.s32 %r12, %r1, %r2;
+; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
+; SM100-NEXT: add.s32 %r9, %r1, %r2;
+; SM100-NEXT: add.s32 %r10, %r3, %r4;
+; SM100-NEXT: add.s32 %r11, %r5, %r6;
+; SM100-NEXT: add.s32 %r12, %r7, %r8;
; SM100-NEXT: add.s32 %r13, %r9, %r10;
; SM100-NEXT: add.s32 %r14, %r11, %r12;
; SM100-NEXT: add.s32 %r15, %r13, %r14;
diff --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
index 297b2b9..ad78e0f 100644
--- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -20,19 +20,19 @@ entry:
; IR-LABEL: @memcpy_caller
; IR: entry:
; IR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
-; IR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+; IR: br i1 [[Cond]], label %dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion
-; IR: loop-memcpy-expansion:
-; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; IR: dynamic-memcpy-expansion-main-body:
+; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %dynamic-memcpy-expansion-main-body ]
; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, ptr %src, i64 %loop-index
; IR: [[Load:%[0-9]+]] = load i8, ptr [[SrcGep]]
; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 %loop-index
; IR: store i8 [[Load]], ptr [[DstGep]]
; IR: [[IndexInc]] = add i64 %loop-index, 1
; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
-; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+; IR: br i1 [[Cond2]], label %dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion
-; IR-LABEL: post-loop-memcpy-expansion:
+; IR-LABEL: dynamic-memcpy-post-expansion:
; IR: ret ptr %dst
; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_caller
@@ -53,19 +53,19 @@ entry:
; IR-LABEL: @memcpy_volatile_caller
; IR: entry:
; IR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
-; IR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+; IR: br i1 [[Cond]], label %dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion
-; IR: loop-memcpy-expansion:
-; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; IR: dynamic-memcpy-expansion-main-body:
+; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %dynamic-memcpy-expansion-main-body ]
; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, ptr %src, i64 %loop-index
; IR: [[Load:%[0-9]+]] = load volatile i8, ptr [[SrcGep]]
; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 %loop-index
; IR: store volatile i8 [[Load]], ptr [[DstGep]]
; IR: [[IndexInc]] = add i64 %loop-index, 1
; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
-; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+; IR: br i1 [[Cond2]], label %dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion
-; IR-LABEL: post-loop-memcpy-expansion:
+; IR-LABEL: dynamic-memcpy-post-expansion:
; IR: ret ptr %dst
@@ -97,16 +97,16 @@ entry:
; Check that calls with compile-time constant size are handled correctly
; IR-LABEL: @memcpy_known_size
; IR: entry:
-; IR: br label %load-store-loop
-; IR: load-store-loop:
-; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
+; IR: br label %static-memcpy-expansion-main-body
+; IR: static-memcpy-expansion-main-body:
+; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %static-memcpy-expansion-main-body ]
; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, ptr %src, i64 %loop-index
; IR: [[Load:%[0-9]+]] = load i8, ptr [[SrcGep]]
; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 %loop-index
; IR: store i8 [[Load]], ptr [[DstGep]]
; IR: [[IndexInc]] = add i64 %loop-index, 1
; IR: [[Cond:%[0-9]+]] = icmp ult i64 %3, 144
-; IR: br i1 [[Cond]], label %load-store-loop, label %memcpy-split
+; IR: br i1 [[Cond]], label %static-memcpy-expansion-main-body, label %static-memcpy-post-expansion
}
define ptr @memset_caller(ptr %dst, i32 %c, i64 %n) #0 {
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 21257e2..ca2914a 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -584,44 +584,25 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; COPY-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4
; COPY-NEXT: ret void
;
-; PTX_60-LABEL: test_select(
-; PTX_60: {
-; PTX_60-NEXT: .reg .pred %p<2>;
-; PTX_60-NEXT: .reg .b16 %rs<3>;
-; PTX_60-NEXT: .reg .b32 %r<4>;
-; PTX_60-NEXT: .reg .b64 %rd<3>;
-; PTX_60-EMPTY:
-; PTX_60-NEXT: // %bb.0: // %bb
-; PTX_60-NEXT: ld.param.b8 %rs1, [test_select_param_3];
-; PTX_60-NEXT: and.b16 %rs2, %rs1, 1;
-; PTX_60-NEXT: setp.ne.b16 %p1, %rs2, 0;
-; PTX_60-NEXT: ld.param.b64 %rd1, [test_select_param_2];
-; PTX_60-NEXT: cvta.to.global.u64 %rd2, %rd1;
-; PTX_60-NEXT: ld.param.b32 %r1, [test_select_param_1];
-; PTX_60-NEXT: ld.param.b32 %r2, [test_select_param_0];
-; PTX_60-NEXT: selp.b32 %r3, %r2, %r1, %p1;
-; PTX_60-NEXT: st.global.b32 [%rd2], %r3;
-; PTX_60-NEXT: ret;
-;
-; PTX_70-LABEL: test_select(
-; PTX_70: {
-; PTX_70-NEXT: .reg .pred %p<2>;
-; PTX_70-NEXT: .reg .b16 %rs<3>;
-; PTX_70-NEXT: .reg .b32 %r<2>;
-; PTX_70-NEXT: .reg .b64 %rd<6>;
-; PTX_70-EMPTY:
-; PTX_70-NEXT: // %bb.0: // %bb
-; PTX_70-NEXT: ld.param.b8 %rs1, [test_select_param_3];
-; PTX_70-NEXT: and.b16 %rs2, %rs1, 1;
-; PTX_70-NEXT: setp.ne.b16 %p1, %rs2, 0;
-; PTX_70-NEXT: mov.b64 %rd1, test_select_param_0;
-; PTX_70-NEXT: ld.param.b64 %rd2, [test_select_param_2];
-; PTX_70-NEXT: cvta.to.global.u64 %rd3, %rd2;
-; PTX_70-NEXT: mov.b64 %rd4, test_select_param_1;
-; PTX_70-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1;
-; PTX_70-NEXT: ld.param.b32 %r1, [%rd5];
-; PTX_70-NEXT: st.global.b32 [%rd3], %r1;
-; PTX_70-NEXT: ret;
+; PTX-LABEL: test_select(
+; PTX: {
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b16 %rs<3>;
+; PTX-NEXT: .reg .b32 %r<2>;
+; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %bb
+; PTX-NEXT: ld.param.b8 %rs1, [test_select_param_3];
+; PTX-NEXT: and.b16 %rs2, %rs1, 1;
+; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; PTX-NEXT: mov.b64 %rd1, test_select_param_0;
+; PTX-NEXT: ld.param.b64 %rd2, [test_select_param_2];
+; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT: mov.b64 %rd4, test_select_param_1;
+; PTX-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1;
+; PTX-NEXT: ld.param.b32 %r1, [%rd5];
+; PTX-NEXT: st.global.b32 [%rd3], %r1;
+; PTX-NEXT: ret;
bb:
%ptrnew = select i1 %cond, ptr %input1, ptr %input2
%valloaded = load i32, ptr %ptrnew, align 4
diff --git a/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
index 0b2d856..4be91df 100644
--- a/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
+++ b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
@@ -26,10 +26,10 @@ body: |
; CHECK: bb.0.entry:
; CHECK-NEXT: successors: %bb.2(0x30000000), %bb.3(0x50000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[LD_i32_:%[0-9]+]]:b32 = LD_i32 0, 0, 101, 3, 32, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101)
- ; CHECK-NEXT: [[LD_i64_:%[0-9]+]]:b64 = LD_i64 0, 0, 101, 3, 64, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101)
+ ; CHECK-NEXT: [[LD_i32_:%[0-9]+]]:b32 = LD_i32 0, 0, 101, 3, 32, -1, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101)
+ ; CHECK-NEXT: [[LD_i64_:%[0-9]+]]:b64 = LD_i64 0, 0, 101, 3, 64, -1, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101)
; CHECK-NEXT: [[ADD64ri:%[0-9]+]]:b64 = nuw ADD64ri killed [[LD_i64_]], 2
- ; CHECK-NEXT: [[LD_i32_1:%[0-9]+]]:b32 = LD_i32 0, 0, 1, 3, 32, [[ADD64ri]], 0
+ ; CHECK-NEXT: [[LD_i32_1:%[0-9]+]]:b32 = LD_i32 0, 0, 1, 3, 32, -1, [[ADD64ri]], 0
; CHECK-NEXT: [[SETP_i32ri:%[0-9]+]]:b1 = SETP_i32ri [[LD_i32_]], 0, 0
; CHECK-NEXT: CBranch killed [[SETP_i32ri]], %bb.2
; CHECK-NEXT: {{ $}}
@@ -54,10 +54,10 @@ body: |
bb.0.entry:
successors: %bb.2(0x30000000), %bb.1(0x50000000)
- %5:b32 = LD_i32 0, 0, 101, 3, 32, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101)
- %6:b64 = LD_i64 0, 0, 101, 3, 64, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101)
+ %5:b32 = LD_i32 0, 0, 101, 3, 32, -1, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101)
+ %6:b64 = LD_i64 0, 0, 101, 3, 64, -1, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101)
%0:b64 = nuw ADD64ri killed %6, 2
- %1:b32 = LD_i32 0, 0, 1, 3, 32, %0, 0
+ %1:b32 = LD_i32 0, 0, 1, 3, 32, -1, %0, 0
%7:b1 = SETP_i32ri %5, 0, 0
CBranch killed %7, %bb.2
GOTO %bb.1
diff --git a/llvm/test/CodeGen/NVPTX/masked-load-3xhalf.ll b/llvm/test/CodeGen/NVPTX/masked-load-3xhalf.ll
new file mode 100644
index 0000000..bba240c
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/masked-load-3xhalf.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s
+; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}
+
+; This tests the lowering of the case from LoadStoreVectorizer/NVPTX/4x2xhalf.ll
+; where two 3xhalf accesses are chained together and extended to 8xhalf.
+define void @halfx3_extend_chain(ptr align 16 captures(none) %rd0) {
+; CHECK-LABEL: halfx3_extend_chain(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<7>;
+; CHECK-NEXT: .reg .b32 %r<12>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [halfx3_extend_chain_param_0];
+; CHECK-NEXT: .pragma "used_bytes_mask 0xfff";
+; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; CHECK-NEXT: mov.b32 {_, %rs3}, %r2;
+; CHECK-NEXT: mov.b32 %r5, {%rs3, %rs1};
+; CHECK-NEXT: mov.b32 %r6, {%rs2, %rs4};
+; CHECK-NEXT: mov.b32 %r7, 0;
+; CHECK-NEXT: max.f16x2 %r8, %r2, %r7;
+; CHECK-NEXT: max.f16x2 %r9, %r1, %r7;
+; CHECK-NEXT: st.b32 [%rd1], %r9;
+; CHECK-NEXT: mov.b32 {%rs5, _}, %r8;
+; CHECK-NEXT: st.b16 [%rd1+4], %rs5;
+; CHECK-NEXT: max.f16x2 %r10, %r6, %r7;
+; CHECK-NEXT: max.f16x2 %r11, %r5, %r7;
+; CHECK-NEXT: st.b32 [%rd1+6], %r11;
+; CHECK-NEXT: mov.b32 {%rs6, _}, %r10;
+; CHECK-NEXT: st.b16 [%rd1+10], %rs6;
+; CHECK-NEXT: ret;
+ %load1 = load <3 x half>, ptr %rd0, align 16
+ %p1 = fcmp ogt <3 x half> %load1, zeroinitializer
+ %s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer
+ store <3 x half> %s1, ptr %rd0, align 16
+ %in2 = getelementptr half, ptr %rd0, i64 3
+ %load2 = load <3 x half>, ptr %in2, align 4
+ %p2 = fcmp ogt <3 x half> %load2, zeroinitializer
+ %s2 = select <3 x i1> %p2, <3 x half> %load2, <3 x half> zeroinitializer
+ store <3 x half> %s2, ptr %in2, align 4
+ ret void
+}
+
+; Reducing the alignment disables the vectorization here.
+define void @halfx3_no_align(ptr align 4 captures(none) %rd0) {
+; CHECK-LABEL: halfx3_no_align(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<7>;
+; CHECK-NEXT: .reg .b32 %r<10>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [halfx3_no_align_param_0];
+; CHECK-NEXT: ld.b16 %rs1, [%rd1+4];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-NEXT: ld.b32 %r2, [%rd1];
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: max.f16x2 %r4, %r1, %r3;
+; CHECK-NEXT: max.f16x2 %r5, %r2, %r3;
+; CHECK-NEXT: st.b32 [%rd1], %r5;
+; CHECK-NEXT: mov.b32 {%rs3, _}, %r4;
+; CHECK-NEXT: st.b16 [%rd1+4], %rs3;
+; CHECK-NEXT: ld.b16 %rs4, [%rd1+10];
+; CHECK-NEXT: mov.b32 %r6, {%rs4, %rs5};
+; CHECK-NEXT: ld.b32 %r7, [%rd1+6];
+; CHECK-NEXT: max.f16x2 %r8, %r6, %r3;
+; CHECK-NEXT: max.f16x2 %r9, %r7, %r3;
+; CHECK-NEXT: st.b32 [%rd1+6], %r9;
+; CHECK-NEXT: mov.b32 {%rs6, _}, %r8;
+; CHECK-NEXT: st.b16 [%rd1+10], %rs6;
+; CHECK-NEXT: ret;
+ %load1 = load <3 x half>, ptr %rd0, align 4
+ %p1 = fcmp ogt <3 x half> %load1, zeroinitializer
+ %s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer
+ store <3 x half> %s1, ptr %rd0, align 4
+ %in2 = getelementptr half, ptr %rd0, i64 3
+ %load2 = load <3 x half>, ptr %in2, align 4
+ %p2 = fcmp ogt <3 x half> %load2, zeroinitializer
+ %s2 = select <3 x i1> %p2, <3 x half> %load2, <3 x half> zeroinitializer
+ store <3 x half> %s2, ptr %in2, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/masked-load-vectors.ll b/llvm/test/CodeGen/NVPTX/masked-load-vectors.ll
new file mode 100644
index 0000000..3f72ffe
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/masked-load-vectors.ll
@@ -0,0 +1,366 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK,SM90
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx88 | %ptxas-verify -arch=sm_90 %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK,SM100
+; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}
+
+
+; Different architectures are tested in this file for the following reasons:
+; - SM90 does not have 256-bit load/store instructions
+; - SM90 does not have masked store instructions
+; - SM90 does not support packed f32x2 instructions
+
+define void @global_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; SM90-LABEL: global_8xi32(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0];
+; SM90-NEXT: .pragma "used_bytes_mask 0xf000";
+; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
+; SM90-NEXT: .pragma "used_bytes_mask 0xf0f";
+; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
+; SM90-NEXT: ld.param.b64 %rd2, [global_8xi32_param_1];
+; SM90-NEXT: st.global.b32 [%rd2], %r5;
+; SM90-NEXT: st.global.b32 [%rd2+8], %r7;
+; SM90-NEXT: st.global.b32 [%rd2+28], %r4;
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: global_8xi32(
+; SM100: {
+; SM100-NEXT: .reg .b32 %r<9>;
+; SM100-NEXT: .reg .b64 %rd<3>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0];
+; SM100-NEXT: .pragma "used_bytes_mask 0xf0000f0f";
+; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
+; SM100-NEXT: ld.param.b64 %rd2, [global_8xi32_param_1];
+; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, _, %r3, _, _, _, _, %r8};
+; SM100-NEXT: ret;
+ %a.load = tail call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 32 %a, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison)
+ tail call void @llvm.masked.store.v8i32.p1(<8 x i32> %a.load, ptr addrspace(1) align 32 %b, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>)
+ ret void
+}
+
+; Masked stores are only supported for 32-bit element types,
+; while masked loads are supported for all element types.
+define void @global_16xi16(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; SM90-LABEL: global_16xi16(
+; SM90: {
+; SM90-NEXT: .reg .b16 %rs<7>;
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [global_16xi16_param_0];
+; SM90-NEXT: .pragma "used_bytes_mask 0xf000";
+; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
+; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; SM90-NEXT: .pragma "used_bytes_mask 0xf0f";
+; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
+; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r7;
+; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r5;
+; SM90-NEXT: ld.param.b64 %rd2, [global_16xi16_param_1];
+; SM90-NEXT: st.global.b16 [%rd2], %rs5;
+; SM90-NEXT: st.global.b16 [%rd2+2], %rs6;
+; SM90-NEXT: st.global.b16 [%rd2+8], %rs3;
+; SM90-NEXT: st.global.b16 [%rd2+10], %rs4;
+; SM90-NEXT: st.global.b16 [%rd2+28], %rs1;
+; SM90-NEXT: st.global.b16 [%rd2+30], %rs2;
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: global_16xi16(
+; SM100: {
+; SM100-NEXT: .reg .b16 %rs<7>;
+; SM100-NEXT: .reg .b32 %r<9>;
+; SM100-NEXT: .reg .b64 %rd<3>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [global_16xi16_param_0];
+; SM100-NEXT: .pragma "used_bytes_mask 0xf0000f0f";
+; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
+; SM100-NEXT: mov.b32 {%rs1, %rs2}, %r8;
+; SM100-NEXT: mov.b32 {%rs3, %rs4}, %r3;
+; SM100-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; SM100-NEXT: ld.param.b64 %rd2, [global_16xi16_param_1];
+; SM100-NEXT: st.global.b16 [%rd2], %rs5;
+; SM100-NEXT: st.global.b16 [%rd2+2], %rs6;
+; SM100-NEXT: st.global.b16 [%rd2+8], %rs3;
+; SM100-NEXT: st.global.b16 [%rd2+10], %rs4;
+; SM100-NEXT: st.global.b16 [%rd2+28], %rs1;
+; SM100-NEXT: st.global.b16 [%rd2+30], %rs2;
+; SM100-NEXT: ret;
+ %a.load = tail call <16 x i16> @llvm.masked.load.v16i16.p1(ptr addrspace(1) align 32 %a, <16 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true>, <16 x i16> poison)
+ tail call void @llvm.masked.store.v16i16.p1(<16 x i16> %a.load, ptr addrspace(1) align 32 %b, <16 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true>)
+ ret void
+}
+
+define void @global_8xi32_no_align(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; CHECK-LABEL: global_8xi32_no_align(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [global_8xi32_no_align_param_0];
+; CHECK-NEXT: ld.global.b32 %r1, [%rd1];
+; CHECK-NEXT: ld.param.b64 %rd2, [global_8xi32_no_align_param_1];
+; CHECK-NEXT: ld.global.b32 %r2, [%rd1+8];
+; CHECK-NEXT: ld.global.b32 %r3, [%rd1+28];
+; CHECK-NEXT: st.global.b32 [%rd2], %r1;
+; CHECK-NEXT: st.global.b32 [%rd2+8], %r2;
+; CHECK-NEXT: st.global.b32 [%rd2+28], %r3;
+; CHECK-NEXT: ret;
+ %a.load = tail call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 16 %a, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison)
+ tail call void @llvm.masked.store.v8i32.p1(<8 x i32> %a.load, ptr addrspace(1) align 16 %b, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>)
+ ret void
+}
+
+
+define void @global_8xi32_invariant(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; SM90-LABEL: global_8xi32_invariant(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [global_8xi32_invariant_param_0];
+; SM90-NEXT: .pragma "used_bytes_mask 0xf000";
+; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
+; SM90-NEXT: .pragma "used_bytes_mask 0xf0f";
+; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
+; SM90-NEXT: ld.param.b64 %rd2, [global_8xi32_invariant_param_1];
+; SM90-NEXT: st.global.b32 [%rd2], %r5;
+; SM90-NEXT: st.global.b32 [%rd2+8], %r7;
+; SM90-NEXT: st.global.b32 [%rd2+28], %r4;
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: global_8xi32_invariant(
+; SM100: {
+; SM100-NEXT: .reg .b32 %r<9>;
+; SM100-NEXT: .reg .b64 %rd<3>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [global_8xi32_invariant_param_0];
+; SM100-NEXT: .pragma "used_bytes_mask 0xf0000f0f";
+; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
+; SM100-NEXT: ld.param.b64 %rd2, [global_8xi32_invariant_param_1];
+; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, _, %r3, _, _, _, _, %r8};
+; SM100-NEXT: ret;
+ %a.load = tail call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 32 %a, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison), !invariant.load !0
+ tail call void @llvm.masked.store.v8i32.p1(<8 x i32> %a.load, ptr addrspace(1) align 32 %b, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>)
+ ret void
+}
+
+define void @global_2xi16(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; CHECK-LABEL: global_2xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [global_2xi16_param_0];
+; CHECK-NEXT: .pragma "used_bytes_mask 0x3";
+; CHECK-NEXT: ld.global.b32 %r1, [%rd1];
+; CHECK-NEXT: ld.param.b64 %rd2, [global_2xi16_param_1];
+; CHECK-NEXT: mov.b32 {%rs1, _}, %r1;
+; CHECK-NEXT: st.global.b16 [%rd2], %rs1;
+; CHECK-NEXT: ret;
+ %a.load = tail call <2 x i16> @llvm.masked.load.v2i16.p1(ptr addrspace(1) align 4 %a, <2 x i1> <i1 true, i1 false>, <2 x i16> poison)
+ tail call void @llvm.masked.store.v2i16.p1(<2 x i16> %a.load, ptr addrspace(1) align 4 %b, <2 x i1> <i1 true, i1 false>)
+ ret void
+}
+
+define void @global_2xi16_invariant(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; CHECK-LABEL: global_2xi16_invariant(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [global_2xi16_invariant_param_0];
+; CHECK-NEXT: .pragma "used_bytes_mask 0x3";
+; CHECK-NEXT: ld.global.nc.b32 %r1, [%rd1];
+; CHECK-NEXT: ld.param.b64 %rd2, [global_2xi16_invariant_param_1];
+; CHECK-NEXT: mov.b32 {%rs1, _}, %r1;
+; CHECK-NEXT: st.global.b16 [%rd2], %rs1;
+; CHECK-NEXT: ret;
+ %a.load = tail call <2 x i16> @llvm.masked.load.v2i16.p1(ptr addrspace(1) align 4 %a, <2 x i1> <i1 true, i1 false>, <2 x i16> poison), !invariant.load !0
+ tail call void @llvm.masked.store.v2i16.p1(<2 x i16> %a.load, ptr addrspace(1) align 4 %b, <2 x i1> <i1 true, i1 false>)
+ ret void
+}
+
+define void @global_2xi16_no_align(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; CHECK-LABEL: global_2xi16_no_align(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [global_2xi16_no_align_param_0];
+; CHECK-NEXT: ld.global.b16 %rs1, [%rd1];
+; CHECK-NEXT: ld.param.b64 %rd2, [global_2xi16_no_align_param_1];
+; CHECK-NEXT: st.global.b16 [%rd2], %rs1;
+; CHECK-NEXT: ret;
+ %a.load = tail call <2 x i16> @llvm.masked.load.v2i16.p1(ptr addrspace(1) align 2 %a, <2 x i1> <i1 true, i1 false>, <2 x i16> poison)
+ tail call void @llvm.masked.store.v2i16.p1(<2 x i16> %a.load, ptr addrspace(1) align 4 %b, <2 x i1> <i1 true, i1 false>)
+ ret void
+}
+
+define void @global_4xi8(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; CHECK-LABEL: global_4xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [global_4xi8_param_0];
+; CHECK-NEXT: .pragma "used_bytes_mask 0x5";
+; CHECK-NEXT: ld.global.b32 %r1, [%rd1];
+; CHECK-NEXT: ld.param.b64 %rd2, [global_4xi8_param_1];
+; CHECK-NEXT: st.global.b8 [%rd2], %r1;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7772U;
+; CHECK-NEXT: st.global.b8 [%rd2+2], %r2;
+; CHECK-NEXT: ret;
+ %a.load = tail call <4 x i8> @llvm.masked.load.v4i8.p1(ptr addrspace(1) align 4 %a, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i8> poison)
+ tail call void @llvm.masked.store.v4i8.p1(<4 x i8> %a.load, ptr addrspace(1) align 4 %b, <4 x i1> <i1 true, i1 false, i1 true, i1 false>)
+ ret void
+}
+
+define void @global_4xi8_invariant(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; CHECK-LABEL: global_4xi8_invariant(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [global_4xi8_invariant_param_0];
+; CHECK-NEXT: .pragma "used_bytes_mask 0x5";
+; CHECK-NEXT: ld.global.nc.b32 %r1, [%rd1];
+; CHECK-NEXT: ld.param.b64 %rd2, [global_4xi8_invariant_param_1];
+; CHECK-NEXT: st.global.b8 [%rd2], %r1;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7772U;
+; CHECK-NEXT: st.global.b8 [%rd2+2], %r2;
+; CHECK-NEXT: ret;
+ %a.load = tail call <4 x i8> @llvm.masked.load.v4i8.p1(ptr addrspace(1) align 4 %a, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i8> poison), !invariant.load !0
+ tail call void @llvm.masked.store.v4i8.p1(<4 x i8> %a.load, ptr addrspace(1) align 4 %b, <4 x i1> <i1 true, i1 false, i1 true, i1 false>)
+ ret void
+}
+
+define void @global_4xi8_no_align(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; CHECK-LABEL: global_4xi8_no_align(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [global_4xi8_no_align_param_0];
+; CHECK-NEXT: ld.global.b8 %rs1, [%rd1];
+; CHECK-NEXT: ld.param.b64 %rd2, [global_4xi8_no_align_param_1];
+; CHECK-NEXT: ld.global.b8 %rs2, [%rd1+2];
+; CHECK-NEXT: st.global.b8 [%rd2], %rs1;
+; CHECK-NEXT: st.global.b8 [%rd2+2], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = tail call <4 x i8> @llvm.masked.load.v4i8.p1(ptr addrspace(1) align 2 %a, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i8> poison)
+ tail call void @llvm.masked.store.v4i8.p1(<4 x i8> %a.load, ptr addrspace(1) align 4 %b, <4 x i1> <i1 true, i1 false, i1 true, i1 false>)
+ ret void
+}
+
+; On sm_100 and later, we pack 2xf32 loads into a single b64 load during lowering
+define void @global_2xf32(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; SM90-LABEL: global_2xf32(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [global_2xf32_param_0];
+; SM90-NEXT: .pragma "used_bytes_mask 0xf";
+; SM90-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd1];
+; SM90-NEXT: ld.param.b64 %rd2, [global_2xf32_param_1];
+; SM90-NEXT: st.global.b32 [%rd2], %r1;
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: global_2xf32(
+; SM100: {
+; SM100-NEXT: .reg .b32 %r<2>;
+; SM100-NEXT: .reg .b64 %rd<4>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [global_2xf32_param_0];
+; SM100-NEXT: .pragma "used_bytes_mask 0xf";
+; SM100-NEXT: ld.global.b64 %rd2, [%rd1];
+; SM100-NEXT: ld.param.b64 %rd3, [global_2xf32_param_1];
+; SM100-NEXT: mov.b64 {%r1, _}, %rd2;
+; SM100-NEXT: st.global.b32 [%rd3], %r1;
+; SM100-NEXT: ret;
+ %a.load = tail call <2 x float> @llvm.masked.load.v2f32.p1(ptr addrspace(1) align 8 %a, <2 x i1> <i1 true, i1 false>, <2 x float> poison)
+ tail call void @llvm.masked.store.v2f32.p1(<2 x float> %a.load, ptr addrspace(1) align 8 %b, <2 x i1> <i1 true, i1 false>)
+ ret void
+}
+
+define void @global_2xf32_invariant(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; SM90-LABEL: global_2xf32_invariant(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [global_2xf32_invariant_param_0];
+; SM90-NEXT: .pragma "used_bytes_mask 0xf";
+; SM90-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd1];
+; SM90-NEXT: ld.param.b64 %rd2, [global_2xf32_invariant_param_1];
+; SM90-NEXT: st.global.b32 [%rd2], %r1;
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: global_2xf32_invariant(
+; SM100: {
+; SM100-NEXT: .reg .b32 %r<2>;
+; SM100-NEXT: .reg .b64 %rd<4>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [global_2xf32_invariant_param_0];
+; SM100-NEXT: .pragma "used_bytes_mask 0xf";
+; SM100-NEXT: ld.global.nc.b64 %rd2, [%rd1];
+; SM100-NEXT: ld.param.b64 %rd3, [global_2xf32_invariant_param_1];
+; SM100-NEXT: mov.b64 {%r1, _}, %rd2;
+; SM100-NEXT: st.global.b32 [%rd3], %r1;
+; SM100-NEXT: ret;
+ %a.load = tail call <2 x float> @llvm.masked.load.v2f32.p1(ptr addrspace(1) align 8 %a, <2 x i1> <i1 true, i1 false>, <2 x float> poison), !invariant.load !0
+ tail call void @llvm.masked.store.v2f32.p1(<2 x float> %a.load, ptr addrspace(1) align 8 %b, <2 x i1> <i1 true, i1 false>)
+ ret void
+}
+
+define void @global_2xf32_no_align(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; CHECK-LABEL: global_2xf32_no_align(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [global_2xf32_no_align_param_0];
+; CHECK-NEXT: ld.global.b32 %r1, [%rd1];
+; CHECK-NEXT: ld.param.b64 %rd2, [global_2xf32_no_align_param_1];
+; CHECK-NEXT: st.global.b32 [%rd2], %r1;
+; CHECK-NEXT: ret;
+ %a.load = tail call <2 x float> @llvm.masked.load.v2f32.p1(ptr addrspace(1) align 4 %a, <2 x i1> <i1 true, i1 false>, <2 x float> poison)
+ tail call void @llvm.masked.store.v2f32.p1(<2 x float> %a.load, ptr addrspace(1) align 8 %b, <2 x i1> <i1 true, i1 false>)
+ ret void
+}
+
+declare <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1), <8 x i1>, <8 x i32>)
+declare void @llvm.masked.store.v8i32.p1(<8 x i32>, ptr addrspace(1), <8 x i1>)
+declare <16 x i16> @llvm.masked.load.v16i16.p1(ptr addrspace(1), <16 x i1>, <16 x i16>)
+declare void @llvm.masked.store.v16i16.p1(<16 x i16>, ptr addrspace(1), <16 x i1>)
+declare <2 x i16> @llvm.masked.load.v2i16.p1(ptr addrspace(1), <2 x i1>, <2 x i16>)
+declare void @llvm.masked.store.v2i16.p1(<2 x i16>, ptr addrspace(1), <2 x i1>)
+declare <4 x i8> @llvm.masked.load.v4i8.p1(ptr addrspace(1), <4 x i1>, <4 x i8>)
+declare void @llvm.masked.store.v4i8.p1(<4 x i8>, ptr addrspace(1), <4 x i1>)
+declare <2 x float> @llvm.masked.load.v2f32.p1(ptr addrspace(1), <2 x i1>, <2 x float>)
+declare void @llvm.masked.store.v2f32.p1(<2 x float>, ptr addrspace(1), <2 x i1>)
+!0 = !{}
diff --git a/llvm/test/CodeGen/NVPTX/masked-store-variable-mask.ll b/llvm/test/CodeGen/NVPTX/masked-store-variable-mask.ll
new file mode 100644
index 0000000..bbe2dbb
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/masked-store-variable-mask.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK
+; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}
+
+; Confirm that a masked store with a variable mask is scalarized before lowering
+
+define void @global_variable_mask(ptr addrspace(1) %a, ptr addrspace(1) %b, <4 x i1> %mask) {
+; CHECK-LABEL: global_variable_mask(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<9>;
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [global_variable_mask_param_2+3];
+; CHECK-NEXT: ld.param.b8 %rs3, [global_variable_mask_param_2+2];
+; CHECK-NEXT: and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT: ld.param.b8 %rs5, [global_variable_mask_param_2+1];
+; CHECK-NEXT: and.b16 %rs6, %rs5, 1;
+; CHECK-NEXT: setp.ne.b16 %p2, %rs6, 0;
+; CHECK-NEXT: ld.param.b8 %rs7, [global_variable_mask_param_2];
+; CHECK-NEXT: and.b16 %rs8, %rs7, 1;
+; CHECK-NEXT: setp.ne.b16 %p1, %rs8, 0;
+; CHECK-NEXT: ld.param.b64 %rd5, [global_variable_mask_param_1];
+; CHECK-NEXT: ld.param.b64 %rd6, [global_variable_mask_param_0];
+; CHECK-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [%rd6];
+; CHECK-NEXT: not.pred %p5, %p1;
+; CHECK-NEXT: @%p5 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %cond.store
+; CHECK-NEXT: st.global.b64 [%rd5], %rd1;
+; CHECK-NEXT: $L__BB0_2: // %else
+; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT: setp.ne.b16 %p3, %rs4, 0;
+; CHECK-NEXT: not.pred %p6, %p2;
+; CHECK-NEXT: @%p6 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %cond.store1
+; CHECK-NEXT: st.global.b64 [%rd5+8], %rd2;
+; CHECK-NEXT: $L__BB0_4: // %else2
+; CHECK-NEXT: setp.ne.b16 %p4, %rs2, 0;
+; CHECK-NEXT: not.pred %p7, %p3;
+; CHECK-NEXT: @%p7 bra $L__BB0_6;
+; CHECK-NEXT: // %bb.5: // %cond.store3
+; CHECK-NEXT: st.global.b64 [%rd5+16], %rd3;
+; CHECK-NEXT: $L__BB0_6: // %else4
+; CHECK-NEXT: not.pred %p8, %p4;
+; CHECK-NEXT: @%p8 bra $L__BB0_8;
+; CHECK-NEXT: // %bb.7: // %cond.store5
+; CHECK-NEXT: st.global.b64 [%rd5+24], %rd4;
+; CHECK-NEXT: $L__BB0_8: // %else6
+; CHECK-NEXT: ret;
+ %a.load = load <4 x i64>, ptr addrspace(1) %a
+ tail call void @llvm.masked.store.v4i64.p1(<4 x i64> %a.load, ptr addrspace(1) align 32 %b, <4 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.store.v4i64.p1(<4 x i64>, ptr addrspace(1), <4 x i1>)
diff --git a/llvm/test/CodeGen/NVPTX/masked-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/masked-store-vectors-256.ll
new file mode 100644
index 0000000..44f3505
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/masked-store-vectors-256.ll
@@ -0,0 +1,318 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK,SM90
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx88 | %ptxas-verify -arch=sm_90 %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK,SM100
+; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}
+
+; This test is based on load-store-vectors.ll
+; and tests the lowering of 256-bit masked vector stores.
+
+; Types we are checking: i32, i64, f32, f64
+
+; Address spaces we are checking: generic, global
+; - Global is the only address space that currently supports masked stores.
+; - Stores to the generic address space are legalized via scalarization before reaching the backend;
+;   this file tests that path even though the LSV will not generate such stores.
+
+; 256-bit vector loads/stores are only legal on Blackwell (sm_100) and later, so on sm_90 the vectors will be split
+
+; generic address space
+
+define void @generic_8xi32(ptr %a, ptr %b) {
+; CHECK-LABEL: generic_8xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xi32_param_0];
+; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
+; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
+; CHECK-NEXT: ld.param.b64 %rd2, [generic_8xi32_param_1];
+; CHECK-NEXT: st.b32 [%rd2], %r5;
+; CHECK-NEXT: st.b32 [%rd2+8], %r7;
+; CHECK-NEXT: st.b32 [%rd2+28], %r4;
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i32>, ptr %a
+ tail call void @llvm.masked.store.v8i32.p0(<8 x i32> %a.load, ptr align 32 %b, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>)
+ ret void
+}
+
+define void @generic_4xi64(ptr %a, ptr %b) {
+; CHECK-LABEL: generic_4xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [generic_4xi64_param_0];
+; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1+16];
+; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1];
+; CHECK-NEXT: ld.param.b64 %rd6, [generic_4xi64_param_1];
+; CHECK-NEXT: st.b64 [%rd6], %rd4;
+; CHECK-NEXT: st.b64 [%rd6+16], %rd2;
+; CHECK-NEXT: ret;
+ %a.load = load <4 x i64>, ptr %a
+ tail call void @llvm.masked.store.v4i64.p0(<4 x i64> %a.load, ptr align 32 %b, <4 x i1> <i1 true, i1 false, i1 true, i1 false>)
+ ret void
+}
+
+define void @generic_8xfloat(ptr %a, ptr %b) {
+; CHECK-LABEL: generic_8xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0];
+; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
+; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
+; CHECK-NEXT: ld.param.b64 %rd2, [generic_8xfloat_param_1];
+; CHECK-NEXT: st.b32 [%rd2], %r5;
+; CHECK-NEXT: st.b32 [%rd2+8], %r7;
+; CHECK-NEXT: st.b32 [%rd2+28], %r4;
+; CHECK-NEXT: ret;
+ %a.load = load <8 x float>, ptr %a
+ tail call void @llvm.masked.store.v8f32.p0(<8 x float> %a.load, ptr align 32 %b, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>)
+ ret void
+}
+
+define void @generic_4xdouble(ptr %a, ptr %b) {
+; CHECK-LABEL: generic_4xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [generic_4xdouble_param_0];
+; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1+16];
+; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1];
+; CHECK-NEXT: ld.param.b64 %rd6, [generic_4xdouble_param_1];
+; CHECK-NEXT: st.b64 [%rd6], %rd4;
+; CHECK-NEXT: st.b64 [%rd6+16], %rd2;
+; CHECK-NEXT: ret;
+ %a.load = load <4 x double>, ptr %a
+ tail call void @llvm.masked.store.v4f64.p0(<4 x double> %a.load, ptr align 32 %b, <4 x i1> <i1 true, i1 false, i1 true, i1 false>)
+ ret void
+}
+
+; global address space
+
+define void @global_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; SM90-LABEL: global_8xi32(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0];
+; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
+; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
+; SM90-NEXT: ld.param.b64 %rd2, [global_8xi32_param_1];
+; SM90-NEXT: st.global.b32 [%rd2], %r5;
+; SM90-NEXT: st.global.b32 [%rd2+8], %r7;
+; SM90-NEXT: st.global.b32 [%rd2+28], %r4;
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: global_8xi32(
+; SM100: {
+; SM100-NEXT: .reg .b32 %r<9>;
+; SM100-NEXT: .reg .b64 %rd<3>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0];
+; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
+; SM100-NEXT: ld.param.b64 %rd2, [global_8xi32_param_1];
+; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, _, %r3, _, _, _, _, %r8};
+; SM100-NEXT: ret;
+ %a.load = load <8 x i32>, ptr addrspace(1) %a
+ tail call void @llvm.masked.store.v8i32.p1(<8 x i32> %a.load, ptr addrspace(1) align 32 %b, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>)
+ ret void
+}
+
+define void @global_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; SM90-LABEL: global_4xi64(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<7>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [global_4xi64_param_0];
+; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1+16];
+; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1];
+; SM90-NEXT: ld.param.b64 %rd6, [global_4xi64_param_1];
+; SM90-NEXT: st.global.b64 [%rd6], %rd4;
+; SM90-NEXT: st.global.b64 [%rd6+16], %rd2;
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: global_4xi64(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [global_4xi64_param_0];
+; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; SM100-NEXT: ld.param.b64 %rd6, [global_4xi64_param_1];
+; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, _, %rd4, _};
+; SM100-NEXT: ret;
+ %a.load = load <4 x i64>, ptr addrspace(1) %a
+ tail call void @llvm.masked.store.v4i64.p1(<4 x i64> %a.load, ptr addrspace(1) align 32 %b, <4 x i1> <i1 true, i1 false, i1 true, i1 false>)
+ ret void
+}
+
+define void @global_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; SM90-LABEL: global_8xfloat(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [global_8xfloat_param_0];
+; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
+; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
+; SM90-NEXT: ld.param.b64 %rd2, [global_8xfloat_param_1];
+; SM90-NEXT: st.global.b32 [%rd2], %r5;
+; SM90-NEXT: st.global.b32 [%rd2+8], %r7;
+; SM90-NEXT: st.global.b32 [%rd2+28], %r4;
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: global_8xfloat(
+; SM100: {
+; SM100-NEXT: .reg .b32 %r<9>;
+; SM100-NEXT: .reg .b64 %rd<3>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [global_8xfloat_param_0];
+; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
+; SM100-NEXT: ld.param.b64 %rd2, [global_8xfloat_param_1];
+; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, _, %r3, _, _, _, _, %r8};
+; SM100-NEXT: ret;
+ %a.load = load <8 x float>, ptr addrspace(1) %a
+ tail call void @llvm.masked.store.v8f32.p1(<8 x float> %a.load, ptr addrspace(1) align 32 %b, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>)
+ ret void
+}
+
+define void @global_4xdouble(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; SM90-LABEL: global_4xdouble(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<7>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [global_4xdouble_param_0];
+; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1+16];
+; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1];
+; SM90-NEXT: ld.param.b64 %rd6, [global_4xdouble_param_1];
+; SM90-NEXT: st.global.b64 [%rd6], %rd4;
+; SM90-NEXT: st.global.b64 [%rd6+16], %rd2;
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: global_4xdouble(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [global_4xdouble_param_0];
+; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; SM100-NEXT: ld.param.b64 %rd6, [global_4xdouble_param_1];
+; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, _, %rd4, _};
+; SM100-NEXT: ret;
+ %a.load = load <4 x double>, ptr addrspace(1) %a
+ tail call void @llvm.masked.store.v4f64.p1(<4 x double> %a.load, ptr addrspace(1) align 32 %b, <4 x i1> <i1 true, i1 false, i1 true, i1 false>)
+ ret void
+}
+
+; edge cases
+define void @global_8xi32_all_mask_on(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; SM90-LABEL: global_8xi32_all_mask_on(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [global_8xi32_all_mask_on_param_0];
+; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [global_8xi32_all_mask_on_param_1];
+; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: global_8xi32_all_mask_on(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [global_8xi32_all_mask_on_param_0];
+; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; SM100-NEXT: ld.param.b64 %rd6, [global_8xi32_all_mask_on_param_1];
+; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
+; SM100-NEXT: ret;
+ %a.load = load <8 x i32>, ptr addrspace(1) %a
+ tail call void @llvm.masked.store.v8i32.p1(<8 x i32> %a.load, ptr addrspace(1) align 32 %b, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define void @global_8xi32_all_mask_off(ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; CHECK-LABEL: global_8xi32_all_mask_off(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i32>, ptr addrspace(1) %a
+ tail call void @llvm.masked.store.v8i32.p1(<8 x i32> %a.load, ptr addrspace(1) align 32 %b, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+ ret void
+}
+
+; This is an example of the pattern the LSV emits for these masked stores
+define void @vectorizerOutput(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; SM90-LABEL: vectorizerOutput(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [vectorizerOutput_param_0];
+; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
+; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
+; SM90-NEXT: ld.param.b64 %rd2, [vectorizerOutput_param_1];
+; SM90-NEXT: st.global.b32 [%rd2], %r5;
+; SM90-NEXT: st.global.b32 [%rd2+4], %r6;
+; SM90-NEXT: st.global.b32 [%rd2+12], %r8;
+; SM90-NEXT: st.global.b32 [%rd2+16], %r1;
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: vectorizerOutput(
+; SM100: {
+; SM100-NEXT: .reg .b32 %r<9>;
+; SM100-NEXT: .reg .b64 %rd<3>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [vectorizerOutput_param_0];
+; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
+; SM100-NEXT: ld.param.b64 %rd2, [vectorizerOutput_param_1];
+; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, _, %r4, %r5, _, _, _};
+; SM100-NEXT: ret;
+ %1 = load <8 x i32>, ptr addrspace(1) %in, align 32
+ %load05 = extractelement <8 x i32> %1, i32 0
+ %load16 = extractelement <8 x i32> %1, i32 1
+ %load38 = extractelement <8 x i32> %1, i32 3
+ %load49 = extractelement <8 x i32> %1, i32 4
+ %2 = insertelement <8 x i32> poison, i32 %load05, i32 0
+ %3 = insertelement <8 x i32> %2, i32 %load16, i32 1
+ %4 = insertelement <8 x i32> %3, i32 poison, i32 2
+ %5 = insertelement <8 x i32> %4, i32 %load38, i32 3
+ %6 = insertelement <8 x i32> %5, i32 %load49, i32 4
+ %7 = insertelement <8 x i32> %6, i32 poison, i32 5
+ %8 = insertelement <8 x i32> %7, i32 poison, i32 6
+ %9 = insertelement <8 x i32> %8, i32 poison, i32 7
+ call void @llvm.masked.store.v8i32.p1(<8 x i32> %9, ptr addrspace(1) align 32 %out, <8 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false>)
+ ret void
+}
+
+declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, <8 x i1>)
+declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, <4 x i1>)
+declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, <8 x i1>)
+declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, <4 x i1>)
+
+declare void @llvm.masked.store.v8i32.p1(<8 x i32>, ptr addrspace(1), <8 x i1>)
+declare void @llvm.masked.store.v4i64.p1(<4 x i64>, ptr addrspace(1), <4 x i1>)
+declare void @llvm.masked.store.v8f32.p1(<8 x float>, ptr addrspace(1), <8 x i1>)
+declare void @llvm.masked.store.v4f64.p1(<4 x double>, ptr addrspace(1), <4 x i1>)
diff --git a/llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll b/llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll
new file mode 100644
index 0000000..6d9ad8d
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll
@@ -0,0 +1,247 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=nvptx-ir-peephole -S | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+; fsub(fmul(a, b), c) => fma(a, b, fneg(c))
+define float @test_fsub_fmul_c(float %a, float %b, float %c) {
+; CHECK-LABEL: define float @test_fsub_fmul_c(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[C]]
+; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[TMP1]])
+; CHECK-NEXT: ret float [[TMP2]]
+;
+ %mul = fmul contract float %a, %b
+ %sub = fsub contract float %mul, %c
+ ret float %sub
+}
+
+
+; fsub(c, fmul(a, b)) => fma(-a, b, c)
+define float @test_fsub_c_fmul(float %a, float %b, float %c) {
+; CHECK-LABEL: define float @test_fsub_c_fmul(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[A]]
+; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[TMP1]], float [[B]], float [[C]])
+; CHECK-NEXT: ret float [[TMP2]]
+;
+ %mul = fmul contract float %a, %b
+ %sub = fsub contract float %c, %mul
+ ret float %sub
+}
+
+
+; fsub(fmul(a, b), fmul(c, d)) => fma(a, b, fneg(fmul(c, d)))
+define float @test_fsub_fmul_fmul(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: define float @test_fsub_fmul_fmul(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) {
+; CHECK-NEXT: [[MUL2:%.*]] = fmul contract float [[C]], [[D]]
+; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[MUL2]]
+; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[TMP1]])
+; CHECK-NEXT: ret float [[TMP2]]
+;
+ %mul1 = fmul contract float %a, %b
+ %mul2 = fmul contract float %c, %d
+ %sub = fsub contract float %mul1, %mul2
+ ret float %sub
+}
+
+
+; fsub(fmul(a, b), fmul(c, d)) => fma(fneg(c), d, fmul(a, b))
+; fmul(a, b) has multiple uses.
+define float @test_fsub_fmul_fmul_multiple_use(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: define float @test_fsub_fmul_fmul_multiple_use(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) {
+; CHECK-NEXT: [[MUL1:%.*]] = fmul contract float [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[C]]
+; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[TMP1]], float [[D]], float [[MUL1]])
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP2]], [[MUL1]]
+; CHECK-NEXT: ret float [[ADD]]
+;
+ %mul1 = fmul contract float %a, %b
+ %mul2 = fmul contract float %c, %d
+ %sub = fsub contract float %mul1, %mul2
+ %add = fadd float %sub, %mul1
+ ret float %add
+}
+
+
+; fsub(fmul(a, b), c) => fma(a, b, fneg(c)) where fsub and fmul are in different BBs
+define float @test_fsub_fmul_different_BB(float %a, float %b, float %c, i32 %n) {
+; CHECK-LABEL: define float @test_fsub_fmul_different_BB(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT: [[INIT:.*]]:
+; CHECK-NEXT: [[CMP_ITER:%.*]] = icmp sgt i32 [[N]], 10
+; CHECK-NEXT: br i1 [[CMP_ITER]], label %[[ITERATION:.*]], label %[[EXIT:.*]]
+; CHECK: [[ITERATION]]:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[INIT]] ], [ [[I_NEXT:%.*]], %[[ITERATION]] ]
+; CHECK-NEXT: [[ACC:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT:%.*]], %[[ITERATION]] ]
+; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT: [[ACC_NEXT]] = fadd contract float [[ACC]], 1.000000e+00
+; CHECK-NEXT: [[CMP_LOOP:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_LOOP]], label %[[ITERATION]], label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[C_PHI:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT]], %[[ITERATION]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = fneg contract float [[C_PHI]]
+; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[TMP0]])
+; CHECK-NEXT: ret float [[TMP1]]
+;
+init:
+ %mul = fmul contract float %a, %b
+ %cmp_iter = icmp sgt i32 %n, 10
+ br i1 %cmp_iter, label %iteration, label %exit
+
+iteration:
+ %i = phi i32 [ 0, %init ], [ %i_next, %iteration ]
+ %acc = phi float [ %c, %init ], [ %acc_next, %iteration ]
+ %i_next = add i32 %i, 1
+ %acc_next = fadd contract float %acc, 1.0
+ %cmp_loop = icmp slt i32 %i_next, %n
+ br i1 %cmp_loop, label %iteration, label %exit
+
+exit:
+ %c_phi = phi float [ %c, %init ], [ %acc_next, %iteration ]
+ %sub = fsub contract float %mul, %c_phi
+ ret float %sub
+}
+
+
+; fadd(fmul(a, b), c) => fma(a, b, c)
+define float @test_fadd_fmul_c(float %a, float %b, float %c) {
+; CHECK-LABEL: define float @test_fadd_fmul_c(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+; CHECK-NEXT: ret float [[TMP1]]
+;
+ %mul = fmul contract float %a, %b
+ %add = fadd contract float %mul, %c
+ ret float %add
+}
+
+
+; fadd(c, fmul(a, b)) => fma(a, b, c)
+define float @test_fadd_c_fmul(float %a, float %b, float %c) {
+; CHECK-LABEL: define float @test_fadd_c_fmul(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+; CHECK-NEXT: ret float [[TMP1]]
+;
+ %mul = fmul contract float %a, %b
+ %add = fadd contract float %c, %mul
+ ret float %add
+}
+
+
+; fadd(fmul(a, b), fmul(c, d)) => fma(a, b, fmul(c, d))
+define float @test_fadd_fmul_fmul(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: define float @test_fadd_fmul_fmul(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) {
+; CHECK-NEXT: [[MUL2:%.*]] = fmul contract float [[C]], [[D]]
+; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[MUL2]])
+; CHECK-NEXT: ret float [[TMP1]]
+;
+ %mul1 = fmul contract float %a, %b
+ %mul2 = fmul contract float %c, %d
+ %add = fadd contract float %mul1, %mul2
+ ret float %add
+}
+
+
+; fadd(fmul(a, b), c) => fma(a, b, c) where fadd and fmul are in different BBs
+define float @test_fadd_fmul_different_BB(float %a, float %b, float %c, i32 %n) {
+; CHECK-LABEL: define float @test_fadd_fmul_different_BB(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT: [[INIT:.*]]:
+; CHECK-NEXT: [[CMP_ITER:%.*]] = icmp sgt i32 [[N]], 10
+; CHECK-NEXT: br i1 [[CMP_ITER]], label %[[ITERATION:.*]], label %[[EXIT:.*]]
+; CHECK: [[ITERATION]]:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[INIT]] ], [ [[I_NEXT:%.*]], %[[ITERATION]] ]
+; CHECK-NEXT: [[ACC:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT:%.*]], %[[ITERATION]] ]
+; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT: [[ACC_NEXT]] = fadd contract float [[ACC]], 1.000000e+00
+; CHECK-NEXT: [[CMP_LOOP:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_LOOP]], label %[[ITERATION]], label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[C_PHI:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT]], %[[ITERATION]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[C_PHI]])
+; CHECK-NEXT: ret float [[TMP0]]
+;
+init:
+ %mul = fmul contract float %a, %b
+ %cmp_iter = icmp sgt i32 %n, 10
+ br i1 %cmp_iter, label %iteration, label %exit
+
+iteration:
+ %i = phi i32 [ 0, %init ], [ %i_next, %iteration ]
+ %acc = phi float [ %c, %init ], [ %acc_next, %iteration ]
+ %i_next = add i32 %i, 1
+ %acc_next = fadd contract float %acc, 1.0
+ %cmp_loop = icmp slt i32 %i_next, %n
+ br i1 %cmp_loop, label %iteration, label %exit
+
+exit:
+ %c_phi = phi float [ %c, %init ], [ %acc_next, %iteration ]
+ %add = fadd contract float %mul, %c_phi
+ ret float %add
+}
+
+
+; The following cases should not be folded into FMA.
+; fadd(fpext(fmul(a, b)), c) => fma(fpext(a), fpext(b), c)
+define double @test_fadd_fpext_fmul_c(float %a, float %b, double %c) {
+; CHECK-LABEL: define double @test_fadd_fpext_fmul_c(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], double [[C:%.*]]) {
+; CHECK-NEXT: [[MUL:%.*]] = fmul contract float [[A]], [[B]]
+; CHECK-NEXT: [[EXT:%.*]] = fpext float [[MUL]] to double
+; CHECK-NEXT: [[ADD:%.*]] = fadd contract double [[EXT]], [[C]]
+; CHECK-NEXT: ret double [[ADD]]
+;
+ %mul = fmul contract float %a, %b
+ %ext = fpext float %mul to double
+ %add = fadd contract double %ext, %c
+ ret double %add
+}
+
+
+; fadd(c, fpext(fmul(a, b))) => fma(fpext(a), fpext(b), c)
+define double @test_fadd_c_fpext_fmul(float %a, float %b, double %c) {
+; CHECK-LABEL: define double @test_fadd_c_fpext_fmul(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], double [[C:%.*]]) {
+; CHECK-NEXT: [[MUL:%.*]] = fmul contract float [[A]], [[B]]
+; CHECK-NEXT: [[EXT:%.*]] = fpext float [[MUL]] to double
+; CHECK-NEXT: [[ADD:%.*]] = fadd contract double [[C]], [[EXT]]
+; CHECK-NEXT: ret double [[ADD]]
+;
+ %mul = fmul contract float %a, %b
+ %ext = fpext float %mul to double
+ %add = fadd contract double %c, %ext
+ ret double %add
+}
+
+
+; Double precision tests
+; fsub(fmul(a, b), c) => fma(a, b, fneg(c))
+define double @test_fsub_fmul_c_double(double %a, double %b, double %c) {
+; CHECK-LABEL: define double @test_fsub_fmul_c_double(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]], double [[C:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = fneg contract double [[C]]
+; CHECK-NEXT: [[TMP2:%.*]] = call contract double @llvm.fma.f64(double [[A]], double [[B]], double [[TMP1]])
+; CHECK-NEXT: ret double [[TMP2]]
+;
+ %mul = fmul contract double %a, %b
+ %sub = fsub contract double %mul, %c
+ ret double %sub
+}
+
+
+; fadd(fmul(a, b), c) => fma(a, b, c)
+define double @test_fadd_fmul_c_double(double %a, double %b, double %c) {
+; CHECK-LABEL: define double @test_fadd_fmul_c_double(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]], double [[C:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
+; CHECK-NEXT: ret double [[TMP1]]
+;
+ %mul = fmul contract double %a, %b
+ %add = fadd contract double %mul, %c
+ ret double %add
+}
diff --git a/llvm/test/CodeGen/NVPTX/op-fence.ll b/llvm/test/CodeGen/NVPTX/op-fence.ll
new file mode 100644
index 0000000..629b702
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/op-fence.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
+
+; CHECK-LABEL: test_fence_mbarrier_init
+define void @test_fence_mbarrier_init() {
+; CHECK-LABEL: test_fence_mbarrier_init(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.mbarrier_init.release.cluster;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.fence.mbarrier_init.release.cluster();
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/param-add.ll b/llvm/test/CodeGen/NVPTX/param-add.ll
index 06d7384..b220450 100644
--- a/llvm/test/CodeGen/NVPTX/param-add.ll
+++ b/llvm/test/CodeGen/NVPTX/param-add.ll
@@ -2,11 +2,6 @@
; RUN: llc < %s -march=nvptx64 --debug-counter=dagcombine=0 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 --debug-counter=dagcombine=0 | %ptxas-verify %}
-; REQUIRES: asserts
-; asserts are required for --debug-counter=dagcombine=0 to have the intended
-; effect of disabling DAG combines, which exposes the bug. When combines are
-; enabled the bug does not occur.
-
%struct.1float = type <{ [1 x float] }>
declare i32 @callee(%struct.1float %a)
diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
index 51f6b00..4870050 100644
--- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
+++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
@@ -171,8 +171,7 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by
; CHECK: .func (.param .align 16 .b8 func_retval0[12])
; CHECK-LABEL: callee_St4x3(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12]
- ; CHECK: ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0];
- ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8];
+ ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], %{{.*}}}, [callee_St4x3_param_0];
; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]};
; CHECK-DAG: st.param.b32 [func_retval0+8], [[R3]];
; CHECK-NEXT: ret;
@@ -394,8 +393,7 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by
; CHECK-LABEL: callee_St4x7(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28]
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0];
- ; CHECK: ld.param.v2.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16];
- ; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24];
+ ; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], %{{.*}}}, [callee_St4x7_param_0+16];
; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
; CHECK-DAG: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
; CHECK-DAG: st.param.b32 [func_retval0+24], [[R7]];
diff --git a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir
index dfc8417..a84b7fc 100644
--- a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir
+++ b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir
@@ -77,7 +77,7 @@ constants: []
machineFunctionInfo: {}
body: |
bb.0:
- %0:b32, %1:b32, %2:b32, %3:b32 = LDV_i32_v4 0, 0, 101, 3, 32, &retval0, 0 :: (load (s128), addrspace 101)
+ %0:b32, %1:b32, %2:b32, %3:b32 = LDV_i32_v4 0, 0, 101, 3, 32, -1, &retval0, 0 :: (load (s128), addrspace 101)
; CHECK-NOT: ProxyReg
%4:b32 = ProxyRegB32 killed %0
%5:b32 = ProxyRegB32 killed %1
@@ -86,7 +86,7 @@ body: |
; CHECK: STV_i32_v4 killed %0, killed %1, killed %2, killed %3
STV_i32_v4 killed %4, killed %5, killed %6, killed %7, 0, 0, 101, 32, &func_retval0, 0 :: (store (s128), addrspace 101)
- %8:b32 = LD_i32 0, 0, 101, 3, 32, &retval0, 0 :: (load (s32), addrspace 101)
+ %8:b32 = LD_i32 0, 0, 101, 3, 32, -1, &retval0, 0 :: (load (s32), addrspace 101)
; CHECK-NOT: ProxyReg
%9:b32 = ProxyRegB32 killed %8
%10:b32 = ProxyRegB32 killed %9
diff --git a/llvm/test/CodeGen/NVPTX/switch-loop-header.mir b/llvm/test/CodeGen/NVPTX/switch-loop-header.mir
new file mode 100644
index 0000000..4d86bb8
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/switch-loop-header.mir
@@ -0,0 +1,182 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -o - %s -passes="require<machine-loops>,require<live-vars>,phi-node-elimination" | FileCheck %s
+
+--- |
+ target datalayout = "e-p6:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+ target triple = "nvptx64-unknown-nvidiacl"
+
+ define void @func_26(i32 %BS_COND_16.0.BS_COND_16.0.BS_COND_16.0.BS_COND_16.0.) {
+ entry:
+ br label %for.cond
+
+ for.cond: ; preds = %BS_LABEL_1, %BS_LABEL_1, %entry
+ %p_2218_0.1 = phi i32 [ 0, %entry ], [ %p_2218_0.3, %BS_LABEL_1 ], [ %p_2218_0.3, %BS_LABEL_1 ]
+ br label %BS_LABEL_1
+
+ BS_LABEL_2: ; preds = %BS_LABEL_1
+ %sub = or i32 %p_2218_0.3, 1
+ br label %for.cond4
+
+ for.cond4: ; preds = %BS_LABEL_1, %BS_LABEL_2
+ %p_2218_0.2 = phi i32 [ %BS_COND_16.0.BS_COND_16.0.BS_COND_16.0.BS_COND_16.0., %BS_LABEL_1 ], [ %sub, %BS_LABEL_2 ]
+ br label %BS_LABEL_1
+
+ BS_LABEL_1: ; preds = %for.cond4, %for.cond
+ %p_2218_0.3 = phi i32 [ %p_2218_0.2, %for.cond4 ], [ %p_2218_0.1, %for.cond ]
+ switch i32 %BS_COND_16.0.BS_COND_16.0.BS_COND_16.0.BS_COND_16.0., label %unreachable [
+ i32 0, label %for.cond4
+ i32 4, label %BS_LABEL_2
+ i32 1, label %for.cond
+ i32 6, label %for.cond
+ ]
+
+ unreachable: ; preds = %BS_LABEL_1
+ call void asm sideeffect "exit;", ""()
+ unreachable
+ }
+...
+---
+name: func_26
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: b32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: b32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: b32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: b32, preferred-register: '', flags: [ ] }
+ - { id: 4, class: b32, preferred-register: '', flags: [ ] }
+ - { id: 5, class: b32, preferred-register: '', flags: [ ] }
+ - { id: 6, class: b32, preferred-register: '', flags: [ ] }
+ - { id: 7, class: b1, preferred-register: '', flags: [ ] }
+ - { id: 8, class: b32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: b1, preferred-register: '', flags: [ ] }
+ - { id: 10, class: b32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: b1, preferred-register: '', flags: [ ] }
+liveins: []
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo: {}
+jumpTable:
+ kind: inline
+ entries:
+ - id: 0
+ blocks: [ '%bb.3', '%bb.1', '%bb.6', '%bb.6', '%bb.2', '%bb.6',
+ '%bb.1' ]
+body: |
+ ; CHECK-LABEL: name: func_26
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead [[DEF:%[0-9]+]]:b32 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:b1 = IMPLICIT_DEF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead [[DEF2:%[0-9]+]]:b32 = IMPLICIT_DEF
+ ; CHECK-NEXT: GOTO %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4:
+ ; CHECK-NEXT: successors: %bb.6(0x00000000), %bb.5(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CBranch undef [[DEF1]], %bb.6
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.5:
+ ; CHECK-NEXT: successors: %bb.3(0x3e000000), %bb.1(0x04000000), %bb.6(0x00000000), %bb.2(0x3e000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BRX_START 0
+ ; CHECK-NEXT: BRX_ITEM %bb.3
+ ; CHECK-NEXT: BRX_ITEM %bb.1
+ ; CHECK-NEXT: BRX_ITEM %bb.6
+ ; CHECK-NEXT: BRX_ITEM %bb.6
+ ; CHECK-NEXT: BRX_ITEM %bb.2
+ ; CHECK-NEXT: BRX_ITEM %bb.6
+ ; CHECK-NEXT: BRX_END %bb.1, undef [[DEF]], 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.6:
+ bb.0:
+ successors: %bb.1(0x80000000)
+
+ %10:b32 = IMPLICIT_DEF
+ %11:b1 = IMPLICIT_DEF
+
+ bb.1:
+ successors: %bb.4(0x80000000)
+
+ %0:b32 = PHI undef %10, %bb.0, undef %0, %bb.5
+ GOTO %bb.4
+
+ bb.2:
+ successors: %bb.3(0x80000000)
+
+ bb.3:
+ successors: %bb.4(0x80000000)
+
+ bb.4:
+ successors: %bb.6(0x00000000), %bb.5(0x80000000)
+
+ CBranch undef %11, %bb.6
+
+ bb.5:
+ successors: %bb.3(0x3e000000), %bb.1(0x04000000), %bb.6(0x00000000), %bb.2(0x3e000000)
+
+ BRX_START 0
+ BRX_ITEM %bb.3
+ BRX_ITEM %bb.1
+ BRX_ITEM %bb.6
+ BRX_ITEM %bb.6
+ BRX_ITEM %bb.2
+ BRX_ITEM %bb.6
+ BRX_END %bb.1, undef %10, 0
+
+ bb.6:
+...
diff --git a/llvm/test/CodeGen/NVPTX/switch.ll b/llvm/test/CodeGen/NVPTX/switch.ll
new file mode 100644
index 0000000..7fcfcfb
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/switch.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+
+target triple = "nvptx64-unknown-nvidiacl"
+
+define void @pr170051(i32 %cond) {
+; CHECK-LABEL: pr170051(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: ld.param.b32 %r1, [pr170051_param_0];
+; CHECK-NEXT: setp.gt.u32 %p1, %r1, 6;
+; CHECK-NEXT: bra.uni $L__BB0_3;
+; CHECK-NEXT: $L__BB0_1: // %BS_LABEL_2
+; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: or.b32 %r3, %r2, 1;
+; CHECK-NEXT: $L__BB0_2: // %for.cond4
+; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: mov.b32 %r2, %r3;
+; CHECK-NEXT: $L__BB0_3: // %BS_LABEL_1
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: @%p1 bra $L__BB0_5;
+; CHECK-NEXT: // %bb.4: // %BS_LABEL_1
+; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: mov.b32 %r3, %r1;
+; CHECK-NEXT: $L_brx_0: .branchtargets
+; CHECK-NEXT: $L__BB0_2,
+; CHECK-NEXT: $L__BB0_3,
+; CHECK-NEXT: $L__BB0_5,
+; CHECK-NEXT: $L__BB0_5,
+; CHECK-NEXT: $L__BB0_1,
+; CHECK-NEXT: $L__BB0_5,
+; CHECK-NEXT: $L__BB0_3;
+; CHECK-NEXT: brx.idx %r1, $L_brx_0;
+; CHECK-NEXT: $L__BB0_5: // %unreachable
+; CHECK-NEXT: // begin inline asm
+; CHECK-NEXT: exit;
+; CHECK-NEXT: // end inline asm
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.cond4.for.cond_crit_edge, %BS_LABEL_1, %BS_LABEL_1, %entry
+ %p_2218_0.1 = phi i32 [ 0, %entry ], [ %p_2218_0.3, %BS_LABEL_1 ], [ %p_2218_0.3, %BS_LABEL_1 ], [ poison, %for.cond4.for.cond_crit_edge ]
+ br label %BS_LABEL_1
+
+BS_LABEL_2: ; preds = %BS_LABEL_1
+ %sub = or i32 %p_2218_0.3, 1
+ br label %for.cond4
+
+for.cond4: ; preds = %BS_LABEL_1, %BS_LABEL_2
+ %p_2218_0.2 = phi i32 [ 0, %BS_LABEL_1 ], [ %sub, %BS_LABEL_2 ]
+ br i1 false, label %for.cond4.for.cond_crit_edge, label %BS_LABEL_1
+
+for.cond4.for.cond_crit_edge: ; preds = %for.cond4
+ br label %for.cond
+
+BS_LABEL_1: ; preds = %for.cond4, %for.cond
+ %p_2218_0.3 = phi i32 [ %p_2218_0.2, %for.cond4 ], [ %p_2218_0.1, %for.cond ]
+ switch i32 %cond, label %unreachable [
+ i32 0, label %for.cond4
+ i32 4, label %BS_LABEL_2
+ i32 1, label %for.cond
+ i32 6, label %for.cond
+ ]
+
+unreachable: ; preds = %BS_LABEL_1
+ unreachable
+}
+
+
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll
new file mode 100644
index 0000000..479de53
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; NOTE: This sample test demonstrates the pretty print feature for NVPTX intrinsics
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define void @tcgen05_mma_fp16_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d) {
+ ; CHECK-LABEL: define void @tcgen05_mma_fp16_cta1(
+ ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=discard */ i32 0)
+ call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 0)
+
+ ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1)
+ call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 1)
+
+ ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=fill */ i32 2)
+ call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 2)
+
+ ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=use */ i32 3)
+ call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 3)
+
+ ret void
+}
+
+define void @tcgen05_mma_f8f6f4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d) {
+ ; CHECK-LABEL: define void @tcgen05_mma_f8f6f4_cta2(
+ ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=discard */ i32 0)
+ call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 0)
+
+ ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1)
+ call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 1)
+
+ ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=fill */ i32 2)
+ call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 2)
+
+ ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=use */ i32 3)
+ call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 3)
+
+ ret void
+}
+
+; This test verifies that printImmArg is safe to call on all constant arguments, but only prints comments for arguments that have pretty printing configured.
+define void @test_mixed_constants_edge_case(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor) {
+ ; CHECK-LABEL: define void @test_mixed_constants_edge_case(
+ ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 42, i32 100, i1 true, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=discard */ i32 0)
+ call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 42, i32 100, i1 true, i32 3, i32 1, i32 0)
+
+ ret void
+}
+
+declare void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6), ptr addrspace(6), i64, i32, i1, i32, i32, i32)
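For reference, the /* ... */ annotations checked above encode fixed immediate-to-name mappings. A minimal Python sketch of those mappings, limited to the values visible in the CHECK lines of this test (any further enum values would be assumptions), is:

    # Values observed in the CHECK lines above; other enum values may exist.
    KIND_NAMES = {0: "f16", 2: "f8f6f4", 3: "i8"}
    COLLECTOR_NAMES = {0: "discard", 1: "lastuse", 2: "fill", 3: "use"}

    def comment_for(kind, cta_group, collector):
        # Mirrors the comment text printed before the trailing immediate arguments.
        return (f"/* kind={KIND_NAMES[kind]} */ i32 {kind}, "
                f"/* cta_group= */ i32 {cta_group}, "
                f"/* collector={COLLECTOR_NAMES[collector]} */ i32 {collector}")

    assert comment_for(0, 1, 2) == (
        "/* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=fill */ i32 2"
    )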
diff --git a/llvm/test/CodeGen/NVPTX/thread-fence.ll b/llvm/test/CodeGen/NVPTX/thread-fence.ll
new file mode 100644
index 0000000..185461b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/thread-fence.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | %ptxas-verify -arch=sm_90 %}
+
+; CHECK-LABEL: test_fence_acquire
+define void @test_fence_acquire() {
+; CHECK-LABEL: test_fence_acquire(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.acquire.sync_restrict::shared::cluster.cluster;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster();
+
+ ret void
+}
+
+; CHECK-LABEL: test_fence_release
+define void @test_fence_release() {
+; CHECK-LABEL: test_fence_release(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.release.sync_restrict::shared::cta.cluster;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster();
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/used-bytes-mask.ll b/llvm/test/CodeGen/NVPTX/used-bytes-mask.ll
new file mode 100644
index 0000000..a888d99
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/used-bytes-mask.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx82 | FileCheck %s -check-prefixes=NOMASK
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx82 | %ptxas-verify -arch=sm_90 %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | FileCheck %s -check-prefixes=MASK
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | %ptxas-verify -arch=sm_90 %}
+
+; On older architectures and PTX versions, no used_bytes_mask pragma should be emitted.
+; Specifically, the pragma is only supported on SM_50 or later and PTX 8.3 or later.
+; The architecture is fixed at SM_90 for stability; only the PTX version is varied to exercise the pragma.
+
+define i32 @global_8xi32(ptr %a, ptr %b) {
+; NOMASK-LABEL: global_8xi32(
+; NOMASK: {
+; NOMASK-NEXT: .reg .b32 %r<5>;
+; NOMASK-NEXT: .reg .b64 %rd<2>;
+; NOMASK-EMPTY:
+; NOMASK-NEXT: // %bb.0:
+; NOMASK-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0];
+; NOMASK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; NOMASK-NEXT: st.param.b32 [func_retval0], %r1;
+; NOMASK-NEXT: ret;
+;
+; MASK-LABEL: global_8xi32(
+; MASK: {
+; MASK-NEXT: .reg .b32 %r<5>;
+; MASK-NEXT: .reg .b64 %rd<2>;
+; MASK-EMPTY:
+; MASK-NEXT: // %bb.0:
+; MASK-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0];
+; MASK-NEXT: .pragma "used_bytes_mask 0xfff";
+; MASK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; MASK-NEXT: st.param.b32 [func_retval0], %r1;
+; MASK-NEXT: ret;
+ %a.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 %a, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> poison)
+ %first = extractelement <4 x i32> %a.load, i32 0
+ ret i32 %first
+}
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr , <4 x i1>, <4 x i32>)
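The 0xfff value follows from the masked load above: three of the four i32 lanes are enabled, so the first 12 of the 16 loaded bytes are used. A minimal Python sketch of that arithmetic, assuming one mask bit per byte with the lowest-addressed byte in bit 0:

    def used_bytes_mask(lane_mask, elem_bytes):
        # One bit per byte of the vector load; lane i covers bytes [i*elem_bytes, (i+1)*elem_bytes).
        mask = 0
        for lane, enabled in enumerate(lane_mask):
            if enabled:
                mask |= ((1 << elem_bytes) - 1) << (lane * elem_bytes)
        return mask

    # <4 x i1> <true, true, true, false> on a <4 x i32> load -> "used_bytes_mask 0xfff"
    assert used_bytes_mask([True, True, True, False], 4) == 0xFFF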
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx88-sm120a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx88-sm120a.py
new file mode 100644
index 0000000..f1666db
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx88-sm120a.py
@@ -0,0 +1,12 @@
+# Check all variants of instructions supported by PTX88 on SM120a
+# RUN: %python %s --ptx=88 --gpu-arch=120 --aa > %t-ptx88-sm_120a.ll
+# RUN: llc < %t-ptx88-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx88 \
+# RUN: | FileCheck %t-ptx88-sm_120a.ll
+# RUN: %if ptxas-sm_120a && ptxas-isa-8.8 %{ \
+# RUN: llc < %t-ptx88-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx88 \
+# RUN: | %ptxas-verify -arch=sm_120a \
+# RUN: %}
+
+import wmma
+
+wmma.main()
diff --git a/llvm/test/CodeGen/NVPTX/wmma.py b/llvm/test/CodeGen/NVPTX/wmma.py
index 8427ae4..817665a6 100644
--- a/llvm/test/CodeGen/NVPTX/wmma.py
+++ b/llvm/test/CodeGen/NVPTX/wmma.py
@@ -131,7 +131,7 @@ class MMAFrag:
"m16n8k64:b:e5m2": 4,
"m16n8k64:b:e3m2": 4,
"m16n8k64:b:e2m3": 4,
- "m16n8k64:b:e2m1": 4,
+ "m16n8k64:b:e2m1": 4 if is_mma_sparse else 2,
"m16n8k64:c:f16": 2,
"m16n8k64:c:f32": 4,
"m16n8k64:d:f16": 2,
@@ -1131,6 +1131,160 @@ def gen_mma_tests():
return generated_items
+def get_mma_block_scale_ops():
+ return make_mma_ops(["m16n8k64"], ["e2m1"], [], ["f32"], []) + make_mma_ops(
+ ["m16n8k32"],
+ ["e4m3", "e5m2", "e3m2", "e2m3", "e2m1"],
+ ["e4m3", "e5m2", "e3m2", "e2m3", "e2m1"],
+ ["f32"],
+ [],
+ )
+
+
+def is_mma_block_scale_geom_supported(geom):
+ # geometries for FP.
+ if geom in [
+ "m16n8k32",
+ "m16n8k64",
+ ]:
+ return True
+ raise ValueError(f"Unexpected MMA block scale geometry: {geom}")
+
+
+def is_mma_block_scale_variant_supported(op, kind, scale_vec_size, stype):
+ if not (
+ is_type_supported(op.a.mma_type.ptx_type)
+ and is_mma_block_scale_geom_supported(op.a.geom)
+ ):
+ return False
+
+ if (
+ op.a.geom == "m16n8k64"
+ and kind == "mxf4"
+ and stype == "ue8m0"
+ and scale_vec_size in ["", ".scale_vec::2X"]
+ ):
+ return True
+
+ if (
+ op.a.geom == "m16n8k64"
+ and kind == "mxf4nvf4"
+ and stype == "ue8m0"
+ and scale_vec_size == ".scale_vec::2X"
+ ):
+ return True
+
+ if (
+ op.a.geom == "m16n8k64"
+ and kind == "mxf4nvf4"
+ and stype == "ue4m3"
+ and scale_vec_size == ".scale_vec::4X"
+ ):
+ return True
+
+ if (
+ op.a.geom == "m16n8k32"
+ and kind == "mxf8f6f4"
+ and stype == "ue8m0"
+ and scale_vec_size in ["", ".scale_vec::1X"]
+ ):
+ return True
+
+ return False
+
+
+def common_mma_block_scale_test_gen(
+ params, op, intrinsic_template, instruction_template
+):
+ mma_block_scale_template = """
+declare ${ret_ty} @${intrinsic}(
+ ${args});
+
+; CHECK-LABEL: .func {{.*}}test_${function}(
+define ${ret_ty} @test_${function}(
+ ${args}) {
+; CHECK: ${instruction}
+; CHECK-NEXT: ${check_d}
+; CHECK-NEXT: ${check_a}
+; CHECK-NEXT: ${check_b}
+; CHECK-NEXT: ${check_c}
+; CHECK-NEXT: ${check_scale_a_data}
+; CHECK-NEXT: ${check_byte_id_a}
+; CHECK-NEXT: ${check_thread_id_a}
+; CHECK-NEXT: ${check_scale_b_data}
+; CHECK-NEXT: ${check_byte_id_b}
+; CHECK-NEXT: ${check_thread_id_b}
+ %r = call ${ret_ty} @${intrinsic}(
+ ${args});
+ ret ${ret_ty} %r;
+}
+"""
+
+ test_params = params
+ test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
+ test_params["function"] = test_params["intrinsic"].replace(".", "_")
+ test_params["instruction"] = Template(instruction_template).substitute(params)
+ test_params["ret_ty"] = make_wmma_ld_ret_ty(op.d)
+ test_params["check_a"] = check_pattern(op.a)
+ test_params["check_b"] = check_pattern(op.b)
+ test_params["check_c"] = check_pattern(op.c)
+ test_params["check_d"] = check_pattern(op.d)
+ test_params["check_scale_a_data"] = "{{%r[0-9]+}}"
+ test_params["check_byte_id_a"] = "{{%r[0-9]+}}"
+ test_params["check_thread_id_a"] = "{{%r[0-9]+}}"
+ test_params["check_scale_b_data"] = "{{%r[0-9]+}}"
+ test_params["check_byte_id_b"] = "{{%r[0-9]+}}"
+ test_params["check_thread_id_b"] = "{{%r[0-9]+}}"
+ args = ",\n ".join(
+ list(make_wmma_slice_args(frag) for frag in (op.a, op.b, op.c))
+ + ["i32 %scale_a_data", "i16 %byte_id_a, i16 %thread_id_a"]
+ + ["i32 %scale_b_data", "i16 %byte_id_b, i16 %thread_id_b"]
+ )
+ test_params["args"] = args
+ print(Template(mma_block_scale_template).substitute(test_params))
+ return (test_params["intrinsic"], test_params["instruction"])
+
+
+def gen_mma_block_scale_tests():
+ if not (ptx_version >= 88 and gpu_arch >= 120 and aa):
+ return []
+
+ mma_block_scale_intrinsic_template = "llvm.nvvm.mma.block.scale.${geom}.row.col.${kind}${scale}.${intrinsic_signature}.${stype}"
+ mma_block_scale_instruction_template = "mma.sync.aligned.${geom}.row.col.kind::${kind}.block_scale${scale_vec_size}.${ptx_signature}.${stype}"
+
+ generated_items = []
+
+ for op, kind, scale_vec_size, stype in product(
+ get_mma_block_scale_ops(),
+ ["mxf4", "mxf4nvf4", "mxf8f6f4"],
+ ["", ".scale_vec::1X", ".scale_vec::2X", ".scale_vec::4X"],
+ ["ue8m0", "ue4m3"],
+ ):
+ if not is_mma_block_scale_variant_supported(op, kind, scale_vec_size, stype):
+ continue
+
+ params = {
+ "intrinsic_signature": mma_signature(op),
+ "ptx_signature": mma_ptx_signature(op),
+ "geom": op.a.geom,
+ "kind": kind,
+ "scale_vec_size": scale_vec_size,
+ "scale": scale_vec_size.replace("_vec::", ".").lower(),
+ "stype": stype,
+ }
+
+ intrinsic_template = mma_block_scale_intrinsic_template
+ instruction_template = mma_block_scale_instruction_template
+
+ generated_items.append(
+ common_mma_block_scale_test_gen(
+ params, op, intrinsic_template, instruction_template
+ )
+ )
+
+ return generated_items
+
+
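The least obvious substitution above is the "scale" key: the PTX modifier spelling is kept for the instruction template, while the intrinsic name uses a lower-cased dotted form. A quick sketch of that transformation, using the modifier values iterated by the generator above:

    # ".scale_vec::NX" (PTX modifier) -> ".scale.nx" (intrinsic name suffix)
    for modifier, suffix in [
        ("", ""),
        (".scale_vec::1X", ".scale.1x"),
        (".scale_vec::2X", ".scale.2x"),
        (".scale_vec::4X", ".scale.4x"),
    ]:
        assert modifier.replace("_vec::", ".").lower() == suffix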
def get_mma_sp_ops():
return (
make_mma_ops(["m16n8k16", "m16n8k32"], ["bf16"], [], ["f32"], [], True)
@@ -1224,7 +1378,11 @@ def is_mma_sp_variant_supported(op, metadata, kind, satf):
return True
-def sp_selector_gen(op):
+def sp_selector_gen(op, block_scale=False):
+ if block_scale:
+ # PTX ISA 9.0 has the sparsity selector equal to 0 only
+ return range(1)
+
# (geom, type) -> allowed selector range
range_01 = {
("m16n8k32", "bf16"),
@@ -1355,6 +1513,178 @@ def gen_mma_sp_tests():
return generated_items
+def get_mma_sp_block_scale_ops():
+ return make_mma_ops(["m16n8k128"], ["e2m1"], [], ["f32"], [], True) + make_mma_ops(
+ ["m16n8k64"],
+ ["e4m3", "e5m2", "e3m2", "e2m3", "e2m1"],
+ ["e4m3", "e5m2", "e3m2", "e2m3", "e2m1"],
+ ["f32"],
+ [],
+ True,
+ )
+
+
+def is_mma_sp_block_scale_geom_supported(geom):
+ # geometries for FP.
+ if geom in [
+ "m16n8k64",
+ "m16n8k128",
+ ]:
+ return True
+ raise ValueError(f"Unexpected sparse MMA block scale geometry: {geom}")
+
+
+def is_mma_sp_block_scale_variant_supported(op, kind, scale_vec_size, stype):
+ if not (
+ is_type_supported(op.a.mma_type.ptx_type)
+ and is_mma_sp_block_scale_geom_supported(op.a.geom)
+ ):
+ return False
+
+ if (
+ op.a.geom == "m16n8k128"
+ and kind == "mxf4"
+ and stype == "ue8m0"
+ and scale_vec_size in ["", ".scale_vec::2X"]
+ ):
+ return True
+
+ if (
+ op.a.geom == "m16n8k128"
+ and kind == "mxf4nvf4"
+ and stype == "ue8m0"
+ and scale_vec_size == ".scale_vec::2X"
+ ):
+ return True
+
+ if (
+ op.a.geom == "m16n8k128"
+ and kind == "mxf4nvf4"
+ and stype == "ue4m3"
+ and scale_vec_size == ".scale_vec::4X"
+ ):
+ return True
+
+ if (
+ op.a.geom == "m16n8k64"
+ and kind == "mxf8f6f4"
+ and stype == "ue8m0"
+ and scale_vec_size in ["", ".scale_vec::1X"]
+ ):
+ return True
+
+ return False
+
+
+def common_mma_sp_block_scale_test_gen(
+ params, op, intrinsic_template, instruction_template
+):
+ mma_sp_block_scale_decl_template = """
+declare ${ret_ty} @${intrinsic}(
+ ${args});
+"""
+
+ mma_sp_block_scale_test_template = """
+; CHECK-LABEL: .func {{.*}}test_${function}_${selector}(
+define ${ret_ty} @test_${function}_${selector}(
+ ${args}) {
+; CHECK: ${instruction}
+; CHECK-NEXT: ${check_d}
+; CHECK-NEXT: ${check_a}
+; CHECK-NEXT: ${check_b}
+; CHECK-NEXT: ${check_c}
+; CHECK-NEXT: ${check_metadata}
+; CHECK-NEXT: ${check_selector}
+; CHECK-NEXT: ${check_scale_a_data}
+; CHECK-NEXT: ${check_byte_id_a}
+; CHECK-NEXT: ${check_thread_id_a}
+; CHECK-NEXT: ${check_scale_b_data}
+; CHECK-NEXT: ${check_byte_id_b}
+; CHECK-NEXT: ${check_thread_id_b}
+ %r = call ${ret_ty} @${intrinsic}(
+ ${call_args});
+ ret ${ret_ty} %r;
+}
+"""
+
+ test_params = params
+ test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
+ test_params["function"] = test_params["intrinsic"].replace(".", "_")
+ test_params["instruction"] = Template(instruction_template).substitute(params)
+ test_params["ret_ty"] = make_wmma_ld_ret_ty(op.d)
+ test_params["check_a"] = check_pattern(op.a)
+ test_params["check_b"] = check_pattern(op.b)
+ test_params["check_c"] = check_pattern(op.c)
+ test_params["check_d"] = check_pattern(op.d)
+ test_params["check_metadata"] = "{{%r[0-9]+}}"
+ test_params["check_scale_a_data"] = "{{%r[0-9]+}}"
+ test_params["check_byte_id_a"] = "{{%r[0-9]+}}"
+ test_params["check_thread_id_a"] = "{{%r[0-9]+}}"
+ test_params["check_scale_b_data"] = "{{%r[0-9]+}}"
+ test_params["check_byte_id_b"] = "{{%r[0-9]+}}"
+ test_params["check_thread_id_b"] = "{{%r[0-9]+}}"
+ args = ",\n ".join(
+ list(make_wmma_slice_args(frag) for frag in (op.a, op.b, op.c))
+ + ["i32 %metadata", "i32 %selector"]
+ + ["i32 %scale_a_data", "i16 %byte_id_a, i16 %thread_id_a"]
+ + ["i32 %scale_b_data", "i16 %byte_id_b, i16 %thread_id_b"]
+ )
+ test_params["args"] = args
+
+ print(Template(mma_sp_block_scale_decl_template).substitute(test_params))
+
+ for selector in [str(r) for r in sp_selector_gen(op, True)]:
+ test_params["selector"] = selector
+ test_params["check_selector"] = "{{" + test_params["selector"] + "}}"
+ test_params["call_args"] = test_params["args"].replace(
+ "%selector", test_params["selector"]
+ )
+
+ print(Template(mma_sp_block_scale_test_template).substitute(test_params))
+
+ return (test_params["intrinsic"], test_params["instruction"])
+
+
+def gen_mma_sp_block_scale_tests():
+ if not (ptx_version >= 88 and gpu_arch >= 120 and aa):
+ return []
+
+ mma_sp_block_scale_intrinsic_template = "llvm.nvvm.mma.sp.ordered.metadata.block.scale.${geom}.row.col.${kind}${scale}.${intrinsic_signature}.${stype}"
+ mma_sp_block_scale_instruction_template = "mma.sp::ordered_metadata.sync.aligned.${geom}.row.col.kind::${kind}.block_scale${scale_vec_size}.${ptx_signature}.${stype}"
+
+ generated_items = []
+
+ for op, kind, scale_vec_size, stype in product(
+ get_mma_sp_block_scale_ops(),
+ ["mxf4", "mxf4nvf4", "mxf8f6f4"],
+ ["", ".scale_vec::1X", ".scale_vec::2X", ".scale_vec::4X"],
+ ["ue8m0", "ue4m3"],
+ ):
+ if not is_mma_sp_block_scale_variant_supported(op, kind, scale_vec_size, stype):
+ continue
+
+ params = {
+ "intrinsic_signature": mma_signature(op),
+ "ptx_signature": mma_ptx_signature(op),
+ "geom": op.a.geom,
+ "kind": kind,
+ "scale_vec_size": scale_vec_size,
+ "scale": scale_vec_size.replace("_vec::", ".").lower(),
+ "stype": stype,
+ }
+
+ intrinsic_template = mma_sp_block_scale_intrinsic_template
+ instruction_template = mma_sp_block_scale_instruction_template
+
+ generated_items.append(
+ common_mma_sp_block_scale_test_gen(
+ params, op, intrinsic_template, instruction_template
+ )
+ )
+
+ return generated_items
+
+
# Append complete list of intrinsics and instructions we've generated tests for.
# Generate a set of checks to verify that we did generate a sensible set of
# tests for the given combination of PTX and SM variants.
@@ -1545,7 +1875,9 @@ def gen_tests():
items += gen_stmatrix_tests()
items += gen_wmma_mma_tests()
items += gen_mma_tests()
+ items += gen_mma_block_scale_tests()
items += gen_mma_sp_tests()
+ items += gen_mma_sp_block_scale_tests()
gen_check_unsupported_ops(items)