Diffstat (limited to 'llvm/include/llvm/IR/IntrinsicsNVVM.td')
-rw-r--r-- | llvm/include/llvm/IR/IntrinsicsNVVM.td | 1513 |
1 file changed, 856 insertions, 657 deletions
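For orientation, the header of the patch below lists how the legacy barrier intrinsics map onto the new unified llvm.nvvm.barrier.cta.sync.* forms. A minimal LLVM IR sketch of the first mapping, with declarations assumed from the mapping table itself (the i32 operand carries the barrier id):

  declare void @llvm.nvvm.barrier0()
  declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32)

  define void @cta_sync() {
    ; legacy spelling, upgraded to the unified form
    call void @llvm.nvvm.barrier0()
    ; equivalent replacement: aligned CTA-wide sync on barrier id 0
    call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
    ret void
  }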
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 967d166..7b40841 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -128,15 +128,16 @@ // * llvm.nvvm.swap.lo.hi.b64 --> llvm.fshl(x, x, 32) // * llvm.nvvm.atomic.load.inc.32 --> atomicrmw uinc_wrap // * llvm.nvvm.atomic.load.dec.32 --> atomicrmw udec_wrap -// * llvm.nvvm.barrier0 --> llvm.nvvm.barrier.cta.sync.aligned.all(0) -// * llvm.nvvm.barrier.n --> llvm.nvvm.barrier.cta.sync.aligned.all(x) -// * llvm.nvvm.bar.sync --> llvm.nvvm.barrier.cta.sync.aligned.all(x) -// * llvm.nvvm.barrier --> llvm.nvvm.barrier.cta.sync.aligned(x, y) -// * llvm.nvvm.barrier.sync --> llvm.nvvm.barrier.cta.sync.all(x) -// * llvm.nvvm.barrier.sync.cnt --> llvm.nvvm.barrier.cta.sync(x, y) +// * llvm.nvvm.barrier0 --> llvm.nvvm.barrier.cta.sync.aligned.all(0) +// * llvm.nvvm.barrier.n --> llvm.nvvm.barrier.cta.sync.aligned.all(x) +// * llvm.nvvm.bar.sync --> llvm.nvvm.barrier.cta.sync.aligned.all(x) +// * llvm.nvvm.barrier --> llvm.nvvm.barrier.cta.sync.aligned(x, y) +// * llvm.nvvm.barrier.sync --> llvm.nvvm.barrier.cta.sync.all(x) +// * llvm.nvvm.barrier.sync.cnt --> llvm.nvvm.barrier.cta.sync(x, y) def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr +def llvm_constant_ptr_ty: LLVMQualPointerType<4>; // (const)ptr def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr @@ -170,172 +171,248 @@ class StrJoin<string sep, list<string> str_list> { // Geom: m<M>n<N>k<K>. E.g. m8n32k16 // Frag: [a|b|c|d] ([x1|x2|x4] for ldmatrix) // PtxEltType: PTX type for the element. -class WMMA_REGS<string Geom, string Frag, string PtxEltType> { +class WMMA_REGS<string Geom, string Frag, string PtxEltType, bit IsSparse = false> { string geom = Geom; string frag = Frag; string ptx_elt_type = PtxEltType; string gft = Geom#":"#Frag#":"#ptx_elt_type; string gf = Geom#":"#Frag; string ft = frag#":"#ptx_elt_type; - list<LLVMType> regs = !cond( - // mma fp ops use smaller fragments than wmma fp ops - !eq(gft,"m8n8k4:a:f16") : !listsplat(llvm_v2f16_ty, 2), - !eq(gft,"m8n8k4:b:f16") : !listsplat(llvm_v2f16_ty, 2), - !eq(gft,"m16n8k8:a:f16") : !listsplat(llvm_v2f16_ty, 2), - !eq(gft,"m16n8k8:b:f16") : [llvm_v2f16_ty], - !eq(gft,"m16n8k8:c:f16") : !listsplat(llvm_v2f16_ty, 2), - !eq(gft,"m16n8k8:d:f16") : !listsplat(llvm_v2f16_ty, 2), - !eq(gft,"m16n8k8:c:f32") : !listsplat(llvm_float_ty, 4), - !eq(gft,"m16n8k8:d:f32") : !listsplat(llvm_float_ty, 4), - !eq(gft,"m16n8k16:a:f16") : !listsplat(llvm_v2f16_ty, 4), - !eq(gft,"m16n8k16:b:f16") : !listsplat(llvm_v2f16_ty, 2), - !eq(gft,"m16n8k16:c:f16") : !listsplat(llvm_v2f16_ty, 2), - !eq(gft,"m16n8k16:d:f16") : !listsplat(llvm_v2f16_ty, 2), - !eq(gft,"m16n8k16:c:f32") : !listsplat(llvm_float_ty, 4), - !eq(gft,"m16n8k16:d:f32") : !listsplat(llvm_float_ty, 4), - !eq(gft,"m16n8k4:c:f32") : !listsplat(llvm_float_ty, 4), - !eq(gft,"m16n8k4:d:f32") : !listsplat(llvm_float_ty, 4), - - // wmma fp16 -> fp16/fp32 @ m16n16k16/m8n32k16/m32n8k16 - // All other supported geometries use the same fragment format for f32 and - // f16, so we only need to consider {fragment, type}. 
- !eq(ft,"a:f16") : !listsplat(llvm_v2f16_ty, 8), - !eq(ft,"b:f16") : !listsplat(llvm_v2f16_ty, 8), - !eq(ft,"c:f16") : !listsplat(llvm_v2f16_ty, 4), - !eq(ft,"d:f16") : !listsplat(llvm_v2f16_ty, 4), - !eq(ft,"c:f32") : !listsplat(llvm_float_ty, 8), - !eq(ft,"d:f32") : !listsplat(llvm_float_ty, 8), - - // wmma tf32 -> s32 @ m16n16k8 - !eq(gft,"m16n16k8:a:tf32") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m16n16k8:b:tf32") : !listsplat(llvm_i32_ty, 4), - - // mma tf32 -> s32 @ m16n16k8/m16n8k8 - !eq(gft,"m16n8k4:a:tf32") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n8k4:b:tf32") : [llvm_i32_ty], - !eq(gft,"m16n8k8:a:tf32") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m16n8k8:b:tf32") : !listsplat(llvm_i32_ty, 2), - - !eq(gft,"m8n8k4:a:f64") : [llvm_double_ty], - !eq(gft,"m8n8k4:b:f64") : [llvm_double_ty], - !eq(gft,"m8n8k4:c:f64") : !listsplat(llvm_double_ty, 2), - !eq(gft,"m8n8k4:d:f64") : !listsplat(llvm_double_ty, 2), - - // wmma bf16 -> s32 @ m16n16k16/m8n32k16/m32n8k16 - !eq(gft,"m16n16k16:a:bf16") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m16n16k16:b:bf16") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m8n32k16:a:bf16") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m8n32k16:b:bf16") : !listsplat(llvm_i32_ty, 8), - !eq(gft,"m32n8k16:a:bf16") : !listsplat(llvm_i32_ty, 8), - !eq(gft,"m32n8k16:b:bf16") : !listsplat(llvm_i32_ty, 2), - - // mma bf16 -> s32 @ m16n8k16/m16n8k8 - !eq(gft,"m16n8k16:a:bf16") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m16n8k16:b:bf16") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n8k8:a:bf16") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n8k8:b:bf16") : [llvm_i32_ty], - - // wmma u8/s8 -> s32 @ m16n16k16/m8n32k16/m32n8k16 - !eq(gft,"m16n16k16:a:u8") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n16k16:a:s8") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n16k16:b:u8") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n16k16:b:s8") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n16k16:c:s32") : !listsplat(llvm_i32_ty, 8), - !eq(gft,"m16n16k16:d:s32") : !listsplat(llvm_i32_ty, 8), - - !eq(gft,"m8n32k16:a:u8") : [llvm_i32_ty], - !eq(gft,"m8n32k16:a:s8") : [llvm_i32_ty], - !eq(gft,"m8n32k16:b:u8") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m8n32k16:b:s8") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m8n32k16:c:s32") : !listsplat(llvm_i32_ty, 8), - !eq(gft,"m8n32k16:d:s32") : !listsplat(llvm_i32_ty, 8), - - !eq(gft,"m32n8k16:a:u8") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m32n8k16:a:s8") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m32n8k16:b:u8") : [llvm_i32_ty], - !eq(gft,"m32n8k16:b:s8") : [llvm_i32_ty], - !eq(gft,"m32n8k16:c:s32") : !listsplat(llvm_i32_ty, 8), - !eq(gft,"m32n8k16:d:s32") : !listsplat(llvm_i32_ty, 8), - - // mma u8/s8 -> s32 @ m8n8k16/m16n8k16/m16n8k32 - !eq(gft,"m8n8k16:a:u8") : [llvm_i32_ty], - !eq(gft,"m8n8k16:a:s8") : [llvm_i32_ty], - !eq(gft,"m8n8k16:b:u8") : [llvm_i32_ty], - !eq(gft,"m8n8k16:b:s8") : [llvm_i32_ty], - !eq(gft,"m8n8k16:c:s32") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m8n8k16:d:s32") : !listsplat(llvm_i32_ty, 2), - - !eq(gft,"m16n8k16:a:u8") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n8k16:a:s8") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n8k16:b:u8") : [llvm_i32_ty], - !eq(gft,"m16n8k16:b:s8") : [llvm_i32_ty], - !eq(gft,"m16n8k16:c:s32") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m16n8k16:d:s32") : !listsplat(llvm_i32_ty, 4), - - !eq(gft,"m16n8k32:a:u8") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m16n8k32:a:s8") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m16n8k32:b:u8") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n8k32:b:s8") : !listsplat(llvm_i32_ty, 2), - 
!eq(gft,"m16n8k32:c:s32") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m16n8k32:d:s32") : !listsplat(llvm_i32_ty, 4), - - // wmma/mma u4/s4 -> s32 @ m8n8k32 (u4/s4) - !eq(gft,"m8n8k32:a:u4") : [llvm_i32_ty], - !eq(gft,"m8n8k32:a:s4") : [llvm_i32_ty], - !eq(gft,"m8n8k32:b:u4") : [llvm_i32_ty], - !eq(gft,"m8n8k32:b:s4") : [llvm_i32_ty], - !eq(gft,"m8n8k32:c:s32") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m8n8k32:d:s32") : !listsplat(llvm_i32_ty, 2), - - !eq(gft,"m16n8k32:a:u4") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n8k32:a:s4") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n8k32:b:u4") : [llvm_i32_ty], - !eq(gft,"m16n8k32:b:s4") : [llvm_i32_ty], - !eq(gft,"m16n8k32:c:s32") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m16n8k32:d:s32") : !listsplat(llvm_i32_ty, 4), - - !eq(gft,"m16n8k64:a:u4") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m16n8k64:a:s4") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m16n8k64:b:u4") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n8k64:b:s4") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n8k64:c:s32") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m16n8k64:d:s32") : !listsplat(llvm_i32_ty, 4), - - // wmma/mma b1 -> s32 @ m8n8k128(b1) - !eq(gft,"m8n8k128:a:b1") : [llvm_i32_ty], - !eq(gft,"m8n8k128:b:b1") : [llvm_i32_ty], - !eq(gft,"m8n8k128:c:s32") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m8n8k128:d:s32") : !listsplat(llvm_i32_ty, 2), - - !eq(gft,"m16n8k128:a:b1") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n8k128:b:b1") : [llvm_i32_ty], - !eq(gft,"m16n8k128:c:s32") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m16n8k128:d:s32") : !listsplat(llvm_i32_ty, 4), - - !eq(gft,"m16n8k256:a:b1") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m16n8k256:b:b1") : !listsplat(llvm_i32_ty, 2), - !eq(gft,"m16n8k256:c:s32") : !listsplat(llvm_i32_ty, 4), - !eq(gft,"m16n8k256:d:s32") : !listsplat(llvm_i32_ty, 4), - - // ldmatrix b16 -> s32 @ m8n8 - !eq(gf,"m8n8:x1") : !listsplat(llvm_i32_ty, 1), - !eq(gf,"m8n8:x2") : !listsplat(llvm_i32_ty, 2), - !eq(gf,"m8n8:x4") : !listsplat(llvm_i32_ty, 4), - - // ldmatrix b8, b8x16.b6x16_p32, b8x16.b4x16_p64 -> s32 @ m16n16 - !eq(gf,"m16n16:x1") : !listsplat(llvm_i32_ty, 2), - !eq(gf,"m16n16:x2") : !listsplat(llvm_i32_ty, 4), - - // ldmatrix b8x16.b6x16_p32, b8x16.b4x16_p64 -> s32 @ m8n16 - !eq(gf,"m8n16:x1") : !listsplat(llvm_i32_ty, 1), - !eq(gf,"m8n16:x2") : !listsplat(llvm_i32_ty, 2), - !eq(gf,"m8n16:x4") : !listsplat(llvm_i32_ty, 4), - - // stmatrix b8 -> s32 @ m16n8 - !eq(gf,"m16n8:x1") : !listsplat(llvm_i32_ty, 1), - !eq(gf,"m16n8:x2") : !listsplat(llvm_i32_ty, 2), - !eq(gf,"m16n8:x4") : !listsplat(llvm_i32_ty, 4), - + list<LLVMType> regs = !if(!eq(IsSparse, true), + !cond( + // mma sparse ops use other fragments for some arguments + !eq(gft,"m16n8k16:a:bf16") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k16:a:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m16n8k16:b:bf16") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k16:b:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m16n8k16:c:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m16n8k16:c:f32") : !listsplat(llvm_float_ty, 4), + !eq(gft,"m16n8k16:d:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m16n8k16:d:f32") : !listsplat(llvm_float_ty, 4), + + !eq(gft,"m16n8k32:a:bf16") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k32:a:f16") : !listsplat(llvm_v2f16_ty, 4), + !eq(gft,"m16n8k32:b:bf16") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k32:b:f16") : !listsplat(llvm_v2f16_ty, 4), + !eq(gft,"m16n8k32:c:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m16n8k32:c:f32") : !listsplat(llvm_float_ty, 4), + 
!eq(gft,"m16n8k32:d:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m16n8k32:d:f32") : !listsplat(llvm_float_ty, 4), + + !eq(gft,"m16n8k16:a:tf32") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k16:b:tf32") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k16:c:tf32") : !listsplat(llvm_float_ty, 4), + !eq(gft,"m16n8k16:d:tf32") : !listsplat(llvm_float_ty, 4), + + !eq(gft,"m16n8k8:a:tf32") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k8:b:tf32") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k8:c:f32") : !listsplat(llvm_float_ty, 4), + !eq(gft,"m16n8k8:d:f32") : !listsplat(llvm_float_ty, 4), + + !eq(gft,"m16n8k32:a:u8") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k32:a:s8") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k32:b:u8") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k32:b:s8") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k32:c:s32") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k32:d:s32") : !listsplat(llvm_i32_ty, 4), + + !eq(gft,"m16n8k64:a:u8") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:a:s8") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:a:e4m3") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:a:e5m2") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:a:e3m2") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:a:e2m3") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:a:e2m1") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:b:u8") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:b:s8") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:b:e4m3") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:b:e5m2") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:b:e3m2") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:b:e2m3") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:b:e2m1") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:c:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m16n8k64:c:f32") : !listsplat(llvm_float_ty, 4), + !eq(gft,"m16n8k64:d:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m16n8k64:d:f32") : !listsplat(llvm_float_ty, 4), + + !eq(gft,"m16n8k64:a:u4") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k64:a:s4") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k64:b:u4") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k64:b:s4") : !listsplat(llvm_i32_ty, 2), + + !eq(gft,"m16n8k64:c:s32") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:d:s32") : !listsplat(llvm_i32_ty, 4), + + !eq(gft,"m16n8k128:a:u4") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k128:a:s4") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k128:a:e2m1") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k128:b:u4") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k128:b:s4") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k128:b:e2m1") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k128:c:s32") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k128:c:f32") : !listsplat(llvm_float_ty, 4), + !eq(gft,"m16n8k128:d:s32") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k128:d:f32") : !listsplat(llvm_float_ty, 4), + ), + !cond( + // mma fp ops use smaller fragments than wmma fp ops + !eq(gft,"m8n8k4:a:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m8n8k4:b:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m16n8k8:a:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m16n8k8:b:f16") : [llvm_v2f16_ty], + !eq(gft,"m16n8k8:c:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m16n8k8:d:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m16n8k8:c:f32") : !listsplat(llvm_float_ty, 4), + !eq(gft,"m16n8k8:d:f32") : !listsplat(llvm_float_ty, 4), + !eq(gft,"m16n8k16:a:f16") : !listsplat(llvm_v2f16_ty, 4), + 
!eq(gft,"m16n8k16:b:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m16n8k16:c:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m16n8k16:d:f16") : !listsplat(llvm_v2f16_ty, 2), + !eq(gft,"m16n8k16:c:f32") : !listsplat(llvm_float_ty, 4), + !eq(gft,"m16n8k16:d:f32") : !listsplat(llvm_float_ty, 4), + !eq(gft,"m16n8k4:c:f32") : !listsplat(llvm_float_ty, 4), + !eq(gft,"m16n8k4:d:f32") : !listsplat(llvm_float_ty, 4), + + // wmma fp16 -> fp16/fp32 @ m16n16k16/m8n32k16/m32n8k16 + // All other supported geometries use the same fragment format for f32 and + // f16, so we only need to consider {fragment, type}. + !eq(ft,"a:f16") : !listsplat(llvm_v2f16_ty, 8), + !eq(ft,"b:f16") : !listsplat(llvm_v2f16_ty, 8), + !eq(ft,"c:f16") : !listsplat(llvm_v2f16_ty, 4), + !eq(ft,"d:f16") : !listsplat(llvm_v2f16_ty, 4), + !eq(ft,"c:f32") : !listsplat(llvm_float_ty, 8), + !eq(ft,"d:f32") : !listsplat(llvm_float_ty, 8), + + // wmma tf32 -> s32 @ m16n16k8 + !eq(gft,"m16n16k8:a:tf32") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n16k8:b:tf32") : !listsplat(llvm_i32_ty, 4), + + // mma tf32 -> s32 @ m16n16k8/m16n8k8 + !eq(gft,"m16n8k4:a:tf32") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k4:b:tf32") : [llvm_i32_ty], + !eq(gft,"m16n8k8:a:tf32") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k8:b:tf32") : !listsplat(llvm_i32_ty, 2), + + !eq(gft,"m8n8k4:a:f64") : [llvm_double_ty], + !eq(gft,"m8n8k4:b:f64") : [llvm_double_ty], + !eq(gft,"m8n8k4:c:f64") : !listsplat(llvm_double_ty, 2), + !eq(gft,"m8n8k4:d:f64") : !listsplat(llvm_double_ty, 2), + + // wmma bf16 -> s32 @ m16n16k16/m8n32k16/m32n8k16 + !eq(gft,"m16n16k16:a:bf16") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n16k16:b:bf16") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m8n32k16:a:bf16") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m8n32k16:b:bf16") : !listsplat(llvm_i32_ty, 8), + !eq(gft,"m32n8k16:a:bf16") : !listsplat(llvm_i32_ty, 8), + !eq(gft,"m32n8k16:b:bf16") : !listsplat(llvm_i32_ty, 2), + + // mma bf16 -> s32 @ m16n8k16/m16n8k8 + !eq(gft,"m16n8k16:a:bf16") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k16:b:bf16") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k8:a:bf16") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k8:b:bf16") : [llvm_i32_ty], + + // wmma u8/s8 -> s32 @ m16n16k16/m8n32k16/m32n8k16 + !eq(gft,"m16n16k16:a:u8") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n16k16:a:s8") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n16k16:b:u8") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n16k16:b:s8") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n16k16:c:s32") : !listsplat(llvm_i32_ty, 8), + !eq(gft,"m16n16k16:d:s32") : !listsplat(llvm_i32_ty, 8), + + !eq(gft,"m8n32k16:a:u8") : [llvm_i32_ty], + !eq(gft,"m8n32k16:a:s8") : [llvm_i32_ty], + !eq(gft,"m8n32k16:b:u8") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m8n32k16:b:s8") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m8n32k16:c:s32") : !listsplat(llvm_i32_ty, 8), + !eq(gft,"m8n32k16:d:s32") : !listsplat(llvm_i32_ty, 8), + + !eq(gft,"m32n8k16:a:u8") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m32n8k16:a:s8") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m32n8k16:b:u8") : [llvm_i32_ty], + !eq(gft,"m32n8k16:b:s8") : [llvm_i32_ty], + !eq(gft,"m32n8k16:c:s32") : !listsplat(llvm_i32_ty, 8), + !eq(gft,"m32n8k16:d:s32") : !listsplat(llvm_i32_ty, 8), + + // mma u8/s8 -> s32 @ m8n8k16/m16n8k16/m16n8k32 + !eq(gft,"m8n8k16:a:u8") : [llvm_i32_ty], + !eq(gft,"m8n8k16:a:s8") : [llvm_i32_ty], + !eq(gft,"m8n8k16:b:u8") : [llvm_i32_ty], + !eq(gft,"m8n8k16:b:s8") : [llvm_i32_ty], + !eq(gft,"m8n8k16:c:s32") : !listsplat(llvm_i32_ty, 2), + 
!eq(gft,"m8n8k16:d:s32") : !listsplat(llvm_i32_ty, 2), + + !eq(gft,"m16n8k16:a:u8") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k16:a:s8") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k16:b:u8") : [llvm_i32_ty], + !eq(gft,"m16n8k16:b:s8") : [llvm_i32_ty], + !eq(gft,"m16n8k16:c:s32") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k16:d:s32") : !listsplat(llvm_i32_ty, 4), + + !eq(gft,"m16n8k32:a:u8") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k32:a:s8") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k32:b:u8") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k32:b:s8") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k32:c:s32") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k32:d:s32") : !listsplat(llvm_i32_ty, 4), + + // wmma/mma u4/s4 -> s32 @ m8n8k32 (u4/s4) + !eq(gft,"m8n8k32:a:u4") : [llvm_i32_ty], + !eq(gft,"m8n8k32:a:s4") : [llvm_i32_ty], + !eq(gft,"m8n8k32:b:u4") : [llvm_i32_ty], + !eq(gft,"m8n8k32:b:s4") : [llvm_i32_ty], + !eq(gft,"m8n8k32:c:s32") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m8n8k32:d:s32") : !listsplat(llvm_i32_ty, 2), + + !eq(gft,"m16n8k32:a:u4") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k32:a:s4") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k32:b:u4") : [llvm_i32_ty], + !eq(gft,"m16n8k32:b:s4") : [llvm_i32_ty], + !eq(gft,"m16n8k32:c:s32") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k32:d:s32") : !listsplat(llvm_i32_ty, 4), + + !eq(gft,"m16n8k64:a:u4") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:a:s4") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:b:u4") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k64:b:s4") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k64:c:s32") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k64:d:s32") : !listsplat(llvm_i32_ty, 4), + + // wmma/mma b1 -> s32 @ m8n8k128(b1) + !eq(gft,"m8n8k128:a:b1") : [llvm_i32_ty], + !eq(gft,"m8n8k128:b:b1") : [llvm_i32_ty], + !eq(gft,"m8n8k128:c:s32") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m8n8k128:d:s32") : !listsplat(llvm_i32_ty, 2), + + !eq(gft,"m16n8k128:a:b1") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k128:b:b1") : [llvm_i32_ty], + !eq(gft,"m16n8k128:c:s32") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k128:d:s32") : !listsplat(llvm_i32_ty, 4), + + !eq(gft,"m16n8k256:a:b1") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k256:b:b1") : !listsplat(llvm_i32_ty, 2), + !eq(gft,"m16n8k256:c:s32") : !listsplat(llvm_i32_ty, 4), + !eq(gft,"m16n8k256:d:s32") : !listsplat(llvm_i32_ty, 4), + + // ldmatrix b16 -> s32 @ m8n8 + !eq(gf,"m8n8:x1") : !listsplat(llvm_i32_ty, 1), + !eq(gf,"m8n8:x2") : !listsplat(llvm_i32_ty, 2), + !eq(gf,"m8n8:x4") : !listsplat(llvm_i32_ty, 4), + + // ldmatrix b8, b8x16.b6x16_p32, b8x16.b4x16_p64 -> s32 @ m16n16 + !eq(gf,"m16n16:x1") : !listsplat(llvm_i32_ty, 2), + !eq(gf,"m16n16:x2") : !listsplat(llvm_i32_ty, 4), + + // ldmatrix b8x16.b6x16_p32, b8x16.b4x16_p64 -> s32 @ m8n16 + !eq(gf,"m8n16:x1") : !listsplat(llvm_i32_ty, 1), + !eq(gf,"m8n16:x2") : !listsplat(llvm_i32_ty, 2), + !eq(gf,"m8n16:x4") : !listsplat(llvm_i32_ty, 4), + + // stmatrix b8 -> s32 @ m16n8 + !eq(gf,"m16n8:x1") : !listsplat(llvm_i32_ty, 1), + !eq(gf,"m16n8:x2") : !listsplat(llvm_i32_ty, 2), + !eq(gf,"m16n8:x4") : !listsplat(llvm_i32_ty, 4), + ) ); } @@ -362,6 +439,12 @@ class WMMA_NAME_LDST<string Op, WMMA_REGS Frag, string Layout, int WithStride> { class MMA_SIGNATURE<WMMA_REGS A, WMMA_REGS B, WMMA_REGS C, WMMA_REGS D> { list<WMMA_REGS> id_frags = !cond( + // FP8/F8F6F4 ops are identified by A,B inputs & accomulator & result type. 
+ !or(!eq(A.ptx_elt_type, "e4m3"), + !eq(A.ptx_elt_type, "e5m2"), + !eq(A.ptx_elt_type, "e3m2"), + !eq(A.ptx_elt_type, "e2m3"), + !eq(A.ptx_elt_type, "e2m1")): [D, A, B, C], // FP16 ops are identified by accumulator & result type. !eq(A.ptx_elt_type, "f16") : [D, C], // other ops are identified by input types. @@ -397,6 +480,19 @@ class MMA_NAME<string ALayout, string BLayout, int Satfinite, string b1op, # signature; } +class MMA_SP_NAME<string Metadata, string Kind, int Satfinite, + WMMA_REGS A, WMMA_REGS B, + WMMA_REGS C, WMMA_REGS D> { + string signature = MMA_SIGNATURE<A, B, C, D>.ret; + string record = "int_nvvm_mma" + # "_" # !subst("::", "_", Metadata) + # "_" # A.geom + # "_row_col" + # !if(!ne(Kind, ""), !strconcat("_", !subst("::", "_", Kind)), "") + # !if(Satfinite, "_satfinite", "") + # signature; +} + class LDMATRIX_NAME<WMMA_REGS Frag, int Trans> { string intr = "llvm.nvvm.ldmatrix.sync.aligned" # "." # Frag.geom @@ -424,21 +520,22 @@ class STMATRIX_NAME<WMMA_REGS Frag, int Trans> { // TypeN: PTX type of the corresponding fragment's element. // TypeB and TypeD may be empty if it must match that of TypeA or TypeC. class MMA_OPS<list<string> Geom, list<string> TypeA, list<string> TypeB, - list<string> TypeC, list<string> TypeD> { + list<string> TypeC, list<string> TypeD, bit IsSparse = false> { list<list<WMMA_REGS>> ret = !foldl([]<list<WMMA_REGS>>, Geom, t1, geom, !listconcat(t1, !foldl([]<list<WMMA_REGS>>, TypeA, t2, type_a, !listconcat(t2, !foldl([]<list<WMMA_REGS>>, !if(!size(TypeB), TypeB, [type_a]), t3, type_b, !listconcat(t3, !foldl([]<list<WMMA_REGS>>, TypeC, t4, type_c, !listconcat(t4, !foldl([]<list<WMMA_REGS>>, !if(!size(TypeD), TypeD, [type_c]), t5, type_d, !listconcat(t5, - [[WMMA_REGS<geom, "a", type_a>, - WMMA_REGS<geom, "b", type_b>, - WMMA_REGS<geom, "c", type_c>, - WMMA_REGS<geom, "d", type_d>]])))))))))); + [[WMMA_REGS<geom, "a", type_a, IsSparse>, + WMMA_REGS<geom, "b", type_b, IsSparse>, + WMMA_REGS<geom, "c", type_c, IsSparse>, + WMMA_REGS<geom, "d", type_d, IsSparse>]])))))))))); // Debugging aid for readable representation of the list above. 
list<list<string>> ops = !foreach(x, ret, [x[0].gft, x[1].gft, x[2].gft, x[3].gft]); } + class MMA_LDST_OPS<list<string> Geom, list<string> Frags, list<string> Types> { list<WMMA_REGS> ret = !foldl([]<WMMA_REGS>, Geom, t1, geom, !listconcat(t1, @@ -522,6 +619,30 @@ class NVVM_MMA_OPS { tf32_mma_ops, bf16_mma_ops, f64_mma_ops, fp_mma_ops, int_mma_ops, subint_mma_ops, bit_mma_ops); + list<list<WMMA_REGS>> bf16_mma_sp_ops = MMA_OPS< + ["m16n8k16", "m16n8k32"], + ["bf16"], [], ["f32"], [], true>.ret; + list<list<WMMA_REGS>> tf32_mma_sp_ops = MMA_OPS< + ["m16n8k8", "m16n8k16"], + ["tf32"], [], ["f32"], [], true>.ret; + list<list<WMMA_REGS>> fp_mma_sp_ops = MMA_OPS< + ["m16n8k16", "m16n8k32"], + ["f16"], [], ["f16", "f32"], ["f16", "f32"], true>.ret; + list<list<WMMA_REGS>> fp8_mma_sp_ops = MMA_OPS< + ["m16n8k64"], + ["e4m3", "e5m2", "e3m2", "e2m3", "e2m1"], + ["e4m3", "e5m2", "e3m2", "e2m3", "e2m1"], + ["f16", "f32"], ["f16", "f32"], true>.ret; + list<list<WMMA_REGS>> subint_mma_sp_ops = MMA_OPS< + ["m16n8k64", "m16n8k128"], + ["s4", "u4"], ["s4", "u4"], ["s32"], [], true>.ret; + list<list<WMMA_REGS>> int_mma_sp_ops = MMA_OPS< + ["m16n8k32", "m16n8k64"], + ["s8", "u8"], ["s8", "u8"], ["s32"], [], true>.ret; + list<list<WMMA_REGS>> all_mma_sp_ops = !listconcat( + bf16_mma_sp_ops, tf32_mma_sp_ops, fp_mma_sp_ops, fp8_mma_sp_ops, + subint_mma_sp_ops, int_mma_sp_ops); + list<WMMA_REGS> ldst_ab_ops = MMA_LDST_OPS< ["m16n16k16", "m32n8k16", "m8n32k16"], ["a", "b"], ["f16", "u8", "s8", "bf16"]>.ret; @@ -728,6 +849,68 @@ class NVVM_STMATRIX_SUPPORTED<WMMA_REGS frag, bit trans> { ); } + +// Returns true if this combination of layout/kind/satf for MMA.SP ops is supported; +// false otherwise. +// E.g. +// if NVVM_MMA_SP_SUPPORTED<...>.ret then +// def : FOO<>; // The record will only be defined for supported ops. +// +class NVVM_MMA_SP_SUPPORTED<list<WMMA_REGS> frags, string metadata, + string kind, int satf> { + // MMA.SP ops check both layouts. + string a_type = frags[0].ptx_elt_type; + string b_type = frags[1].ptx_elt_type; + string c_type = frags[2].ptx_elt_type; + string d_type = frags[3].ptx_elt_type; + string geom = frags[0].geom; + + bit is_int = !or(!eq(a_type, "s8"), + !eq(a_type, "u8"), + !eq(a_type, "s4"), + !eq(a_type, "u4")); + + bit ret = !cond( + + // Limit satf to valid types + !and(!eq(satf, 1), + !eq(is_int, 0)): false, + + // f16/bf16/tf32 require A and B to be the same type. + !and(!or(!eq(a_type, "f16"), + !eq(a_type, "bf16"), + !eq(a_type, "tf32")), + !ne(a_type, b_type)): false, + + // m16n8k16 and m16n8k32 require C and D to be the same type. + !and(!or(!eq(geom, "m16n8k16"), + !eq(geom, "m16n8k32")), + !ne(c_type, d_type)): false, + + !and(!eq(kind, ""), + !or(!eq(a_type, "e3m2"), + !eq(a_type, "e2m3"), + !eq(a_type, "e2m1"), + !eq(b_type, "e3m2"), + !eq(b_type, "e2m3"), + !eq(b_type, "e2m1"))): false, + + !and(!eq(kind, ""), + !eq(geom, "m16n8k64"), + !or(!eq(c_type, "f16"), + !eq(d_type, "f16"))): false, + + !and(!ne(kind, ""), + !or(!eq(metadata, "sp"), + !ne(geom, "m16n8k64"), + !eq(is_int, 1))): false, + + // All others are OK.
+ true: true + ); +} + + class SHFL_INFO<bit sync, string mode, string type, bit return_pred> { string Suffix = !if(sync, "sync_", "") # mode # "_" @@ -792,38 +975,49 @@ class NVVMBuiltin : "NVVMBuiltin must be a NVVM intrinsic starting with 'int_nvvm_'"; } +class PureIntrinsic<list<LLVMType> ret_types, + list<LLVMType> param_types = [], + list<IntrinsicProperty> intr_properties = [], + string name = ""> : + DefaultAttrsIntrinsic<ret_types, param_types, + intr_properties # [IntrNoMem, IntrSpeculatable], name> {} + let TargetPrefix = "nvvm" in { + // // PRMT - permute - - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - def int_nvvm_prmt : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; - - foreach mode = ["f4e", "b4e"] in - def int_nvvm_prmt_ # mode : - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; - - // Note: these variants also have 2 source operands but only one will ever - // be used so we eliminate the other operand in the IR (0 is used as the - // placeholder in the backend). - foreach mode = ["rc8", "ecl", "ecr", "rc16"] in - def int_nvvm_prmt_ # mode : - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>; - } - + // + def int_nvvm_prmt : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; + + foreach mode = ["f4e", "b4e"] in + def int_nvvm_prmt_ # mode : + PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; + + // Note: these variants also have 2 source operands but only one will ever + // be used so we eliminate the other operand in the IR (0 is used as the + // placeholder in the backend). + foreach mode = ["rc8", "ecl", "ecr", "rc16"] in + def int_nvvm_prmt_ # mode : + PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>; + + // + // Nanosleep + // def int_nvvm_nanosleep : NVVMBuiltin, DefaultAttrsIntrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoMem, IntrHasSideEffects]>; + // // Performance Monitor Events (pm events) intrinsics + // def int_nvvm_pm_event_mask : NVVMBuiltin, DefaultAttrsIntrinsic<[], [llvm_i16_ty], [IntrConvergent, IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>; -// -// Min Max -// + // + // Min Max + // let IntrProperties = [IntrNoMem, IntrSpeculatable, Commutative] in { foreach operation = ["min", "max"] in { def int_nvvm_f # operation # _d : NVVMBuiltin, @@ -852,9 +1046,9 @@ let TargetPrefix = "nvvm" in { } // operation } -// -// Multiplication -// + // + // Multiplication + // let IntrProperties = [IntrNoMem, IntrSpeculatable, Commutative] in { foreach sign = ["", "u"] in { def int_nvvm_mulhi_ # sign # s : NVVMBuiltin, @@ -880,9 +1074,9 @@ let TargetPrefix = "nvvm" in { } } -// -// Div -// + // + // Div + // let IntrProperties = [IntrNoMem] in { foreach ftz = ["", "_ftz"] in { def int_nvvm_div_approx # ftz # _f : NVVMBuiltin, @@ -902,90 +1096,79 @@ let TargetPrefix = "nvvm" in { } } -// -// Sad -// - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - foreach sign = ["", "u"] in { - def int_nvvm_sad_ # sign # s : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty]>; + // + // Sad - Sum of Absolute Differences + // + foreach sign = ["", "u"] in { + def int_nvvm_sad_ # sign # s : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty]>; - def int_nvvm_sad_ # sign # i : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; + def int_nvvm_sad_ # sign # i : NVVMBuiltin, + 
PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; - def int_nvvm_sad_ # sign # ll : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty]>; - } + def int_nvvm_sad_ # sign # ll : NVVMBuiltin, + PureIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty]>; } -// -// Floor Ceil -// - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - foreach op = ["floor", "ceil"] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_ # op # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - def int_nvvm_ # op # _d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; - } + // + // Floor Ceil + // + foreach op = ["floor", "ceil"] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_ # op # ftz # _f : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], [llvm_float_ty]>; + def int_nvvm_ # op # _d : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], [llvm_double_ty]>; } -// -// Abs -// + // + // Abs + // foreach ftz = ["", "_ftz"] in def int_nvvm_fabs # ftz : - DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + PureIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; -// -// Abs, Neg bf16, bf16x2 -// + // + // Neg bf16, bf16x2 + // def int_nvvm_neg_bf16 : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_bfloat_ty], [IntrNoMem]>; + PureIntrinsic<[llvm_bfloat_ty], [llvm_bfloat_ty]>; def int_nvvm_neg_bf16x2 : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2bf16_ty], [llvm_v2bf16_ty], [IntrNoMem]>; + PureIntrinsic<[llvm_v2bf16_ty], [llvm_v2bf16_ty]>; -// -// Round -// - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_round # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; + // + // Round + // + foreach ftz = ["", "_ftz"] in + def int_nvvm_round # ftz # _f : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - def int_nvvm_round_d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; - } + def int_nvvm_round_d : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], [llvm_double_ty]>; -// -// Trunc -// - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_trunc # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; + // + // Trunc + // + foreach ftz = ["", "_ftz"] in + def int_nvvm_trunc # ftz # _f : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - def int_nvvm_trunc_d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; - } + def int_nvvm_trunc_d : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], [llvm_double_ty]>; -// -// Saturate -// - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_saturate # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; + // + // Saturate + // + foreach ftz = ["", "_ftz"] in + def int_nvvm_saturate # ftz # _f : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - def int_nvvm_saturate_d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; - } + def int_nvvm_saturate_d : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], [llvm_double_ty]>; -// -// Exp2 Log2 -// + // + // Exp2 Log2 + // let IntrProperties = [IntrNoMem] in { foreach ftz = ["", "_ftz"] in def int_nvvm_ex2_approx # ftz # _f : NVVMBuiltin, @@ -1006,53 +1189,51 @@ let TargetPrefix = "nvvm" in { 
DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; } -// -// Sin Cos -// + // + // Sin Cos + // foreach op = ["sin", "cos"] in foreach ftz = ["", "_ftz"] in def int_nvvm_ # op # _approx # ftz # _f : NVVMBuiltin, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; -// -// Fma -// - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - foreach variant = ["", "_sat", "_relu"] in { - foreach ftz = ["", "_ftz"] in { - def int_nvvm_fma_rn # ftz # variant # _f16 : - DefaultAttrsIntrinsic<[llvm_half_ty], - [llvm_half_ty, llvm_half_ty, llvm_half_ty]>; - - def int_nvvm_fma_rn # ftz # variant # _f16x2 : - DefaultAttrsIntrinsic<[llvm_v2f16_ty], - [llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty]>; - - def int_nvvm_fma_rn # ftz # variant # _bf16 : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_bfloat_ty], - [llvm_bfloat_ty, llvm_bfloat_ty, llvm_bfloat_ty]>; - - def int_nvvm_fma_rn # ftz # variant # _bf16x2 : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2bf16_ty], - [llvm_v2bf16_ty, llvm_v2bf16_ty, llvm_v2bf16_ty]>; - } // ftz - } // variant + // + // Fma + // + foreach variant = ["", "_sat", "_relu"] in { + foreach ftz = ["", "_ftz"] in { + def int_nvvm_fma_rn # ftz # variant # _f16 : + PureIntrinsic<[llvm_half_ty], + [llvm_half_ty, llvm_half_ty, llvm_half_ty]>; - foreach rnd = ["rn", "rz", "rm", "rp"] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_fma_ # rnd # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], - [llvm_float_ty, llvm_float_ty, llvm_float_ty]>; + def int_nvvm_fma_rn # ftz # variant # _f16x2 : + PureIntrinsic<[llvm_v2f16_ty], + [llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty]>; - def int_nvvm_fma_ # rnd # _d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], - [llvm_double_ty, llvm_double_ty, llvm_double_ty]>; - } + def int_nvvm_fma_rn # ftz # variant # _bf16 : NVVMBuiltin, + PureIntrinsic<[llvm_bfloat_ty], + [llvm_bfloat_ty, llvm_bfloat_ty, llvm_bfloat_ty]>; + + def int_nvvm_fma_rn # ftz # variant # _bf16x2 : NVVMBuiltin, + PureIntrinsic<[llvm_v2bf16_ty], + [llvm_v2bf16_ty, llvm_v2bf16_ty, llvm_v2bf16_ty]>; + } // ftz + } // variant + + foreach rnd = ["rn", "rz", "rm", "rp"] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_fma_ # rnd # ftz # _f : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], + [llvm_float_ty, llvm_float_ty, llvm_float_ty]>; + + def int_nvvm_fma_ # rnd # _d : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], + [llvm_double_ty, llvm_double_ty, llvm_double_ty]>; } -// -// Rcp -// + // + // Rcp + // let IntrProperties = [IntrNoMem] in { foreach rnd = ["rn", "rz", "rm", "rp"] in { foreach ftz = ["", "_ftz"] in @@ -1069,9 +1250,9 @@ let TargetPrefix = "nvvm" in { DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; } -// -// Sqrt -// + // + // Sqrt + // let IntrProperties = [IntrNoMem] in { foreach rnd = ["rn", "rz", "rm", "rp"] in { foreach ftz = ["", "_ftz"] in @@ -1090,9 +1271,9 @@ let TargetPrefix = "nvvm" in { DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; } -// -// Rsqrt -// + // + // Rsqrt + // let IntrProperties = [IntrNoMem] in { foreach ftz = ["", "_ftz"] in { def int_nvvm_rsqrt_approx # ftz # _f : NVVMBuiltin, @@ -1102,208 +1283,206 @@ let TargetPrefix = "nvvm" in { } } -// -// Add -// + // + // Add + // let IntrProperties = [IntrNoMem, IntrSpeculatable, Commutative] in { foreach rnd = ["rn", "rz", "rm", "rp"] in { foreach ftz = ["", "_ftz"] in def int_nvvm_add_ # rnd # ftz # _f : NVVMBuiltin, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty]>; - def int_nvvm_add_ # rnd # _d : 
NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty]>; + def int_nvvm_add_ # rnd # _d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty]>; } } -// -// Dot Product -// + // + // Dot Product + // foreach a_type = ["s", "u"] in { foreach b_type = ["s", "u"] in { def int_nvvm_idp4a_ # a_type # _ # b_type : - DefaultAttrsIntrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable]>; + PureIntrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; def int_nvvm_idp2a_ # a_type # _ # b_type : - DefaultAttrsIntrinsic<[llvm_i32_ty], + PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]>; + [ImmArg<ArgIndex<2>>]>; } } -// -// Funnel-shift -// + // + // Funnel-shift + // foreach direction = ["l", "r"] in def int_nvvm_fsh # direction # _clamp : - DefaultAttrsIntrinsic<[llvm_anyint_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + PureIntrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; -// -// FLO - Find Leading One -// + // + // FLO - Find Leading One + // foreach sign = ["s", "u"] in def int_nvvm_flo_ # sign : - DefaultAttrsIntrinsic<[llvm_i32_ty], - [llvm_anyint_ty, llvm_i1_ty], - [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>; + PureIntrinsic<[llvm_i32_ty], [llvm_anyint_ty, llvm_i1_ty], + [ImmArg<ArgIndex<1>>]>; -// -// szext -// + // + // szext + // foreach ext = ["sext", "zext"] in foreach mode = ["wrap", "clamp"] in def int_nvvm_ # ext # _ # mode : - DefaultAttrsIntrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable]>; + PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>; -// -// BMSK - bit mask -// + // + // BMSK - bit mask + // foreach mode = ["wrap", "clamp"] in def int_nvvm_bmsk_ # mode : - DefaultAttrsIntrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable]>; + PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>; -// -// Convert -// - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - def int_nvvm_lohi_i2d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty, llvm_i32_ty]>; - - def int_nvvm_d2i_lo : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty]>; - def int_nvvm_d2i_hi : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty]>; - - foreach rnd = ["rn", "rz", "rm", "rp"] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_d2f_ # rnd # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty]>; + // + // FNS - Find the n-th set bit + // + def int_nvvm_fns : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; + + // + // Convert + // + // TODO: All these intrinsics are defined as PureIntrinsic, which attaches the + // IntrSpeculatable property to them. Consider if some of these should + // have this attribute removed as they may be too expensive.
+ // + def int_nvvm_lohi_i2d : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], [llvm_i32_ty, llvm_i32_ty]>; + + def int_nvvm_d2i_lo : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_double_ty]>; + def int_nvvm_d2i_hi : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_double_ty]>; + + foreach rnd = ["rn", "rz", "rm", "rp"] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_d2f_ # rnd # ftz : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], [llvm_double_ty]>; - foreach sign = ["", "u"] in { + foreach sign = ["", "u"] in { - def int_nvvm_d2 # sign # i_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty]>; + def int_nvvm_d2 # sign # i_ # rnd : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_double_ty]>; - def int_nvvm_ # sign # i2d_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty]>; + def int_nvvm_ # sign # i2d_ # rnd : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], [llvm_i32_ty]>; - foreach ftz = ["", "_ftz"] in - def int_nvvm_f2 # sign # i_ # rnd # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; + foreach ftz = ["", "_ftz"] in + def int_nvvm_f2 # sign # i_ # rnd # ftz : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; - def int_nvvm_ # sign # i2f_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty]>; + def int_nvvm_ # sign # i2f_ # rnd : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], [llvm_i32_ty]>; - foreach ftz = ["", "_ftz"] in - def int_nvvm_f2 # sign # ll_ # rnd # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty]>; + foreach ftz = ["", "_ftz"] in + def int_nvvm_f2 # sign # ll_ # rnd # ftz : NVVMBuiltin, + PureIntrinsic<[llvm_i64_ty], [llvm_float_ty]>; - def int_nvvm_d2 # sign # ll_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty]>; + def int_nvvm_d2 # sign # ll_ # rnd : NVVMBuiltin, + PureIntrinsic<[llvm_i64_ty], [llvm_double_ty]>; - def int_nvvm_ # sign # ll2f_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty]>; + def int_nvvm_ # sign # ll2f_ # rnd : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], [llvm_i64_ty]>; - def int_nvvm_ # sign # ll2d_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty]>; + def int_nvvm_ # sign # ll2d_ # rnd : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], [llvm_i64_ty]>; - } // sign - } // rnd + } // sign + } // rnd - foreach ftz = ["", "_ftz"] in { - def int_nvvm_f2h_rn # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty]>; + foreach ftz = ["", "_ftz"] in { + def int_nvvm_f2h_rn # ftz : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_float_ty]>; - def int_nvvm_bf2h_rn # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_bfloat_ty]>; - } + def int_nvvm_bf2h_rn # ftz : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_bfloat_ty]>; + } - foreach rnd = ["rn", "rz"] in { - foreach relu = ["", "_relu"] in { - def int_nvvm_ff2bf16x2_ # rnd # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty]>; + foreach rnd = ["rn", "rz"] in { + foreach relu = ["", "_relu"] in { + def int_nvvm_ff2bf16x2_ # rnd # relu : NVVMBuiltin, + PureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty]>; - def int_nvvm_ff2f16x2_ # rnd # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]>; + def int_nvvm_ff2f16x2_ # rnd # relu : NVVMBuiltin, + PureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]>; - def int_nvvm_f2bf16_ # rnd # relu : NVVMBuiltin, 
- DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_float_ty]>; - } + def int_nvvm_f2bf16_ # rnd # relu : NVVMBuiltin, + PureIntrinsic<[llvm_bfloat_ty], [llvm_float_ty]>; } + } - foreach satfinite = ["", "_satfinite"] in { - def int_nvvm_f2tf32_rna # satfinite : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; + foreach satfinite = ["", "_satfinite"] in { + def int_nvvm_f2tf32_rna # satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; - foreach rnd = ["rn", "rz"] in - foreach relu = ["", "_relu"] in - def int_nvvm_f2tf32_ # rnd # relu # satfinite : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; - } + foreach rnd = ["rn", "rz"] in + foreach relu = ["", "_relu"] in + def int_nvvm_f2tf32_ # rnd # relu # satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; + } - foreach type = ["e4m3x2", "e5m2x2"] in { - foreach relu = ["", "_relu"] in { - def int_nvvm_ff_to_ # type # _rn # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; + foreach type = ["e4m3x2", "e5m2x2"] in { + foreach relu = ["", "_relu"] in { + def int_nvvm_ff_to_ # type # _rn # relu : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; - def int_nvvm_f16x2_to_ # type # _rn # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty]>; + def int_nvvm_f16x2_to_ # type # _rn # relu : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty]>; - def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; - } + def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin, + PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; } + } - // FP4 conversions. - foreach relu = ["", "_relu"] in { - def int_nvvm_ff_to_e2m1x2_rn # relu # _satfinite : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; + // FP4 conversions. + foreach relu = ["", "_relu"] in { + def int_nvvm_ff_to_e2m1x2_rn # relu # _satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; - def int_nvvm_e2m1x2_to_f16x2_rn # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; - } + def int_nvvm_e2m1x2_to_f16x2_rn # relu : NVVMBuiltin, + PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; + } - // FP6 conversions. - foreach type = ["e2m3x2", "e3m2x2"] in { - foreach relu = ["", "_relu"] in { - def int_nvvm_ff_to_ # type # _rn # relu # _satfinite : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; + // FP6 conversions. + foreach type = ["e2m3x2", "e3m2x2"] in { + foreach relu = ["", "_relu"] in { + def int_nvvm_ff_to_ # type # _rn # relu # _satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; - def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; - } + def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin, + PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; } + } - // UE8M0x2 conversions. - foreach rmode = ["_rz", "_rp"] in { - foreach satmode = ["", "_satfinite"] in { - defvar suffix = rmode # satmode; - def int_nvvm_ff_to_ue8m0x2 # suffix : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; + // UE8M0x2 conversions. 
+ foreach rmode = ["_rz", "_rp"] in { + foreach satmode = ["", "_satfinite"] in { + defvar suffix = rmode # satmode; + def int_nvvm_ff_to_ue8m0x2 # suffix : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; - def int_nvvm_bf16x2_to_ue8m0x2 # suffix : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>; + def int_nvvm_bf16x2_to_ue8m0x2 # suffix : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>; - } } + } - def int_nvvm_ue8m0x2_to_bf16x2 : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty]>; - - } // IntrProperties = [IntrNoMem, IntrSpeculatable] - -// FNS - def int_nvvm_fns : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; + def int_nvvm_ue8m0x2_to_bf16x2 : NVVMBuiltin, + PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty]>; + // + // Atomic operations + // class SCOPED_ATOMIC2_impl<LLVMType elty> : Intrinsic<[elty], [llvm_anyptr_ty, LLVMMatchType<0>], @@ -1336,7 +1515,9 @@ let TargetPrefix = "nvvm" in { defm int_nvvm_atomic_and_gen_i : PTXAtomicWithScope2<llvm_anyint_ty>; defm int_nvvm_atomic_cas_gen_i : PTXAtomicWithScope3<llvm_anyint_ty>; -// Bar.Sync + // + // Bar.Sync + // def int_nvvm_barrier0_popc : ClangBuiltin<"__nvvm_bar0_popc">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>; def int_nvvm_barrier0_and : ClangBuiltin<"__nvvm_bar0_and">, @@ -1360,62 +1541,65 @@ let TargetPrefix = "nvvm" in { } } - // barrier.cluster.[wait, arrive, arrive.relaxed] - def int_nvvm_barrier_cluster_arrive : - Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>; - def int_nvvm_barrier_cluster_arrive_relaxed : - Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>; - def int_nvvm_barrier_cluster_wait : - Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>; - - // 'aligned' versions of the above barrier.cluster.* intrinsics - def int_nvvm_barrier_cluster_arrive_aligned : - Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>; - def int_nvvm_barrier_cluster_arrive_relaxed_aligned : - Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>; - def int_nvvm_barrier_cluster_wait_aligned : - Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>; + let IntrProperties = [IntrConvergent, IntrNoCallback] in { + // barrier.cluster.[wait, arrive, arrive.relaxed] + def int_nvvm_barrier_cluster_arrive : Intrinsic<[]>; + def int_nvvm_barrier_cluster_arrive_relaxed : Intrinsic<[]>; + def int_nvvm_barrier_cluster_wait : Intrinsic<[]>; + + // 'aligned' versions of the above barrier.cluster.* intrinsics + def int_nvvm_barrier_cluster_arrive_aligned : Intrinsic<[]>; + def int_nvvm_barrier_cluster_arrive_relaxed_aligned : Intrinsic<[]>; + def int_nvvm_barrier_cluster_wait_aligned : Intrinsic<[]>; + } + // // Membar - def int_nvvm_membar_cta : NVVMBuiltin, Intrinsic<[], [], [IntrNoCallback]>; - def int_nvvm_membar_gl : NVVMBuiltin, Intrinsic<[], [], [IntrNoCallback]>; - def int_nvvm_membar_sys : NVVMBuiltin, Intrinsic<[], [], [IntrNoCallback]>; - def int_nvvm_fence_sc_cluster : Intrinsic<[], [], [IntrNoCallback]>; - -// Proxy fence (uni-directional) -foreach scope = ["cta", "cluster", "gpu", "sys"] in { - - def int_nvvm_fence_proxy_tensormap_generic_release_ # scope : - Intrinsic<[], [], [IntrNoCallback], - "llvm.nvvm.fence.proxy.tensormap_generic.release." # scope>; - - // The imm-arg 'size' can only be 128. 
- def int_nvvm_fence_proxy_tensormap_generic_acquire_ # scope : - Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], - [IntrNoCallback, IntrArgMemOnly, ImmArg<ArgIndex<1>>, - Range<ArgIndex<1>, 128, 129>], - "llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope>; -} + // + let IntrProperties = [IntrNoCallback] in { + def int_nvvm_membar_cta : NVVMBuiltin, Intrinsic<[]>; + def int_nvvm_membar_gl : NVVMBuiltin, Intrinsic<[]>; + def int_nvvm_membar_sys : NVVMBuiltin, Intrinsic<[]>; + def int_nvvm_fence_sc_cluster : Intrinsic<[]>; + } + // + // Proxy fence (uni-directional) + // + foreach scope = ["cta", "cluster", "gpu", "sys"] in { + + def int_nvvm_fence_proxy_tensormap_generic_release_ # scope : + Intrinsic<[], [], [IntrNoCallback], + "llvm.nvvm.fence.proxy.tensormap_generic.release." # scope>; + + // The imm-arg 'size' can only be 128. + def int_nvvm_fence_proxy_tensormap_generic_acquire_ # scope : + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], + [IntrNoCallback, IntrArgMemOnly, ImmArg<ArgIndex<1>>, + Range<ArgIndex<1>, 128, 129>], + "llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope>; + } + +// // Async Copy +// let IntrProperties = [IntrConvergent, IntrNoCallback] in { def int_nvvm_cp_async_mbarrier_arrive : NVVMBuiltin, - Intrinsic<[],[llvm_ptr_ty]>; + Intrinsic<[], [llvm_ptr_ty]>; def int_nvvm_cp_async_mbarrier_arrive_shared : NVVMBuiltin, - Intrinsic<[],[llvm_shared_ptr_ty]>; + Intrinsic<[], [llvm_shared_ptr_ty]>; def int_nvvm_cp_async_mbarrier_arrive_noinc : NVVMBuiltin, - Intrinsic<[],[llvm_ptr_ty]>; + Intrinsic<[], [llvm_ptr_ty]>; def int_nvvm_cp_async_mbarrier_arrive_noinc_shared : NVVMBuiltin, - Intrinsic<[],[llvm_shared_ptr_ty]>; + Intrinsic<[], [llvm_shared_ptr_ty]>; } multiclass CP_ASYNC_SHARED_GLOBAL { - def NAME : Intrinsic<[], [llvm_shared_ptr_ty, llvm_global_ptr_ty], - [IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>, - WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>]>; - def _s : Intrinsic<[], [llvm_shared_ptr_ty, llvm_global_ptr_ty, llvm_i32_ty], - [IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>, - WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>]>; + let IntrProperties = [IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, + NoAlias<ArgIndex<1>>, WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>] in { + def NAME : Intrinsic<[], [llvm_shared_ptr_ty, llvm_global_ptr_ty]>; + def _s : Intrinsic<[], [llvm_shared_ptr_ty, llvm_global_ptr_ty, llvm_i32_ty]>; + } } defm int_nvvm_cp_async_ca_shared_global_4 : CP_ASYNC_SHARED_GLOBAL; @@ -1423,17 +1607,15 @@ defm int_nvvm_cp_async_ca_shared_global_8 : CP_ASYNC_SHARED_GLOBAL; defm int_nvvm_cp_async_ca_shared_global_16 : CP_ASYNC_SHARED_GLOBAL; defm int_nvvm_cp_async_cg_shared_global_16 : CP_ASYNC_SHARED_GLOBAL; -def int_nvvm_cp_async_commit_group : NVVMBuiltin, Intrinsic<[], [], []>; +def int_nvvm_cp_async_commit_group : NVVMBuiltin, Intrinsic<[]>; def int_nvvm_cp_async_wait_group : NVVMBuiltin, Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>; -def int_nvvm_cp_async_wait_all : NVVMBuiltin, - Intrinsic<[], [], []>; +def int_nvvm_cp_async_wait_all : NVVMBuiltin, Intrinsic<[]>; // cp.async.bulk variants of the commit/wait group -def int_nvvm_cp_async_bulk_commit_group : - Intrinsic<[], [], []>; +def int_nvvm_cp_async_bulk_commit_group : Intrinsic<[]>; def int_nvvm_cp_async_bulk_wait_group : Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>; @@ -1456,29 +1638,30 @@ def int_nvvm_mbarrier_inval_shared : NVVMBuiltin, [IntrConvergent, IntrWriteMem, IntrArgMemOnly, IntrNoCallback, 
WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>; -def int_nvvm_mbarrier_arrive : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_mbarrier_arrive_shared : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_mbarrier_arrive_noComplete : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_mbarrier_arrive_noComplete_shared : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty, - llvm_i32_ty], [IntrConvergent, IntrNoCallback]>; - -def int_nvvm_mbarrier_arrive_drop : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_mbarrier_arrive_drop_shared : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_mbarrier_arrive_drop_noComplete : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_mbarrier_arrive_drop_noComplete_shared : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>; - -def int_nvvm_mbarrier_test_wait : NVVMBuiltin, - Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_i64_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_mbarrier_test_wait_shared : NVVMBuiltin, - Intrinsic<[llvm_i1_ty], [llvm_shared_ptr_ty, llvm_i64_ty], [IntrConvergent, IntrNoCallback]>; +let IntrProperties = [IntrConvergent, IntrNoCallback] in { + def int_nvvm_mbarrier_arrive : NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_ptr_ty]>; + def int_nvvm_mbarrier_arrive_shared : NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty]>; + def int_nvvm_mbarrier_arrive_noComplete : NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i32_ty]>; + def int_nvvm_mbarrier_arrive_noComplete_shared : NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty, llvm_i32_ty]>; + + def int_nvvm_mbarrier_arrive_drop : NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_ptr_ty]>; + def int_nvvm_mbarrier_arrive_drop_shared : NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty]>; + def int_nvvm_mbarrier_arrive_drop_noComplete : NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i32_ty]>; + def int_nvvm_mbarrier_arrive_drop_noComplete_shared : NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty, llvm_i32_ty]>; + + def int_nvvm_mbarrier_test_wait : NVVMBuiltin, + Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_i64_ty]>; + def int_nvvm_mbarrier_test_wait_shared : NVVMBuiltin, + Intrinsic<[llvm_i1_ty], [llvm_shared_ptr_ty, llvm_i64_ty]>; +} def int_nvvm_mbarrier_pending_count : NVVMBuiltin, Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem, IntrConvergent, IntrNoCallback]>; @@ -1503,9 +1686,8 @@ let IntrProperties = [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillRetur // space when lowered during ISel. 
// def int_nvvm_internal_addrspace_wrap : - DefaultAttrsIntrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty], - [IntrNoMem, IntrSpeculatable, NoUndef<ArgIndex<0>>, - NoUndef<RetIndex>]>; + PureIntrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty], + [NoUndef<ArgIndex<0>>, NoUndef<RetIndex>]>; // Move intrinsics, used in nvvm internally @@ -1519,36 +1701,26 @@ let IntrProperties = [IntrNoMem] in { } // For getting the handle from a texture or surface variable -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - def int_nvvm_texsurf_handle - : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_anyptr_ty]>; - def int_nvvm_texsurf_handle_internal - : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; -} +def int_nvvm_texsurf_handle + : PureIntrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_anyptr_ty]>; +def int_nvvm_texsurf_handle_internal + : PureIntrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; /// Error / Warn def int_nvvm_compiler_error : Intrinsic<[], [llvm_anyptr_ty]>; def int_nvvm_compiler_warn : Intrinsic<[], [llvm_anyptr_ty]>; -def int_nvvm_reflect : NVVMBuiltin, - Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; +def int_nvvm_reflect : NVVMBuiltin, PureIntrinsic<[llvm_i32_ty], [llvm_ptr_ty]>; // isspacep.{const, global, local, shared} foreach space = ["const", "global", "local", "shared", "shared_cluster"] in def int_nvvm_isspacep_ # space : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], - [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>]>; - -// Environment register read -foreach i = 0...31 in - def int_nvvm_read_ptx_sreg_envreg # i : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [], - [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>; + PureIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [NoCapture<ArgIndex<0>>]>; // // Texture Fetch // -let IntrProperties = [IntrReadMem] in { +let IntrProperties = [IntrReadMem, IntrNoCallback, IntrNoFree, IntrWillReturn] in { foreach is_unified = [true, false] in { defvar mode = !if(is_unified, "_unified", ""); defvar addr_args = !if(is_unified, [llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty]); @@ -1557,76 +1729,63 @@ let IntrProperties = [IntrReadMem] in { foreach is_array = [true, false] in { defvar array = !if(is_array, "_array", ""); defvar array_args = !if(is_array, [llvm_i32_ty], []<LLVMType>); + defvar base_args = !listconcat(addr_args, array_args); def int_nvvm_tex # mode # _1d # array # _ # vec.Name # _s32 - : Intrinsic<vec.Types, - !listconcat(addr_args, array_args, !listsplat(llvm_i32_ty, 1))>; + : Intrinsic<vec.Types, base_args # !listsplat(llvm_i32_ty, 1)>; def int_nvvm_tex # mode # _1d # array # _ # vec.Name # _f32 - : Intrinsic<vec.Types, - !listconcat(addr_args, array_args, !listsplat(llvm_float_ty, 1))>; + : Intrinsic<vec.Types, base_args # !listsplat(llvm_float_ty, 1)>; def int_nvvm_tex # mode # _1d # array # _level_ # vec.Name # _f32 - : Intrinsic<vec.Types, - !listconcat(addr_args, array_args, !listsplat(llvm_float_ty, 2))>; + : Intrinsic<vec.Types, base_args # !listsplat(llvm_float_ty, 2)>; def int_nvvm_tex # mode # _1d # array # _grad_ # vec.Name # _f32 - : Intrinsic<vec.Types, - !listconcat(addr_args, array_args, !listsplat(llvm_float_ty, 3))>; + : Intrinsic<vec.Types, base_args # !listsplat(llvm_float_ty, 3)>; def int_nvvm_tex # mode # _2d # array # _ # vec.Name # _s32 - : Intrinsic<vec.Types, - !listconcat(addr_args, array_args, !listsplat(llvm_i32_ty, 2))>; + : Intrinsic<vec.Types, base_args # !listsplat(llvm_i32_ty, 2)>; def int_nvvm_tex # mode # _2d # array # _ # vec.Name # _f32 - : Intrinsic<vec.Types, - 
//
// Texture Fetch
//
-let IntrProperties = [IntrReadMem] in {
+let IntrProperties = [IntrReadMem, IntrNoCallback, IntrNoFree, IntrWillReturn] in {
  foreach is_unified = [true, false] in {
    defvar mode = !if(is_unified, "_unified", "");
    defvar addr_args = !if(is_unified, [llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty]);
@@ -1557,76 +1729,63 @@ let IntrProperties = [IntrReadMem] in {
      foreach is_array = [true, false] in {
        defvar array = !if(is_array, "_array", "");
        defvar array_args = !if(is_array, [llvm_i32_ty], []<LLVMType>);
+        defvar base_args = !listconcat(addr_args, array_args);

        def int_nvvm_tex # mode # _1d # array # _ # vec.Name # _s32
-            : Intrinsic<vec.Types,
-                        !listconcat(addr_args, array_args, !listsplat(llvm_i32_ty, 1))>;
+            : Intrinsic<vec.Types, base_args # !listsplat(llvm_i32_ty, 1)>;
        def int_nvvm_tex # mode # _1d # array # _ # vec.Name # _f32
-            : Intrinsic<vec.Types,
-                        !listconcat(addr_args, array_args, !listsplat(llvm_float_ty, 1))>;
+            : Intrinsic<vec.Types, base_args # !listsplat(llvm_float_ty, 1)>;
        def int_nvvm_tex # mode # _1d # array # _level_ # vec.Name # _f32
-            : Intrinsic<vec.Types,
-                        !listconcat(addr_args, array_args, !listsplat(llvm_float_ty, 2))>;
+            : Intrinsic<vec.Types, base_args # !listsplat(llvm_float_ty, 2)>;
        def int_nvvm_tex # mode # _1d # array # _grad_ # vec.Name # _f32
-            : Intrinsic<vec.Types,
-                        !listconcat(addr_args, array_args, !listsplat(llvm_float_ty, 3))>;
+            : Intrinsic<vec.Types, base_args # !listsplat(llvm_float_ty, 3)>;

        def int_nvvm_tex # mode # _2d # array # _ # vec.Name # _s32
-            : Intrinsic<vec.Types,
-                        !listconcat(addr_args, array_args, !listsplat(llvm_i32_ty, 2))>;
+            : Intrinsic<vec.Types, base_args # !listsplat(llvm_i32_ty, 2)>;
        def int_nvvm_tex # mode # _2d # array # _ # vec.Name # _f32
-            : Intrinsic<vec.Types,
-                        !listconcat(addr_args, array_args, !listsplat(llvm_float_ty, 2))>;
+            : Intrinsic<vec.Types, base_args # !listsplat(llvm_float_ty, 2)>;
        def int_nvvm_tex # mode # _2d # array # _level_ # vec.Name # _f32
-            : Intrinsic<vec.Types,
-                        !listconcat(addr_args, array_args, !listsplat(llvm_float_ty, 3))>;
+            : Intrinsic<vec.Types, base_args # !listsplat(llvm_float_ty, 3)>;
        def int_nvvm_tex # mode # _2d # array # _grad_ # vec.Name # _f32
-            : Intrinsic<vec.Types,
-                        !listconcat(addr_args, array_args, !listsplat(llvm_float_ty, 6))>;
+            : Intrinsic<vec.Types, base_args # !listsplat(llvm_float_ty, 6)>;

        if !not(is_array) then {
          def int_nvvm_tex # mode # _3d_ # vec.Name # _s32
-              : Intrinsic<vec.Types,
-                          !listconcat(addr_args, !listsplat(llvm_i32_ty, 3))>;
+              : Intrinsic<vec.Types, base_args # !listsplat(llvm_i32_ty, 3)>;
          def int_nvvm_tex # mode # _3d_ # vec.Name # _f32
-              : Intrinsic<vec.Types,
-                          !listconcat(addr_args, !listsplat(llvm_float_ty, 3))>;
+              : Intrinsic<vec.Types, base_args # !listsplat(llvm_float_ty, 3)>;
          def int_nvvm_tex # mode # _3d_level_ # vec.Name # _f32
-              : Intrinsic<vec.Types,
-                          !listconcat(addr_args, !listsplat(llvm_float_ty, 4))>;
+              : Intrinsic<vec.Types, base_args # !listsplat(llvm_float_ty, 4)>;
          def int_nvvm_tex # mode # _3d_grad_ # vec.Name # _f32
-              : Intrinsic<vec.Types,
-                          !listconcat(addr_args, !listsplat(llvm_float_ty, 9))>;
+              : Intrinsic<vec.Types, base_args # !listsplat(llvm_float_ty, 9)>;
        }

        def int_nvvm_tex # mode # _cube # array # _ # vec.Name # _f32
-            : Intrinsic<vec.Types,
-                        !listconcat(addr_args, array_args, !listsplat(llvm_float_ty, 3))>;
+            : Intrinsic<vec.Types, base_args # !listsplat(llvm_float_ty, 3)>;
        def int_nvvm_tex # mode # _cube # array # _level_ # vec.Name # _f32
-            : Intrinsic<vec.Types,
-                        !listconcat(addr_args, array_args, !listsplat(llvm_float_ty, 4))>;
+            : Intrinsic<vec.Types, base_args # !listsplat(llvm_float_ty, 4)>;

        if is_unified then
          def int_nvvm_tex # mode # _cube # array # _grad_ # vec.Name # _f32
-              : Intrinsic<vec.Types,
-                          !listconcat(addr_args, array_args, !listsplat(llvm_float_ty, 9))>;
+              : Intrinsic<vec.Types, base_args # !listsplat(llvm_float_ty, 9)>;
      } // is_array

      foreach comp = ["r", "g", "b", "a"] in {
        def int_nvvm_tld4 # mode # _ # comp # _2d_ # vec.Name # _f32
-            : Intrinsic<vec.Types,
-                        !listconcat(addr_args, !listsplat(llvm_float_ty, 2))>;
+            : Intrinsic<vec.Types, addr_args # !listsplat(llvm_float_ty, 2)>;
      } // comp
    } // vec
  } // is_unified
} // IntrProperties = [IntrReadMem]
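For reference, a unified-mode fetch as it looks from CUDA; the kernel and parameter names are mine. A tex2D<float4> read through a cudaTextureObject_t should reach the single-handle (is_unified) form, e.g. llvm.nvvm.tex.unified.2d.v4f32.f32:

    __global__ void sample2d(cudaTextureObject_t tex, float4 *out,
                             float u, float v) {
      // One i64 handle plus two f32 coordinates, matching addr_args for
      // is_unified = true above.
      out[threadIdx.x] = tex2D<float4>(tex, u, v);
    }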
"num_mipmap_levels"] in def int_nvvm_txq_ # query : NVVMBuiltin, - Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; //===- Surface Query ------------------------------------------------------===// foreach query = ["channel_order", "channel_data_type", "width", "height", "depth", "array_size"] in def int_nvvm_suq_ # query : NVVMBuiltin, - Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; //===- Handle Query -------------------------------------------------------===// foreach type = ["sampler", "surface", "texture"] in def int_nvvm_istypep_ # type : NVVMBuiltin, - Intrinsic<[llvm_i1_ty], [llvm_i64_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i64_ty], [IntrNoMem]>; //===- Surface Stores -----------------------------------------------------===// multiclass SurfaceStoreIntrinsics<string clamp, TexVector vec> { - def _1d_ # vec.Name # _ # clamp : NVVMBuiltin, - Intrinsic<[], !listconcat([llvm_i64_ty, llvm_i32_ty], vec.Types)>; + let IntrProperties = [IntrNoCallback, IntrNoFree, IntrWriteMem] # + !if(!ne(clamp, "trap"), [IntrWillReturn], []<IntrinsicProperty>) in { + def _1d_ # vec.Name # _ # clamp : NVVMBuiltin, + Intrinsic<[], [llvm_i64_ty, llvm_i32_ty] # vec.Types>; - def _1d_array_ # vec.Name # _ # clamp : NVVMBuiltin, - Intrinsic<[], !listconcat([llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], vec.Types)>; + def _1d_array_ # vec.Name # _ # clamp : NVVMBuiltin, + Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty] # vec.Types>; - def _2d_ # vec.Name # _ # clamp : NVVMBuiltin, - Intrinsic<[], !listconcat([llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], vec.Types)>; + def _2d_ # vec.Name # _ # clamp : NVVMBuiltin, + Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty] # vec.Types>; - def _2d_array_ # vec.Name # _ # clamp : NVVMBuiltin, - Intrinsic<[], !listconcat([llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], vec.Types)>; + def _2d_array_ # vec.Name # _ # clamp : NVVMBuiltin, + Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty] # vec.Types>; - def _3d_ # vec.Name # _ # clamp : NVVMBuiltin, - Intrinsic<[], !listconcat([llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], vec.Types)>; + def _3d_ # vec.Name # _ # clamp : NVVMBuiltin, + Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty] # vec.Types>; + } } // Unformatted @@ -1703,23 +1865,17 @@ foreach vec = [TV_I8, TV_I16, TV_I32, TV_V4I8, TV_V4I16, TV_V4I32] in defm int_nvvm_sust_p : SurfaceStoreIntrinsics<"trap", vec>; +// // Accessing special registers. - +// class PTXReadSRegIntrinsicNB_r32<list<IntrinsicProperty> properties = []> - : DefaultAttrsIntrinsic<[llvm_i32_ty], [], - !listconcat([IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>], properties)>; + : PureIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>] # properties>; class PTXReadSRegIntrinsic_r32<list<IntrinsicProperty> properties = []> - : PTXReadSRegIntrinsicNB_r32<properties>, - NVVMBuiltin; + : PTXReadSRegIntrinsicNB_r32<properties>, NVVMBuiltin; multiclass PTXReadSRegIntrinsic_v4i32<list<list<IntrinsicProperty>> properties = [[], [], [], []]> { assert !eq(!size(properties), 4), "properties must be a list of 4 lists"; -// FIXME: Do we need the 128-bit integer type version? -// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>; - -// FIXME: Enable this once v4i32 support is enabled in back-end. 
+//
// Accessing special registers.
-
+//
class PTXReadSRegIntrinsicNB_r32<list<IntrinsicProperty> properties = []>
-    : DefaultAttrsIntrinsic<[llvm_i32_ty], [],
-                            !listconcat([IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>], properties)>;
+    : PureIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>] # properties>;

class PTXReadSRegIntrinsic_r32<list<IntrinsicProperty> properties = []>
-    : PTXReadSRegIntrinsicNB_r32<properties>,
-      NVVMBuiltin;
+    : PTXReadSRegIntrinsicNB_r32<properties>, NVVMBuiltin;

multiclass PTXReadSRegIntrinsic_v4i32<list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
  assert !eq(!size(properties), 4), "properties must be a list of 4 lists";

-// FIXME: Do we need the 128-bit integer type version?
-//  def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>;
-
-// FIXME: Enable this once v4i32 support is enabled in back-end.
-//  def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
  defvar suffixes = ["_x", "_y", "_z", "_w"];
  foreach i = !range(suffixes) in
    def suffixes[i] : PTXReadSRegIntrinsic_r32<properties[i]>;
@@ -1736,30 +1892,20 @@ multiclass PTXReadSRegIntrinsicNB_v4i32<list<list<IntrinsicProperty>> properties = [[], [], [], []]> {

// Intrinsics to read registers with non-constant values, i.e. values that may
// change over the kernel lifetime. Such reads should not be CSE'd.
-class PTXReadNCSRegIntrinsic_r32
-    : Intrinsic<[llvm_i32_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, NoUndef<RetIndex>]>,
-      NVVMBuiltin;
-class PTXReadNCSRegIntrinsic_r64
-    : Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, NoUndef<RetIndex>]>,
+class PTXReadNCSRegIntrinsic<LLVMType ty>
+    : Intrinsic<[ty], [], [IntrInaccessibleMemOnly, IntrNoCallback,
+                           IntrNoFree, IntrWillReturn, NoUndef<RetIndex>]>,
      NVVMBuiltin;

-defm int_nvvm_read_ptx_sreg_tid
-    : PTXReadSRegIntrinsic_v4i32<[[Range<RetIndex, 0, MAX_BLOCK_SIZE_X>],
-                                  [Range<RetIndex, 0, MAX_BLOCK_SIZE_Y>],
-                                  [Range<RetIndex, 0, MAX_BLOCK_SIZE_Z>],
-                                  [Range<RetIndex, 0, 1>]]>;
-
-defm int_nvvm_read_ptx_sreg_ntid
-    : PTXReadSRegIntrinsic_v4i32<[[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_X, 1)>],
-                                  [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Y, 1)>],
-                                  [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Z, 1)>],
-                                  [Range<RetIndex, 0, 1>]]>;
+defvar MAX_BLOCK_ID_RANGE = [[Range<RetIndex, 0, MAX_BLOCK_SIZE_X>],
+                             [Range<RetIndex, 0, MAX_BLOCK_SIZE_Y>],
+                             [Range<RetIndex, 0, MAX_BLOCK_SIZE_Z>],
+                             [Range<RetIndex, 0, 1>]];

-def int_nvvm_read_ptx_sreg_laneid
-    : PTXReadSRegIntrinsic_r32<[Range<RetIndex, 0, WARP_SIZE>]>;
-
-def int_nvvm_read_ptx_sreg_warpid : PTXReadSRegIntrinsic_r32;
-def int_nvvm_read_ptx_sreg_nwarpid : PTXReadSRegIntrinsic_r32;
+defvar MAX_BLOCK_NID_RANGE = [[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_X, 1)>],
+                              [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Y, 1)>],
+                              [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Z, 1)>],
+                              [Range<RetIndex, 0, 1>]];

defvar MAX_GRID_ID_RANGE = [[Range<RetIndex, 0, MAX_GRID_SIZE_X>],
                            [Range<RetIndex, 0, MAX_GRID_SIZE_Y>],
@@ -1771,11 +1917,17 @@ defvar MAX_GRID_NID_RANGE = [[Range<RetIndex, 1, !add(MAX_GRID_SIZE_X, 1)>],
                             [Range<RetIndex, 1, !add(MAX_GRID_SIZE_Z, 1)>],
                             [Range<RetIndex, 0, 1>]];

-defm int_nvvm_read_ptx_sreg_ctaid
-    : PTXReadSRegIntrinsic_v4i32<MAX_GRID_ID_RANGE>;
+defm int_nvvm_read_ptx_sreg_tid : PTXReadSRegIntrinsic_v4i32<MAX_BLOCK_ID_RANGE>;
+defm int_nvvm_read_ptx_sreg_ntid : PTXReadSRegIntrinsic_v4i32<MAX_BLOCK_NID_RANGE>;

-defm int_nvvm_read_ptx_sreg_nctaid
-    : PTXReadSRegIntrinsic_v4i32<MAX_GRID_NID_RANGE>;
+def int_nvvm_read_ptx_sreg_laneid
+    : PTXReadSRegIntrinsic_r32<[Range<RetIndex, 0, WARP_SIZE>]>;
+
+def int_nvvm_read_ptx_sreg_warpid : PTXReadSRegIntrinsic_r32;
+def int_nvvm_read_ptx_sreg_nwarpid : PTXReadSRegIntrinsic_r32;
+
+defm int_nvvm_read_ptx_sreg_ctaid : PTXReadSRegIntrinsic_v4i32<MAX_GRID_ID_RANGE>;
+defm int_nvvm_read_ptx_sreg_nctaid : PTXReadSRegIntrinsic_v4i32<MAX_GRID_NID_RANGE>;

def int_nvvm_read_ptx_sreg_smid : PTXReadSRegIntrinsic_r32;
def int_nvvm_read_ptx_sreg_nsmid : PTXReadSRegIntrinsic_r32;
@@ -1787,19 +1939,23 @@ def int_nvvm_read_ptx_sreg_lanemask_lt : PTXReadSRegIntrinsic_r32;
def int_nvvm_read_ptx_sreg_lanemask_ge : PTXReadSRegIntrinsic_r32;
def int_nvvm_read_ptx_sreg_lanemask_gt : PTXReadSRegIntrinsic_r32;

-def int_nvvm_read_ptx_sreg_clock : PTXReadNCSRegIntrinsic_r32;
-def int_nvvm_read_ptx_sreg_clock64 : PTXReadNCSRegIntrinsic_r64;
+def int_nvvm_read_ptx_sreg_clock : PTXReadNCSRegIntrinsic<llvm_i32_ty>;
+def int_nvvm_read_ptx_sreg_clock64 : PTXReadNCSRegIntrinsic<llvm_i64_ty>;

-def int_nvvm_read_ptx_sreg_globaltimer : PTXReadNCSRegIntrinsic_r64;
+def int_nvvm_read_ptx_sreg_globaltimer : PTXReadNCSRegIntrinsic<llvm_i64_ty>;
+def int_nvvm_read_ptx_sreg_globaltimer_lo : PTXReadNCSRegIntrinsic<llvm_i32_ty>;

-def int_nvvm_read_ptx_sreg_pm0 : PTXReadNCSRegIntrinsic_r32;
-def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32;
-def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic_r32;
-def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic_r32;
+def int_nvvm_read_ptx_sreg_pm0 : PTXReadNCSRegIntrinsic<llvm_i32_ty>;
+def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic<llvm_i32_ty>;
+def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic<llvm_i32_ty>;
+def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic<llvm_i32_ty>;

def int_nvvm_read_ptx_sreg_warpsize
    : PTXReadSRegIntrinsic_r32<[Range<RetIndex, WARP_SIZE, !add(WARP_SIZE, 1)>]>;

+foreach i = 0...31 in
+  def int_nvvm_read_ptx_sreg_envreg # i : PTXReadSRegIntrinsic_r32;
+
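Two properties are worth illustrating here: the Range annotations make bound checks against impossible sreg values foldable, and the %clock family is deliberately not IntrNoMem, so repeated reads survive CSE. A sketch assuming the usual 1024-thread block limit; the kernel name is mine:

    __global__ void timed_fill(int *out, long long *cycles) {
      // threadIdx.x reaches llvm.nvvm.read.ptx.sreg.tid.x; the guard below is
      // dead code given Range<RetIndex, 0, MAX_BLOCK_SIZE_X>.
      int tid = __nvvm_read_ptx_sreg_tid_x();
      if (tid >= 1024) return;

      long long t0 = __nvvm_read_ptx_sreg_clock64();
      out[tid] = tid;
      // A distinct read of %clock64: IntrInaccessibleMemOnly (rather than
      // IntrNoMem) keeps it from being CSE'd with t0.
      long long t1 = __nvvm_read_ptx_sreg_clock64();
      if (tid == 0) *cycles = t1 - t0;
    }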
// sm90+, PTX7.8+

// Note: Since clusters are subdivisions of the grid, we conservatively use the
@@ -1807,14 +1963,10 @@ def int_nvvm_read_ptx_sreg_warpsize
// practice, the clusterid will likely be much smaller. The CUDA programming
// guide recommends 8 as a maximum portable value and H100s support 16.

-defm int_nvvm_read_ptx_sreg_clusterid
-    : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
-defm int_nvvm_read_ptx_sreg_nclusterid
-    : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
-defm int_nvvm_read_ptx_sreg_cluster_ctaid
-    : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
-defm int_nvvm_read_ptx_sreg_cluster_nctaid
-    : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
+defm int_nvvm_read_ptx_sreg_clusterid : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
+defm int_nvvm_read_ptx_sreg_nclusterid : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
+defm int_nvvm_read_ptx_sreg_cluster_ctaid : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
+defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;

def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;
@@ -1842,13 +1994,13 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback] in {
//
// VOTE
//
-
let IntrProperties = [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback] in {
  def int_nvvm_vote_all : NVVMBuiltin, Intrinsic<[llvm_i1_ty], [llvm_i1_ty]>;
  def int_nvvm_vote_any : NVVMBuiltin, Intrinsic<[llvm_i1_ty], [llvm_i1_ty]>;
  def int_nvvm_vote_uni : NVVMBuiltin, Intrinsic<[llvm_i1_ty], [llvm_i1_ty]>;
  def int_nvvm_vote_ballot : NVVMBuiltin, Intrinsic<[llvm_i32_ty], [llvm_i1_ty]>;
}
+
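The non-sync vote forms map to clang builtins of the same spelling; being convergent, they cannot be moved across divergent control flow. A one-line ballot sketch (helper name is mine):

    // Lane i of the result holds lane i's predicate (PTX vote.ballot.b32).
    __device__ unsigned ballot_positive(float x) {
      return __nvvm_vote_ballot(x > 0.0f);
    }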
//
// VOTE.SYNC
//
@@ -2001,6 +2153,53 @@ foreach layout_a = ["row", "col"] in {
  } // layout_b
} // layout_a

+// MMA.SP
+class NVVM_MMA_SP<WMMA_REGS A, WMMA_REGS B, WMMA_REGS C, WMMA_REGS D>
+    : Intrinsic<D.regs,
+                !listconcat(A.regs, B.regs, C.regs, [llvm_i32_ty], [llvm_i32_ty])> {
+  int pos = !size(!listconcat(A.regs, B.regs, C.regs, [llvm_i32_ty]));
+
+  // The sparsity selector is an immediate in the range [0, num_threads); it
+  // indicates which threads contribute metadata.
+  int num_threads = !if(!or(!and(!eq(A.geom, "m16n8k32"), !eq(A.ptx_elt_type, "bf16")),
+                            !and(!eq(A.geom, "m16n8k32"), !eq(A.ptx_elt_type, "f16")),
+                            !and(!eq(A.geom, "m16n8k16"), !eq(A.ptx_elt_type, "tf32")),
+                            !and(!eq(A.geom, "m16n8k32"), !eq(A.ptx_elt_type, "u8")),
+                            !and(!eq(A.geom, "m16n8k32"), !eq(A.ptx_elt_type, "s8")),
+                            !and(!eq(A.geom, "m16n8k64"), !eq(A.ptx_elt_type, "u4")),
+                            !and(!eq(A.geom, "m16n8k64"), !eq(A.ptx_elt_type, "s4"))),
+                        2,
+                        !if(!and(!eq(A.geom, "m16n8k64"),
+                                 !or(!eq(A.ptx_elt_type, "u8"),
+                                     !eq(A.ptx_elt_type, "s8"),
+                                     !eq(A.ptx_elt_type, "e4m3"),
+                                     !eq(A.ptx_elt_type, "e5m2"),
+                                     !eq(A.ptx_elt_type, "e3m2"),
+                                     !eq(A.ptx_elt_type, "e2m3"),
+                                     !eq(A.ptx_elt_type, "e2m1"))),
+                            1,
+                            !if(!and(!eq(A.geom, "m16n8k128"),
+                                     !or(!eq(A.ptx_elt_type, "s4"),
+                                         !eq(A.ptx_elt_type, "u4"))),
+                                1, 4)));
+
+  let IntrProperties = [IntrNoMem, IntrNoCallback, ImmArg<ArgIndex<pos>>,
+                        Range<ArgIndex<pos>, 0, num_threads>];
+}
+
+foreach metadata = ["sp", "sp::ordered_metadata"] in {
+  foreach kind = ["", "kind::f8f6f4"] in {
+    foreach satf = [0, 1] in {
+      foreach op = NVVM_MMA_OPS.all_mma_sp_ops in {
+        if NVVM_MMA_SP_SUPPORTED<op, metadata, kind, satf>.ret then {
+          def MMA_SP_NAME<metadata, kind, satf,
+                          op[0], op[1], op[2], op[3]>.record
+              : NVVM_MMA_SP<op[0], op[1], op[2], op[3]>;
+        }
+      } // op
+    } // satf
+  } // kind
+} // metadata
+
// LDMATRIX
class NVVM_LDMATRIX<WMMA_REGS Frag, int Transposed>
    : Intrinsic<Frag.regs, [llvm_anyptr_ty],
@@ -2051,8 +2250,7 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] in {
}

def int_nvvm_is_explicit_cluster
-    : DefaultAttrsIntrinsic<[llvm_i1_ty], [],
-                            [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>],
+    : PureIntrinsic<[llvm_i1_ty], [], [NoUndef<RetIndex>],
                    "llvm.nvvm.is_explicit_cluster">;

// Setmaxnreg inc/dec intrinsics
@@ -2212,15 +2410,17 @@ def int_nvvm_cp_async_bulk_tensor_prefetch_tile_gather4_2d

// Intrinsics for Prefetch and Prefetchu
let IntrProperties = [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>] in {
  foreach level = ["L1", "L2"] in {
-    def int_nvvm_prefetch_ # level : Intrinsic<[], [llvm_ptr_ty]>;
-    def int_nvvm_prefetch_global_ # level : Intrinsic<[], [llvm_global_ptr_ty]>;
-    def int_nvvm_prefetch_local_ # level : Intrinsic<[], [llvm_local_ptr_ty]>;
+    def int_nvvm_prefetch_ # level : DefaultAttrsIntrinsic<[], [llvm_ptr_ty]>;
+    def int_nvvm_prefetch_global_ # level : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty]>;
+    def int_nvvm_prefetch_local_ # level : DefaultAttrsIntrinsic<[], [llvm_local_ptr_ty]>;
  }

+  def int_nvvm_prefetch_tensormap : DefaultAttrsIntrinsic<[], [llvm_anyptr_ty]>;
+
  foreach eviction_priority = ["evict_normal", "evict_last"] in
-    def int_nvvm_prefetch_global_L2_ # eviction_priority : Intrinsic<[], [llvm_global_ptr_ty]>;
+    def int_nvvm_prefetch_global_L2_ # eviction_priority : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty]>;

-  def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty]>;
+  def int_nvvm_prefetchu_L1 : DefaultAttrsIntrinsic<[], [llvm_ptr_ty]>;
}
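From CUDA these prefetches are commonly reached via inline PTX; the helper below is mine, not an API, but the instruction shown is the one llvm.nvvm.prefetch.global.L2 models:

    __device__ void prefetch_global_l2(const void *p) {
      // Warm the L2 line holding p without materializing a load.
      asm volatile("prefetch.global.L2 [%0];" :: "l"(p));
    }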
// applypriority
@@ -2455,13 +2655,12 @@ def int_nvvm_clusterlaunchcontrol_try_cancel_async_multicast_shared

// clusterlaunchcontrol.query_cancel.is_canceled
def int_nvvm_clusterlaunchcontrol_query_cancel_is_canceled
-    : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i128_ty], [IntrNoMem, IntrSpeculatable],
-                            "llvm.nvvm.clusterlaunchcontrol.query_cancel.is_canceled">;
+    : PureIntrinsic<[llvm_i1_ty], [llvm_i128_ty], [],
+                    "llvm.nvvm.clusterlaunchcontrol.query_cancel.is_canceled">;

-foreach dim = ["x", "y", "z"] in {
-def int_nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_ # dim
-    : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i128_ty], [IntrNoMem, IntrSpeculatable],
-                            "llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid." # dim>;
-}
+foreach dim = ["x", "y", "z"] in
+  def int_nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_ # dim
+      : PureIntrinsic<[llvm_i32_ty], [llvm_i128_ty], [],
+                      "llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid." # dim>;

} // let TargetPrefix = "nvvm"