Diffstat (limited to 'llvm/docs/NVPTXUsage.rst')
-rw-r--r-- | llvm/docs/NVPTXUsage.rst | 161
1 file changed, 152 insertions, 9 deletions
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 11017fe..d28eb68 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -1072,6 +1072,8 @@
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
 
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
+
 Overview:
 """""""""
 
@@ -1082,7 +1084,13 @@
 global memory to shared::cluster memory (indicated by the ``g2s`` prefix)
 in ``tile`` mode. In tile mode, the multi-dimensional layout of the
 source tensor is preserved at the destination. The dimension of the
 tensor data ranges from 1d to 5d with the coordinates specified
-by the ``i32 %d0 ... i32 %d4`` arguments.
+by the ``i32 %d0 ... i32 %d4`` arguments. In ``tile.gather4`` mode,
+four rows in a 2D tensor are combined to form a single 2D destination
+tensor. The first coordinate ``i32 %x0`` denotes the column index,
+followed by four coordinates indicating the four row indices.
+So, this mode takes a total of five coordinates as input arguments.
+For more information on ``gather4`` mode, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes>`_.
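+
+A minimal call sketch follows (hypothetical values: ``%dst``, ``%bar`` and
+``%tmap`` are assumed to be the shared::cluster destination, the mbarrier
+object and the tensor-map prepared by the surrounding kernel; all optional
+features are left disabled):
+
+.. code-block:: llvm
+
+  ; Gather rows %r0..%r3 at column %col into %dst. The i16/i64 operands
+  ; are ignored because the multicast and cache_hint flags are 0; a zero
+  ; cta_group flag is assumed to request neither ::1 nor ::2.
+  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tmap, i32 %col, i32 %r0, i32 %r1, i32 %r2, i32 %r3, i16 0, i64 0, i1 0, i1 0, i32 0)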
 
 * The last three arguments to these intrinsics are flags
   indicating support for multicast, cache_hint and cta_group::1/2
@@ -1116,10 +1124,18 @@ Syntax:
 
 .. code-block:: llvm
 
-  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
 
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
 Overview:
 """""""""
 
@@ -1131,10 +1147,105 @@ in ``im2col`` mode.
 In im2col mode, some dimensions of the source tensor are unrolled into a
 single dimensional column at the destination. In this mode, the tensor has
 to be at least three-dimensional. Along with the tensor coordinates, im2col
 offsets are also specified (denoted by
-``i16 im2col0...i16 %im2col2``). The number of im2col offsets is two less
-than the number of dimensions of the tensor operation. The last three arguments
-to these intrinsics are flags, with the same functionality as described
-in the ``tile`` mode intrinsics above.
+``i16 %im2col0 ... i16 %im2col2``). For the ``im2col`` mode, the number of offsets
+is two less than the number of dimensions of the tensor operation. For the
+``im2col.w`` and ``im2col.w.128`` modes, the number of offsets is always 2,
+denoted by the ``i16 %wHalo`` and ``i16 %wOffset`` arguments. For more information
+on the ``im2col.w`` and ``im2col.w.128`` modes, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-w-w128-modes>`_.
+
+The last three arguments to these intrinsics are flags, with the same functionality
+as described in the ``tile`` mode intrinsics above.
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
+
+'``llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.[1-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(..., i32 %d0, i32 %d1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.[1-5]d``' intrinsics
+correspond to the ``cp.async.bulk.tensor.[1-5]d.shared::cta.global.*``
+set of PTX instructions. These instructions initiate an asynchronous
+copy of tensor data from global memory to shared::cta memory in
+``tile`` mode. In tile mode, the multi-dimensional layout of the
+source tensor is preserved at the destination. The dimension of the
+tensor data ranges from 1d to 5d with the coordinates specified
+by the ``i32 %d0 ... i32 %d4`` arguments. In ``tile.gather4`` mode,
+four rows in a 2D tensor are combined to form a single 2D destination
+tensor. The first coordinate ``i32 %x0`` denotes the column index,
+followed by four coordinates indicating the four row indices.
+So, this mode takes a total of five coordinates as input arguments.
+For more information on ``gather4`` mode, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes>`_.
+
+* The last argument to these intrinsics is a boolean flag
+  indicating support for cache_hint. This flag argument must
+  be a compile-time constant. When set, it indicates a valid
+  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
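+
+A minimal call sketch follows (hypothetical values: ``%dst``, ``%bar``,
+``%tmap`` and ``%ch`` are assumed to be prepared by the surrounding
+kernel):
+
+.. code-block:: llvm
+
+  ; 1D tile copy into shared::cta memory. The flag is the constant 1,
+  ; so %ch is treated as a valid cache hint and the .L2::cache_hint
+  ; variant of the instruction is generated.
+  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 %ch, i1 1)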
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
+
+'``llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.[3-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.[3-5]d``' intrinsics
+correspond to the ``cp.async.bulk.tensor.[1-5]d.shared::cta.global.*``
+set of PTX instructions. These instructions initiate an asynchronous copy
+of tensor data from global memory to shared::cta memory in ``im2col`` mode.
+In im2col mode, some dimensions of the source tensor are unrolled into a
+single dimensional column at the destination. In this mode, the tensor has
+to be at least three-dimensional. Along with the tensor coordinates, im2col
+offsets are also specified (denoted by ``i16 %im2col0 ... i16 %im2col2``).
+For the ``im2col`` mode, the number of offsets is two less than the number
+of dimensions of the tensor operation. For the ``im2col.w`` and ``im2col.w.128``
+modes, the number of offsets is always 2, denoted by the ``i16 %wHalo`` and
+``i16 %wOffset`` arguments. For more information on the ``im2col.w`` and
+``im2col.w.128`` modes, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-w-w128-modes>`_.
+
+* The last argument to these intrinsics is a boolean flag
+  indicating support for cache_hint. This flag argument must
+  be a compile-time constant. When set, it indicates a valid
+  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
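+
+A minimal call sketch follows (hypothetical values: ``%dst``, ``%bar``,
+``%tmap`` and ``%off0`` are assumed to be prepared by the surrounding
+kernel):
+
+.. code-block:: llvm
+
+  ; 3D im2col copy: one im2col offset (%off0) accompanies the three
+  ; tensor coordinates. The cache_hint flag is 0, so the i64 operand
+  ; is ignored.
+  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %off0, i64 0, i1 0)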
 
 For more information, refer to the PTX ISA
 `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
@@ -1153,6 +1264,8 @@ Syntax:
   declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
 
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %src, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch)
+
 Overview:
 """""""""
 
@@ -1162,6 +1275,12 @@
 These instructions initiate an asynchronous copy of tensor data from
 shared::cta to global memory (indicated by the ``s2g`` prefix)
 in ``tile`` mode. The dimension of the tensor data ranges from 1d to 5d
 with the coordinates specified by the ``i32 %d0 ... i32 %d4`` arguments.
+In ``tile.scatter4`` mode, a single 2D source tensor is divided into
+four rows in the 2D destination tensor. The first coordinate ``i32 %x0``
+denotes the column index, followed by four coordinates indicating the
+four row indices. So, this mode takes a total of five coordinates as
+input arguments. For more information on ``scatter4`` mode, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes>`_.
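+
+A minimal call sketch follows (hypothetical values: ``%src`` and ``%tmap``
+are assumed to be the shared::cta source buffer and the tensor-map
+prepared by the surrounding kernel):
+
+.. code-block:: llvm
+
+  ; Scatter the 2D source tile to rows %r0..%r3 at column %col in the
+  ; destination tensor. The cache_hint flag is 0, so the i64 operand
+  ; is ignored.
+  call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %src, ptr %tmap, i32 %col, i32 %r0, i32 %r1, i32 %r2, i32 %r3, i64 0, i1 0)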
 
 * The last argument to these intrinsics is a boolean flag
   indicating support for cache_hint. This flag argument must
@@ -1214,6 +1333,8 @@ Syntax:
   declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
 
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch)
+
 Overview:
 """""""""
 
@@ -1225,6 +1346,13 @@
 multi-dimensional layout of the source tensor is preserved at the
 destination. The dimension of the tensor data ranges from 1d to 5d
 with the coordinates specified by the ``i32 %d0 ... i32 %d4`` arguments.
+In ``tile.gather4`` mode, four rows in the two-dimensional source tensor
+are fetched to the L2 cache. The first coordinate ``i32 %x0`` denotes the
+column index, followed by four coordinates indicating the four row indices.
+So, this mode takes a total of five coordinates as input arguments.
+For more information on ``gather4`` mode, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes>`_.
+
 * The last argument to these intrinsics is a boolean flag
   indicating support for cache_hint. This flag argument must
   be a compile-time constant. When set, it indicates a valid
@@ -1246,6 +1374,14 @@ Syntax:
   declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
 
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
 Overview:
 """""""""
 
@@ -1256,9 +1392,16 @@
 of tensor data from global memory to the L2 cache. In im2col mode, some
 dimensions of the source tensor are unrolled into a single dimensional
 column at the destination. In this mode, the tensor has to be at least
 three-dimensional. Along with the tensor coordinates, im2col offsets are
-also specified (denoted by ``i16 im2col0...i16 %im2col2``). The number
-of im2col offsets is two less than the number of dimensions of the tensor
-operation. The last argument to these intrinsics is a boolean flag, with
+also specified (denoted by ``i16 %im2col0 ... i16 %im2col2``). For the
+``im2col`` mode, the number of offsets is two less than the number of
+dimensions of the tensor operation. For the ``im2col.w`` and ``im2col.w.128``
+modes, the number of offsets is always 2, denoted by the ``i16 %wHalo`` and
+``i16 %wOffset`` arguments. For more information on the ``im2col.w`` and
+``im2col.w.128`` modes, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-w-w128-modes>`_.
+
+The last argument to these intrinsics is a boolean flag, with
 the same functionality as described in the ``tile`` mode intrinsics above.
 
 For more information, refer to the PTX ISA