/* clang/lib/Headers/amxfp8intrin.h */

/*===------------- amxfp8intrin.h - AMX intrinsics -*- C++ -*----------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxfp8intrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMXFP8INTRIN_H
#define __AMXFP8INTRIN_H
#ifdef __x86_64__

/* Attributes shared by every function defined in this header: always inline,
 * emit no debug info for the wrapper, and compile the body with the "amx-fp8"
 * target feature enabled. The macro is #undef'd at the end of the header. */
#define __DEFAULT_FN_ATTRS_FP8                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp8")))

/// Internal helper: forwards the shape arguments \a m, \a n, \a k and the tile
/// values \a dst, \a src1, \a src2 unchanged to
/// __builtin_ia32_tdpbf8ps_internal and returns the tile that builtin produces.
///
/// Within this header it is called by __tile_dpbf8ps, which passes src1.row as
/// \a m, src2.col as \a n and src1.col as \a k.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dpbf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbf8ps_internal(m, n, k, dst, src1, src2);
}

// NOTE(review): the INT64() casts in the pseudo-code below look inherited from
// the integer tile dot-product documentation; the operands here are 8-bit
// floating-point values -- confirm the exact arithmetic against the ISA manual.
/// Perform the dot product of a BF8 value \a src1 by a BF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dpbf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPBF8PS instruction.
///
/// \param dst
///    Pointer to the destination tile. Its current contents are read as the
///    accumulator input and then overwritten with the result. Max size is
///    1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP8 static void
__tile_dpbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  // Shape handed to the helper: m = src1.row, n = src2.col, k = src1.col.
  dst->tile = _tile_dpbf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                     src1.tile, src2.tile);
}

/// Internal helper: forwards the shape arguments \a m, \a n, \a k and the tile
/// values \a dst, \a src1, \a src2 unchanged to
/// __builtin_ia32_tdpbhf8ps_internal and returns the tile that builtin
/// produces.
///
/// Within this header it is called by __tile_dpbhf8ps, which passes src1.row
/// as \a m, src2.col as \a n and src1.col as \a k.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dpbhf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbhf8ps_internal(m, n, k, dst, src1, src2);
}

// NOTE(review): the INT64() casts in the pseudo-code below look inherited from
// the integer tile dot-product documentation; the operands here are 8-bit
// floating-point values -- confirm the exact arithmetic against the ISA manual.
/// Perform the dot product of a BF8 value \a src1 by an HF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dpbhf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPBHF8PS instruction.
///
/// \param dst
///    Pointer to the destination tile. Its current contents are read as the
///    accumulator input and then overwritten with the result. Max size is
///    1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP8 static void
__tile_dpbhf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  // Shape handed to the helper: m = src1.row, n = src2.col, k = src1.col.
  dst->tile = _tile_dpbhf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                      src1.tile, src2.tile);
}

/// Internal helper: forwards the shape arguments \a m, \a n, \a k and the tile
/// values \a dst, \a src1, \a src2 unchanged to
/// __builtin_ia32_tdphbf8ps_internal and returns the tile that builtin
/// produces.
///
/// Within this header it is called by __tile_dphbf8ps, which passes src1.row
/// as \a m, src2.col as \a n and src1.col as \a k.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dphbf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdphbf8ps_internal(m, n, k, dst, src1, src2);
}

// NOTE(review): the INT64() casts in the pseudo-code below look inherited from
// the integer tile dot-product documentation; the operands here are 8-bit
// floating-point values -- confirm the exact arithmetic against the ISA manual.
/// Perform the dot product of an HF8 value \a src1 by a BF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dphbf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPHBF8PS instruction.
///
/// \param dst
///    Pointer to the destination tile. Its current contents are read as the
///    accumulator input and then overwritten with the result. Max size is
///    1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.

__DEFAULT_FN_ATTRS_FP8 static void
__tile_dphbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  // Shape handed to the helper: m = src1.row, n = src2.col, k = src1.col.
  dst->tile = _tile_dphbf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                      src1.tile, src2.tile);
}

/// Internal helper: forwards the shape arguments \a m, \a n, \a k and the tile
/// values \a dst, \a src1, \a src2 unchanged to
/// __builtin_ia32_tdphf8ps_internal and returns the tile that builtin produces.
///
/// Within this header it is called by __tile_dphf8ps, which passes src1.row as
/// \a m, src2.col as \a n and src1.col as \a k.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dphf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdphf8ps_internal(m, n, k, dst, src1, src2);
}

// NOTE(review): the INT64() casts in the pseudo-code below look inherited from
// the integer tile dot-product documentation; the operands here are 8-bit
// floating-point values -- confirm the exact arithmetic against the ISA manual.
/// Perform the dot product of an HF8 value \a src1 by an HF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dphf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPHF8PS instruction.
///
/// \param dst
///    Pointer to the destination tile. Its current contents are read as the
///    accumulator input and then overwritten with the result. Max size is
///    1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP8 static void
__tile_dphf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  // Shape handed to the helper: m = src1.row, n = src2.col, k = src1.col.
  dst->tile = _tile_dphf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                     src1.tile, src2.tile);
}

/* Macro forms of the four FP8 dot-product operations. Each expands to a direct
 * call of the matching __builtin_ia32_tdp*f8ps builtin with the three
 * arguments parenthesized and passed through in order (dst, src1, src2).
 * NOTE(review): presumably the arguments are constant tile register numbers
 * rather than __tile1024i objects, as with other _tile_* macros -- confirm
 * against the builtin definitions. */
#define _tile_dpbf8ps(dst, src1, src2)                                         \
  __builtin_ia32_tdpbf8ps((dst), (src1), (src2))
#define _tile_dpbhf8ps(dst, src1, src2)                                        \
  __builtin_ia32_tdpbhf8ps((dst), (src1), (src2))
#define _tile_dphbf8ps(dst, src1, src2)                                        \
  __builtin_ia32_tdphbf8ps((dst), (src1), (src2))
#define _tile_dphf8ps(dst, src1, src2)                                         \
  __builtin_ia32_tdphf8ps((dst), (src1), (src2))

/* The attribute macro is private to this header; drop it so it does not leak
 * into code that includes this file. */
#undef __DEFAULT_FN_ATTRS_FP8

#endif /* __x86_64__ */
#endif /* __AMXFP8INTRIN_H */