Skip to main content

core/stdarch/crates/core_arch/src/x86/
avx.rs

1//! Advanced Vector Extensions (AVX)
2//!
3//! The references are:
4//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
9//!
10//! [Wikipedia][wiki] provides a quick overview of the instructions available.
11//!
12//! [intel64_ref]: https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
13//! [amd64_ref]: https://docs.amd.com/v/u/en-US/24594_3.37
14//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
15
16use crate::{
17    core_arch::{simd::*, x86::*},
18    intrinsics::simd::*,
19    mem, ptr,
20};
21
22#[cfg(test)]
23use stdarch_test::assert_instr;
24
25/// Adds packed double-precision (64-bit) floating-point elements
26/// in `a` and `b`.
27///
28/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd)
29#[inline]
30#[target_feature(enable = "avx")]
31#[cfg_attr(test, assert_instr(vaddpd))]
32#[stable(feature = "simd_x86", since = "1.27.0")]
33#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
34pub const fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
35    unsafe { simd_add(a, b) }
36}
37
38/// Adds packed single-precision (32-bit) floating-point elements in `a` and
39/// `b`.
40///
41/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps)
42#[inline]
43#[target_feature(enable = "avx")]
44#[cfg_attr(test, assert_instr(vaddps))]
45#[stable(feature = "simd_x86", since = "1.27.0")]
46#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
47pub const fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
48    unsafe { simd_add(a, b) }
49}
50
51/// Computes the bitwise AND of a packed double-precision (64-bit)
52/// floating-point elements in `a` and `b`.
53///
54/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
55#[inline]
56#[target_feature(enable = "avx")]
57// See https://github.com/rust-lang/stdarch/issues/71
58#[cfg_attr(test, assert_instr(vandp))]
59#[stable(feature = "simd_x86", since = "1.27.0")]
60#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
61pub const fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
62    unsafe {
63        let a: u64x4 = transmute(a);
64        let b: u64x4 = transmute(b);
65        transmute(simd_and(a, b))
66    }
67}
68
69/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
70/// elements in `a` and `b`.
71///
72/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps)
73#[inline]
74#[target_feature(enable = "avx")]
75#[cfg_attr(test, assert_instr(vandps))]
76#[stable(feature = "simd_x86", since = "1.27.0")]
77#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
78pub const fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
79    unsafe {
80        let a: u32x8 = transmute(a);
81        let b: u32x8 = transmute(b);
82        transmute(simd_and(a, b))
83    }
84}
85
86/// Computes the bitwise OR packed double-precision (64-bit) floating-point
87/// elements in `a` and `b`.
88///
89/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
90#[inline]
91#[target_feature(enable = "avx")]
92// See <https://github.com/rust-lang/stdarch/issues/71>.
93#[cfg_attr(test, assert_instr(vorp))]
94#[stable(feature = "simd_x86", since = "1.27.0")]
95#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
96pub const fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
97    unsafe {
98        let a: u64x4 = transmute(a);
99        let b: u64x4 = transmute(b);
100        transmute(simd_or(a, b))
101    }
102}
103
104/// Computes the bitwise OR packed single-precision (32-bit) floating-point
105/// elements in `a` and `b`.
106///
107/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps)
108#[inline]
109#[target_feature(enable = "avx")]
110#[cfg_attr(test, assert_instr(vorps))]
111#[stable(feature = "simd_x86", since = "1.27.0")]
112#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
113pub const fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
114    unsafe {
115        let a: u32x8 = transmute(a);
116        let b: u32x8 = transmute(b);
117        transmute(simd_or(a, b))
118    }
119}
120
121/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
122/// lanes using the control in `imm8`.
123///
124/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd)
125#[inline]
126#[target_feature(enable = "avx")]
127#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
128#[rustc_legacy_const_generics(2)]
129#[stable(feature = "simd_x86", since = "1.27.0")]
130#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
131pub const fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
132    static_assert_uimm_bits!(MASK, 8);
133    unsafe {
134        simd_shuffle!(
135            a,
136            b,
137            [
138                MASK as u32 & 0b1,
139                ((MASK as u32 >> 1) & 0b1) + 4,
140                ((MASK as u32 >> 2) & 0b1) + 2,
141                ((MASK as u32 >> 3) & 0b1) + 6,
142            ],
143        )
144    }
145}
146
147/// Shuffles single-precision (32-bit) floating-point elements in `a` within
148/// 128-bit lanes using the control in `imm8`.
149///
150/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps)
151#[inline]
152#[target_feature(enable = "avx")]
153#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
154#[rustc_legacy_const_generics(2)]
155#[stable(feature = "simd_x86", since = "1.27.0")]
156#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
157pub const fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
158    static_assert_uimm_bits!(MASK, 8);
159    unsafe {
160        simd_shuffle!(
161            a,
162            b,
163            [
164                MASK as u32 & 0b11,
165                (MASK as u32 >> 2) & 0b11,
166                ((MASK as u32 >> 4) & 0b11) + 8,
167                ((MASK as u32 >> 6) & 0b11) + 8,
168                (MASK as u32 & 0b11) + 4,
169                ((MASK as u32 >> 2) & 0b11) + 4,
170                ((MASK as u32 >> 4) & 0b11) + 12,
171                ((MASK as u32 >> 6) & 0b11) + 12,
172            ],
173        )
174    }
175}
176
177/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
178/// elements in `a`, and then AND with `b`.
179///
180/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
181#[inline]
182#[target_feature(enable = "avx")]
183#[cfg_attr(test, assert_instr(vandnp))]
184#[stable(feature = "simd_x86", since = "1.27.0")]
185#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
186pub const fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
187    unsafe {
188        let a: u64x4 = transmute(a);
189        let b: u64x4 = transmute(b);
190        transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
191    }
192}
193
194/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
195/// elements in `a`
196/// and then AND with `b`.
197///
198/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps)
199#[inline]
200#[target_feature(enable = "avx")]
201#[cfg_attr(test, assert_instr(vandnps))]
202#[stable(feature = "simd_x86", since = "1.27.0")]
203#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
204pub const fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
205    unsafe {
206        let a: u32x8 = transmute(a);
207        let b: u32x8 = transmute(b);
208        transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
209    }
210}
211
/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
    // Forwards to the `vmaxpd` binding, which is declared outside this
    // excerpt; operand order (`a`, then `b`) is passed through unchanged.
    unsafe { vmaxpd(a, b) }
}
223
/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
    // Forwards to the `vmaxps` binding, which is declared outside this
    // excerpt; operand order (`a`, then `b`) is passed through unchanged.
    unsafe { vmaxps(a, b) }
}
235
/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
    // Forwards to the `vminpd` binding, which is declared outside this
    // excerpt; operand order (`a`, then `b`) is passed through unchanged.
    unsafe { vminpd(a, b) }
}
247
/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
    // Forwards to the `vminps` binding, which is declared outside this
    // excerpt; operand order (`a`, then `b`) is passed through unchanged.
    unsafe { vminps(a, b) }
}
259
260/// Multiplies packed double-precision (64-bit) floating-point elements
261/// in `a` and `b`.
262///
263/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd)
264#[inline]
265#[target_feature(enable = "avx")]
266#[cfg_attr(test, assert_instr(vmulpd))]
267#[stable(feature = "simd_x86", since = "1.27.0")]
268#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
269pub const fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
270    unsafe { simd_mul(a, b) }
271}
272
273/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
274/// `b`.
275///
276/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps)
277#[inline]
278#[target_feature(enable = "avx")]
279#[cfg_attr(test, assert_instr(vmulps))]
280#[stable(feature = "simd_x86", since = "1.27.0")]
281#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
282pub const fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
283    unsafe { simd_mul(a, b) }
284}
285
286/// Alternatively adds and subtracts packed double-precision (64-bit)
287/// floating-point elements in `a` to/from packed elements in `b`.
288///
289/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd)
290#[inline]
291#[target_feature(enable = "avx")]
292#[cfg_attr(test, assert_instr(vaddsubpd))]
293#[stable(feature = "simd_x86", since = "1.27.0")]
294#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
295pub const fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
296    unsafe {
297        let a = a.as_f64x4();
298        let b = b.as_f64x4();
299        let add = simd_add(a, b);
300        let sub = simd_sub(a, b);
301        simd_shuffle!(add, sub, [4, 1, 6, 3])
302    }
303}
304
305/// Alternatively adds and subtracts packed single-precision (32-bit)
306/// floating-point elements in `a` to/from packed elements in `b`.
307///
308/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps)
309#[inline]
310#[target_feature(enable = "avx")]
311#[cfg_attr(test, assert_instr(vaddsubps))]
312#[stable(feature = "simd_x86", since = "1.27.0")]
313#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
314pub const fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
315    unsafe {
316        let a = a.as_f32x8();
317        let b = b.as_f32x8();
318        let add = simd_add(a, b);
319        let sub = simd_sub(a, b);
320        simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
321    }
322}
323
324/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
325/// from packed elements in `a`.
326///
327/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd)
328#[inline]
329#[target_feature(enable = "avx")]
330#[cfg_attr(test, assert_instr(vsubpd))]
331#[stable(feature = "simd_x86", since = "1.27.0")]
332#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
333pub const fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
334    unsafe { simd_sub(a, b) }
335}
336
337/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
338/// from packed elements in `a`.
339///
340/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps)
341#[inline]
342#[target_feature(enable = "avx")]
343#[cfg_attr(test, assert_instr(vsubps))]
344#[stable(feature = "simd_x86", since = "1.27.0")]
345#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
346pub const fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
347    unsafe { simd_sub(a, b) }
348}
349
350/// Computes the division of each of the 8 packed 32-bit floating-point elements
351/// in `a` by the corresponding packed elements in `b`.
352///
353/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps)
354#[inline]
355#[target_feature(enable = "avx")]
356#[cfg_attr(test, assert_instr(vdivps))]
357#[stable(feature = "simd_x86", since = "1.27.0")]
358#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
359pub const fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
360    unsafe { simd_div(a, b) }
361}
362
363/// Computes the division of each of the 4 packed 64-bit floating-point elements
364/// in `a` by the corresponding packed elements in `b`.
365///
366/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd)
367#[inline]
368#[target_feature(enable = "avx")]
369#[cfg_attr(test, assert_instr(vdivpd))]
370#[stable(feature = "simd_x86", since = "1.27.0")]
371#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
372pub const fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
373    unsafe { simd_div(a, b) }
374}
375
/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// `ROUNDING` is checked at compile time with
/// `static_assert_uimm_bits!(ROUNDING, 4)`.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
    // Reject out-of-range rounding controls before the value is used.
    static_assert_uimm_bits!(ROUNDING, 4);
    // Forwards to the `roundpd256` binding, declared outside this excerpt.
    unsafe { roundpd256(a, ROUNDING) }
}
398
399/// Rounds packed double-precision (64-bit) floating point elements in `a`
400/// toward positive infinity.
401///
402/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd)
403#[inline]
404#[target_feature(enable = "avx")]
405#[cfg_attr(test, assert_instr(vroundpd))]
406#[stable(feature = "simd_x86", since = "1.27.0")]
407#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
408pub const fn _mm256_ceil_pd(a: __m256d) -> __m256d {
409    unsafe { simd_ceil(a) }
410}
411
412/// Rounds packed double-precision (64-bit) floating point elements in `a`
413/// toward negative infinity.
414///
415/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd)
416#[inline]
417#[target_feature(enable = "avx")]
418#[cfg_attr(test, assert_instr(vroundpd))]
419#[stable(feature = "simd_x86", since = "1.27.0")]
420#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
421pub const fn _mm256_floor_pd(a: __m256d) -> __m256d {
422    unsafe { simd_floor(a) }
423}
424
/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// `ROUNDING` is checked at compile time with
/// `static_assert_uimm_bits!(ROUNDING, 4)`.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
    // Reject out-of-range rounding controls before the value is used.
    static_assert_uimm_bits!(ROUNDING, 4);
    // Forwards to the `roundps256` binding, declared outside this excerpt.
    unsafe { roundps256(a, ROUNDING) }
}
447
448/// Rounds packed single-precision (32-bit) floating point elements in `a`
449/// toward positive infinity.
450///
451/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps)
452#[inline]
453#[target_feature(enable = "avx")]
454#[cfg_attr(test, assert_instr(vroundps))]
455#[stable(feature = "simd_x86", since = "1.27.0")]
456#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
457pub const fn _mm256_ceil_ps(a: __m256) -> __m256 {
458    unsafe { simd_ceil(a) }
459}
460
461/// Rounds packed single-precision (32-bit) floating point elements in `a`
462/// toward negative infinity.
463///
464/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps)
465#[inline]
466#[target_feature(enable = "avx")]
467#[cfg_attr(test, assert_instr(vroundps))]
468#[stable(feature = "simd_x86", since = "1.27.0")]
469#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
470pub const fn _mm256_floor_ps(a: __m256) -> __m256 {
471    unsafe { simd_floor(a) }
472}
473
474/// Returns the square root of packed single-precision (32-bit) floating point
475/// elements in `a`.
476///
477/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
478#[inline]
479#[target_feature(enable = "avx")]
480#[cfg_attr(test, assert_instr(vsqrtps))]
481#[stable(feature = "simd_x86", since = "1.27.0")]
482pub fn _mm256_sqrt_ps(a: __m256) -> __m256 {
483    unsafe { simd_fsqrt(a) }
484}
485
486/// Returns the square root of packed double-precision (64-bit) floating point
487/// elements in `a`.
488///
489/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
490#[inline]
491#[target_feature(enable = "avx")]
492#[cfg_attr(test, assert_instr(vsqrtpd))]
493#[stable(feature = "simd_x86", since = "1.27.0")]
494pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
495    unsafe { simd_fsqrt(a) }
496}
497
498/// Blends packed double-precision (64-bit) floating-point elements from
499/// `a` and `b` using control mask `imm8`.
500///
501/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
502#[inline]
503#[target_feature(enable = "avx")]
504// Note: LLVM7 prefers single-precision blend instructions when
505// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
506// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
507#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
508#[rustc_legacy_const_generics(2)]
509#[stable(feature = "simd_x86", since = "1.27.0")]
510#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
511pub const fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
512    static_assert_uimm_bits!(IMM4, 4);
513    unsafe {
514        simd_shuffle!(
515            a,
516            b,
517            [
518                ((IMM4 as u32 >> 0) & 1) * 4 + 0,
519                ((IMM4 as u32 >> 1) & 1) * 4 + 1,
520                ((IMM4 as u32 >> 2) & 1) * 4 + 2,
521                ((IMM4 as u32 >> 3) & 1) * 4 + 3,
522            ],
523        )
524    }
525}
526
527/// Blends packed single-precision (32-bit) floating-point elements from
528/// `a` and `b` using control mask `imm8`.
529///
530/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps)
531#[inline]
532#[target_feature(enable = "avx")]
533#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
534#[rustc_legacy_const_generics(2)]
535#[stable(feature = "simd_x86", since = "1.27.0")]
536#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
537pub const fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
538    static_assert_uimm_bits!(IMM8, 8);
539    unsafe {
540        simd_shuffle!(
541            a,
542            b,
543            [
544                ((IMM8 as u32 >> 0) & 1) * 8 + 0,
545                ((IMM8 as u32 >> 1) & 1) * 8 + 1,
546                ((IMM8 as u32 >> 2) & 1) * 8 + 2,
547                ((IMM8 as u32 >> 3) & 1) * 8 + 3,
548                ((IMM8 as u32 >> 4) & 1) * 8 + 4,
549                ((IMM8 as u32 >> 5) & 1) * 8 + 5,
550                ((IMM8 as u32 >> 6) & 1) * 8 + 6,
551                ((IMM8 as u32 >> 7) & 1) * 8 + 7,
552            ],
553        )
554    }
555}
556
557/// Blends packed double-precision (64-bit) floating-point elements from
558/// `a` and `b` using `c` as a mask.
559///
560/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd)
561#[inline]
562#[target_feature(enable = "avx")]
563#[cfg_attr(test, assert_instr(vblendvpd))]
564#[stable(feature = "simd_x86", since = "1.27.0")]
565#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
566pub const fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
567    unsafe {
568        let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
569        transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
570    }
571}
572
573/// Blends packed single-precision (32-bit) floating-point elements from
574/// `a` and `b` using `c` as a mask.
575///
576/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
577#[inline]
578#[target_feature(enable = "avx")]
579#[cfg_attr(test, assert_instr(vblendvps))]
580#[stable(feature = "simd_x86", since = "1.27.0")]
581#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
582pub const fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
583    unsafe {
584        let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
585        transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
586    }
587}
588
/// Conditionally multiplies the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `IMM8`,
/// sums the four products, and conditionally returns the sum
/// using the low 4 bits of `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    // Reject out-of-range control values before the value is used.
    static_assert_uimm_bits!(IMM8, 8);
    // Forwards to the `vdpps` binding (declared outside this excerpt); the
    // control value is cast to `i8` for that call.
    unsafe { vdpps(a, b, IMM8 as i8) }
}
604
605/// Horizontal addition of adjacent pairs in the two packed vectors
606/// of 4 64-bit floating points `a` and `b`.
607/// In the result, sums of elements from `a` are returned in even locations,
608/// while sums of elements from `b` are returned in odd locations.
609///
610/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd)
611#[inline]
612#[target_feature(enable = "avx")]
613#[cfg_attr(test, assert_instr(vhaddpd))]
614#[stable(feature = "simd_x86", since = "1.27.0")]
615#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
616pub const fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
617    unsafe {
618        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
619        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
620        simd_add(even, odd)
621    }
622}
623
624/// Horizontal addition of adjacent pairs in the two packed vectors
625/// of 8 32-bit floating points `a` and `b`.
626/// In the result, sums of elements from `a` are returned in locations of
627/// indices 0, 1, 4, 5; while sums of elements from `b` are locations
628/// 2, 3, 6, 7.
629///
630/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps)
631#[inline]
632#[target_feature(enable = "avx")]
633#[cfg_attr(test, assert_instr(vhaddps))]
634#[stable(feature = "simd_x86", since = "1.27.0")]
635#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
636pub const fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
637    unsafe {
638        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
639        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
640        simd_add(even, odd)
641    }
642}
643
644/// Horizontal subtraction of adjacent pairs in the two packed vectors
645/// of 4 64-bit floating points `a` and `b`.
646/// In the result, sums of elements from `a` are returned in even locations,
647/// while sums of elements from `b` are returned in odd locations.
648///
649/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd)
650#[inline]
651#[target_feature(enable = "avx")]
652#[cfg_attr(test, assert_instr(vhsubpd))]
653#[stable(feature = "simd_x86", since = "1.27.0")]
654#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
655pub const fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
656    unsafe {
657        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
658        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
659        simd_sub(even, odd)
660    }
661}
662
663/// Horizontal subtraction of adjacent pairs in the two packed vectors
664/// of 8 32-bit floating points `a` and `b`.
665/// In the result, sums of elements from `a` are returned in locations of
666/// indices 0, 1, 4, 5; while sums of elements from `b` are locations
667/// 2, 3, 6, 7.
668///
669/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps)
670#[inline]
671#[target_feature(enable = "avx")]
672#[cfg_attr(test, assert_instr(vhsubps))]
673#[stable(feature = "simd_x86", since = "1.27.0")]
674#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
675pub const fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
676    unsafe {
677        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
678        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
679        simd_sub(even, odd)
680    }
681}
682
683/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
684/// elements in `a` and `b`.
685///
686/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
687#[inline]
688#[target_feature(enable = "avx")]
689#[cfg_attr(test, assert_instr(vxorp))]
690#[stable(feature = "simd_x86", since = "1.27.0")]
691#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
692pub const fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
693    unsafe {
694        let a: u64x4 = transmute(a);
695        let b: u64x4 = transmute(b);
696        transmute(simd_xor(a, b))
697    }
698}
699
700/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
701/// elements in `a` and `b`.
702///
703/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps)
704#[inline]
705#[target_feature(enable = "avx")]
706#[cfg_attr(test, assert_instr(vxorps))]
707#[stable(feature = "simd_x86", since = "1.27.0")]
708#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
709pub const fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
710    unsafe {
711        let a: u32x8 = transmute(a);
712        let b: u32x8 = transmute(b);
713        transmute(simd_xor(a, b))
714    }
715}
716
// Comparison-predicate constants.
//
// The values run consecutively from 0x00. Each doc line names the predicate
// and, in parentheses, whether it is ordered or unordered and whether it is
// signaling or non-signaling. As the values below show, the constant at
// `0x10 + k` is the same predicate as the one at `k` with the
// signaling/non-signaling property flipped (e.g. `_CMP_EQ_OQ` = 0x00 and
// `_CMP_EQ_OS` = 0x10).
// NOTE(review): presumably consumed by the AVX compare intrinsics defined
// later in this file — confirm against the callers.

/// Equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_UQ: i32 = 0x19;
795/// Not-greater-than (unordered, non-signaling)
796#[stable(feature = "simd_x86", since = "1.27.0")]
797pub const _CMP_NGT_UQ: i32 = 0x1a;
798/// False (ordered, signaling)
799#[stable(feature = "simd_x86", since = "1.27.0")]
800pub const _CMP_FALSE_OS: i32 = 0x1b;
801/// Not-equal (ordered, signaling)
802#[stable(feature = "simd_x86", since = "1.27.0")]
803pub const _CMP_NEQ_OS: i32 = 0x1c;
804/// Greater-than-or-equal (ordered, non-signaling)
805#[stable(feature = "simd_x86", since = "1.27.0")]
806pub const _CMP_GE_OQ: i32 = 0x1d;
807/// Greater-than (ordered, non-signaling)
808#[stable(feature = "simd_x86", since = "1.27.0")]
809pub const _CMP_GT_OQ: i32 = 0x1e;
810/// True (unordered, signaling)
811#[stable(feature = "simd_x86", since = "1.27.0")]
812pub const _CMP_TRUE_US: i32 = 0x1f;
813
814/// Compares packed double-precision (64-bit) floating-point
815/// elements in `a` and `b` based on the comparison operand
816/// specified by `IMM5`.
817///
818/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
819#[inline]
820#[target_feature(enable = "avx")]
821#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
822#[rustc_legacy_const_generics(2)]
823#[stable(feature = "simd_x86", since = "1.27.0")]
824pub fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
825    static_assert_uimm_bits!(IMM5, 5);
826    unsafe { vcmppd(a, b, const { IMM5 as i8 }) }
827}
828
829/// Compares packed double-precision (64-bit) floating-point
830/// elements in `a` and `b` based on the comparison operand
831/// specified by `IMM5`.
832///
833/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
834#[inline]
835#[target_feature(enable = "avx")]
836#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
837#[rustc_legacy_const_generics(2)]
838#[stable(feature = "simd_x86", since = "1.27.0")]
839pub fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
840    static_assert_uimm_bits!(IMM5, 5);
841    unsafe { vcmppd256(a, b, IMM5 as u8) }
842}
843
844/// Compares packed single-precision (32-bit) floating-point
845/// elements in `a` and `b` based on the comparison operand
846/// specified by `IMM5`.
847///
848/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
849#[inline]
850#[target_feature(enable = "avx")]
851#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
852#[rustc_legacy_const_generics(2)]
853#[stable(feature = "simd_x86", since = "1.27.0")]
854pub fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
855    static_assert_uimm_bits!(IMM5, 5);
856    unsafe { vcmpps(a, b, const { IMM5 as i8 }) }
857}
858
859/// Compares packed single-precision (32-bit) floating-point
860/// elements in `a` and `b` based on the comparison operand
861/// specified by `IMM5`.
862///
863/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
864#[inline]
865#[target_feature(enable = "avx")]
866#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
867#[rustc_legacy_const_generics(2)]
868#[stable(feature = "simd_x86", since = "1.27.0")]
869pub fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
870    static_assert_uimm_bits!(IMM5, 5);
871    unsafe { vcmpps256(a, b, const { IMM5 as u8 }) }
872}
873
874/// Compares the lower double-precision (64-bit) floating-point element in
875/// `a` and `b` based on the comparison operand specified by `IMM5`,
876/// store the result in the lower element of returned vector,
877/// and copies the upper element from `a` to the upper element of returned
878/// vector.
879///
880/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
881#[inline]
882#[target_feature(enable = "avx")]
883#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
884#[rustc_legacy_const_generics(2)]
885#[stable(feature = "simd_x86", since = "1.27.0")]
886pub fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
887    static_assert_uimm_bits!(IMM5, 5);
888    unsafe { vcmpsd(a, b, IMM5 as i8) }
889}
890
891/// Compares the lower single-precision (32-bit) floating-point element in
892/// `a` and `b` based on the comparison operand specified by `IMM5`,
893/// store the result in the lower element of returned vector,
894/// and copies the upper 3 packed elements from `a` to the upper elements of
895/// returned vector.
896///
897/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
898#[inline]
899#[target_feature(enable = "avx")]
900#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
901#[rustc_legacy_const_generics(2)]
902#[stable(feature = "simd_x86", since = "1.27.0")]
903pub fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
904    static_assert_uimm_bits!(IMM5, 5);
905    unsafe { vcmpss(a, b, IMM5 as i8) }
906}
907
908/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
909/// floating-point elements.
910///
911/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd)
912#[inline]
913#[target_feature(enable = "avx")]
914#[cfg_attr(test, assert_instr(vcvtdq2pd))]
915#[stable(feature = "simd_x86", since = "1.27.0")]
916#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
917pub const fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
918    unsafe { simd_cast(a.as_i32x4()) }
919}
920
921/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
922/// floating-point elements.
923///
924/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps)
925#[inline]
926#[target_feature(enable = "avx")]
927#[cfg_attr(test, assert_instr(vcvtdq2ps))]
928#[stable(feature = "simd_x86", since = "1.27.0")]
929#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
930pub const fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
931    unsafe { simd_cast(a.as_i32x8()) }
932}
933
934/// Converts packed double-precision (64-bit) floating-point elements in `a`
935/// to packed single-precision (32-bit) floating-point elements.
936///
937/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps)
938#[inline]
939#[target_feature(enable = "avx")]
940#[cfg_attr(test, assert_instr(vcvtpd2ps))]
941#[stable(feature = "simd_x86", since = "1.27.0")]
942#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
943pub const fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
944    unsafe { simd_cast(a) }
945}
946
947/// Converts packed single-precision (32-bit) floating-point elements in `a`
948/// to packed 32-bit integers.
949///
950/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
951#[inline]
952#[target_feature(enable = "avx")]
953#[cfg_attr(test, assert_instr(vcvtps2dq))]
954#[stable(feature = "simd_x86", since = "1.27.0")]
955pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
956    unsafe { transmute(vcvtps2dq(a)) }
957}
958
959/// Converts packed single-precision (32-bit) floating-point elements in `a`
960/// to packed double-precision (64-bit) floating-point elements.
961///
962/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd)
963#[inline]
964#[target_feature(enable = "avx")]
965#[cfg_attr(test, assert_instr(vcvtps2pd))]
966#[stable(feature = "simd_x86", since = "1.27.0")]
967#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
968pub const fn _mm256_cvtps_pd(a: __m128) -> __m256d {
969    unsafe { simd_cast(a) }
970}
971
972/// Returns the first element of the input vector of `[4 x double]`.
973///
974/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsd_f64)
975#[inline]
976#[target_feature(enable = "avx")]
977//#[cfg_attr(test, assert_instr(movsd))] FIXME
978#[stable(feature = "simd_x86", since = "1.27.0")]
979#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
980pub const fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
981    unsafe { simd_extract!(a, 0) }
982}
983
984/// Converts packed double-precision (64-bit) floating-point elements in `a`
985/// to packed 32-bit integers with truncation.
986///
987/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
988#[inline]
989#[target_feature(enable = "avx")]
990#[cfg_attr(test, assert_instr(vcvttpd2dq))]
991#[stable(feature = "simd_x86", since = "1.27.0")]
992pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
993    unsafe { transmute(vcvttpd2dq(a)) }
994}
995
996/// Converts packed double-precision (64-bit) floating-point elements in `a`
997/// to packed 32-bit integers.
998///
999/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
1000#[inline]
1001#[target_feature(enable = "avx")]
1002#[cfg_attr(test, assert_instr(vcvtpd2dq))]
1003#[stable(feature = "simd_x86", since = "1.27.0")]
1004pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
1005    unsafe { transmute(vcvtpd2dq(a)) }
1006}
1007
1008/// Converts packed single-precision (32-bit) floating-point elements in `a`
1009/// to packed 32-bit integers with truncation.
1010///
1011/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
1012#[inline]
1013#[target_feature(enable = "avx")]
1014#[cfg_attr(test, assert_instr(vcvttps2dq))]
1015#[stable(feature = "simd_x86", since = "1.27.0")]
1016pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
1017    unsafe { transmute(vcvttps2dq(a)) }
1018}
1019
1020/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
1021/// floating-point elements) from `a`, selected with `imm8`.
1022///
1023/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps)
1024#[inline]
1025#[target_feature(enable = "avx")]
1026#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
1027#[rustc_legacy_const_generics(1)]
1028#[stable(feature = "simd_x86", since = "1.27.0")]
1029#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1030pub const fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
1031    static_assert_uimm_bits!(IMM1, 1);
1032    unsafe {
1033        simd_shuffle!(
1034            a,
1035            _mm256_undefined_ps(),
1036            [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
1037        )
1038    }
1039}
1040
1041/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
1042/// floating-point elements) from `a`, selected with `imm8`.
1043///
1044/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd)
1045#[inline]
1046#[target_feature(enable = "avx")]
1047#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
1048#[rustc_legacy_const_generics(1)]
1049#[stable(feature = "simd_x86", since = "1.27.0")]
1050#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1051pub const fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
1052    static_assert_uimm_bits!(IMM1, 1);
1053    unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize]) }
1054}
1055
1056/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
1057///
1058/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256)
1059#[inline]
1060#[target_feature(enable = "avx")]
1061#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
1062#[rustc_legacy_const_generics(1)]
1063#[stable(feature = "simd_x86", since = "1.27.0")]
1064#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1065pub const fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
1066    static_assert_uimm_bits!(IMM1, 1);
1067    unsafe {
1068        let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
1069        transmute(dst)
1070    }
1071}
1072
1073/// Extracts a 32-bit integer from `a`, selected with `INDEX`.
1074///
1075/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi32)
1076#[inline]
1077#[target_feature(enable = "avx")]
1078// This intrinsic has no corresponding instruction.
1079#[rustc_legacy_const_generics(1)]
1080#[stable(feature = "simd_x86", since = "1.27.0")]
1081#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1082pub const fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
1083    static_assert_uimm_bits!(INDEX, 3);
1084    unsafe { simd_extract!(a.as_i32x8(), INDEX as u32) }
1085}
1086
1087/// Returns the first element of the input vector of `[8 x i32]`.
1088///
1089/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsi256_si32)
1090#[inline]
1091#[target_feature(enable = "avx")]
1092#[stable(feature = "simd_x86", since = "1.27.0")]
1093#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1094pub const fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
1095    unsafe { simd_extract!(a.as_i32x8(), 0) }
1096}
1097
1098/// Zeroes the contents of all XMM or YMM registers.
1099///
1100/// This operation is purely a performance hint for the CPU and has no effect on the Abstract
1101/// Machine state.
1102///
1103/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
1104#[inline]
1105#[target_feature(enable = "avx")]
1106#[cfg_attr(test, assert_instr(vzeroall))]
1107#[stable(feature = "simd_x86", since = "1.27.0")]
1108pub fn _mm256_zeroall() {
1109    unsafe { vzeroall() }
1110}
1111
1112/// Zeroes the upper 128 bits of all YMM registers;
1113/// the lower 128-bits of the registers are unmodified.
1114///
1115/// This operation is purely a performance hint for the CPU and has no effect on the Abstract
1116/// Machine state.
1117///
1118/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
1119#[inline]
1120#[target_feature(enable = "avx")]
1121#[cfg_attr(test, assert_instr(vzeroupper))]
1122#[stable(feature = "simd_x86", since = "1.27.0")]
1123pub fn _mm256_zeroupper() {
1124    unsafe { vzeroupper() }
1125}
1126
1127/// Shuffles single-precision (32-bit) floating-point elements in `a`
1128/// within 128-bit lanes using the control in `b`.
1129///
1130/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
1131#[inline]
1132#[target_feature(enable = "avx")]
1133#[cfg_attr(test, assert_instr(vpermilps))]
1134#[stable(feature = "simd_x86", since = "1.27.0")]
1135pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
1136    unsafe { vpermilps256(a, b.as_i32x8()) }
1137}
1138
1139/// Shuffles single-precision (32-bit) floating-point elements in `a`
1140/// using the control in `b`.
1141///
1142/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
1143#[inline]
1144#[target_feature(enable = "avx")]
1145#[cfg_attr(test, assert_instr(vpermilps))]
1146#[stable(feature = "simd_x86", since = "1.27.0")]
1147pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
1148    unsafe { vpermilps(a, b.as_i32x4()) }
1149}
1150
1151/// Shuffles single-precision (32-bit) floating-point elements in `a`
1152/// within 128-bit lanes using the control in `imm8`.
1153///
1154/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps)
1155#[inline]
1156#[target_feature(enable = "avx")]
1157#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
1158#[rustc_legacy_const_generics(1)]
1159#[stable(feature = "simd_x86", since = "1.27.0")]
1160#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1161pub const fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
1162    static_assert_uimm_bits!(IMM8, 8);
1163    unsafe {
1164        simd_shuffle!(
1165            a,
1166            _mm256_undefined_ps(),
1167            [
1168                (IMM8 as u32 >> 0) & 0b11,
1169                (IMM8 as u32 >> 2) & 0b11,
1170                (IMM8 as u32 >> 4) & 0b11,
1171                (IMM8 as u32 >> 6) & 0b11,
1172                ((IMM8 as u32 >> 0) & 0b11) + 4,
1173                ((IMM8 as u32 >> 2) & 0b11) + 4,
1174                ((IMM8 as u32 >> 4) & 0b11) + 4,
1175                ((IMM8 as u32 >> 6) & 0b11) + 4,
1176            ],
1177        )
1178    }
1179}
1180
1181/// Shuffles single-precision (32-bit) floating-point elements in `a`
1182/// using the control in `imm8`.
1183///
1184/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
1185#[inline]
1186#[target_feature(enable = "avx")]
1187#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
1188#[rustc_legacy_const_generics(1)]
1189#[stable(feature = "simd_x86", since = "1.27.0")]
1190#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1191pub const fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
1192    static_assert_uimm_bits!(IMM8, 8);
1193    unsafe {
1194        simd_shuffle!(
1195            a,
1196            _mm_undefined_ps(),
1197            [
1198                (IMM8 as u32 >> 0) & 0b11,
1199                (IMM8 as u32 >> 2) & 0b11,
1200                (IMM8 as u32 >> 4) & 0b11,
1201                (IMM8 as u32 >> 6) & 0b11,
1202            ],
1203        )
1204    }
1205}
1206
1207/// Shuffles double-precision (64-bit) floating-point elements in `a`
1208/// within 256-bit lanes using the control in `b`.
1209///
1210/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
1211#[inline]
1212#[target_feature(enable = "avx")]
1213#[cfg_attr(test, assert_instr(vpermilpd))]
1214#[stable(feature = "simd_x86", since = "1.27.0")]
1215pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
1216    unsafe { vpermilpd256(a, b.as_i64x4()) }
1217}
1218
1219/// Shuffles double-precision (64-bit) floating-point elements in `a`
1220/// using the control in `b`.
1221///
1222/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
1223#[inline]
1224#[target_feature(enable = "avx")]
1225#[cfg_attr(test, assert_instr(vpermilpd))]
1226#[stable(feature = "simd_x86", since = "1.27.0")]
1227pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
1228    unsafe { vpermilpd(a, b.as_i64x2()) }
1229}
1230
1231/// Shuffles double-precision (64-bit) floating-point elements in `a`
1232/// within 128-bit lanes using the control in `imm8`.
1233///
1234/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd)
1235#[inline]
1236#[target_feature(enable = "avx")]
1237#[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))]
1238#[rustc_legacy_const_generics(1)]
1239#[stable(feature = "simd_x86", since = "1.27.0")]
1240#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1241pub const fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
1242    static_assert_uimm_bits!(IMM4, 4);
1243    unsafe {
1244        simd_shuffle!(
1245            a,
1246            _mm256_undefined_pd(),
1247            [
1248                ((IMM4 as u32 >> 0) & 1),
1249                ((IMM4 as u32 >> 1) & 1),
1250                ((IMM4 as u32 >> 2) & 1) + 2,
1251                ((IMM4 as u32 >> 3) & 1) + 2,
1252            ],
1253        )
1254    }
1255}
1256
1257/// Shuffles double-precision (64-bit) floating-point elements in `a`
1258/// using the control in `imm8`.
1259///
1260/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd)
1261#[inline]
1262#[target_feature(enable = "avx")]
1263#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))]
1264#[rustc_legacy_const_generics(1)]
1265#[stable(feature = "simd_x86", since = "1.27.0")]
1266#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1267pub const fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
1268    static_assert_uimm_bits!(IMM2, 2);
1269    unsafe {
1270        simd_shuffle!(
1271            a,
1272            _mm_undefined_pd(),
1273            [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
1274        )
1275    }
1276}
1277
1278/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
1279/// floating-point elements) selected by `imm8` from `a` and `b`.
1280///
1281/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps)
1282#[inline]
1283#[target_feature(enable = "avx")]
1284#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
1285#[rustc_legacy_const_generics(2)]
1286#[stable(feature = "simd_x86", since = "1.27.0")]
1287#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1288pub const fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
1289    static_assert_uimm_bits!(IMM8, 8);
1290    _mm256_castsi256_ps(_mm256_permute2f128_si256::<IMM8>(
1291        _mm256_castps_si256(a),
1292        _mm256_castps_si256(b),
1293    ))
1294}
1295
1296/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
1297/// floating-point elements) selected by `imm8` from `a` and `b`.
1298///
1299/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd)
1300#[inline]
1301#[target_feature(enable = "avx")]
1302#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
1303#[rustc_legacy_const_generics(2)]
1304#[stable(feature = "simd_x86", since = "1.27.0")]
1305#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1306pub const fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
1307    static_assert_uimm_bits!(IMM8, 8);
1308    _mm256_castsi256_pd(_mm256_permute2f128_si256::<IMM8>(
1309        _mm256_castpd_si256(a),
1310        _mm256_castpd_si256(b),
1311    ))
1312}
1313
1314/// Shuffles 128-bits (composed of integer data) selected by `imm8`
1315/// from `a` and `b`.
1316///
1317/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256)
1318#[inline]
1319#[target_feature(enable = "avx")]
1320#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
1321#[rustc_legacy_const_generics(2)]
1322#[stable(feature = "simd_x86", since = "1.27.0")]
1323#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1324pub const fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
1325    static_assert_uimm_bits!(IMM8, 8);
1326    const fn idx(imm8: i32, pos: u32) -> u32 {
1327        let part = if pos < 2 {
1328            imm8 & 0xf
1329        } else {
1330            (imm8 & 0xf0) >> 4
1331        };
1332        2 * (part as u32 & 0b11) + (pos & 1)
1333    }
1334    const fn idx0(imm8: i32, pos: u32) -> u32 {
1335        let part = if pos < 2 {
1336            imm8 & 0xf
1337        } else {
1338            (imm8 & 0xf0) >> 4
1339        };
1340        if part & 0b1000 != 0 { 4 } else { pos }
1341    }
1342    unsafe {
1343        let r = simd_shuffle!(
1344            a.as_i64x4(),
1345            b.as_i64x4(),
1346            [idx(IMM8, 0), idx(IMM8, 1), idx(IMM8, 2), idx(IMM8, 3)]
1347        );
1348        let r: i64x4 = simd_shuffle!(
1349            r,
1350            i64x4::ZERO,
1351            [idx0(IMM8, 0), idx0(IMM8, 1), idx0(IMM8, 2), idx0(IMM8, 3)]
1352        );
1353        r.as_m256i()
1354    }
1355}
1356
1357/// Broadcasts a single-precision (32-bit) floating-point element from memory
1358/// to all elements of the returned vector.
1359///
1360/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss)
1361#[inline]
1362#[target_feature(enable = "avx")]
1363#[cfg_attr(test, assert_instr(vbroadcastss))]
1364#[stable(feature = "simd_x86", since = "1.27.0")]
1365#[allow(clippy::trivially_copy_pass_by_ref)]
1366#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1367pub const fn _mm256_broadcast_ss(f: &f32) -> __m256 {
1368    _mm256_set1_ps(*f)
1369}
1370
1371/// Broadcasts a single-precision (32-bit) floating-point element from memory
1372/// to all elements of the returned vector.
1373///
1374/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss)
1375#[inline]
1376#[target_feature(enable = "avx")]
1377#[cfg_attr(test, assert_instr(vbroadcastss))]
1378#[stable(feature = "simd_x86", since = "1.27.0")]
1379#[allow(clippy::trivially_copy_pass_by_ref)]
1380#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1381pub const fn _mm_broadcast_ss(f: &f32) -> __m128 {
1382    _mm_set1_ps(*f)
1383}
1384
1385/// Broadcasts a double-precision (64-bit) floating-point element from memory
1386/// to all elements of the returned vector.
1387///
1388/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd)
1389#[inline]
1390#[target_feature(enable = "avx")]
1391#[cfg_attr(test, assert_instr(vbroadcastsd))]
1392#[stable(feature = "simd_x86", since = "1.27.0")]
1393#[allow(clippy::trivially_copy_pass_by_ref)]
1394#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1395pub const fn _mm256_broadcast_sd(f: &f64) -> __m256d {
1396    _mm256_set1_pd(*f)
1397}
1398
1399/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
1400/// (32-bit) floating-point elements) to all elements of the returned vector.
1401///
1402/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
1403#[inline]
1404#[target_feature(enable = "avx")]
1405#[cfg_attr(test, assert_instr(vbroadcastf128))]
1406#[stable(feature = "simd_x86", since = "1.27.0")]
1407#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1408pub const fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
1409    unsafe { simd_shuffle!(*a, _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3]) }
1410}
1411
1412/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
1413/// (64-bit) floating-point elements) to all elements of the returned vector.
1414///
1415/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd)
1416#[inline]
1417#[target_feature(enable = "avx")]
1418#[cfg_attr(test, assert_instr(vbroadcastf128))]
1419#[stable(feature = "simd_x86", since = "1.27.0")]
1420#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1421pub const fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
1422    unsafe { simd_shuffle!(*a, _mm_setzero_pd(), [0, 1, 0, 1]) }
1423}
1424
1425/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
1426/// single-precision (32-bit) floating-point elements) from `b` into result
1427/// at the location specified by `imm8`.
1428///
1429/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
1430#[inline]
1431#[target_feature(enable = "avx")]
1432#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1433#[rustc_legacy_const_generics(2)]
1434#[stable(feature = "simd_x86", since = "1.27.0")]
1435#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1436pub const fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
1437    static_assert_uimm_bits!(IMM1, 1);
1438    unsafe {
1439        simd_shuffle!(
1440            a,
1441            _mm256_castps128_ps256(b),
1442            [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
1443        )
1444    }
1445}
1446
1447/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
1448/// double-precision (64-bit) floating-point elements) from `b` into result
1449/// at the location specified by `imm8`.
1450///
1451/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
1452#[inline]
1453#[target_feature(enable = "avx")]
1454#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1455#[rustc_legacy_const_generics(2)]
1456#[stable(feature = "simd_x86", since = "1.27.0")]
1457#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1458pub const fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
1459    static_assert_uimm_bits!(IMM1, 1);
1460    unsafe {
1461        simd_shuffle!(
1462            a,
1463            _mm256_castpd128_pd256(b),
1464            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1465        )
1466    }
1467}
1468
1469/// Copies `a` to result, then inserts 128 bits from `b` into result
1470/// at the location specified by `imm8`.
1471///
1472/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
1473#[inline]
1474#[target_feature(enable = "avx")]
1475#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1476#[rustc_legacy_const_generics(2)]
1477#[stable(feature = "simd_x86", since = "1.27.0")]
1478#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1479pub const fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1480    static_assert_uimm_bits!(IMM1, 1);
1481    unsafe {
1482        let dst: i64x4 = simd_shuffle!(
1483            a.as_i64x4(),
1484            _mm256_castsi128_si256(b).as_i64x4(),
1485            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1486        );
1487        transmute(dst)
1488    }
1489}
1490
1491/// Copies `a` to result, and inserts the 8-bit integer `i` into result
1492/// at the location specified by `index`.
1493///
1494/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
1495#[inline]
1496#[target_feature(enable = "avx")]
1497// This intrinsic has no corresponding instruction.
1498#[rustc_legacy_const_generics(2)]
1499#[stable(feature = "simd_x86", since = "1.27.0")]
1500#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1501pub const fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
1502    static_assert_uimm_bits!(INDEX, 5);
1503    unsafe { transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i)) }
1504}
1505
1506/// Copies `a` to result, and inserts the 16-bit integer `i` into result
1507/// at the location specified by `index`.
1508///
1509/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
1510#[inline]
1511#[target_feature(enable = "avx")]
1512// This intrinsic has no corresponding instruction.
1513#[rustc_legacy_const_generics(2)]
1514#[stable(feature = "simd_x86", since = "1.27.0")]
1515#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1516pub const fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
1517    static_assert_uimm_bits!(INDEX, 4);
1518    unsafe { transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i)) }
1519}
1520
1521/// Copies `a` to result, and inserts the 32-bit integer `i` into result
1522/// at the location specified by `index`.
1523///
1524/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
1525#[inline]
1526#[target_feature(enable = "avx")]
1527// This intrinsic has no corresponding instruction.
1528#[rustc_legacy_const_generics(2)]
1529#[stable(feature = "simd_x86", since = "1.27.0")]
1530#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1531pub const fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
1532    static_assert_uimm_bits!(INDEX, 3);
1533    unsafe { transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i)) }
1534}
1535
1536/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
1537/// floating-point elements) from memory into result.
1538/// `mem_addr` must be aligned on a 32-byte boundary or a
1539/// general-protection exception may be generated.
1540///
1541/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
1542#[inline]
1543#[target_feature(enable = "avx")]
1544#[cfg_attr(
1545    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1546    assert_instr(vmovap)
1547)]
1548#[stable(feature = "simd_x86", since = "1.27.0")]
1549#[allow(clippy::cast_ptr_alignment)]
1550#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1551pub const unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
1552    *(mem_addr as *const __m256d)
1553}
1554
1555/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
1556/// floating-point elements) from `a` into memory.
1557/// `mem_addr` must be aligned on a 32-byte boundary or a
1558/// general-protection exception may be generated.
1559///
1560/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
1561#[inline]
1562#[target_feature(enable = "avx")]
1563#[cfg_attr(
1564    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1565    assert_instr(vmovap)
1566)]
1567#[stable(feature = "simd_x86", since = "1.27.0")]
1568#[allow(clippy::cast_ptr_alignment)]
1569#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1570pub const unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
1571    *(mem_addr as *mut __m256d) = a;
1572}
1573
1574/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
1575/// floating-point elements) from memory into result.
1576/// `mem_addr` must be aligned on a 32-byte boundary or a
1577/// general-protection exception may be generated.
1578///
1579/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
1580#[inline]
1581#[target_feature(enable = "avx")]
1582#[cfg_attr(
1583    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1584    assert_instr(vmovaps)
1585)]
1586#[stable(feature = "simd_x86", since = "1.27.0")]
1587#[allow(clippy::cast_ptr_alignment)]
1588#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1589pub const unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
1590    *(mem_addr as *const __m256)
1591}
1592
1593/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
1594/// floating-point elements) from `a` into memory.
1595/// `mem_addr` must be aligned on a 32-byte boundary or a
1596/// general-protection exception may be generated.
1597///
1598/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
1599#[inline]
1600#[target_feature(enable = "avx")]
1601#[cfg_attr(
1602    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1603    assert_instr(vmovaps)
1604)]
1605#[stable(feature = "simd_x86", since = "1.27.0")]
1606#[allow(clippy::cast_ptr_alignment)]
1607#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1608pub const unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
1609    *(mem_addr as *mut __m256) = a;
1610}
1611
1612/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
1613/// floating-point elements) from memory into result.
1614/// `mem_addr` does not need to be aligned on any particular boundary.
1615///
1616/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd)
1617#[inline]
1618#[target_feature(enable = "avx")]
1619#[cfg_attr(test, assert_instr(vmovup))]
1620#[stable(feature = "simd_x86", since = "1.27.0")]
1621#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1622pub const unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
1623    let mut dst = _mm256_undefined_pd();
1624    ptr::copy_nonoverlapping(
1625        mem_addr as *const u8,
1626        ptr::addr_of_mut!(dst) as *mut u8,
1627        mem::size_of::<__m256d>(),
1628    );
1629    dst
1630}
1631
1632/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
1633/// floating-point elements) from `a` into memory.
1634/// `mem_addr` does not need to be aligned on any particular boundary.
1635///
1636/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd)
1637#[inline]
1638#[target_feature(enable = "avx")]
1639#[cfg_attr(test, assert_instr(vmovup))]
1640#[stable(feature = "simd_x86", since = "1.27.0")]
1641#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1642pub const unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
1643    mem_addr.cast::<__m256d>().write_unaligned(a);
1644}
1645
1646/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
1647/// floating-point elements) from memory into result.
1648/// `mem_addr` does not need to be aligned on any particular boundary.
1649///
1650/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps)
1651#[inline]
1652#[target_feature(enable = "avx")]
1653#[cfg_attr(test, assert_instr(vmovups))]
1654#[stable(feature = "simd_x86", since = "1.27.0")]
1655#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1656pub const unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
1657    let mut dst = _mm256_undefined_ps();
1658    ptr::copy_nonoverlapping(
1659        mem_addr as *const u8,
1660        ptr::addr_of_mut!(dst) as *mut u8,
1661        mem::size_of::<__m256>(),
1662    );
1663    dst
1664}
1665
1666/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
1667/// floating-point elements) from `a` into memory.
1668/// `mem_addr` does not need to be aligned on any particular boundary.
1669///
1670/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps)
1671#[inline]
1672#[target_feature(enable = "avx")]
1673#[cfg_attr(test, assert_instr(vmovups))]
1674#[stable(feature = "simd_x86", since = "1.27.0")]
1675#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1676pub const unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
1677    mem_addr.cast::<__m256>().write_unaligned(a);
1678}
1679
/// Loads 256-bits of integer data from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// The load is a plain dereference of `mem_addr`, so the pointer must also be
/// valid for reading one `__m256i`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)] // FIXME vmovdqa expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
    // The pointer is already typed as a whole vector, so no cast is needed.
    *mem_addr
}
1696
/// Stores 256-bits of integer data from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// The store is a plain assignment through `mem_addr`, so the pointer must
/// also be valid for writing one `__m256i`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)] // FIXME vmovdqa expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
    // The pointer is already typed as a whole vector, so no cast is needed.
    *mem_addr = a;
}
1713
1714/// Loads 256-bits of integer data from memory into result.
1715/// `mem_addr` does not need to be aligned on any particular boundary.
1716///
1717/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256)
1718#[inline]
1719#[target_feature(enable = "avx")]
1720#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1721#[stable(feature = "simd_x86", since = "1.27.0")]
1722#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1723pub const unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
1724    let mut dst = _mm256_undefined_si256();
1725    ptr::copy_nonoverlapping(
1726        mem_addr as *const u8,
1727        ptr::addr_of_mut!(dst) as *mut u8,
1728        mem::size_of::<__m256i>(),
1729    );
1730    dst
1731}
1732
1733/// Stores 256-bits of integer data from `a` into memory.
1734/// `mem_addr` does not need to be aligned on any particular boundary.
1735///
1736/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256)
1737#[inline]
1738#[target_feature(enable = "avx")]
1739#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1740#[stable(feature = "simd_x86", since = "1.27.0")]
1741#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1742pub const unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
1743    mem_addr.write_unaligned(a);
1744}
1745
1746/// Loads packed double-precision (64-bit) floating-point elements from memory
1747/// into result using `mask` (elements are zeroed out when the high bit of the
1748/// corresponding element is not set).
1749///
1750/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd)
1751#[inline]
1752#[target_feature(enable = "avx")]
1753#[cfg_attr(test, assert_instr(vmaskmovpd))]
1754#[stable(feature = "simd_x86", since = "1.27.0")]
1755#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1756pub const unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
1757    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
1758    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_pd())
1759}
1760
1761/// Stores packed double-precision (64-bit) floating-point elements from `a`
1762/// into memory using `mask`.
1763///
1764/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd)
1765#[inline]
1766#[target_feature(enable = "avx")]
1767#[cfg_attr(test, assert_instr(vmaskmovpd))]
1768#[stable(feature = "simd_x86", since = "1.27.0")]
1769#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1770pub const unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
1771    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
1772    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
1773}
1774
1775/// Loads packed double-precision (64-bit) floating-point elements from memory
1776/// into result using `mask` (elements are zeroed out when the high bit of the
1777/// corresponding element is not set).
1778///
1779/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd)
1780#[inline]
1781#[target_feature(enable = "avx")]
1782#[cfg_attr(test, assert_instr(vmaskmovpd))]
1783#[stable(feature = "simd_x86", since = "1.27.0")]
1784#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1785pub const unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
1786    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
1787    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_pd())
1788}
1789
1790/// Stores packed double-precision (64-bit) floating-point elements from `a`
1791/// into memory using `mask`.
1792///
1793/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd)
1794#[inline]
1795#[target_feature(enable = "avx")]
1796#[cfg_attr(test, assert_instr(vmaskmovpd))]
1797#[stable(feature = "simd_x86", since = "1.27.0")]
1798#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1799pub const unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
1800    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
1801    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
1802}
1803
1804/// Loads packed single-precision (32-bit) floating-point elements from memory
1805/// into result using `mask` (elements are zeroed out when the high bit of the
1806/// corresponding element is not set).
1807///
1808/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps)
1809#[inline]
1810#[target_feature(enable = "avx")]
1811#[cfg_attr(test, assert_instr(vmaskmovps))]
1812#[stable(feature = "simd_x86", since = "1.27.0")]
1813#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1814pub const unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
1815    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
1816    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_ps())
1817}
1818
1819/// Stores packed single-precision (32-bit) floating-point elements from `a`
1820/// into memory using `mask`.
1821///
1822/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps)
1823#[inline]
1824#[target_feature(enable = "avx")]
1825#[cfg_attr(test, assert_instr(vmaskmovps))]
1826#[stable(feature = "simd_x86", since = "1.27.0")]
1827#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1828pub const unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
1829    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
1830    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
1831}
1832
1833/// Loads packed single-precision (32-bit) floating-point elements from memory
1834/// into result using `mask` (elements are zeroed out when the high bit of the
1835/// corresponding element is not set).
1836///
1837/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps)
1838#[inline]
1839#[target_feature(enable = "avx")]
1840#[cfg_attr(test, assert_instr(vmaskmovps))]
1841#[stable(feature = "simd_x86", since = "1.27.0")]
1842#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1843pub const unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
1844    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
1845    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_ps())
1846}
1847
1848/// Stores packed single-precision (32-bit) floating-point elements from `a`
1849/// into memory using `mask`.
1850///
1851/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps)
1852#[inline]
1853#[target_feature(enable = "avx")]
1854#[cfg_attr(test, assert_instr(vmaskmovps))]
1855#[stable(feature = "simd_x86", since = "1.27.0")]
1856#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1857pub const unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
1858    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
1859    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
1860}
1861
/// Duplicates odd-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
///
/// The result is `[a1, a1, a3, a3, a5, a5, a7, a7]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovshdup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movehdup_ps(a: __m256) -> __m256 {
    // Both shuffle inputs are `a`; every index used here (all below 8)
    // selects an element of the first input.
    unsafe { simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) }
}
1874
/// Duplicates even-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
///
/// The result is `[a0, a0, a2, a2, a4, a4, a6, a6]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovsldup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_moveldup_ps(a: __m256) -> __m256 {
    // Both shuffle inputs are `a`; every index used here (all below 8)
    // selects an element of the first input.
    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) }
}
1887
/// Duplicates even-indexed double-precision (64-bit) floating-point elements
/// from `a`, and returns the results.
///
/// The result is `[a0, a0, a2, a2]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_movedup_pd(a: __m256d) -> __m256d {
    // Both shuffle inputs are `a`; every index used here (all below 4)
    // selects an element of the first input.
    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) }
}
1900
1901/// Loads 256-bits of integer data from unaligned memory into result.
1902/// This intrinsic may perform better than `_mm256_loadu_si256` when the
1903/// data crosses a cache line boundary.
1904///
1905/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
1906#[inline]
1907#[target_feature(enable = "avx")]
1908#[cfg_attr(test, assert_instr(vlddqu))]
1909#[stable(feature = "simd_x86", since = "1.27.0")]
1910pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
1911    transmute(vlddqu(mem_addr as *const i8))
1912}
1913
/// Moves integer data from a 256-bit integer vector to a 32-byte
/// aligned memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    //
    // `p` carries the destination address in a general-purpose register and
    // `a` the source vector in a ymm register. The options deliberately omit
    // `nomem`/`readonly`, because the instruction writes to memory.
    // NOTE(review): `vps!` presumably assembles the instruction text with the
    // memory operand built from `{p}` — confirm against the macro definition.
    crate::arch::asm!(
        vps!("vmovntdq", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1941
/// Moves double-precision values from a 256-bit vector of `[4 x double]`
/// to a 32-byte aligned memory location. To minimize caching, the data is
/// flagged as non-temporal (unlikely to be used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    //
    // `p` carries the destination address in a general-purpose register and
    // `a` the source vector in a ymm register. The options deliberately omit
    // `nomem`/`readonly`, because the instruction writes to memory.
    // NOTE(review): `vps!` presumably assembles the instruction text with the
    // memory operand built from `{p}` — confirm against the macro definition.
    crate::arch::asm!(
        vps!("vmovntpd", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1970
/// Moves single-precision floating point values from a 256-bit vector
/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
/// caching, the data is flagged as non-temporal (unlikely to be used again
/// soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    //
    // `p` carries the destination address in a general-purpose register and
    // `a` the source vector in a ymm register. The options deliberately omit
    // `nomem`/`readonly`, because the instruction writes to memory.
    // NOTE(review): `vps!` presumably assembles the instruction text with the
    // memory operand built from `{p}` — confirm against the macro definition.
    crate::arch::asm!(
        vps!("vmovntps", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
2000
/// Computes the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`, and returns the results. The maximum
/// relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vrcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_rcp_ps(a: __m256) -> __m256 {
    // NOTE(review): `vrcpps` is declared outside this chunk; presumably a
    // binding to the backend's intrinsic for the instruction — confirm.
    unsafe { vrcpps(a) }
}
2013
/// Computes the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`, and returns the results.
/// The maximum relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vrsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
    // NOTE(review): `vrsqrtps` is declared outside this chunk; presumably a
    // binding to the backend's intrinsic for the instruction — confirm.
    unsafe { vrsqrtps(a) }
}
2026
/// Unpacks and interleaves double-precision (64-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
///
/// The result is `[a1, b1, a3, b3]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
    // Shuffle indices 0..=3 select elements of `a`, 4..=7 elements of `b`.
    unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) }
}
2039
/// Unpacks and interleaves single-precision (32-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
///
/// The result is `[a2, b2, a3, b3, a6, b6, a7, b7]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
    // Shuffle indices 0..=7 select elements of `a`, 8..=15 elements of `b`.
    unsafe { simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) }
}
2052
/// Unpacks and interleaves double-precision (64-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
///
/// The result is `[a0, b0, a2, b2]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
    // Shuffle indices 0..=3 select elements of `a`, 4..=7 elements of `b`.
    unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) }
}
2065
/// Unpacks and interleaves single-precision (32-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
///
/// The result is `[a0, b0, a1, b1, a4, b4, a5, b5]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
    // Shuffle indices 0..=7 select elements of `a`, 8..=15 elements of `b`.
    unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) }
}
2078
2079/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
2080/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
2081/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
2082/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value.
2083///
2084/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
2085#[inline]
2086#[target_feature(enable = "avx")]
2087#[cfg_attr(test, assert_instr(vptest))]
2088#[stable(feature = "simd_x86", since = "1.27.0")]
2089#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2090pub const fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
2091    unsafe {
2092        let r = simd_and(a.as_i64x4(), b.as_i64x4());
2093        (0i64 == simd_reduce_or(r)) as i32
2094    }
2095}
2096
2097/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
2098/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
2099/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
2100/// the result is zero, otherwise set `CF` to 0. Return the `CF` value.
2101///
2102/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256)
2103#[inline]
2104#[target_feature(enable = "avx")]
2105#[cfg_attr(test, assert_instr(vptest))]
2106#[stable(feature = "simd_x86", since = "1.27.0")]
2107#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2108pub const fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
2109    unsafe {
2110        let r = simd_and(simd_xor(a.as_i64x4(), i64x4::splat(!0)), b.as_i64x4());
2111        (0i64 == simd_reduce_or(r)) as i32
2112    }
2113}
2114
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and
/// `CF` values are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
    // Both operands are passed as four 64-bit integer lanes.
    // NOTE(review): `ptestnzc256` is declared outside this chunk; presumably
    // a binding to the backend's intrinsic for the instruction — confirm.
    unsafe { ptestnzc256(a.as_i64x4(), b.as_i64x4()) }
}
2129
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
    // NOTE(review): `vtestzpd256` is declared outside this chunk; presumably
    // a binding to the backend's intrinsic for the instruction — confirm.
    unsafe { vtestzpd256(a, b) }
}
2146
2147/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
2148/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2149/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
2150/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2151/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2152/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2153/// is zero, otherwise set `CF` to 0. Return the `CF` value.
2154///
2155/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
2156#[inline]
2157#[target_feature(enable = "avx")]
2158#[cfg_attr(test, assert_instr(vtestpd))]
2159#[stable(feature = "simd_x86", since = "1.27.0")]
2160pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
2161    unsafe { vtestcpd256(a, b) }
2162}
2163
2164/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
2165/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2166/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
2167/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2168/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2169/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2170/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
2171/// are zero, otherwise return 0.
2172///
2173/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
2174#[inline]
2175#[target_feature(enable = "avx")]
2176#[cfg_attr(test, assert_instr(vtestpd))]
2177#[stable(feature = "simd_x86", since = "1.27.0")]
2178pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
2179    unsafe { vtestnzcpd256(a, b) }
2180}
2181
2182/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2183/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2184/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
2185/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2186/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2187/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2188/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
2189///
2190/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd)
2191#[inline]
2192#[target_feature(enable = "avx")]
2193#[cfg_attr(test, assert_instr(vtestpd))]
2194#[stable(feature = "simd_x86", since = "1.27.0")]
2195#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2196pub const fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
2197    unsafe {
2198        let r: i64x2 = simd_lt(transmute(_mm_and_pd(a, b)), i64x2::ZERO);
2199        (0i64 == simd_reduce_or(r)) as i32
2200    }
2201}
2202
2203/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2204/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2205/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
2206/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2207/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2208/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2209/// is zero, otherwise set `CF` to 0. Return the `CF` value.
2210///
2211/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd)
2212#[inline]
2213#[target_feature(enable = "avx")]
2214#[cfg_attr(test, assert_instr(vtestpd))]
2215#[stable(feature = "simd_x86", since = "1.27.0")]
2216#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2217pub const fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
2218    unsafe {
2219        let r: i64x2 = simd_lt(transmute(_mm_andnot_pd(a, b)), i64x2::ZERO);
2220        (0i64 == simd_reduce_or(r)) as i32
2221    }
2222}
2223
2224/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2225/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2226/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
2227/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2228/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2229/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2230/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
2231/// are zero, otherwise return 0.
2232///
2233/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
2234#[inline]
2235#[target_feature(enable = "avx")]
2236#[cfg_attr(test, assert_instr(vtestpd))]
2237#[stable(feature = "simd_x86", since = "1.27.0")]
2238pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
2239    unsafe { vtestnzcpd(a, b) }
2240}
2241
2242/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2243/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2244/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2245/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2246/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2247/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2248/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
2249///
2250/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
2251#[inline]
2252#[target_feature(enable = "avx")]
2253#[cfg_attr(test, assert_instr(vtestps))]
2254#[stable(feature = "simd_x86", since = "1.27.0")]
2255pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
2256    unsafe { vtestzps256(a, b) }
2257}
2258
2259/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2260/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2261/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2262/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2263/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2264/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2265/// is zero, otherwise set `CF` to 0. Return the `CF` value.
2266///
2267/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
2268#[inline]
2269#[target_feature(enable = "avx")]
2270#[cfg_attr(test, assert_instr(vtestps))]
2271#[stable(feature = "simd_x86", since = "1.27.0")]
2272pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
2273    unsafe { vtestcps256(a, b) }
2274}
2275
2276/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2277/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2278/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2279/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2280/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2281/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2282/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
2283/// are zero, otherwise return 0.
2284///
2285/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
2286#[inline]
2287#[target_feature(enable = "avx")]
2288#[cfg_attr(test, assert_instr(vtestps))]
2289#[stable(feature = "simd_x86", since = "1.27.0")]
2290pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
2291    unsafe { vtestnzcps256(a, b) }
2292}
2293
2294/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2295/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2296/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2297/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2298/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2299/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2300/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
2301///
2302/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps)
2303#[inline]
2304#[target_feature(enable = "avx")]
2305#[cfg_attr(test, assert_instr(vtestps))]
2306#[stable(feature = "simd_x86", since = "1.27.0")]
2307#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2308pub const fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
2309    unsafe {
2310        let r: i32x4 = simd_lt(transmute(_mm_and_ps(a, b)), i32x4::ZERO);
2311        (0i32 == simd_reduce_or(r)) as i32
2312    }
2313}
2314
2315/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2316/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2317/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2318/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2319/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2320/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2321/// is zero, otherwise set `CF` to 0. Return the `CF` value.
2322///
2323/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
2324#[inline]
2325#[target_feature(enable = "avx")]
2326#[cfg_attr(test, assert_instr(vtestps))]
2327#[stable(feature = "simd_x86", since = "1.27.0")]
2328#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2329pub const fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
2330    unsafe {
2331        let r: i32x4 = simd_lt(transmute(_mm_andnot_ps(a, b)), i32x4::ZERO);
2332        (0i32 == simd_reduce_or(r)) as i32
2333    }
2334}
2335
2336/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2337/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2338/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2339/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2340/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2341/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2342/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
2343/// are zero, otherwise return 0.
2344///
2345/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
2346#[inline]
2347#[target_feature(enable = "avx")]
2348#[cfg_attr(test, assert_instr(vtestps))]
2349#[stable(feature = "simd_x86", since = "1.27.0")]
2350pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
2351    unsafe { vtestnzcps(a, b) }
2352}
2353
2354/// Sets each bit of the returned mask based on the most significant bit of the
2355/// corresponding packed double-precision (64-bit) floating-point element in
2356/// `a`.
2357///
2358/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
2359#[inline]
2360#[target_feature(enable = "avx")]
2361#[cfg_attr(test, assert_instr(vmovmskpd))]
2362#[stable(feature = "simd_x86", since = "1.27.0")]
2363#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2364pub const fn _mm256_movemask_pd(a: __m256d) -> i32 {
2365    // Propagate the highest bit to the rest, because simd_bitmask
2366    // requires all-1 or all-0.
2367    unsafe {
2368        let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
2369        simd_bitmask::<i64x4, u8>(mask) as i32
2370    }
2371}
2372
2373/// Sets each bit of the returned mask based on the most significant bit of the
2374/// corresponding packed single-precision (32-bit) floating-point element in
2375/// `a`.
2376///
2377/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
2378#[inline]
2379#[target_feature(enable = "avx")]
2380#[cfg_attr(test, assert_instr(vmovmskps))]
2381#[stable(feature = "simd_x86", since = "1.27.0")]
2382#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2383pub const fn _mm256_movemask_ps(a: __m256) -> i32 {
2384    // Propagate the highest bit to the rest, because simd_bitmask
2385    // requires all-1 or all-0.
2386    unsafe {
2387        let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
2388        simd_bitmask::<i32x8, u8>(mask) as i32
2389    }
2390}
2391
2392/// Returns vector of type __m256d with all elements set to zero.
2393///
2394/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
2395#[inline]
2396#[target_feature(enable = "avx")]
2397#[cfg_attr(test, assert_instr(vxorp))]
2398#[stable(feature = "simd_x86", since = "1.27.0")]
2399#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2400pub const fn _mm256_setzero_pd() -> __m256d {
2401    const { unsafe { mem::zeroed() } }
2402}
2403
2404/// Returns vector of type __m256 with all elements set to zero.
2405///
2406/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
2407#[inline]
2408#[target_feature(enable = "avx")]
2409#[cfg_attr(test, assert_instr(vxorps))]
2410#[stable(feature = "simd_x86", since = "1.27.0")]
2411#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2412pub const fn _mm256_setzero_ps() -> __m256 {
2413    const { unsafe { mem::zeroed() } }
2414}
2415
2416/// Returns vector of type __m256i with all elements set to zero.
2417///
2418/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
2419#[inline]
2420#[target_feature(enable = "avx")]
2421#[cfg_attr(test, assert_instr(vxor))]
2422#[stable(feature = "simd_x86", since = "1.27.0")]
2423#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2424pub const fn _mm256_setzero_si256() -> __m256i {
2425    const { unsafe { mem::zeroed() } }
2426}
2427
2428/// Sets packed double-precision (64-bit) floating-point elements in returned
2429/// vector with the supplied values.
2430///
2431/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd)
2432#[inline]
2433#[target_feature(enable = "avx")]
2434// This intrinsic has no corresponding instruction.
2435#[stable(feature = "simd_x86", since = "1.27.0")]
2436#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2437pub const fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2438    _mm256_setr_pd(d, c, b, a)
2439}
2440
2441/// Sets packed single-precision (32-bit) floating-point elements in returned
2442/// vector with the supplied values.
2443///
2444/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps)
2445#[inline]
2446#[target_feature(enable = "avx")]
2447// This intrinsic has no corresponding instruction.
2448#[stable(feature = "simd_x86", since = "1.27.0")]
2449#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2450pub const fn _mm256_set_ps(
2451    a: f32,
2452    b: f32,
2453    c: f32,
2454    d: f32,
2455    e: f32,
2456    f: f32,
2457    g: f32,
2458    h: f32,
2459) -> __m256 {
2460    _mm256_setr_ps(h, g, f, e, d, c, b, a)
2461}
2462
2463/// Sets packed 8-bit integers in returned vector with the supplied values.
2464///
2465/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
2466#[inline]
2467#[target_feature(enable = "avx")]
2468// This intrinsic has no corresponding instruction.
2469#[stable(feature = "simd_x86", since = "1.27.0")]
2470#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2471pub const fn _mm256_set_epi8(
2472    e00: i8,
2473    e01: i8,
2474    e02: i8,
2475    e03: i8,
2476    e04: i8,
2477    e05: i8,
2478    e06: i8,
2479    e07: i8,
2480    e08: i8,
2481    e09: i8,
2482    e10: i8,
2483    e11: i8,
2484    e12: i8,
2485    e13: i8,
2486    e14: i8,
2487    e15: i8,
2488    e16: i8,
2489    e17: i8,
2490    e18: i8,
2491    e19: i8,
2492    e20: i8,
2493    e21: i8,
2494    e22: i8,
2495    e23: i8,
2496    e24: i8,
2497    e25: i8,
2498    e26: i8,
2499    e27: i8,
2500    e28: i8,
2501    e29: i8,
2502    e30: i8,
2503    e31: i8,
2504) -> __m256i {
2505    #[rustfmt::skip]
2506    _mm256_setr_epi8(
2507        e31, e30, e29, e28, e27, e26, e25, e24,
2508        e23, e22, e21, e20, e19, e18, e17, e16,
2509        e15, e14, e13, e12, e11, e10, e09, e08,
2510        e07, e06, e05, e04, e03, e02, e01, e00,
2511    )
2512}
2513
2514/// Sets packed 16-bit integers in returned vector with the supplied values.
2515///
2516/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
2517#[inline]
2518#[target_feature(enable = "avx")]
2519// This intrinsic has no corresponding instruction.
2520#[stable(feature = "simd_x86", since = "1.27.0")]
2521#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2522pub const fn _mm256_set_epi16(
2523    e00: i16,
2524    e01: i16,
2525    e02: i16,
2526    e03: i16,
2527    e04: i16,
2528    e05: i16,
2529    e06: i16,
2530    e07: i16,
2531    e08: i16,
2532    e09: i16,
2533    e10: i16,
2534    e11: i16,
2535    e12: i16,
2536    e13: i16,
2537    e14: i16,
2538    e15: i16,
2539) -> __m256i {
2540    #[rustfmt::skip]
2541    _mm256_setr_epi16(
2542        e15, e14, e13, e12,
2543        e11, e10, e09, e08,
2544        e07, e06, e05, e04,
2545        e03, e02, e01, e00,
2546    )
2547}
2548
2549/// Sets packed 32-bit integers in returned vector with the supplied values.
2550///
2551/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
2552#[inline]
2553#[target_feature(enable = "avx")]
2554// This intrinsic has no corresponding instruction.
2555#[stable(feature = "simd_x86", since = "1.27.0")]
2556#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2557pub const fn _mm256_set_epi32(
2558    e0: i32,
2559    e1: i32,
2560    e2: i32,
2561    e3: i32,
2562    e4: i32,
2563    e5: i32,
2564    e6: i32,
2565    e7: i32,
2566) -> __m256i {
2567    _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
2568}
2569
2570/// Sets packed 64-bit integers in returned vector with the supplied values.
2571///
2572/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
2573#[inline]
2574#[target_feature(enable = "avx")]
2575// This intrinsic has no corresponding instruction.
2576#[stable(feature = "simd_x86", since = "1.27.0")]
2577#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2578pub const fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2579    _mm256_setr_epi64x(d, c, b, a)
2580}
2581
2582/// Sets packed double-precision (64-bit) floating-point elements in returned
2583/// vector with the supplied values in reverse order.
2584///
2585/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd)
2586#[inline]
2587#[target_feature(enable = "avx")]
2588// This intrinsic has no corresponding instruction.
2589#[stable(feature = "simd_x86", since = "1.27.0")]
2590#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2591pub const fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2592    __m256d([a, b, c, d])
2593}
2594
2595/// Sets packed single-precision (32-bit) floating-point elements in returned
2596/// vector with the supplied values in reverse order.
2597///
2598/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps)
2599#[inline]
2600#[target_feature(enable = "avx")]
2601// This intrinsic has no corresponding instruction.
2602#[stable(feature = "simd_x86", since = "1.27.0")]
2603#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2604pub const fn _mm256_setr_ps(
2605    a: f32,
2606    b: f32,
2607    c: f32,
2608    d: f32,
2609    e: f32,
2610    f: f32,
2611    g: f32,
2612    h: f32,
2613) -> __m256 {
2614    __m256([a, b, c, d, e, f, g, h])
2615}
2616
2617/// Sets packed 8-bit integers in returned vector with the supplied values in
2618/// reverse order.
2619///
2620/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8)
2621#[inline]
2622#[target_feature(enable = "avx")]
2623// This intrinsic has no corresponding instruction.
2624#[stable(feature = "simd_x86", since = "1.27.0")]
2625#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2626pub const fn _mm256_setr_epi8(
2627    e00: i8,
2628    e01: i8,
2629    e02: i8,
2630    e03: i8,
2631    e04: i8,
2632    e05: i8,
2633    e06: i8,
2634    e07: i8,
2635    e08: i8,
2636    e09: i8,
2637    e10: i8,
2638    e11: i8,
2639    e12: i8,
2640    e13: i8,
2641    e14: i8,
2642    e15: i8,
2643    e16: i8,
2644    e17: i8,
2645    e18: i8,
2646    e19: i8,
2647    e20: i8,
2648    e21: i8,
2649    e22: i8,
2650    e23: i8,
2651    e24: i8,
2652    e25: i8,
2653    e26: i8,
2654    e27: i8,
2655    e28: i8,
2656    e29: i8,
2657    e30: i8,
2658    e31: i8,
2659) -> __m256i {
2660    unsafe {
2661        #[rustfmt::skip]
2662        transmute(i8x32::new(
2663            e00, e01, e02, e03, e04, e05, e06, e07,
2664            e08, e09, e10, e11, e12, e13, e14, e15,
2665            e16, e17, e18, e19, e20, e21, e22, e23,
2666            e24, e25, e26, e27, e28, e29, e30, e31,
2667        ))
2668    }
2669}
2670
2671/// Sets packed 16-bit integers in returned vector with the supplied values in
2672/// reverse order.
2673///
2674/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16)
2675#[inline]
2676#[target_feature(enable = "avx")]
2677// This intrinsic has no corresponding instruction.
2678#[stable(feature = "simd_x86", since = "1.27.0")]
2679#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2680pub const fn _mm256_setr_epi16(
2681    e00: i16,
2682    e01: i16,
2683    e02: i16,
2684    e03: i16,
2685    e04: i16,
2686    e05: i16,
2687    e06: i16,
2688    e07: i16,
2689    e08: i16,
2690    e09: i16,
2691    e10: i16,
2692    e11: i16,
2693    e12: i16,
2694    e13: i16,
2695    e14: i16,
2696    e15: i16,
2697) -> __m256i {
2698    unsafe {
2699        #[rustfmt::skip]
2700        transmute(i16x16::new(
2701            e00, e01, e02, e03,
2702            e04, e05, e06, e07,
2703            e08, e09, e10, e11,
2704            e12, e13, e14, e15,
2705        ))
2706    }
2707}
2708
2709/// Sets packed 32-bit integers in returned vector with the supplied values in
2710/// reverse order.
2711///
2712/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32)
2713#[inline]
2714#[target_feature(enable = "avx")]
2715// This intrinsic has no corresponding instruction.
2716#[stable(feature = "simd_x86", since = "1.27.0")]
2717#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2718pub const fn _mm256_setr_epi32(
2719    e0: i32,
2720    e1: i32,
2721    e2: i32,
2722    e3: i32,
2723    e4: i32,
2724    e5: i32,
2725    e6: i32,
2726    e7: i32,
2727) -> __m256i {
2728    unsafe { transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
2729}
2730
2731/// Sets packed 64-bit integers in returned vector with the supplied values in
2732/// reverse order.
2733///
2734/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x)
2735#[inline]
2736#[target_feature(enable = "avx")]
2737// This intrinsic has no corresponding instruction.
2738#[stable(feature = "simd_x86", since = "1.27.0")]
2739#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2740pub const fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2741    unsafe { transmute(i64x4::new(a, b, c, d)) }
2742}
2743
2744/// Broadcasts double-precision (64-bit) floating-point value `a` to all
2745/// elements of returned vector.
2746///
2747/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd)
2748#[inline]
2749#[target_feature(enable = "avx")]
2750// This intrinsic has no corresponding instruction.
2751#[stable(feature = "simd_x86", since = "1.27.0")]
2752#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2753pub const fn _mm256_set1_pd(a: f64) -> __m256d {
2754    f64x4::splat(a).as_m256d()
2755}
2756
2757/// Broadcasts single-precision (32-bit) floating-point value `a` to all
2758/// elements of returned vector.
2759///
2760/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps)
2761#[inline]
2762#[target_feature(enable = "avx")]
2763// This intrinsic has no corresponding instruction.
2764#[stable(feature = "simd_x86", since = "1.27.0")]
2765#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2766pub const fn _mm256_set1_ps(a: f32) -> __m256 {
2767    f32x8::splat(a).as_m256()
2768}
2769
2770/// Broadcasts 8-bit integer `a` to all elements of returned vector.
2771/// This intrinsic may generate the `vpbroadcastb`.
2772///
2773/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
2774#[inline]
2775#[target_feature(enable = "avx")]
2776// This intrinsic has no corresponding instruction.
2777#[stable(feature = "simd_x86", since = "1.27.0")]
2778#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2779pub const fn _mm256_set1_epi8(a: i8) -> __m256i {
2780    i8x32::splat(a).as_m256i()
2781}
2782
2783/// Broadcasts 16-bit integer `a` to all elements of returned vector.
2784/// This intrinsic may generate the `vpbroadcastw`.
2785///
2786/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
2787#[inline]
2788#[target_feature(enable = "avx")]
2789//#[cfg_attr(test, assert_instr(vpshufb))]
2790#[cfg_attr(test, assert_instr(vinsertf128))]
2791// This intrinsic has no corresponding instruction.
2792#[stable(feature = "simd_x86", since = "1.27.0")]
2793#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2794pub const fn _mm256_set1_epi16(a: i16) -> __m256i {
2795    i16x16::splat(a).as_m256i()
2796}
2797
2798/// Broadcasts 32-bit integer `a` to all elements of returned vector.
2799/// This intrinsic may generate the `vpbroadcastd`.
2800///
2801/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
2802#[inline]
2803#[target_feature(enable = "avx")]
2804// This intrinsic has no corresponding instruction.
2805#[stable(feature = "simd_x86", since = "1.27.0")]
2806#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2807pub const fn _mm256_set1_epi32(a: i32) -> __m256i {
2808    i32x8::splat(a).as_m256i()
2809}
2810
2811/// Broadcasts 64-bit integer `a` to all elements of returned vector.
2812/// This intrinsic may generate the `vpbroadcastq`.
2813///
2814/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
2815#[inline]
2816#[target_feature(enable = "avx")]
2817#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
2818#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
2819// This intrinsic has no corresponding instruction.
2820#[stable(feature = "simd_x86", since = "1.27.0")]
2821#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2822pub const fn _mm256_set1_epi64x(a: i64) -> __m256i {
2823    i64x4::splat(a).as_m256i()
2824}
2825
2826/// Cast vector of type __m256d to type __m256.
2827///
2828/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps)
2829#[inline]
2830#[target_feature(enable = "avx")]
2831// This intrinsic is only used for compilation and does not generate any
2832// instructions, thus it has zero latency.
2833#[stable(feature = "simd_x86", since = "1.27.0")]
2834#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2835pub const fn _mm256_castpd_ps(a: __m256d) -> __m256 {
2836    unsafe { transmute(a) }
2837}
2838
2839/// Cast vector of type __m256 to type __m256d.
2840///
2841/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd)
2842#[inline]
2843#[target_feature(enable = "avx")]
2844// This intrinsic is only used for compilation and does not generate any
2845// instructions, thus it has zero latency.
2846#[stable(feature = "simd_x86", since = "1.27.0")]
2847#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2848pub const fn _mm256_castps_pd(a: __m256) -> __m256d {
2849    unsafe { transmute(a) }
2850}
2851
2852/// Casts vector of type __m256 to type __m256i.
2853///
2854/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
2855#[inline]
2856#[target_feature(enable = "avx")]
2857// This intrinsic is only used for compilation and does not generate any
2858// instructions, thus it has zero latency.
2859#[stable(feature = "simd_x86", since = "1.27.0")]
2860#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2861pub const fn _mm256_castps_si256(a: __m256) -> __m256i {
2862    unsafe { transmute(a) }
2863}
2864
2865/// Casts vector of type __m256i to type __m256.
2866///
2867/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
2868#[inline]
2869#[target_feature(enable = "avx")]
2870// This intrinsic is only used for compilation and does not generate any
2871// instructions, thus it has zero latency.
2872#[stable(feature = "simd_x86", since = "1.27.0")]
2873#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2874pub const fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
2875    unsafe { transmute(a) }
2876}
2877
2878/// Casts vector of type __m256d to type __m256i.
2879///
2880/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256)
2881#[inline]
2882#[target_feature(enable = "avx")]
2883// This intrinsic is only used for compilation and does not generate any
2884// instructions, thus it has zero latency.
2885#[stable(feature = "simd_x86", since = "1.27.0")]
2886#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2887pub const fn _mm256_castpd_si256(a: __m256d) -> __m256i {
2888    unsafe { transmute(a) }
2889}
2890
2891/// Casts vector of type __m256i to type __m256d.
2892///
2893/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd)
2894#[inline]
2895#[target_feature(enable = "avx")]
2896// This intrinsic is only used for compilation and does not generate any
2897// instructions, thus it has zero latency.
2898#[stable(feature = "simd_x86", since = "1.27.0")]
2899#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2900pub const fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
2901    unsafe { transmute(a) }
2902}
2903
2904/// Casts vector of type __m256 to type __m128.
2905///
2906/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128)
2907#[inline]
2908#[target_feature(enable = "avx")]
2909// This intrinsic is only used for compilation and does not generate any
2910// instructions, thus it has zero latency.
2911#[stable(feature = "simd_x86", since = "1.27.0")]
2912#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2913pub const fn _mm256_castps256_ps128(a: __m256) -> __m128 {
2914    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }
2915}
2916
2917/// Casts vector of type __m256d to type __m128d.
2918///
2919/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128)
2920#[inline]
2921#[target_feature(enable = "avx")]
2922// This intrinsic is only used for compilation and does not generate any
2923// instructions, thus it has zero latency.
2924#[stable(feature = "simd_x86", since = "1.27.0")]
2925#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2926pub const fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
2927    unsafe { simd_shuffle!(a, a, [0, 1]) }
2928}
2929
2930/// Casts vector of type __m256i to type __m128i.
2931///
2932/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
2933#[inline]
2934#[target_feature(enable = "avx")]
2935// This intrinsic is only used for compilation and does not generate any
2936// instructions, thus it has zero latency.
2937#[stable(feature = "simd_x86", since = "1.27.0")]
2938#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2939pub const fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
2940    unsafe {
2941        let a = a.as_i64x4();
2942        let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
2943        transmute(dst)
2944    }
2945}
2946
2947/// Casts vector of type __m128 to type __m256;
2948/// the upper 128 bits of the result are indeterminate.
2949///
2950/// In the Intel documentation, the upper bits are declared to be "undefined".
2951/// This is not equivalent to [`mem::MaybeUninit`]; instead, these bits are non-deterministically
2952/// set to some valid value. In practice, this is typically equivalent to [`mem::zeroed`].
2953///
2954/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
2955#[inline]
2956#[target_feature(enable = "avx")]
2957// This intrinsic is only used for compilation and does not generate any
2958// instructions, thus it has zero latency.
2959#[stable(feature = "simd_x86", since = "1.27.0")]
2960#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2961pub const fn _mm256_castps128_ps256(a: __m128) -> __m256 {
2962    unsafe { simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
2963}
2964
2965/// Casts vector of type __m128d to type __m256d;
2966/// the upper 128 bits of the result are indeterminate.
2967///
2968/// In the Intel documentation, the upper bits are declared to be "undefined".
2969/// This is not equivalent to [`mem::MaybeUninit`]; instead, these bits are non-deterministically
2970/// set to some valid value. In practice, this is typically equivalent to [`mem::zeroed`].
2971///
2972/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256)
2973#[inline]
2974#[target_feature(enable = "avx")]
2975// This intrinsic is only used for compilation and does not generate any
2976// instructions, thus it has zero latency.
2977#[stable(feature = "simd_x86", since = "1.27.0")]
2978#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2979pub const fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
2980    unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2]) }
2981}
2982
2983/// Casts vector of type __m128i to type __m256i;
2984/// the upper 128 bits of the result are indeterminate.
2985///
2986/// In the Intel documentation, the upper bits are declared to be "undefined".
2987/// This is not equivalent to [`mem::MaybeUninit`]; instead, these bits are non-deterministically
2988/// set to some valid value. In practice, this is typically equivalent to [`mem::zeroed`].
2989///
2990/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
2991#[inline]
2992#[target_feature(enable = "avx")]
2993// This intrinsic is only used for compilation and does not generate any
2994// instructions, thus it has zero latency.
2995#[stable(feature = "simd_x86", since = "1.27.0")]
2996#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2997pub const fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
2998    unsafe {
2999        let a = a.as_i64x2();
3000        let undefined = i64x2::ZERO;
3001        let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
3002        transmute(dst)
3003    }
3004}
3005
3006/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
3007/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
3008/// the value of the source vector. The upper 128 bits are set to zero.
3009///
3010/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
3011#[inline]
3012#[target_feature(enable = "avx")]
3013// This intrinsic is only used for compilation and does not generate any
3014// instructions, thus it has zero latency.
3015#[stable(feature = "simd_x86", since = "1.27.0")]
3016#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3017pub const fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
3018    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
3019}
3020
3021/// Constructs a 256-bit integer vector from a 128-bit integer vector.
3022/// The lower 128 bits contain the value of the source vector. The upper
3023/// 128 bits are set to zero.
3024///
3025/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
3026#[inline]
3027#[target_feature(enable = "avx")]
3028// This intrinsic is only used for compilation and does not generate any
3029// instructions, thus it has zero latency.
3030#[stable(feature = "simd_x86", since = "1.27.0")]
3031#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3032pub const fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
3033    unsafe {
3034        let b = i64x2::ZERO;
3035        let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
3036        transmute(dst)
3037    }
3038}
3039
3040/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
3041/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
3042/// contain the value of the source vector. The upper 128 bits are set
3043/// to zero.
3044///
3045/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
3046#[inline]
3047#[target_feature(enable = "avx")]
3048// This intrinsic is only used for compilation and does not generate any
3049// instructions, thus it has zero latency.
3050#[stable(feature = "simd_x86", since = "1.27.0")]
3051#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3052pub const fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
3053    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
3054}
3055
3056/// Returns vector of type `__m256` with indeterminate elements.
3057/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
3058/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
3059/// In practice, this is typically equivalent to [`mem::zeroed`].
3060///
3061/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
3062#[inline]
3063#[target_feature(enable = "avx")]
3064// This intrinsic has no corresponding instruction.
3065#[stable(feature = "simd_x86", since = "1.27.0")]
3066#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3067pub const fn _mm256_undefined_ps() -> __m256 {
3068    const { unsafe { mem::zeroed() } }
3069}
3070
3071/// Returns vector of type `__m256d` with indeterminate elements.
3072/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
3073/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
3074/// In practice, this is typically equivalent to [`mem::zeroed`].
3075///
3076/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
3077#[inline]
3078#[target_feature(enable = "avx")]
3079// This intrinsic has no corresponding instruction.
3080#[stable(feature = "simd_x86", since = "1.27.0")]
3081#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3082pub const fn _mm256_undefined_pd() -> __m256d {
3083    const { unsafe { mem::zeroed() } }
3084}
3085
3086/// Returns vector of type __m256i with with indeterminate elements.
3087/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
3088/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
3089/// In practice, this is typically equivalent to [`mem::zeroed`].
3090///
3091/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
3092#[inline]
3093#[target_feature(enable = "avx")]
3094// This intrinsic has no corresponding instruction.
3095#[stable(feature = "simd_x86", since = "1.27.0")]
3096#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3097pub const fn _mm256_undefined_si256() -> __m256i {
3098    const { unsafe { mem::zeroed() } }
3099}
3100
3101/// Sets packed __m256 returned vector with the supplied values.
3102///
3103/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
3104#[inline]
3105#[target_feature(enable = "avx")]
3106#[cfg_attr(test, assert_instr(vinsertf128))]
3107#[stable(feature = "simd_x86", since = "1.27.0")]
3108#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3109pub const fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
3110    unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) }
3111}
3112
3113/// Sets packed __m256d returned vector with the supplied values.
3114///
3115/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d)
3116#[inline]
3117#[target_feature(enable = "avx")]
3118#[cfg_attr(test, assert_instr(vinsertf128))]
3119#[stable(feature = "simd_x86", since = "1.27.0")]
3120#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3121pub const fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
3122    unsafe {
3123        let hi: __m128 = transmute(hi);
3124        let lo: __m128 = transmute(lo);
3125        transmute(_mm256_set_m128(hi, lo))
3126    }
3127}
3128
3129/// Sets packed __m256i returned vector with the supplied values.
3130///
3131/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
3132#[inline]
3133#[target_feature(enable = "avx")]
3134#[cfg_attr(test, assert_instr(vinsertf128))]
3135#[stable(feature = "simd_x86", since = "1.27.0")]
3136#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3137pub const fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
3138    unsafe {
3139        let hi: __m128 = transmute(hi);
3140        let lo: __m128 = transmute(lo);
3141        transmute(_mm256_set_m128(hi, lo))
3142    }
3143}
3144
/// Sets packed __m256 returned vector with the supplied values.
///
/// Takes the same two halves as [`_mm256_set_m128`] but with the parameters
/// in reverse order (`lo` first, then `hi`).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
    // Forward to the non-reversed variant, swapping the argument order.
    _mm256_set_m128(hi, lo)
}
3156
/// Sets packed __m256d returned vector with the supplied values.
///
/// Takes the same two halves as [`_mm256_set_m128d`] but with the parameters
/// in reverse order (`lo` first, then `hi`).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
    // Forward to the non-reversed variant, swapping the argument order.
    _mm256_set_m128d(hi, lo)
}
3168
/// Sets packed __m256i returned vector with the supplied values.
///
/// Takes the same two halves as [`_mm256_set_m128i`] but with the parameters
/// in reverse order (`lo` first, then `hi`).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
    // Forward to the non-reversed variant, swapping the argument order.
    _mm256_set_m128i(hi, lo)
}
3180
3181/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
3182/// floating-point elements) from memory, and combine them into a 256-bit
3183/// value.
3184/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3185///
3186/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128)
3187#[inline]
3188#[target_feature(enable = "avx")]
3189// This intrinsic has no corresponding instruction.
3190#[stable(feature = "simd_x86", since = "1.27.0")]
3191#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3192pub const unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
3193    let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
3194    _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr))
3195}
3196
3197/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
3198/// floating-point elements) from memory, and combine them into a 256-bit
3199/// value.
3200/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3201///
3202/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d)
3203#[inline]
3204#[target_feature(enable = "avx")]
3205// This intrinsic has no corresponding instruction.
3206#[stable(feature = "simd_x86", since = "1.27.0")]
3207#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3208pub const unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
3209    let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
3210    _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr))
3211}
3212
3213/// Loads two 128-bit values (composed of integer data) from memory, and combine
3214/// them into a 256-bit value.
3215/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3216///
3217/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i)
3218#[inline]
3219#[target_feature(enable = "avx")]
3220// This intrinsic has no corresponding instruction.
3221#[stable(feature = "simd_x86", since = "1.27.0")]
3222#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3223pub const unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
3224    let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
3225    _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr))
3226}
3227
3228/// Stores the high and low 128-bit halves (each composed of 4 packed
3229/// single-precision (32-bit) floating-point elements) from `a` into memory two
3230/// different 128-bit locations.
3231/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3232///
3233/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128)
3234#[inline]
3235#[target_feature(enable = "avx")]
3236// This intrinsic has no corresponding instruction.
3237#[stable(feature = "simd_x86", since = "1.27.0")]
3238#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3239pub const unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
3240    let lo = _mm256_castps256_ps128(a);
3241    _mm_storeu_ps(loaddr, lo);
3242    let hi = _mm256_extractf128_ps::<1>(a);
3243    _mm_storeu_ps(hiaddr, hi);
3244}
3245
3246/// Stores the high and low 128-bit halves (each composed of 2 packed
3247/// double-precision (64-bit) floating-point elements) from `a` into memory two
3248/// different 128-bit locations.
3249/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3250///
3251/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d)
3252#[inline]
3253#[target_feature(enable = "avx")]
3254// This intrinsic has no corresponding instruction.
3255#[stable(feature = "simd_x86", since = "1.27.0")]
3256#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3257pub const unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
3258    let lo = _mm256_castpd256_pd128(a);
3259    _mm_storeu_pd(loaddr, lo);
3260    let hi = _mm256_extractf128_pd::<1>(a);
3261    _mm_storeu_pd(hiaddr, hi);
3262}
3263
3264/// Stores the high and low 128-bit halves (each composed of integer data) from
3265/// `a` into memory two different 128-bit locations.
3266/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3267///
3268/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i)
3269#[inline]
3270#[target_feature(enable = "avx")]
3271// This intrinsic has no corresponding instruction.
3272#[stable(feature = "simd_x86", since = "1.27.0")]
3273#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3274pub const unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
3275    let lo = _mm256_castsi256_si128(a);
3276    _mm_storeu_si128(loaddr, lo);
3277    let hi = _mm256_extractf128_si256::<1>(a);
3278    _mm_storeu_si128(hiaddr, hi);
3279}
3280
3281/// Returns the first element of the input vector of `[8 x float]`.
3282///
3283/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32)
3284#[inline]
3285#[target_feature(enable = "avx")]
3286//#[cfg_attr(test, assert_instr(movss))] FIXME
3287#[stable(feature = "simd_x86", since = "1.27.0")]
3288#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3289pub const fn _mm256_cvtss_f32(a: __m256) -> f32 {
3290    unsafe { simd_extract!(a, 0) }
3291}
3292
// LLVM intrinsics used in the above functions
//
// Each declaration binds a short Rust-side name to the LLVM intrinsic named in
// its `#[link_name]` attribute. The Rust names do not always match the LLVM
// names (for example, `vcmppd` binds to `llvm.x86.sse2.cmp.pd`).
// NOTE(review): `improper_ctypes` is presumably allowed because the SIMD
// vector types used in these signatures trip the FFI-safety lint — confirm.
#[allow(improper_ctypes)]
unsafe extern "C" {
    // Rounding.
    #[link_name = "llvm.x86.avx.round.pd.256"]
    fn roundpd256(a: __m256d, b: i32) -> __m256d;
    #[link_name = "llvm.x86.avx.round.ps.256"]
    fn roundps256(a: __m256, b: i32) -> __m256;
    // Dot product.
    #[link_name = "llvm.x86.avx.dp.ps.256"]
    fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
    // Comparisons: 128-bit SSE/SSE2 forms and 256-bit AVX forms.
    #[link_name = "llvm.x86.sse2.cmp.pd"]
    fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.avx.cmp.pd.256"]
    fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
    #[link_name = "llvm.x86.sse.cmp.ps"]
    fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    #[link_name = "llvm.x86.avx.cmp.ps.256"]
    fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
    #[link_name = "llvm.x86.sse2.cmp.sd"]
    fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.sse.cmp.ss"]
    fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
    // Floating-point to 32-bit integer conversions (`cvtt` are the truncating forms).
    #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
    fn vcvtps2dq(a: __m256) -> i32x8;
    #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
    fn vcvttpd2dq(a: __m256d) -> i32x4;
    #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
    fn vcvtpd2dq(a: __m256d) -> i32x4;
    #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
    fn vcvttps2dq(a: __m256) -> i32x8;
    // Register zeroing.
    #[link_name = "llvm.x86.avx.vzeroall"]
    fn vzeroall();
    #[link_name = "llvm.x86.avx.vzeroupper"]
    fn vzeroupper();
    // Variable permutes; `b` carries the per-lane control values.
    #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
    fn vpermilps256(a: __m256, b: i32x8) -> __m256;
    #[link_name = "llvm.x86.avx.vpermilvar.ps"]
    fn vpermilps(a: __m128, b: i32x4) -> __m128;
    #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
    fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
    #[link_name = "llvm.x86.avx.vpermilvar.pd"]
    fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
    // 256-bit load through a byte pointer.
    #[link_name = "llvm.x86.avx.ldu.dq.256"]
    fn vlddqu(mem_addr: *const i8) -> i8x32;
    // Reciprocal and reciprocal square root.
    #[link_name = "llvm.x86.avx.rcp.ps.256"]
    fn vrcpps(a: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
    fn vrsqrtps(a: __m256) -> __m256;
    // Test operations; each returns its result as an `i32`.
    #[link_name = "llvm.x86.avx.ptestnzc.256"]
    fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.pd.256"]
    fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.pd.256"]
    fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
    fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.pd"]
    fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.ps.256"]
    fn vtestzps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.ps.256"]
    fn vtestcps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
    fn vtestnzcps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.ps"]
    fn vtestnzcps(a: __m128, b: __m128) -> i32;
    // Minimum / maximum.
    #[link_name = "llvm.x86.avx.min.ps.256"]
    fn vminps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.max.ps.256"]
    fn vmaxps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.min.pd.256"]
    fn vminpd(a: __m256d, b: __m256d) -> __m256d;
    #[link_name = "llvm.x86.avx.max.pd.256"]
    fn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
}
3367
3368#[cfg(test)]
3369mod tests {
3370    use crate::core_arch::assert_eq_const as assert_eq;
3371    use crate::core_arch::simd::*;
3372    use crate::hint::black_box;
3373    use crate::ptr;
3374    use stdarch_test::simd_test;
3375
3376    use crate::core_arch::x86::*;
3377
3378    #[simd_test(enable = "avx")]
3379    const fn test_mm256_add_pd() {
3380        let a = _mm256_setr_pd(1., 2., 3., 4.);
3381        let b = _mm256_setr_pd(5., 6., 7., 8.);
3382        let r = _mm256_add_pd(a, b);
3383        let e = _mm256_setr_pd(6., 8., 10., 12.);
3384        assert_eq_m256d(r, e);
3385    }
3386
3387    #[simd_test(enable = "avx")]
3388    const fn test_mm256_add_ps() {
3389        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3390        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3391        let r = _mm256_add_ps(a, b);
3392        let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
3393        assert_eq_m256(r, e);
3394    }
3395
3396    #[simd_test(enable = "avx")]
3397    const fn test_mm256_and_pd() {
3398        let a = _mm256_set1_pd(1.);
3399        let b = _mm256_set1_pd(0.6);
3400        let r = _mm256_and_pd(a, b);
3401        let e = _mm256_set1_pd(0.5);
3402        assert_eq_m256d(r, e);
3403    }
3404
3405    #[simd_test(enable = "avx")]
3406    const fn test_mm256_and_ps() {
3407        let a = _mm256_set1_ps(1.);
3408        let b = _mm256_set1_ps(0.6);
3409        let r = _mm256_and_ps(a, b);
3410        let e = _mm256_set1_ps(0.5);
3411        assert_eq_m256(r, e);
3412    }
3413
3414    #[simd_test(enable = "avx")]
3415    const fn test_mm256_or_pd() {
3416        let a = _mm256_set1_pd(1.);
3417        let b = _mm256_set1_pd(0.6);
3418        let r = _mm256_or_pd(a, b);
3419        let e = _mm256_set1_pd(1.2);
3420        assert_eq_m256d(r, e);
3421    }
3422
3423    #[simd_test(enable = "avx")]
3424    const fn test_mm256_or_ps() {
3425        let a = _mm256_set1_ps(1.);
3426        let b = _mm256_set1_ps(0.6);
3427        let r = _mm256_or_ps(a, b);
3428        let e = _mm256_set1_ps(1.2);
3429        assert_eq_m256(r, e);
3430    }
3431
3432    #[simd_test(enable = "avx")]
3433    const fn test_mm256_shuffle_pd() {
3434        let a = _mm256_setr_pd(1., 4., 5., 8.);
3435        let b = _mm256_setr_pd(2., 3., 6., 7.);
3436        let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
3437        let e = _mm256_setr_pd(4., 3., 8., 7.);
3438        assert_eq_m256d(r, e);
3439    }
3440
3441    #[simd_test(enable = "avx")]
3442    const fn test_mm256_shuffle_ps() {
3443        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3444        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3445        let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
3446        let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
3447        assert_eq_m256(r, e);
3448    }
3449
3450    #[simd_test(enable = "avx")]
3451    const fn test_mm256_andnot_pd() {
3452        let a = _mm256_set1_pd(0.);
3453        let b = _mm256_set1_pd(0.6);
3454        let r = _mm256_andnot_pd(a, b);
3455        assert_eq_m256d(r, b);
3456    }
3457
3458    #[simd_test(enable = "avx")]
3459    const fn test_mm256_andnot_ps() {
3460        let a = _mm256_set1_ps(0.);
3461        let b = _mm256_set1_ps(0.6);
3462        let r = _mm256_andnot_ps(a, b);
3463        assert_eq_m256(r, b);
3464    }
3465
3466    #[simd_test(enable = "avx")]
3467    fn test_mm256_max_pd() {
3468        let a = _mm256_setr_pd(1., 4., 5., 8.);
3469        let b = _mm256_setr_pd(2., 3., 6., 7.);
3470        let r = _mm256_max_pd(a, b);
3471        let e = _mm256_setr_pd(2., 4., 6., 8.);
3472        assert_eq_m256d(r, e);
3473        // > If the values being compared are both 0.0s (of either sign), the
3474        // > value in the second operand (source operand) is returned.
3475        let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3476        let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3477        let wu = _mm256_castpd_si256(w).as_u64x4();
3478        let xu = _mm256_castpd_si256(x).as_u64x4();
3479        assert_eq!(wu, u64x4::splat(0x8000_0000_0000_0000u64));
3480        assert_eq!(xu, u64x4::splat(0u64));
3481        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3482        // > second operand (source operand), either a NaN or a valid
3483        // > floating-point value, is written to the result.
3484        let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3485        let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3486        assert_eq_m256d(y, _mm256_set1_pd(0.0));
3487        let zf = *z.as_f64x4().as_array();
3488        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3489    }
3490
3491    #[simd_test(enable = "avx")]
3492    fn test_mm256_max_ps() {
3493        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3494        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3495        let r = _mm256_max_ps(a, b);
3496        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
3497        assert_eq_m256(r, e);
3498        // > If the values being compared are both 0.0s (of either sign), the
3499        // > value in the second operand (source operand) is returned.
3500        let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3501        let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3502        let wu = _mm256_castps_si256(w).as_u32x8();
3503        let xu = _mm256_castps_si256(x).as_u32x8();
3504        assert_eq!(wu, u32x8::splat(0x8000_0000u32));
3505        assert_eq!(xu, u32x8::splat(0u32));
3506        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3507        // > second operand (source operand), either a NaN or a valid
3508        // > floating-point value, is written to the result.
3509        let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3510        let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3511        assert_eq_m256(y, _mm256_set1_ps(0.0));
3512        let zf = *z.as_f32x8().as_array();
3513        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3514    }
3515
3516    #[simd_test(enable = "avx")]
3517    fn test_mm256_min_pd() {
3518        let a = _mm256_setr_pd(1., 4., 5., 8.);
3519        let b = _mm256_setr_pd(2., 3., 6., 7.);
3520        let r = _mm256_min_pd(a, b);
3521        let e = _mm256_setr_pd(1., 3., 5., 7.);
3522        assert_eq_m256d(r, e);
3523        // > If the values being compared are both 0.0s (of either sign), the
3524        // > value in the second operand (source operand) is returned.
3525        let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3526        let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3527        let wu = _mm256_castpd_si256(w).as_u64x4();
3528        let xu = _mm256_castpd_si256(x).as_u64x4();
3529        assert_eq!(wu, u64x4::splat(0x8000_0000_0000_0000u64));
3530        assert_eq!(xu, u64x4::splat(0u64));
3531        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3532        // > second operand (source operand), either a NaN or a valid
3533        // > floating-point value, is written to the result.
3534        let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3535        let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3536        assert_eq_m256d(y, _mm256_set1_pd(0.0));
3537        let zf = *z.as_f64x4().as_array();
3538        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3539    }
3540
3541    #[simd_test(enable = "avx")]
3542    fn test_mm256_min_ps() {
3543        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3544        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3545        let r = _mm256_min_ps(a, b);
3546        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
3547        assert_eq_m256(r, e);
3548        // > If the values being compared are both 0.0s (of either sign), the
3549        // > value in the second operand (source operand) is returned.
3550        let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3551        let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3552        let wu = _mm256_castps_si256(w).as_u32x8();
3553        let xu = _mm256_castps_si256(x).as_u32x8();
3554        assert_eq!(wu, u32x8::splat(0x8000_0000u32));
3555        assert_eq!(xu, u32x8::splat(0u32));
3556        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3557        // > second operand (source operand), either a NaN or a valid
3558        // > floating-point value, is written to the result.
3559        let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3560        let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3561        assert_eq_m256(y, _mm256_set1_ps(0.0));
3562        let zf = *z.as_f32x8().as_array();
3563        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3564    }
3565
3566    #[simd_test(enable = "avx")]
3567    const fn test_mm256_mul_pd() {
3568        let a = _mm256_setr_pd(1., 2., 3., 4.);
3569        let b = _mm256_setr_pd(5., 6., 7., 8.);
3570        let r = _mm256_mul_pd(a, b);
3571        let e = _mm256_setr_pd(5., 12., 21., 32.);
3572        assert_eq_m256d(r, e);
3573    }
3574
3575    #[simd_test(enable = "avx")]
3576    const fn test_mm256_mul_ps() {
3577        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3578        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3579        let r = _mm256_mul_ps(a, b);
3580        let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
3581        assert_eq_m256(r, e);
3582    }
3583
3584    #[simd_test(enable = "avx")]
3585    const fn test_mm256_addsub_pd() {
3586        let a = _mm256_setr_pd(1., 2., 3., 4.);
3587        let b = _mm256_setr_pd(5., 6., 7., 8.);
3588        let r = _mm256_addsub_pd(a, b);
3589        let e = _mm256_setr_pd(-4., 8., -4., 12.);
3590        assert_eq_m256d(r, e);
3591    }
3592
3593    #[simd_test(enable = "avx")]
3594    const fn test_mm256_addsub_ps() {
3595        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3596        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3597        let r = _mm256_addsub_ps(a, b);
3598        let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
3599        assert_eq_m256(r, e);
3600    }
3601
3602    #[simd_test(enable = "avx")]
3603    const fn test_mm256_sub_pd() {
3604        let a = _mm256_setr_pd(1., 2., 3., 4.);
3605        let b = _mm256_setr_pd(5., 6., 7., 8.);
3606        let r = _mm256_sub_pd(a, b);
3607        let e = _mm256_setr_pd(-4., -4., -4., -4.);
3608        assert_eq_m256d(r, e);
3609    }
3610
3611    #[simd_test(enable = "avx")]
3612    const fn test_mm256_sub_ps() {
3613        let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
3614        let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
3615        let r = _mm256_sub_ps(a, b);
3616        let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
3617        assert_eq_m256(r, e);
3618    }
3619
3620    #[simd_test(enable = "avx")]
3621    fn test_mm256_round_pd() {
3622        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3623        let result_closest = _mm256_round_pd::<0b0000>(a);
3624        let result_down = _mm256_round_pd::<0b0001>(a);
3625        let result_up = _mm256_round_pd::<0b0010>(a);
3626        let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
3627        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3628        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3629        assert_eq_m256d(result_closest, expected_closest);
3630        assert_eq_m256d(result_down, expected_down);
3631        assert_eq_m256d(result_up, expected_up);
3632    }
3633
3634    #[simd_test(enable = "avx")]
3635    const fn test_mm256_floor_pd() {
3636        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3637        let result_down = _mm256_floor_pd(a);
3638        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3639        assert_eq_m256d(result_down, expected_down);
3640    }
3641
3642    #[simd_test(enable = "avx")]
3643    const fn test_mm256_ceil_pd() {
3644        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3645        let result_up = _mm256_ceil_pd(a);
3646        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3647        assert_eq_m256d(result_up, expected_up);
3648    }
3649
3650    #[simd_test(enable = "avx")]
3651    fn test_mm256_round_ps() {
3652        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3653        let result_closest = _mm256_round_ps::<0b0000>(a);
3654        let result_down = _mm256_round_ps::<0b0001>(a);
3655        let result_up = _mm256_round_ps::<0b0010>(a);
3656        let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
3657        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3658        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3659        assert_eq_m256(result_closest, expected_closest);
3660        assert_eq_m256(result_down, expected_down);
3661        assert_eq_m256(result_up, expected_up);
3662    }
3663
3664    #[simd_test(enable = "avx")]
3665    const fn test_mm256_floor_ps() {
3666        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3667        let result_down = _mm256_floor_ps(a);
3668        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3669        assert_eq_m256(result_down, expected_down);
3670    }
3671
3672    #[simd_test(enable = "avx")]
3673    const fn test_mm256_ceil_ps() {
3674        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3675        let result_up = _mm256_ceil_ps(a);
3676        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3677        assert_eq_m256(result_up, expected_up);
3678    }
3679
3680    #[simd_test(enable = "avx")]
3681    fn test_mm256_sqrt_pd() {
3682        let a = _mm256_setr_pd(4., 9., 16., 25.);
3683        let r = _mm256_sqrt_pd(a);
3684        let e = _mm256_setr_pd(2., 3., 4., 5.);
3685        assert_eq_m256d(r, e);
3686    }
3687
3688    #[simd_test(enable = "avx")]
3689    fn test_mm256_sqrt_ps() {
3690        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3691        let r = _mm256_sqrt_ps(a);
3692        let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
3693        assert_eq_m256(r, e);
3694    }
3695
3696    #[simd_test(enable = "avx")]
3697    const fn test_mm256_div_ps() {
3698        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3699        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3700        let r = _mm256_div_ps(a, b);
3701        let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
3702        assert_eq_m256(r, e);
3703    }
3704
3705    #[simd_test(enable = "avx")]
3706    const fn test_mm256_div_pd() {
3707        let a = _mm256_setr_pd(4., 9., 16., 25.);
3708        let b = _mm256_setr_pd(4., 3., 2., 5.);
3709        let r = _mm256_div_pd(a, b);
3710        let e = _mm256_setr_pd(1., 3., 8., 5.);
3711        assert_eq_m256d(r, e);
3712    }
3713
3714    #[simd_test(enable = "avx")]
3715    const fn test_mm256_blend_pd() {
3716        let a = _mm256_setr_pd(4., 9., 16., 25.);
3717        let b = _mm256_setr_pd(4., 3., 2., 5.);
3718        let r = _mm256_blend_pd::<0x0>(a, b);
3719        assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
3720        let r = _mm256_blend_pd::<0x3>(a, b);
3721        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
3722        let r = _mm256_blend_pd::<0xF>(a, b);
3723        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
3724    }
3725
3726    #[simd_test(enable = "avx")]
3727    const fn test_mm256_blend_ps() {
3728        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3729        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3730        let r = _mm256_blend_ps::<0x0>(a, b);
3731        assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
3732        let r = _mm256_blend_ps::<0x3>(a, b);
3733        assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
3734        let r = _mm256_blend_ps::<0xF>(a, b);
3735        assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
3736    }
3737
3738    #[simd_test(enable = "avx")]
3739    const fn test_mm256_blendv_pd() {
3740        let a = _mm256_setr_pd(4., 9., 16., 25.);
3741        let b = _mm256_setr_pd(4., 3., 2., 5.);
3742        let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
3743        let r = _mm256_blendv_pd(a, b, c);
3744        let e = _mm256_setr_pd(4., 9., 2., 5.);
3745        assert_eq_m256d(r, e);
3746    }
3747
3748    #[simd_test(enable = "avx")]
3749    const fn test_mm256_blendv_ps() {
3750        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3751        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3752        #[rustfmt::skip]
3753        let c = _mm256_setr_ps(
3754            0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
3755        );
3756        let r = _mm256_blendv_ps(a, b, c);
3757        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3758        assert_eq_m256(r, e);
3759    }
3760
3761    #[simd_test(enable = "avx")]
3762    fn test_mm256_dp_ps() {
3763        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3764        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3765        let r = _mm256_dp_ps::<0xFF>(a, b);
3766        let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
3767        assert_eq_m256(r, e);
3768    }
3769
3770    #[simd_test(enable = "avx")]
3771    const fn test_mm256_hadd_pd() {
3772        let a = _mm256_setr_pd(4., 9., 16., 25.);
3773        let b = _mm256_setr_pd(4., 3., 2., 5.);
3774        let r = _mm256_hadd_pd(a, b);
3775        let e = _mm256_setr_pd(13., 7., 41., 7.);
3776        assert_eq_m256d(r, e);
3777
3778        let a = _mm256_setr_pd(1., 2., 3., 4.);
3779        let b = _mm256_setr_pd(5., 6., 7., 8.);
3780        let r = _mm256_hadd_pd(a, b);
3781        let e = _mm256_setr_pd(3., 11., 7., 15.);
3782        assert_eq_m256d(r, e);
3783    }
3784
3785    #[simd_test(enable = "avx")]
3786    const fn test_mm256_hadd_ps() {
3787        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3788        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3789        let r = _mm256_hadd_ps(a, b);
3790        let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
3791        assert_eq_m256(r, e);
3792
3793        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3794        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3795        let r = _mm256_hadd_ps(a, b);
3796        let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
3797        assert_eq_m256(r, e);
3798    }
3799
3800    #[simd_test(enable = "avx")]
3801    const fn test_mm256_hsub_pd() {
3802        let a = _mm256_setr_pd(4., 9., 16., 25.);
3803        let b = _mm256_setr_pd(4., 3., 2., 5.);
3804        let r = _mm256_hsub_pd(a, b);
3805        let e = _mm256_setr_pd(-5., 1., -9., -3.);
3806        assert_eq_m256d(r, e);
3807
3808        let a = _mm256_setr_pd(1., 2., 3., 4.);
3809        let b = _mm256_setr_pd(5., 6., 7., 8.);
3810        let r = _mm256_hsub_pd(a, b);
3811        let e = _mm256_setr_pd(-1., -1., -1., -1.);
3812        assert_eq_m256d(r, e);
3813    }
3814
3815    #[simd_test(enable = "avx")]
3816    const fn test_mm256_hsub_ps() {
3817        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3818        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3819        let r = _mm256_hsub_ps(a, b);
3820        let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
3821        assert_eq_m256(r, e);
3822
3823        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3824        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3825        let r = _mm256_hsub_ps(a, b);
3826        let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
3827        assert_eq_m256(r, e);
3828    }
3829
3830    #[simd_test(enable = "avx")]
3831    const fn test_mm256_xor_pd() {
3832        let a = _mm256_setr_pd(4., 9., 16., 25.);
3833        let b = _mm256_set1_pd(0.);
3834        let r = _mm256_xor_pd(a, b);
3835        assert_eq_m256d(r, a);
3836    }
3837
3838    #[simd_test(enable = "avx")]
3839    const fn test_mm256_xor_ps() {
3840        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3841        let b = _mm256_set1_ps(0.);
3842        let r = _mm256_xor_ps(a, b);
3843        assert_eq_m256(r, a);
3844    }
3845
3846    #[simd_test(enable = "avx")]
3847    fn test_mm_cmp_pd() {
3848        let a = _mm_setr_pd(4., 9.);
3849        let b = _mm_setr_pd(4., 3.);
3850        let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
3851        assert!(get_m128d(r, 0).is_nan());
3852        assert!(get_m128d(r, 1).is_nan());
3853    }
3854
3855    #[simd_test(enable = "avx")]
3856    fn test_mm256_cmp_pd() {
3857        let a = _mm256_setr_pd(1., 2., 3., 4.);
3858        let b = _mm256_setr_pd(5., 6., 7., 8.);
3859        let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
3860        let e = _mm256_set1_pd(0.);
3861        assert_eq_m256d(r, e);
3862    }
3863
3864    #[simd_test(enable = "avx")]
3865    fn test_mm_cmp_ps() {
3866        let a = _mm_setr_ps(4., 3., 2., 5.);
3867        let b = _mm_setr_ps(4., 9., 16., 25.);
3868        let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
3869        assert!(get_m128(r, 0).is_nan());
3870        assert_eq!(get_m128(r, 1), 0.);
3871        assert_eq!(get_m128(r, 2), 0.);
3872        assert_eq!(get_m128(r, 3), 0.);
3873    }
3874
3875    #[simd_test(enable = "avx")]
3876    fn test_mm256_cmp_ps() {
3877        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3878        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3879        let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
3880        let e = _mm256_set1_ps(0.);
3881        assert_eq_m256(r, e);
3882    }
3883
3884    #[simd_test(enable = "avx")]
3885    fn test_mm_cmp_sd() {
3886        let a = _mm_setr_pd(4., 9.);
3887        let b = _mm_setr_pd(4., 3.);
3888        let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
3889        assert!(get_m128d(r, 0).is_nan());
3890        assert_eq!(get_m128d(r, 1), 9.);
3891    }
3892
3893    #[simd_test(enable = "avx")]
3894    fn test_mm_cmp_ss() {
3895        let a = _mm_setr_ps(4., 3., 2., 5.);
3896        let b = _mm_setr_ps(4., 9., 16., 25.);
3897        let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
3898        assert!(get_m128(r, 0).is_nan());
3899        assert_eq!(get_m128(r, 1), 3.);
3900        assert_eq!(get_m128(r, 2), 2.);
3901        assert_eq!(get_m128(r, 3), 5.);
3902    }
3903
3904    #[simd_test(enable = "avx")]
3905    const fn test_mm256_cvtepi32_pd() {
3906        let a = _mm_setr_epi32(4, 9, 16, 25);
3907        let r = _mm256_cvtepi32_pd(a);
3908        let e = _mm256_setr_pd(4., 9., 16., 25.);
3909        assert_eq_m256d(r, e);
3910    }
3911
3912    #[simd_test(enable = "avx")]
3913    const fn test_mm256_cvtepi32_ps() {
3914        let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3915        let r = _mm256_cvtepi32_ps(a);
3916        let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3917        assert_eq_m256(r, e);
3918    }
3919
3920    #[simd_test(enable = "avx")]
3921    const fn test_mm256_cvtpd_ps() {
3922        let a = _mm256_setr_pd(4., 9., 16., 25.);
3923        let r = _mm256_cvtpd_ps(a);
3924        let e = _mm_setr_ps(4., 9., 16., 25.);
3925        assert_eq_m128(r, e);
3926    }
3927
3928    #[simd_test(enable = "avx")]
3929    fn test_mm256_cvtps_epi32() {
3930        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3931        let r = _mm256_cvtps_epi32(a);
3932        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3933        assert_eq_m256i(r, e);
3934    }
3935
3936    #[simd_test(enable = "avx")]
3937    const fn test_mm256_cvtps_pd() {
3938        let a = _mm_setr_ps(4., 9., 16., 25.);
3939        let r = _mm256_cvtps_pd(a);
3940        let e = _mm256_setr_pd(4., 9., 16., 25.);
3941        assert_eq_m256d(r, e);
3942    }
3943
3944    #[simd_test(enable = "avx")]
3945    const fn test_mm256_cvtsd_f64() {
3946        let a = _mm256_setr_pd(1., 2., 3., 4.);
3947        let r = _mm256_cvtsd_f64(a);
3948        assert_eq!(r, 1.);
3949    }
3950
3951    #[simd_test(enable = "avx")]
3952    fn test_mm256_cvttpd_epi32() {
3953        let a = _mm256_setr_pd(4., 9., 16., 25.);
3954        let r = _mm256_cvttpd_epi32(a);
3955        let e = _mm_setr_epi32(4, 9, 16, 25);
3956        assert_eq_m128i(r, e);
3957    }
3958
3959    #[simd_test(enable = "avx")]
3960    fn test_mm256_cvtpd_epi32() {
3961        let a = _mm256_setr_pd(4., 9., 16., 25.);
3962        let r = _mm256_cvtpd_epi32(a);
3963        let e = _mm_setr_epi32(4, 9, 16, 25);
3964        assert_eq_m128i(r, e);
3965    }
3966
3967    #[simd_test(enable = "avx")]
3968    fn test_mm256_cvttps_epi32() {
3969        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3970        let r = _mm256_cvttps_epi32(a);
3971        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3972        assert_eq_m256i(r, e);
3973    }
3974
3975    #[simd_test(enable = "avx")]
3976    const fn test_mm256_extractf128_ps() {
3977        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3978        let r = _mm256_extractf128_ps::<0>(a);
3979        let e = _mm_setr_ps(4., 3., 2., 5.);
3980        assert_eq_m128(r, e);
3981    }
3982
3983    #[simd_test(enable = "avx")]
3984    const fn test_mm256_extractf128_pd() {
3985        let a = _mm256_setr_pd(4., 3., 2., 5.);
3986        let r = _mm256_extractf128_pd::<0>(a);
3987        let e = _mm_setr_pd(4., 3.);
3988        assert_eq_m128d(r, e);
3989    }
3990
3991    #[simd_test(enable = "avx")]
3992    const fn test_mm256_extractf128_si256() {
3993        let a = _mm256_setr_epi64x(4, 3, 2, 5);
3994        let r = _mm256_extractf128_si256::<0>(a);
3995        let e = _mm_setr_epi64x(4, 3);
3996        assert_eq_m128i(r, e);
3997    }
3998
3999    #[simd_test(enable = "avx")]
4000    const fn test_mm256_extract_epi32() {
4001        let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
4002        let r1 = _mm256_extract_epi32::<0>(a);
4003        let r2 = _mm256_extract_epi32::<3>(a);
4004        assert_eq!(r1, -1);
4005        assert_eq!(r2, 3);
4006    }
4007
4008    #[simd_test(enable = "avx")]
4009    const fn test_mm256_cvtsi256_si32() {
4010        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4011        let r = _mm256_cvtsi256_si32(a);
4012        assert_eq!(r, 1);
4013    }
4014
4015    #[simd_test(enable = "avx")]
4016    fn test_mm256_zeroall() {
4017        _mm256_zeroall();
4018    }
4019
4020    #[simd_test(enable = "avx")]
4021    fn test_mm256_zeroupper() {
4022        _mm256_zeroupper();
4023    }
4024
4025    #[simd_test(enable = "avx")]
4026    fn test_mm256_permutevar_ps() {
4027        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4028        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4029        let r = _mm256_permutevar_ps(a, b);
4030        let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
4031        assert_eq_m256(r, e);
4032    }
4033
4034    #[simd_test(enable = "avx")]
4035    fn test_mm_permutevar_ps() {
4036        let a = _mm_setr_ps(4., 3., 2., 5.);
4037        let b = _mm_setr_epi32(1, 2, 3, 4);
4038        let r = _mm_permutevar_ps(a, b);
4039        let e = _mm_setr_ps(3., 2., 5., 4.);
4040        assert_eq_m128(r, e);
4041    }
4042
4043    #[simd_test(enable = "avx")]
4044    const fn test_mm256_permute_ps() {
4045        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4046        let r = _mm256_permute_ps::<0x1b>(a);
4047        let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
4048        assert_eq_m256(r, e);
4049    }
4050
4051    #[simd_test(enable = "avx")]
4052    const fn test_mm_permute_ps() {
4053        let a = _mm_setr_ps(4., 3., 2., 5.);
4054        let r = _mm_permute_ps::<0x1b>(a);
4055        let e = _mm_setr_ps(5., 2., 3., 4.);
4056        assert_eq_m128(r, e);
4057    }
4058
4059    #[simd_test(enable = "avx")]
4060    fn test_mm256_permutevar_pd() {
4061        let a = _mm256_setr_pd(4., 3., 2., 5.);
4062        let b = _mm256_setr_epi64x(1, 2, 3, 4);
4063        let r = _mm256_permutevar_pd(a, b);
4064        let e = _mm256_setr_pd(4., 3., 5., 2.);
4065        assert_eq_m256d(r, e);
4066    }
4067
4068    #[simd_test(enable = "avx")]
4069    fn test_mm_permutevar_pd() {
4070        let a = _mm_setr_pd(4., 3.);
4071        let b = _mm_setr_epi64x(3, 0);
4072        let r = _mm_permutevar_pd(a, b);
4073        let e = _mm_setr_pd(3., 4.);
4074        assert_eq_m128d(r, e);
4075    }
4076
4077    #[simd_test(enable = "avx")]
4078    const fn test_mm256_permute_pd() {
4079        let a = _mm256_setr_pd(4., 3., 2., 5.);
4080        let r = _mm256_permute_pd::<5>(a);
4081        let e = _mm256_setr_pd(3., 4., 5., 2.);
4082        assert_eq_m256d(r, e);
4083    }
4084
4085    #[simd_test(enable = "avx")]
4086    const fn test_mm_permute_pd() {
4087        let a = _mm_setr_pd(4., 3.);
4088        let r = _mm_permute_pd::<1>(a);
4089        let e = _mm_setr_pd(3., 4.);
4090        assert_eq_m128d(r, e);
4091    }
4092
4093    #[simd_test(enable = "avx")]
4094    const fn test_mm256_permute2f128_ps() {
4095        let a = _mm256_setr_ps(11., 12., 13., 14., 15., 16., 17., 18.);
4096        let b = _mm256_setr_ps(21., 22., 23., 24., 25., 26., 27., 28.);
4097        let r = _mm256_permute2f128_ps::<0b0001_0011>(a, b);
4098        let e = _mm256_setr_ps(25., 26., 27., 28., 15., 16., 17., 18.);
4099        assert_eq_m256(r, e);
4100
4101        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
4102        let r = _mm256_permute2f128_ps::<0b1001_1011>(a, b);
4103        let z = _mm256_setr_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
4104        assert_eq_m256(r, z);
4105    }
4106
4107    #[simd_test(enable = "avx")]
4108    const fn test_mm256_permute2f128_pd() {
4109        let a = _mm256_setr_pd(1., 2., 3., 4.);
4110        let b = _mm256_setr_pd(5., 6., 7., 8.);
4111        let r = _mm256_permute2f128_pd::<0b0011_0001>(a, b);
4112        let e = _mm256_setr_pd(3., 4., 7., 8.);
4113        assert_eq_m256d(r, e);
4114
4115        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
4116        let r = _mm256_permute2f128_pd::<0b1011_1001>(a, b);
4117        let e = _mm256_setr_pd(0.0, 0.0, 0.0, 0.0);
4118        assert_eq_m256d(r, e);
4119    }
4120
4121    #[simd_test(enable = "avx")]
4122    const fn test_mm256_permute2f128_si256() {
4123        let a = _mm256_setr_epi32(11, 12, 13, 14, 15, 16, 17, 18);
4124        let b = _mm256_setr_epi32(21, 22, 23, 24, 25, 26, 27, 28);
4125        let r = _mm256_permute2f128_si256::<0b0010_0000>(a, b);
4126        let e = _mm256_setr_epi32(11, 12, 13, 14, 21, 22, 23, 24);
4127        assert_eq_m256i(r, e);
4128
4129        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
4130        let r = _mm256_permute2f128_si256::<0b1010_1000>(a, b);
4131        let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0);
4132        assert_eq_m256i(r, e);
4133    }
4134
4135    #[simd_test(enable = "avx")]
4136    const fn test_mm256_broadcast_ss() {
4137        let r = _mm256_broadcast_ss(&3.);
4138        let e = _mm256_set1_ps(3.);
4139        assert_eq_m256(r, e);
4140    }
4141
4142    #[simd_test(enable = "avx")]
4143    const fn test_mm_broadcast_ss() {
4144        let r = _mm_broadcast_ss(&3.);
4145        let e = _mm_set1_ps(3.);
4146        assert_eq_m128(r, e);
4147    }
4148
4149    #[simd_test(enable = "avx")]
4150    const fn test_mm256_broadcast_sd() {
4151        let r = _mm256_broadcast_sd(&3.);
4152        let e = _mm256_set1_pd(3.);
4153        assert_eq_m256d(r, e);
4154    }
4155
4156    #[simd_test(enable = "avx")]
4157    const fn test_mm256_broadcast_ps() {
4158        let a = _mm_setr_ps(4., 3., 2., 5.);
4159        let r = _mm256_broadcast_ps(&a);
4160        let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
4161        assert_eq_m256(r, e);
4162    }
4163
4164    #[simd_test(enable = "avx")]
4165    const fn test_mm256_broadcast_pd() {
4166        let a = _mm_setr_pd(4., 3.);
4167        let r = _mm256_broadcast_pd(&a);
4168        let e = _mm256_setr_pd(4., 3., 4., 3.);
4169        assert_eq_m256d(r, e);
4170    }
4171
4172    #[simd_test(enable = "avx")]
4173    const fn test_mm256_insertf128_ps() {
4174        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4175        let b = _mm_setr_ps(4., 9., 16., 25.);
4176        let r = _mm256_insertf128_ps::<0>(a, b);
4177        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
4178        assert_eq_m256(r, e);
4179    }
4180
4181    #[simd_test(enable = "avx")]
4182    const fn test_mm256_insertf128_pd() {
4183        let a = _mm256_setr_pd(1., 2., 3., 4.);
4184        let b = _mm_setr_pd(5., 6.);
4185        let r = _mm256_insertf128_pd::<0>(a, b);
4186        let e = _mm256_setr_pd(5., 6., 3., 4.);
4187        assert_eq_m256d(r, e);
4188    }
4189
4190    #[simd_test(enable = "avx")]
4191    const fn test_mm256_insertf128_si256() {
4192        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4193        let b = _mm_setr_epi64x(5, 6);
4194        let r = _mm256_insertf128_si256::<0>(a, b);
4195        let e = _mm256_setr_epi64x(5, 6, 3, 4);
4196        assert_eq_m256i(r, e);
4197    }
4198
4199    #[simd_test(enable = "avx")]
4200    const fn test_mm256_insert_epi8() {
4201        #[rustfmt::skip]
4202        let a = _mm256_setr_epi8(
4203            1, 2, 3, 4, 5, 6, 7, 8,
4204            9, 10, 11, 12, 13, 14, 15, 16,
4205            17, 18, 19, 20, 21, 22, 23, 24,
4206            25, 26, 27, 28, 29, 30, 31, 32,
4207        );
4208        let r = _mm256_insert_epi8::<31>(a, 0);
4209        #[rustfmt::skip]
4210        let e = _mm256_setr_epi8(
4211            1, 2, 3, 4, 5, 6, 7, 8,
4212            9, 10, 11, 12, 13, 14, 15, 16,
4213            17, 18, 19, 20, 21, 22, 23, 24,
4214            25, 26, 27, 28, 29, 30, 31, 0,
4215        );
4216        assert_eq_m256i(r, e);
4217    }
4218
4219    #[simd_test(enable = "avx")]
4220    const fn test_mm256_insert_epi16() {
4221        #[rustfmt::skip]
4222        let a = _mm256_setr_epi16(
4223            0, 1, 2, 3, 4, 5, 6, 7,
4224            8, 9, 10, 11, 12, 13, 14, 15,
4225        );
4226        let r = _mm256_insert_epi16::<15>(a, 0);
4227        #[rustfmt::skip]
4228        let e = _mm256_setr_epi16(
4229            0, 1, 2, 3, 4, 5, 6, 7,
4230            8, 9, 10, 11, 12, 13, 14, 0,
4231        );
4232        assert_eq_m256i(r, e);
4233    }
4234
4235    #[simd_test(enable = "avx")]
4236    const fn test_mm256_insert_epi32() {
4237        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4238        let r = _mm256_insert_epi32::<7>(a, 0);
4239        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
4240        assert_eq_m256i(r, e);
4241    }
4242
4243    #[simd_test(enable = "avx")]
4244    const fn test_mm256_load_pd() {
4245        let a = _mm256_setr_pd(1., 2., 3., 4.);
4246        let p = ptr::addr_of!(a) as *const f64;
4247        let r = unsafe { _mm256_load_pd(p) };
4248        let e = _mm256_setr_pd(1., 2., 3., 4.);
4249        assert_eq_m256d(r, e);
4250    }
4251
4252    #[simd_test(enable = "avx")]
4253    const fn test_mm256_store_pd() {
4254        let a = _mm256_setr_pd(1., 2., 3., 4.);
4255        let mut r = _mm256_undefined_pd();
4256        unsafe {
4257            _mm256_store_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4258        }
4259        assert_eq_m256d(r, a);
4260    }
4261
4262    #[simd_test(enable = "avx")]
4263    const fn test_mm256_load_ps() {
4264        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4265        let p = ptr::addr_of!(a) as *const f32;
4266        let r = unsafe { _mm256_load_ps(p) };
4267        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4268        assert_eq_m256(r, e);
4269    }
4270
4271    #[simd_test(enable = "avx")]
4272    const fn test_mm256_store_ps() {
4273        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4274        let mut r = _mm256_undefined_ps();
4275        unsafe {
4276            _mm256_store_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4277        }
4278        assert_eq_m256(r, a);
4279    }
4280
4281    #[simd_test(enable = "avx")]
4282    const fn test_mm256_loadu_pd() {
4283        let a = &[1.0f64, 2., 3., 4.];
4284        let p = a.as_ptr();
4285        let r = unsafe { _mm256_loadu_pd(black_box(p)) };
4286        let e = _mm256_setr_pd(1., 2., 3., 4.);
4287        assert_eq_m256d(r, e);
4288    }
4289
4290    #[simd_test(enable = "avx")]
4291    const fn test_mm256_storeu_pd() {
4292        let a = _mm256_set1_pd(9.);
4293        let mut r = _mm256_undefined_pd();
4294        unsafe {
4295            _mm256_storeu_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4296        }
4297        assert_eq_m256d(r, a);
4298    }
4299
4300    #[simd_test(enable = "avx")]
4301    const fn test_mm256_loadu_ps() {
4302        let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
4303        let p = a.as_ptr();
4304        let r = unsafe { _mm256_loadu_ps(black_box(p)) };
4305        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4306        assert_eq_m256(r, e);
4307    }
4308
4309    #[simd_test(enable = "avx")]
4310    const fn test_mm256_storeu_ps() {
4311        let a = _mm256_set1_ps(9.);
4312        let mut r = _mm256_undefined_ps();
4313        unsafe {
4314            _mm256_storeu_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4315        }
4316        assert_eq_m256(r, a);
4317    }
4318
4319    #[simd_test(enable = "avx")]
4320    const fn test_mm256_load_si256() {
4321        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4322        let p = ptr::addr_of!(a);
4323        let r = unsafe { _mm256_load_si256(p) };
4324        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4325        assert_eq_m256i(r, e);
4326    }
4327
4328    #[simd_test(enable = "avx")]
4329    const fn test_mm256_store_si256() {
4330        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4331        let mut r = _mm256_undefined_si256();
4332        unsafe {
4333            _mm256_store_si256(ptr::addr_of_mut!(r), a);
4334        }
4335        assert_eq_m256i(r, a);
4336    }
4337
4338    #[simd_test(enable = "avx")]
4339    const fn test_mm256_loadu_si256() {
4340        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4341        let p = ptr::addr_of!(a);
4342        let r = unsafe { _mm256_loadu_si256(black_box(p)) };
4343        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4344        assert_eq_m256i(r, e);
4345    }
4346
4347    #[simd_test(enable = "avx")]
4348    const fn test_mm256_storeu_si256() {
4349        let a = _mm256_set1_epi8(9);
4350        let mut r = _mm256_undefined_si256();
4351        unsafe {
4352            _mm256_storeu_si256(ptr::addr_of_mut!(r), a);
4353        }
4354        assert_eq_m256i(r, a);
4355    }
4356
4357    #[simd_test(enable = "avx")]
4358    const fn test_mm256_maskload_pd() {
4359        let a = &[1.0f64, 2., 3., 4.];
4360        let p = a.as_ptr();
4361        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
4362        let r = unsafe { _mm256_maskload_pd(black_box(p), mask) };
4363        let e = _mm256_setr_pd(0., 2., 0., 4.);
4364        assert_eq_m256d(r, e);
4365    }
4366
4367    #[simd_test(enable = "avx")]
4368    const fn test_mm256_maskstore_pd() {
4369        let mut r = _mm256_set1_pd(0.);
4370        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
4371        let a = _mm256_setr_pd(1., 2., 3., 4.);
4372        unsafe {
4373            _mm256_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
4374        }
4375        let e = _mm256_setr_pd(0., 2., 0., 4.);
4376        assert_eq_m256d(r, e);
4377    }
4378
4379    #[simd_test(enable = "avx")]
4380    const fn test_mm_maskload_pd() {
4381        let a = &[1.0f64, 2.];
4382        let p = a.as_ptr();
4383        let mask = _mm_setr_epi64x(0, !0);
4384        let r = unsafe { _mm_maskload_pd(black_box(p), mask) };
4385        let e = _mm_setr_pd(0., 2.);
4386        assert_eq_m128d(r, e);
4387    }
4388
4389    #[simd_test(enable = "avx")]
4390    const fn test_mm_maskstore_pd() {
4391        let mut r = _mm_set1_pd(0.);
4392        let mask = _mm_setr_epi64x(0, !0);
4393        let a = _mm_setr_pd(1., 2.);
4394        unsafe {
4395            _mm_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
4396        }
4397        let e = _mm_setr_pd(0., 2.);
4398        assert_eq_m128d(r, e);
4399    }
4400
4401    #[simd_test(enable = "avx")]
4402    const fn test_mm256_maskload_ps() {
4403        let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
4404        let p = a.as_ptr();
4405        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4406        let r = unsafe { _mm256_maskload_ps(black_box(p), mask) };
4407        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4408        assert_eq_m256(r, e);
4409    }
4410
4411    #[simd_test(enable = "avx")]
4412    const fn test_mm256_maskstore_ps() {
4413        let mut r = _mm256_set1_ps(0.);
4414        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4415        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4416        unsafe {
4417            _mm256_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4418        }
4419        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4420        assert_eq_m256(r, e);
4421    }
4422
4423    #[simd_test(enable = "avx")]
4424    const fn test_mm_maskload_ps() {
4425        let a = &[1.0f32, 2., 3., 4.];
4426        let p = a.as_ptr();
4427        let mask = _mm_setr_epi32(0, !0, 0, !0);
4428        let r = unsafe { _mm_maskload_ps(black_box(p), mask) };
4429        let e = _mm_setr_ps(0., 2., 0., 4.);
4430        assert_eq_m128(r, e);
4431    }
4432
4433    #[simd_test(enable = "avx")]
4434    const fn test_mm_maskstore_ps() {
4435        let mut r = _mm_set1_ps(0.);
4436        let mask = _mm_setr_epi32(0, !0, 0, !0);
4437        let a = _mm_setr_ps(1., 2., 3., 4.);
4438        unsafe {
4439            _mm_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4440        }
4441        let e = _mm_setr_ps(0., 2., 0., 4.);
4442        assert_eq_m128(r, e);
4443    }
4444
4445    #[simd_test(enable = "avx")]
4446    const fn test_mm256_movehdup_ps() {
4447        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4448        let r = _mm256_movehdup_ps(a);
4449        let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
4450        assert_eq_m256(r, e);
4451    }
4452
4453    #[simd_test(enable = "avx")]
4454    const fn test_mm256_moveldup_ps() {
4455        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4456        let r = _mm256_moveldup_ps(a);
4457        let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
4458        assert_eq_m256(r, e);
4459    }
4460
4461    #[simd_test(enable = "avx")]
4462    const fn test_mm256_movedup_pd() {
4463        let a = _mm256_setr_pd(1., 2., 3., 4.);
4464        let r = _mm256_movedup_pd(a);
4465        let e = _mm256_setr_pd(1., 1., 3., 3.);
4466        assert_eq_m256d(r, e);
4467    }
4468
4469    #[simd_test(enable = "avx")]
4470    fn test_mm256_lddqu_si256() {
4471        #[rustfmt::skip]
4472        let a = _mm256_setr_epi8(
4473            1, 2, 3, 4, 5, 6, 7, 8,
4474            9, 10, 11, 12, 13, 14, 15, 16,
4475            17, 18, 19, 20, 21, 22, 23, 24,
4476            25, 26, 27, 28, 29, 30, 31, 32,
4477        );
4478        let p = ptr::addr_of!(a);
4479        let r = unsafe { _mm256_lddqu_si256(black_box(p)) };
4480        #[rustfmt::skip]
4481        let e = _mm256_setr_epi8(
4482            1, 2, 3, 4, 5, 6, 7, 8,
4483            9, 10, 11, 12, 13, 14, 15, 16,
4484            17, 18, 19, 20, 21, 22, 23, 24,
4485            25, 26, 27, 28, 29, 30, 31, 32,
4486        );
4487        assert_eq_m256i(r, e);
4488    }
4489
4490    #[simd_test(enable = "avx")]
4491    #[cfg_attr(miri, ignore)] // Inline asm (for non-temporal store), which is not supported by Miri
4492    fn test_mm256_stream_si256() {
4493        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4494        let mut r = _mm256_undefined_si256();
4495        unsafe {
4496            _mm256_stream_si256(ptr::addr_of_mut!(r), a);
4497        }
4498        _mm_sfence();
4499        assert_eq_m256i(r, a);
4500    }
4501
4502    #[simd_test(enable = "avx")]
4503    #[cfg_attr(miri, ignore)] // Inline asm (for non-temporal store), which is not supported by Miri
4504    fn test_mm256_stream_pd() {
4505        #[repr(align(32))]
4506        struct Memory {
4507            pub data: [f64; 4],
4508        }
4509        let a = _mm256_set1_pd(7.0);
4510        let mut mem = Memory { data: [-1.0; 4] };
4511
4512        unsafe {
4513            _mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
4514        }
4515        _mm_sfence();
4516        for i in 0..4 {
4517            assert_eq!(mem.data[i], get_m256d(a, i));
4518        }
4519    }
4520
4521    #[simd_test(enable = "avx")]
4522    #[cfg_attr(miri, ignore)] // Inline asm (for non-temporal store), which is not supported by Miri
4523    fn test_mm256_stream_ps() {
4524        #[repr(align(32))]
4525        struct Memory {
4526            pub data: [f32; 8],
4527        }
4528        let a = _mm256_set1_ps(7.0);
4529        let mut mem = Memory { data: [-1.0; 8] };
4530
4531        unsafe {
4532            _mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
4533        }
4534        _mm_sfence();
4535        for i in 0..8 {
4536            assert_eq!(mem.data[i], get_m256(a, i));
4537        }
4538    }
4539
4540    #[simd_test(enable = "avx")]
4541    fn test_mm256_rcp_ps() {
4542        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4543        let r = _mm256_rcp_ps(a);
4544        #[rustfmt::skip]
4545        let e = _mm256_setr_ps(
4546            0.99975586, 0.49987793, 0.33325195, 0.24993896,
4547            0.19995117, 0.16662598, 0.14282227, 0.12496948,
4548        );
4549        let rel_err = 0.00048828125;
4550        for i in 0..8 {
4551            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4552        }
4553    }
4554
4555    #[simd_test(enable = "avx")]
4556    fn test_mm256_rsqrt_ps() {
4557        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4558        let r = _mm256_rsqrt_ps(a);
4559        #[rustfmt::skip]
4560        let e = _mm256_setr_ps(
4561            0.99975586, 0.7069092, 0.5772705, 0.49987793,
4562            0.44714355, 0.40820313, 0.3779297, 0.3534546,
4563        );
4564        let rel_err = 0.00048828125;
4565        for i in 0..8 {
4566            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4567        }
4568    }
4569
4570    #[simd_test(enable = "avx")]
4571    const fn test_mm256_unpackhi_pd() {
4572        let a = _mm256_setr_pd(1., 2., 3., 4.);
4573        let b = _mm256_setr_pd(5., 6., 7., 8.);
4574        let r = _mm256_unpackhi_pd(a, b);
4575        let e = _mm256_setr_pd(2., 6., 4., 8.);
4576        assert_eq_m256d(r, e);
4577    }
4578
4579    #[simd_test(enable = "avx")]
4580    const fn test_mm256_unpackhi_ps() {
4581        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4582        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4583        let r = _mm256_unpackhi_ps(a, b);
4584        let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
4585        assert_eq_m256(r, e);
4586    }
4587
4588    #[simd_test(enable = "avx")]
4589    const fn test_mm256_unpacklo_pd() {
4590        let a = _mm256_setr_pd(1., 2., 3., 4.);
4591        let b = _mm256_setr_pd(5., 6., 7., 8.);
4592        let r = _mm256_unpacklo_pd(a, b);
4593        let e = _mm256_setr_pd(1., 5., 3., 7.);
4594        assert_eq_m256d(r, e);
4595    }
4596
4597    #[simd_test(enable = "avx")]
4598    const fn test_mm256_unpacklo_ps() {
4599        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4600        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4601        let r = _mm256_unpacklo_ps(a, b);
4602        let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
4603        assert_eq_m256(r, e);
4604    }
4605
4606    #[simd_test(enable = "avx")]
4607    const fn test_mm256_testz_si256() {
4608        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4609        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4610        let r = _mm256_testz_si256(a, b);
4611        assert_eq!(r, 0);
4612        let b = _mm256_set1_epi64x(0);
4613        let r = _mm256_testz_si256(a, b);
4614        assert_eq!(r, 1);
4615    }
4616
4617    #[simd_test(enable = "avx")]
4618    const fn test_mm256_testc_si256() {
4619        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4620        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4621        let r = _mm256_testc_si256(a, b);
4622        assert_eq!(r, 0);
4623        let b = _mm256_set1_epi64x(0);
4624        let r = _mm256_testc_si256(a, b);
4625        assert_eq!(r, 1);
4626    }
4627
4628    #[simd_test(enable = "avx")]
4629    fn test_mm256_testnzc_si256() {
4630        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4631        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4632        let r = _mm256_testnzc_si256(a, b);
4633        assert_eq!(r, 1);
4634        let a = _mm256_setr_epi64x(0, 0, 0, 0);
4635        let b = _mm256_setr_epi64x(0, 0, 0, 0);
4636        let r = _mm256_testnzc_si256(a, b);
4637        assert_eq!(r, 0);
4638    }
4639
4640    #[simd_test(enable = "avx")]
4641    fn test_mm256_testz_pd() {
4642        let a = _mm256_setr_pd(1., 2., 3., 4.);
4643        let b = _mm256_setr_pd(5., 6., 7., 8.);
4644        let r = _mm256_testz_pd(a, b);
4645        assert_eq!(r, 1);
4646        let a = _mm256_set1_pd(-1.);
4647        let r = _mm256_testz_pd(a, a);
4648        assert_eq!(r, 0);
4649    }
4650
4651    #[simd_test(enable = "avx")]
4652    fn test_mm256_testc_pd() {
4653        let a = _mm256_setr_pd(1., 2., 3., 4.);
4654        let b = _mm256_setr_pd(5., 6., 7., 8.);
4655        let r = _mm256_testc_pd(a, b);
4656        assert_eq!(r, 1);
4657        let a = _mm256_set1_pd(1.);
4658        let b = _mm256_set1_pd(-1.);
4659        let r = _mm256_testc_pd(a, b);
4660        assert_eq!(r, 0);
4661    }
4662
4663    #[simd_test(enable = "avx")]
4664    fn test_mm256_testnzc_pd() {
4665        let a = _mm256_setr_pd(1., 2., 3., 4.);
4666        let b = _mm256_setr_pd(5., 6., 7., 8.);
4667        let r = _mm256_testnzc_pd(a, b);
4668        assert_eq!(r, 0);
4669        let a = _mm256_setr_pd(1., -1., -1., -1.);
4670        let b = _mm256_setr_pd(-1., -1., 1., 1.);
4671        let r = _mm256_testnzc_pd(a, b);
4672        assert_eq!(r, 1);
4673    }
4674
4675    #[simd_test(enable = "avx")]
4676    const fn test_mm_testz_pd() {
4677        let a = _mm_setr_pd(1., 2.);
4678        let b = _mm_setr_pd(5., 6.);
4679        let r = _mm_testz_pd(a, b);
4680        assert_eq!(r, 1);
4681        let a = _mm_set1_pd(-1.);
4682        let r = _mm_testz_pd(a, a);
4683        assert_eq!(r, 0);
4684    }
4685
4686    #[simd_test(enable = "avx")]
4687    const fn test_mm_testc_pd() {
4688        let a = _mm_setr_pd(1., 2.);
4689        let b = _mm_setr_pd(5., 6.);
4690        let r = _mm_testc_pd(a, b);
4691        assert_eq!(r, 1);
4692        let a = _mm_set1_pd(1.);
4693        let b = _mm_set1_pd(-1.);
4694        let r = _mm_testc_pd(a, b);
4695        assert_eq!(r, 0);
4696    }
4697
4698    #[simd_test(enable = "avx")]
4699    fn test_mm_testnzc_pd() {
4700        let a = _mm_setr_pd(1., 2.);
4701        let b = _mm_setr_pd(5., 6.);
4702        let r = _mm_testnzc_pd(a, b);
4703        assert_eq!(r, 0);
4704        let a = _mm_setr_pd(1., -1.);
4705        let b = _mm_setr_pd(-1., -1.);
4706        let r = _mm_testnzc_pd(a, b);
4707        assert_eq!(r, 1);
4708    }
4709
4710    #[simd_test(enable = "avx")]
4711    fn test_mm256_testz_ps() {
4712        let a = _mm256_set1_ps(1.);
4713        let r = _mm256_testz_ps(a, a);
4714        assert_eq!(r, 1);
4715        let a = _mm256_set1_ps(-1.);
4716        let r = _mm256_testz_ps(a, a);
4717        assert_eq!(r, 0);
4718    }
4719
4720    #[simd_test(enable = "avx")]
4721    fn test_mm256_testc_ps() {
4722        let a = _mm256_set1_ps(1.);
4723        let r = _mm256_testc_ps(a, a);
4724        assert_eq!(r, 1);
4725        let b = _mm256_set1_ps(-1.);
4726        let r = _mm256_testc_ps(a, b);
4727        assert_eq!(r, 0);
4728    }
4729
4730    #[simd_test(enable = "avx")]
4731    fn test_mm256_testnzc_ps() {
4732        let a = _mm256_set1_ps(1.);
4733        let r = _mm256_testnzc_ps(a, a);
4734        assert_eq!(r, 0);
4735        let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
4736        let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
4737        let r = _mm256_testnzc_ps(a, b);
4738        assert_eq!(r, 1);
4739    }
4740
4741    #[simd_test(enable = "avx")]
4742    const fn test_mm_testz_ps() {
4743        let a = _mm_set1_ps(1.);
4744        let r = _mm_testz_ps(a, a);
4745        assert_eq!(r, 1);
4746        let a = _mm_set1_ps(-1.);
4747        let r = _mm_testz_ps(a, a);
4748        assert_eq!(r, 0);
4749    }
4750
4751    #[simd_test(enable = "avx")]
4752    const fn test_mm_testc_ps() {
4753        let a = _mm_set1_ps(1.);
4754        let r = _mm_testc_ps(a, a);
4755        assert_eq!(r, 1);
4756        let b = _mm_set1_ps(-1.);
4757        let r = _mm_testc_ps(a, b);
4758        assert_eq!(r, 0);
4759    }
4760
4761    #[simd_test(enable = "avx")]
4762    fn test_mm_testnzc_ps() {
4763        let a = _mm_set1_ps(1.);
4764        let r = _mm_testnzc_ps(a, a);
4765        assert_eq!(r, 0);
4766        let a = _mm_setr_ps(1., -1., -1., -1.);
4767        let b = _mm_setr_ps(-1., -1., 1., 1.);
4768        let r = _mm_testnzc_ps(a, b);
4769        assert_eq!(r, 1);
4770    }
4771
4772    #[simd_test(enable = "avx")]
4773    const fn test_mm256_movemask_pd() {
4774        let a = _mm256_setr_pd(1., -2., 3., -4.);
4775        let r = _mm256_movemask_pd(a);
4776        assert_eq!(r, 0xA);
4777    }
4778
4779    #[simd_test(enable = "avx")]
4780    const fn test_mm256_movemask_ps() {
4781        let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
4782        let r = _mm256_movemask_ps(a);
4783        assert_eq!(r, 0xAA);
4784    }
4785
4786    #[simd_test(enable = "avx")]
4787    const fn test_mm256_setzero_pd() {
4788        let r = _mm256_setzero_pd();
4789        assert_eq_m256d(r, _mm256_set1_pd(0.));
4790    }
4791
4792    #[simd_test(enable = "avx")]
4793    const fn test_mm256_setzero_ps() {
4794        let r = _mm256_setzero_ps();
4795        assert_eq_m256(r, _mm256_set1_ps(0.));
4796    }
4797
4798    #[simd_test(enable = "avx")]
4799    const fn test_mm256_setzero_si256() {
4800        let r = _mm256_setzero_si256();
4801        assert_eq_m256i(r, _mm256_set1_epi8(0));
4802    }
4803
4804    #[simd_test(enable = "avx")]
4805    const fn test_mm256_set_pd() {
4806        let r = _mm256_set_pd(1., 2., 3., 4.);
4807        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
4808    }
4809
4810    #[simd_test(enable = "avx")]
4811    const fn test_mm256_set_ps() {
4812        let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4813        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
4814    }
4815
4816    #[simd_test(enable = "avx")]
4817    const fn test_mm256_set_epi8() {
4818        #[rustfmt::skip]
4819        let r = _mm256_set_epi8(
4820            1, 2, 3, 4, 5, 6, 7, 8,
4821            9, 10, 11, 12, 13, 14, 15, 16,
4822            17, 18, 19, 20, 21, 22, 23, 24,
4823            25, 26, 27, 28, 29, 30, 31, 32,
4824        );
4825        #[rustfmt::skip]
4826        let e = _mm256_setr_epi8(
4827            32, 31, 30, 29, 28, 27, 26, 25,
4828            24, 23, 22, 21, 20, 19, 18, 17,
4829            16, 15, 14, 13, 12, 11, 10, 9,
4830            8, 7, 6, 5, 4, 3, 2, 1
4831        );
4832        assert_eq_m256i(r, e);
4833    }
4834
4835    #[simd_test(enable = "avx")]
4836    const fn test_mm256_set_epi16() {
4837        #[rustfmt::skip]
4838        let r = _mm256_set_epi16(
4839            1, 2, 3, 4, 5, 6, 7, 8,
4840            9, 10, 11, 12, 13, 14, 15, 16,
4841        );
4842        #[rustfmt::skip]
4843        let e = _mm256_setr_epi16(
4844            16, 15, 14, 13, 12, 11, 10, 9, 8,
4845            7, 6, 5, 4, 3, 2, 1,
4846        );
4847        assert_eq_m256i(r, e);
4848    }
4849
4850    #[simd_test(enable = "avx")]
4851    const fn test_mm256_set_epi32() {
4852        let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4853        assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
4854    }
4855
4856    #[simd_test(enable = "avx")]
4857    const fn test_mm256_set_epi64x() {
4858        let r = _mm256_set_epi64x(1, 2, 3, 4);
4859        assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
4860    }
4861
4862    #[simd_test(enable = "avx")]
4863    const fn test_mm256_setr_pd() {
4864        let r = _mm256_setr_pd(1., 2., 3., 4.);
4865        assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
4866    }
4867
4868    #[simd_test(enable = "avx")]
4869    const fn test_mm256_setr_ps() {
4870        let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4871        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
4872    }
4873
4874    #[simd_test(enable = "avx")]
4875    const fn test_mm256_setr_epi8() {
4876        #[rustfmt::skip]
4877        let r = _mm256_setr_epi8(
4878            1, 2, 3, 4, 5, 6, 7, 8,
4879            9, 10, 11, 12, 13, 14, 15, 16,
4880            17, 18, 19, 20, 21, 22, 23, 24,
4881            25, 26, 27, 28, 29, 30, 31, 32,
4882        );
4883        #[rustfmt::skip]
4884        let e = _mm256_setr_epi8(
4885            1, 2, 3, 4, 5, 6, 7, 8,
4886            9, 10, 11, 12, 13, 14, 15, 16,
4887            17, 18, 19, 20, 21, 22, 23, 24,
4888            25, 26, 27, 28, 29, 30, 31, 32
4889        );
4890
4891        assert_eq_m256i(r, e);
4892    }
4893
4894    #[simd_test(enable = "avx")]
4895    const fn test_mm256_setr_epi16() {
4896        #[rustfmt::skip]
4897        let r = _mm256_setr_epi16(
4898            1, 2, 3, 4, 5, 6, 7, 8,
4899            9, 10, 11, 12, 13, 14, 15, 16,
4900        );
4901        #[rustfmt::skip]
4902        let e = _mm256_setr_epi16(
4903            1, 2, 3, 4, 5, 6, 7, 8,
4904            9, 10, 11, 12, 13, 14, 15, 16,
4905        );
4906        assert_eq_m256i(r, e);
4907    }
4908
4909    #[simd_test(enable = "avx")]
4910    const fn test_mm256_setr_epi32() {
4911        let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4912        assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
4913    }
4914
4915    #[simd_test(enable = "avx")]
4916    const fn test_mm256_setr_epi64x() {
4917        let r = _mm256_setr_epi64x(1, 2, 3, 4);
4918        assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
4919    }
4920
4921    #[simd_test(enable = "avx")]
4922    const fn test_mm256_set1_pd() {
4923        let r = _mm256_set1_pd(1.);
4924        assert_eq_m256d(r, _mm256_set1_pd(1.));
4925    }
4926
4927    #[simd_test(enable = "avx")]
4928    const fn test_mm256_set1_ps() {
4929        let r = _mm256_set1_ps(1.);
4930        assert_eq_m256(r, _mm256_set1_ps(1.));
4931    }
4932
4933    #[simd_test(enable = "avx")]
4934    const fn test_mm256_set1_epi8() {
4935        let r = _mm256_set1_epi8(1);
4936        assert_eq_m256i(r, _mm256_set1_epi8(1));
4937    }
4938
4939    #[simd_test(enable = "avx")]
4940    const fn test_mm256_set1_epi16() {
4941        let r = _mm256_set1_epi16(1);
4942        assert_eq_m256i(r, _mm256_set1_epi16(1));
4943    }
4944
4945    #[simd_test(enable = "avx")]
4946    const fn test_mm256_set1_epi32() {
4947        let r = _mm256_set1_epi32(1);
4948        assert_eq_m256i(r, _mm256_set1_epi32(1));
4949    }
4950
4951    #[simd_test(enable = "avx")]
4952    const fn test_mm256_set1_epi64x() {
4953        let r = _mm256_set1_epi64x(1);
4954        assert_eq_m256i(r, _mm256_set1_epi64x(1));
4955    }
4956
4957    #[simd_test(enable = "avx")]
4958    const fn test_mm256_castpd_ps() {
4959        let a = _mm256_setr_pd(1., 2., 3., 4.);
4960        let r = _mm256_castpd_ps(a);
4961        let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4962        assert_eq_m256(r, e);
4963    }
4964
4965    #[simd_test(enable = "avx")]
4966    const fn test_mm256_castps_pd() {
4967        let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4968        let r = _mm256_castps_pd(a);
4969        let e = _mm256_setr_pd(1., 2., 3., 4.);
4970        assert_eq_m256d(r, e);
4971    }
4972
4973    #[simd_test(enable = "avx")]
4974    const fn test_mm256_castps_si256() {
4975        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4976        let r = _mm256_castps_si256(a);
4977        #[rustfmt::skip]
4978        let e = _mm256_setr_epi8(
4979            0, 0, -128, 63, 0, 0, 0, 64,
4980            0, 0, 64, 64, 0, 0, -128, 64,
4981            0, 0, -96, 64, 0, 0, -64, 64,
4982            0, 0, -32, 64, 0, 0, 0, 65,
4983        );
4984        assert_eq_m256i(r, e);
4985    }
4986
4987    #[simd_test(enable = "avx")]
4988    const fn test_mm256_castsi256_ps() {
4989        #[rustfmt::skip]
4990        let a = _mm256_setr_epi8(
4991            0, 0, -128, 63, 0, 0, 0, 64,
4992            0, 0, 64, 64, 0, 0, -128, 64,
4993            0, 0, -96, 64, 0, 0, -64, 64,
4994            0, 0, -32, 64, 0, 0, 0, 65,
4995        );
4996        let r = _mm256_castsi256_ps(a);
4997        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4998        assert_eq_m256(r, e);
4999    }
5000
5001    #[simd_test(enable = "avx")]
5002    const fn test_mm256_castpd_si256() {
5003        let a = _mm256_setr_pd(1., 2., 3., 4.);
5004        let r = _mm256_castpd_si256(a);
5005        assert_eq_m256d(unsafe { transmute(r) }, a);
5006    }
5007
5008    #[simd_test(enable = "avx")]
5009    const fn test_mm256_castsi256_pd() {
5010        let a = _mm256_setr_epi64x(1, 2, 3, 4);
5011        let r = _mm256_castsi256_pd(a);
5012        assert_eq_m256d(r, unsafe { transmute(a) });
5013    }
5014
5015    #[simd_test(enable = "avx")]
5016    const fn test_mm256_castps256_ps128() {
5017        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5018        let r = _mm256_castps256_ps128(a);
5019        assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
5020    }
5021
5022    #[simd_test(enable = "avx")]
5023    const fn test_mm256_castpd256_pd128() {
5024        let a = _mm256_setr_pd(1., 2., 3., 4.);
5025        let r = _mm256_castpd256_pd128(a);
5026        assert_eq_m128d(r, _mm_setr_pd(1., 2.));
5027    }
5028
5029    #[simd_test(enable = "avx")]
5030    const fn test_mm256_castsi256_si128() {
5031        let a = _mm256_setr_epi64x(1, 2, 3, 4);
5032        let r = _mm256_castsi256_si128(a);
5033        assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
5034    }
5035
5036    #[simd_test(enable = "avx")]
5037    const fn test_mm256_castps128_ps256() {
5038        let a = _mm_setr_ps(1., 2., 3., 4.);
5039        let r = _mm256_castps128_ps256(a);
5040        assert_eq_m128(_mm256_castps256_ps128(r), a);
5041    }
5042
5043    #[simd_test(enable = "avx")]
5044    const fn test_mm256_castpd128_pd256() {
5045        let a = _mm_setr_pd(1., 2.);
5046        let r = _mm256_castpd128_pd256(a);
5047        assert_eq_m128d(_mm256_castpd256_pd128(r), a);
5048    }
5049
5050    #[simd_test(enable = "avx")]
5051    const fn test_mm256_castsi128_si256() {
5052        let a = _mm_setr_epi32(1, 2, 3, 4);
5053        let r = _mm256_castsi128_si256(a);
5054        assert_eq_m128i(_mm256_castsi256_si128(r), a);
5055    }
5056
5057    #[simd_test(enable = "avx")]
5058    const fn test_mm256_zextps128_ps256() {
5059        let a = _mm_setr_ps(1., 2., 3., 4.);
5060        let r = _mm256_zextps128_ps256(a);
5061        let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
5062        assert_eq_m256(r, e);
5063    }
5064
5065    #[simd_test(enable = "avx")]
5066    const fn test_mm256_zextsi128_si256() {
5067        let a = _mm_setr_epi64x(1, 2);
5068        let r = _mm256_zextsi128_si256(a);
5069        let e = _mm256_setr_epi64x(1, 2, 0, 0);
5070        assert_eq_m256i(r, e);
5071    }
5072
5073    #[simd_test(enable = "avx")]
5074    const fn test_mm256_zextpd128_pd256() {
5075        let a = _mm_setr_pd(1., 2.);
5076        let r = _mm256_zextpd128_pd256(a);
5077        let e = _mm256_setr_pd(1., 2., 0., 0.);
5078        assert_eq_m256d(r, e);
5079    }
5080
5081    #[simd_test(enable = "avx")]
5082    const fn test_mm256_set_m128() {
5083        let hi = _mm_setr_ps(5., 6., 7., 8.);
5084        let lo = _mm_setr_ps(1., 2., 3., 4.);
5085        let r = _mm256_set_m128(hi, lo);
5086        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5087        assert_eq_m256(r, e);
5088    }
5089
5090    #[simd_test(enable = "avx")]
5091    const fn test_mm256_set_m128d() {
5092        let hi = _mm_setr_pd(3., 4.);
5093        let lo = _mm_setr_pd(1., 2.);
5094        let r = _mm256_set_m128d(hi, lo);
5095        let e = _mm256_setr_pd(1., 2., 3., 4.);
5096        assert_eq_m256d(r, e);
5097    }
5098
5099    #[simd_test(enable = "avx")]
5100    const fn test_mm256_set_m128i() {
5101        #[rustfmt::skip]
5102        let hi = _mm_setr_epi8(
5103            17, 18, 19, 20,
5104            21, 22, 23, 24,
5105            25, 26, 27, 28,
5106            29, 30, 31, 32,
5107        );
5108        #[rustfmt::skip]
5109        let lo = _mm_setr_epi8(
5110            1, 2, 3, 4,
5111            5, 6, 7, 8,
5112            9, 10, 11, 12,
5113            13, 14, 15, 16,
5114        );
5115        let r = _mm256_set_m128i(hi, lo);
5116        #[rustfmt::skip]
5117        let e = _mm256_setr_epi8(
5118            1, 2, 3, 4, 5, 6, 7, 8,
5119            9, 10, 11, 12, 13, 14, 15, 16,
5120            17, 18, 19, 20, 21, 22, 23, 24,
5121            25, 26, 27, 28, 29, 30, 31, 32,
5122        );
5123        assert_eq_m256i(r, e);
5124    }
5125
5126    #[simd_test(enable = "avx")]
5127    const fn test_mm256_setr_m128() {
5128        let lo = _mm_setr_ps(1., 2., 3., 4.);
5129        let hi = _mm_setr_ps(5., 6., 7., 8.);
5130        let r = _mm256_setr_m128(lo, hi);
5131        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5132        assert_eq_m256(r, e);
5133    }
5134
5135    #[simd_test(enable = "avx")]
5136    const fn test_mm256_setr_m128d() {
5137        let lo = _mm_setr_pd(1., 2.);
5138        let hi = _mm_setr_pd(3., 4.);
5139        let r = _mm256_setr_m128d(lo, hi);
5140        let e = _mm256_setr_pd(1., 2., 3., 4.);
5141        assert_eq_m256d(r, e);
5142    }
5143
5144    #[simd_test(enable = "avx")]
5145    const fn test_mm256_setr_m128i() {
5146        #[rustfmt::skip]
5147        let lo = _mm_setr_epi8(
5148            1, 2, 3, 4,
5149            5, 6, 7, 8,
5150            9, 10, 11, 12,
5151            13, 14, 15, 16,
5152        );
5153        #[rustfmt::skip]
5154        let hi = _mm_setr_epi8(
5155            17, 18, 19, 20, 21, 22, 23, 24,
5156            25, 26, 27, 28, 29, 30, 31, 32,
5157        );
5158        let r = _mm256_setr_m128i(lo, hi);
5159        #[rustfmt::skip]
5160        let e = _mm256_setr_epi8(
5161            1, 2, 3, 4, 5, 6, 7, 8,
5162            9, 10, 11, 12, 13, 14, 15, 16,
5163            17, 18, 19, 20, 21, 22, 23, 24,
5164            25, 26, 27, 28, 29, 30, 31, 32,
5165        );
5166        assert_eq_m256i(r, e);
5167    }
5168
5169    #[simd_test(enable = "avx")]
5170    const fn test_mm256_loadu2_m128() {
5171        let hi = &[5., 6., 7., 8.];
5172        let hiaddr = hi.as_ptr();
5173        let lo = &[1., 2., 3., 4.];
5174        let loaddr = lo.as_ptr();
5175        let r = unsafe { _mm256_loadu2_m128(hiaddr, loaddr) };
5176        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5177        assert_eq_m256(r, e);
5178    }
5179
5180    #[simd_test(enable = "avx")]
5181    const fn test_mm256_loadu2_m128d() {
5182        let hi = &[3., 4.];
5183        let hiaddr = hi.as_ptr();
5184        let lo = &[1., 2.];
5185        let loaddr = lo.as_ptr();
5186        let r = unsafe { _mm256_loadu2_m128d(hiaddr, loaddr) };
5187        let e = _mm256_setr_pd(1., 2., 3., 4.);
5188        assert_eq_m256d(r, e);
5189    }
5190
5191    #[simd_test(enable = "avx")]
5192    const fn test_mm256_loadu2_m128i() {
5193        #[rustfmt::skip]
5194        let hi = _mm_setr_epi8(
5195            17, 18, 19, 20, 21, 22, 23, 24,
5196            25, 26, 27, 28, 29, 30, 31, 32,
5197        );
5198        #[rustfmt::skip]
5199        let lo = _mm_setr_epi8(
5200            1, 2, 3, 4, 5, 6, 7, 8,
5201            9, 10, 11, 12, 13, 14, 15, 16,
5202        );
5203        let r = unsafe {
5204            _mm256_loadu2_m128i(ptr::addr_of!(hi) as *const _, ptr::addr_of!(lo) as *const _)
5205        };
5206        #[rustfmt::skip]
5207        let e = _mm256_setr_epi8(
5208            1, 2, 3, 4, 5, 6, 7, 8,
5209            9, 10, 11, 12, 13, 14, 15, 16,
5210            17, 18, 19, 20, 21, 22, 23, 24,
5211            25, 26, 27, 28, 29, 30, 31, 32,
5212        );
5213        assert_eq_m256i(r, e);
5214    }
5215
5216    #[simd_test(enable = "avx")]
5217    const fn test_mm256_storeu2_m128() {
5218        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5219        let mut hi = _mm_undefined_ps();
5220        let mut lo = _mm_undefined_ps();
5221        unsafe {
5222            _mm256_storeu2_m128(
5223                ptr::addr_of_mut!(hi) as *mut f32,
5224                ptr::addr_of_mut!(lo) as *mut f32,
5225                a,
5226            );
5227        }
5228        assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
5229        assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
5230    }
5231
5232    #[simd_test(enable = "avx")]
5233    const fn test_mm256_storeu2_m128d() {
5234        let a = _mm256_setr_pd(1., 2., 3., 4.);
5235        let mut hi = _mm_undefined_pd();
5236        let mut lo = _mm_undefined_pd();
5237        unsafe {
5238            _mm256_storeu2_m128d(
5239                ptr::addr_of_mut!(hi) as *mut f64,
5240                ptr::addr_of_mut!(lo) as *mut f64,
5241                a,
5242            );
5243        }
5244        assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
5245        assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
5246    }
5247
5248    #[simd_test(enable = "avx")]
5249    const fn test_mm256_storeu2_m128i() {
5250        #[rustfmt::skip]
5251        let a = _mm256_setr_epi8(
5252            1, 2, 3, 4, 5, 6, 7, 8,
5253            9, 10, 11, 12, 13, 14, 15, 16,
5254            17, 18, 19, 20, 21, 22, 23, 24,
5255            25, 26, 27, 28, 29, 30, 31, 32,
5256        );
5257        let mut hi = _mm_undefined_si128();
5258        let mut lo = _mm_undefined_si128();
5259        unsafe {
5260            _mm256_storeu2_m128i(ptr::addr_of_mut!(hi), ptr::addr_of_mut!(lo), a);
5261        }
5262        #[rustfmt::skip]
5263        let e_hi = _mm_setr_epi8(
5264            17, 18, 19, 20, 21, 22, 23, 24,
5265            25, 26, 27, 28, 29, 30, 31, 32
5266        );
5267        #[rustfmt::skip]
5268        let e_lo = _mm_setr_epi8(
5269            1, 2, 3, 4, 5, 6, 7, 8,
5270            9, 10, 11, 12, 13, 14, 15, 16
5271        );
5272
5273        assert_eq_m128i(hi, e_hi);
5274        assert_eq_m128i(lo, e_lo);
5275    }
5276
5277    #[simd_test(enable = "avx")]
5278    const fn test_mm256_cvtss_f32() {
5279        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5280        let r = _mm256_cvtss_f32(a);
5281        assert_eq!(r, 1.);
5282    }
5283}