Skip to main content

core/stdarch/crates/core_arch/src/aarch64/neon/
mod.rs

1//! ARMv8 ASIMD intrinsics
2
3#![allow(non_camel_case_types)]
4
5#[rustfmt::skip]
6mod generated;
7#[rustfmt::skip]
8#[stable(feature = "neon_intrinsics", since = "1.59.0")]
9pub use self::generated::*;
10
11// FIXME: replace neon with asimd
12
13use crate::{
14    core_arch::{arm_shared::*, simd::*},
15    intrinsics::{simd::*, *},
16    mem::transmute,
17};
18#[cfg(test)]
19use stdarch_test::assert_instr;
20
// Declares the `f64` vector types of this module through the shared `types!`
// macro; the inner `#![stable]` attribute is passed to the macro along with
// the two declarations.
types! {
    #![stable(feature = "neon_intrinsics", since = "1.59.0")]

    /// ARM-specific 64-bit wide vector of one packed `f64`.
    pub struct float64x1_t(1 x f64); // FIXME: check this!
    /// ARM-specific 128-bit wide vector of two packed `f64`.
    pub struct float64x2_t(2 x f64);
}
29
/// ARM-specific type containing two `float64x1_t` vectors.
///
/// The struct is `#[repr(C)]` and both vectors are public tuple fields
/// (`.0`, `.1`).
#[repr(C)]
#[derive(Copy, Clone, Debug)]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
pub struct float64x1x2_t(pub float64x1_t, pub float64x1_t);
/// ARM-specific type containing three `float64x1_t` vectors.
///
/// The struct is `#[repr(C)]` and all three vectors are public tuple fields
/// (`.0` through `.2`).
#[repr(C)]
#[derive(Copy, Clone, Debug)]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
pub struct float64x1x3_t(pub float64x1_t, pub float64x1_t, pub float64x1_t);
/// ARM-specific type containing four `float64x1_t` vectors.
///
/// The struct is `#[repr(C)]` and all four vectors are public tuple fields
/// (`.0` through `.3`).
#[repr(C)]
#[derive(Copy, Clone, Debug)]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
pub struct float64x1x4_t(
    pub float64x1_t,
    pub float64x1_t,
    pub float64x1_t,
    pub float64x1_t,
);
50
/// ARM-specific type containing two `float64x2_t` vectors.
///
/// The struct is `#[repr(C)]` and both vectors are public tuple fields
/// (`.0`, `.1`).
#[repr(C)]
#[derive(Copy, Clone, Debug)]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
pub struct float64x2x2_t(pub float64x2_t, pub float64x2_t);
/// ARM-specific type containing three `float64x2_t` vectors.
///
/// The struct is `#[repr(C)]` and all three vectors are public tuple fields
/// (`.0` through `.2`).
#[repr(C)]
#[derive(Copy, Clone, Debug)]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
pub struct float64x2x3_t(pub float64x2_t, pub float64x2_t, pub float64x2_t);
/// ARM-specific type containing four `float64x2_t` vectors.
///
/// The struct is `#[repr(C)]` and all four vectors are public tuple fields
/// (`.0` through `.3`).
#[repr(C)]
#[derive(Copy, Clone, Debug)]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
pub struct float64x2x4_t(
    pub float64x2_t,
    pub float64x2_t,
    pub float64x2_t,
    pub float64x2_t,
);
71
/// Helper for the 'shift right and insert' functions.
///
/// Arguments: `$ty` is the lane type, `$width` the lane count, `$N` the shift
/// amount, `$a` the vector whose leading bits are kept and `$b` the vector that
/// is shifted right and inserted below them.
///
/// The expansion calls `transmute` and SIMD intrinsics without an `unsafe`
/// block of its own, so it has to be used where those calls are permitted.
macro_rules! shift_right_and_insert {
    ($ty:ty, $width:literal, $N:expr, $a:expr, $b:expr) => {{
        type V = Simd<$ty, $width>;

        // Shifting by the full lane width inserts nothing, so `$a` is returned
        // untouched (this also avoids an out-of-range shift below).
        if $N as u32 == <$ty>::BITS {
            $a
        } else {
            let a: V = transmute($a);
            let b: V = transmute($b);

            // `mask` covers the low bits that the shifted `b` will occupy;
            // `!mask` therefore selects the bits of `a` that are preserved.
            // NOTE(review): this reading assumes `$ty` is unsigned — confirm at the call sites.
            let mask = <$ty>::MAX >> $N;
            let kept: V = simd_and(a, V::splat(!mask));

            // Every lane of `b` is shifted right by the same amount `$N`.
            let shift_counts = V::splat($N as $ty);
            let shifted = simd_shr(b, shift_counts);

            // Combine the preserved bits of `a` with the shifted lanes of `b`.
            transmute(simd_or(kept, shifted))
        }
    }};
}
93
94pub(crate) use shift_right_and_insert;
95
96/// Load multiple single-element structures to one, two, three, or four registers
97#[inline]
98#[target_feature(enable = "neon")]
99#[cfg_attr(test, assert_instr(ldr))]
100#[stable(feature = "neon_intrinsics", since = "1.59.0")]
101pub unsafe fn vld1_dup_f64(ptr: *const f64) -> float64x1_t {
102    vld1_f64(ptr)
103}
104
105/// Load multiple single-element structures to one, two, three, or four registers
106#[inline]
107#[target_feature(enable = "neon")]
108#[cfg_attr(test, assert_instr(ld1r))]
109#[stable(feature = "neon_intrinsics", since = "1.59.0")]
110pub unsafe fn vld1q_dup_f64(ptr: *const f64) -> float64x2_t {
111    let x = vld1q_lane_f64::<0>(ptr, transmute(f64x2::splat(0.)));
112    simd_shuffle!(x, x, [0, 0])
113}
114
115/// Load one single-element structure to one lane of one register.
116#[inline]
117#[target_feature(enable = "neon")]
118#[rustc_legacy_const_generics(2)]
119#[cfg_attr(test, assert_instr(ldr, LANE = 0))]
120#[stable(feature = "neon_intrinsics", since = "1.59.0")]
121pub unsafe fn vld1_lane_f64<const LANE: i32>(ptr: *const f64, src: float64x1_t) -> float64x1_t {
122    static_assert!(LANE == 0);
123    simd_insert!(src, LANE as u32, *ptr)
124}
125
126/// Load one single-element structure to one lane of one register.
127#[inline]
128#[target_feature(enable = "neon")]
129#[rustc_legacy_const_generics(2)]
130#[cfg_attr(test, assert_instr(ld1, LANE = 1))]
131#[stable(feature = "neon_intrinsics", since = "1.59.0")]
132pub unsafe fn vld1q_lane_f64<const LANE: i32>(ptr: *const f64, src: float64x2_t) -> float64x2_t {
133    static_assert_uimm_bits!(LANE, 1);
134    simd_insert!(src, LANE as u32, *ptr)
135}
136
137/// Bitwise Select instructions. This instruction sets each bit in the destination SIMD&FP register
138/// to the corresponding bit from the first source SIMD&FP register when the original
139/// destination bit was 1, otherwise from the second source SIMD&FP register.
140#[inline]
141#[target_feature(enable = "neon")]
142#[cfg_attr(test, assert_instr(bsl))]
143#[stable(feature = "neon_intrinsics", since = "1.59.0")]
144pub fn vbsl_f64(a: uint64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
145    let not = int64x1_t::splat(-1);
146    unsafe {
147        transmute(simd_or(
148            simd_and(a, transmute(b)),
149            simd_and(simd_xor(a, transmute(not)), transmute(c)),
150        ))
151    }
152}
153/// Bitwise Select.
154#[inline]
155#[target_feature(enable = "neon")]
156#[cfg_attr(test, assert_instr(bsl))]
157#[stable(feature = "neon_intrinsics", since = "1.59.0")]
158pub fn vbsl_p64(a: poly64x1_t, b: poly64x1_t, c: poly64x1_t) -> poly64x1_t {
159    let not = int64x1_t::splat(-1);
160    unsafe { simd_or(simd_and(a, b), simd_and(simd_xor(a, transmute(not)), c)) }
161}
162/// Bitwise Select. (128-bit)
163#[inline]
164#[target_feature(enable = "neon")]
165#[cfg_attr(test, assert_instr(bsl))]
166#[stable(feature = "neon_intrinsics", since = "1.59.0")]
167pub fn vbslq_f64(a: uint64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
168    let not = int64x2_t::splat(-1);
169    unsafe {
170        transmute(simd_or(
171            simd_and(a, transmute(b)),
172            simd_and(simd_xor(a, transmute(not)), transmute(c)),
173        ))
174    }
175}
176/// Bitwise Select. (128-bit)
177#[inline]
178#[target_feature(enable = "neon")]
179#[cfg_attr(test, assert_instr(bsl))]
180#[stable(feature = "neon_intrinsics", since = "1.59.0")]
181pub fn vbslq_p64(a: poly64x2_t, b: poly64x2_t, c: poly64x2_t) -> poly64x2_t {
182    let not = int64x2_t::splat(-1);
183    unsafe { simd_or(simd_and(a, b), simd_and(simd_xor(a, transmute(not)), c)) }
184}
185
186/// Vector add.
187#[inline]
188#[target_feature(enable = "neon")]
189#[cfg_attr(test, assert_instr(fadd))]
190#[stable(feature = "neon_intrinsics", since = "1.59.0")]
191pub fn vadd_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
192    unsafe { simd_add(a, b) }
193}
194
195/// Vector add.
196#[inline]
197#[target_feature(enable = "neon")]
198#[cfg_attr(test, assert_instr(fadd))]
199#[stable(feature = "neon_intrinsics", since = "1.59.0")]
200pub fn vaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
201    unsafe { simd_add(a, b) }
202}
203
204/// Vector add.
205#[inline]
206#[target_feature(enable = "neon")]
207#[cfg_attr(test, assert_instr(add))]
208#[stable(feature = "neon_intrinsics", since = "1.59.0")]
209pub fn vadd_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
210    unsafe { simd_add(a, b) }
211}
212
213/// Vector add.
214#[inline]
215#[target_feature(enable = "neon")]
216#[cfg_attr(test, assert_instr(add))]
217#[stable(feature = "neon_intrinsics", since = "1.59.0")]
218pub fn vadd_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
219    unsafe { simd_add(a, b) }
220}
221
222/// Vector add.
223#[inline]
224#[target_feature(enable = "neon")]
225#[cfg_attr(test, assert_instr(add))]
226#[stable(feature = "neon_intrinsics", since = "1.59.0")]
227pub fn vaddd_s64(a: i64, b: i64) -> i64 {
228    a.wrapping_add(b)
229}
230
231/// Vector add.
232#[inline]
233#[target_feature(enable = "neon")]
234#[cfg_attr(test, assert_instr(add))]
235#[stable(feature = "neon_intrinsics", since = "1.59.0")]
236pub fn vaddd_u64(a: u64, b: u64) -> u64 {
237    a.wrapping_add(b)
238}
239
240/// Extract vector from pair of vectors
241#[inline]
242#[target_feature(enable = "neon")]
243#[cfg_attr(test, assert_instr(nop, N = 0))]
244#[rustc_legacy_const_generics(2)]
245#[stable(feature = "neon_intrinsics", since = "1.59.0")]
246pub fn vext_p64<const N: i32>(a: poly64x1_t, _b: poly64x1_t) -> poly64x1_t {
247    static_assert!(N == 0);
248    a
249}
250
251/// Extract vector from pair of vectors
252#[inline]
253#[target_feature(enable = "neon")]
254#[cfg_attr(test, assert_instr(nop, N = 0))]
255#[rustc_legacy_const_generics(2)]
256#[stable(feature = "neon_intrinsics", since = "1.59.0")]
257pub fn vext_f64<const N: i32>(a: float64x1_t, _b: float64x1_t) -> float64x1_t {
258    static_assert!(N == 0);
259    a
260}
261
262/// Duplicate vector element to vector or scalar
263#[inline]
264#[target_feature(enable = "neon")]
265#[cfg_attr(test, assert_instr(fmov))]
266#[stable(feature = "neon_intrinsics", since = "1.59.0")]
267pub fn vdup_n_p64(value: p64) -> poly64x1_t {
268    unsafe { transmute(u64x1::new(value)) }
269}
270
271/// Duplicate vector element to vector or scalar
272#[inline]
273#[target_feature(enable = "neon")]
274#[cfg_attr(test, assert_instr(nop))]
275#[stable(feature = "neon_intrinsics", since = "1.59.0")]
276pub fn vdup_n_f64(value: f64) -> float64x1_t {
277    float64x1_t::splat(value)
278}
279
280/// Duplicate vector element to vector or scalar
281#[inline]
282#[target_feature(enable = "neon")]
283#[cfg_attr(test, assert_instr(dup))]
284#[stable(feature = "neon_intrinsics", since = "1.59.0")]
285pub fn vdupq_n_p64(value: p64) -> poly64x2_t {
286    unsafe { transmute(u64x2::new(value, value)) }
287}
288
289/// Duplicate vector element to vector or scalar
290#[inline]
291#[target_feature(enable = "neon")]
292#[cfg_attr(test, assert_instr(dup))]
293#[stable(feature = "neon_intrinsics", since = "1.59.0")]
294pub fn vdupq_n_f64(value: f64) -> float64x2_t {
295    float64x2_t::splat(value)
296}
297
298/// Duplicate vector element to vector or scalar
299#[inline]
300#[target_feature(enable = "neon")]
301#[cfg_attr(test, assert_instr(fmov))]
302#[stable(feature = "neon_intrinsics", since = "1.59.0")]
303pub fn vmov_n_p64(value: p64) -> poly64x1_t {
304    vdup_n_p64(value)
305}
306
307/// Duplicate vector element to vector or scalar
308#[inline]
309#[target_feature(enable = "neon")]
310#[cfg_attr(test, assert_instr(nop))]
311#[stable(feature = "neon_intrinsics", since = "1.59.0")]
312pub fn vmov_n_f64(value: f64) -> float64x1_t {
313    vdup_n_f64(value)
314}
315
316/// Duplicate vector element to vector or scalar
317#[inline]
318#[target_feature(enable = "neon")]
319#[cfg_attr(test, assert_instr(dup))]
320#[stable(feature = "neon_intrinsics", since = "1.59.0")]
321pub fn vmovq_n_p64(value: p64) -> poly64x2_t {
322    vdupq_n_p64(value)
323}
324
325/// Duplicate vector element to vector or scalar
326#[inline]
327#[target_feature(enable = "neon")]
328#[cfg_attr(test, assert_instr(dup))]
329#[stable(feature = "neon_intrinsics", since = "1.59.0")]
330pub fn vmovq_n_f64(value: f64) -> float64x2_t {
331    vdupq_n_f64(value)
332}
333
334/// Duplicate vector element to vector or scalar
335#[inline]
336#[target_feature(enable = "neon")]
337#[rustc_legacy_const_generics(1)]
338#[stable(feature = "neon_intrinsics", since = "1.59.0")]
339#[cfg_attr(
340    all(test, any(target_arch = "aarch64", target_arch = "arm64ec")),
341    assert_instr(nop, IMM5 = 0)
342)]
343pub fn vget_lane_f64<const IMM5: i32>(v: float64x1_t) -> f64 {
344    static_assert!(IMM5 == 0);
345    unsafe { simd_extract!(v, IMM5 as u32) }
346}
347
348/// Shift left
349#[inline]
350#[target_feature(enable = "neon")]
351#[cfg_attr(test, assert_instr(nop, N = 2))]
352#[rustc_legacy_const_generics(1)]
353#[stable(feature = "neon_intrinsics", since = "1.59.0")]
354pub fn vshld_n_s64<const N: i32>(a: i64) -> i64 {
355    static_assert_uimm_bits!(N, 6);
356    a << N
357}
358
359/// Shift left
360#[inline]
361#[target_feature(enable = "neon")]
362#[cfg_attr(test, assert_instr(nop, N = 2))]
363#[rustc_legacy_const_generics(1)]
364#[stable(feature = "neon_intrinsics", since = "1.59.0")]
365pub fn vshld_n_u64<const N: i32>(a: u64) -> u64 {
366    static_assert_uimm_bits!(N, 6);
367    a << N
368}
369
370/// Signed shift right
371#[inline]
372#[target_feature(enable = "neon")]
373#[cfg_attr(test, assert_instr(nop, N = 2))]
374#[rustc_legacy_const_generics(1)]
375#[stable(feature = "neon_intrinsics", since = "1.59.0")]
376pub fn vshrd_n_s64<const N: i32>(a: i64) -> i64 {
377    static_assert!(N >= 1 && N <= 64);
378    let n: i32 = if N == 64 { 63 } else { N };
379    a >> n
380}
381
382/// Unsigned shift right
383#[inline]
384#[target_feature(enable = "neon")]
385#[cfg_attr(test, assert_instr(nop, N = 2))]
386#[rustc_legacy_const_generics(1)]
387#[stable(feature = "neon_intrinsics", since = "1.59.0")]
388pub fn vshrd_n_u64<const N: i32>(a: u64) -> u64 {
389    static_assert!(N >= 1 && N <= 64);
390    let n: i32 = if N == 64 {
391        return 0;
392    } else {
393        N
394    };
395    a >> n
396}
397
398/// Signed shift right and accumulate
399#[inline]
400#[target_feature(enable = "neon")]
401#[cfg_attr(test, assert_instr(nop, N = 2))]
402#[rustc_legacy_const_generics(2)]
403#[stable(feature = "neon_intrinsics", since = "1.59.0")]
404pub fn vsrad_n_s64<const N: i32>(a: i64, b: i64) -> i64 {
405    static_assert!(N >= 1 && N <= 64);
406    a.wrapping_add(vshrd_n_s64::<N>(b))
407}
408
409/// Unsigned shift right and accumulate
410#[inline]
411#[target_feature(enable = "neon")]
412#[cfg_attr(test, assert_instr(nop, N = 2))]
413#[rustc_legacy_const_generics(2)]
414#[stable(feature = "neon_intrinsics", since = "1.59.0")]
415pub fn vsrad_n_u64<const N: i32>(a: u64, b: u64) -> u64 {
416    static_assert!(N >= 1 && N <= 64);
417    a.wrapping_add(vshrd_n_u64::<N>(b))
418}
419
420#[cfg(test)]
421mod tests {
422    use crate::core_arch::aarch64::test_support::*;
423    use crate::core_arch::arm_shared::test_support::*;
424    use crate::core_arch::{aarch64::neon::*, aarch64::*, simd::*};
425    use stdarch_test::simd_test;
426
427    #[simd_test(enable = "neon")]
428    fn test_vadd_f64() {
429        let a = f64x1::from_array([1.]);
430        let b = f64x1::from_array([8.]);
431        let e = f64x1::from_array([9.]);
432        let r = f64x1::from(vadd_f64(a.into(), b.into()));
433        assert_eq!(r, e);
434    }
435
436    #[simd_test(enable = "neon")]
437    fn test_vaddq_f64() {
438        let a = f64x2::new(1., 2.);
439        let b = f64x2::new(8., 7.);
440        let e = f64x2::new(9., 9.);
441        let r = f64x2::from(vaddq_f64(a.into(), b.into()));
442        assert_eq!(r, e);
443    }
444
445    #[simd_test(enable = "neon")]
446    fn test_vadd_s64() {
447        let a = i64x1::from_array([1]);
448        let b = i64x1::from_array([8]);
449        let e = i64x1::from_array([9]);
450        let r = i64x1::from(vadd_s64(a.into(), b.into()));
451        assert_eq!(r, e);
452    }
453
454    #[simd_test(enable = "neon")]
455    fn test_vadd_u64() {
456        let a = u64x1::from_array([1]);
457        let b = u64x1::from_array([8]);
458        let e = u64x1::from_array([9]);
459        let r = u64x1::from(vadd_u64(a.into(), b.into()));
460        assert_eq!(r, e);
461    }
462
463    #[simd_test(enable = "neon")]
464    fn test_vaddd_s64() {
465        let a = 1_i64;
466        let b = 8_i64;
467        let e = 9_i64;
468        let r: i64 = vaddd_s64(a, b);
469        assert_eq!(r, e);
470    }
471
472    #[simd_test(enable = "neon")]
473    fn test_vaddd_u64() {
474        let a = 1_u64;
475        let b = 8_u64;
476        let e = 9_u64;
477        let r: u64 = vaddd_u64(a, b);
478        assert_eq!(r, e);
479    }
480
481    #[simd_test(enable = "neon")]
482    fn test_vext_p64() {
483        let a = u64x1::new(0);
484        let b = u64x1::new(1);
485        let e = u64x1::new(0);
486        let r = u64x1::from(vext_p64::<0>(a.into(), b.into()));
487        assert_eq!(r, e);
488    }
489
490    #[simd_test(enable = "neon")]
491    fn test_vext_f64() {
492        let a = f64x1::new(0.);
493        let b = f64x1::new(1.);
494        let e = f64x1::new(0.);
495        let r = f64x1::from(vext_f64::<0>(a.into(), b.into()));
496        assert_eq!(r, e);
497    }
498
499    #[simd_test(enable = "neon")]
500    fn test_vshld_n_s64() {
501        let a: i64 = 1;
502        let e: i64 = 4;
503        let r: i64 = vshld_n_s64::<2>(a);
504        assert_eq!(r, e);
505    }
506
507    #[simd_test(enable = "neon")]
508    fn test_vshld_n_u64() {
509        let a: u64 = 1;
510        let e: u64 = 4;
511        let r: u64 = vshld_n_u64::<2>(a);
512        assert_eq!(r, e);
513    }
514
515    #[simd_test(enable = "neon")]
516    fn test_vshrd_n_s64() {
517        let a: i64 = 4;
518        let e: i64 = 1;
519        let r: i64 = vshrd_n_s64::<2>(a);
520        assert_eq!(r, e);
521    }
522
523    #[simd_test(enable = "neon")]
524    fn test_vshrd_n_u64() {
525        let a: u64 = 4;
526        let e: u64 = 1;
527        let r: u64 = vshrd_n_u64::<2>(a);
528        assert_eq!(r, e);
529    }
530
531    #[simd_test(enable = "neon")]
532    fn test_vsrad_n_s64() {
533        let a: i64 = 1;
534        let b: i64 = 4;
535        let e: i64 = 2;
536        let r: i64 = vsrad_n_s64::<2>(a, b);
537        assert_eq!(r, e);
538    }
539
540    #[simd_test(enable = "neon")]
541    fn test_vsrad_n_u64() {
542        let a: u64 = 1;
543        let b: u64 = 4;
544        let e: u64 = 2;
545        let r: u64 = vsrad_n_u64::<2>(a, b);
546        assert_eq!(r, e);
547    }
548
549    #[simd_test(enable = "neon")]
550    fn test_vdup_n_f64() {
551        let a: f64 = 3.3;
552        let e = f64x1::new(3.3);
553        let r = f64x1::from(vdup_n_f64(a));
554        assert_eq!(r, e);
555    }
556
557    #[simd_test(enable = "neon")]
558    fn test_vdup_n_p64() {
559        let a: u64 = 3;
560        let e = u64x1::new(3);
561        let r = u64x1::from(vdup_n_p64(a));
562        assert_eq!(r, e);
563    }
564
565    #[simd_test(enable = "neon")]
566    fn test_vdupq_n_f64() {
567        let a: f64 = 3.3;
568        let e = f64x2::new(3.3, 3.3);
569        let r = f64x2::from(vdupq_n_f64(a));
570        assert_eq!(r, e);
571    }
572
573    #[simd_test(enable = "neon")]
574    fn test_vdupq_n_p64() {
575        let a: u64 = 3;
576        let e = u64x2::new(3, 3);
577        let r = u64x2::from(vdupq_n_p64(a));
578        assert_eq!(r, e);
579    }
580
581    #[simd_test(enable = "neon")]
582    fn test_vmov_n_p64() {
583        let a: u64 = 3;
584        let e = u64x1::new(3);
585        let r = u64x1::from(vmov_n_p64(a));
586        assert_eq!(r, e);
587    }
588
589    #[simd_test(enable = "neon")]
590    fn test_vmov_n_f64() {
591        let a: f64 = 3.3;
592        let e = f64x1::new(3.3);
593        let r = f64x1::from(vmov_n_f64(a));
594        assert_eq!(r, e);
595    }
596
597    #[simd_test(enable = "neon")]
598    fn test_vmovq_n_p64() {
599        let a: u64 = 3;
600        let e = u64x2::new(3, 3);
601        let r = u64x2::from(vmovq_n_p64(a));
602        assert_eq!(r, e);
603    }
604
605    #[simd_test(enable = "neon")]
606    fn test_vmovq_n_f64() {
607        let a: f64 = 3.3;
608        let e = f64x2::new(3.3, 3.3);
609        let r = f64x2::from(vmovq_n_f64(a));
610        assert_eq!(r, e);
611    }
612
613    #[simd_test(enable = "neon")]
614    fn test_vget_lane_f64() {
615        let v = f64x1::new(1.0);
616        let r = vget_lane_f64::<0>(v.into());
617        assert_eq!(r, 1.0);
618    }
619
620    #[simd_test(enable = "neon")]
621    fn test_vcopy_lane_s64() {
622        let a = i64x1::new(1);
623        let b = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
624        let e = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
625        let r = i64x1::from(vcopy_lane_s64::<0, 0>(a.into(), b.into()));
626        assert_eq!(r, e);
627    }
628
629    #[simd_test(enable = "neon")]
630    fn test_vcopy_lane_u64() {
631        let a = u64x1::new(1);
632        let b = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
633        let e = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
634        let r = u64x1::from(vcopy_lane_u64::<0, 0>(a.into(), b.into()));
635        assert_eq!(r, e);
636    }
637
638    #[simd_test(enable = "neon")]
639    fn test_vcopy_lane_p64() {
640        let a = u64x1::new(1);
641        let b = u64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
642        let e = u64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
643        let r = u64x1::from(vcopy_lane_p64::<0, 0>(a.into(), b.into()));
644        assert_eq!(r, e);
645    }
646
647    #[simd_test(enable = "neon")]
648    fn test_vcopy_lane_f64() {
649        let a = f64x1::from_array([1.]);
650        let b = f64x1::from_array([0.]);
651        let e = f64x1::from_array([0.]);
652        let r = f64x1::from(vcopy_lane_f64::<0, 0>(a.into(), b.into()));
653        assert_eq!(r, e);
654    }
655
656    #[simd_test(enable = "neon")]
657    fn test_vbsl_f64() {
658        let a = u64x1::new(0x8000000000000000);
659        let b = f64x1::new(-1.23f64);
660        let c = f64x1::new(2.34f64);
661        let e = f64x1::new(-2.34f64);
662        let r = f64x1::from(vbsl_f64(a.into(), b.into(), c.into()));
663        assert_eq!(r, e);
664    }
665
666    #[simd_test(enable = "neon")]
667    fn test_vbsl_p64() {
668        let a = u64x1::new(1);
669        let b = u64x1::new(u64::MAX);
670        let c = u64x1::new(u64::MIN);
671        let e = u64x1::new(1);
672        let r = u64x1::from(vbsl_p64(a.into(), b.into(), c.into()));
673        assert_eq!(r, e);
674    }
675
676    #[simd_test(enable = "neon")]
677    fn test_vbslq_f64() {
678        let a = u64x2::new(1, 0x8000000000000000);
679        let b = f64x2::new(f64::MAX, -1.23f64);
680        let c = f64x2::new(f64::MIN, 2.34f64);
681        let e = f64x2::new(f64::MIN, -2.34f64);
682        let r = f64x2::from(vbslq_f64(a.into(), b.into(), c.into()));
683        assert_eq!(r, e);
684    }
685
686    #[simd_test(enable = "neon")]
687    fn test_vbslq_p64() {
688        let a = u64x2::new(u64::MAX, 1);
689        let b = u64x2::new(u64::MAX, u64::MAX);
690        let c = u64x2::new(u64::MIN, u64::MIN);
691        let e = u64x2::new(u64::MAX, 1);
692        let r = u64x2::from(vbslq_p64(a.into(), b.into(), c.into()));
693        assert_eq!(r, e);
694    }
695
696    #[simd_test(enable = "neon")]
697    fn test_vld1_f64() {
698        let a: [f64; 2] = [0., 1.];
699        let e = f64x1::new(1.);
700        let r = unsafe { f64x1::from(vld1_f64(a[1..].as_ptr())) };
701        assert_eq!(r, e)
702    }
703
704    #[simd_test(enable = "neon")]
705    fn test_vld1q_f64() {
706        let a: [f64; 3] = [0., 1., 2.];
707        let e = f64x2::new(1., 2.);
708        let r = unsafe { f64x2::from(vld1q_f64(a[1..].as_ptr())) };
709        assert_eq!(r, e)
710    }
711
712    #[simd_test(enable = "neon")]
713    fn test_vld1_dup_f64() {
714        let a: [f64; 2] = [1., 42.];
715        let e = f64x1::new(42.);
716        let r = unsafe { f64x1::from(vld1_dup_f64(a[1..].as_ptr())) };
717        assert_eq!(r, e)
718    }
719
720    #[simd_test(enable = "neon")]
721    fn test_vld1q_dup_f64() {
722        let elem: f64 = 42.;
723        let e = f64x2::new(42., 42.);
724        let r = unsafe { f64x2::from(vld1q_dup_f64(&elem)) };
725        assert_eq!(r, e)
726    }
727
728    #[simd_test(enable = "neon")]
729    fn test_vld1_lane_f64() {
730        let a = f64x1::new(0.);
731        let elem: f64 = 42.;
732        let e = f64x1::new(42.);
733        let r = unsafe { f64x1::from(vld1_lane_f64::<0>(&elem, a.into())) };
734        assert_eq!(r, e)
735    }
736
737    #[simd_test(enable = "neon")]
738    fn test_vld1q_lane_f64() {
739        let a = f64x2::new(0., 1.);
740        let elem: f64 = 42.;
741        let e = f64x2::new(0., 42.);
742        let r = unsafe { f64x2::from(vld1q_lane_f64::<1>(&elem, a.into())) };
743        assert_eq!(r, e)
744    }
745
746    #[simd_test(enable = "neon")]
747    fn test_vst1_f64() {
748        let mut vals = [0_f64; 2];
749        let a = f64x1::new(1.);
750
751        unsafe {
752            vst1_f64(vals[1..].as_mut_ptr(), a.into());
753        }
754
755        assert_eq!(vals[0], 0.);
756        assert_eq!(vals[1], 1.);
757    }
758
759    #[simd_test(enable = "neon")]
760    fn test_vst1q_f64() {
761        let mut vals = [0_f64; 3];
762        let a = f64x2::new(1., 2.);
763
764        unsafe {
765            vst1q_f64(vals[1..].as_mut_ptr(), a.into());
766        }
767
768        assert_eq!(vals[0], 0.);
769        assert_eq!(vals[1], 1.);
770        assert_eq!(vals[2], 2.);
771    }
772
    // Expands to the body of a store/load round-trip check.
    //
    // Arguments: element type, element count, multi-vector type, store
    // intrinsic, load intrinsic. The expansion calls `transmute`,
    // `assume_init` and the intrinsics directly, so it must be placed inside
    // an unsafe context (the wrapper macros below use an `unsafe fn`).
    macro_rules! wide_store_load_roundtrip {
        ($elem_ty:ty, $len:expr, $vec_ty:ty, $store:expr, $load:expr) => {
            // Element `i` of the input holds `i` cast to the element type.
            let vals: [$elem_ty; $len] = crate::array::from_fn(|i| i as $elem_ty);
            let a: $vec_ty = transmute(vals);
            // Store into deliberately uninitialized memory so that a store
            // which skips elements is detectable.
            let mut tmp = core::mem::MaybeUninit::<[$elem_ty; $len]>::uninit();
            $store(tmp.as_mut_ptr().cast(), a);

            // With Miri this will check that all elements were initialized.
            let tmp = tmp.assume_init();

            // Load the stored data back and compare it with the input.
            let r: $vec_ty = $load(tmp.as_ptr().cast());
            let out: [$elem_ty; $len] = transmute(r);
            assert_eq!(out, vals);
        };
    }
788
    // Generates one test per `name(args);` entry. Each test enables
    // `neon,fp16`, is ignored under Miri, is compiled out on `arm64ec`, and
    // has `wide_store_load_roundtrip!` applied to the entry's arguments as
    // its body.
    macro_rules! wide_store_load_roundtrip_fp16 {
        ($( $name:ident $args:tt);* $(;)?) => {
            $(
                #[cfg_attr(miri, ignore)] // uses unsupported vendor intrinsics
                #[simd_test(enable = "neon,fp16")]
                #[cfg(not(target_arch = "arm64ec"))]
                unsafe fn $name() {
                    wide_store_load_roundtrip! $args;
                }
            )*
        };
    }
801
    // Round-trip tests for the `f16` multi-vector intrinsics. Each entry is
    // `test_name(element type, element count, vector type, store fn, load fn)`.
    wide_store_load_roundtrip_fp16! {
        test_vld1_f16_x2(f16, 8, float16x4x2_t, vst1_f16_x2, vld1_f16_x2);
        test_vld1_f16_x3(f16, 12, float16x4x3_t, vst1_f16_x3, vld1_f16_x3);
        test_vld1_f16_x4(f16, 16, float16x4x4_t, vst1_f16_x4, vld1_f16_x4);

        test_vld1q_f16_x2(f16, 16, float16x8x2_t, vst1q_f16_x2, vld1q_f16_x2);
        test_vld1q_f16_x3(f16, 24, float16x8x3_t, vst1q_f16_x3, vld1q_f16_x3);
        test_vld1q_f16_x4(f16, 32, float16x8x4_t, vst1q_f16_x4, vld1q_f16_x4);

        test_vld2_f16(f16, 8, float16x4x2_t, vst2_f16, vld2_f16);
        test_vld3_f16(f16, 12, float16x4x3_t, vst3_f16, vld3_f16);
        test_vld4_f16(f16, 16, float16x4x4_t, vst4_f16, vld4_f16);

        test_vld2q_f16(f16, 16, float16x8x2_t, vst2q_f16, vld2q_f16);
        test_vld3q_f16(f16, 24, float16x8x3_t, vst3q_f16, vld3q_f16);
        test_vld4q_f16(f16, 32, float16x8x4_t, vst4q_f16, vld4q_f16);
    }
819
    // Generates one test per `name(args);` entry. Each test enables
    // `neon,aes` and has `wide_store_load_roundtrip!` applied to the entry's
    // arguments as its body.
    macro_rules! wide_store_load_roundtrip_aes {
        ($( $name:ident $args:tt);* $(;)?) => {
            $(
                #[simd_test(enable = "neon,aes")]
                unsafe fn $name() {
                    wide_store_load_roundtrip! $args;
                }
            )*
        };
    }
830
    // Round-trip tests for the `p64` multi-vector intrinsics. Each entry is
    // `test_name(element type, element count, vector type, store fn, load fn)`.
    wide_store_load_roundtrip_aes! {
        test_vld1_p64_x2(p64, 2, poly64x1x2_t, vst1_p64_x2, vld1_p64_x2);
        test_vld1_p64_x3(p64, 3, poly64x1x3_t, vst1_p64_x3, vld1_p64_x3);
        test_vld1_p64_x4(p64, 4, poly64x1x4_t, vst1_p64_x4, vld1_p64_x4);

        test_vld1q_p64_x2(p64, 4, poly64x2x2_t, vst1q_p64_x2, vld1q_p64_x2);
        test_vld1q_p64_x3(p64, 6, poly64x2x3_t, vst1q_p64_x3, vld1q_p64_x3);
        test_vld1q_p64_x4(p64, 8, poly64x2x4_t, vst1q_p64_x4, vld1q_p64_x4);
    }
840
    // Generates one test per `name(args);` entry. Each test enables `neon`
    // and has `wide_store_load_roundtrip!` applied to the entry's arguments
    // as its body.
    macro_rules! wide_store_load_roundtrip_neon {
        ($( $name:ident $args:tt);* $(;)?) => {
            $(
                #[simd_test(enable = "neon")]
                unsafe fn $name() {
                    wide_store_load_roundtrip! $args;
                }
            )*
        };
    }
851
852    wide_store_load_roundtrip_neon! {
853        test_vld1_f32_x2(f32, 4, float32x2x2_t, vst1_f32_x2, vld1_f32_x2);
854        test_vld1_f32_x3(f32, 6, float32x2x3_t, vst1_f32_x3, vld1_f32_x3);
855        test_vld1_f32_x4(f32, 8, float32x2x4_t, vst1_f32_x4, vld1_f32_x4);
856
857        test_vld1q_f32_x2(f32, 8, float32x4x2_t, vst1q_f32_x2, vld1q_f32_x2);
858        test_vld1q_f32_x3(f32, 12, float32x4x3_t, vst1q_f32_x3, vld1q_f32_x3);
859        test_vld1q_f32_x4(f32, 16, float32x4x4_t, vst1q_f32_x4, vld1q_f32_x4);
860
861        test_vld1_f64_x2(f64, 2, float64x1x2_t, vst1_f64_x2, vld1_f64_x2);
862        test_vld1_f64_x3(f64, 3, float64x1x3_t, vst1_f64_x3, vld1_f64_x3);
863        test_vld1_f64_x4(f64, 4, float64x1x4_t, vst1_f64_x4, vld1_f64_x4);
864
865        test_vld1q_f64_x2(f64, 4, float64x2x2_t, vst1q_f64_x2, vld1q_f64_x2);
866        test_vld1q_f64_x3(f64, 6, float64x2x3_t, vst1q_f64_x3, vld1q_f64_x3);
867        test_vld1q_f64_x4(f64, 8, float64x2x4_t, vst1q_f64_x4, vld1q_f64_x4);
868
869        test_vld1_s8_x2(i8, 16, int8x8x2_t, vst1_s8_x2, vld1_s8_x2);
870        test_vld1_s8_x3(i8, 24, int8x8x3_t, vst1_s8_x3, vld1_s8_x3);
871        test_vld1_s8_x4(i8, 32, int8x8x4_t, vst1_s8_x4, vld1_s8_x4);
872
873        test_vld1q_s8_x2(i8, 32, int8x16x2_t, vst1q_s8_x2, vld1q_s8_x2);
874        test_vld1q_s8_x3(i8, 48, int8x16x3_t, vst1q_s8_x3, vld1q_s8_x3);
875        test_vld1q_s8_x4(i8, 64, int8x16x4_t, vst1q_s8_x4, vld1q_s8_x4);
876
877        test_vld1_s16_x2(i16, 8, int16x4x2_t, vst1_s16_x2, vld1_s16_x2);
878        test_vld1_s16_x3(i16, 12, int16x4x3_t, vst1_s16_x3, vld1_s16_x3);
879        test_vld1_s16_x4(i16, 16, int16x4x4_t, vst1_s16_x4, vld1_s16_x4);
880
881        test_vld1q_s16_x2(i16, 16, int16x8x2_t, vst1q_s16_x2, vld1q_s16_x2);
882        test_vld1q_s16_x3(i16, 24, int16x8x3_t, vst1q_s16_x3, vld1q_s16_x3);
883        test_vld1q_s16_x4(i16, 32, int16x8x4_t, vst1q_s16_x4, vld1q_s16_x4);
884
885        test_vld1_s32_x2(i32, 4, int32x2x2_t, vst1_s32_x2, vld1_s32_x2);
886        test_vld1_s32_x3(i32, 6, int32x2x3_t, vst1_s32_x3, vld1_s32_x3);
887        test_vld1_s32_x4(i32, 8, int32x2x4_t, vst1_s32_x4, vld1_s32_x4);
888
889        test_vld1q_s32_x2(i32, 8, int32x4x2_t, vst1q_s32_x2, vld1q_s32_x2);
890        test_vld1q_s32_x3(i32, 12, int32x4x3_t, vst1q_s32_x3, vld1q_s32_x3);
891        test_vld1q_s32_x4(i32, 16, int32x4x4_t, vst1q_s32_x4, vld1q_s32_x4);
892
893        test_vld1_s64_x2(i64, 2, int64x1x2_t, vst1_s64_x2, vld1_s64_x2);
894        test_vld1_s64_x3(i64, 3, int64x1x3_t, vst1_s64_x3, vld1_s64_x3);
895        test_vld1_s64_x4(i64, 4, int64x1x4_t, vst1_s64_x4, vld1_s64_x4);
896
897        test_vld1q_s64_x2(i64, 4, int64x2x2_t, vst1q_s64_x2, vld1q_s64_x2);
898        test_vld1q_s64_x3(i64, 6, int64x2x3_t, vst1q_s64_x3, vld1q_s64_x3);
899        test_vld1q_s64_x4(i64, 8, int64x2x4_t, vst1q_s64_x4, vld1q_s64_x4);
900
901        test_vld1_u8_x2(u8, 16, uint8x8x2_t, vst1_u8_x2, vld1_u8_x2);
902        test_vld1_u8_x3(u8, 24, uint8x8x3_t, vst1_u8_x3, vld1_u8_x3);
903        test_vld1_u8_x4(u8, 32, uint8x8x4_t, vst1_u8_x4, vld1_u8_x4);
904
905        test_vld1q_u8_x2(u8, 32, uint8x16x2_t, vst1q_u8_x2, vld1q_u8_x2);
906        test_vld1q_u8_x3(u8, 48, uint8x16x3_t, vst1q_u8_x3, vld1q_u8_x3);
907        test_vld1q_u8_x4(u8, 64, uint8x16x4_t, vst1q_u8_x4, vld1q_u8_x4);
908
909        test_vld1_u16_x2(u16, 8, uint16x4x2_t, vst1_u16_x2, vld1_u16_x2);
910        test_vld1_u16_x3(u16, 12, uint16x4x3_t, vst1_u16_x3, vld1_u16_x3);
911        test_vld1_u16_x4(u16, 16, uint16x4x4_t, vst1_u16_x4, vld1_u16_x4);
912
913        test_vld1q_u16_x2(u16, 16, uint16x8x2_t, vst1q_u16_x2, vld1q_u16_x2);
914        test_vld1q_u16_x3(u16, 24, uint16x8x3_t, vst1q_u16_x3, vld1q_u16_x3);
915        test_vld1q_u16_x4(u16, 32, uint16x8x4_t, vst1q_u16_x4, vld1q_u16_x4);
916
917        test_vld1_u32_x2(u32, 4, uint32x2x2_t, vst1_u32_x2, vld1_u32_x2);
918        test_vld1_u32_x3(u32, 6, uint32x2x3_t, vst1_u32_x3, vld1_u32_x3);
919        test_vld1_u32_x4(u32, 8, uint32x2x4_t, vst1_u32_x4, vld1_u32_x4);
920
921        test_vld1q_u32_x2(u32, 8, uint32x4x2_t, vst1q_u32_x2, vld1q_u32_x2);
922        test_vld1q_u32_x3(u32, 12, uint32x4x3_t, vst1q_u32_x3, vld1q_u32_x3);
923        test_vld1q_u32_x4(u32, 16, uint32x4x4_t, vst1q_u32_x4, vld1q_u32_x4);
924
925        test_vld1_u64_x2(u64, 2, uint64x1x2_t, vst1_u64_x2, vld1_u64_x2);
926        test_vld1_u64_x3(u64, 3, uint64x1x3_t, vst1_u64_x3, vld1_u64_x3);
927        test_vld1_u64_x4(u64, 4, uint64x1x4_t, vst1_u64_x4, vld1_u64_x4);
928
929        test_vld1q_u64_x2(u64, 4, uint64x2x2_t, vst1q_u64_x2, vld1q_u64_x2);
930        test_vld1q_u64_x3(u64, 6, uint64x2x3_t, vst1q_u64_x3, vld1q_u64_x3);
931        test_vld1q_u64_x4(u64, 8, uint64x2x4_t, vst1q_u64_x4, vld1q_u64_x4);
932
933        test_vld1_p8_x2(p8, 16, poly8x8x2_t, vst1_p8_x2, vld1_p8_x2);
934        test_vld1_p8_x3(p8, 24, poly8x8x3_t, vst1_p8_x3, vld1_p8_x3);
935        test_vld1_p8_x4(p8, 32, poly8x8x4_t, vst1_p8_x4, vld1_p8_x4);
936
937        test_vld1q_p8_x2(p8, 32, poly8x16x2_t, vst1q_p8_x2, vld1q_p8_x2);
938        test_vld1q_p8_x3(p8, 48, poly8x16x3_t, vst1q_p8_x3, vld1q_p8_x3);
939        test_vld1q_p8_x4(p8, 64, poly8x16x4_t, vst1q_p8_x4, vld1q_p8_x4);
940
941        test_vld1_p16_x2(p16, 8, poly16x4x2_t, vst1_p16_x2, vld1_p16_x2);
942        test_vld1_p16_x3(p16, 12, poly16x4x3_t, vst1_p16_x3, vld1_p16_x3);
943        test_vld1_p16_x4(p16, 16, poly16x4x4_t, vst1_p16_x4, vld1_p16_x4);
944
945        test_vld1q_p16_x2(p16, 16, poly16x8x2_t, vst1q_p16_x2, vld1q_p16_x2);
946        test_vld1q_p16_x3(p16, 24, poly16x8x3_t, vst1q_p16_x3, vld1q_p16_x3);
947        test_vld1q_p16_x4(p16, 32, poly16x8x4_t, vst1q_p16_x4, vld1q_p16_x4);
948    }
949
950    wide_store_load_roundtrip_neon! {
        // Round-trip test table pairing each `vst2*` / `vst3*` / `vst4*` store with
        // the `vld2*` / `vld3*` / `vld4*` load that has the same suffix.
        //
        // Entry layout: test name, then (element type, element count, vector tuple
        // type, store intrinsic, load intrinsic). In every entry the element count
        // equals the product of the two trailing factors in the tuple type name,
        // e.g. `float32x2x3_t` -> 2 * 3 = 6 and `int8x16x4_t` -> 16 * 4 = 64.
        //
        // NOTE(review): `wide_store_load_roundtrip_neon!` is defined outside this
        // part of the file; presumably each entry expands to one test function that
        // stores the tuple, loads it back and compares - confirm at the definition.
951        test_vld2_f32(f32, 4, float32x2x2_t, vst2_f32, vld2_f32);
952        test_vld3_f32(f32, 6, float32x2x3_t, vst3_f32, vld3_f32);
953        test_vld4_f32(f32, 8, float32x2x4_t, vst4_f32, vld4_f32);
954
955        test_vld2q_f32(f32, 8, float32x4x2_t, vst2q_f32, vld2q_f32);
956        test_vld3q_f32(f32, 12, float32x4x3_t, vst3q_f32, vld3q_f32);
957        test_vld4q_f32(f32, 16, float32x4x4_t, vst4q_f32, vld4q_f32);
958
959        test_vld2_f64(f64, 2, float64x1x2_t, vst2_f64, vld2_f64);
960        test_vld3_f64(f64, 3, float64x1x3_t, vst3_f64, vld3_f64);
961        test_vld4_f64(f64, 4, float64x1x4_t, vst4_f64, vld4_f64);
962
963        test_vld2q_f64(f64, 4, float64x2x2_t, vst2q_f64, vld2q_f64);
964        test_vld3q_f64(f64, 6, float64x2x3_t, vst3q_f64, vld3q_f64);
965        test_vld4q_f64(f64, 8, float64x2x4_t, vst4q_f64, vld4q_f64);
966
967        test_vld2_s8(i8, 16, int8x8x2_t, vst2_s8, vld2_s8);
968        test_vld3_s8(i8, 24, int8x8x3_t, vst3_s8, vld3_s8);
969        test_vld4_s8(i8, 32, int8x8x4_t, vst4_s8, vld4_s8);
970
971        test_vld2q_s8(i8, 32, int8x16x2_t, vst2q_s8, vld2q_s8);
972        test_vld3q_s8(i8, 48, int8x16x3_t, vst3q_s8, vld3q_s8);
973        test_vld4q_s8(i8, 64, int8x16x4_t, vst4q_s8, vld4q_s8);
974
975        test_vld2_s16(i16, 8, int16x4x2_t, vst2_s16, vld2_s16);
976        test_vld3_s16(i16, 12, int16x4x3_t, vst3_s16, vld3_s16);
977        test_vld4_s16(i16, 16, int16x4x4_t, vst4_s16, vld4_s16);
978
979        test_vld2q_s16(i16, 16, int16x8x2_t, vst2q_s16, vld2q_s16);
980        test_vld3q_s16(i16, 24, int16x8x3_t, vst3q_s16, vld3q_s16);
981        test_vld4q_s16(i16, 32, int16x8x4_t, vst4q_s16, vld4q_s16);
982
983        test_vld2_s32(i32, 4, int32x2x2_t, vst2_s32, vld2_s32);
984        test_vld3_s32(i32, 6, int32x2x3_t, vst3_s32, vld3_s32);
985        test_vld4_s32(i32, 8, int32x2x4_t, vst4_s32, vld4_s32);
986
987        test_vld2q_s32(i32, 8, int32x4x2_t, vst2q_s32, vld2q_s32);
988        test_vld3q_s32(i32, 12, int32x4x3_t, vst3q_s32, vld3q_s32);
989        test_vld4q_s32(i32, 16, int32x4x4_t, vst4q_s32, vld4q_s32);
990
991        test_vld2_s64(i64, 2, int64x1x2_t, vst2_s64, vld2_s64);
992        test_vld3_s64(i64, 3, int64x1x3_t, vst3_s64, vld3_s64);
993        test_vld4_s64(i64, 4, int64x1x4_t, vst4_s64, vld4_s64);
994
995        test_vld2q_s64(i64, 4, int64x2x2_t, vst2q_s64, vld2q_s64);
996        test_vld3q_s64(i64, 6, int64x2x3_t, vst3q_s64, vld3q_s64);
997        test_vld4q_s64(i64, 8, int64x2x4_t, vst4q_s64, vld4q_s64);
998
999        test_vld2_u8(u8, 16, uint8x8x2_t, vst2_u8, vld2_u8);
1000        test_vld3_u8(u8, 24, uint8x8x3_t, vst3_u8, vld3_u8);
1001        test_vld4_u8(u8, 32, uint8x8x4_t, vst4_u8, vld4_u8);
1002
1003        test_vld2q_u8(u8, 32, uint8x16x2_t, vst2q_u8, vld2q_u8);
1004        test_vld3q_u8(u8, 48, uint8x16x3_t, vst3q_u8, vld3q_u8);
1005        test_vld4q_u8(u8, 64, uint8x16x4_t, vst4q_u8, vld4q_u8);
1006
1007        test_vld2_u16(u16, 8, uint16x4x2_t, vst2_u16, vld2_u16);
1008        test_vld3_u16(u16, 12, uint16x4x3_t, vst3_u16, vld3_u16);
1009        test_vld4_u16(u16, 16, uint16x4x4_t, vst4_u16, vld4_u16);
1010
1011        test_vld2q_u16(u16, 16, uint16x8x2_t, vst2q_u16, vld2q_u16);
1012        test_vld3q_u16(u16, 24, uint16x8x3_t, vst3q_u16, vld3q_u16);
1013        test_vld4q_u16(u16, 32, uint16x8x4_t, vst4q_u16, vld4q_u16);
1014
1015        test_vld2_u32(u32, 4, uint32x2x2_t, vst2_u32, vld2_u32);
1016        test_vld3_u32(u32, 6, uint32x2x3_t, vst3_u32, vld3_u32);
1017        test_vld4_u32(u32, 8, uint32x2x4_t, vst4_u32, vld4_u32);
1018
1019        test_vld2q_u32(u32, 8, uint32x4x2_t, vst2q_u32, vld2q_u32);
1020        test_vld3q_u32(u32, 12, uint32x4x3_t, vst3q_u32, vld3q_u32);
1021        test_vld4q_u32(u32, 16, uint32x4x4_t, vst4q_u32, vld4q_u32);
1022
1023        test_vld2_u64(u64, 2, uint64x1x2_t, vst2_u64, vld2_u64);
1024        test_vld3_u64(u64, 3, uint64x1x3_t, vst3_u64, vld3_u64);
1025        test_vld4_u64(u64, 4, uint64x1x4_t, vst4_u64, vld4_u64);
1026
1027        test_vld2q_u64(u64, 4, uint64x2x2_t, vst2q_u64, vld2q_u64);
1028        test_vld3q_u64(u64, 6, uint64x2x3_t, vst3q_u64, vld3q_u64);
1029        test_vld4q_u64(u64, 8, uint64x2x4_t, vst4q_u64, vld4q_u64);
1030
1031        test_vld2_p8(p8, 16, poly8x8x2_t, vst2_p8, vld2_p8);
1032        test_vld3_p8(p8, 24, poly8x8x3_t, vst3_p8, vld3_p8);
1033        test_vld4_p8(p8, 32, poly8x8x4_t, vst4_p8, vld4_p8);
1034
1035        test_vld2q_p8(p8, 32, poly8x16x2_t, vst2q_p8, vld2q_p8);
1036        test_vld3q_p8(p8, 48, poly8x16x3_t, vst3q_p8, vld3q_p8);
1037        test_vld4q_p8(p8, 64, poly8x16x4_t, vst4q_p8, vld4q_p8);
1038
1039        test_vld2_p16(p16, 8, poly16x4x2_t, vst2_p16, vld2_p16);
1040        test_vld3_p16(p16, 12, poly16x4x3_t, vst3_p16, vld3_p16);
1041        test_vld4_p16(p16, 16, poly16x4x4_t, vst4_p16, vld4_p16);
1042
1043        test_vld2q_p16(p16, 16, poly16x8x2_t, vst2q_p16, vld2q_p16);
1044        test_vld3q_p16(p16, 24, poly16x8x3_t, vst3q_p16, vld3q_p16);
1045        test_vld4q_p16(p16, 32, poly16x8x4_t, vst4q_p16, vld4q_p16);
1046    }
1047
1048    macro_rules! lane_wide_store_load_roundtrip {
1049        ($elem_ty:ty, $len:expr, $idx:expr, $vec_ty:ty, $store:ident, $load:ident) => {
1050            let vals: [$elem_ty; $len] = crate::array::from_fn(|i| i as $elem_ty);
1051            let a: $vec_ty = transmute(vals);
1052            let mut tmp = [0 as $elem_ty; 4];
1053            $store::<$idx>(tmp.as_mut_ptr().cast(), a);
1054            let r: $vec_ty = $load::<$idx>(tmp.as_ptr().cast(), a);
1055            let out: [$elem_ty; $len] = transmute(r);
1056            assert_eq!(out, vals);
1057        };
1058    }
1059
1060    macro_rules! lane_wide_store_load_roundtrip_neon {
1061        ($( $name:ident $args:tt);* $(;)?) => {
1062            $(
1063                #[cfg_attr(miri, ignore)] // uses unsupported vendor intrinsics
1064                #[simd_test(enable = "neon")]
1065                unsafe fn $name() {
1066                    lane_wide_store_load_roundtrip! $args;
1067                }
1068            )*
1069        };
1070    }
1071
1072    lane_wide_store_load_roundtrip_neon! {
        // Lane-variant round-trip table. Entry layout: test name, then (element
        // type, element count, lane index, vector tuple type, store intrinsic,
        // load intrinsic). The lane index becomes the const generic argument of
        // both intrinsics in `lane_wide_store_load_roundtrip!`.
        //
        // Every entry uses the highest index implied by the tuple type name:
        // 15 for the `*x16x*` types, 1 for `*x2x*` and 0 for `*x1x*`.
1073        test_vld2q_lane_s8(i8, 32, 15, int8x16x2_t, vst2q_lane_s8, vld2q_lane_s8);
1074        test_vld3q_lane_s8(i8, 48, 15, int8x16x3_t, vst3q_lane_s8, vld3q_lane_s8);
1075        test_vld4q_lane_s8(i8, 64, 15, int8x16x4_t, vst4q_lane_s8, vld4q_lane_s8);
1076
1077        test_vld2q_lane_u8(u8, 32, 15, uint8x16x2_t, vst2q_lane_u8, vld2q_lane_u8);
1078        test_vld3q_lane_u8(u8, 48, 15, uint8x16x3_t, vst3q_lane_u8, vld3q_lane_u8);
1079        test_vld4q_lane_u8(u8, 64, 15, uint8x16x4_t, vst4q_lane_u8, vld4q_lane_u8);
1080
1081        test_vld2_lane_s64(i64, 2, 0, int64x1x2_t, vst2_lane_s64, vld2_lane_s64);
1082        test_vld3_lane_s64(i64, 3, 0, int64x1x3_t, vst3_lane_s64, vld3_lane_s64);
1083        test_vld4_lane_s64(i64, 4, 0, int64x1x4_t, vst4_lane_s64, vld4_lane_s64);
1084        test_vld2q_lane_s64(i64, 4, 1, int64x2x2_t, vst2q_lane_s64, vld2q_lane_s64);
1085        test_vld3q_lane_s64(i64, 6, 1, int64x2x3_t, vst3q_lane_s64, vld3q_lane_s64);
1086        test_vld4q_lane_s64(i64, 8, 1, int64x2x4_t, vst4q_lane_s64, vld4q_lane_s64);
1087
1088        test_vld2_lane_u64(u64, 2, 0, uint64x1x2_t, vst2_lane_u64, vld2_lane_u64);
1089        test_vld3_lane_u64(u64, 3, 0, uint64x1x3_t, vst3_lane_u64, vld3_lane_u64);
1090        test_vld4_lane_u64(u64, 4, 0, uint64x1x4_t, vst4_lane_u64, vld4_lane_u64);
1091        test_vld2q_lane_u64(u64, 4, 1, uint64x2x2_t, vst2q_lane_u64, vld2q_lane_u64);
1092        test_vld3q_lane_u64(u64, 6, 1, uint64x2x3_t, vst3q_lane_u64, vld3q_lane_u64);
1093        test_vld4q_lane_u64(u64, 8, 1, uint64x2x4_t, vst4q_lane_u64, vld4q_lane_u64);
1094    }
1095}
1096
// Test-only modules (`#[cfg(test)]`) whose source files are located through
// `#[path]` under `../../arm_shared/neon/` instead of next to this file.
// NOTE(review): presumably these test files are shared with another NEON
// module that includes the same paths - confirm against `arm_shared`.
1097#[cfg(test)]
1098#[path = "../../arm_shared/neon/table_lookup_tests.rs"]
1099mod table_lookup_tests;
1100
1101#[cfg(test)]
1102#[path = "../../arm_shared/neon/shift_and_insert_tests.rs"]
1103mod shift_and_insert_tests;
1104
1105#[cfg(test)]
1106#[path = "../../arm_shared/neon/load_tests.rs"]
1107mod load_tests;
1108
1109#[cfg(test)]
1110#[path = "../../arm_shared/neon/store_tests.rs"]
1111mod store_tests;