1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
# mach: aarch64
# Check the store single N-element structure from one lane instructions:
# st1, st2, st3, st4 (N = 1, 2, 3, 4 registers respectively).
# Check the addressing modes: no offset, post-index immediate offset,
# post-index register offset.
.include "testutils.inc"
# Test data: `input` is the source pattern, `output` is the buffer the
# store instructions under test write into.
.data
# On AArch64 GAS the .align argument is a power of two, so this requests
# 16-byte alignment.  Both buffers are accessed with 16-byte q-register
# loads later in the file.
.align 4
# Source pattern: eight words, 32 bytes in total.  Assuming a little-endian
# target, the bytes in memory run 0x01, 0x02, ... 0x20 in ascending order,
# which is what the byte-sum checks later in the file rely on.
input:
.word 0x04030201
.word 0x08070605
.word 0x0c0b0a09
.word 0x100f0e0d
.word 0x14131211
.word 0x18171615
.word 0x1c1b1a19
.word 0x201f1e1d
# Destination scratch area: 64 zero-initialised bytes.  The largest user is
# the st4 check, whose four stores write 16 bytes each.
output:
.zero 64
# Test entry.  `start` is not defined in this file; presumably it is a macro
# from testutils.inc that opens the code section and the entry label.
start
# Form the buffer addresses: x0 = &input, x1 = &output
# (adrp gives the 4 KiB page, :lo12: adds the offset within the page).
adrp x0, input
add x0, x0, :lo12:input
adrp x1, output
add x1, x1, :lo12:output
# Load the source vectors through scratch pointer x2 so that x0 is kept:
# v0 = input[0..15], v1 = input[16..31].
mov x2, x0
ldr q0, [x2], 16
ldr q1, [x2]
# Second pass over the same 32 bytes: v2 duplicates v0 and v3 duplicates v1,
# giving four source registers for the st3 and st4 checks.
mov x2, x0
ldr q2, [x2], 16
ldr q3, [x2]
# --- st1: store one element from one lane of one register ---
# x2 walks the output buffer.  x3 and x4 hold the register post-index
# amounts (1 and 4 bytes), equal to the size of the element being stored.
mov x2, x1
mov x3, #1
mov x4, #4
# Lanes b[0], b[1], h[1], s[1], d[1] of v0 cover byte offsets 0, 1, 2-3,
# 4-7 and 8-15, so the five stores together rebuild all 16 bytes of v0 in
# output[0..15].  The addressing mode alternates between immediate and
# register post-index; the final store uses no offset.
st1 {v0.b}[0], [x2], 1
st1 {v0.b}[1], [x2], x3
st1 {v0.h}[1], [x2], 2
st1 {v0.s}[1], [x2], x4
st1 {v0.d}[1], [x2]
# Verify with a byte checksum: addv adds the 16 bytes of output[0..15] into
# the 8-bit scalar b4, which is then copied to x5 for the compare.
# Assuming the little-endian layout 0x01..0x10, the expected sum is
# the total of 1 through 16, i.e. 136.
ldr q4, [x1]
addv b4, v4.16b
mov x5, v4.d[0]
cmp x5, #136
bne .Lfailure
# --- st2: store one lane from each of two registers, interleaved ---
# x2 is rewound to the start of output.  x3 and x4 hold the register
# post-index amounts: 16 bytes for the .d pair and 4 bytes for the .h pair.
mov x2, x1
mov x3, #16
mov x4, #4
# Each store writes the selected lane of v0 followed by the same lane of v1,
# then advances x2 by the total size written (2 x element size).  Element
# sizes d, s, h, b, b give 16 + 8 + 4 + 2 + 2 = 32 bytes of output.
# Addressing: register, immediate, register, immediate, no offset.
st2 {v0.d, v1.d}[0], [x2], x3
st2 {v0.s, v1.s}[2], [x2], 8
st2 {v0.h, v1.h}[6], [x2], x4
st2 {v0.b, v1.b}[14], [x2], 2
st2 {v0.b, v1.b}[15], [x2]
# Read back the 32 bytes as two vectors and reduce each to a byte checksum.
mov x2, x1
ldr q4, [x2], 16
ldr q5, [x2]
addv b4, v4.16b
addv b5, v5.16b
mov x5, v4.d[0]
mov x6, v5.d[0]
# Expected sums, assuming the little-endian byte layout 0x01..0x20:
#   output[0..15]  holds bytes 1..8 and 17..24 -> 36 + 164 = 200
#   output[16..31] holds the remaining lanes   -> 328, and since addv keeps
#                  only 8 bits the observed value is 328 - 256 = 72
cmp x5, #200
bne .Lfailure
cmp x6, #72
bne .Lfailure
# --- st3: store one lane from each of three registers, interleaved ---
# x2 is rewound to the start of output; x3 = 12 is the register post-index
# amount (three 4-byte elements per store).
mov x2, x1
mov x3, #12
# Store word lanes 0..3 of v0, v1, v2 in turn: 4 stores x 12 bytes fill
# output[0..47].  Addressing: immediate, register, immediate, no offset.
# Note that v2 was loaded from the same input bytes as v0.
st3 {v0.s, v1.s, v2.s}[0], [x2], 12
st3 {v0.s, v1.s, v2.s}[1], [x2], x3
st3 {v0.s, v1.s, v2.s}[2], [x2], 12
st3 {v0.s, v1.s, v2.s}[3], [x2]
# Read back the 48 bytes as three vectors and reduce each to a byte checksum.
mov x2, x1
ldr q4, [x2], 16
ldr q5, [x2], 16
ldr q6, [x2]
addv b4, v4.16b
addv b5, v5.16b
addv b6, v6.16b
mov x4, v4.d[0]
mov x5, v5.d[0]
mov x6, v6.d[0]
# Expected sums, assuming the little-endian byte layout 0x01..0x20
# (addv keeps only the low 8 bits of each total):
#   output[0..15]  -> 10 + 74 + 10 + 26   = 120
#   output[16..31] -> 90 + 26 + 42 + 106  = 264 -> 8
#   output[32..47] -> 42 + 58 + 122 + 58  = 280 -> 24
cmp x4, #120
bne .Lfailure
cmp x5, #8
bne .Lfailure
cmp x6, #24
bne .Lfailure
# --- st4: store one lane from each of four registers, interleaved ---
# x2 is rewound to the start of output; x3 = 16 is the register post-index
# amount (four 4-byte elements per store).
mov x2, x1
mov x3, #16
# Store word lanes 0..3 of v0, v1, v2, v3 in turn: 4 stores x 16 bytes fill
# all 64 bytes of output.  Addressing: immediate, register, immediate,
# no offset.  v2 and v3 were loaded from the same input bytes as v0 and v1.
st4 {v0.s, v1.s, v2.s, v3.s}[0], [x2], 16
st4 {v0.s, v1.s, v2.s, v3.s}[1], [x2], x3
st4 {v0.s, v1.s, v2.s, v3.s}[2], [x2], 16
st4 {v0.s, v1.s, v2.s, v3.s}[3], [x2]
# Read back the 64 bytes as four vectors; each vector is exactly the output
# of one st4 store.  Reduce each to a byte checksum.
mov x2, x1
ldr q4, [x2], 16
ldr q5, [x2], 16
ldr q6, [x2], 16
ldr q7, [x2]
addv b4, v4.16b
addv b5, v5.16b
addv b6, v6.16b
addv b7, v7.16b
mov x4, v4.d[0]
mov x5, v5.d[0]
mov x6, v6.d[0]
mov x7, v7.d[0]
# Expected sums, assuming the little-endian byte layout 0x01..0x20
# (addv keeps only the low 8 bits of each total):
#   lane 0 -> 2 x (10 + 74)  = 168
#   lane 1 -> 2 x (26 + 90)  = 232
#   lane 2 -> 2 x (42 + 106) = 296 -> 40
#   lane 3 -> 2 x (58 + 122) = 360 -> 104
cmp x4, #168
bne .Lfailure
cmp x5, #232
bne .Lfailure
cmp x6, #40
bne .Lfailure
cmp x7, #104
bne .Lfailure
# Reached only when every checksum matched.  `pass` and `fail` are not
# defined in this file; presumably they are testutils.inc macros that report
# the result and terminate the test -- confirm against testutils.inc.
pass
# Common branch target for any checksum mismatch above.
.Lfailure:
fail
|