1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
|
/* { dg-do compile } */
/* { dg-skip-if "Require optimisation to compile DCE tests" { *-*-* } { "-O0" } { "" } } */
/* { dg-require-effective-target arm_v8m_main_cde_ok } */
/* { dg-add-options arm_v8m_main_cde } */
/* { dg-final { check-function-bodies "**" "" } } */
/* These are the scalar intrinsics.
uint32_t __arm_cx1(int coproc, uint32_t imm);
uint32_t __arm_cx1a(int coproc, uint32_t acc, uint32_t imm);
uint32_t __arm_cx2(int coproc, uint32_t n, uint32_t imm);
uint32_t __arm_cx2a(int coproc, uint32_t acc, uint32_t n, uint32_t imm);
uint32_t __arm_cx3(int coproc, uint32_t n, uint32_t m, uint32_t imm);
uint32_t __arm_cx3a(int coproc, uint32_t acc, uint32_t n, uint32_t m, uint32_t imm);
uint64_t __arm_cx1d(int coproc, uint32_t imm);
uint64_t __arm_cx1da(int coproc, uint64_t acc, uint32_t imm);
uint64_t __arm_cx2d(int coproc, uint32_t n, uint32_t imm);
uint64_t __arm_cx2da(int coproc, uint64_t acc, uint32_t n, uint32_t imm);
uint64_t __arm_cx3d(int coproc, uint32_t n, uint32_t m, uint32_t imm);
uint64_t __arm_cx3da(int coproc, uint64_t acc, uint32_t n, uint32_t m, uint32_t imm); */
#include "arm_cde.h"
/* Define a function named test_cde_<name> that calls the intrinsic
   __arm_<name> once and returns its result.

   name        intrinsic suffix, pasted into both the function name and the
               intrinsic name.
   accum_type  type of the local accumulator and of the function's return
               value.
   arguments   the complete, parenthesised argument list for the intrinsic.
               It may refer to the parameters `n' and `m' and to the local
               `accum', which is zero when the call is made.

   Both parameters carry the unused attribute because some argument lists
   reference neither of them.  The intrinsic's result is added to the
   zero-initialised accumulator and that sum is returned.  */
#define TEST_CDE_SCALAR_INTRINSIC(name, accum_type, arguments) \
accum_type test_cde_##name (__attribute__ ((unused)) uint32_t n, \
__attribute__ ((unused)) uint32_t m) \
{ \
accum_type accum = 0; \
accum += __arm_##name arguments; \
return accum; \
}
/* Basic test that we produce the assembly as expected. */
/*
** test_cde_cx1:
** cx1 p0, r0, #33
** bx lr
*/
/* cx1: coprocessor 0, immediate 33; uses neither n nor m.  */
TEST_CDE_SCALAR_INTRINSIC (cx1, uint32_t, (0, 33))
/*
** test_cde_cx1a:
** movs r0, #0
** cx1a p0, r0, #33
** bx lr
*/
/* cx1a: coprocessor 0, immediate 33, with the macro's zero-initialised
   accum passed as the accumulator argument.  */
TEST_CDE_SCALAR_INTRINSIC (cx1a, uint32_t, (0, accum, 33))
/*
** test_cde_cx2:
** cx2 p0, r0, r0, #33
** bx lr
*/
/* cx2: coprocessor 0, input n, immediate 33.  */
TEST_CDE_SCALAR_INTRINSIC (cx2, uint32_t, (0, n, 33))
/*
** test_cde_cx2a:
** movs (r[0-9]+), #0
** cx2a p0, \1, r0, #33
** mov r0, \1
** bx lr
*/
/* cx2a: coprocessor 0, accumulator accum (zero), input n, immediate 33.  */
TEST_CDE_SCALAR_INTRINSIC (cx2a, uint32_t, (0, accum, n, 33))
/*
** test_cde_cx3:
** cx3 p0, r0, r0, r1, #33
** bx lr
*/
/* cx3: coprocessor 0, inputs n and m, immediate 33.  */
TEST_CDE_SCALAR_INTRINSIC (cx3, uint32_t, (0, n, m, 33))
/*
** test_cde_cx3a:
** movs (r[0-9]+), #0
** cx3a p0, \1, r0, r1, #33
** mov r0, \1
** bx lr
*/
/* cx3a: coprocessor 0, accumulator accum (zero), inputs n and m,
   immediate 33.  */
TEST_CDE_SCALAR_INTRINSIC (cx3a, uint32_t, (0, accum, n, m, 33))
/*
** test_cde_cx1d:
** cx1d p0, r0, r1, #33
** bx lr
*/
/* cx1d: like the cx1 test but with a uint64_t result; coprocessor 0,
   immediate 33.  */
TEST_CDE_SCALAR_INTRINSIC (cx1d, uint64_t, (0, 33))
/*
** test_cde_cx1da:
** movs r0, #0
** movs r1, #0
** cx1da p0, r0, r1, #33
** bx lr
*/
/* cx1da: uint64_t result; coprocessor 0, immediate 33, with the macro's
   zero-initialised uint64_t accum passed as the accumulator argument.  */
TEST_CDE_SCALAR_INTRINSIC (cx1da, uint64_t, (0, accum, 33))
/*
** test_cde_cx2d:
** cx2d p0, r0, r1, r0, #33
** bx lr
*/
/* cx2d: uint64_t result; coprocessor 0, input n, immediate 33.  */
TEST_CDE_SCALAR_INTRINSIC (cx2d, uint64_t, (0, n, 33))
/* test_cde_cx2da gets optimised by the compiler in two different ways
   depending on the optimisation level, and so does test_cde_cx3da.  That is
   why each of these two function body checks contains two alternative
   regexes.  */
/*
** test_cde_cx2da:
** (
** mov (r[0-9]+), r0
** movs r0, #0
** movs r1, #0
** cx2da p0, r0, r1, \1, #33
** |
** movs (r[0-9]+), #0
** movs (r[0-9]+), #0
** cx2da p0, \2, \3, r0, #33
** mov r0, \2
** mov r1, \3
** )
** bx lr
*/
/* cx2da: uint64_t result; coprocessor 0, accumulator accum (zero), input n,
   immediate 33.  */
TEST_CDE_SCALAR_INTRINSIC (cx2da, uint64_t, (0, accum, n, 33))
/*
** test_cde_cx3d:
** cx3d p0, r0, r1, r0, r1, #33
** bx lr
*/
/* cx3d: uint64_t result; coprocessor 0, inputs n and m, immediate 33.  */
TEST_CDE_SCALAR_INTRINSIC (cx3d, uint64_t, (0, n, m, 33))
/*
** test_cde_cx3da:
** ...
** (
** movs (r[0-9]+), #0
** movs (r[0-9]+), #0
** cx3da p0, \1, \2, r0, r1, #33
** mov r0, \1
** mov r1, \2
** |
** movs r0, #0
** movs r1, #0
** cx3da p0, r0, r1, r[0-9]+, r[0-9]+, #33
** )
** ...
** bx lr
*/
/* cx3da: uint64_t result; coprocessor 0, accumulator accum (zero), inputs n
   and m, immediate 33.  */
TEST_CDE_SCALAR_INTRINSIC (cx3da, uint64_t, (0, accum, n, m, 33))
/* Ensure the intrinsic calls in this function are removed by dead code
   elimination (DCE) after optimisation.  That is expected because the ACLE
   specification mentions that these functions are stateless and pure.  */
/*
** test_cde_dce:
** bx lr
*/
/* Call each of the twelve scalar CDE intrinsics on coprocessor 0 with
   immediate 33 and discard every result.  Nothing is returned or stored, and
   the function body check above expects only the return instruction to be
   left.  */
void test_cde_dce (uint32_t n, uint32_t m)
{
/* One zero accumulator is shared by all the accumulating forms.  It is
   declared uint64_t even though the prototypes listed at the top of the file
   give the non-d accumulating forms a uint32_t accumulator.  */
uint64_t accum = 0;
/* Forms with a 32-bit result.  */
__arm_cx1 (0, 33);
__arm_cx1a (0, accum, 33);
__arm_cx2 (0, n, 33);
__arm_cx2a (0, accum, n, 33);
__arm_cx3 (0, n, m, 33);
__arm_cx3a (0, accum, n, m, 33);
/* Forms with a 64-bit result (d suffix).  */
__arm_cx1d (0, 33);
__arm_cx1da (0, accum, 33);
__arm_cx2d (0, n, 33);
__arm_cx2da (0, accum, n, 33);
__arm_cx3d (0, n, m, 33);
__arm_cx3da (0, accum, n, m, 33);
}
/* Checking this function allows constants with symbolic names.
This test must be run under some level of optimisation.
The actual check we perform is that the function is provided something that,
at the point of expansion, is an immediate. That check is not as strict as
having something that is an immediate directly.
Since we've already checked these intrinsics generate code in the manner we
expect (above), here we just check that all the instructions we expect are
there. To ensure the instructions are from these functions we use different
constants and search for those specifically with `scan-assembler-times`. */
/* Checking this function allows constants with symbolic names. */
/* Call the six scalar intrinsics that return uint32_t, with the coprocessor
   number and the immediate held in local variables instead of written as
   literals.  The values used (coprocessor 6, immediate 30) differ from the
   0 and 33 used by the earlier functions; the scan-assembler-times checks at
   the end of the file match on the `p6' operand.
   Returns the running sum of all six results.  */
uint32_t test_cde2 (uint32_t n, uint32_t m)
{
/* Deliberately plain, non-const locals: the comment above this function
   explains that the requirement is only that they are immediates at the
   point of expansion.  */
int coproc = 6;
uint32_t imm = 30;
uint32_t accum = 0;
/* accum is both the running total and, for the accumulating forms, the
   accumulator argument, so each of those calls takes the sum of the earlier
   results as input.  */
accum += __arm_cx1 (coproc, imm);
accum += __arm_cx1a (coproc, accum, imm);
accum += __arm_cx2 (coproc, n, imm);
accum += __arm_cx2a (coproc, accum, n, imm);
accum += __arm_cx3 (coproc, n, m, imm);
accum += __arm_cx3a (coproc, accum, n, m, imm);
return accum;
}
/* Checking this function allows constants with symbolic names. */
/* Call the six scalar intrinsics that return uint64_t (d suffix), with the
   coprocessor number and the immediate held in local variables instead of
   written as literals.  As in test_cde2 the values are coprocessor 6 and
   immediate 30, and the scan-assembler-times checks at the end of the file
   match on the `p6' operand.
   Returns the running sum of all six results.  */
uint64_t test_cdedi2 (uint32_t n, uint32_t m)
{
/* Plain, non-const locals; they only need to be immediates at the point of
   expansion.  */
int coproc = 6;
uint32_t imm = 30;
uint64_t accum = 0;
/* accum is both the running total and, for the accumulating forms, the
   64-bit accumulator argument, so each of those calls takes the sum of the
   earlier results as input.  */
accum += __arm_cx1d (coproc, imm);
accum += __arm_cx1da (coproc, accum, imm);
accum += __arm_cx2d (coproc, n, imm);
accum += __arm_cx2da (coproc, accum, n, imm);
accum += __arm_cx3d (coproc, n, m, imm);
accum += __arm_cx3da (coproc, accum, n, m, imm);
return accum;
}
/* { dg-final { scan-assembler-times "cx1\\tp6" 1 } } */
/* { dg-final { scan-assembler-times "cx2\\tp6" 1 } } */
/* { dg-final { scan-assembler-times "cx3\\tp6" 1 } } */
/* { dg-final { scan-assembler-times "cx1a\\tp6" 1 } } */
/* { dg-final { scan-assembler-times "cx2a\\tp6" 1 } } */
/* { dg-final { scan-assembler-times "cx3a\\tp6" 1 } } */
/* { dg-final { scan-assembler-times "cx1d\\tp6" 1 } } */
/* { dg-final { scan-assembler-times "cx2d\\tp6" 1 } } */
/* { dg-final { scan-assembler-times "cx3d\\tp6" 1 } } */
/* { dg-final { scan-assembler-times "cx1da\\tp6" 1 } } */
/* { dg-final { scan-assembler-times "cx2da\\tp6" 1 } } */
/* { dg-final { scan-assembler-times "cx3da\\tp6" 1 } } */
|