1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
|
.text
.align 2
#include "fft_const.h"
/*
 * Element-type configuration.
 *
 * One numeric mode has to be selected before this point (presumably by
 * fft_const.h or by the build flags - TODO confirm):
 *
 *   FFT_FIXED     integer data: lw/sw with mul/add/sub, and the data
 *                 registers REG0..REG5 are x3..x8.
 *   FFT_FLOATING  floating-point data: REG0..REG5 are f0..f5, and one of
 *                 FP_HALF, FP_SINGLE or FP_DOUBLE picks the element width
 *                 and the matching load/store/arithmetic mnemonics.
 *
 * FFT_FIXED wins when both FFT_FIXED and FFT_FLOATING are defined.
 *
 * Macros provided to the kernels below:
 *   PTR_SHIFT   log2 of the element size in bytes (index -> byte offset)
 *   PTR_SIZE    element size in bytes (not referenced in the visible
 *               part of this file)
 *   DATA_LOAD / DATA_STORE       element load / store mnemonic
 *   FFT_MUL / FFT_ADD / FFT_SUB  element arithmetic mnemonics
 *   REG0..REG5                   data registers used by the kernels
 */
#if defined(FFT_FIXED)
#define PTR_SHIFT 2
#define PTR_SIZE 4
#define DATA_LOAD lw
#define DATA_STORE sw
#define FFT_MUL mul
#define FFT_ADD add
#define FFT_SUB sub
#define REG0 x3
#define REG1 x4
#define REG2 x5
#define REG3 x6
#define REG4 x7
#define REG5 x8
#elif defined(FFT_FLOATING)
#if defined(FP_HALF)
#define PTR_SHIFT 1
#define PTR_SIZE 2
#define DATA_LOAD flh
#define DATA_STORE fsh
#define FFT_MUL fmul.h
#define FFT_ADD fadd.h
#define FFT_SUB fsub.h
#elif defined(FP_SINGLE)
#define PTR_SHIFT 2
#define PTR_SIZE 4
#define DATA_LOAD flw
#define DATA_STORE fsw
#define FFT_MUL fmul.s
#define FFT_ADD fadd.s
#define FFT_SUB fsub.s
#elif defined(FP_DOUBLE)
#define PTR_SHIFT 3
#define PTR_SIZE 8
#define DATA_LOAD fld
#define DATA_STORE fsd
#define FFT_MUL fmul.d
#define FFT_ADD fadd.d
#define FFT_SUB fsub.d
#else
/*
 * Without a precision selection none of the macros above exist and the
 * kernels would only fail later with unknown-opcode errors, so stop here
 * with a clear diagnostic instead.
 */
#error FFT_FLOATING requires one of FP_HALF, FP_SINGLE or FP_DOUBLE
#endif
#define REG0 f0
#define REG1 f1
#define REG2 f2
#define REG3 f3
#define REG4 f4
#define REG5 f5
#else
#error FFT_FIXED or FFT_FLOATING not defined
#endif
# vf_test: minimal test block.
# Inputs:
#   x1  arbitrary value
# Outputs:
#   x1  2 * (i_x1 + utidx result)
# Scratch: x2 (left holding the utidx result)
.globl vf_test
vf_test:
    utidx   x2              # x2 <= utidx result
    add     x1, x1, x2      # x1 <= i_x1 + x2
    slli    x1, x1, 1       # x1 <= x1 * 2 (equivalent to x1 + x1)
    stop
# vf_fft_init: derive the two butterfly operand positions and the
# twiddle-factor position from the operation index, where
# opid = i_x1 + utidx result.
# Inputs:
#   x1  lane start (utidx=0 has this position due to stripmining)
#   x2  bit mask selecting the FFT block bits of opid
#   x3  bit mask selecting the operand-in-block bits of opid
#   x4  left-shift amount applied to the twiddle-factor position;
#       historically annotated as REMOVED, yet the sll at the end still
#       reads it - TODO confirm whether callers still set it
#   x5  half the current FFT size (distance to the second operand)
# Outputs:
#   x1  first operand pos  = ((opid & i_x2) << 1) + (opid & i_x3)
#   x2  second operand pos = o_x1 + i_x5
#   x3  twiddle factor pos = (opid & i_x3) << i_x4
# Scratch: x6 (left holding opid)
.globl vf_fft_init
vf_fft_init:
    utidx   x6
    add     x6, x6, x1      # x6 <= opid
    and     x3, x3, x6      # x3 <= opid & i_x3 (operand within block)
    and     x2, x2, x6      # x2 <= opid & i_x2 (block selector bits)
    slli    x2, x2, 1       # x2 <= (opid & i_x2) << 1
    add     x1, x2, x3      # x1 <= first operand pos (final)
    sll     x3, x3, x4      # x3 <= twiddle factor pos (final); must follow
                            # the add above, which needs the unshifted x3
    add     x2, x1, x5      # x2 <= second operand pos (final)
    stop
.globl vf_fft_scale
vf_fft_scale:
# Loads the twiddle factor (a + bi) and the second operand (c + di) and
# leaves their complex product, the scale factor, in REG0/REG1.
# REG0..REG5 are x3..x8 under FFT_FIXED and f0..f5 under FFT_FLOATING, so
# under FFT_FIXED the loads below overwrite the address registers x4..x7.
# IN:
# x1: Has the first operand pos (reused)
# x2: Has the second operand pos (reused)
# x3: Has the twiddle factor pos (reused)
# x4: Has the tf real ptr
# x5: Has the tf imag ptr
# x6: Has the workspace real ptr
# x7: Has the workspace imag ptr
# x8: Has the fixed point shift. This was annotated as REMOVED, but the
#     FFT_FIXED build still reads it: REG5 is x8 there and is the shift
#     amount of the sra instructions below. Not read under FFT_FLOATING.
# OUT:
# x1: Has the first operand offset = i_x1 << PTR_SHIFT
# x2: Has the second operand offset = i_x2 << PTR_SHIFT
# REG0: Has the scale factor real (x3 under FFT_FIXED, f0 under FFT_FLOATING)
# REG1: Has the scale factor imag (x4 under FFT_FIXED, f1 under FFT_FLOATING)
# Convert positions into actual memory offsets from table start
slli x1, x1, PTR_SHIFT # x1 <= i_x1 << PTR_SHIFT (proper result)
slli x2, x2, PTR_SHIFT # x2 <= i_x2 << PTR_SHIFT (proper result)
slli x3, x3, PTR_SHIFT # x3 <= i_x3 << PTR_SHIFT (tf offset)
# Compute memory locations
add x4, x4, x3 # x4 <= load address for tf real
add x5, x5, x3 # x5 <= load address for tf imag
add x6, x6, x2 # x6 <= load address for op2 real
add x7, x7, x2 # x7 <= load address for op2 imag
# Actually read memory
DATA_LOAD REG1, 0(x4) # tf real (a)
DATA_LOAD REG2, 0(x5) # tf imag (bi)
DATA_LOAD REG3, 0(x6) # op2 real (c)
DATA_LOAD REG4, 0(x7) # op2 imag (di)
# Do the math using 3 multiplies:
#   real = a(c+d) - (a+b)d = ac - bd
#   imag = a(c+d) + (b-a)c = ad + bc
# The order matters: each product has to be formed before one of its
# inputs is overwritten by a later step.
FFT_ADD REG0, REG1, REG2 # REG0 <= a + b
FFT_SUB REG2, REG2, REG1 # REG2 <= b - a (REG1 still holds a)
FFT_MUL REG0, REG0, REG4 # REG0 <= (a+b)d
#ifdef FFT_FIXED
sra REG0, REG0, REG5 # fixed point only: rescale the product by the shift in REG5
#endif
FFT_MUL REG2, REG2, REG3 # REG2 <= (b-a)c
#ifdef FFT_FIXED
sra REG2, REG2, REG5 # fixed point only: rescale the product by the shift in REG5
#endif
FFT_ADD REG3, REG3, REG4 # REG3 <= c + d (c alone is no longer needed)
FFT_MUL REG4, REG1, REG3 # REG4 <= a(c+d) (overwrites d, already consumed)
#ifdef FFT_FIXED
sra REG4, REG4, REG5 # fixed point only: rescale the product by the shift in REG5
#endif
# Prepare final result
FFT_SUB REG0, REG4, REG0 # REG0 <= a(c+d) - (a+b)d (scale real)
FFT_ADD REG1, REG4, REG2 # REG1 <= a(c+d) + (b-a)c (scale imag)
stop
/*
# Four multiply version
# Do the multiplications (a+bi)(c+di) needs ac ad bc bd
mul x3, x4, x6 # x3 <= ac
mul x4, x4, x7 # x4 <= adi
mul x6, x5, x6 # x6 <= bc
mul x5, x5, x7 # x5 <= bdi
sra x3, x3, x8 # These 4 shifts make sure the fixed pt properly aligned
sra x4, x4, x8
sra x5, x5, x8
sra x6, x6, x8
# Do the additions (ac - bd) and (bc + ad)
sub x3, x3, x5 # x3 <= ac - bd (proper result)
add x4, x4, x6 # x4 <= bc + ad (proper result)
*/
# vf_fft_exec: butterfly add/subtract step. Loads the first operand and
# combines it with the scale factor held in REG0/REG1:
#   res1 = op1 + scale
#   res2 = op1 - scale
# REG0..REG5 are x3..x8 under FFT_FIXED and f0..f5 under FFT_FLOATING.
# Inputs:
#   x1    first operand offset (not modified)
#   x2    second operand offset (not modified)
#   REG0  scale factor real
#   REG1  scale factor imag
#   x5    workspace real ptr
#   x6    workspace imag ptr
# Outputs:
#   REG2  first result real
#   REG3  first result imag
#   REG4  second result real
#   REG5  second result imag
# x5 and x6 are overwritten with the op1 load addresses; under FFT_FIXED
# they are REG2 and REG3, so they end up holding the first result.
.globl vf_fft_exec
vf_fft_exec:
    # Fetch op1, one component at a time
    add         x5, x5, x1          # x5 <= address of op1 real
    DATA_LOAD   REG2, 0(x5)         # REG2 <= op1 real
    add         x6, x6, x1          # x6 <= address of op1 imag
    DATA_LOAD   REG3, 0(x6)         # REG3 <= op1 imag
    # Real parts: the difference goes first because the sum overwrites REG2
    FFT_SUB     REG4, REG2, REG0    # res2 real
    FFT_ADD     REG2, REG2, REG0    # res1 real
    # Imaginary parts: same ordering constraint on REG3
    FFT_SUB     REG5, REG3, REG1    # res2 imag
    FFT_ADD     REG3, REG3, REG1    # res1 imag
    stop
# vf_fft_store1: write the first butterfly result back to the workspace.
# REG2..REG5 are x5..x8 under FFT_FIXED and f2..f5 under FFT_FLOATING.
# Inputs:
#   x1    first operand offset
#   x2    second operand offset (not modified)
#   x3    workspace real ptr
#   x4    workspace imag ptr
#   REG2  first result real
#   REG3  first result imag
#   REG4  second result real (not modified)
#   REG5  second result imag (not modified)
# Outputs: none in registers; x2, REG4 and REG5 are left intact.
# x3 and x4 are overwritten with the store addresses.
.globl vf_fft_store1
vf_fft_store1:
    add         x3, x3, x1          # x3 <= address for result 1 real
    DATA_STORE  REG2, 0(x3)         # write result 1 real
    add         x4, x4, x1          # x4 <= address for result 1 imag
    DATA_STORE  REG3, 0(x4)         # write result 1 imag
    stop
# vf_fft_store2: write the second butterfly result back to the workspace.
# REG4 and REG5 are x7 and x8 under FFT_FIXED, f4 and f5 under FFT_FLOATING.
# Inputs:
#   x2    second operand offset
#   x3    workspace real ptr
#   x4    workspace imag ptr
#   REG4  second result real
#   REG5  second result imag
# Outputs: none.
# x3 and x4 are overwritten with the store addresses.
.globl vf_fft_store2
vf_fft_store2:
    add         x3, x3, x2          # x3 <= address for result 2 real
    DATA_STORE  REG4, 0(x3)         # write result 2 real
    add         x4, x4, x2          # x4 <= address for result 2 imag
    DATA_STORE  REG5, 0(x4)         # write result 2 imag
    stop
|