1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
|
# mach: bfin
// FIR FILTER COMPUTED DIRECTLY ON INPUT WITH NO
// INTERNAL STATE
// TWO OUTPUTS PER ITERATION
// This program computes a FIR filter without maintaining a buffer of internal
// state.
// This example computes two output samples per inner loop. The following
// diagram shows the alignment required for signal x and coefficients c:
//   x0 x1 x2 x3 x4 x5
//   c0 c1 c2 c3 c4      -> output z(0)=x0*c0 + x1*c1 + ...
//      c0 c1 c2 c3 c4   -> z(1)=x1*c0 + x2*c1 + ...
//          L-1
//          ---
//   Z(k) = \    c(n) * x(n+k)
//          /
//          ---
//          n=0
// i.e. Z(k) = \sum_{n=0}^{L-1} c(n) x(n+k)
// Naive, first stab at splitting this for dual MACs.
//          L/2-1                        L/2-1
//          ---                          ---
//   R(k) = \    (x(2n) * y(2n+k))   +   \    (x(2n+1) * y(2n+1+k))
//          /                            /
//          ---                          ---
//          n=0                          n=0
// i.e. R(k) = \sum_{n=0}^{L/2-1} x(2n) y(2n+k) + \sum_{n=0}^{L/2-1} x(2n+1) y(2n+1+k)
// Alternate, better partitioning for the machine.
//          L-1
//          ---
//   R(0) = \    x(n) * y(n)
//          /
//          ---
//          n=0
//          L-1
//          ---
//   R(1) = \    x(n) * y(n+1)
//          /
//          ---
//          n=0
//          L-1
//          ---
//   R(2) = \    x(n) * y(n+2)
//          /
//          ---
//          n=0
//          L-1
//          ---
//   R(3) = \    x(n) * y(n+3)
//          /
//          ---
//          n=0
// i.e. R(j) = \sum_{n=0}^{L-1} x(n) y(n+j) for j = 0, 1, 2, 3, ...
// .
// .
// .
// .
// Okay, in this version the inner loop will compute R(2k) and R(2k+1) in parallel
//             L-1
//             ---
//   R(2k)   = \    x(n) * y(n+2k)
//             /
//             ---
//             n=0
//             L-1
//             ---
//   R(2k+1) = \    x(n) * y(n+2k+1)
//             /
//             ---
//             n=0
// Implementation
// --------------
// Sample pair x1 x0 is loaded into register R0, and coefficient pair c1 c0
// is loaded into register R1:
// +-------+ R0
// | x1 x0 |
// +-------+
// +-------+ R1
// | c1 c0 | compute two MACs: z(0)+=x0*c0, and z(1)+=x1*c0
// +-------+
// Now load x2 into lo half of R0, and compute the next two MACs:
// +-------+ R0
// | x1 x2 |
// +-------+
// +-------+ R1
// | c1 c0 | compute z(0)+=x1*c1 and z(1)+=x2*c1 (c0 not used)
// +-------+
// Meanwhile, load coefficient pair c3 c2 into R2, and x3 into hi half of R0:
// +-------+ R0
// | x3 x2 |
// +-------+
// +-------+ R2
// | c3 c2 | compute z(0)+=x2*c2 and z(1)+=x3*c2 (c3 not used)
// +-------+
// Load x4 into low half of R0:
// +-------+ R0
// | x3 x4 |
// +-------+
// +-------+ R2
// | c3 c2 | compute z(0)+=x3*c3 and z(1)+=x4*c3 (c2 not used)
// +-------+
// This is a reference FIR function used to test:
//void firf (float input[], float output[], float coeffs[],
// long input_size, long coeffs_size)
//{
// long i, k;
// for(i=0; i< input_size; i++){
// output[i] = 0;
// for(k=0; k < coeffs_size; k++)
// output[i] += input[k+i] * coeffs[k];
// }
//}
.include "testutils.inc"
// Test program: runs the two-outputs-per-pass FIR over `input` using `coef`,
// stores packed result pairs to `output`, then checks the first five 16-bit
// results. `start`, `loadsym` and `pass` are not defined in this file;
// presumably they are macros supplied by testutils.inc.
// Register roles, as used below:
//   P0     - start of the input window for the current output pair
//   I0     - running input pointer inside one outer pass
//   I1     - running coefficient pointer (reset every outer pass)
//   I2     - output write pointer; reused afterwards to read results back
//   P1, P2 - loop bounds 128 and 64; each is halved by its LSETUP
//   A0, A1 - accumulators for the first and second output of the pair
start
// Clear the data registers. R2 is cleared here but never used afterwards.
R0 = 0; R1 = 0; R2 = 0;
P1 = 128 (X); // Loop bounds go in P1 and P2; each LSETUP below shifts its bound right by 1
P2 = 64 (X);
// P0 holds the pointer to the input data. It advances by 4 bytes
// (two 16-bit samples) at the end of every outer-loop pass.
loadsym P0, input;
// Pointer to coeffs. The original note says "alternate memory bank";
// in this file all three arrays are declared in .data.
loadsym I1, coef;
// Pointer to outputs.
loadsym I2, output;
// Set up the outer hardware loop for P1/2 = 64 passes
// (2 outputs are computed per pass).
LSETUP ( L$0 , L$0end ) LC0 = P1 >> 1;
L$0:
// Restart the coefficient pointer and the input window for this output pair.
loadsym I1, coef;
I0 = P0;
// Set up the inner hardware loop for P2/2 = 32 passes. Each pass consumes
// one coefficient pair, i.e. 2 MACs per output per pass.
// The loop body itself only starts at L$1; the two instructions in between
// are a prologue that runs once per outer pass.
LSETUP ( L$1 , L$1end ) LC1 = P2 >> 1;
// Prologue: load the first two data elements into R0 (low half first),
// clear both accumulators, and load the first two coeffs into R1.
R0.L = W [ I0 ++ ];
A1 = A0 = 0 || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];
L$1:
// Loop line 1: both accumulators use the low coefficient (R1.L);
// in parallel the next data element is loaded into R0.L.
A1 += R0.H * R1.L, A0 += R0.L * R1.L || R0.L = W [ I0 ++ ] || NOP;
// L$1end labels the last instruction of the inner loop; that instruction is
// still part of the loop body.
L$1end:
// Loop line 2: both accumulators use the high coefficient (R1.H);
// in parallel the next data element is loaded into R0.H and the next
// two coeffs are loaded into R1.
A1 += R0.L * R1.H, A0 += R0.H * R1.H || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];
// NOTE(review): one outer pass reads 2 + 32*2 = 66 input words starting at
// P0, so later passes read beyond the 128-word `input` array. Only the
// first five outputs are checked below.
// Pack the two results into one register: A0 -> low half, A1 -> high half.
R0.H = A1, R0.L = A0;
// advance data pointer by 2 16b elements
P0 += 4;
// L$0end labels the last instruction of the outer loop (the store).
L$0end:
[ I2 ++ ] = R0; // store 2 outputs as one 32-bit word
// Check results: re-read `output` 16 bits at a time and compare the first
// five values. The input is a single 0x4000 sample at index 4, so the
// expected values are the five nonzero coefficients, each halved, in
// reverse order. DBGA is presumably the simulator's assert-equal check.
loadsym I2, output;
R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x0800 );
R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x1000 );
R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x2000 );
R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x1000 );
R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x0800 );
pass
// Input signal, read by the code with 16-bit W[...] loads.
// Ten explicit words followed by (128-10)*2 bytes of padding, i.e. room for
// 128 two-byte samples. The only nonzero sample is index 4 (0x4000).
.data
input:
.dw 0x0000
.dw 0x0000
.dw 0x0000
.dw 0x0000
.dw 0x4000
.dw 0x0000
.dw 0x0000
.dw 0x0000
.dw 0x0000
.dw 0x0000
.space ((128-10)*2); // must pad with zeros or uninitialized values.
// Filter coefficients, loaded by the code two at a time as 32-bit words.
// Six explicit words (a symmetric 5-tap shape plus one zero) followed by
// (64-6)*2 bytes of padding, i.e. room for 64 two-byte coefficients.
.data
coef:
.dw 0x1000
.dw 0x2000
.dw 0x4000
.dw 0x2000
.dw 0x1000
.dw 0x0000
.space ((64-6)*2); // must pad with zeros or uninitialized values.
// Result buffer: 128*4 = 512 bytes reserved. The outer loop performs 64
// stores of one 32-bit word each (two packed 16-bit outputs per store).
.data
output:
.space (128*4)
|