00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00026 #include "dsputil.h"
00027 #include "mmx.h"
00028
/*
 * Rounding bias stored in the last four lanes of idct_constants[] by
 * ff_vp3_dsp_init_mmx().  ColumnIDCT() adds it (through Eight) to each result
 * just before the arithmetic right shift by 4, so 8 is half of 1 << 4.
 */
00029 #define IdctAdjustBeforeShift 8
00030
00031
00032
00033
00034
00035
/*
 * Constant pool read by the IDCT macros, organised in groups of four 16-bit
 * lanes: (4 + 7 + 1) * 4 = 48 entries.
 *   [ 0..15]  not written by any code visible in this file (static storage,
 *             so these stay zero)
 *   [16..43]  cosines 1..7, each replicated four times; addressed by C(1)..C(7)
 *   [44..47]  IdctAdjustBeforeShift replicated four times; addressed by Eight
 * Entries 16..47 are filled in at run time by ff_vp3_dsp_init_mmx().
 */
00036 static uint16_t idct_constants[(4 + 7 + 1) * 4];
/*
 * idct_cosine_table[k - 1] equals round(65536 * cos(k * pi / 16)) for
 * k = 1..7, i.e. the cosines expressed as unsigned 16-bit fractions.
 */
00037 static const uint16_t idct_cosine_table[7] = {
00038 64277, 60547, 54491, 46341, 36410, 25080, 12785
00039 };
00040
/*
 * Short aliases r0..r7 for the register names mm0..mm7 that are passed to the
 * mmx.h wrapper macros throughout the IDCT macros below.
 */
00041 #define r0 mm0
00042 #define r1 mm1
00043 #define r2 mm2
00044 #define r3 mm3
00045 #define r4 mm4
00046 #define r5 mm5
00047 #define r6 mm6
00048 #define r7 mm7
00049
00050
/*
 * BeginIDCT(): common first part of the 1-D 8-point IDCT shared by RowIDCT()
 * and ColumnIDCT().
 *
 * The expansion site must define I(k) and J(k) (addresses of the input
 * vectors; I is used for k = 0..3, J for k = 4..7) and C(k) (address of
 * cosine k in idct_constants[]).  In the comments, "ik" is the vector loaded
 * from I(k)/J(k), "ck" is cosine k, and "ck*x" means the fixed-point product
 * (ck / 65536) * x.
 *
 * NOTE(review): the comments assume the mmx.h wrappers take
 * (source, destination) operands and map 1:1 onto the MMX instruction in
 * their name -- confirm against mmx.h.
 *
 * pmulhw is a signed multiply that keeps the high 16 bits, so for table
 * entries >= 32768 (c1..c5) it produces ck*x - x and a paddw of x follows to
 * correct it; c6 and c7 are below 32768 and need no correction.
 *
 * Memory side effects: I(1) is overwritten with C. and I(2) with D. .
 *
 * Register contents on exit:
 *   r0 = C.  = A + C            r4 = E   = c4*(i0 + i4)
 *   r1 = H.  = B. + H           r5 = B.. = B. - H
 *   r2 = A.. - H.               r6 = F.  = F - A.
 *   r3 = i0 + i4 (no longer     r7 = G   = c2*i2 + c6*i6
 *        needed)
 *   memory at I(2) = D. = B + D
 */
00051 #define BeginIDCT() { \
00052 movq_m2r(*I(3), r2); /* r2 = i3 */ \
00053 movq_m2r(*C(3), r6); /* r6 = c3 */ \
00054 movq_r2r(r2, r4); /* r4 = i3 */ \
00055 movq_m2r(*J(5), r7); /* r7 = i5 */ \
00056 pmulhw_r2r(r6, r4); /* r4 = c3*i3 - i3 */ \
00057 movq_m2r(*C(5), r1); /* r1 = c5 */ \
00058 pmulhw_r2r(r7, r6); /* r6 = c3*i5 - i5 */ \
00059 movq_r2r(r1, r5); /* r5 = c5 */ \
00060 pmulhw_r2r(r2, r1); /* r1 = c5*i3 - i3 */ \
00061 movq_m2r(*I(1), r3); /* r3 = i1 */ \
00062 pmulhw_r2r(r7, r5); /* r5 = c5*i5 - i5 */ \
00063 movq_m2r(*C(1), r0); /* r0 = c1 */ \
00064 paddw_r2r(r2, r4); /* r4 = c3*i3 */ \
00065 paddw_r2r(r7, r6); /* r6 = c3*i5 */ \
00066 paddw_r2r(r1, r2); /* r2 = c5*i3 */ \
00067 movq_m2r(*J(7), r1); /* r1 = i7 */ \
00068 paddw_r2r(r5, r7); /* r7 = c5*i5 */ \
00069 movq_r2r(r0, r5); /* r5 = c1 */ \
00070 pmulhw_r2r(r3, r0); /* r0 = c1*i1 - i1 */ \
00071 paddsw_r2r(r7, r4); /* r4 = C = c3*i3 + c5*i5 */ \
00072 pmulhw_r2r(r1, r5); /* r5 = c1*i7 - i7 */ \
00073 movq_m2r(*C(7), r7); /* r7 = c7 */ \
00074 psubsw_r2r(r2, r6); /* r6 = D = c3*i5 - c5*i3 */ \
00075 paddw_r2r(r3, r0); /* r0 = c1*i1 */ \
00076 pmulhw_r2r(r7, r3); /* r3 = c7*i1 */ \
00077 movq_m2r(*I(2), r2); /* r2 = i2 */ \
00078 pmulhw_r2r(r1, r7); /* r7 = c7*i7 */ \
00079 paddw_r2r(r1, r5); /* r5 = c1*i7 */ \
00080 movq_r2r(r2, r1); /* r1 = i2 */ \
00081 pmulhw_m2r(*C(2), r2); /* r2 = c2*i2 - i2 */ \
00082 psubsw_r2r(r5, r3); /* r3 = B = c7*i1 - c1*i7 */ \
00083 movq_m2r(*J(6), r5); /* r5 = i6 */ \
00084 paddsw_r2r(r7, r0); /* r0 = A = c1*i1 + c7*i7 */ \
00085 movq_r2r(r5, r7); /* r7 = i6 */ \
00086 psubsw_r2r(r4, r0); /* r0 = A - C */ \
00087 pmulhw_m2r(*C(2), r5); /* r5 = c2*i6 - i6 */ \
00088 paddw_r2r(r1, r2); /* r2 = c2*i2 */ \
00089 pmulhw_m2r(*C(6), r1); /* r1 = c6*i2 */ \
00090 paddsw_r2r(r4, r4); /* r4 = 2*C */ \
00091 paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \
00092 psubsw_r2r(r6, r3); /* r3 = B - D */ \
00093 paddw_r2r(r7, r5); /* r5 = c2*i6 */ \
00094 paddsw_r2r(r6, r6); /* r6 = 2*D */ \
00095 pmulhw_m2r(*C(6), r7); /* r7 = c6*i6 */ \
00096 paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \
00097 movq_r2m(r4, *I(1)); /* spill C. to I(1) to free r4 */ \
00098 psubsw_r2r(r5, r1); /* r1 = H = c6*i2 - c2*i6 */ \
00099 movq_m2r(*C(4), r4); /* r4 = c4 */ \
00100 movq_r2r(r3, r5); /* r5 = B - D */ \
00101 pmulhw_r2r(r4, r3); /* r3 = c4*(B - D) - (B - D) */ \
00102 paddsw_r2r(r2, r7); /* r7 = G = c6*i6 + c2*i2 */ \
00103 movq_r2m(r6, *I(2)); /* spill D. to I(2) to free r6 */ \
00104 movq_r2r(r0, r2); /* r2 = A - C */ \
00105 movq_m2r(*I(0), r6); /* r6 = i0 */ \
00106 pmulhw_r2r(r4, r0); /* r0 = c4*(A - C) - (A - C) */ \
00107 paddw_r2r(r3, r5); /* r5 = B. = c4*(B - D) */ \
00108 movq_m2r(*J(4), r3); /* r3 = i4 */ \
00109 psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \
00110 paddw_r2r(r0, r2); /* r2 = A. = c4*(A - C) */ \
00111 psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \
00112 movq_r2r(r6, r0); /* r0 = i0 - i4 */ \
00113 pmulhw_r2r(r4, r6); /* r6 = c4*(i0 - i4) - (i0 - i4) */ \
00114 paddsw_r2r(r3, r3); /* r3 = 2*i4 */ \
00115 paddsw_r2r(r1, r1); /* r1 = 2*H */ \
00116 paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \
00117 paddsw_r2r(r5, r1); /* r1 = H. = B. + H */ \
00118 pmulhw_r2r(r3, r4); /* r4 = c4*(i0 + i4) - (i0 + i4) */ \
00119 paddsw_r2r(r0, r6); /* r6 = F = c4*(i0 - i4) */ \
00120 psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \
00121 paddsw_r2r(r2, r2); /* r2 = 2*A. */ \
00122 movq_m2r(*I(1), r0); /* reload r0 = C. */ \
00123 paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \
00124 paddw_r2r(r3, r4); /* r4 = E = c4*(i0 + i4) */ \
00125 psubsw_r2r(r1, r2); /* r2 = A.. - H. */ \
00126 }
00127
00128
/*
 * RowIDCT(): runs BeginIDCT() and then the final butterflies, leaving the
 * eight result vectors R0..R7 in r0..r7 with no rounding or shifting.
 * R1 is additionally stored to I(1); Transpose() reloads it from there
 * because it uses r1 as scratch.
 *
 * Names such as C., D., H., A.., B.., F., E, G refer to the values that
 * BeginIDCT() leaves behind (D. in memory at I(2), the rest in registers).
 * NOTE(review): operand order (source, destination) of the mmx.h wrappers is
 * assumed -- confirm against mmx.h.
 */
00129 #define RowIDCT() { \
00130 \
00131 BeginIDCT(); \
00132 \
00133 movq_m2r(*I(2), r3); /* r3 = D. (spilled by BeginIDCT) */ \
00134 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
00135 paddsw_r2r(r1, r1); /* r1 = 2*H. */ \
00136 paddsw_r2r(r7, r7); /* r7 = 2*G */ \
00137 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
00138 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
00139 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
00140 paddsw_r2r(r3, r3); /* r3 = 2*D. */ \
00141 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
00142 paddsw_r2r(r5, r5); /* r5 = 2*B.. */ \
00143 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
00144 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
00145 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
00146 paddsw_r2r(r0, r0); /* r0 = 2*C. */ \
00147 movq_r2m(r1, *I(1)); /* store R1 to I(1) */ \
00148 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
00149 }
00150
00151
/*
 * ColumnIDCT(): runs BeginIDCT() and then the same final butterflies as
 * RowIDCT(), but adds the bias at Eight to every result, shifts it right
 * arithmetically by 4, and stores it back: R0..R3 to I(0)..I(3) and
 * R4..R7 to J(4)..J(7).
 *
 * The bias is added once to the "difference" half of each butterfly pair
 * (R2, R4, R6, R7); the matching "sum" result (R1, R3, R5, R0) is formed from
 * that already-biased value, so it carries the bias too.
 *
 * Requires Eight to be defined at the expansion site in addition to
 * I(), J() and C().
 * NOTE(review): operand order (source, destination) of the mmx.h wrappers is
 * assumed -- confirm against mmx.h.
 */
00152 #define ColumnIDCT() { \
00153 \
00154 BeginIDCT(); \
00155 \
00156 paddsw_m2r(*Eight, r2); /* r2 = (A.. - H.) + 8 */ \
00157 paddsw_r2r(r1, r1); /* r1 = 2*H. */ \
00158 paddsw_r2r(r2, r1); /* r1 = (A.. + H.) + 8 */ \
00159 psraw_i2r(4, r2); /* r2 = R2 */ \
00160 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
00161 psraw_i2r(4, r1); /* r1 = R1 */ \
00162 movq_m2r(*I(2), r3); /* r3 = D. (read before I(2) is overwritten) */ \
00163 paddsw_r2r(r7, r7); /* r7 = 2*G */ \
00164 movq_r2m(r2, *I(2)); /* store R2 */ \
00165 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
00166 movq_r2m(r1, *I(1)); /* store R1 */ \
00167 psubsw_r2r(r3, r4); /* r4 = E. - D. */ \
00168 paddsw_m2r(*Eight, r4); /* r4 += 8 */ \
00169 paddsw_r2r(r3, r3); /* r3 = 2*D. */ \
00170 paddsw_r2r(r4, r3); /* r3 = (E. + D.) + 8 */ \
00171 psraw_i2r(4, r4); /* r4 = R4 */ \
00172 psubsw_r2r(r5, r6); /* r6 = F. - B.. */ \
00173 psraw_i2r(4, r3); /* r3 = R3 */ \
00174 paddsw_m2r(*Eight, r6); /* r6 += 8 */ \
00175 paddsw_r2r(r5, r5); /* r5 = 2*B.. */ \
00176 paddsw_r2r(r6, r5); /* r5 = (F. + B..) + 8 */ \
00177 psraw_i2r(4, r6); /* r6 = R6 */ \
00178 movq_r2m(r4, *J(4)); /* store R4 */ \
00179 psraw_i2r(4, r5); /* r5 = R5 */ \
00180 movq_r2m(r3, *I(3)); /* store R3 */ \
00181 psubsw_r2r(r0, r7); /* r7 = G. - C. */ \
00182 paddsw_m2r(*Eight, r7); /* r7 += 8 */ \
00183 paddsw_r2r(r0, r0); /* r0 = 2*C. */ \
00184 paddsw_r2r(r7, r0); /* r0 = (G. + C.) + 8 */ \
00185 psraw_i2r(4, r7); /* r7 = R7 */ \
00186 movq_r2m(r6, *J(6)); /* store R6 */ \
00187 psraw_i2r(4, r0); /* r0 = R0 */ \
00188 movq_r2m(r5, *J(5)); /* store R5 */ \
00189 movq_r2m(r7, *J(7)); /* store R7 */ \
00190 movq_r2m(r0, *I(0)); /* store R0 */ \
00191 }
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
/*
 * Transpose(): takes the eight vectors left by RowIDCT() -- r0, r2..r7 in
 * registers and R1 in memory at I(1) -- and writes them back to memory as two
 * transposed 4x4 groups of 16-bit words, using word and doubleword
 * unpack/interleave steps:
 *   - the group r4, r5, r6, r7 is transposed first and stored to J(4)..J(7);
 *   - the group r0, I(1), r2, r3 is transposed next and stored to I(0)..I(3).
 * r0 is spilled to I(0) early so the register can serve as scratch while the
 * first group is processed; r1 is used as scratch from the first instruction,
 * which is why R1 has to come from memory.
 * NOTE(review): operand order (source, destination) of the mmx.h wrappers is
 * assumed -- confirm against mmx.h.
 */
00224 #define Transpose() { \
00225 movq_r2r(r4, r1); \
00226 punpcklwd_r2r(r5, r4); \
00227 movq_r2m(r0, *I(0)); /* spill r0; it is reloaded into r4 further down */ \
00228 punpckhwd_r2r(r5, r1); \
00229 movq_r2r(r6, r0); \
00230 punpcklwd_r2r(r7, r6); \
00231 movq_r2r(r4, r5); \
00232 punpckldq_r2r(r6, r4); \
00233 punpckhdq_r2r(r6, r5); \
00234 movq_r2r(r1, r6); \
00235 movq_r2m(r4, *J(4)); /* first transposed vector of the r4..r7 group */ \
00236 punpckhwd_r2r(r7, r0); \
00237 movq_r2m(r5, *J(5)); /* second transposed vector of the r4..r7 group */ \
00238 punpckhdq_r2r(r0, r6); \
00239 movq_m2r(*I(0), r4); /* reload the value spilled from r0 */ \
00240 punpckldq_r2r(r0, r1); \
00241 movq_m2r(*I(1), r5); /* load the vector RowIDCT() stored at I(1) */ \
00242 movq_r2r(r4, r0); \
00243 movq_r2m(r6, *J(7)); /* fourth transposed vector of the r4..r7 group */ \
00244 punpcklwd_r2r(r5, r0); \
00245 movq_r2m(r1, *J(6)); /* third transposed vector of the r4..r7 group */ \
00246 punpckhwd_r2r(r5, r4); \
00247 movq_r2r(r2, r5); \
00248 punpcklwd_r2r(r3, r2); \
00249 movq_r2r(r0, r1); \
00250 punpckldq_r2r(r2, r0); \
00251 punpckhdq_r2r(r2, r1); \
00252 movq_r2r(r4, r2); \
00253 movq_r2m(r0, *I(0)); /* first transposed vector of the r0..r3 group */ \
00254 punpckhwd_r2r(r3, r5); \
00255 movq_r2m(r1, *I(1)); /* second transposed vector of the r0..r3 group */ \
00256 punpckhdq_r2r(r5, r4); \
00257 punpckldq_r2r(r5, r2); \
00258 movq_r2m(r4, *I(3)); /* fourth transposed vector of the r0..r3 group */ \
00259 movq_r2m(r2, *I(2)); /* third transposed vector of the r0..r3 group */ \
00260 }
00261
00262 void ff_vp3_dsp_init_mmx(void)
00263 {
00264 int j = 16;
00265 uint16_t *p;
00266
00267 j = 1;
00268 do {
00269 p = idct_constants + ((j + 3) << 2);
00270 p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
00271 } while (++j <= 7);
00272
00273 idct_constants[44] = idct_constants[45] =
00274 idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
00275 }
00276
/**
 * In-place inverse DCT of the 64 16-bit coefficients at output_data, built
 * from the RowIDCT(), Transpose() and ColumnIDCT() macros.
 *
 * The work is done in four steps, each handling four 16-bit lanes at a time
 * and differing only in how I(K)/J(K) map vector index K to an address:
 *   1. RowIDCT + Transpose on words  0..31 (I = left half of rows 0..3,
 *      J = right half of rows 0..3);
 *   2. RowIDCT + Transpose on words 32..63 (same mapping, rows 4..7);
 *   3. ColumnIDCT on the left four words of every row;
 *   4. ColumnIDCT on the right four words of every row.
 *
 * NOTE(review): the I/J mapping of steps 1 and 2 implies the coefficients are
 * expected in a pre-arranged (partially transposed) order -- confirm against
 * the caller.
 *
 * idct_constants[] must have been set up by ff_vp3_dsp_init_mmx().
 *
 * @param output_data 64 coefficients, overwritten with the result
 */
void ff_vp3_idct_mmx(int16_t *output_data)
{
    /*
     * C(x): address of the four replicated copies of cosine x (x = 1..7).
     * Eight: address of the four replicated rounding-bias words.
     * Both stay defined after this function, exactly as before.
     */
#define C(x) (idct_constants + 16 + ((x) - 1) * 4)
#define Eight (idct_constants + 44)

    /* Step 1: words 0..31. */
#define I(K) (output_data + (K) * 8)
#define J(K) (output_data + (((K) - 4) * 8) + 4)

    RowIDCT();
    Transpose();

#undef I
#undef J
    /* Step 2: words 32..63. */
#define I(K) (output_data + ((K) * 8) + 32)
#define J(K) (output_data + (((K) - 4) * 8) + 36)

    RowIDCT();
    Transpose();

#undef I
#undef J
    /* Step 3: left four words of each row; I and J use the same mapping. */
#define I(K) (output_data + (K) * 8)
#define J(K) (output_data + (K) * 8)

    ColumnIDCT();

#undef I
#undef J
    /* Step 4: right four words of each row. */
#define I(K) (output_data + ((K) * 8) + 4)
#define J(K) (output_data + ((K) * 8) + 4)

    ColumnIDCT();

#undef I
#undef J

}
00325
/*
 * Run ff_vp3_idct_mmx() on block in place, then hand the transformed block to
 * put_signed_pixels_clamped_mmx() together with dest and line_size.
 * block is modified.
 * NOTE(review): put_signed_pixels_clamped_mmx() is defined elsewhere;
 * presumably it writes the clamped result to dest with a row stride of
 * line_size -- confirm against its definition.
 */
00326 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
00327 {
00328 ff_vp3_idct_mmx(block);
00329 put_signed_pixels_clamped_mmx(block, dest, line_size);
00330 }
00331
/*
 * Run ff_vp3_idct_mmx() on block in place, then hand the transformed block to
 * add_pixels_clamped_mmx() together with dest and line_size.
 * block is modified.
 * NOTE(review): add_pixels_clamped_mmx() is defined elsewhere; presumably it
 * adds the block to the pixels already at dest with clamping, using a row
 * stride of line_size -- confirm against its definition.
 */
00332 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
00333 {
00334 ff_vp3_idct_mmx(block);
00335 add_pixels_clamped_mmx(block, dest, line_size);
00336 }