00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "common.h"
00023 #include "dsputil.h"
00024
00025 #include "mmx.h"
00026
/* ATTR_ALIGN(n): GCC-style attribute requesting n-byte alignment for the
 * static tables and constants declared below. */
00027 #define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
00028
/* Arithmetic right shifts applied to the results of the row pass
 * (psrad on 32-bit lanes) and of the column pass (psraw on 16-bit lanes). */
00029 #define ROW_SHIFT 11
00030 #define COL_SHIFT 6
00031
/* round(bias):   (bias + 0.5) scaled by 2^ROW_SHIFT and truncated to int.
 * rounder(bias): brace initializer holding that value twice, i.e. one copy
 *                per 32-bit lane of the 64-bit operand added with paddd.
 * ROW_SHIFT is expanded lazily, so these macros are only usable before the
 * later "#undef ROW_SHIFT".
 * NOTE(review): `round` has the same name as the C99 <math.h> function; if a
 * header pulled in above declares it, this macro silently replaces it. */
00032 #define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
00033 #define rounder(bias) {round (bias), round (bias)}
00034
/* Disabled scalar reference for the row pass (never compiled).
 *
 * row     - block base; the row processed starts at row + offset
 * offset  - element offset of the row inside the block
 * table   - coefficients; entries 1..7 are read as C1..C7
 * rounder - pointer to the rounding bias added to every even-part sum
 *
 * The eight outputs are written back in place, each shifted right by
 * ROW_SHIFT. */
00035 #if 0
00036
00037 static inline void idct_row (int16_t * row, int offset,
00038 int16_t * table, int32_t * rounder)
00039 {
00040 int C1, C2, C3, C4, C5, C6, C7;
00041 int a0, a1, a2, a3, b0, b1, b2, b3;
00042
00043 row += offset;
00044
00045 C1 = table[1];
00046 C2 = table[2];
00047 C3 = table[3];
00048 C4 = table[4];
00049 C5 = table[5];
00050 C6 = table[6];
00051 C7 = table[7];
00052
/* Even part: built from coefficients 0, 2, 4, 6 plus the rounding bias. */
00053 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
00054 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
00055 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
00056 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;
00057
/* Odd part: built from coefficients 1, 3, 5, 7. */
00058 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
00059 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
00060 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
00061 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
00062
/* Butterfly: outputs k and 7-k are (a_k + b_k) and (a_k - b_k). */
00063 row[0] = (a0 + b0) >> ROW_SHIFT;
00064 row[1] = (a1 + b1) >> ROW_SHIFT;
00065 row[2] = (a2 + b2) >> ROW_SHIFT;
00066 row[3] = (a3 + b3) >> ROW_SHIFT;
00067 row[4] = (a3 - b3) >> ROW_SHIFT;
00068 row[5] = (a2 - b2) >> ROW_SHIFT;
00069 row[6] = (a1 - b1) >> ROW_SHIFT;
00070 row[7] = (a0 - b0) >> ROW_SHIFT;
00071 }
00072 #endif
00073
00074
00075
00076
/* mmxext_table(c1..c7): initializer for a 32-entry int16_t coefficient table
 * (eight groups of four words) for the MMXEXT row pass.  The groups are read
 * as 64-bit operands at element offsets 0, 4, 8, 12, 16, 20, 24 and 28 by
 * mmxext_row_head / mmxext_row / mmxext_row_mid, and each group is consumed
 * by one pmaddwd, so the order of entries is tied to that instruction stream
 * and must not be rearranged.
 * NOTE(review): the word pairing used by pmaddwd suggests the input row is
 * stored with the even-index coefficients in its first four words and the
 * odd-index ones in the last four - confirm against the caller's
 * coefficient permutation. */
00077 #define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \
00078 c4, c6, c4, c6, \
00079 c1, c3, -c1, -c5, \
00080 c5, c7, c3, -c7, \
00081 c4, -c6, c4, -c6, \
00082 -c4, c2, c4, -c2, \
00083 c5, -c1, c3, -c1, \
00084 c7, c3, c7, -c5 }
00085
/* MMXEXT row pass, prologue: loads the row at row + offset and starts the
 * first multiply-add.
 *
 * row    - block base pointer
 * offset - element offset of the row to load
 * table  - coefficient table built with mmxext_table
 *
 * Nothing is stored; state is handed to mmxext_row in MMX registers:
 *   mm0 = first four words of the row, mm2 = the same with its 32-bit halves
 *   swapped, mm3 = pmaddwd of the first half with table[0..3],
 *   mm4 = table[4..7], mm5 = mm6 = last four words of the row.
 * The macros from mmx.h take (source, destination) operands. */
00086 static inline void mmxext_row_head (int16_t * row, int offset, const int16_t * table)
00087 {
00088 movq_m2r (*(row+offset), mm2); /* mm2 = first half of the row */
00089
00090 movq_m2r (*(row+offset+4), mm5); /* mm5 = second half of the row */
00091 movq_r2r (mm2, mm0); /* mm0 = copy of first half */
00092
00093 movq_m2r (*table, mm3); /* mm3 = table[0..3] */
00094 movq_r2r (mm5, mm6); /* mm6 = copy of second half */
00095
00096 movq_m2r (*(table+4), mm4); /* mm4 = table[4..7] */
00097 pmaddwd_r2r (mm0, mm3); /* mm3 = first half * table[0..3], pairwise summed */
00098
00099 pshufw_r2r (mm2, mm2, 0x4e); /* 0x4e selects words 2,3,0,1: swap the 32-bit halves */
00100 }
00101
/* MMXEXT row pass, core: finishes the multiply-adds started by
 * mmxext_row_head / mmxext_row_mid and forms the sums and differences.
 *
 * table   - same coefficient table that was passed to the preceding
 *           head/mid call
 * rounder - two 32-bit rounding biases, added to the partial sums that come
 *           from the first half of the row
 *
 * Expects the register state left by mmxext_row_head / mmxext_row_mid.
 * On exit: mm1 (sum) and mm3 (difference) are already shifted by ROW_SHIFT;
 * mm0 (sum) and mm4 (difference) still need the shift, which
 * mmxext_row_tail / mmxext_row_mid apply.  The instruction order is
 * hand-interleaved; do not reorder. */
00102 static inline void mmxext_row (const int16_t * table, const int32_t * rounder)
00103 {
00104 movq_m2r (*(table+8), mm1); /* mm1 = table[8..11] */
00105 pmaddwd_r2r (mm2, mm4); /* mm4 = swapped first half * table[4..7] */
00106
00107 pmaddwd_m2r (*(table+16), mm0); /* mm0 = first half * table[16..19] */
00108 pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = second half with 32-bit halves swapped */
00109
00110 movq_m2r (*(table+12), mm7); /* mm7 = table[12..15] */
00111 pmaddwd_r2r (mm5, mm1); /* mm1 = second half * table[8..11] */
00112
00113 paddd_m2r (*rounder, mm3); /* add rounding bias to first partial sum */
00114 pmaddwd_r2r (mm6, mm7); /* mm7 = swapped second half * table[12..15] */
00115
00116 pmaddwd_m2r (*(table+20), mm2); /* mm2 = swapped first half * table[20..23] */
00117 paddd_r2r (mm4, mm3); /* mm3 = complete first-half sum (lanes A) */
00118
00119 pmaddwd_m2r (*(table+24), mm5); /* mm5 = second half * table[24..27] */
00120 movq_r2r (mm3, mm4); /* keep a copy for the sum below */
00121
00122 pmaddwd_m2r (*(table+28), mm6); /* mm6 = swapped second half * table[28..31] */
00123 paddd_r2r (mm7, mm1); /* mm1 = complete second-half sum (lanes A) */
00124
00125 paddd_m2r (*rounder, mm0); /* add rounding bias to the other partial sum */
00126 psubd_r2r (mm1, mm3); /* mm3 = first-half sum - second-half sum */
00127
00128 psrad_i2r (ROW_SHIFT, mm3);
00129 paddd_r2r (mm4, mm1); /* mm1 = first-half sum + second-half sum */
00130
00131 paddd_r2r (mm2, mm0); /* mm0 = complete first-half sum (lanes B) */
00132 psrad_i2r (ROW_SHIFT, mm1);
00133
00134 paddd_r2r (mm6, mm5); /* mm5 = complete second-half sum (lanes B) */
00135 movq_r2r (mm0, mm4); /* copy for the difference */
00136
00137 paddd_r2r (mm5, mm0); /* mm0 = sum, not yet shifted */
00138 psubd_r2r (mm5, mm4); /* mm4 = difference, not yet shifted */
00139 }
00140
/* MMXEXT row pass, epilogue: shifts the two results mmxext_row left
 * unshifted, packs the four 32-bit result registers to 16-bit words with
 * signed saturation, and stores the eight outputs.
 *
 * row   - block base pointer
 * store - element offset of the destination row
 *
 * Consumes mm0, mm1, mm3, mm4 as left by mmxext_row. */
00141 static inline void mmxext_row_tail (int16_t * row, int store)
00142 {
00143 psrad_i2r (ROW_SHIFT, mm0);
00144
00145 psrad_i2r (ROW_SHIFT, mm4);
00146
00147 packssdw_r2r (mm0, mm1); /* mm1 = {mm1 lanes, mm0 lanes} as saturated words */
00148
00149 packssdw_r2r (mm3, mm4); /* mm4 = {mm4 lanes, mm3 lanes} as saturated words */
00150
00151 movq_r2m (mm1, *(row+store)); /* first four outputs */
00152 pshufw_r2r (mm4, mm4, 0xb1); /* 0xb1 selects words 1,0,3,2: swap words inside each 32-bit half */
00153
00154
00155
00156 movq_r2m (mm4, *(row+store+4)); /* last four outputs */
00157 }
00158
/* MMXEXT row pass, fused epilogue + prologue: performs the work of
 * mmxext_row_tail for the row just computed and of mmxext_row_head for the
 * next row, with both instruction streams interleaved.
 *
 * row    - block base pointer
 * store  - element offset where the finished row is written
 * offset - element offset of the next row to load
 * table  - coefficient table for the next row
 *
 * The next row is loaded before the finished row is stored; in this file
 * store and offset always name different rows.  Leaves the same register
 * state as mmxext_row_head. */
00159 static inline void mmxext_row_mid (int16_t * row, int store,
00160 int offset, const int16_t * table)
00161 {
00162 movq_m2r (*(row+offset), mm2); /* next row: first half */
00163 psrad_i2r (ROW_SHIFT, mm0); /* finished row: pending shift */
00164
00165 movq_m2r (*(row+offset+4), mm5); /* next row: second half */
00166 psrad_i2r (ROW_SHIFT, mm4); /* finished row: pending shift */
00167
00168 packssdw_r2r (mm0, mm1); /* finished row: pack first four outputs */
00169 movq_r2r (mm5, mm6); /* next row: copy of second half */
00170
00171 packssdw_r2r (mm3, mm4); /* finished row: pack last four outputs */
00172 movq_r2r (mm2, mm0); /* next row: copy of first half */
00173
00174 movq_r2m (mm1, *(row+store));
00175 pshufw_r2r (mm4, mm4, 0xb1); /* swap words inside each 32-bit half */
00176
00177 movq_m2r (*table, mm3); /* next row: table[0..3] */
00178 movq_r2m (mm4, *(row+store+4));
00179
00180 pmaddwd_r2r (mm0, mm3); /* next row: first multiply-add */
00181
00182 movq_m2r (*(table+4), mm4); /* next row: table[4..7] */
00183 pshufw_r2r (mm2, mm2, 0x4e); /* next row: swap 32-bit halves of first half */
00184 }
00185
00186
00187
00188
/* mmx_table(c1..c7): initializer for the 32-entry int16_t coefficient table
 * used by the plain-MMX row pass.  It holds the same coefficients as
 * mmxext_table in a different order, matching the operands that
 * mmx_row_head / mmx_row / mmx_row_mid build with punpckldq / punpckhdq
 * (dword duplication) instead of pshufw.  Groups of four words are read at
 * element offsets 0, 4, ..., 28; the order must not be rearranged. */
00189 #define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \
00190 c4, c6, -c4, -c2, \
00191 c1, c3, c3, -c7, \
00192 c5, c7, -c1, -c5, \
00193 c4, -c6, c4, -c2, \
00194 -c4, c2, c4, -c6, \
00195 c5, -c1, c7, -c5, \
00196 c7, c3, c3, -c1 }
00197
/* Plain-MMX row pass, prologue: loads the row at row + offset and starts the
 * first multiply-add.
 *
 * row    - block base pointer
 * offset - element offset of the row to load
 * table  - coefficient table built with mmx_table
 *
 * Nothing is stored; state is handed to mmx_row in MMX registers:
 *   mm0 = low dword of the first half duplicated, mm2 = high dword of the
 *   first half duplicated, mm3 = pmaddwd of mm0 with table[0..3],
 *   mm4 = table[4..7], mm1 = table[8..11], mm5 = mm6 = second half. */
00198 static inline void mmx_row_head (int16_t * row, int offset, const int16_t * table)
00199 {
00200 movq_m2r (*(row+offset), mm2); /* mm2 = first half of the row */
00201
00202 movq_m2r (*(row+offset+4), mm5); /* mm5 = second half of the row */
00203 movq_r2r (mm2, mm0); /* mm0 = copy of first half */
00204
00205 movq_m2r (*table, mm3); /* mm3 = table[0..3] */
00206 movq_r2r (mm5, mm6); /* mm6 = copy of second half */
00207
00208 punpckldq_r2r (mm0, mm0); /* mm0 = low dword in both halves */
00209
00210 movq_m2r (*(table+4), mm4); /* mm4 = table[4..7] */
00211 pmaddwd_r2r (mm0, mm3); /* mm3 = mm0 * table[0..3], pairwise summed */
00212
00213 movq_m2r (*(table+8), mm1); /* mm1 = table[8..11] (preloaded for mmx_row) */
00214 punpckhdq_r2r (mm2, mm2); /* mm2 = high dword in both halves */
00215 }
00216
/* Plain-MMX row pass, core: finishes the multiply-adds started by
 * mmx_row_head / mmx_row_mid and forms the sums and differences.
 *
 * table   - same coefficient table that was passed to the preceding
 *           head/mid call
 * rounder - two 32-bit rounding biases, added to the partial sums that come
 *           from the first half of the row
 *
 * Expects the register state left by mmx_row_head / mmx_row_mid (including
 * mm1 = table[8..11]).  On exit: mm1 (sum) and mm3 (difference) are already
 * shifted by ROW_SHIFT; mm0 (sum) and mm7 (difference) still need the shift,
 * which mmx_row_tail / mmx_row_mid apply.  The instruction order is
 * hand-interleaved; do not reorder. */
00217 static inline void mmx_row (const int16_t * table, const int32_t * rounder)
00218 {
00219 pmaddwd_r2r (mm2, mm4); /* mm4 = high-dword dup of first half * table[4..7] */
00220 punpckldq_r2r (mm5, mm5); /* mm5 = low dword of second half, duplicated */
00221
00222 pmaddwd_m2r (*(table+16), mm0); /* mm0 = low-dword dup of first half * table[16..19] */
00223 punpckhdq_r2r (mm6, mm6); /* mm6 = high dword of second half, duplicated */
00224
00225 movq_m2r (*(table+12), mm7); /* mm7 = table[12..15] */
00226 pmaddwd_r2r (mm5, mm1); /* mm1 = mm5 * table[8..11] */
00227
00228 paddd_m2r (*rounder, mm3); /* add rounding bias to first partial sum */
00229 pmaddwd_r2r (mm6, mm7); /* mm7 = mm6 * table[12..15] */
00230
00231 pmaddwd_m2r (*(table+20), mm2); /* mm2 = high-dword dup of first half * table[20..23] */
00232 paddd_r2r (mm4, mm3); /* mm3 = complete first-half sum (lanes A) */
00233
00234 pmaddwd_m2r (*(table+24), mm5); /* mm5 = low-dword dup of second half * table[24..27] */
00235 movq_r2r (mm3, mm4); /* keep a copy for the sum below */
00236
00237 pmaddwd_m2r (*(table+28), mm6); /* mm6 = high-dword dup of second half * table[28..31] */
00238 paddd_r2r (mm7, mm1); /* mm1 = complete second-half sum (lanes A) */
00239
00240 paddd_m2r (*rounder, mm0); /* add rounding bias to the other partial sum */
00241 psubd_r2r (mm1, mm3); /* mm3 = first-half sum - second-half sum */
00242
00243 psrad_i2r (ROW_SHIFT, mm3);
00244 paddd_r2r (mm4, mm1); /* mm1 = first-half sum + second-half sum */
00245
00246 paddd_r2r (mm2, mm0); /* mm0 = complete first-half sum (lanes B) */
00247 psrad_i2r (ROW_SHIFT, mm1);
00248
00249 paddd_r2r (mm6, mm5); /* mm5 = complete second-half sum (lanes B) */
00250 movq_r2r (mm0, mm7); /* copy for the difference */
00251
00252 paddd_r2r (mm5, mm0); /* mm0 = sum, not yet shifted */
00253 psubd_r2r (mm5, mm7); /* mm7 = difference, not yet shifted */
00254 }
00255
/* Plain-MMX row pass, epilogue: shifts the two results mmx_row left
 * unshifted, packs the four 32-bit result registers to 16-bit words with
 * signed saturation, and stores the eight outputs.
 *
 * row   - block base pointer
 * store - element offset of the destination row
 *
 * Consumes mm0, mm1, mm3, mm7 as left by mmx_row; uses mm4 as scratch.
 * The shift-left / shift-right / or sequence swaps the two words inside each
 * 32-bit half, i.e. it does the job of pshufw 0xb1 in mmxext_row_tail. */
00256 static inline void mmx_row_tail (int16_t * row, int store)
00257 {
00258 psrad_i2r (ROW_SHIFT, mm0);
00259
00260 psrad_i2r (ROW_SHIFT, mm7);
00261
00262 packssdw_r2r (mm0, mm1); /* mm1 = {mm1 lanes, mm0 lanes} as saturated words */
00263
00264 packssdw_r2r (mm3, mm7); /* mm7 = {mm7 lanes, mm3 lanes} as saturated words */
00265
00266 movq_r2m (mm1, *(row+store)); /* first four outputs */
00267 movq_r2r (mm7, mm4); /* mm4 = copy used for the word swap */
00268
00269 pslld_i2r (16, mm7); /* low word of each dword moves to the high word */
00270
00271 psrld_i2r (16, mm4); /* high word of each dword moves to the low word */
00272
00273 por_r2r (mm4, mm7); /* mm7 = words swapped inside each dword */
00274
00275
00276
00277 movq_r2m (mm7, *(row+store+4)); /* last four outputs */
00278 }
00279
/* Plain-MMX row pass, fused epilogue + prologue: performs the work of
 * mmx_row_tail for the row just computed and of mmx_row_head for the next
 * row, with both instruction streams interleaved.
 *
 * row    - block base pointer
 * store  - element offset where the finished row is written
 * offset - element offset of the next row to load
 * table  - coefficient table for the next row
 *
 * The next row is loaded before the finished row is stored; in this file
 * store and offset always name different rows.  mm1 is reused as scratch for
 * the word swap before being reloaded with table[8..11].  Leaves the same
 * register state as mmx_row_head. */
00280 static inline void mmx_row_mid (int16_t * row, int store,
00281 int offset, const int16_t * table)
00282 {
00283 movq_m2r (*(row+offset), mm2); /* next row: first half */
00284 psrad_i2r (ROW_SHIFT, mm0); /* finished row: pending shift */
00285
00286 movq_m2r (*(row+offset+4), mm5); /* next row: second half */
00287 psrad_i2r (ROW_SHIFT, mm7); /* finished row: pending shift */
00288
00289 packssdw_r2r (mm0, mm1); /* finished row: pack first four outputs */
00290 movq_r2r (mm5, mm6); /* next row: copy of second half */
00291
00292 packssdw_r2r (mm3, mm7); /* finished row: pack last four outputs */
00293 movq_r2r (mm2, mm0); /* next row: copy of first half */
00294
00295 movq_r2m (mm1, *(row+store));
00296 movq_r2r (mm7, mm1); /* finished row: copy for the word swap */
00297
00298 punpckldq_r2r (mm0, mm0); /* next row: low dword duplicated */
00299 psrld_i2r (16, mm7); /* finished row: high words down */
00300
00301 movq_m2r (*table, mm3); /* next row: table[0..3] */
00302 pslld_i2r (16, mm1); /* finished row: low words up */
00303
00304 movq_m2r (*(table+4), mm4); /* next row: table[4..7] */
00305 por_r2r (mm1, mm7); /* finished row: words swapped inside each dword */
00306
00307 movq_m2r (*(table+8), mm1); /* next row: table[8..11] */
00308 punpckhdq_r2r (mm2, mm2); /* next row: high dword duplicated */
00309
00310 movq_r2m (mm7, *(row+store+4));
00311 pmaddwd_r2r (mm0, mm3); /* next row: first multiply-add */
00312 }
00313
00314
/* Disabled scalar reference for the column pass (never compiled).
 *
 * col    - block base; the column processed starts at col + offset
 * offset - element offset of the column inside the block
 *
 * Elements of one column are 8 entries apart.  All intermediates are 16-bit
 * and every addition/subtraction is clamped with S().  T1, T2, T3 and C4 are
 * not defined at this point in the file; they are only #defined inside the
 * MMX idct_col further down, so this block would need them moved up before
 * it could be enabled. */
00315 #if 0
00316
00317 static inline void idct_col (int16_t * col, int offset)
00318 {
00319
/* F(c,x): high part of the product, i.e. (c * x) / 65536 rounded down. */
00320 #define F(c,x) (((c) * (x)) >> 16)
00321
00322
/* S(x): clamp x to the int16_t range [-32768, 32767]. */
00323 #define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))
00324
00325 int16_t x0, x1, x2, x3, x4, x5, x6, x7;
00326 int16_t y0, y1, y2, y3, y4, y5, y6, y7;
00327 int16_t a0, a1, a2, a3, b0, b1, b2, b3;
00328 int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;
00329
00330 col += offset;
00331
00332 x0 = col[0*8];
00333 x1 = col[1*8];
00334 x2 = col[2*8];
00335 x3 = col[3*8];
00336 x4 = col[4*8];
00337 x5 = col[5*8];
00338 x6 = col[6*8];
00339 x7 = col[7*8];
00340
/* Even part. */
00341 u04 = S (x0 + x4);
00342 v04 = S (x0 - x4);
00343 u26 = S (F (T2, x6) + x2);
00344 v26 = S (F (T2, x2) - x6);
00345
00346 a0 = S (u04 + u26);
00347 a1 = S (v04 + v26);
00348 a2 = S (v04 - v26);
00349 a3 = S (u04 - u26);
00350
/* Odd part. */
00351 u17 = S (F (T1, x7) + x1);
00352 v17 = S (F (T1, x1) - x7);
00353 u35 = S (F (T3, x5) + x3);
00354 v35 = S (F (T3, x3) - x5);
00355
00356 b0 = S (u17 + u35);
00357 b3 = S (v17 - v35);
00358 u12 = S (u17 - u35);
00359 v12 = S (v17 + v35);
00360 u12 = S (2 * F (C4, u12));
00361 v12 = S (2 * F (C4, v12));
00362 b1 = S (u12 + v12);
00363 b2 = S (u12 - v12);
00364
/* Butterfly: outputs k and 7-k are (a_k + b_k) and (a_k - b_k), shifted. */
00365 y0 = S (a0 + b0) >> COL_SHIFT;
00366 y1 = S (a1 + b1) >> COL_SHIFT;
00367 y2 = S (a2 + b2) >> COL_SHIFT;
00368 y3 = S (a3 + b3) >> COL_SHIFT;
00369
00370 y4 = S (a3 - b3) >> COL_SHIFT;
00371 y5 = S (a2 - b2) >> COL_SHIFT;
00372 y6 = S (a1 - b1) >> COL_SHIFT;
00373 y7 = S (a0 - b0) >> COL_SHIFT;
00374
00375 col[0*8] = y0;
00376 col[1*8] = y1;
00377 col[2*8] = y2;
00378 col[3*8] = y3;
00379 col[4*8] = y4;
00380 col[5*8] = y5;
00381 col[6*8] = y6;
00382 col[7*8] = y7;
00383 }
00384 #endif
00385
00386
00387
/* MMX column pass: transforms four adjacent columns of the block in place.
 *
 * col    - block base pointer
 * offset - element offset of the first of the four columns (this file calls
 *          it with 0 and 4)
 *
 * Each movq moves four 16-bit values, one per column, from/to the row at
 * col + offset + k*8.  pmulhw keeps the high 16 bits of the signed product
 * and paddsw/psubsw saturate, which correspond to F() and S() in the
 * disabled scalar reference above; the names in the comments below (u17,
 * v35, a0, b1, y3, ...) are the variables of that reference.  Unlike the
 * reference, b1 and b2 are formed as 2*F(C4, u12 +/- v12), i.e. the
 * sum/difference is taken before the multiply.
 *
 * Rows 3 and 5 of the block are used as temporary storage for b3 and b0
 * while all eight registers are busy.  The instruction order is
 * hand-interleaved; do not reorder. */
00388 static inline void idct_col (int16_t * col, int offset)
00389 {
/* 16-bit fixed-point multipliers for pmulhw.
 * NOTE(review): the values look like tan(pi/16), tan(2*pi/16), tan(3*pi/16)
 * scaled by 2^16 and cos(pi/4) scaled by 2^15 - confirm against the
 * algorithm description. */
00390 #define T1 13036
00391 #define T2 27146
00392 #define T3 43790
00393 #define C4 23170
00394
/* Each constant replicated into the four 16-bit lanes of one MMX operand.
 * T3 does not fit in a signed short, so the stored value is the converted
 * (wrapped) one; the code compensates by adding the multiplicand back after
 * each pmulhw by T3. */
00395 static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
00396 static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
00397 static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
00398 static const short c4_vector[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
00399
00400
00401
00402
00403 movq_m2r (*t1_vector, mm0); /* mm0 = T1 */
00404
00405 movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */
00406 movq_r2r (mm0, mm2); /* mm2 = T1 */
00407
00408 movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */
00409 pmulhw_r2r (mm1, mm0); /* mm0 = F(T1,x1) */
00410
00411 movq_m2r (*t3_vector, mm5); /* mm5 = T3 (wrapped) */
00412 pmulhw_r2r (mm4, mm2); /* mm2 = F(T1,x7) */
00413
00414 movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */
00415 movq_r2r (mm5, mm7); /* mm7 = T3 (wrapped) */
00416
00417 movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */
00418 psubsw_r2r (mm4, mm0); /* mm0 = v17 */
00419
00420 movq_m2r (*t2_vector, mm4); /* mm4 = T2 */
00421 pmulhw_r2r (mm3, mm5); /* mm5 = x3 * wrapped T3, high part */
00422
00423 paddsw_r2r (mm2, mm1); /* mm1 = u17 */
00424 pmulhw_r2r (mm6, mm7); /* mm7 = x5 * wrapped T3, high part */
00425
00426
00427
00428 movq_r2r (mm4, mm2); /* mm2 = T2 */
00429 paddsw_r2r (mm3, mm5); /* mm5 = F(T3,x3): add x3 back */
00430
00431 pmulhw_m2r (*(col+offset+2*8), mm4); /* mm4 = F(T2,x2) */
00432 paddsw_r2r (mm6, mm7); /* mm7 = F(T3,x5): add x5 back */
00433
00434 psubsw_r2r (mm6, mm5); /* mm5 = v35 */
00435 paddsw_r2r (mm3, mm7); /* mm7 = u35 */
00436
00437 movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */
00438 movq_r2r (mm0, mm6); /* mm6 = v17 */
00439
00440 pmulhw_r2r (mm3, mm2); /* mm2 = F(T2,x6) */
00441 psubsw_r2r (mm5, mm0); /* mm0 = b3 */
00442
00443 psubsw_r2r (mm3, mm4); /* mm4 = v26 */
00444 paddsw_r2r (mm6, mm5); /* mm5 = v12 */
00445
00446 movq_r2m (mm0, *(col+offset+3*8)); /* spill b3 into row 3 */
00447 movq_r2r (mm1, mm6); /* mm6 = u17 */
00448
00449 paddsw_m2r (*(col+offset+2*8), mm2); /* mm2 = u26 */
00450 paddsw_r2r (mm7, mm6); /* mm6 = b0 */
00451
00452 psubsw_r2r (mm7, mm1); /* mm1 = u12 */
00453 movq_r2r (mm1, mm7); /* mm7 = u12 */
00454
00455 movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */
00456 paddsw_r2r (mm5, mm1); /* mm1 = u12 + v12 */
00457
00458 movq_m2r (*c4_vector, mm0); /* mm0 = C4 */
00459 psubsw_r2r (mm5, mm7); /* mm7 = u12 - v12 */
00460
00461 movq_r2m (mm6, *(col+offset+5*8)); /* spill b0 into row 5 */
00462 pmulhw_r2r (mm0, mm1); /* mm1 = F(C4, u12 + v12) */
00463
00464 movq_r2r (mm4, mm6); /* mm6 = v26 */
00465 pmulhw_r2r (mm0, mm7); /* mm7 = F(C4, u12 - v12) */
00466
00467 movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */
00468 movq_r2r (mm3, mm0); /* mm0 = x0 */
00469
00470 psubsw_r2r (mm5, mm3); /* mm3 = v04 */
00471 paddsw_r2r (mm5, mm0); /* mm0 = u04 */
00472
00473 paddsw_r2r (mm3, mm4); /* mm4 = a1 */
00474 movq_r2r (mm0, mm5); /* mm5 = u04 */
00475
00476 psubsw_r2r (mm6, mm3); /* mm3 = a2 */
00477 paddsw_r2r (mm2, mm5); /* mm5 = a0 */
00478
00479 paddsw_r2r (mm1, mm1); /* mm1 = b1 (doubling) */
00480 psubsw_r2r (mm2, mm0); /* mm0 = a3 */
00481
00482 paddsw_r2r (mm7, mm7); /* mm7 = b2 (doubling) */
00483 movq_r2r (mm3, mm2); /* mm2 = a2 */
00484
00485 movq_r2r (mm4, mm6); /* mm6 = a1 */
00486 paddsw_r2r (mm7, mm3); /* mm3 = a2 + b2 */
00487
00488 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */
00489 paddsw_r2r (mm1, mm4); /* mm4 = a1 + b1 */
00490
00491 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */
00492 psubsw_r2r (mm1, mm6); /* mm6 = a1 - b1 */
00493
00494 movq_m2r (*(col+offset+5*8), mm1); /* reload b0 from row 5 */
00495 psubsw_r2r (mm7, mm2); /* mm2 = a2 - b2 */
00496
00497 psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */
00498 movq_r2r (mm5, mm7); /* mm7 = a0 */
00499
00500 movq_r2m (mm4, *(col+offset+1*8)); /* store y1 */
00501 psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */
00502
00503 movq_r2m (mm3, *(col+offset+2*8)); /* store y2 */
00504 paddsw_r2r (mm1, mm5); /* mm5 = a0 + b0 */
00505
00506 movq_m2r (*(col+offset+3*8), mm4); /* reload b3 from row 3 */
00507 psubsw_r2r (mm1, mm7); /* mm7 = a0 - b0 */
00508
00509 psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */
00510 movq_r2r (mm0, mm3); /* mm3 = a3 */
00511
00512 movq_r2m (mm2, *(col+offset+5*8)); /* store y5 */
00513 psubsw_r2r (mm4, mm3); /* mm3 = a3 - b3 */
00514
00515 psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */
00516 paddsw_r2r (mm0, mm4); /* mm4 = a3 + b3 */
00517
00518 movq_r2m (mm5, *(col+offset+0*8)); /* store y0 */
00519 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */
00520
00521 movq_r2m (mm6, *(col+offset+6*8)); /* store y6 */
00522 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */
00523
00524 movq_r2m (mm7, *(col+offset+7*8)); /* store y7 */
00525
00526 movq_r2m (mm3, *(col+offset+4*8)); /* store y4 */
00527
00528 movq_r2m (mm4, *(col+offset+3*8)); /* store y3 */
00529
00530 #undef T1
00531 #undef T2
00532 #undef T3
00533 #undef C4
00534 }
00535
/* Per-row rounding biases for the row pass; rounderN is the one passed with
 * row N by declare_idct.  Each holds the same 32-bit value twice, so a single
 * paddd adds it to both lanes.  The stored value is (bias + 0.5) * 2^ROW_SHIFT
 * truncated to int (see round()):
 *   rounder0: bias = 2^(COL_SHIFT-1) - 0.5, i.e. exactly
 *             2^(COL_SHIFT-1) * 2^ROW_SHIFT = 65536
 *   rounder4: bias = 0, i.e. 0.5 * 2^ROW_SHIFT = 1024
 * NOTE(review): the derivation of the fractional biases for rows 1, 2, 3, 5,
 * 6 and 7 is not visible in this file - treat them as tuned constants and do
 * not edit without the reference derivation. */
00536 static const int32_t rounder0[] ATTR_ALIGN(8) =
00537 rounder ((1 << (COL_SHIFT - 1)) - 0.5);
00538 static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
00539 static const int32_t rounder1[] ATTR_ALIGN(8) =
00540 rounder (1.25683487303);
00541 static const int32_t rounder7[] ATTR_ALIGN(8) =
00542 rounder (-0.25);
00543 static const int32_t rounder2[] ATTR_ALIGN(8) =
00544 rounder (0.60355339059);
00545 static const int32_t rounder6[] ATTR_ALIGN(8) =
00546 rounder (-0.25);
00547 static const int32_t rounder3[] ATTR_ALIGN(8) =
00548 rounder (0.087788325588);
00549 static const int32_t rounder5[] ATTR_ALIGN(8) =
00550 rounder (-0.441341716183);
00551
/* The shift amounts are no longer needed as macros; round()/rounder() must
 * not be expanded past this point because they reference ROW_SHIFT. */
00552 #undef COL_SHIFT
00553 #undef ROW_SHIFT
00554
/* declare_idct(idct, table, head, row, tail, mid): defines the function
 *     void idct (int16_t * block)
 * which transforms the block in place.
 *
 *   idct  - name of the generated function
 *   table - table-initializer macro (mmxext_table or mmx_table)
 *   head / row / tail / mid - the matching row-pass helpers
 *
 * Four coefficient tables are instantiated, each shared by one pair of rows:
 * table04 (rows 0, 4), table17 (rows 1, 7), table26 (rows 2, 6) and
 * table35 (rows 3, 5).  The row pass visits the rows in the order
 * 0, 4, 1, 7, 2, 6, 3, 5 as head, row, (mid, row) x 7, tail, where every mid
 * call stores the previous row and loads the next one.  The column pass then
 * runs on columns 0-3 and 4-7.  Because the helpers pass state in MMX
 * registers, the call order must be kept exactly as written.
 * NOTE(review): no emms is issued here - presumably the caller is
 * responsible for resetting the MMX/FPU state; confirm. */
00555 #define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
00556 void idct (int16_t * block) \
00557 { \
00558 static const int16_t table04[] ATTR_ALIGN(16) = \
00559 table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \
00560 static const int16_t table17[] ATTR_ALIGN(16) = \
00561 table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \
00562 static const int16_t table26[] ATTR_ALIGN(16) = \
00563 table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \
00564 static const int16_t table35[] ATTR_ALIGN(16) = \
00565 table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \
00566 \
00567 idct_row_head (block, 0*8, table04); \
00568 idct_row (table04, rounder0); \
00569 idct_row_mid (block, 0*8, 4*8, table04); \
00570 idct_row (table04, rounder4); \
00571 idct_row_mid (block, 4*8, 1*8, table17); \
00572 idct_row (table17, rounder1); \
00573 idct_row_mid (block, 1*8, 7*8, table17); \
00574 idct_row (table17, rounder7); \
00575 idct_row_mid (block, 7*8, 2*8, table26); \
00576 idct_row (table26, rounder2); \
00577 idct_row_mid (block, 2*8, 6*8, table26); \
00578 idct_row (table26, rounder6); \
00579 idct_row_mid (block, 6*8, 3*8, table35); \
00580 idct_row (table35, rounder3); \
00581 idct_row_mid (block, 3*8, 5*8, table35); \
00582 idct_row (table35, rounder5); \
00583 idct_row_tail (block, 5*8); \
00584 \
00585 idct_col (block, 0); \
00586 idct_col (block, 4); \
00587 }
00588
/* Public entry points.  The prototypes are spelled with DCTELEM while the
 * definitions generated by declare_idct use int16_t.
 * NOTE(review): this assumes DCTELEM (declared outside this file) is the
 * same type as int16_t; otherwise the declarations conflict - confirm. */
00589 void ff_mmx_idct(DCTELEM *block);
00590 void ff_mmxext_idct(DCTELEM *block);
00591
/* MMXEXT variant: uses pshufw in the row pass. */
00592 declare_idct (ff_mmxext_idct, mmxext_table,
00593 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
00594
/* Plain-MMX variant: uses punpck and shift/or sequences instead of pshufw. */
00595 declare_idct (ff_mmx_idct, mmx_table,
00596 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
00597