#include <stdlib.h>
#include <string.h>
#include "dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_ppc.h"

#define vector_s16_t vector signed short
#define const_vector_s16_t const vector signed short
#define vector_u16_t vector unsigned short
#define vector_s8_t vector signed char
#define vector_u8_t vector unsigned char
#define vector_s32_t vector signed int
#define vector_u32_t vector unsigned int

#define IDCT_HALF                                       \
    /* 1st stage */                                     \
    t1 = vec_mradds (a1, vx7, vx1);                     \
    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7));    \
    t7 = vec_mradds (a2, vx5, vx3);                     \
    t3 = vec_mradds (ma2, vx3, vx5);                    \
                                                        \
    /* 2nd stage */                                     \
    t5 = vec_adds (vx0, vx4);                           \
    t0 = vec_subs (vx0, vx4);                           \
    t2 = vec_mradds (a0, vx6, vx2);                     \
    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6));    \
    t6 = vec_adds (t8, t3);                             \
    t3 = vec_subs (t8, t3);                             \
    t8 = vec_subs (t1, t7);                             \
    t1 = vec_adds (t1, t7);                             \
                                                        \
    /* 3rd stage */                                     \
    t7 = vec_adds (t5, t2);                             \
    t2 = vec_subs (t5, t2);                             \
    t5 = vec_adds (t0, t4);                             \
    t0 = vec_subs (t0, t4);                             \
    t4 = vec_subs (t8, t3);                             \
    t3 = vec_adds (t8, t3);                             \
                                                        \
    /* 4th stage */                                     \
    vy0 = vec_adds (t7, t1);                            \
    vy7 = vec_subs (t7, t1);                            \
    vy1 = vec_mradds (c4, t3, t5);                      \
    vy6 = vec_mradds (mc4, t3, t5);                     \
    vy2 = vec_mradds (c4, t4, t0);                      \
    vy5 = vec_mradds (mc4, t4, t0);                     \
    vy3 = vec_adds (t2, t6);                            \
    vy4 = vec_subs (t2, t6);

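/*
 * Reference sketch (not compiled): vec_mradds computes, per 16-bit lane,
 * a rounded multiply-high followed by a saturated add, roughly
 *
 *     r[i] = saturate_s16 (((a[i] * b[i] + 0x4000) >> 15) + c[i]);
 *
 * so the a0/a1/a2/c4 constants used above are fixed-point factors
 * scaled by 2^15.
 */
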
#define IDCT                                                            \
    vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;                \
    vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;                \
    vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias;                  \
    vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8;                    \
    vector_u16_t shift;                                                 \
                                                                        \
    c4 = vec_splat (constants[0], 0);                                   \
    a0 = vec_splat (constants[0], 1);                                   \
    a1 = vec_splat (constants[0], 2);                                   \
    a2 = vec_splat (constants[0], 3);                                   \
    mc4 = vec_splat (constants[0], 4);                                  \
    ma2 = vec_splat (constants[0], 5);                                  \
    bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3);     \
                                                                        \
    zero = vec_splat_s16 (0);                                           \
    shift = vec_splat_u16 (4);                                          \
                                                                        \
    /* scale the input up by 2^4 and fold in the per-row prescale */    \
    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero);    \
    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero);    \
    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero);    \
    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero);    \
    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero);    \
    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero);    \
    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero);    \
    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero);    \
                                                                        \
    /* first 1-D pass */                                                \
    IDCT_HALF                                                           \
                                                                        \
    /* transpose the 8x8 matrix in three merge stages; the rounding     \
       bias is added to the first row during the final stage */         \
    vx0 = vec_mergeh (vy0, vy4);                                        \
    vx1 = vec_mergel (vy0, vy4);                                        \
    vx2 = vec_mergeh (vy1, vy5);                                        \
    vx3 = vec_mergel (vy1, vy5);                                        \
    vx4 = vec_mergeh (vy2, vy6);                                        \
    vx5 = vec_mergel (vy2, vy6);                                        \
    vx6 = vec_mergeh (vy3, vy7);                                        \
    vx7 = vec_mergel (vy3, vy7);                                        \
                                                                        \
    vy0 = vec_mergeh (vx0, vx4);                                        \
    vy1 = vec_mergel (vx0, vx4);                                        \
    vy2 = vec_mergeh (vx1, vx5);                                        \
    vy3 = vec_mergel (vx1, vx5);                                        \
    vy4 = vec_mergeh (vx2, vx6);                                        \
    vy5 = vec_mergel (vx2, vx6);                                        \
    vy6 = vec_mergeh (vx3, vx7);                                        \
    vy7 = vec_mergel (vx3, vx7);                                        \
                                                                        \
    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias);                       \
    vx1 = vec_mergel (vy0, vy4);                                        \
    vx2 = vec_mergeh (vy1, vy5);                                        \
    vx3 = vec_mergel (vy1, vy5);                                        \
    vx4 = vec_mergeh (vy2, vy6);                                        \
    vx5 = vec_mergel (vy2, vy6);                                        \
    vx6 = vec_mergeh (vy3, vy7);                                        \
    vx7 = vec_mergel (vy3, vy7);                                        \
                                                                        \
    /* second 1-D pass */                                               \
    IDCT_HALF                                                           \
                                                                        \
    /* descale the final result by 2^6 */                               \
    shift = vec_splat_u16 (6);                                          \
    vx0 = vec_sra (vy0, shift);                                         \
    vx1 = vec_sra (vy1, shift);                                         \
    vx2 = vec_sra (vy2, shift);                                         \
    vx3 = vec_sra (vy3, shift);                                         \
    vx4 = vec_sra (vy4, shift);                                         \
    vx5 = vec_sra (vy5, shift);                                         \
    vx6 = vec_sra (vy6, shift);                                         \
    vx7 = vec_sra (vy7, shift);

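/*
 * Note on the constants (an observation, not from the original source):
 * element 0 of constants[0] is 23170 ~ sqrt(2)/2 in Q15, and elements
 * 1-3 (13573, 6518, 21895) match tan(pi/8), tan(pi/16) and tan(3*pi/16)
 * in the same scale, followed by the negations of c4 and a2 and the
 * 32/31 bias pair.  constants[1..4] hold the per-coefficient prescale
 * factors that the IDCT macro applies while loading the input rows.
 */
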
static const_vector_s16_t constants[5] = {
    (vector_s16_t) AVV(23170, 13573, 6518, 21895, -23170, -21895, 32, 31),
    (vector_s16_t) AVV(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725),
    (vector_s16_t) AVV(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521),
    (vector_s16_t) AVV(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692),
    (vector_s16_t) AVV(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722)
};

void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block)
{
    POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
    vector_u8_t tmp;

#ifdef CONFIG_POWERPC_PERF
    POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
#endif
    IDCT

    /* pack one row to unsigned bytes with saturation and store 8 bytes */
#define COPY(dest,src)                                          \
    tmp = vec_packsu (src, src);                                \
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);       \
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);

    COPY (dest, vx0)    dest += stride;
    COPY (dest, vx1)    dest += stride;
    COPY (dest, vx2)    dest += stride;
    COPY (dest, vx3)    dest += stride;
    COPY (dest, vx4)    dest += stride;
    COPY (dest, vx5)    dest += stride;
    COPY (dest, vx6)    dest += stride;
    COPY (dest, vx7)

    POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
}

void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block)
{
    POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
    vector_u8_t tmp;
    vector_s16_t tmp2, tmp3;
    vector_u8_t perm0;
    vector_u8_t perm1;
    vector_u8_t p0, p1, p;

#ifdef CONFIG_POWERPC_PERF
    POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
#endif

    IDCT

    /* build permute vectors that interleave 0xff with the alignment
       indices of the two destination rows, so that vec_perm against the
       zero vector below zero-extends the 8 destination bytes to shorts */
    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);

#define ADD(dest,src,perm)                                              \
    /* *dest += src */                                                  \
    tmp = vec_ld (0, dest);                                             \
    tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm);       \
    tmp3 = vec_adds (tmp2, src);                                        \
    tmp = vec_packsu (tmp3, tmp3);                                      \
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);               \
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);

    ADD (dest, vx0, perm0)      dest += stride;
    ADD (dest, vx1, perm1)      dest += stride;
    ADD (dest, vx2, perm0)      dest += stride;
    ADD (dest, vx3, perm1)      dest += stride;
    ADD (dest, vx4, perm0)      dest += stride;
    ADD (dest, vx5, perm1)      dest += stride;
    ADD (dest, vx6, perm0)      dest += stride;
    ADD (dest, vx7, perm1)

    POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
}
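
/*
 * Usage sketch (an assumed caller, not part of this file): both entry
 * points expect "block" to point at an 8x8 array of 16-bit coefficients,
 * 16-byte aligned as vec_ld/vec_ste require; "dest" and "stride" describe
 * the 8x8 byte destination.  The names coeffs/line_size below are
 * hypothetical:
 *
 *     int16_t coeffs[64] __attribute__ ((aligned (16)));
 *     // ... fill coeffs with dequantized DCT coefficients ...
 *     idct_put_altivec (dest, line_size, (vector_s16_t *) coeffs);
 *     idct_add_altivec (dest, line_size, (vector_s16_t *) coeffs);
 */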