00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
/* 8xH chroma motion compensation with bilinear 1/8-pel interpolation (MMX).
 *
 * dst, src : destination / source pixel pointers
 * stride   : line size in bytes (used for both src and dst)
 * h        : number of rows to process
 * x, y     : fractional motion-vector position, asserted to be in [0,8)
 * rnd      : 1 = normal rounding; 0 = biased rounding constants
 *            (3 instead of 4 in the 1-D path, 28 instead of 32 in the
 *            2-D path — the "no_rnd" variant)
 *
 * H264_CHROMA_OP (put vs. avg write-back) and H264_CHROMA_MC8_MV0
 * (the x==y==0 copy) are macros supplied by the file that includes
 * this template.  MMX registers mm0-mm7 are used throughout; register
 * state is deliberately carried between the separate asm statements,
 * so their order must not be changed.
 */
static void H264_CHROMA_MC8_TMPL(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y, int rnd)
{
    /* 4x 16-bit 28: rounding constant for the rnd==0 2-D case */
    DECLARE_ALIGNED_8(static const uint64_t, ff_pw_28) = 0x001C001C001C001CULL;
    const uint64_t *rnd_reg;
    DECLARE_ALIGNED_8(uint64_t, AA);   /* holds coefficient A = xy + 64 - 8x - 8y, 4x 16-bit */
    DECLARE_ALIGNED_8(uint64_t, DD);   /* holds coefficient D = x*y, 4x 16-bit */
    int i;

    if(y==0 && x==0) {
        /* no interpolation needed — plain 8xH copy/average */
        H264_CHROMA_MC8_MV0(dst, src, stride, h);
        return;
    }

    assert(x<8 && y<8 && x>=0 && y>=0);

    if(y==0 || x==0)
    {
        /* 1-D fast path: exactly one of x, y is non-zero, so each output
         * pixel blends src[0] with a single neighbour:
         *   dst = (src[0]*(8-k) + src[dxy]*k + rnd) >> 3,  k = x+y
         * dxy selects the horizontal (x!=0) or vertical (y!=0) neighbour. */
        const int dxy = x ? 1 : stride;

        rnd_reg = rnd ? &ff_pw_4 : &ff_pw_3;

        /* loop-invariant setup:
         *   mm5 = k broadcast to 4 words, mm4 = 8-k, mm6 = rounding, mm7 = 0 */
        asm volatile(
            "movd %0, %%mm5\n\t"
            "movq %1, %%mm4\n\t"
            "movq %2, %%mm6\n\t"
            "punpcklwd %%mm5, %%mm5\n\t"
            "punpckldq %%mm5, %%mm5\n\t"
            "pxor %%mm7, %%mm7\n\t"
            "psubw %%mm5, %%mm4\n\t"
            :: "rm"(x+y), "m"(ff_pw_8), "m"(*rnd_reg));

        for(i=0; i<h; i++) {
            /* load the row and its dxy-offset neighbour */
            asm volatile(
                "movq %0, %%mm0\n\t"
                "movq %1, %%mm2\n\t"
                :: "m"(src[0]), "m"(src[dxy]));

            /* widen bytes to words (low half in mm0/mm2, high half in
             * mm1/mm3), apply the two weights, add rounding, shift back
             * down and repack; write-back style chosen by H264_CHROMA_OP */
            asm volatile(
                "movq %%mm0, %%mm1\n\t"
                "movq %%mm2, %%mm3\n\t"
                "punpcklbw %%mm7, %%mm0\n\t"
                "punpckhbw %%mm7, %%mm1\n\t"
                "punpcklbw %%mm7, %%mm2\n\t"
                "punpckhbw %%mm7, %%mm3\n\t"
                "pmullw %%mm4, %%mm0\n\t"
                "pmullw %%mm4, %%mm1\n\t"
                "pmullw %%mm5, %%mm2\n\t"
                "pmullw %%mm5, %%mm3\n\t"
                "paddw %%mm6, %%mm0\n\t"
                "paddw %%mm6, %%mm1\n\t"
                "paddw %%mm2, %%mm0\n\t"
                "paddw %%mm3, %%mm1\n\t"
                "psrlw $3, %%mm0\n\t"
                "psrlw $3, %%mm1\n\t"
                "packuswb %%mm1, %%mm0\n\t"
                H264_CHROMA_OP(%0, %%mm0)
                "movq %%mm0, %0\n\t"
                : "=m" (dst[0]));

            src += stride;
            dst += stride;
        }
        return;
    }

    /* general 2-D case:
     *   dst = (A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1] + rnd) >> 6
     * with A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy. */
    rnd_reg = rnd ? &ff_pw_32.a : &ff_pw_28;

    /* compute the four coefficients once:
     *   mm4 = x, mm6 = y (each broadcast to 4 words)
     *   DD  = mm4*mm6            = xy            (= D)
     *   mm5 = 8x - xy            (= B)
     *   mm6 = 8y - xy            (= C)
     *   AA  = xy + 64 - 8x - 8y  (= A, via mm7 = 8x+8y scratch)
     * mm7 is then cleared for use as the byte-unpack zero. */
    asm volatile("movd %2, %%mm4\n\t"
                 "movd %3, %%mm6\n\t"
                 "punpcklwd %%mm4, %%mm4\n\t"
                 "punpcklwd %%mm6, %%mm6\n\t"
                 "punpckldq %%mm4, %%mm4\n\t"
                 "punpckldq %%mm6, %%mm6\n\t"
                 "movq %%mm4, %%mm5\n\t"
                 "pmullw %%mm6, %%mm4\n\t"
                 "psllw $3, %%mm5\n\t"
                 "psllw $3, %%mm6\n\t"
                 "movq %%mm5, %%mm7\n\t"
                 "paddw %%mm6, %%mm7\n\t"
                 "movq %%mm4, %1\n\t"
                 "psubw %%mm4, %%mm5\n\t"
                 "psubw %%mm4, %%mm6\n\t"
                 "paddw %4, %%mm4\n\t"
                 "psubw %%mm7, %%mm4\n\t"
                 "pxor %%mm7, %%mm7\n\t"
                 "movq %%mm4, %0\n\t"
                 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));

    /* prime the software pipeline: mm0/mm1 = top row's src[0]/src[1];
     * each loop iteration consumes them and reloads them from the next row */
    asm volatile(
        "movq %0, %%mm0\n\t"
        "movq %1, %%mm1\n\t"
        : : "m" (src[0]), "m" (src[1]));

    for(i=0; i<h; i++) {
        src += stride;

        /* accumulate A*s[0] + B*s[1] of the row above into
         * mm2 (low 4 pixels) / mm3 (high 4 pixels) */
        asm volatile(
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpckhbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpcklbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "pmullw %0, %%mm0\n\t"
            "pmullw %0, %%mm2\n\t"
            "pmullw %%mm5, %%mm1\n\t"
            "pmullw %%mm5, %%mm3\n\t"
            "paddw %%mm1, %%mm2\n\t"
            "paddw %%mm0, %%mm3\n\t"
            : : "m" (AA));

        /* add C*s[0] of the current (lower) row */
        asm volatile(
            "movq %0, %%mm0\n\t"
            "movq %%mm0, %%mm1\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpckhbw %%mm7, %%mm1\n\t"
            "pmullw %%mm6, %%mm0\n\t"
            "pmullw %%mm6, %%mm1\n\t"
            "paddw %%mm0, %%mm2\n\t"
            "paddw %%mm1, %%mm3\n\t"
            : : "m" (src[0]));

        /* add D*s[1] of the current row; the final movq reloads mm0 with
         * src[0] (and mm1 keeps src[1]) so the next iteration's "row above"
         * is already in registers */
        asm volatile(
            "movq %1, %%mm1\n\t"
            "movq %%mm1, %%mm0\n\t"
            "movq %%mm1, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpckhbw %%mm7, %%mm4\n\t"
            "pmullw %2, %%mm0\n\t"
            "pmullw %2, %%mm4\n\t"
            "paddw %%mm0, %%mm2\n\t"
            "paddw %%mm4, %%mm3\n\t"
            "movq %0, %%mm0\n\t"
            : : "m" (src[0]), "m" (src[1]), "m" (DD));

        /* round, shift down to 8 bits, repack and store */
        asm volatile(
            "paddw %1, %%mm2\n\t"
            "paddw %1, %%mm3\n\t"
            "psrlw $6, %%mm2\n\t"
            "psrlw $6, %%mm3\n\t"
            "packuswb %%mm3, %%mm2\n\t"
            H264_CHROMA_OP(%0, %%mm2)
            "movq %%mm2, %0\n\t"
            : "=m" (dst[0]) : "m" (*rnd_reg));
        dst+= stride;
    }
}
00188
/* 4xH chroma motion compensation with bilinear 1/8-pel interpolation (MMX).
 *
 * dst, src : destination / source pixel pointers
 * stride   : line size in bytes
 * h        : number of rows; the loop decrements by 2, so h must be even
 * x, y     : fractional motion-vector position (0..7 assumed — no assert here)
 *
 * Always rounds with +32 (no rnd parameter, unlike the 8-wide version).
 * The horizontal pass t = (8-x)*s[0] + x*s[1] for a row is computed once
 * and kept in a register (mm6, alternating with mm0) so the vertical pass
 * ((8-y)*t_above + y*t_cur + 32) >> 6 reuses it — hence the two-rows-per-
 * iteration software pipelining.  H264_CHROMA_OP4 is the put/avg write-back
 * macro supplied by the including file. */
static void H264_CHROMA_MC4_TMPL(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
    asm volatile(
        /* setup: mm2 = x, mm3 = y (broadcast to 4 words),
         *        mm4 = 8-x, mm5 = 8-y, mm7 = 0 */
        "pxor %%mm7, %%mm7 \n\t"
        "movd %5, %%mm2 \n\t"
        "movd %6, %%mm3 \n\t"
        "movq "MANGLE(ff_pw_8)", %%mm4\n\t"
        "movq "MANGLE(ff_pw_8)", %%mm5\n\t"
        "punpcklwd %%mm2, %%mm2 \n\t"
        "punpcklwd %%mm3, %%mm3 \n\t"
        "punpcklwd %%mm2, %%mm2 \n\t"
        "punpcklwd %%mm3, %%mm3 \n\t"
        "psubw %%mm2, %%mm4 \n\t"
        "psubw %%mm3, %%mm5 \n\t"

        /* prologue: horizontal filter of row 0 into mm6 */
        "movd (%1), %%mm0 \n\t"
        "movd 1(%1), %%mm6 \n\t"
        "add %3, %1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm6 \n\t"
        "pmullw %%mm4, %%mm0 \n\t"
        "pmullw %%mm2, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"

        "1: \n\t"
        /* first of the pair: horizontal-filter this row into mm1 (copy in
         * mm0 for the next half-iteration), combine vertically with mm6 */
        "movd (%1), %%mm0 \n\t"
        "movd 1(%1), %%mm1 \n\t"
        "add %3, %1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "pmullw %%mm4, %%mm0 \n\t"
        "pmullw %%mm2, %%mm1 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "movq %%mm1, %%mm0 \n\t"
        "pmullw %%mm5, %%mm6 \n\t"
        "pmullw %%mm3, %%mm1 \n\t"
        "paddw %4, %%mm6 \n\t"
        "paddw %%mm6, %%mm1 \n\t"
        "psrlw $6, %%mm1 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        H264_CHROMA_OP4((%0), %%mm1, %%mm6)
        "movd %%mm1, (%0) \n\t"
        "add %3, %0 \n\t"
        /* second of the pair: same, with mm0/mm6 roles swapped */
        "movd (%1), %%mm6 \n\t"
        "movd 1(%1), %%mm1 \n\t"
        "add %3, %1 \n\t"
        "punpcklbw %%mm7, %%mm6 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "pmullw %%mm4, %%mm6 \n\t"
        "pmullw %%mm2, %%mm1 \n\t"
        "paddw %%mm6, %%mm1 \n\t"
        "movq %%mm1, %%mm6 \n\t"
        "pmullw %%mm5, %%mm0 \n\t"
        "pmullw %%mm3, %%mm1 \n\t"
        "paddw %4, %%mm0 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "psrlw $6, %%mm1 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        H264_CHROMA_OP4((%0), %%mm1, %%mm0)
        "movd %%mm1, (%0) \n\t"
        "add %3, %0 \n\t"
        "sub $2, %2 \n\t"
        "jnz 1b \n\t"
        : "+r"(dst), "+r"(src), "+r"(h)
        : "r"((long)stride), "m"(ff_pw_32), "m"(x), "m"(y)
        );
}
00256
#ifdef H264_CHROMA_MC2_TMPL
/* 2xH chroma motion compensation with bilinear 1/8-pel interpolation.
 * Uses pshufw, so this template is only compiled for MMX2-capable targets
 * (guarded by the #ifdef).
 *
 * dst, src : destination / source pixel pointers
 * stride   : line size in bytes
 * h        : number of rows (any positive value; loop step is 1)
 * x, y     : fractional motion-vector position (0..7 assumed)
 *
 * Coefficient packing trick: tmp = 65535*x + 8 = (x<<16) + (8-x), i.e. a
 * 32-bit value with (8-x) in the low word and x in the high word (for
 * 0 <= x <= 8).  Then
 *   CD = tmp*y        packs C = (8-x)*y  (low) and D = x*y      (high)
 *   AB = (tmp<<3)-CD  packs A = (8-x)(8-y) (low) and B = x(8-y) (high)
 * so one pmaddwd per row computes A*s[i] + B*s[i+1] for two pixels at
 * once.  H264_CHROMA_OP4 is the put/avg write-back macro from the
 * including file. */
static void H264_CHROMA_MC2_TMPL(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
    int tmp = ((1<<16)-1)*x + 8;
    int CD= tmp*y;
    int AB= (tmp<<3) - CD;
    asm volatile(
        /* setup: mm5 = [A B A B] words, mm6 = [C D C D] words, mm7 = 0;
         * mm2 = current row's pixels arranged as words [s0 s1 s1 s2]
         * (pshufw $0x94 duplicates the middle word) so pmaddwd with mm5
         * yields A*s0+B*s1 and A*s1+B*s2 — both output pixels at once */
        "movd %0, %%mm5\n\t"
        "movd %1, %%mm6\n\t"
        "punpckldq %%mm5, %%mm5\n\t"
        "punpckldq %%mm6, %%mm6\n\t"
        "pxor %%mm7, %%mm7\n\t"
        "movd %2, %%mm2\n\t"
        "punpcklbw %%mm7, %%mm2\n\t"
        "pshufw $0x94, %%mm2, %%mm2\n\t"
        :: "r"(AB), "r"(CD), "m"(src[0]));

    asm volatile(
        "1:\n\t"
        "add %4, %1\n\t"
        /* mm1 = (A,B) dot the row above (kept in mm2 from the previous
         * iteration / the setup block) */
        "movq %%mm2, %%mm1\n\t"
        "pmaddwd %%mm5, %%mm1\n\t"
        /* load the current row, same [s0 s1 s1 s2] layout; keep a copy in
         * mm2 for the next iteration, then add its (C,D) dot product */
        "movd (%1), %%mm0\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "pshufw $0x94, %%mm0, %%mm0\n\t"
        "movq %%mm0, %%mm2\n\t"
        "pmaddwd %%mm6, %%mm0\n\t"
        /* +32 and >>6 via word ops is safe: each dword sum is at most
         * 64*255+32 < 2^16, so the high word of every dword is zero */
        "paddw %3, %%mm1\n\t"
        "paddw %%mm0, %%mm1\n\t"
        "psrlw $6, %%mm1\n\t"
        "packssdw %%mm7, %%mm1\n\t"
        "packuswb %%mm7, %%mm1\n\t"
        H264_CHROMA_OP4((%0), %%mm1, %%mm3)
        /* store only 2 bytes: bounce through esi and do a 16-bit movw */
        "movd %%mm1, %%esi\n\t"
        "movw %%si, (%0)\n\t"
        "add %4, %0\n\t"
        "sub $1, %2\n\t"
        "jnz 1b\n\t"
        : "+r" (dst), "+r"(src), "+r"(h)
        : "m" (ff_pw_32), "r"((long)stride)
        : "%esi");

}
#endif
00309