diff --git a/postproc/swscale.c b/postproc/swscale.c index dbf61ce8df..7bca3c39b9 100644 --- a/postproc/swscale.c +++ b/postproc/swscale.c @@ -117,10 +117,6 @@ untested special converters extern int verbose; // defined in mplayer.c /* NOTES - -known BUGS with known cause (no bugreports please!, but patches are welcome :) ) -horizontal fast_bilinear MMX2 scaler reads 1-7 samples too much (might cause a sig11) - Special versions: fast Y 1:1 scaling (no interpolation in y direction) TODO @@ -1020,12 +1016,17 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out } #ifdef ARCH_X86 -static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode) +static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits) { - uint8_t *fragment; - int imm8OfPShufW1; - int imm8OfPShufW2; - int fragmentLength; + uint8_t *fragmentA; + int imm8OfPShufW1A; + int imm8OfPShufW2A; + int fragmentLengthA; + uint8_t *fragmentB; + int imm8OfPShufW1B; + int imm8OfPShufW2B; + int fragmentLengthB; + int fragmentPos; int xpos, i; @@ -1037,22 +1038,18 @@ static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode) "jmp 9f \n\t" // Begin "0: \n\t" - "movq (%%esi), %%mm0 \n\t" //FIXME Alignment - "movq %%mm0, %%mm1 \n\t" - "psrlq $8, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "addw %%bx, %%cx \n\t" //2*xalpha += (4*lumXInc)&0xFFFF + "movq (%%edx, %%eax), %%mm3 \n\t" + "movd (%%ecx, %%esi), %%mm0 \n\t" + "movd 1(%%ecx, %%esi), %%mm1 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" "pshufw $0xFF, %%mm1, %%mm1 \n\t" "1: \n\t" - "adcl %%edx, %%esi \n\t" //xx+= (4*lumXInc)>>16 + carry "pshufw $0xFF, %%mm0, %%mm0 \n\t" "2: \n\t" - "psrlw $9, %%mm3 \n\t" "psubw %%mm1, %%mm0 \n\t" + "movl 8(%%ebx, %%eax), %%esi \n\t" "pmullw %%mm3, %%mm0 \n\t" - "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF "psllw $7, %%mm1 \n\t" "paddw %%mm1, %%mm0 \n\t" @@ -1071,13 +1068,54 @@ static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode) "subl %0, %2 \n\t" "leal 9b, %3 \n\t" "subl %0, %3 \n\t" - :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2), - "=r" (fragmentLength) + + + :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A), + "=r" (fragmentLengthA) + ); + + asm volatile( + "jmp 9f \n\t" + // Begin + "0: \n\t" + "movq (%%edx, %%eax), %%mm3 \n\t" + "movd (%%ecx, %%esi), %%mm0 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "pshufw $0xFF, %%mm0, %%mm1 \n\t" + "1: \n\t" + "pshufw $0xFF, %%mm0, %%mm0 \n\t" + "2: \n\t" + "psubw %%mm1, %%mm0 \n\t" + "movl 8(%%ebx, %%eax), %%esi \n\t" + "pmullw %%mm3, %%mm0 \n\t" + "psllw $7, %%mm1 \n\t" + "paddw %%mm1, %%mm0 \n\t" + + "movq %%mm0, (%%edi, %%eax) \n\t" + + "addl $8, %%eax \n\t" + // End + "9: \n\t" +// "int $3\n\t" + "leal 0b, %0 \n\t" + "leal 1b, %1 \n\t" + "leal 2b, %2 \n\t" + "decl %1 \n\t" + "decl %2 \n\t" + "subl %0, %1 \n\t" + "subl %0, %2 \n\t" + "leal 9b, %3 \n\t" + "subl %0, %3 \n\t" + + + :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B), + "=r" (fragmentLengthB) ); xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers - - for(i=0; i>16; @@ -1088,20 +1126,65 @@ static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode) int c=((xpos+xInc*2)>>16) - xx; int d=((xpos+xInc*3)>>16) - xx; - memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength); + filter[i ] = (( xpos & 0xFFFF) ^ 0xFFFF)>>9; + filter[i+1] = (((xpos+xInc ) & 0xFFFF) ^ 0xFFFF)>>9; + filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9; + filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9; + filterPos[i/2]= xx; - funnyCode[fragmentLength*i/4 + imm8OfPShufW1]= - funnyCode[fragmentLength*i/4 + imm8OfPShufW2]= - a | (b<<2) | (c<<4) | (d<<6); + if(d+1<4) + { + int maxShift= 3-(d+1); + int shift=0; - // if we dont need to read 8 bytes than dont :), reduces the chance of - // crossing a cache line - if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E; + memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB); - funnyCode[fragmentLength*(i+4)/4]= RET; + funnyCode[fragmentPos + imm8OfPShufW1B]= + (a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6); + funnyCode[fragmentPos + imm8OfPShufW2B]= + a | (b<<2) | (c<<4) | (d<<6); + + if(i+3>=dstW) shift=maxShift; //avoid overread + else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align + + if(shift && i>=shift) + { + funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift; + funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift; + filterPos[i/2]-=shift; + } + + fragmentPos+= fragmentLengthB; + } + else + { + int maxShift= 3-d; + int shift=0; + + memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA); + + funnyCode[fragmentPos + imm8OfPShufW1A]= + funnyCode[fragmentPos + imm8OfPShufW2A]= + a | (b<<2) | (c<<4) | (d<<6); + + if(i+4>=dstW) shift=maxShift; //avoid overread + else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align + + if(shift && i>=shift) + { + funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift; + funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift; + filterPos[i/2]-=shift; + } + + fragmentPos+= fragmentLengthA; + } + + funnyCode[fragmentPos]= RET; } xpos+=xInc; } + filterPos[i/2]= xpos>>16; // needed to jump to the next part } #endif // ARCH_X86 @@ -1565,8 +1648,13 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, // cant downscale !!! if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR)) { - initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode); - initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode); + c->lumMmx2Filter = (int16_t*)memalign(8, (dstW /8+8)*sizeof(int16_t)); + c->chrMmx2Filter = (int16_t*)memalign(8, (c->chrDstW /4+8)*sizeof(int16_t)); + c->lumMmx2FilterPos= (int32_t*)memalign(8, (dstW /2/8+8)*sizeof(int32_t)); + c->chrMmx2FilterPos= (int32_t*)memalign(8, (c->chrDstW/2/4+8)*sizeof(int32_t)); + + initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8); + initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4); } #endif } // Init Horizontal stuff @@ -2014,6 +2102,15 @@ void freeSwsContext(SwsContext *c){ if(c->chrMmxFilter) free(c->chrMmxFilter); c->chrMmxFilter = NULL; + if(c->lumMmx2Filter) free(c->lumMmx2Filter); + c->lumMmx2Filter=NULL; + if(c->chrMmx2Filter) free(c->chrMmx2Filter); + c->chrMmx2Filter=NULL; + if(c->lumMmx2FilterPos) free(c->lumMmx2FilterPos); + c->lumMmx2FilterPos=NULL; + if(c->chrMmx2FilterPos) free(c->chrMmx2FilterPos); + c->chrMmx2FilterPos=NULL; + free(c); } diff --git a/postproc/swscale.h b/postproc/swscale.h index 03b63a6501..97584672da 100644 --- a/postproc/swscale.h +++ b/postproc/swscale.h @@ -69,6 +69,10 @@ typedef struct SwsContext{ uint8_t __attribute__((aligned(32))) funnyYCode[10000]; uint8_t __attribute__((aligned(32))) funnyUVCode[10000]; + int32_t *lumMmx2FilterPos; + int32_t *chrMmx2FilterPos; + int16_t *lumMmx2Filter; + int16_t *chrMmx2Filter; int canMMX2BeUsed; diff --git a/postproc/swscale_template.c b/postproc/swscale_template.c index 291ba0ccfb..e76020eab7 100644 --- a/postproc/swscale_template.c +++ b/postproc/swscale_template.c @@ -2238,7 +2238,8 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hLumFilter, int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, - int srcFormat, uint8_t *formatConvBuffer) + int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, + int32_t *mmx2FilterPos) { if(srcFormat==IMGFMT_YUY2) { @@ -2294,35 +2295,21 @@ static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, in { asm volatile( "pxor %%mm7, %%mm7 \n\t" - "pxor %%mm2, %%mm2 \n\t" // 2*xalpha - "movd %5, %%mm6 \n\t" // xInc&0xFFFF - "punpcklwd %%mm6, %%mm6 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" - "movq %%mm6, %%mm2 \n\t" - "psllq $16, %%mm2 \n\t" - "paddw %%mm6, %%mm2 \n\t" - "psllq $16, %%mm2 \n\t" - "paddw %%mm6, %%mm2 \n\t" - "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF - "movq %%mm2, %%mm4 \n\t" - "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF - "punpcklwd %%mm6, %%mm6 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" + "movl %0, %%ecx \n\t" + "movl %1, %%edi \n\t" + "movl %2, %%edx \n\t" + "movl %3, %%ebx \n\t" "xorl %%eax, %%eax \n\t" // i - "movl %0, %%esi \n\t" // src - "movl %1, %%edi \n\t" // buf1 - "movl %3, %%edx \n\t" // (xInc*4)>>16 - "xorl %%ecx, %%ecx \n\t" - "xorl %%ebx, %%ebx \n\t" - "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF + PREFETCH" (%%ecx) \n\t" + PREFETCH" 32(%%ecx) \n\t" + PREFETCH" 64(%%ecx) \n\t" #define FUNNY_Y_CODE \ - PREFETCH" 1024(%%esi) \n\t"\ - PREFETCH" 1056(%%esi) \n\t"\ - PREFETCH" 1088(%%esi) \n\t"\ - "call *%6 \n\t"\ - "movq %%mm4, %%mm2 \n\t"\ - "xorl %%ecx, %%ecx \n\t" + "movl (%%ebx), %%esi \n\t"\ + "call *%4 \n\t"\ + "addl (%%ebx, %%eax), %%ecx \n\t"\ + "addl %%eax, %%edi \n\t"\ + "xorl %%eax, %%eax \n\t"\ FUNNY_Y_CODE FUNNY_Y_CODE @@ -2333,8 +2320,8 @@ FUNNY_Y_CODE FUNNY_Y_CODE FUNNY_Y_CODE - :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16), - "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode) + :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), + "m" (funnyYCode) : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" ); for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; @@ -2402,7 +2389,8 @@ FUNNY_Y_CODE inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2, int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter, int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, - int srcFormat, uint8_t *formatConvBuffer) + int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, + int32_t *mmx2FilterPos) { if(srcFormat==IMGFMT_YUY2) { @@ -2469,65 +2457,44 @@ inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, u if(canMMX2BeUsed) { asm volatile( - "pxor %%mm7, %%mm7 \n\t" - "pxor %%mm2, %%mm2 \n\t" // 2*xalpha - "movd %5, %%mm6 \n\t" // xInc&0xFFFF - "punpcklwd %%mm6, %%mm6 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" - "movq %%mm6, %%mm2 \n\t" - "psllq $16, %%mm2 \n\t" - "paddw %%mm6, %%mm2 \n\t" - "psllq $16, %%mm2 \n\t" - "paddw %%mm6, %%mm2 \n\t" - "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF - "movq %%mm2, %%mm4 \n\t" - "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF - "punpcklwd %%mm6, %%mm6 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" - "xorl %%eax, %%eax \n\t" // i - "movl %0, %%esi \n\t" // src - "movl %1, %%edi \n\t" // buf1 - "movl %3, %%edx \n\t" // (xInc*4)>>16 - "xorl %%ecx, %%ecx \n\t" - "xorl %%ebx, %%ebx \n\t" - "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF + "pxor %%mm7, %%mm7 \n\t" + "movl %0, %%ecx \n\t" + "movl %1, %%edi \n\t" + "movl %2, %%edx \n\t" + "movl %3, %%ebx \n\t" + "xorl %%eax, %%eax \n\t" // i + PREFETCH" (%%ecx) \n\t" + PREFETCH" 32(%%ecx) \n\t" + PREFETCH" 64(%%ecx) \n\t" -#define FUNNYUVCODE \ - PREFETCH" 1024(%%esi) \n\t"\ - PREFETCH" 1056(%%esi) \n\t"\ - PREFETCH" 1088(%%esi) \n\t"\ - "call *%7 \n\t"\ - "movq %%mm4, %%mm2 \n\t"\ - "xorl %%ecx, %%ecx \n\t" +#define FUNNY_UV_CODE \ + "movl (%%ebx), %%esi \n\t"\ + "call *%4 \n\t"\ + "addl (%%ebx, %%eax), %%ecx \n\t"\ + "addl %%eax, %%edi \n\t"\ + "xorl %%eax, %%eax \n\t"\ -FUNNYUVCODE -FUNNYUVCODE -FUNNYUVCODE -FUNNYUVCODE +FUNNY_UV_CODE +FUNNY_UV_CODE +FUNNY_UV_CODE +FUNNY_UV_CODE + "xorl %%eax, %%eax \n\t" // i + "movl %5, %%ecx \n\t" // src + "movl %1, %%edi \n\t" // buf1 + "addl $4096, %%edi \n\t" + PREFETCH" (%%ecx) \n\t" + PREFETCH" 32(%%ecx) \n\t" + PREFETCH" 64(%%ecx) \n\t" -FUNNYUVCODE -FUNNYUVCODE -FUNNYUVCODE -FUNNYUVCODE - "xorl %%eax, %%eax \n\t" // i - "movl %6, %%esi \n\t" // src - "movl %1, %%edi \n\t" // buf1 - "addl $4096, %%edi \n\t" +FUNNY_UV_CODE +FUNNY_UV_CODE +FUNNY_UV_CODE +FUNNY_UV_CODE -FUNNYUVCODE -FUNNYUVCODE -FUNNYUVCODE -FUNNYUVCODE - -FUNNYUVCODE -FUNNYUVCODE -FUNNYUVCODE -FUNNYUVCODE - - :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16), - "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode) - : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" - ); + :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), + "m" (funnyUVCode), "m" (src2) + : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" + ); for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) { // printf("%d %d %d\n", dstWidth, i, srcW); @@ -2749,7 +2716,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar // printf("%d %d\n", lumBufIndex, vLumBufSize); RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, - funnyYCode, c->srcFormat, formatConvBuffer); + funnyYCode, c->srcFormat, formatConvBuffer, + c->lumMmx2Filter, c->lumMmx2FilterPos); lastInLumBuf++; } while(lastInChrBuf < lastChrSrcY) @@ -2763,7 +2731,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar //FIXME replace parameters through context struct (some at least) RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc, flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, - funnyUVCode, c->srcFormat, formatConvBuffer); + funnyUVCode, c->srcFormat, formatConvBuffer, + c->chrMmx2Filter, c->chrMmx2FilterPos); lastInChrBuf++; } //wrap buf index around to stay inside the ring buffer @@ -2787,7 +2756,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, - funnyYCode, c->srcFormat, formatConvBuffer); + funnyYCode, c->srcFormat, formatConvBuffer, + c->lumMmx2Filter, c->lumMmx2FilterPos); lastInLumBuf++; } while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1)) @@ -2800,7 +2770,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc, flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, - funnyUVCode, c->srcFormat, formatConvBuffer); + funnyUVCode, c->srcFormat, formatConvBuffer, + c->chrMmx2Filter, c->chrMmx2FilterPos); lastInChrBuf++; } //wrap buf index around to stay inside the ring buffer