1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-23 12:43:46 +02:00

fixed a rounding bug in the X1 Filter

changed the X1 Filter slightly to make flat blocks look like in the 9tap lpf
minor change to the -pp numbers & added decimal numbers in comments
new experimental horizontal deblocking filter

Originally committed as revision 2180 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
This commit is contained in:
Michael Niedermayer 2001-10-13 02:31:15 +00:00
parent 67b4cf184a
commit 9f45d04d3a
3 changed files with 510 additions and 67 deletions

View File

@ -27,8 +27,9 @@ isHorizMinMaxOk a
doHorizLowPass E a a* doHorizLowPass E a a*
doHorizDefFilter E ac ac doHorizDefFilter E ac ac
deRing deRing
RKAlgo1 E a a* Vertical RKAlgo1 E a a*
X1 a E E* Vertical X1 a E E*
Horizontal X1 a E E*
* i dont have a 3dnow CPU -> its untested * i dont have a 3dnow CPU -> its untested
@ -40,7 +41,7 @@ c = checked against the other implementations (-vo md5)
/* /*
TODO: TODO:
verify that everything workes as it should verify that everything workes as it should (how?)
reduce the time wasted on the mem transfer reduce the time wasted on the mem transfer
implement dering implement dering
implement everything in C at least (done at the moment but ...) implement everything in C at least (done at the moment but ...)
@ -51,6 +52,9 @@ write a faster and higher quality deblocking filter :)
do something about the speed of the horizontal filters do something about the speed of the horizontal filters
make the mainloop more flexible (variable number of blocks at once make the mainloop more flexible (variable number of blocks at once
(the if/else stuff per block is slowing things down) (the if/else stuff per block is slowing things down)
compare the quality & speed of all filters
implement a few simple deinterlacing filters
split this huge file
... ...
Notes: Notes:
@ -58,7 +62,7 @@ Notes:
*/ */
/* /*
Changelog: Changelog: use the CVS log
0.1.3 0.1.3
bugfixes: last 3 lines not brightness/contrast corrected bugfixes: last 3 lines not brightness/contrast corrected
brightness statistics messed up with initial black pic brightness statistics messed up with initial black pic
@ -99,11 +103,13 @@ static uint64_t bm10000000= 0xFF00000000000000LL;
static uint64_t bm10000001= 0xFF000000000000FFLL; static uint64_t bm10000001= 0xFF000000000000FFLL;
static uint64_t bm11000011= 0xFFFF00000000FFFFLL; static uint64_t bm11000011= 0xFFFF00000000FFFFLL;
static uint64_t bm00000011= 0x000000000000FFFFLL; static uint64_t bm00000011= 0x000000000000FFFFLL;
static uint64_t bm11111110= 0xFFFFFFFFFFFFFF00LL;
static uint64_t bm11000000= 0xFFFF000000000000LL; static uint64_t bm11000000= 0xFFFF000000000000LL;
static uint64_t bm00011000= 0x000000FFFF000000LL; static uint64_t bm00011000= 0x000000FFFF000000LL;
static uint64_t bm00110011= 0x0000FFFF0000FFFFLL; static uint64_t bm00110011= 0x0000FFFF0000FFFFLL;
static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; static uint64_t bm11001100= 0xFFFF0000FFFF0000LL;
static uint64_t b00= 0x0000000000000000LL; static uint64_t b00= 0x0000000000000000LL;
static uint64_t b01= 0x0101010101010101LL;
static uint64_t b02= 0x0202020202020202LL; static uint64_t b02= 0x0202020202020202LL;
static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL;
static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL;
@ -544,7 +550,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
x/8 = 1 x/8 = 1
1 12 12 23 1 12 12 23
*/ */
static inline void vertRKFilter(uint8_t *src, int stride, int QP) static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
{ {
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// FIXME rounding // FIXME rounding
@ -638,7 +644,8 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
/** /**
* Experimental Filter 1 * Experimental Filter 1
* will nor damage linear gradients * will not damage linear gradients
* Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
* can only smooth blocks at the expected locations (it cant smooth them if they did move) * can only smooth blocks at the expected locations (it cant smooth them if they did move)
* MMX2 version does correct clipping C version doesnt * MMX2 version does correct clipping C version doesnt
*/ */
@ -675,9 +682,13 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
"movq %%mm4, %%mm3 \n\t" // d "movq %%mm4, %%mm3 \n\t" // d
"psubusb pQPb, %%mm4 \n\t" "psubusb pQPb, %%mm4 \n\t"
"pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
"psubusb b01, %%mm3 \n\t"
"pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
PAVGB(%%mm7, %%mm3) // d/2 PAVGB(%%mm7, %%mm3) // d/2
"movq %%mm3, %%mm1 \n\t" // d/2
PAVGB(%%mm7, %%mm3) // d/4
PAVGB(%%mm1, %%mm3) // 3*d/8
"movq (%0, %1, 4), %%mm0 \n\t" // line 4 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
@ -691,31 +702,31 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
"pxor %%mm2, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%ebx) \n\t" // line 5 "movq %%mm0, (%%ebx) \n\t" // line 5
PAVGB(%%mm7, %%mm3) // d/4 PAVGB(%%mm7, %%mm1) // d/4
"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
"psubusb %%mm3, %%mm0 \n\t" "psubusb %%mm1, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
"movq (%%ebx, %1), %%mm0 \n\t" // line 6 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
"paddusb %%mm3, %%mm0 \n\t" "paddusb %%mm1, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%ebx, %1) \n\t" // line 6 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
PAVGB(%%mm7, %%mm3) // d/8 PAVGB(%%mm7, %%mm1) // d/8
"movq (%%eax, %1), %%mm0 \n\t" // line 2 "movq (%%eax, %1), %%mm0 \n\t" // line 2
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
"psubusb %%mm3, %%mm0 \n\t" "psubusb %%mm1, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%eax, %1) \n\t" // line 2 "movq %%mm0, (%%eax, %1) \n\t" // line 2
"movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
"paddusb %%mm3, %%mm0 \n\t" "paddusb %%mm1, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
@ -739,7 +750,7 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
{ {
int a= src[l3] - src[l4]; int a= src[l3] - src[l4];
int b= src[l4] - src[l5]; int b= src[l4] - src[l5];
int c= src[l6] - src[l7]; int c= src[l5] - src[l6];
int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
@ -749,8 +760,8 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
src[l2] +=v/8; src[l2] +=v/8;
src[l3] +=v/4; src[l3] +=v/4;
src[l4] +=v/2; src[l4] +=3*v/8;
src[l5] -=v/2; src[l5] -=3*v/8;
src[l6] -=v/4; src[l6] -=v/4;
src[l7] -=v/8; src[l7] -=v/8;
@ -789,6 +800,211 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
#endif #endif
} }
/**
 * Experimental Filter 1 (Horizontal)
 * will not damage linear gradients
 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 * (the active lookup table below uses the piecewise linear weights instead, the
 *  9-tap weights are kept as a commented-out alternative)
 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
 * MMX2 version does correct clipping (saturating adds), C version doesn't
 * not identical with the vertical one
 *
 * @param src    first of the 8 pixels of the first row; the caller passes dstBlock-4
 * @param stride distance in bytes between two consecutive rows
 * @param QP     quantizer of the block, used as the filter threshold
 */
static inline void horizX1Filter(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	// lut[i] holds the 8 signed byte corrections which are added to the 8 pixels of a row,
	// it is built once on the first call and never freed
	static uint64_t *lut= NULL;
	if(lut==NULL)
	{
		int i;
		lut= (uint64_t*)memalign(8, 256*8);
		if(lut==NULL) return; // out of memory -> leave the block unfiltered, retry on the next call
		for(i=0; i<256; i++)
		{
			// i is read as a signed byte and doubled
			int v= i < 128 ? 2*i : 2*(i-256);
/*
//Simulate 112242211 9-Tap filter
			uint64_t a= (v/16) & 0xFF;
			uint64_t b= (v/8) & 0xFF;
			uint64_t c= (v/4) & 0xFF;
			uint64_t d= (3*v/8) & 0xFF;
*/
//Simulate piecewise linear interpolation
			uint64_t a= (v/16) & 0xFF;
			uint64_t b= (v*3/16) & 0xFF;
			uint64_t c= (v*5/16) & 0xFF;
			uint64_t d= (7*v/16) & 0xFF;
			// negated (two's complement) counterparts for the other side of the edge
			uint64_t A= (0x100 - a)&0xFF;
			uint64_t B= (0x100 - b)&0xFF;
			uint64_t C= (0x100 - c)&0xFF;
			uint64_t D= (0x100 - d)&0xFF; // was (0x100 - c): byte 3 did not mirror byte 4

			// byte 0 (pixel 0) ... byte 7 (pixel 7): -a -b -c -d +d +c +b +a
			lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
			         (D<<24) | (C<<16) | (B<<8) | (A);
		}
	}

	asm volatile(
		"pxor %%mm7, %%mm7 \n\t" // 0
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"

		"movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
		"movd %2, %%mm5 \n\t" // QP
		"movq %%mm5, %%mm4 \n\t"
		"paddusb %%mm5, %%mm5 \n\t" // 2QP
		"paddusb %%mm5, %%mm4 \n\t" // 3QP
		"pxor %%mm5, %%mm5 \n\t" // 0
		"psubb %%mm4, %%mm5 \n\t" // -3QP
		"por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
		"psllq $24, %%mm5 \n\t" // byte 3 = -3QP, bytes 4..7 = FF, bytes 0..2 = 0

//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

// previous one-row-at-a-time variant, only referenced by the commented-out list below
#define HX1old(a) \
		"movd " #a ", %%mm0 \n\t"\
		"movd 4" #a ", %%mm1 \n\t"\
		"punpckldq %%mm1, %%mm0 \n\t"\
		"movq %%mm0, %%mm1 \n\t"\
		"movq %%mm0, %%mm2 \n\t"\
		"psrlq $8, %%mm1 \n\t"\
		"psubusb %%mm1, %%mm2 \n\t"\
		"psubusb %%mm0, %%mm1 \n\t"\
		"por %%mm2, %%mm1 \n\t" /* p'x = |px - p(x+1)| */\
		"pcmpeqb %%mm7, %%mm2 \n\t" /* p'x = sgn[px - p(x+1)] */\
		"pshufw $0x00, %%mm1, %%mm3 \n\t" /* p'5 = |p1 - p2| */\
		PAVGB(%%mm1, %%mm3) /* p'5 = (|p2-p1| + |p6-p5|)/2 */\
		"psrlq $16, %%mm3 \n\t" /* p'3 = (|p2-p1| + |p6-p5|)/2 */\
		"psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
		"paddb %%mm5, %%mm1 \n\t"\
		"psubusb %%mm5, %%mm1 \n\t"\
		PAVGB(%%mm7, %%mm1)\
		"pxor %%mm2, %%mm1 \n\t"\
		"psubb %%mm2, %%mm1 \n\t"\
		"psrlq $24, %%mm1 \n\t"\
		"movd %%mm1, %%ecx \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"movq %%mm0, " #a " \n\t"

/*
HX1old((%0))
HX1old((%%eax))
HX1old((%%eax, %1))
HX1old((%%eax, %1, 2))
HX1old((%0, %1, 4))
HX1old((%%ebx))
HX1old((%%ebx, %1))
HX1old((%%ebx, %1, 2))
*/

//FIXME add some comments, its unreadable ...
// Processes 4 rows per expansion: the lut index computed from row a is used for
// rows a and c, the lut index computed from row b is used for rows b and d
#define HX1b(a, c, b, d) \
		"movd " #a ", %%mm0 \n\t"\
		"movd 4" #a ", %%mm1 \n\t"\
		"punpckldq %%mm1, %%mm0 \n\t"\
		"movd " #b ", %%mm4 \n\t"\
		"movq %%mm0, %%mm1 \n\t"\
		"movq %%mm0, %%mm2 \n\t"\
		"psrlq $8, %%mm1 \n\t"\
		"movd 4" #b ", %%mm3 \n\t"\
		"psubusb %%mm1, %%mm2 \n\t"\
		"psubusb %%mm0, %%mm1 \n\t"\
		"por %%mm2, %%mm1 \n\t" /* p'x = |px - p(x+1)| */\
		"pcmpeqb %%mm7, %%mm2 \n\t" /* p'x = sgn[px - p(x+1)] */\
		"punpckldq %%mm3, %%mm4 \n\t"\
		"movq %%mm1, %%mm3 \n\t"\
		"psllq $32, %%mm3 \n\t" /* p'5 = |p1 - p2| */\
		PAVGB(%%mm1, %%mm3) /* p'5 = (|p2-p1| + |p6-p5|)/2 */\
		"paddb %%mm6, %%mm0 \n\t"\
		"psrlq $16, %%mm3 \n\t" /* p'3 = (|p2-p1| + |p6-p5|)/2 */\
		"psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
		"movq %%mm4, %%mm3 \n\t"\
		"paddb %%mm5, %%mm1 \n\t"\
		"psubusb %%mm5, %%mm1 \n\t"\
		"psrlq $8, %%mm3 \n\t"\
		PAVGB(%%mm7, %%mm1)\
		"pxor %%mm2, %%mm1 \n\t"\
		"psubb %%mm2, %%mm1 \n\t"\
		"movq %%mm4, %%mm2 \n\t"\
		"psrlq $24, %%mm1 \n\t"\
		"psubusb %%mm3, %%mm2 \n\t"\
		"movd %%mm1, %%ecx \n\t"\
		"psubusb %%mm4, %%mm3 \n\t"\
		"paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
		"por %%mm2, %%mm3 \n\t" /* p'x = |px - p(x+1)| */\
		"paddb %%mm6, %%mm0 \n\t"\
		"pcmpeqb %%mm7, %%mm2 \n\t" /* p'x = sgn[px - p(x+1)] */\
		"movq %%mm3, %%mm1 \n\t"\
		"psllq $32, %%mm1 \n\t" /* p'5 = |p1 - p2| */\
		"movq %%mm0, " #a " \n\t"\
		PAVGB(%%mm3, %%mm1) /* p'5 = (|p2-p1| + |p6-p5|)/2 */\
		"paddb %%mm6, %%mm4 \n\t"\
		"psrlq $16, %%mm1 \n\t" /* p'3 = (|p2-p1| + |p6-p5|)/2 */\
		"psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
		"paddb %%mm5, %%mm3 \n\t"\
		"psubusb %%mm5, %%mm3 \n\t"\
		PAVGB(%%mm7, %%mm3)\
		"pxor %%mm2, %%mm3 \n\t"\
		"psubb %%mm2, %%mm3 \n\t"\
		"psrlq $24, %%mm3 \n\t"\
		"movd " #c ", %%mm0 \n\t"\
		"movd 4" #c ", %%mm1 \n\t"\
		"punpckldq %%mm1, %%mm0 \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"movq %%mm0, " #c " \n\t"\
		"movd %%mm3, %%ecx \n\t"\
		"movd " #d ", %%mm0 \n\t"\
		"paddsb (%3, %%ecx, 8), %%mm4 \n\t"\
		"movd 4" #d ", %%mm1 \n\t"\
		"paddb %%mm6, %%mm4 \n\t"\
		"punpckldq %%mm1, %%mm0 \n\t"\
		"movq %%mm4, " #b " \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"movq %%mm0, " #d " \n\t"

HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))

		:
		: "r" (src), "r" (stride), "r" (QP), "r" (lut)
		// "memory": the asm stores to the src rows and loads from lut behind the compilers back
		: "%eax", "%ebx", "%ecx", "memory"
	);
#else
	int y;

//FIXME (has little in common with the mmx2 version)
	for(y=0; y<BLOCK_SIZE; y++)
	{
		int a= src[1] - src[2];
		int b= src[3] - src[4]; // step across the block edge (between pixel 3 and 4)
		int c= src[5] - src[6];

		// edge step minus the average step inside the two neighbouring blocks
		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);

		if(d < QP)
		{
			int v = d * SIGN(-b);

			// no clipping here, the sums are stored straight back into the uint8_t pixels
			src[1] +=v/8;
			src[2] +=v/4;
			src[3] +=3*v/8;
			src[4] -=3*v/8;
			src[5] -=v/4;
			src[6] -=v/8;
		}
		src+=stride;
	}
#endif
}
static inline void doVertDefFilter(uint8_t src[], int stride, int QP) static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
{ {
@ -1638,13 +1854,14 @@ void postprocess(unsigned char * src[], int src_stride,
vertical_size >>= 1; vertical_size >>= 1;
src_stride >>= 1; src_stride >>= 1;
dst_stride >>= 1; dst_stride >>= 1;
mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
if(1) if(1)
{ {
postProcess(src[1], src_stride, dst[1], dst_stride, postProcess(src[1], src_stride, dst[1], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
postProcess(src[2], src_stride, dst[2], dst_stride, postProcess(src[2], src_stride, dst[2], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
} }
else else
{ {
@ -1929,9 +2146,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
#endif #endif
if(mode & V_DEBLOCK) if(mode & V_DEBLOCK)
{ {
if(mode & RK_FILTER) if(mode & V_RK1_FILTER)
vertRKFilter(vertBlock, stride, QP); vertRK1Filter(vertBlock, stride, QP);
else if(mode & X1_FILTER) else if(mode & V_X1_FILTER)
vertX1Filter(vertBlock, stride, QP); vertX1Filter(vertBlock, stride, QP);
else else
{ {
@ -1961,6 +2178,10 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
T0= rdtsc(); T0= rdtsc();
#endif #endif
if(mode & H_DEBLOCK) if(mode & H_DEBLOCK)
{
if(mode & H_X1_FILTER)
horizX1Filter(dstBlock-4, stride, QP);
else
{ {
if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
{ {
@ -1970,6 +2191,7 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
else else
doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
} }
}
#ifdef MORE_TIMEING #ifdef MORE_TIMEING
T1= rdtsc(); T1= rdtsc();
horizTime+= T1-T0; horizTime+= T1-T0;

View File

@ -28,24 +28,23 @@
#define DERING 0x04 #define DERING 0x04
#define LEVEL_FIX 0x08 /* Brightness & Contrast */ #define LEVEL_FIX 0x08 /* Brightness & Contrast */
#define LUM_V_DEBLOCK V_DEBLOCK #define LUM_V_DEBLOCK V_DEBLOCK // 1
#define LUM_H_DEBLOCK H_DEBLOCK #define LUM_H_DEBLOCK H_DEBLOCK // 2
#define CHROM_V_DEBLOCK (V_DEBLOCK<<4) #define CHROM_V_DEBLOCK (V_DEBLOCK<<4) // 16
#define CHROM_H_DEBLOCK (H_DEBLOCK<<4) #define CHROM_H_DEBLOCK (H_DEBLOCK<<4) // 32
#define LUM_DERING DERING #define LUM_DERING DERING // 4
#define CHROM_DERING (DERING<<4) #define CHROM_DERING (DERING<<4) // 64
#define LUM_LEVEL_FIX LEVEL_FIX #define LUM_LEVEL_FIX LEVEL_FIX // 8
//not supported currently //not supported currently
#define CHROM_LEVEL_FIX (LEVEL_FIX<<4) #define CHROM_LEVEL_FIX (LEVEL_FIX<<4) // 128
// Experimental stuff // Experimental vertical filters
#define RK_FILTER 0x0100 #define V_RK1_FILTER 0x0100 // 256
#define LUM_V_RK_FILTER RK_FILTER #define V_X1_FILTER 0x0200 // 512
#define CHROM_V_RK_FILTER (RK_FILTER<<4)
#define X1_FILTER 0x0200 // Experimental horizontal filters
#define LUM_V_X1_FILTER X1_FILTER #define H_RK1_FILTER 0x1000 // 4096
#define CHROM_V_X1_FILTER (X1_FILTER<<4) #define H_X1_FILTER 0x2000 // 8192
#define TIMEING #define TIMEING

View File

@ -27,8 +27,9 @@ isHorizMinMaxOk a
doHorizLowPass E a a* doHorizLowPass E a a*
doHorizDefFilter E ac ac doHorizDefFilter E ac ac
deRing deRing
RKAlgo1 E a a* Vertical RKAlgo1 E a a*
X1 a E E* Vertical X1 a E E*
Horizontal X1 a E E*
* i dont have a 3dnow CPU -> its untested * i dont have a 3dnow CPU -> its untested
@ -40,7 +41,7 @@ c = checked against the other implementations (-vo md5)
/* /*
TODO: TODO:
verify that everything workes as it should verify that everything workes as it should (how?)
reduce the time wasted on the mem transfer reduce the time wasted on the mem transfer
implement dering implement dering
implement everything in C at least (done at the moment but ...) implement everything in C at least (done at the moment but ...)
@ -51,6 +52,9 @@ write a faster and higher quality deblocking filter :)
do something about the speed of the horizontal filters do something about the speed of the horizontal filters
make the mainloop more flexible (variable number of blocks at once make the mainloop more flexible (variable number of blocks at once
(the if/else stuff per block is slowing things down) (the if/else stuff per block is slowing things down)
compare the quality & speed of all filters
implement a few simple deinterlacing filters
split this huge file
... ...
Notes: Notes:
@ -58,7 +62,7 @@ Notes:
*/ */
/* /*
Changelog: Changelog: use the CVS log
0.1.3 0.1.3
bugfixes: last 3 lines not brightness/contrast corrected bugfixes: last 3 lines not brightness/contrast corrected
brightness statistics messed up with initial black pic brightness statistics messed up with initial black pic
@ -99,11 +103,13 @@ static uint64_t bm10000000= 0xFF00000000000000LL;
static uint64_t bm10000001= 0xFF000000000000FFLL; static uint64_t bm10000001= 0xFF000000000000FFLL;
static uint64_t bm11000011= 0xFFFF00000000FFFFLL; static uint64_t bm11000011= 0xFFFF00000000FFFFLL;
static uint64_t bm00000011= 0x000000000000FFFFLL; static uint64_t bm00000011= 0x000000000000FFFFLL;
static uint64_t bm11111110= 0xFFFFFFFFFFFFFF00LL;
static uint64_t bm11000000= 0xFFFF000000000000LL; static uint64_t bm11000000= 0xFFFF000000000000LL;
static uint64_t bm00011000= 0x000000FFFF000000LL; static uint64_t bm00011000= 0x000000FFFF000000LL;
static uint64_t bm00110011= 0x0000FFFF0000FFFFLL; static uint64_t bm00110011= 0x0000FFFF0000FFFFLL;
static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; static uint64_t bm11001100= 0xFFFF0000FFFF0000LL;
static uint64_t b00= 0x0000000000000000LL; static uint64_t b00= 0x0000000000000000LL;
static uint64_t b01= 0x0101010101010101LL;
static uint64_t b02= 0x0202020202020202LL; static uint64_t b02= 0x0202020202020202LL;
static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL;
static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL;
@ -544,7 +550,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
x/8 = 1 x/8 = 1
1 12 12 23 1 12 12 23
*/ */
static inline void vertRKFilter(uint8_t *src, int stride, int QP) static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
{ {
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// FIXME rounding // FIXME rounding
@ -638,7 +644,8 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
/** /**
* Experimental Filter 1 * Experimental Filter 1
* will nor damage linear gradients * will not damage linear gradients
* Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
* can only smooth blocks at the expected locations (it cant smooth them if they did move) * can only smooth blocks at the expected locations (it cant smooth them if they did move)
* MMX2 version does correct clipping C version doesnt * MMX2 version does correct clipping C version doesnt
*/ */
@ -675,9 +682,13 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
"movq %%mm4, %%mm3 \n\t" // d "movq %%mm4, %%mm3 \n\t" // d
"psubusb pQPb, %%mm4 \n\t" "psubusb pQPb, %%mm4 \n\t"
"pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
"psubusb b01, %%mm3 \n\t"
"pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
PAVGB(%%mm7, %%mm3) // d/2 PAVGB(%%mm7, %%mm3) // d/2
"movq %%mm3, %%mm1 \n\t" // d/2
PAVGB(%%mm7, %%mm3) // d/4
PAVGB(%%mm1, %%mm3) // 3*d/8
"movq (%0, %1, 4), %%mm0 \n\t" // line 4 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
@ -691,31 +702,31 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
"pxor %%mm2, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%ebx) \n\t" // line 5 "movq %%mm0, (%%ebx) \n\t" // line 5
PAVGB(%%mm7, %%mm3) // d/4 PAVGB(%%mm7, %%mm1) // d/4
"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
"psubusb %%mm3, %%mm0 \n\t" "psubusb %%mm1, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
"movq (%%ebx, %1), %%mm0 \n\t" // line 6 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
"paddusb %%mm3, %%mm0 \n\t" "paddusb %%mm1, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%ebx, %1) \n\t" // line 6 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
PAVGB(%%mm7, %%mm3) // d/8 PAVGB(%%mm7, %%mm1) // d/8
"movq (%%eax, %1), %%mm0 \n\t" // line 2 "movq (%%eax, %1), %%mm0 \n\t" // line 2
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
"psubusb %%mm3, %%mm0 \n\t" "psubusb %%mm1, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%eax, %1) \n\t" // line 2 "movq %%mm0, (%%eax, %1) \n\t" // line 2
"movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
"paddusb %%mm3, %%mm0 \n\t" "paddusb %%mm1, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
@ -739,7 +750,7 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
{ {
int a= src[l3] - src[l4]; int a= src[l3] - src[l4];
int b= src[l4] - src[l5]; int b= src[l4] - src[l5];
int c= src[l6] - src[l7]; int c= src[l5] - src[l6];
int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
@ -749,8 +760,8 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
src[l2] +=v/8; src[l2] +=v/8;
src[l3] +=v/4; src[l3] +=v/4;
src[l4] +=v/2; src[l4] +=3*v/8;
src[l5] -=v/2; src[l5] -=3*v/8;
src[l6] -=v/4; src[l6] -=v/4;
src[l7] -=v/8; src[l7] -=v/8;
@ -789,6 +800,211 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
#endif #endif
} }
/**
 * Experimental Filter 1 (Horizontal)
 * will not damage linear gradients
 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 * (the active lookup table below uses the piecewise linear weights instead, the
 *  9-tap weights are kept as a commented-out alternative)
 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
 * MMX2 version does correct clipping (saturating adds), C version doesn't
 * not identical with the vertical one
 *
 * @param src    first of the 8 pixels of the first row; the caller passes dstBlock-4
 * @param stride distance in bytes between two consecutive rows
 * @param QP     quantizer of the block, used as the filter threshold
 */
static inline void horizX1Filter(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	// lut[i] holds the 8 signed byte corrections which are added to the 8 pixels of a row,
	// it is built once on the first call and never freed
	static uint64_t *lut= NULL;
	if(lut==NULL)
	{
		int i;
		lut= (uint64_t*)memalign(8, 256*8);
		if(lut==NULL) return; // out of memory -> leave the block unfiltered, retry on the next call
		for(i=0; i<256; i++)
		{
			// i is read as a signed byte and doubled
			int v= i < 128 ? 2*i : 2*(i-256);
/*
//Simulate 112242211 9-Tap filter
			uint64_t a= (v/16) & 0xFF;
			uint64_t b= (v/8) & 0xFF;
			uint64_t c= (v/4) & 0xFF;
			uint64_t d= (3*v/8) & 0xFF;
*/
//Simulate piecewise linear interpolation
			uint64_t a= (v/16) & 0xFF;
			uint64_t b= (v*3/16) & 0xFF;
			uint64_t c= (v*5/16) & 0xFF;
			uint64_t d= (7*v/16) & 0xFF;
			// negated (two's complement) counterparts for the other side of the edge
			uint64_t A= (0x100 - a)&0xFF;
			uint64_t B= (0x100 - b)&0xFF;
			uint64_t C= (0x100 - c)&0xFF;
			uint64_t D= (0x100 - d)&0xFF; // was (0x100 - c): byte 3 did not mirror byte 4

			// byte 0 (pixel 0) ... byte 7 (pixel 7): -a -b -c -d +d +c +b +a
			lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
			         (D<<24) | (C<<16) | (B<<8) | (A);
		}
	}

	asm volatile(
		"pxor %%mm7, %%mm7 \n\t" // 0
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"

		"movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
		"movd %2, %%mm5 \n\t" // QP
		"movq %%mm5, %%mm4 \n\t"
		"paddusb %%mm5, %%mm5 \n\t" // 2QP
		"paddusb %%mm5, %%mm4 \n\t" // 3QP
		"pxor %%mm5, %%mm5 \n\t" // 0
		"psubb %%mm4, %%mm5 \n\t" // -3QP
		"por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
		"psllq $24, %%mm5 \n\t" // byte 3 = -3QP, bytes 4..7 = FF, bytes 0..2 = 0

//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

// previous one-row-at-a-time variant, only referenced by the commented-out list below
#define HX1old(a) \
		"movd " #a ", %%mm0 \n\t"\
		"movd 4" #a ", %%mm1 \n\t"\
		"punpckldq %%mm1, %%mm0 \n\t"\
		"movq %%mm0, %%mm1 \n\t"\
		"movq %%mm0, %%mm2 \n\t"\
		"psrlq $8, %%mm1 \n\t"\
		"psubusb %%mm1, %%mm2 \n\t"\
		"psubusb %%mm0, %%mm1 \n\t"\
		"por %%mm2, %%mm1 \n\t" /* p'x = |px - p(x+1)| */\
		"pcmpeqb %%mm7, %%mm2 \n\t" /* p'x = sgn[px - p(x+1)] */\
		"pshufw $0x00, %%mm1, %%mm3 \n\t" /* p'5 = |p1 - p2| */\
		PAVGB(%%mm1, %%mm3) /* p'5 = (|p2-p1| + |p6-p5|)/2 */\
		"psrlq $16, %%mm3 \n\t" /* p'3 = (|p2-p1| + |p6-p5|)/2 */\
		"psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
		"paddb %%mm5, %%mm1 \n\t"\
		"psubusb %%mm5, %%mm1 \n\t"\
		PAVGB(%%mm7, %%mm1)\
		"pxor %%mm2, %%mm1 \n\t"\
		"psubb %%mm2, %%mm1 \n\t"\
		"psrlq $24, %%mm1 \n\t"\
		"movd %%mm1, %%ecx \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"movq %%mm0, " #a " \n\t"

/*
HX1old((%0))
HX1old((%%eax))
HX1old((%%eax, %1))
HX1old((%%eax, %1, 2))
HX1old((%0, %1, 4))
HX1old((%%ebx))
HX1old((%%ebx, %1))
HX1old((%%ebx, %1, 2))
*/

//FIXME add some comments, its unreadable ...
// Processes 4 rows per expansion: the lut index computed from row a is used for
// rows a and c, the lut index computed from row b is used for rows b and d
#define HX1b(a, c, b, d) \
		"movd " #a ", %%mm0 \n\t"\
		"movd 4" #a ", %%mm1 \n\t"\
		"punpckldq %%mm1, %%mm0 \n\t"\
		"movd " #b ", %%mm4 \n\t"\
		"movq %%mm0, %%mm1 \n\t"\
		"movq %%mm0, %%mm2 \n\t"\
		"psrlq $8, %%mm1 \n\t"\
		"movd 4" #b ", %%mm3 \n\t"\
		"psubusb %%mm1, %%mm2 \n\t"\
		"psubusb %%mm0, %%mm1 \n\t"\
		"por %%mm2, %%mm1 \n\t" /* p'x = |px - p(x+1)| */\
		"pcmpeqb %%mm7, %%mm2 \n\t" /* p'x = sgn[px - p(x+1)] */\
		"punpckldq %%mm3, %%mm4 \n\t"\
		"movq %%mm1, %%mm3 \n\t"\
		"psllq $32, %%mm3 \n\t" /* p'5 = |p1 - p2| */\
		PAVGB(%%mm1, %%mm3) /* p'5 = (|p2-p1| + |p6-p5|)/2 */\
		"paddb %%mm6, %%mm0 \n\t"\
		"psrlq $16, %%mm3 \n\t" /* p'3 = (|p2-p1| + |p6-p5|)/2 */\
		"psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
		"movq %%mm4, %%mm3 \n\t"\
		"paddb %%mm5, %%mm1 \n\t"\
		"psubusb %%mm5, %%mm1 \n\t"\
		"psrlq $8, %%mm3 \n\t"\
		PAVGB(%%mm7, %%mm1)\
		"pxor %%mm2, %%mm1 \n\t"\
		"psubb %%mm2, %%mm1 \n\t"\
		"movq %%mm4, %%mm2 \n\t"\
		"psrlq $24, %%mm1 \n\t"\
		"psubusb %%mm3, %%mm2 \n\t"\
		"movd %%mm1, %%ecx \n\t"\
		"psubusb %%mm4, %%mm3 \n\t"\
		"paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
		"por %%mm2, %%mm3 \n\t" /* p'x = |px - p(x+1)| */\
		"paddb %%mm6, %%mm0 \n\t"\
		"pcmpeqb %%mm7, %%mm2 \n\t" /* p'x = sgn[px - p(x+1)] */\
		"movq %%mm3, %%mm1 \n\t"\
		"psllq $32, %%mm1 \n\t" /* p'5 = |p1 - p2| */\
		"movq %%mm0, " #a " \n\t"\
		PAVGB(%%mm3, %%mm1) /* p'5 = (|p2-p1| + |p6-p5|)/2 */\
		"paddb %%mm6, %%mm4 \n\t"\
		"psrlq $16, %%mm1 \n\t" /* p'3 = (|p2-p1| + |p6-p5|)/2 */\
		"psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
		"paddb %%mm5, %%mm3 \n\t"\
		"psubusb %%mm5, %%mm3 \n\t"\
		PAVGB(%%mm7, %%mm3)\
		"pxor %%mm2, %%mm3 \n\t"\
		"psubb %%mm2, %%mm3 \n\t"\
		"psrlq $24, %%mm3 \n\t"\
		"movd " #c ", %%mm0 \n\t"\
		"movd 4" #c ", %%mm1 \n\t"\
		"punpckldq %%mm1, %%mm0 \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"movq %%mm0, " #c " \n\t"\
		"movd %%mm3, %%ecx \n\t"\
		"movd " #d ", %%mm0 \n\t"\
		"paddsb (%3, %%ecx, 8), %%mm4 \n\t"\
		"movd 4" #d ", %%mm1 \n\t"\
		"paddb %%mm6, %%mm4 \n\t"\
		"punpckldq %%mm1, %%mm0 \n\t"\
		"movq %%mm4, " #b " \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"movq %%mm0, " #d " \n\t"

HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))

		:
		: "r" (src), "r" (stride), "r" (QP), "r" (lut)
		// "memory": the asm stores to the src rows and loads from lut behind the compilers back
		: "%eax", "%ebx", "%ecx", "memory"
	);
#else
	int y;

//FIXME (has little in common with the mmx2 version)
	for(y=0; y<BLOCK_SIZE; y++)
	{
		int a= src[1] - src[2];
		int b= src[3] - src[4]; // step across the block edge (between pixel 3 and 4)
		int c= src[5] - src[6];

		// edge step minus the average step inside the two neighbouring blocks
		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);

		if(d < QP)
		{
			int v = d * SIGN(-b);

			// no clipping here, the sums are stored straight back into the uint8_t pixels
			src[1] +=v/8;
			src[2] +=v/4;
			src[3] +=3*v/8;
			src[4] -=3*v/8;
			src[5] -=v/4;
			src[6] -=v/8;
		}
		src+=stride;
	}
#endif
}
static inline void doVertDefFilter(uint8_t src[], int stride, int QP) static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
{ {
@ -1638,13 +1854,14 @@ void postprocess(unsigned char * src[], int src_stride,
vertical_size >>= 1; vertical_size >>= 1;
src_stride >>= 1; src_stride >>= 1;
dst_stride >>= 1; dst_stride >>= 1;
mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
if(1) if(1)
{ {
postProcess(src[1], src_stride, dst[1], dst_stride, postProcess(src[1], src_stride, dst[1], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
postProcess(src[2], src_stride, dst[2], dst_stride, postProcess(src[2], src_stride, dst[2], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
} }
else else
{ {
@ -1929,9 +2146,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
#endif #endif
if(mode & V_DEBLOCK) if(mode & V_DEBLOCK)
{ {
if(mode & RK_FILTER) if(mode & V_RK1_FILTER)
vertRKFilter(vertBlock, stride, QP); vertRK1Filter(vertBlock, stride, QP);
else if(mode & X1_FILTER) else if(mode & V_X1_FILTER)
vertX1Filter(vertBlock, stride, QP); vertX1Filter(vertBlock, stride, QP);
else else
{ {
@ -1961,6 +2178,10 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
T0= rdtsc(); T0= rdtsc();
#endif #endif
if(mode & H_DEBLOCK) if(mode & H_DEBLOCK)
{
if(mode & H_X1_FILTER)
horizX1Filter(dstBlock-4, stride, QP);
else
{ {
if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
{ {
@ -1970,6 +2191,7 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
else else
doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
} }
}
#ifdef MORE_TIMEING #ifdef MORE_TIMEING
T1= rdtsc(); T1= rdtsc();
horizTime+= T1-T0; horizTime+= T1-T0;