1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-01-13 21:28:01 +02:00

bugfixes: last 3 lines not brightness/contrast corrected

brightness statistics messed up with initial black pic
changed initial values of the brightness statistics
C++ -> C conversion
QP range question solved (very likely 1<=QP<=32 according to arpi)
new experimental vertical deblocking filter
RK filter has 3dNow support now (untested)

Originally committed as revision 2169 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
This commit is contained in:
Michael Niedermayer 2001-10-11 22:35:45 +00:00
parent 13e0052810
commit d5a1a99518
3 changed files with 549 additions and 169 deletions

View File

@ -27,6 +27,9 @@ isHorizMinMaxOk a
doHorizLowPass E a a* doHorizLowPass E a a*
doHorizDefFilter E ac ac doHorizDefFilter E ac ac
deRing deRing
RKAlgo1 E a a*
X1 a E E*
* i dont have a 3dnow CPU -> its untested * i dont have a 3dnow CPU -> its untested
E = Exact implementation E = Exact implementation
@ -41,11 +44,13 @@ verify that everything workes as it should
reduce the time wasted on the mem transfer reduce the time wasted on the mem transfer
implement dering implement dering
implement everything in C at least (done at the moment but ...) implement everything in C at least (done at the moment but ...)
figure range of QP out (assuming <256 for now)
unroll stuff if instructions depend too much on the prior one unroll stuff if instructions depend too much on the prior one
we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
move YScale thing to the end instead of fixing QP move YScale thing to the end instead of fixing QP
write a faster and higher quality deblocking filter :) write a faster and higher quality deblocking filter :)
do something about the speed of the horizontal filters
make the mainloop more flexible (variable number of blocks at once
(the if/else stuff per block is slowing things down)
... ...
Notes: Notes:
@ -54,6 +59,14 @@ Notes:
/* /*
Changelog: Changelog:
0.1.3
bugfixes: last 3 lines not brightness/contrast corrected
brightness statistics messed up with initial black pic
changed initial values of the brightness statistics
C++ -> C conversion
QP range question solved (very likely 1<=QP<=32 according to arpi)
new experimental vertical deblocking filter
RK filter has 3dNow support now (untested)
0.1.2 0.1.2
fixed a bug in the horizontal default filter fixed a bug in the horizontal default filter
3dnow version of the Horizontal & Vertical Lowpass filters 3dnow version of the Horizontal & Vertical Lowpass filters
@ -66,6 +79,7 @@ Changelog:
#include <inttypes.h> #include <inttypes.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h>
#include "../config.h" #include "../config.h"
//#undef HAVE_MMX2 //#undef HAVE_MMX2
//#define HAVE_3DNOW //#define HAVE_3DNOW
@ -160,9 +174,10 @@ static inline void prefetcht2(void *p)
/** /**
* Check if the middle 8x8 Block in the given 8x10 block is flat * Check if the middle 8x8 Block in the given 8x10 block is flat
*/ */
static inline bool isVertDC(uint8_t src[], int stride){ static inline int isVertDC(uint8_t src[], int stride){
// return true; // return true;
int numEq= 0; int numEq= 0;
int y;
src+= stride; // src points to begin of the 8x8 Block src+= stride; // src points to begin of the 8x8 Block
#ifdef HAVE_MMX #ifdef HAVE_MMX
asm volatile( asm volatile(
@ -242,7 +257,7 @@ static inline bool isVertDC(uint8_t src[], int stride){
// uint8_t *temp= src; // uint8_t *temp= src;
#else #else
for(int y=0; y<BLOCK_SIZE-1; y++) for(y=0; y<BLOCK_SIZE-1; y++)
{ {
if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++; if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++; if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
@ -268,10 +283,11 @@ static inline bool isVertDC(uint8_t src[], int stride){
} }
} }
*/ */
return numEq > vFlatnessThreshold; // for(int i=0; i<numEq/8; i++) src[i]=255;
return (numEq > vFlatnessThreshold) ? 1 : 0;
} }
static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP) static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
{ {
#ifdef HAVE_MMX #ifdef HAVE_MMX
int isOk; int isOk;
@ -295,13 +311,14 @@ static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP)
: "=r" (isOk) : "=r" (isOk)
: "r" (src), "r" (stride) : "r" (src), "r" (stride)
); );
return isOk; return isOk ? 1 : 0;
#else #else
int isOk2= true; int isOk2= 1;
for(int x=0; x<BLOCK_SIZE; x++) int x;
for(x=0; x<BLOCK_SIZE; x++)
{ {
if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=false; if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
} }
/* if(isOk && !isOk2 || !isOk && isOk2) /* if(isOk && !isOk2 || !isOk && isOk2)
{ {
@ -484,8 +501,8 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
const int l7= stride + l6; const int l7= stride + l6;
const int l8= stride + l7; const int l8= stride + l7;
const int l9= stride + l8; const int l9= stride + l8;
int x;
for(int x=0; x<BLOCK_SIZE; x++) for(x=0; x<BLOCK_SIZE; x++)
{ {
const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
@ -529,7 +546,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
*/ */
static inline void vertRKFilter(uint8_t *src, int stride, int QP) static inline void vertRKFilter(uint8_t *src, int stride, int QP)
{ {
#ifdef HAVE_MMX2 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// FIXME rounding // FIXME rounding
asm volatile( asm volatile(
"pxor %%mm7, %%mm7 \n\t" // 0 "pxor %%mm7, %%mm7 \n\t" // 0
@ -549,7 +566,7 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
"movq %%mm2, %%mm4 \n\t" // line 4 "movq %%mm2, %%mm4 \n\t" // line 4
"pcmpeqb %%mm5, %%mm5 \n\t" // -1 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
"pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
"pavgb %%mm3, %%mm5 \n\t" PAVGB(%%mm3, %%mm5)
"paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
"psubusb %%mm3, %%mm4 \n\t" "psubusb %%mm3, %%mm4 \n\t"
"psubusb %%mm2, %%mm3 \n\t" "psubusb %%mm2, %%mm3 \n\t"
@ -600,16 +617,18 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
const int l7= stride + l6; const int l7= stride + l6;
const int l8= stride + l7; const int l8= stride + l7;
const int l9= stride + l8; const int l9= stride + l8;
for(int x=0; x<BLOCK_SIZE; x++) int x;
for(x=0; x<BLOCK_SIZE; x++)
{ {
if(ABS(src[l4]-src[l5]) < QP + QP/4) if(ABS(src[l4]-src[l5]) < QP + QP/4)
{ {
int x = src[l5] - src[l4]; int v = (src[l5] - src[l4]);
src[l3] +=v/8;
src[l4] +=v/2;
src[l5] -=v/2;
src[l6] -=v/8;
src[l3] +=x/8;
src[l4] +=x/2;
src[l5] -=x/2;
src[l6] -=x/8;
} }
src++; src++;
} }
@ -619,18 +638,126 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
/** /**
* Experimental Filter 1 * Experimental Filter 1
* will not damage linear gradients
* can only smooth blocks at the expected locations (it can't smooth them if they have moved)
* MMX2 version does correct clipping, C version doesn't
*/ */
static inline void vertX1Filter(uint8_t *src, int stride, int QP) static inline void vertX1Filter(uint8_t *src, int stride, int QP)
{ {
#ifdef HAVE_MMX2X #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// FIXME
asm volatile( asm volatile(
"pxor %%mm7, %%mm7 \n\t" // 0
// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
"leal (%0, %1), %%eax \n\t"
"leal (%%eax, %1, 4), %%ebx \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
"movq (%0, %1, 4), %%mm1 \n\t" // line 4
"movq %%mm1, %%mm2 \n\t" // line 4
"psubusb %%mm0, %%mm1 \n\t"
"psubusb %%mm2, %%mm0 \n\t"
"por %%mm1, %%mm0 \n\t" // |l2 - l3|
"movq (%%ebx), %%mm3 \n\t" // line 5
"movq (%%ebx, %1), %%mm4 \n\t" // line 6
"movq %%mm3, %%mm5 \n\t" // line 5
"psubusb %%mm4, %%mm3 \n\t"
"psubusb %%mm5, %%mm4 \n\t"
"por %%mm4, %%mm3 \n\t" // |l5 - l6|
PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
"movq %%mm2, %%mm1 \n\t" // line 4
"psubusb %%mm5, %%mm2 \n\t"
"movq %%mm2, %%mm4 \n\t"
"pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
"psubusb %%mm1, %%mm5 \n\t"
"por %%mm5, %%mm4 \n\t" // |l4 - l5|
"psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
"movq %%mm4, %%mm3 \n\t" // d
"psubusb pQPb, %%mm4 \n\t"
"pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
"pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
PAVGB(%%mm7, %%mm3) // d/2
"movq (%0, %1, 4), %%mm0 \n\t" // line 4
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
"psubusb %%mm3, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%0, %1, 4) \n\t" // line 4
"movq (%%ebx), %%mm0 \n\t" // line 5
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
"paddusb %%mm3, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%ebx) \n\t" // line 5
PAVGB(%%mm7, %%mm3) // d/4
"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
"psubusb %%mm3, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
"movq (%%ebx, %1), %%mm0 \n\t" // line 6
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
"paddusb %%mm3, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%ebx, %1) \n\t" // line 6
PAVGB(%%mm7, %%mm3) // d/8
"movq (%%eax, %1), %%mm0 \n\t" // line 2
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
"psubusb %%mm3, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%eax, %1) \n\t" // line 2
"movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
"paddusb %%mm3, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
: :
: "r" (src), "r" (stride) : "r" (src), "r" (stride)
: "%eax", "%ebx" : "%eax", "%ebx"
); );
#else #else
const int l1= stride;
const int l2= stride + l1;
const int l3= stride + l2;
const int l4= stride + l3;
const int l5= stride + l4;
const int l6= stride + l5;
const int l7= stride + l6;
const int l8= stride + l7;
const int l9= stride + l8;
int x;
for(x=0; x<BLOCK_SIZE; x++)
{
int a= src[l3] - src[l4];
int b= src[l4] - src[l5];
int c= src[l6] - src[l7];
int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
if(d < QP)
{
int v = d * SIGN(-b);
src[l2] +=v/8;
src[l3] +=v/4;
src[l4] +=v/2;
src[l5] -=v/2;
src[l6] -=v/4;
src[l7] -=v/8;
}
src++;
}
/*
const int l1= stride; const int l1= stride;
const int l2= stride + l1; const int l2= stride + l1;
const int l3= stride + l2; const int l3= stride + l2;
@ -658,7 +785,7 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
} }
src++; src++;
} }
*/
#endif #endif
} }
@ -908,8 +1035,8 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
const int l7= stride + l6; const int l7= stride + l6;
const int l8= stride + l7; const int l8= stride + l7;
// const int l9= stride + l8; // const int l9= stride + l8;
int x;
for(int x=0; x<BLOCK_SIZE; x++) for(x=0; x<BLOCK_SIZE; x++)
{ {
const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
if(ABS(middleEnergy) < 8*QP) if(ABS(middleEnergy) < 8*QP)
@ -947,7 +1074,7 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
/** /**
* Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock. * Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock.
*/ */
static inline bool isHorizDCAndCopy2Temp(uint8_t src[], int stride) static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
{ {
// src++; // src++;
int numEq= 0; int numEq= 0;
@ -1007,7 +1134,8 @@ asm volatile (
// printf("%d\n", numEq); // printf("%d\n", numEq);
numEq= (256 - (numEq & 0xFF)) &0xFF; numEq= (256 - (numEq & 0xFF)) &0xFF;
#else #else
for(int y=0; y<BLOCK_SIZE; y++) int y;
for(y=0; y<BLOCK_SIZE; y++)
{ {
if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++; if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++; if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
@ -1044,7 +1172,7 @@ asm volatile (
return numEq > hFlatnessThreshold; return numEq > hFlatnessThreshold;
} }
static inline bool isHorizMinMaxOk(uint8_t src[], int stride, int QP) static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{ {
#ifdef MMX_FIXME #ifdef MMX_FIXME
FIXME FIXME
@ -1071,9 +1199,9 @@ FIXME
); );
return isOk; return isOk;
#else #else
if(abs(src[0] - src[7]) > 2*QP) return false; if(abs(src[0] - src[7]) > 2*QP) return 0;
return true; return 1;
#endif #endif
} }
@ -1173,7 +1301,8 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
#else #else
uint8_t *src= tempBlock; uint8_t *src= tempBlock;
for(int y=0; y<BLOCK_SIZE; y++) int y;
for(y=0; y<BLOCK_SIZE; y++)
{ {
dst[0] = src[0]; dst[0] = src[0];
dst[1] = src[1]; dst[1] = src[1];
@ -1375,7 +1504,8 @@ Implemented Exact 7-Tap
#else #else
uint8_t *temp= tempBlock; uint8_t *temp= tempBlock;
for(int y=0; y<BLOCK_SIZE; y++) int y;
for(y=0; y<BLOCK_SIZE; y++)
{ {
const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0]; const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
@ -1502,7 +1632,7 @@ void postprocess(unsigned char * src[], int src_stride,
return; return;
*/ */
postProcess(src[0], src_stride, dst[0], dst_stride, postProcess(src[0], src_stride, dst[0], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, false, mode); horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
horizontal_size >>= 1; horizontal_size >>= 1;
vertical_size >>= 1; vertical_size >>= 1;
@ -1512,9 +1642,9 @@ void postprocess(unsigned char * src[], int src_stride,
if(1) if(1)
{ {
postProcess(src[1], src_stride, dst[1], dst_stride, postProcess(src[1], src_stride, dst[1], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
postProcess(src[2], src_stride, dst[2], dst_stride, postProcess(src[2], src_stride, dst[2], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
} }
else else
{ {
@ -1543,11 +1673,19 @@ int getModeForQuality(int quality){
/** /**
* Copies a block from src to dst and fixes the blacklevel * Copies a block from src to dst and fixes the blacklevel
* numLines must be a multiple of 4
* levelFix == 0 -> don't touch the brightness & contrast
*/ */
static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride) static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
int numLines, int levelFix)
{ {
int i;
if(levelFix)
{
#ifdef HAVE_MMX #ifdef HAVE_MMX
asm volatile( asm volatile(
"movl %4, %%eax \n\t"
"movl %%eax, temp0\n\t"
"pushl %0 \n\t" "pushl %0 \n\t"
"pushl %1 \n\t" "pushl %1 \n\t"
"leal (%2,%2), %%eax \n\t" "leal (%2,%2), %%eax \n\t"
@ -1555,14 +1693,6 @@ static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int sr
"movq packedYOffset, %%mm2 \n\t" "movq packedYOffset, %%mm2 \n\t"
"movq packedYScale, %%mm3 \n\t" "movq packedYScale, %%mm3 \n\t"
#define SIMPLE_CPY \
"movq (%0), %%mm0 \n\t"\
"movq (%0,%2), %%mm1 \n\t"\
"psubusb %%mm2, %%mm0 \n\t"\
"psubusb %%mm2, %%mm1 \n\t"\
"movq %%mm0, (%1) \n\t"\
"movq %%mm1, (%1, %3) \n\t"\
#define SCALED_CPY \ #define SCALED_CPY \
"movq (%0), %%mm0 \n\t"\ "movq (%0), %%mm0 \n\t"\
"movq (%0,%2), %%mm1 \n\t"\ "movq (%0,%2), %%mm1 \n\t"\
@ -1585,33 +1715,75 @@ static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int sr
"packuswb %%mm5, %%mm4 \n\t"\ "packuswb %%mm5, %%mm4 \n\t"\
"movq %%mm4, (%1, %3) \n\t"\ "movq %%mm4, (%1, %3) \n\t"\
"1: \n\t"
SCALED_CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
SCALED_CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
"decl temp0 \n\t"
"jnz 1b \n\t"
#define CPY SCALED_CPY
//#define CPY SIMPLE_CPY
// "prefetchnta 8(%0)\n\t"
CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
CPY
"popl %1 \n\t" "popl %1 \n\t"
"popl %0 \n\t" "popl %0 \n\t"
: : "r" (src), : : "r" (src),
"r" (dst), "r" (dst),
"r" (srcStride), "r" (srcStride),
"r" (dstStride) "r" (dstStride),
"m" (numLines>>2)
: "%eax", "%ebx" : "%eax", "%ebx"
); );
#else #else
for(int i=0; i<BLOCK_SIZE; i++) // last 10x8 Block is copied allready so +2 for(i=0; i<numLines; i++)
memcpy( &(dst[dstStride*i]), memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE); &(src[srcStride*i]), BLOCK_SIZE);
#endif #endif
}
else
{
#ifdef HAVE_MMX
asm volatile(
"movl %4, %%eax \n\t"
"movl %%eax, temp0\n\t"
"pushl %0 \n\t"
"pushl %1 \n\t"
"leal (%2,%2), %%eax \n\t"
"leal (%3,%3), %%ebx \n\t"
"movq packedYOffset, %%mm2 \n\t"
"movq packedYScale, %%mm3 \n\t"
#define SIMPLE_CPY \
"movq (%0), %%mm0 \n\t"\
"movq (%0,%2), %%mm1 \n\t"\
"movq %%mm0, (%1) \n\t"\
"movq %%mm1, (%1, %3) \n\t"\
"1: \n\t"
SIMPLE_CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
SIMPLE_CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
"decl temp0 \n\t"
"jnz 1b \n\t"
"popl %1 \n\t"
"popl %0 \n\t"
: : "r" (src),
"r" (dst),
"r" (srcStride),
"r" (dstStride),
"m" (numLines>>2)
: "%eax", "%ebx"
);
#else
for(i=0; i<numLines; i++)
memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE);
#endif
}
} }
@ -1619,33 +1791,50 @@ CPY
* Filters array of bytes (Y or U or V values) * Filters array of bytes (Y or U or V values)
*/ */
void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
QP_STORE_T QPs[], int QPStride, bool isColor, int mode) QP_STORE_T QPs[], int QPStride, int isColor, int mode)
{ {
int x,y;
/* we need 64 bits here, otherwise we are going to have a problem
after watching a black picture for 5 hours */
static uint64_t *yHistogram= NULL;
int black=0, white=255; // blackest black and whitest white in the picture
#ifdef TIMEING #ifdef TIMEING
long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
sumTime= rdtsc(); sumTime= rdtsc();
#endif #endif
/* we need 64bit here otherwise we´ll going to have a problem
after watching a black picture for 5 hours*/
static uint64_t *yHistogram= NULL;
if(!yHistogram) if(!yHistogram)
{ {
yHistogram= new uint64_t[256]; int i;
for(int i=0; i<256; i++) yHistogram[i]= width*height/64/256; yHistogram= (uint64_t*)malloc(8*256);
for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
} }
int black=0, white=255; // blackest black and whitest white in the picture
if(!isColor) if(!isColor)
{ {
uint64_t sum= 0; uint64_t sum= 0;
for(int i=0; i<256; i++) int i;
static int framenum= -1;
uint64_t maxClipped;
uint64_t clipped;
double scale;
framenum++;
if(framenum == 1) yHistogram[0]= width*height/64*15/256;
for(i=0; i<256; i++)
{
sum+= yHistogram[i]; sum+= yHistogram[i];
// printf("%d ", yHistogram[i]);
}
// printf("\n\n");
uint64_t maxClipped= (uint64_t)(sum * maxClippedThreshold); /* we allways get a completly black picture first */
uint64_t clipped= sum; maxClipped= (uint64_t)(sum * maxClippedThreshold);
clipped= sum;
for(black=255; black>0; black--) for(black=255; black>0; black--)
{ {
if(clipped < maxClipped) break; if(clipped < maxClipped) break;
@ -1665,9 +1854,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
packedYOffset|= packedYOffset<<16; packedYOffset|= packedYOffset<<16;
packedYOffset|= packedYOffset<<8; packedYOffset|= packedYOffset<<8;
double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
packedYScale= uint16_t(scale*256.0 + 0.5); packedYScale= (uint16_t)(scale*256.0 + 0.5);
packedYScale|= packedYScale<<32; packedYScale|= packedYScale<<32;
packedYScale|= packedYScale<<16; packedYScale|= packedYScale<<16;
} }
@ -1677,10 +1866,10 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
packedYOffset= 0; packedYOffset= 0;
} }
for(int x=0; x<width; x+=BLOCK_SIZE) for(x=0; x<width; x+=BLOCK_SIZE)
blockCopy(dst + x, dstStride, src + x, srcStride); blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
for(int y=0; y<height; y+=BLOCK_SIZE) for(y=0; y<height; y+=BLOCK_SIZE)
{ {
//1% speedup if these are here instead of the inner loop //1% speedup if these are here instead of the inner loop
uint8_t *srcBlock= &(src[y*srcStride]); uint8_t *srcBlock= &(src[y*srcStride]);
@ -1690,8 +1879,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
// finish 1 block before the next otherwise we´ll might have a problem // finish 1 block before the next otherwise we´ll might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
for(int x=0; x<width; x+=BLOCK_SIZE) for(x=0; x<width; x+=BLOCK_SIZE)
{ {
const int stride= dstStride;
int QP= isColor ? int QP= isColor ?
QPs[(y>>3)*QPStride + (x>>3)]: QPs[(y>>3)*QPStride + (x>>3)]:
(QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8; (QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8;
@ -1707,7 +1897,6 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
#endif #endif
const int stride= dstStride;
if(y + 12 < height) if(y + 12 < height)
{ {
#ifdef MORE_TIMEING #ifdef MORE_TIMEING
@ -1730,7 +1919,7 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
if(!isColor) yHistogram[ srcBlock[0] ]++; if(!isColor) yHistogram[ srcBlock[0] ]++;
blockCopy(vertBlock + dstStride*2, dstStride, blockCopy(vertBlock + dstStride*2, dstStride,
vertSrcBlock + srcStride*2, srcStride); vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
#ifdef MORE_TIMEING #ifdef MORE_TIMEING
@ -1742,7 +1931,7 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
{ {
if(mode & RK_FILTER) if(mode & RK_FILTER)
vertRKFilter(vertBlock, stride, QP); vertRKFilter(vertBlock, stride, QP);
else if(0) else if(mode & X1_FILTER)
vertX1Filter(vertBlock, stride, QP); vertX1Filter(vertBlock, stride, QP);
else else
{ {
@ -1762,12 +1951,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
#endif #endif
} }
else else
{ blockCopy(vertBlock + dstStride*1, dstStride,
for(int i=2; i<BLOCK_SIZE/2+1; i++) // last 10x8 Block is copied allready so +2 vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
memcpy( &(vertBlock[dstStride*i]),
&(vertSrcBlock[srcStride*i]), BLOCK_SIZE);
}
if(x - 8 >= 0 && x<width) if(x - 8 >= 0 && x<width)
{ {
@ -1813,8 +1999,8 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
sumTime= rdtsc() - sumTime; sumTime= rdtsc() - sumTime;
if(!isColor) if(!isColor)
printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r", printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000), (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000) (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
, black, white); , black, white);
#endif #endif
} }

View File

@ -26,6 +26,7 @@
#define V_DEBLOCK 0x01 #define V_DEBLOCK 0x01
#define H_DEBLOCK 0x02 #define H_DEBLOCK 0x02
#define DERING 0x04 #define DERING 0x04
#define LEVEL_FIX 0x08 /* Brightness & Contrast */
#define LUM_V_DEBLOCK V_DEBLOCK #define LUM_V_DEBLOCK V_DEBLOCK
#define LUM_H_DEBLOCK H_DEBLOCK #define LUM_H_DEBLOCK H_DEBLOCK
@ -33,12 +34,19 @@
#define CHROM_H_DEBLOCK (H_DEBLOCK<<4) #define CHROM_H_DEBLOCK (H_DEBLOCK<<4)
#define LUM_DERING DERING #define LUM_DERING DERING
#define CHROM_DERING (DERING<<4) #define CHROM_DERING (DERING<<4)
#define LUM_LEVEL_FIX LEVEL_FIX
//not supported currently
#define CHROM_LEVEL_FIX (LEVEL_FIX<<4)
// Experimental stuff // Experimental stuff
#define RK_FILTER 0x0100 #define RK_FILTER 0x0100
#define LUM_V_RK_FILTER RK_FILTER #define LUM_V_RK_FILTER RK_FILTER
#define CHROM_V_RK_FILTER (RK_FILTER<<4) #define CHROM_V_RK_FILTER (RK_FILTER<<4)
#define X1_FILTER 0x0200
#define LUM_V_X1_FILTER X1_FILTER
#define CHROM_V_X1_FILTER (X1_FILTER<<4)
#define TIMEING #define TIMEING
#define MORE_TIMEING #define MORE_TIMEING
@ -60,7 +68,7 @@
//#include <inttypes.h> //#include <inttypes.h>
void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
QP_STORE_T QPs[], int QPStride, bool isColor, int mode); QP_STORE_T QPs[], int QPStride, int isColor, int mode);
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -27,6 +27,9 @@ isHorizMinMaxOk a
doHorizLowPass E a a* doHorizLowPass E a a*
doHorizDefFilter E ac ac doHorizDefFilter E ac ac
deRing deRing
RKAlgo1 E a a*
X1 a E E*
* i dont have a 3dnow CPU -> its untested * i dont have a 3dnow CPU -> its untested
E = Exact implementation E = Exact implementation
@ -41,11 +44,13 @@ verify that everything workes as it should
reduce the time wasted on the mem transfer reduce the time wasted on the mem transfer
implement dering implement dering
implement everything in C at least (done at the moment but ...) implement everything in C at least (done at the moment but ...)
figure range of QP out (assuming <256 for now)
unroll stuff if instructions depend too much on the prior one unroll stuff if instructions depend too much on the prior one
we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
move YScale thing to the end instead of fixing QP move YScale thing to the end instead of fixing QP
write a faster and higher quality deblocking filter :) write a faster and higher quality deblocking filter :)
do something about the speed of the horizontal filters
make the mainloop more flexible (variable number of blocks at once
(the if/else stuff per block is slowing things down)
... ...
Notes: Notes:
@ -54,6 +59,14 @@ Notes:
/* /*
Changelog: Changelog:
0.1.3
bugfixes: last 3 lines not brightness/contrast corrected
brightness statistics messed up with initial black pic
changed initial values of the brightness statistics
C++ -> C conversion
QP range question solved (very likely 1<=QP<=32 according to arpi)
new experimental vertical deblocking filter
RK filter has 3dNow support now (untested)
0.1.2 0.1.2
fixed a bug in the horizontal default filter fixed a bug in the horizontal default filter
3dnow version of the Horizontal & Vertical Lowpass filters 3dnow version of the Horizontal & Vertical Lowpass filters
@ -66,6 +79,7 @@ Changelog:
#include <inttypes.h> #include <inttypes.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h>
#include "../config.h" #include "../config.h"
//#undef HAVE_MMX2 //#undef HAVE_MMX2
//#define HAVE_3DNOW //#define HAVE_3DNOW
@ -160,9 +174,10 @@ static inline void prefetcht2(void *p)
/** /**
* Check if the middle 8x8 Block in the given 8x10 block is flat * Check if the middle 8x8 Block in the given 8x10 block is flat
*/ */
static inline bool isVertDC(uint8_t src[], int stride){ static inline int isVertDC(uint8_t src[], int stride){
// return true; // return true;
int numEq= 0; int numEq= 0;
int y;
src+= stride; // src points to begin of the 8x8 Block src+= stride; // src points to begin of the 8x8 Block
#ifdef HAVE_MMX #ifdef HAVE_MMX
asm volatile( asm volatile(
@ -242,7 +257,7 @@ static inline bool isVertDC(uint8_t src[], int stride){
// uint8_t *temp= src; // uint8_t *temp= src;
#else #else
for(int y=0; y<BLOCK_SIZE-1; y++) for(y=0; y<BLOCK_SIZE-1; y++)
{ {
if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++; if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++; if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
@ -268,10 +283,11 @@ static inline bool isVertDC(uint8_t src[], int stride){
} }
} }
*/ */
return numEq > vFlatnessThreshold; // for(int i=0; i<numEq/8; i++) src[i]=255;
return (numEq > vFlatnessThreshold) ? 1 : 0;
} }
static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP) static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
{ {
#ifdef HAVE_MMX #ifdef HAVE_MMX
int isOk; int isOk;
@ -295,13 +311,14 @@ static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP)
: "=r" (isOk) : "=r" (isOk)
: "r" (src), "r" (stride) : "r" (src), "r" (stride)
); );
return isOk; return isOk ? 1 : 0;
#else #else
int isOk2= true; int isOk2= 1;
for(int x=0; x<BLOCK_SIZE; x++) int x;
for(x=0; x<BLOCK_SIZE; x++)
{ {
if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=false; if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
} }
/* if(isOk && !isOk2 || !isOk && isOk2) /* if(isOk && !isOk2 || !isOk && isOk2)
{ {
@ -484,8 +501,8 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
const int l7= stride + l6; const int l7= stride + l6;
const int l8= stride + l7; const int l8= stride + l7;
const int l9= stride + l8; const int l9= stride + l8;
int x;
for(int x=0; x<BLOCK_SIZE; x++) for(x=0; x<BLOCK_SIZE; x++)
{ {
const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
@ -529,7 +546,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
*/ */
static inline void vertRKFilter(uint8_t *src, int stride, int QP) static inline void vertRKFilter(uint8_t *src, int stride, int QP)
{ {
#ifdef HAVE_MMX2 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// FIXME rounding // FIXME rounding
asm volatile( asm volatile(
"pxor %%mm7, %%mm7 \n\t" // 0 "pxor %%mm7, %%mm7 \n\t" // 0
@ -549,7 +566,7 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
"movq %%mm2, %%mm4 \n\t" // line 4 "movq %%mm2, %%mm4 \n\t" // line 4
"pcmpeqb %%mm5, %%mm5 \n\t" // -1 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
"pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
"pavgb %%mm3, %%mm5 \n\t" PAVGB(%%mm3, %%mm5)
"paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
"psubusb %%mm3, %%mm4 \n\t" "psubusb %%mm3, %%mm4 \n\t"
"psubusb %%mm2, %%mm3 \n\t" "psubusb %%mm2, %%mm3 \n\t"
@ -600,16 +617,18 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
const int l7= stride + l6; const int l7= stride + l6;
const int l8= stride + l7; const int l8= stride + l7;
const int l9= stride + l8; const int l9= stride + l8;
for(int x=0; x<BLOCK_SIZE; x++) int x;
for(x=0; x<BLOCK_SIZE; x++)
{ {
if(ABS(src[l4]-src[l5]) < QP + QP/4) if(ABS(src[l4]-src[l5]) < QP + QP/4)
{ {
int x = src[l5] - src[l4]; int v = (src[l5] - src[l4]);
src[l3] +=v/8;
src[l4] +=v/2;
src[l5] -=v/2;
src[l6] -=v/8;
src[l3] +=x/8;
src[l4] +=x/2;
src[l5] -=x/2;
src[l6] -=x/8;
} }
src++; src++;
} }
@ -619,18 +638,126 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
/** /**
* Experimental Filter 1 * Experimental Filter 1
* will not damage linear gradients
* can only smooth blocks at the expected locations (it can't smooth them if they have moved)
* MMX2 version does correct clipping, C version doesn't
*/ */
static inline void vertX1Filter(uint8_t *src, int stride, int QP) static inline void vertX1Filter(uint8_t *src, int stride, int QP)
{ {
#ifdef HAVE_MMX2X #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// FIXME
asm volatile( asm volatile(
"pxor %%mm7, %%mm7 \n\t" // 0
// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
"leal (%0, %1), %%eax \n\t"
"leal (%%eax, %1, 4), %%ebx \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
"movq (%0, %1, 4), %%mm1 \n\t" // line 4
"movq %%mm1, %%mm2 \n\t" // line 4
"psubusb %%mm0, %%mm1 \n\t"
"psubusb %%mm2, %%mm0 \n\t"
"por %%mm1, %%mm0 \n\t" // |l2 - l3|
"movq (%%ebx), %%mm3 \n\t" // line 5
"movq (%%ebx, %1), %%mm4 \n\t" // line 6
"movq %%mm3, %%mm5 \n\t" // line 5
"psubusb %%mm4, %%mm3 \n\t"
"psubusb %%mm5, %%mm4 \n\t"
"por %%mm4, %%mm3 \n\t" // |l5 - l6|
PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
"movq %%mm2, %%mm1 \n\t" // line 4
"psubusb %%mm5, %%mm2 \n\t"
"movq %%mm2, %%mm4 \n\t"
"pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
"psubusb %%mm1, %%mm5 \n\t"
"por %%mm5, %%mm4 \n\t" // |l4 - l5|
"psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
"movq %%mm4, %%mm3 \n\t" // d
"psubusb pQPb, %%mm4 \n\t"
"pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
"pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
PAVGB(%%mm7, %%mm3) // d/2
"movq (%0, %1, 4), %%mm0 \n\t" // line 4
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
"psubusb %%mm3, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%0, %1, 4) \n\t" // line 4
"movq (%%ebx), %%mm0 \n\t" // line 5
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
"paddusb %%mm3, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%ebx) \n\t" // line 5
PAVGB(%%mm7, %%mm3) // d/4
"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
"psubusb %%mm3, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
"movq (%%ebx, %1), %%mm0 \n\t" // line 6
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
"paddusb %%mm3, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%ebx, %1) \n\t" // line 6
PAVGB(%%mm7, %%mm3) // d/8
"movq (%%eax, %1), %%mm0 \n\t" // line 2
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
"psubusb %%mm3, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%eax, %1) \n\t" // line 2
"movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
"paddusb %%mm3, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
: :
: "r" (src), "r" (stride) : "r" (src), "r" (stride)
: "%eax", "%ebx" : "%eax", "%ebx"
); );
#else #else
const int l1= stride;
const int l2= stride + l1;
const int l3= stride + l2;
const int l4= stride + l3;
const int l5= stride + l4;
const int l6= stride + l5;
const int l7= stride + l6;
const int l8= stride + l7;
const int l9= stride + l8;
int x;
for(x=0; x<BLOCK_SIZE; x++)
{
int a= src[l3] - src[l4];
int b= src[l4] - src[l5];
int c= src[l6] - src[l7];
int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
if(d < QP)
{
int v = d * SIGN(-b);
src[l2] +=v/8;
src[l3] +=v/4;
src[l4] +=v/2;
src[l5] -=v/2;
src[l6] -=v/4;
src[l7] -=v/8;
}
src++;
}
/*
const int l1= stride; const int l1= stride;
const int l2= stride + l1; const int l2= stride + l1;
const int l3= stride + l2; const int l3= stride + l2;
@ -658,7 +785,7 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
} }
src++; src++;
} }
*/
#endif #endif
} }
@ -908,8 +1035,8 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
const int l7= stride + l6; const int l7= stride + l6;
const int l8= stride + l7; const int l8= stride + l7;
// const int l9= stride + l8; // const int l9= stride + l8;
int x;
for(int x=0; x<BLOCK_SIZE; x++) for(x=0; x<BLOCK_SIZE; x++)
{ {
const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
if(ABS(middleEnergy) < 8*QP) if(ABS(middleEnergy) < 8*QP)
@ -947,7 +1074,7 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
/** /**
* Check if the given 8x8 Block is mostly "flat" and copy the unaligned data into tempBlock. * Check if the given 8x8 Block is mostly "flat" and copy the unaligned data into tempBlock.
*/ */
static inline bool isHorizDCAndCopy2Temp(uint8_t src[], int stride) static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
{ {
// src++; // src++;
int numEq= 0; int numEq= 0;
@ -1007,7 +1134,8 @@ asm volatile (
// printf("%d\n", numEq); // printf("%d\n", numEq);
numEq= (256 - (numEq & 0xFF)) &0xFF; numEq= (256 - (numEq & 0xFF)) &0xFF;
#else #else
for(int y=0; y<BLOCK_SIZE; y++) int y;
for(y=0; y<BLOCK_SIZE; y++)
{ {
if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++; if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++; if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
@ -1044,7 +1172,7 @@ asm volatile (
return numEq > hFlatnessThreshold; return numEq > hFlatnessThreshold;
} }
static inline bool isHorizMinMaxOk(uint8_t src[], int stride, int QP) static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{ {
#ifdef MMX_FIXME #ifdef MMX_FIXME
FIXME FIXME
@ -1071,9 +1199,9 @@ FIXME
); );
return isOk; return isOk;
#else #else
if(abs(src[0] - src[7]) > 2*QP) return false; if(abs(src[0] - src[7]) > 2*QP) return 0;
return true; return 1;
#endif #endif
} }
@ -1173,7 +1301,8 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
#else #else
uint8_t *src= tempBlock; uint8_t *src= tempBlock;
for(int y=0; y<BLOCK_SIZE; y++) int y;
for(y=0; y<BLOCK_SIZE; y++)
{ {
dst[0] = src[0]; dst[0] = src[0];
dst[1] = src[1]; dst[1] = src[1];
@ -1375,7 +1504,8 @@ Implemented Exact 7-Tap
#else #else
uint8_t *temp= tempBlock; uint8_t *temp= tempBlock;
for(int y=0; y<BLOCK_SIZE; y++) int y;
for(y=0; y<BLOCK_SIZE; y++)
{ {
const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0]; const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
@ -1502,7 +1632,7 @@ void postprocess(unsigned char * src[], int src_stride,
return; return;
*/ */
postProcess(src[0], src_stride, dst[0], dst_stride, postProcess(src[0], src_stride, dst[0], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, false, mode); horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
horizontal_size >>= 1; horizontal_size >>= 1;
vertical_size >>= 1; vertical_size >>= 1;
@ -1512,9 +1642,9 @@ void postprocess(unsigned char * src[], int src_stride,
if(1) if(1)
{ {
postProcess(src[1], src_stride, dst[1], dst_stride, postProcess(src[1], src_stride, dst[1], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
postProcess(src[2], src_stride, dst[2], dst_stride, postProcess(src[2], src_stride, dst[2], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
} }
else else
{ {
@ -1543,11 +1673,19 @@ int getModeForQuality(int quality){
/** /**
* Copies a block from src to dst and fixes the blacklevel * Copies a block from src to dst and fixes the blacklevel
* numLines must be a multiple of 4
* levelFix == 0 -> don't touch the brightness & contrast
*/ */
static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride) static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
int numLines, int levelFix)
{ {
int i;
if(levelFix)
{
#ifdef HAVE_MMX #ifdef HAVE_MMX
asm volatile( asm volatile(
"movl %4, %%eax \n\t"
"movl %%eax, temp0\n\t"
"pushl %0 \n\t" "pushl %0 \n\t"
"pushl %1 \n\t" "pushl %1 \n\t"
"leal (%2,%2), %%eax \n\t" "leal (%2,%2), %%eax \n\t"
@ -1555,14 +1693,6 @@ static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int sr
"movq packedYOffset, %%mm2 \n\t" "movq packedYOffset, %%mm2 \n\t"
"movq packedYScale, %%mm3 \n\t" "movq packedYScale, %%mm3 \n\t"
#define SIMPLE_CPY \
"movq (%0), %%mm0 \n\t"\
"movq (%0,%2), %%mm1 \n\t"\
"psubusb %%mm2, %%mm0 \n\t"\
"psubusb %%mm2, %%mm1 \n\t"\
"movq %%mm0, (%1) \n\t"\
"movq %%mm1, (%1, %3) \n\t"\
#define SCALED_CPY \ #define SCALED_CPY \
"movq (%0), %%mm0 \n\t"\ "movq (%0), %%mm0 \n\t"\
"movq (%0,%2), %%mm1 \n\t"\ "movq (%0,%2), %%mm1 \n\t"\
@ -1585,33 +1715,75 @@ static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int sr
"packuswb %%mm5, %%mm4 \n\t"\ "packuswb %%mm5, %%mm4 \n\t"\
"movq %%mm4, (%1, %3) \n\t"\ "movq %%mm4, (%1, %3) \n\t"\
"1: \n\t"
SCALED_CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
SCALED_CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
"decl temp0 \n\t"
"jnz 1b \n\t"
#define CPY SCALED_CPY
//#define CPY SIMPLE_CPY
// "prefetchnta 8(%0)\n\t"
CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
CPY
"popl %1 \n\t" "popl %1 \n\t"
"popl %0 \n\t" "popl %0 \n\t"
: : "r" (src), : : "r" (src),
"r" (dst), "r" (dst),
"r" (srcStride), "r" (srcStride),
"r" (dstStride) "r" (dstStride),
"m" (numLines>>2)
: "%eax", "%ebx" : "%eax", "%ebx"
); );
#else #else
for(int i=0; i<BLOCK_SIZE; i++) // last 10x8 Block is copied allready so +2 for(i=0; i<numLines; i++)
memcpy( &(dst[dstStride*i]), memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE); &(src[srcStride*i]), BLOCK_SIZE);
#endif #endif
}
else
{
#ifdef HAVE_MMX
asm volatile(
"movl %4, %%eax \n\t"
"movl %%eax, temp0\n\t"
"pushl %0 \n\t"
"pushl %1 \n\t"
"leal (%2,%2), %%eax \n\t"
"leal (%3,%3), %%ebx \n\t"
"movq packedYOffset, %%mm2 \n\t"
"movq packedYScale, %%mm3 \n\t"
#define SIMPLE_CPY \
"movq (%0), %%mm0 \n\t"\
"movq (%0,%2), %%mm1 \n\t"\
"movq %%mm0, (%1) \n\t"\
"movq %%mm1, (%1, %3) \n\t"\
"1: \n\t"
SIMPLE_CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
SIMPLE_CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
"decl temp0 \n\t"
"jnz 1b \n\t"
"popl %1 \n\t"
"popl %0 \n\t"
: : "r" (src),
"r" (dst),
"r" (srcStride),
"r" (dstStride),
"m" (numLines>>2)
: "%eax", "%ebx"
);
#else
for(i=0; i<numLines; i++)
memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE);
#endif
}
} }
@ -1619,33 +1791,50 @@ CPY
* Filters array of bytes (Y or U or V values) * Filters array of bytes (Y or U or V values)
*/ */
void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
QP_STORE_T QPs[], int QPStride, bool isColor, int mode) QP_STORE_T QPs[], int QPStride, int isColor, int mode)
{ {
int x,y;
/* we need 64bit here otherwise we´ll going to have a problem
after watching a black picture for 5 hours*/
static uint64_t *yHistogram= NULL;
int black=0, white=255; // blackest black and whitest white in the picture
#ifdef TIMEING #ifdef TIMEING
long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
sumTime= rdtsc(); sumTime= rdtsc();
#endif #endif
/* we need 64bit here otherwise we´ll going to have a problem
after watching a black picture for 5 hours*/
static uint64_t *yHistogram= NULL;
if(!yHistogram) if(!yHistogram)
{ {
yHistogram= new uint64_t[256]; int i;
for(int i=0; i<256; i++) yHistogram[i]= width*height/64/256; yHistogram= (uint64_t*)malloc(8*256);
for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
} }
int black=0, white=255; // blackest black and whitest white in the picture
if(!isColor) if(!isColor)
{ {
uint64_t sum= 0; uint64_t sum= 0;
for(int i=0; i<256; i++) int i;
static int framenum= -1;
uint64_t maxClipped;
uint64_t clipped;
double scale;
framenum++;
if(framenum == 1) yHistogram[0]= width*height/64*15/256;
for(i=0; i<256; i++)
{
sum+= yHistogram[i]; sum+= yHistogram[i];
// printf("%d ", yHistogram[i]);
}
// printf("\n\n");
uint64_t maxClipped= (uint64_t)(sum * maxClippedThreshold); /* we allways get a completly black picture first */
uint64_t clipped= sum; maxClipped= (uint64_t)(sum * maxClippedThreshold);
clipped= sum;
for(black=255; black>0; black--) for(black=255; black>0; black--)
{ {
if(clipped < maxClipped) break; if(clipped < maxClipped) break;
@ -1665,9 +1854,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
packedYOffset|= packedYOffset<<16; packedYOffset|= packedYOffset<<16;
packedYOffset|= packedYOffset<<8; packedYOffset|= packedYOffset<<8;
double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
packedYScale= uint16_t(scale*256.0 + 0.5); packedYScale= (uint16_t)(scale*256.0 + 0.5);
packedYScale|= packedYScale<<32; packedYScale|= packedYScale<<32;
packedYScale|= packedYScale<<16; packedYScale|= packedYScale<<16;
} }
@ -1677,10 +1866,10 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
packedYOffset= 0; packedYOffset= 0;
} }
for(int x=0; x<width; x+=BLOCK_SIZE) for(x=0; x<width; x+=BLOCK_SIZE)
blockCopy(dst + x, dstStride, src + x, srcStride); blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
for(int y=0; y<height; y+=BLOCK_SIZE) for(y=0; y<height; y+=BLOCK_SIZE)
{ {
//1% speedup if these are here instead of the inner loop //1% speedup if these are here instead of the inner loop
uint8_t *srcBlock= &(src[y*srcStride]); uint8_t *srcBlock= &(src[y*srcStride]);
@ -1690,8 +1879,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
// finish 1 block before the next otherwise we´ll might have a problem // finish 1 block before the next otherwise we´ll might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
for(int x=0; x<width; x+=BLOCK_SIZE) for(x=0; x<width; x+=BLOCK_SIZE)
{ {
const int stride= dstStride;
int QP= isColor ? int QP= isColor ?
QPs[(y>>3)*QPStride + (x>>3)]: QPs[(y>>3)*QPStride + (x>>3)]:
(QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8; (QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8;
@ -1707,7 +1897,6 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
#endif #endif
const int stride= dstStride;
if(y + 12 < height) if(y + 12 < height)
{ {
#ifdef MORE_TIMEING #ifdef MORE_TIMEING
@ -1730,7 +1919,7 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
if(!isColor) yHistogram[ srcBlock[0] ]++; if(!isColor) yHistogram[ srcBlock[0] ]++;
blockCopy(vertBlock + dstStride*2, dstStride, blockCopy(vertBlock + dstStride*2, dstStride,
vertSrcBlock + srcStride*2, srcStride); vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
#ifdef MORE_TIMEING #ifdef MORE_TIMEING
@ -1742,7 +1931,7 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
{ {
if(mode & RK_FILTER) if(mode & RK_FILTER)
vertRKFilter(vertBlock, stride, QP); vertRKFilter(vertBlock, stride, QP);
else if(0) else if(mode & X1_FILTER)
vertX1Filter(vertBlock, stride, QP); vertX1Filter(vertBlock, stride, QP);
else else
{ {
@ -1762,12 +1951,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
#endif #endif
} }
else else
{ blockCopy(vertBlock + dstStride*1, dstStride,
for(int i=2; i<BLOCK_SIZE/2+1; i++) // last 10x8 Block is copied allready so +2 vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
memcpy( &(vertBlock[dstStride*i]),
&(vertSrcBlock[srcStride*i]), BLOCK_SIZE);
}
if(x - 8 >= 0 && x<width) if(x - 8 >= 0 && x<width)
{ {
@ -1813,8 +1999,8 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
sumTime= rdtsc() - sumTime; sumTime= rdtsc() - sumTime;
if(!isColor) if(!isColor)
printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r", printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000), (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000) (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
, black, white); , black, white);
#endif #endif
} }