mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-13 21:28:01 +02:00
bugfixes: last 3 lines not brightness/contrast corrected
brightness statistics messed up with initial black pic changed initial values of the brightness statistics C++ -> C conversation QP range question solved (very likely 1<=QP<=32 according to arpi) new experimental vertical deblocking filter RK filter has 3dNow support now (untested) Originally committed as revision 2169 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
This commit is contained in:
parent
13e0052810
commit
d5a1a99518
@ -27,6 +27,9 @@ isHorizMinMaxOk a
|
|||||||
doHorizLowPass E a a*
|
doHorizLowPass E a a*
|
||||||
doHorizDefFilter E ac ac
|
doHorizDefFilter E ac ac
|
||||||
deRing
|
deRing
|
||||||
|
RKAlgo1 E a a*
|
||||||
|
X1 a E E*
|
||||||
|
|
||||||
|
|
||||||
* i dont have a 3dnow CPU -> its untested
|
* i dont have a 3dnow CPU -> its untested
|
||||||
E = Exact implementation
|
E = Exact implementation
|
||||||
@ -41,11 +44,13 @@ verify that everything workes as it should
|
|||||||
reduce the time wasted on the mem transfer
|
reduce the time wasted on the mem transfer
|
||||||
implement dering
|
implement dering
|
||||||
implement everything in C at least (done at the moment but ...)
|
implement everything in C at least (done at the moment but ...)
|
||||||
figure range of QP out (assuming <256 for now)
|
|
||||||
unroll stuff if instructions depend too much on the prior one
|
unroll stuff if instructions depend too much on the prior one
|
||||||
we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
|
we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
|
||||||
move YScale thing to the end instead of fixing QP
|
move YScale thing to the end instead of fixing QP
|
||||||
write a faster and higher quality deblocking filter :)
|
write a faster and higher quality deblocking filter :)
|
||||||
|
do something about the speed of the horizontal filters
|
||||||
|
make the mainloop more flexible (variable number of blocks at once
|
||||||
|
(the if/else stuff per block is slowing things down)
|
||||||
...
|
...
|
||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
@ -54,6 +59,14 @@ Notes:
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
Changelog:
|
Changelog:
|
||||||
|
0.1.3
|
||||||
|
bugfixes: last 3 lines not brightness/contrast corrected
|
||||||
|
brightness statistics messed up with initial black pic
|
||||||
|
changed initial values of the brightness statistics
|
||||||
|
C++ -> C conversation
|
||||||
|
QP range question solved (very likely 1<=QP<=32 according to arpi)
|
||||||
|
new experimental vertical deblocking filter
|
||||||
|
RK filter has 3dNow support now (untested)
|
||||||
0.1.2
|
0.1.2
|
||||||
fixed a bug in the horizontal default filter
|
fixed a bug in the horizontal default filter
|
||||||
3dnow version of the Horizontal & Vertical Lowpass filters
|
3dnow version of the Horizontal & Vertical Lowpass filters
|
||||||
@ -66,6 +79,7 @@ Changelog:
|
|||||||
|
|
||||||
#include <inttypes.h>
|
#include <inttypes.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
#include "../config.h"
|
#include "../config.h"
|
||||||
//#undef HAVE_MMX2
|
//#undef HAVE_MMX2
|
||||||
//#define HAVE_3DNOW
|
//#define HAVE_3DNOW
|
||||||
@ -160,9 +174,10 @@ static inline void prefetcht2(void *p)
|
|||||||
/**
|
/**
|
||||||
* Check if the middle 8x8 Block in the given 8x10 block is flat
|
* Check if the middle 8x8 Block in the given 8x10 block is flat
|
||||||
*/
|
*/
|
||||||
static inline bool isVertDC(uint8_t src[], int stride){
|
static inline int isVertDC(uint8_t src[], int stride){
|
||||||
// return true;
|
// return true;
|
||||||
int numEq= 0;
|
int numEq= 0;
|
||||||
|
int y;
|
||||||
src+= stride; // src points to begin of the 8x8 Block
|
src+= stride; // src points to begin of the 8x8 Block
|
||||||
#ifdef HAVE_MMX
|
#ifdef HAVE_MMX
|
||||||
asm volatile(
|
asm volatile(
|
||||||
@ -242,7 +257,7 @@ static inline bool isVertDC(uint8_t src[], int stride){
|
|||||||
// uint8_t *temp= src;
|
// uint8_t *temp= src;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
for(int y=0; y<BLOCK_SIZE-1; y++)
|
for(y=0; y<BLOCK_SIZE-1; y++)
|
||||||
{
|
{
|
||||||
if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
|
if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
|
||||||
if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
|
if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
|
||||||
@ -268,10 +283,11 @@ static inline bool isVertDC(uint8_t src[], int stride){
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
return numEq > vFlatnessThreshold;
|
// for(int i=0; i<numEq/8; i++) src[i]=255;
|
||||||
|
return (numEq > vFlatnessThreshold) ? 1 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP)
|
static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
|
||||||
{
|
{
|
||||||
#ifdef HAVE_MMX
|
#ifdef HAVE_MMX
|
||||||
int isOk;
|
int isOk;
|
||||||
@ -295,13 +311,14 @@ static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP)
|
|||||||
: "=r" (isOk)
|
: "=r" (isOk)
|
||||||
: "r" (src), "r" (stride)
|
: "r" (src), "r" (stride)
|
||||||
);
|
);
|
||||||
return isOk;
|
return isOk ? 1 : 0;
|
||||||
#else
|
#else
|
||||||
|
|
||||||
int isOk2= true;
|
int isOk2= 1;
|
||||||
for(int x=0; x<BLOCK_SIZE; x++)
|
int x;
|
||||||
|
for(x=0; x<BLOCK_SIZE; x++)
|
||||||
{
|
{
|
||||||
if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=false;
|
if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
|
||||||
}
|
}
|
||||||
/* if(isOk && !isOk2 || !isOk && isOk2)
|
/* if(isOk && !isOk2 || !isOk && isOk2)
|
||||||
{
|
{
|
||||||
@ -484,8 +501,8 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
|
|||||||
const int l7= stride + l6;
|
const int l7= stride + l6;
|
||||||
const int l8= stride + l7;
|
const int l8= stride + l7;
|
||||||
const int l9= stride + l8;
|
const int l9= stride + l8;
|
||||||
|
int x;
|
||||||
for(int x=0; x<BLOCK_SIZE; x++)
|
for(x=0; x<BLOCK_SIZE; x++)
|
||||||
{
|
{
|
||||||
const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
|
const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
|
||||||
const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
|
const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
|
||||||
@ -529,7 +546,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
|
|||||||
*/
|
*/
|
||||||
static inline void vertRKFilter(uint8_t *src, int stride, int QP)
|
static inline void vertRKFilter(uint8_t *src, int stride, int QP)
|
||||||
{
|
{
|
||||||
#ifdef HAVE_MMX2
|
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
||||||
// FIXME rounding
|
// FIXME rounding
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"pxor %%mm7, %%mm7 \n\t" // 0
|
"pxor %%mm7, %%mm7 \n\t" // 0
|
||||||
@ -549,7 +566,7 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
|
|||||||
"movq %%mm2, %%mm4 \n\t" // line 4
|
"movq %%mm2, %%mm4 \n\t" // line 4
|
||||||
"pcmpeqb %%mm5, %%mm5 \n\t" // -1
|
"pcmpeqb %%mm5, %%mm5 \n\t" // -1
|
||||||
"pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
|
"pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
|
||||||
"pavgb %%mm3, %%mm5 \n\t"
|
PAVGB(%%mm3, %%mm5)
|
||||||
"paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
|
"paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
|
||||||
"psubusb %%mm3, %%mm4 \n\t"
|
"psubusb %%mm3, %%mm4 \n\t"
|
||||||
"psubusb %%mm2, %%mm3 \n\t"
|
"psubusb %%mm2, %%mm3 \n\t"
|
||||||
@ -600,16 +617,18 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
|
|||||||
const int l7= stride + l6;
|
const int l7= stride + l6;
|
||||||
const int l8= stride + l7;
|
const int l8= stride + l7;
|
||||||
const int l9= stride + l8;
|
const int l9= stride + l8;
|
||||||
for(int x=0; x<BLOCK_SIZE; x++)
|
int x;
|
||||||
|
for(x=0; x<BLOCK_SIZE; x++)
|
||||||
{
|
{
|
||||||
if(ABS(src[l4]-src[l5]) < QP + QP/4)
|
if(ABS(src[l4]-src[l5]) < QP + QP/4)
|
||||||
{
|
{
|
||||||
int x = src[l5] - src[l4];
|
int v = (src[l5] - src[l4]);
|
||||||
|
|
||||||
|
src[l3] +=v/8;
|
||||||
|
src[l4] +=v/2;
|
||||||
|
src[l5] -=v/2;
|
||||||
|
src[l6] -=v/8;
|
||||||
|
|
||||||
src[l3] +=x/8;
|
|
||||||
src[l4] +=x/2;
|
|
||||||
src[l5] -=x/2;
|
|
||||||
src[l6] -=x/8;
|
|
||||||
}
|
}
|
||||||
src++;
|
src++;
|
||||||
}
|
}
|
||||||
@ -619,18 +638,126 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Experimental Filter 1
|
* Experimental Filter 1
|
||||||
|
* will nor damage linear gradients
|
||||||
|
* can only smooth blocks at the expected locations (it cant smooth them if they did move)
|
||||||
|
* MMX2 version does correct clipping C version doesnt
|
||||||
*/
|
*/
|
||||||
static inline void vertX1Filter(uint8_t *src, int stride, int QP)
|
static inline void vertX1Filter(uint8_t *src, int stride, int QP)
|
||||||
{
|
{
|
||||||
#ifdef HAVE_MMX2X
|
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
||||||
// FIXME
|
|
||||||
asm volatile(
|
asm volatile(
|
||||||
|
"pxor %%mm7, %%mm7 \n\t" // 0
|
||||||
|
// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
|
||||||
|
"leal (%0, %1), %%eax \n\t"
|
||||||
|
"leal (%%eax, %1, 4), %%ebx \n\t"
|
||||||
|
// 0 1 2 3 4 5 6 7 8 9
|
||||||
|
// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
|
||||||
|
"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
|
||||||
|
"movq (%0, %1, 4), %%mm1 \n\t" // line 4
|
||||||
|
"movq %%mm1, %%mm2 \n\t" // line 4
|
||||||
|
"psubusb %%mm0, %%mm1 \n\t"
|
||||||
|
"psubusb %%mm2, %%mm0 \n\t"
|
||||||
|
"por %%mm1, %%mm0 \n\t" // |l2 - l3|
|
||||||
|
"movq (%%ebx), %%mm3 \n\t" // line 5
|
||||||
|
"movq (%%ebx, %1), %%mm4 \n\t" // line 6
|
||||||
|
"movq %%mm3, %%mm5 \n\t" // line 5
|
||||||
|
"psubusb %%mm4, %%mm3 \n\t"
|
||||||
|
"psubusb %%mm5, %%mm4 \n\t"
|
||||||
|
"por %%mm4, %%mm3 \n\t" // |l5 - l6|
|
||||||
|
PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
|
||||||
|
"movq %%mm2, %%mm1 \n\t" // line 4
|
||||||
|
"psubusb %%mm5, %%mm2 \n\t"
|
||||||
|
"movq %%mm2, %%mm4 \n\t"
|
||||||
|
"pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
|
||||||
|
"psubusb %%mm1, %%mm5 \n\t"
|
||||||
|
"por %%mm5, %%mm4 \n\t" // |l4 - l5|
|
||||||
|
"psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
|
||||||
|
"movq %%mm4, %%mm3 \n\t" // d
|
||||||
|
"psubusb pQPb, %%mm4 \n\t"
|
||||||
|
"pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
|
||||||
|
"pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
|
||||||
|
|
||||||
|
PAVGB(%%mm7, %%mm3) // d/2
|
||||||
|
|
||||||
|
"movq (%0, %1, 4), %%mm0 \n\t" // line 4
|
||||||
|
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
|
||||||
|
"psubusb %%mm3, %%mm0 \n\t"
|
||||||
|
"pxor %%mm2, %%mm0 \n\t"
|
||||||
|
"movq %%mm0, (%0, %1, 4) \n\t" // line 4
|
||||||
|
|
||||||
|
"movq (%%ebx), %%mm0 \n\t" // line 5
|
||||||
|
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
|
||||||
|
"paddusb %%mm3, %%mm0 \n\t"
|
||||||
|
"pxor %%mm2, %%mm0 \n\t"
|
||||||
|
"movq %%mm0, (%%ebx) \n\t" // line 5
|
||||||
|
|
||||||
|
PAVGB(%%mm7, %%mm3) // d/4
|
||||||
|
|
||||||
|
"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
|
||||||
|
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
|
||||||
|
"psubusb %%mm3, %%mm0 \n\t"
|
||||||
|
"pxor %%mm2, %%mm0 \n\t"
|
||||||
|
"movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
|
||||||
|
|
||||||
|
"movq (%%ebx, %1), %%mm0 \n\t" // line 6
|
||||||
|
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
|
||||||
|
"paddusb %%mm3, %%mm0 \n\t"
|
||||||
|
"pxor %%mm2, %%mm0 \n\t"
|
||||||
|
"movq %%mm0, (%%ebx, %1) \n\t" // line 6
|
||||||
|
|
||||||
|
PAVGB(%%mm7, %%mm3) // d/8
|
||||||
|
|
||||||
|
"movq (%%eax, %1), %%mm0 \n\t" // line 2
|
||||||
|
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
|
||||||
|
"psubusb %%mm3, %%mm0 \n\t"
|
||||||
|
"pxor %%mm2, %%mm0 \n\t"
|
||||||
|
"movq %%mm0, (%%eax, %1) \n\t" // line 2
|
||||||
|
|
||||||
|
"movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
|
||||||
|
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
|
||||||
|
"paddusb %%mm3, %%mm0 \n\t"
|
||||||
|
"pxor %%mm2, %%mm0 \n\t"
|
||||||
|
"movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
|
||||||
|
|
||||||
:
|
:
|
||||||
: "r" (src), "r" (stride)
|
: "r" (src), "r" (stride)
|
||||||
: "%eax", "%ebx"
|
: "%eax", "%ebx"
|
||||||
);
|
);
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
const int l1= stride;
|
||||||
|
const int l2= stride + l1;
|
||||||
|
const int l3= stride + l2;
|
||||||
|
const int l4= stride + l3;
|
||||||
|
const int l5= stride + l4;
|
||||||
|
const int l6= stride + l5;
|
||||||
|
const int l7= stride + l6;
|
||||||
|
const int l8= stride + l7;
|
||||||
|
const int l9= stride + l8;
|
||||||
|
int x;
|
||||||
|
for(x=0; x<BLOCK_SIZE; x++)
|
||||||
|
{
|
||||||
|
int a= src[l3] - src[l4];
|
||||||
|
int b= src[l4] - src[l5];
|
||||||
|
int c= src[l6] - src[l7];
|
||||||
|
|
||||||
|
int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
|
||||||
|
|
||||||
|
if(d < QP)
|
||||||
|
{
|
||||||
|
int v = d * SIGN(-b);
|
||||||
|
|
||||||
|
src[l2] +=v/8;
|
||||||
|
src[l3] +=v/4;
|
||||||
|
src[l4] +=v/2;
|
||||||
|
src[l5] -=v/2;
|
||||||
|
src[l6] -=v/4;
|
||||||
|
src[l7] -=v/8;
|
||||||
|
|
||||||
|
}
|
||||||
|
src++;
|
||||||
|
}
|
||||||
|
/*
|
||||||
const int l1= stride;
|
const int l1= stride;
|
||||||
const int l2= stride + l1;
|
const int l2= stride + l1;
|
||||||
const int l3= stride + l2;
|
const int l3= stride + l2;
|
||||||
@ -658,7 +785,7 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
|
|||||||
}
|
}
|
||||||
src++;
|
src++;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -908,8 +1035,8 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
|
|||||||
const int l7= stride + l6;
|
const int l7= stride + l6;
|
||||||
const int l8= stride + l7;
|
const int l8= stride + l7;
|
||||||
// const int l9= stride + l8;
|
// const int l9= stride + l8;
|
||||||
|
int x;
|
||||||
for(int x=0; x<BLOCK_SIZE; x++)
|
for(x=0; x<BLOCK_SIZE; x++)
|
||||||
{
|
{
|
||||||
const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
|
const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
|
||||||
if(ABS(middleEnergy) < 8*QP)
|
if(ABS(middleEnergy) < 8*QP)
|
||||||
@ -947,7 +1074,7 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
|
|||||||
/**
|
/**
|
||||||
* Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock.
|
* Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock.
|
||||||
*/
|
*/
|
||||||
static inline bool isHorizDCAndCopy2Temp(uint8_t src[], int stride)
|
static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
|
||||||
{
|
{
|
||||||
// src++;
|
// src++;
|
||||||
int numEq= 0;
|
int numEq= 0;
|
||||||
@ -1007,7 +1134,8 @@ asm volatile (
|
|||||||
// printf("%d\n", numEq);
|
// printf("%d\n", numEq);
|
||||||
numEq= (256 - (numEq & 0xFF)) &0xFF;
|
numEq= (256 - (numEq & 0xFF)) &0xFF;
|
||||||
#else
|
#else
|
||||||
for(int y=0; y<BLOCK_SIZE; y++)
|
int y;
|
||||||
|
for(y=0; y<BLOCK_SIZE; y++)
|
||||||
{
|
{
|
||||||
if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
|
if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
|
||||||
if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
|
if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
|
||||||
@ -1044,7 +1172,7 @@ asm volatile (
|
|||||||
return numEq > hFlatnessThreshold;
|
return numEq > hFlatnessThreshold;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline bool isHorizMinMaxOk(uint8_t src[], int stride, int QP)
|
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
|
||||||
{
|
{
|
||||||
#ifdef MMX_FIXME
|
#ifdef MMX_FIXME
|
||||||
FIXME
|
FIXME
|
||||||
@ -1071,9 +1199,9 @@ FIXME
|
|||||||
);
|
);
|
||||||
return isOk;
|
return isOk;
|
||||||
#else
|
#else
|
||||||
if(abs(src[0] - src[7]) > 2*QP) return false;
|
if(abs(src[0] - src[7]) > 2*QP) return 0;
|
||||||
|
|
||||||
return true;
|
return 1;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1173,7 +1301,8 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
|
|||||||
#else
|
#else
|
||||||
uint8_t *src= tempBlock;
|
uint8_t *src= tempBlock;
|
||||||
|
|
||||||
for(int y=0; y<BLOCK_SIZE; y++)
|
int y;
|
||||||
|
for(y=0; y<BLOCK_SIZE; y++)
|
||||||
{
|
{
|
||||||
dst[0] = src[0];
|
dst[0] = src[0];
|
||||||
dst[1] = src[1];
|
dst[1] = src[1];
|
||||||
@ -1375,7 +1504,8 @@ Implemented Exact 7-Tap
|
|||||||
|
|
||||||
#else
|
#else
|
||||||
uint8_t *temp= tempBlock;
|
uint8_t *temp= tempBlock;
|
||||||
for(int y=0; y<BLOCK_SIZE; y++)
|
int y;
|
||||||
|
for(y=0; y<BLOCK_SIZE; y++)
|
||||||
{
|
{
|
||||||
const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
|
const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
|
||||||
const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
|
const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
|
||||||
@ -1502,7 +1632,7 @@ void postprocess(unsigned char * src[], int src_stride,
|
|||||||
return;
|
return;
|
||||||
*/
|
*/
|
||||||
postProcess(src[0], src_stride, dst[0], dst_stride,
|
postProcess(src[0], src_stride, dst[0], dst_stride,
|
||||||
horizontal_size, vertical_size, QP_store, QP_stride, false, mode);
|
horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
|
||||||
|
|
||||||
horizontal_size >>= 1;
|
horizontal_size >>= 1;
|
||||||
vertical_size >>= 1;
|
vertical_size >>= 1;
|
||||||
@ -1512,9 +1642,9 @@ void postprocess(unsigned char * src[], int src_stride,
|
|||||||
if(1)
|
if(1)
|
||||||
{
|
{
|
||||||
postProcess(src[1], src_stride, dst[1], dst_stride,
|
postProcess(src[1], src_stride, dst[1], dst_stride,
|
||||||
horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4);
|
horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
|
||||||
postProcess(src[2], src_stride, dst[2], dst_stride,
|
postProcess(src[2], src_stride, dst[2], dst_stride,
|
||||||
horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4);
|
horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -1543,11 +1673,19 @@ int getModeForQuality(int quality){
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Copies a block from src to dst and fixes the blacklevel
|
* Copies a block from src to dst and fixes the blacklevel
|
||||||
|
* numLines must be a multiple of 4
|
||||||
|
* levelFix == 0 -> dont touch the brighness & contrast
|
||||||
*/
|
*/
|
||||||
static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride)
|
static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
|
||||||
|
int numLines, int levelFix)
|
||||||
{
|
{
|
||||||
|
int i;
|
||||||
|
if(levelFix)
|
||||||
|
{
|
||||||
#ifdef HAVE_MMX
|
#ifdef HAVE_MMX
|
||||||
asm volatile(
|
asm volatile(
|
||||||
|
"movl %4, %%eax \n\t"
|
||||||
|
"movl %%eax, temp0\n\t"
|
||||||
"pushl %0 \n\t"
|
"pushl %0 \n\t"
|
||||||
"pushl %1 \n\t"
|
"pushl %1 \n\t"
|
||||||
"leal (%2,%2), %%eax \n\t"
|
"leal (%2,%2), %%eax \n\t"
|
||||||
@ -1555,14 +1693,6 @@ static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int sr
|
|||||||
"movq packedYOffset, %%mm2 \n\t"
|
"movq packedYOffset, %%mm2 \n\t"
|
||||||
"movq packedYScale, %%mm3 \n\t"
|
"movq packedYScale, %%mm3 \n\t"
|
||||||
|
|
||||||
#define SIMPLE_CPY \
|
|
||||||
"movq (%0), %%mm0 \n\t"\
|
|
||||||
"movq (%0,%2), %%mm1 \n\t"\
|
|
||||||
"psubusb %%mm2, %%mm0 \n\t"\
|
|
||||||
"psubusb %%mm2, %%mm1 \n\t"\
|
|
||||||
"movq %%mm0, (%1) \n\t"\
|
|
||||||
"movq %%mm1, (%1, %3) \n\t"\
|
|
||||||
|
|
||||||
#define SCALED_CPY \
|
#define SCALED_CPY \
|
||||||
"movq (%0), %%mm0 \n\t"\
|
"movq (%0), %%mm0 \n\t"\
|
||||||
"movq (%0,%2), %%mm1 \n\t"\
|
"movq (%0,%2), %%mm1 \n\t"\
|
||||||
@ -1585,33 +1715,75 @@ static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int sr
|
|||||||
"packuswb %%mm5, %%mm4 \n\t"\
|
"packuswb %%mm5, %%mm4 \n\t"\
|
||||||
"movq %%mm4, (%1, %3) \n\t"\
|
"movq %%mm4, (%1, %3) \n\t"\
|
||||||
|
|
||||||
|
"1: \n\t"
|
||||||
|
SCALED_CPY
|
||||||
|
"addl %%eax, %0 \n\t"
|
||||||
|
"addl %%ebx, %1 \n\t"
|
||||||
|
SCALED_CPY
|
||||||
|
"addl %%eax, %0 \n\t"
|
||||||
|
"addl %%ebx, %1 \n\t"
|
||||||
|
"decl temp0 \n\t"
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
#define CPY SCALED_CPY
|
|
||||||
//#define CPY SIMPLE_CPY
|
|
||||||
// "prefetchnta 8(%0)\n\t"
|
|
||||||
CPY
|
|
||||||
"addl %%eax, %0 \n\t"
|
|
||||||
"addl %%ebx, %1 \n\t"
|
|
||||||
CPY
|
|
||||||
"addl %%eax, %0 \n\t"
|
|
||||||
"addl %%ebx, %1 \n\t"
|
|
||||||
CPY
|
|
||||||
"addl %%eax, %0 \n\t"
|
|
||||||
"addl %%ebx, %1 \n\t"
|
|
||||||
CPY
|
|
||||||
"popl %1 \n\t"
|
"popl %1 \n\t"
|
||||||
"popl %0 \n\t"
|
"popl %0 \n\t"
|
||||||
: : "r" (src),
|
: : "r" (src),
|
||||||
"r" (dst),
|
"r" (dst),
|
||||||
"r" (srcStride),
|
"r" (srcStride),
|
||||||
"r" (dstStride)
|
"r" (dstStride),
|
||||||
|
"m" (numLines>>2)
|
||||||
: "%eax", "%ebx"
|
: "%eax", "%ebx"
|
||||||
);
|
);
|
||||||
#else
|
#else
|
||||||
for(int i=0; i<BLOCK_SIZE; i++) // last 10x8 Block is copied allready so +2
|
for(i=0; i<numLines; i++)
|
||||||
memcpy( &(dst[dstStride*i]),
|
memcpy( &(dst[dstStride*i]),
|
||||||
&(src[srcStride*i]), BLOCK_SIZE);
|
&(src[srcStride*i]), BLOCK_SIZE);
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
#ifdef HAVE_MMX
|
||||||
|
asm volatile(
|
||||||
|
"movl %4, %%eax \n\t"
|
||||||
|
"movl %%eax, temp0\n\t"
|
||||||
|
"pushl %0 \n\t"
|
||||||
|
"pushl %1 \n\t"
|
||||||
|
"leal (%2,%2), %%eax \n\t"
|
||||||
|
"leal (%3,%3), %%ebx \n\t"
|
||||||
|
"movq packedYOffset, %%mm2 \n\t"
|
||||||
|
"movq packedYScale, %%mm3 \n\t"
|
||||||
|
|
||||||
|
#define SIMPLE_CPY \
|
||||||
|
"movq (%0), %%mm0 \n\t"\
|
||||||
|
"movq (%0,%2), %%mm1 \n\t"\
|
||||||
|
"movq %%mm0, (%1) \n\t"\
|
||||||
|
"movq %%mm1, (%1, %3) \n\t"\
|
||||||
|
|
||||||
|
"1: \n\t"
|
||||||
|
SIMPLE_CPY
|
||||||
|
"addl %%eax, %0 \n\t"
|
||||||
|
"addl %%ebx, %1 \n\t"
|
||||||
|
SIMPLE_CPY
|
||||||
|
"addl %%eax, %0 \n\t"
|
||||||
|
"addl %%ebx, %1 \n\t"
|
||||||
|
"decl temp0 \n\t"
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
|
"popl %1 \n\t"
|
||||||
|
"popl %0 \n\t"
|
||||||
|
: : "r" (src),
|
||||||
|
"r" (dst),
|
||||||
|
"r" (srcStride),
|
||||||
|
"r" (dstStride),
|
||||||
|
"m" (numLines>>2)
|
||||||
|
: "%eax", "%ebx"
|
||||||
|
);
|
||||||
|
#else
|
||||||
|
for(i=0; i<numLines; i++)
|
||||||
|
memcpy( &(dst[dstStride*i]),
|
||||||
|
&(src[srcStride*i]), BLOCK_SIZE);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -1619,33 +1791,50 @@ CPY
|
|||||||
* Filters array of bytes (Y or U or V values)
|
* Filters array of bytes (Y or U or V values)
|
||||||
*/
|
*/
|
||||||
void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
|
void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
|
||||||
QP_STORE_T QPs[], int QPStride, bool isColor, int mode)
|
QP_STORE_T QPs[], int QPStride, int isColor, int mode)
|
||||||
{
|
{
|
||||||
|
int x,y;
|
||||||
|
/* we need 64bit here otherwise we´ll going to have a problem
|
||||||
|
after watching a black picture for 5 hours*/
|
||||||
|
static uint64_t *yHistogram= NULL;
|
||||||
|
int black=0, white=255; // blackest black and whitest white in the picture
|
||||||
|
|
||||||
#ifdef TIMEING
|
#ifdef TIMEING
|
||||||
long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
|
long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
|
||||||
sumTime= rdtsc();
|
sumTime= rdtsc();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* we need 64bit here otherwise we´ll going to have a problem
|
|
||||||
after watching a black picture for 5 hours*/
|
|
||||||
static uint64_t *yHistogram= NULL;
|
|
||||||
if(!yHistogram)
|
if(!yHistogram)
|
||||||
{
|
{
|
||||||
yHistogram= new uint64_t[256];
|
int i;
|
||||||
for(int i=0; i<256; i++) yHistogram[i]= width*height/64/256;
|
yHistogram= (uint64_t*)malloc(8*256);
|
||||||
|
for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
|
||||||
}
|
}
|
||||||
|
|
||||||
int black=0, white=255; // blackest black and whitest white in the picture
|
|
||||||
if(!isColor)
|
if(!isColor)
|
||||||
{
|
{
|
||||||
uint64_t sum= 0;
|
uint64_t sum= 0;
|
||||||
for(int i=0; i<256; i++)
|
int i;
|
||||||
|
static int framenum= -1;
|
||||||
|
uint64_t maxClipped;
|
||||||
|
uint64_t clipped;
|
||||||
|
double scale;
|
||||||
|
|
||||||
|
framenum++;
|
||||||
|
if(framenum == 1) yHistogram[0]= width*height/64*15/256;
|
||||||
|
|
||||||
|
for(i=0; i<256; i++)
|
||||||
|
{
|
||||||
sum+= yHistogram[i];
|
sum+= yHistogram[i];
|
||||||
|
// printf("%d ", yHistogram[i]);
|
||||||
|
}
|
||||||
|
// printf("\n\n");
|
||||||
|
|
||||||
uint64_t maxClipped= (uint64_t)(sum * maxClippedThreshold);
|
/* we allways get a completly black picture first */
|
||||||
|
|
||||||
uint64_t clipped= sum;
|
maxClipped= (uint64_t)(sum * maxClippedThreshold);
|
||||||
|
|
||||||
|
clipped= sum;
|
||||||
for(black=255; black>0; black--)
|
for(black=255; black>0; black--)
|
||||||
{
|
{
|
||||||
if(clipped < maxClipped) break;
|
if(clipped < maxClipped) break;
|
||||||
@ -1665,9 +1854,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
packedYOffset|= packedYOffset<<16;
|
packedYOffset|= packedYOffset<<16;
|
||||||
packedYOffset|= packedYOffset<<8;
|
packedYOffset|= packedYOffset<<8;
|
||||||
|
|
||||||
double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
|
scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
|
||||||
|
|
||||||
packedYScale= uint16_t(scale*256.0 + 0.5);
|
packedYScale= (uint16_t)(scale*256.0 + 0.5);
|
||||||
packedYScale|= packedYScale<<32;
|
packedYScale|= packedYScale<<32;
|
||||||
packedYScale|= packedYScale<<16;
|
packedYScale|= packedYScale<<16;
|
||||||
}
|
}
|
||||||
@ -1677,10 +1866,10 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
packedYOffset= 0;
|
packedYOffset= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int x=0; x<width; x+=BLOCK_SIZE)
|
for(x=0; x<width; x+=BLOCK_SIZE)
|
||||||
blockCopy(dst + x, dstStride, src + x, srcStride);
|
blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
|
||||||
|
|
||||||
for(int y=0; y<height; y+=BLOCK_SIZE)
|
for(y=0; y<height; y+=BLOCK_SIZE)
|
||||||
{
|
{
|
||||||
//1% speedup if these are here instead of the inner loop
|
//1% speedup if these are here instead of the inner loop
|
||||||
uint8_t *srcBlock= &(src[y*srcStride]);
|
uint8_t *srcBlock= &(src[y*srcStride]);
|
||||||
@ -1690,8 +1879,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
|
|
||||||
// finish 1 block before the next otherwise we´ll might have a problem
|
// finish 1 block before the next otherwise we´ll might have a problem
|
||||||
// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
|
// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
|
||||||
for(int x=0; x<width; x+=BLOCK_SIZE)
|
for(x=0; x<width; x+=BLOCK_SIZE)
|
||||||
{
|
{
|
||||||
|
const int stride= dstStride;
|
||||||
int QP= isColor ?
|
int QP= isColor ?
|
||||||
QPs[(y>>3)*QPStride + (x>>3)]:
|
QPs[(y>>3)*QPStride + (x>>3)]:
|
||||||
(QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8;
|
(QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8;
|
||||||
@ -1707,7 +1897,6 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
const int stride= dstStride;
|
|
||||||
if(y + 12 < height)
|
if(y + 12 < height)
|
||||||
{
|
{
|
||||||
#ifdef MORE_TIMEING
|
#ifdef MORE_TIMEING
|
||||||
@ -1730,7 +1919,7 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
if(!isColor) yHistogram[ srcBlock[0] ]++;
|
if(!isColor) yHistogram[ srcBlock[0] ]++;
|
||||||
|
|
||||||
blockCopy(vertBlock + dstStride*2, dstStride,
|
blockCopy(vertBlock + dstStride*2, dstStride,
|
||||||
vertSrcBlock + srcStride*2, srcStride);
|
vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
|
||||||
|
|
||||||
|
|
||||||
#ifdef MORE_TIMEING
|
#ifdef MORE_TIMEING
|
||||||
@ -1742,7 +1931,7 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
{
|
{
|
||||||
if(mode & RK_FILTER)
|
if(mode & RK_FILTER)
|
||||||
vertRKFilter(vertBlock, stride, QP);
|
vertRKFilter(vertBlock, stride, QP);
|
||||||
else if(0)
|
else if(mode & X1_FILTER)
|
||||||
vertX1Filter(vertBlock, stride, QP);
|
vertX1Filter(vertBlock, stride, QP);
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -1762,12 +1951,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
blockCopy(vertBlock + dstStride*1, dstStride,
|
||||||
for(int i=2; i<BLOCK_SIZE/2+1; i++) // last 10x8 Block is copied allready so +2
|
vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
|
||||||
memcpy( &(vertBlock[dstStride*i]),
|
|
||||||
&(vertSrcBlock[srcStride*i]), BLOCK_SIZE);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
if(x - 8 >= 0 && x<width)
|
if(x - 8 >= 0 && x<width)
|
||||||
{
|
{
|
||||||
@ -1813,8 +1999,8 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
sumTime= rdtsc() - sumTime;
|
sumTime= rdtsc() - sumTime;
|
||||||
if(!isColor)
|
if(!isColor)
|
||||||
printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
|
printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
|
||||||
int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000),
|
(int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
|
||||||
int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000)
|
(int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
|
||||||
, black, white);
|
, black, white);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
@ -26,6 +26,7 @@
|
|||||||
#define V_DEBLOCK 0x01
|
#define V_DEBLOCK 0x01
|
||||||
#define H_DEBLOCK 0x02
|
#define H_DEBLOCK 0x02
|
||||||
#define DERING 0x04
|
#define DERING 0x04
|
||||||
|
#define LEVEL_FIX 0x08 /* Brightness & Contrast */
|
||||||
|
|
||||||
#define LUM_V_DEBLOCK V_DEBLOCK
|
#define LUM_V_DEBLOCK V_DEBLOCK
|
||||||
#define LUM_H_DEBLOCK H_DEBLOCK
|
#define LUM_H_DEBLOCK H_DEBLOCK
|
||||||
@ -33,12 +34,19 @@
|
|||||||
#define CHROM_H_DEBLOCK (H_DEBLOCK<<4)
|
#define CHROM_H_DEBLOCK (H_DEBLOCK<<4)
|
||||||
#define LUM_DERING DERING
|
#define LUM_DERING DERING
|
||||||
#define CHROM_DERING (DERING<<4)
|
#define CHROM_DERING (DERING<<4)
|
||||||
|
#define LUM_LEVEL_FIX LEVEL_FIX
|
||||||
|
//not supported currently
|
||||||
|
#define CHROM_LEVEL_FIX (LEVEL_FIX<<4)
|
||||||
|
|
||||||
// Experimental stuff
|
// Experimental stuff
|
||||||
#define RK_FILTER 0x0100
|
#define RK_FILTER 0x0100
|
||||||
#define LUM_V_RK_FILTER RK_FILTER
|
#define LUM_V_RK_FILTER RK_FILTER
|
||||||
#define CHROM_V_RK_FILTER (RK_FILTER<<4)
|
#define CHROM_V_RK_FILTER (RK_FILTER<<4)
|
||||||
|
|
||||||
|
#define X1_FILTER 0x0200
|
||||||
|
#define LUM_V_X1_FILTER X1_FILTER
|
||||||
|
#define CHROM_V_X1_FILTER (X1_FILTER<<4)
|
||||||
|
|
||||||
|
|
||||||
#define TIMEING
|
#define TIMEING
|
||||||
#define MORE_TIMEING
|
#define MORE_TIMEING
|
||||||
@ -60,7 +68,7 @@
|
|||||||
//#include <inttypes.h>
|
//#include <inttypes.h>
|
||||||
|
|
||||||
void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
|
void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
|
||||||
QP_STORE_T QPs[], int QPStride, bool isColor, int mode);
|
QP_STORE_T QPs[], int QPStride, int isColor, int mode);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
@ -27,6 +27,9 @@ isHorizMinMaxOk a
|
|||||||
doHorizLowPass E a a*
|
doHorizLowPass E a a*
|
||||||
doHorizDefFilter E ac ac
|
doHorizDefFilter E ac ac
|
||||||
deRing
|
deRing
|
||||||
|
RKAlgo1 E a a*
|
||||||
|
X1 a E E*
|
||||||
|
|
||||||
|
|
||||||
* i dont have a 3dnow CPU -> its untested
|
* i dont have a 3dnow CPU -> its untested
|
||||||
E = Exact implementation
|
E = Exact implementation
|
||||||
@ -41,11 +44,13 @@ verify that everything workes as it should
|
|||||||
reduce the time wasted on the mem transfer
|
reduce the time wasted on the mem transfer
|
||||||
implement dering
|
implement dering
|
||||||
implement everything in C at least (done at the moment but ...)
|
implement everything in C at least (done at the moment but ...)
|
||||||
figure range of QP out (assuming <256 for now)
|
|
||||||
unroll stuff if instructions depend too much on the prior one
|
unroll stuff if instructions depend too much on the prior one
|
||||||
we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
|
we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
|
||||||
move YScale thing to the end instead of fixing QP
|
move YScale thing to the end instead of fixing QP
|
||||||
write a faster and higher quality deblocking filter :)
|
write a faster and higher quality deblocking filter :)
|
||||||
|
do something about the speed of the horizontal filters
|
||||||
|
make the mainloop more flexible (variable number of blocks at once
|
||||||
|
(the if/else stuff per block is slowing things down)
|
||||||
...
|
...
|
||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
@ -54,6 +59,14 @@ Notes:
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
Changelog:
|
Changelog:
|
||||||
|
0.1.3
|
||||||
|
bugfixes: last 3 lines not brightness/contrast corrected
|
||||||
|
brightness statistics messed up with initial black pic
|
||||||
|
changed initial values of the brightness statistics
|
||||||
|
C++ -> C conversation
|
||||||
|
QP range question solved (very likely 1<=QP<=32 according to arpi)
|
||||||
|
new experimental vertical deblocking filter
|
||||||
|
RK filter has 3dNow support now (untested)
|
||||||
0.1.2
|
0.1.2
|
||||||
fixed a bug in the horizontal default filter
|
fixed a bug in the horizontal default filter
|
||||||
3dnow version of the Horizontal & Vertical Lowpass filters
|
3dnow version of the Horizontal & Vertical Lowpass filters
|
||||||
@ -66,6 +79,7 @@ Changelog:
|
|||||||
|
|
||||||
#include <inttypes.h>
|
#include <inttypes.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
#include "../config.h"
|
#include "../config.h"
|
||||||
//#undef HAVE_MMX2
|
//#undef HAVE_MMX2
|
||||||
//#define HAVE_3DNOW
|
//#define HAVE_3DNOW
|
||||||
@ -160,9 +174,10 @@ static inline void prefetcht2(void *p)
|
|||||||
/**
|
/**
|
||||||
* Check if the middle 8x8 Block in the given 8x10 block is flat
|
* Check if the middle 8x8 Block in the given 8x10 block is flat
|
||||||
*/
|
*/
|
||||||
static inline bool isVertDC(uint8_t src[], int stride){
|
static inline int isVertDC(uint8_t src[], int stride){
|
||||||
// return true;
|
// return true;
|
||||||
int numEq= 0;
|
int numEq= 0;
|
||||||
|
int y;
|
||||||
src+= stride; // src points to begin of the 8x8 Block
|
src+= stride; // src points to begin of the 8x8 Block
|
||||||
#ifdef HAVE_MMX
|
#ifdef HAVE_MMX
|
||||||
asm volatile(
|
asm volatile(
|
||||||
@ -242,7 +257,7 @@ static inline bool isVertDC(uint8_t src[], int stride){
|
|||||||
// uint8_t *temp= src;
|
// uint8_t *temp= src;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
for(int y=0; y<BLOCK_SIZE-1; y++)
|
for(y=0; y<BLOCK_SIZE-1; y++)
|
||||||
{
|
{
|
||||||
if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
|
if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
|
||||||
if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
|
if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
|
||||||
@ -268,10 +283,11 @@ static inline bool isVertDC(uint8_t src[], int stride){
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
return numEq > vFlatnessThreshold;
|
// for(int i=0; i<numEq/8; i++) src[i]=255;
|
||||||
|
return (numEq > vFlatnessThreshold) ? 1 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP)
|
static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
|
||||||
{
|
{
|
||||||
#ifdef HAVE_MMX
|
#ifdef HAVE_MMX
|
||||||
int isOk;
|
int isOk;
|
||||||
@ -295,13 +311,14 @@ static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP)
|
|||||||
: "=r" (isOk)
|
: "=r" (isOk)
|
||||||
: "r" (src), "r" (stride)
|
: "r" (src), "r" (stride)
|
||||||
);
|
);
|
||||||
return isOk;
|
return isOk ? 1 : 0;
|
||||||
#else
|
#else
|
||||||
|
|
||||||
int isOk2= true;
|
int isOk2= 1;
|
||||||
for(int x=0; x<BLOCK_SIZE; x++)
|
int x;
|
||||||
|
for(x=0; x<BLOCK_SIZE; x++)
|
||||||
{
|
{
|
||||||
if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=false;
|
if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
|
||||||
}
|
}
|
||||||
/* if(isOk && !isOk2 || !isOk && isOk2)
|
/* if(isOk && !isOk2 || !isOk && isOk2)
|
||||||
{
|
{
|
||||||
@ -484,8 +501,8 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
|
|||||||
const int l7= stride + l6;
|
const int l7= stride + l6;
|
||||||
const int l8= stride + l7;
|
const int l8= stride + l7;
|
||||||
const int l9= stride + l8;
|
const int l9= stride + l8;
|
||||||
|
int x;
|
||||||
for(int x=0; x<BLOCK_SIZE; x++)
|
for(x=0; x<BLOCK_SIZE; x++)
|
||||||
{
|
{
|
||||||
const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
|
const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
|
||||||
const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
|
const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
|
||||||
@ -529,7 +546,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
|
|||||||
*/
|
*/
|
||||||
static inline void vertRKFilter(uint8_t *src, int stride, int QP)
|
static inline void vertRKFilter(uint8_t *src, int stride, int QP)
|
||||||
{
|
{
|
||||||
#ifdef HAVE_MMX2
|
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
||||||
// FIXME rounding
|
// FIXME rounding
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"pxor %%mm7, %%mm7 \n\t" // 0
|
"pxor %%mm7, %%mm7 \n\t" // 0
|
||||||
@ -549,7 +566,7 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
|
|||||||
"movq %%mm2, %%mm4 \n\t" // line 4
|
"movq %%mm2, %%mm4 \n\t" // line 4
|
||||||
"pcmpeqb %%mm5, %%mm5 \n\t" // -1
|
"pcmpeqb %%mm5, %%mm5 \n\t" // -1
|
||||||
"pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
|
"pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
|
||||||
"pavgb %%mm3, %%mm5 \n\t"
|
PAVGB(%%mm3, %%mm5)
|
||||||
"paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
|
"paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
|
||||||
"psubusb %%mm3, %%mm4 \n\t"
|
"psubusb %%mm3, %%mm4 \n\t"
|
||||||
"psubusb %%mm2, %%mm3 \n\t"
|
"psubusb %%mm2, %%mm3 \n\t"
|
||||||
@ -600,16 +617,18 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
|
|||||||
const int l7= stride + l6;
|
const int l7= stride + l6;
|
||||||
const int l8= stride + l7;
|
const int l8= stride + l7;
|
||||||
const int l9= stride + l8;
|
const int l9= stride + l8;
|
||||||
for(int x=0; x<BLOCK_SIZE; x++)
|
int x;
|
||||||
|
for(x=0; x<BLOCK_SIZE; x++)
|
||||||
{
|
{
|
||||||
if(ABS(src[l4]-src[l5]) < QP + QP/4)
|
if(ABS(src[l4]-src[l5]) < QP + QP/4)
|
||||||
{
|
{
|
||||||
int x = src[l5] - src[l4];
|
int v = (src[l5] - src[l4]);
|
||||||
|
|
||||||
|
src[l3] +=v/8;
|
||||||
|
src[l4] +=v/2;
|
||||||
|
src[l5] -=v/2;
|
||||||
|
src[l6] -=v/8;
|
||||||
|
|
||||||
src[l3] +=x/8;
|
|
||||||
src[l4] +=x/2;
|
|
||||||
src[l5] -=x/2;
|
|
||||||
src[l6] -=x/8;
|
|
||||||
}
|
}
|
||||||
src++;
|
src++;
|
||||||
}
|
}
|
||||||
@ -619,18 +638,126 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Experimental Filter 1
|
* Experimental Filter 1
|
||||||
|
* will nor damage linear gradients
|
||||||
|
* can only smooth blocks at the expected locations (it cant smooth them if they did move)
|
||||||
|
* MMX2 version does correct clipping C version doesnt
|
||||||
*/
|
*/
|
||||||
static inline void vertX1Filter(uint8_t *src, int stride, int QP)
|
static inline void vertX1Filter(uint8_t *src, int stride, int QP)
|
||||||
{
|
{
|
||||||
#ifdef HAVE_MMX2X
|
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
||||||
// FIXME
|
|
||||||
asm volatile(
|
asm volatile(
|
||||||
|
"pxor %%mm7, %%mm7 \n\t" // 0
|
||||||
|
// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
|
||||||
|
"leal (%0, %1), %%eax \n\t"
|
||||||
|
"leal (%%eax, %1, 4), %%ebx \n\t"
|
||||||
|
// 0 1 2 3 4 5 6 7 8 9
|
||||||
|
// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
|
||||||
|
"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
|
||||||
|
"movq (%0, %1, 4), %%mm1 \n\t" // line 4
|
||||||
|
"movq %%mm1, %%mm2 \n\t" // line 4
|
||||||
|
"psubusb %%mm0, %%mm1 \n\t"
|
||||||
|
"psubusb %%mm2, %%mm0 \n\t"
|
||||||
|
"por %%mm1, %%mm0 \n\t" // |l2 - l3|
|
||||||
|
"movq (%%ebx), %%mm3 \n\t" // line 5
|
||||||
|
"movq (%%ebx, %1), %%mm4 \n\t" // line 6
|
||||||
|
"movq %%mm3, %%mm5 \n\t" // line 5
|
||||||
|
"psubusb %%mm4, %%mm3 \n\t"
|
||||||
|
"psubusb %%mm5, %%mm4 \n\t"
|
||||||
|
"por %%mm4, %%mm3 \n\t" // |l5 - l6|
|
||||||
|
PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
|
||||||
|
"movq %%mm2, %%mm1 \n\t" // line 4
|
||||||
|
"psubusb %%mm5, %%mm2 \n\t"
|
||||||
|
"movq %%mm2, %%mm4 \n\t"
|
||||||
|
"pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
|
||||||
|
"psubusb %%mm1, %%mm5 \n\t"
|
||||||
|
"por %%mm5, %%mm4 \n\t" // |l4 - l5|
|
||||||
|
"psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
|
||||||
|
"movq %%mm4, %%mm3 \n\t" // d
|
||||||
|
"psubusb pQPb, %%mm4 \n\t"
|
||||||
|
"pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
|
||||||
|
"pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
|
||||||
|
|
||||||
|
PAVGB(%%mm7, %%mm3) // d/2
|
||||||
|
|
||||||
|
"movq (%0, %1, 4), %%mm0 \n\t" // line 4
|
||||||
|
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
|
||||||
|
"psubusb %%mm3, %%mm0 \n\t"
|
||||||
|
"pxor %%mm2, %%mm0 \n\t"
|
||||||
|
"movq %%mm0, (%0, %1, 4) \n\t" // line 4
|
||||||
|
|
||||||
|
"movq (%%ebx), %%mm0 \n\t" // line 5
|
||||||
|
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
|
||||||
|
"paddusb %%mm3, %%mm0 \n\t"
|
||||||
|
"pxor %%mm2, %%mm0 \n\t"
|
||||||
|
"movq %%mm0, (%%ebx) \n\t" // line 5
|
||||||
|
|
||||||
|
PAVGB(%%mm7, %%mm3) // d/4
|
||||||
|
|
||||||
|
"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
|
||||||
|
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
|
||||||
|
"psubusb %%mm3, %%mm0 \n\t"
|
||||||
|
"pxor %%mm2, %%mm0 \n\t"
|
||||||
|
"movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
|
||||||
|
|
||||||
|
"movq (%%ebx, %1), %%mm0 \n\t" // line 6
|
||||||
|
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
|
||||||
|
"paddusb %%mm3, %%mm0 \n\t"
|
||||||
|
"pxor %%mm2, %%mm0 \n\t"
|
||||||
|
"movq %%mm0, (%%ebx, %1) \n\t" // line 6
|
||||||
|
|
||||||
|
PAVGB(%%mm7, %%mm3) // d/8
|
||||||
|
|
||||||
|
"movq (%%eax, %1), %%mm0 \n\t" // line 2
|
||||||
|
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
|
||||||
|
"psubusb %%mm3, %%mm0 \n\t"
|
||||||
|
"pxor %%mm2, %%mm0 \n\t"
|
||||||
|
"movq %%mm0, (%%eax, %1) \n\t" // line 2
|
||||||
|
|
||||||
|
"movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
|
||||||
|
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
|
||||||
|
"paddusb %%mm3, %%mm0 \n\t"
|
||||||
|
"pxor %%mm2, %%mm0 \n\t"
|
||||||
|
"movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
|
||||||
|
|
||||||
:
|
:
|
||||||
: "r" (src), "r" (stride)
|
: "r" (src), "r" (stride)
|
||||||
: "%eax", "%ebx"
|
: "%eax", "%ebx"
|
||||||
);
|
);
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
const int l1= stride;
|
||||||
|
const int l2= stride + l1;
|
||||||
|
const int l3= stride + l2;
|
||||||
|
const int l4= stride + l3;
|
||||||
|
const int l5= stride + l4;
|
||||||
|
const int l6= stride + l5;
|
||||||
|
const int l7= stride + l6;
|
||||||
|
const int l8= stride + l7;
|
||||||
|
const int l9= stride + l8;
|
||||||
|
int x;
|
||||||
|
for(x=0; x<BLOCK_SIZE; x++)
|
||||||
|
{
|
||||||
|
int a= src[l3] - src[l4];
|
||||||
|
int b= src[l4] - src[l5];
|
||||||
|
int c= src[l6] - src[l7];
|
||||||
|
|
||||||
|
int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
|
||||||
|
|
||||||
|
if(d < QP)
|
||||||
|
{
|
||||||
|
int v = d * SIGN(-b);
|
||||||
|
|
||||||
|
src[l2] +=v/8;
|
||||||
|
src[l3] +=v/4;
|
||||||
|
src[l4] +=v/2;
|
||||||
|
src[l5] -=v/2;
|
||||||
|
src[l6] -=v/4;
|
||||||
|
src[l7] -=v/8;
|
||||||
|
|
||||||
|
}
|
||||||
|
src++;
|
||||||
|
}
|
||||||
|
/*
|
||||||
const int l1= stride;
|
const int l1= stride;
|
||||||
const int l2= stride + l1;
|
const int l2= stride + l1;
|
||||||
const int l3= stride + l2;
|
const int l3= stride + l2;
|
||||||
@ -658,7 +785,7 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
|
|||||||
}
|
}
|
||||||
src++;
|
src++;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -908,8 +1035,8 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
|
|||||||
const int l7= stride + l6;
|
const int l7= stride + l6;
|
||||||
const int l8= stride + l7;
|
const int l8= stride + l7;
|
||||||
// const int l9= stride + l8;
|
// const int l9= stride + l8;
|
||||||
|
int x;
|
||||||
for(int x=0; x<BLOCK_SIZE; x++)
|
for(x=0; x<BLOCK_SIZE; x++)
|
||||||
{
|
{
|
||||||
const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
|
const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
|
||||||
if(ABS(middleEnergy) < 8*QP)
|
if(ABS(middleEnergy) < 8*QP)
|
||||||
@ -947,7 +1074,7 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
|
|||||||
/**
|
/**
|
||||||
* Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock.
|
* Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock.
|
||||||
*/
|
*/
|
||||||
static inline bool isHorizDCAndCopy2Temp(uint8_t src[], int stride)
|
static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
|
||||||
{
|
{
|
||||||
// src++;
|
// src++;
|
||||||
int numEq= 0;
|
int numEq= 0;
|
||||||
@ -1007,7 +1134,8 @@ asm volatile (
|
|||||||
// printf("%d\n", numEq);
|
// printf("%d\n", numEq);
|
||||||
numEq= (256 - (numEq & 0xFF)) &0xFF;
|
numEq= (256 - (numEq & 0xFF)) &0xFF;
|
||||||
#else
|
#else
|
||||||
for(int y=0; y<BLOCK_SIZE; y++)
|
int y;
|
||||||
|
for(y=0; y<BLOCK_SIZE; y++)
|
||||||
{
|
{
|
||||||
if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
|
if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
|
||||||
if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
|
if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
|
||||||
@ -1044,7 +1172,7 @@ asm volatile (
|
|||||||
return numEq > hFlatnessThreshold;
|
return numEq > hFlatnessThreshold;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline bool isHorizMinMaxOk(uint8_t src[], int stride, int QP)
|
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
|
||||||
{
|
{
|
||||||
#ifdef MMX_FIXME
|
#ifdef MMX_FIXME
|
||||||
FIXME
|
FIXME
|
||||||
@ -1071,9 +1199,9 @@ FIXME
|
|||||||
);
|
);
|
||||||
return isOk;
|
return isOk;
|
||||||
#else
|
#else
|
||||||
if(abs(src[0] - src[7]) > 2*QP) return false;
|
if(abs(src[0] - src[7]) > 2*QP) return 0;
|
||||||
|
|
||||||
return true;
|
return 1;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1173,7 +1301,8 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
|
|||||||
#else
|
#else
|
||||||
uint8_t *src= tempBlock;
|
uint8_t *src= tempBlock;
|
||||||
|
|
||||||
for(int y=0; y<BLOCK_SIZE; y++)
|
int y;
|
||||||
|
for(y=0; y<BLOCK_SIZE; y++)
|
||||||
{
|
{
|
||||||
dst[0] = src[0];
|
dst[0] = src[0];
|
||||||
dst[1] = src[1];
|
dst[1] = src[1];
|
||||||
@ -1375,7 +1504,8 @@ Implemented Exact 7-Tap
|
|||||||
|
|
||||||
#else
|
#else
|
||||||
uint8_t *temp= tempBlock;
|
uint8_t *temp= tempBlock;
|
||||||
for(int y=0; y<BLOCK_SIZE; y++)
|
int y;
|
||||||
|
for(y=0; y<BLOCK_SIZE; y++)
|
||||||
{
|
{
|
||||||
const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
|
const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
|
||||||
const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
|
const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
|
||||||
@ -1502,7 +1632,7 @@ void postprocess(unsigned char * src[], int src_stride,
|
|||||||
return;
|
return;
|
||||||
*/
|
*/
|
||||||
postProcess(src[0], src_stride, dst[0], dst_stride,
|
postProcess(src[0], src_stride, dst[0], dst_stride,
|
||||||
horizontal_size, vertical_size, QP_store, QP_stride, false, mode);
|
horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
|
||||||
|
|
||||||
horizontal_size >>= 1;
|
horizontal_size >>= 1;
|
||||||
vertical_size >>= 1;
|
vertical_size >>= 1;
|
||||||
@ -1512,9 +1642,9 @@ void postprocess(unsigned char * src[], int src_stride,
|
|||||||
if(1)
|
if(1)
|
||||||
{
|
{
|
||||||
postProcess(src[1], src_stride, dst[1], dst_stride,
|
postProcess(src[1], src_stride, dst[1], dst_stride,
|
||||||
horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4);
|
horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
|
||||||
postProcess(src[2], src_stride, dst[2], dst_stride,
|
postProcess(src[2], src_stride, dst[2], dst_stride,
|
||||||
horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4);
|
horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -1543,11 +1673,19 @@ int getModeForQuality(int quality){
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Copies a block from src to dst and fixes the blacklevel
|
* Copies a block from src to dst and fixes the blacklevel
|
||||||
|
* numLines must be a multiple of 4
|
||||||
|
* levelFix == 0 -> dont touch the brighness & contrast
|
||||||
*/
|
*/
|
||||||
static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride)
|
static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
|
||||||
|
int numLines, int levelFix)
|
||||||
{
|
{
|
||||||
|
int i;
|
||||||
|
if(levelFix)
|
||||||
|
{
|
||||||
#ifdef HAVE_MMX
|
#ifdef HAVE_MMX
|
||||||
asm volatile(
|
asm volatile(
|
||||||
|
"movl %4, %%eax \n\t"
|
||||||
|
"movl %%eax, temp0\n\t"
|
||||||
"pushl %0 \n\t"
|
"pushl %0 \n\t"
|
||||||
"pushl %1 \n\t"
|
"pushl %1 \n\t"
|
||||||
"leal (%2,%2), %%eax \n\t"
|
"leal (%2,%2), %%eax \n\t"
|
||||||
@ -1555,14 +1693,6 @@ static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int sr
|
|||||||
"movq packedYOffset, %%mm2 \n\t"
|
"movq packedYOffset, %%mm2 \n\t"
|
||||||
"movq packedYScale, %%mm3 \n\t"
|
"movq packedYScale, %%mm3 \n\t"
|
||||||
|
|
||||||
#define SIMPLE_CPY \
|
|
||||||
"movq (%0), %%mm0 \n\t"\
|
|
||||||
"movq (%0,%2), %%mm1 \n\t"\
|
|
||||||
"psubusb %%mm2, %%mm0 \n\t"\
|
|
||||||
"psubusb %%mm2, %%mm1 \n\t"\
|
|
||||||
"movq %%mm0, (%1) \n\t"\
|
|
||||||
"movq %%mm1, (%1, %3) \n\t"\
|
|
||||||
|
|
||||||
#define SCALED_CPY \
|
#define SCALED_CPY \
|
||||||
"movq (%0), %%mm0 \n\t"\
|
"movq (%0), %%mm0 \n\t"\
|
||||||
"movq (%0,%2), %%mm1 \n\t"\
|
"movq (%0,%2), %%mm1 \n\t"\
|
||||||
@ -1585,33 +1715,75 @@ static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int sr
|
|||||||
"packuswb %%mm5, %%mm4 \n\t"\
|
"packuswb %%mm5, %%mm4 \n\t"\
|
||||||
"movq %%mm4, (%1, %3) \n\t"\
|
"movq %%mm4, (%1, %3) \n\t"\
|
||||||
|
|
||||||
|
"1: \n\t"
|
||||||
|
SCALED_CPY
|
||||||
|
"addl %%eax, %0 \n\t"
|
||||||
|
"addl %%ebx, %1 \n\t"
|
||||||
|
SCALED_CPY
|
||||||
|
"addl %%eax, %0 \n\t"
|
||||||
|
"addl %%ebx, %1 \n\t"
|
||||||
|
"decl temp0 \n\t"
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
#define CPY SCALED_CPY
|
|
||||||
//#define CPY SIMPLE_CPY
|
|
||||||
// "prefetchnta 8(%0)\n\t"
|
|
||||||
CPY
|
|
||||||
"addl %%eax, %0 \n\t"
|
|
||||||
"addl %%ebx, %1 \n\t"
|
|
||||||
CPY
|
|
||||||
"addl %%eax, %0 \n\t"
|
|
||||||
"addl %%ebx, %1 \n\t"
|
|
||||||
CPY
|
|
||||||
"addl %%eax, %0 \n\t"
|
|
||||||
"addl %%ebx, %1 \n\t"
|
|
||||||
CPY
|
|
||||||
"popl %1 \n\t"
|
"popl %1 \n\t"
|
||||||
"popl %0 \n\t"
|
"popl %0 \n\t"
|
||||||
: : "r" (src),
|
: : "r" (src),
|
||||||
"r" (dst),
|
"r" (dst),
|
||||||
"r" (srcStride),
|
"r" (srcStride),
|
||||||
"r" (dstStride)
|
"r" (dstStride),
|
||||||
|
"m" (numLines>>2)
|
||||||
: "%eax", "%ebx"
|
: "%eax", "%ebx"
|
||||||
);
|
);
|
||||||
#else
|
#else
|
||||||
for(int i=0; i<BLOCK_SIZE; i++) // last 10x8 Block is copied allready so +2
|
for(i=0; i<numLines; i++)
|
||||||
memcpy( &(dst[dstStride*i]),
|
memcpy( &(dst[dstStride*i]),
|
||||||
&(src[srcStride*i]), BLOCK_SIZE);
|
&(src[srcStride*i]), BLOCK_SIZE);
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
#ifdef HAVE_MMX
|
||||||
|
asm volatile(
|
||||||
|
"movl %4, %%eax \n\t"
|
||||||
|
"movl %%eax, temp0\n\t"
|
||||||
|
"pushl %0 \n\t"
|
||||||
|
"pushl %1 \n\t"
|
||||||
|
"leal (%2,%2), %%eax \n\t"
|
||||||
|
"leal (%3,%3), %%ebx \n\t"
|
||||||
|
"movq packedYOffset, %%mm2 \n\t"
|
||||||
|
"movq packedYScale, %%mm3 \n\t"
|
||||||
|
|
||||||
|
#define SIMPLE_CPY \
|
||||||
|
"movq (%0), %%mm0 \n\t"\
|
||||||
|
"movq (%0,%2), %%mm1 \n\t"\
|
||||||
|
"movq %%mm0, (%1) \n\t"\
|
||||||
|
"movq %%mm1, (%1, %3) \n\t"\
|
||||||
|
|
||||||
|
"1: \n\t"
|
||||||
|
SIMPLE_CPY
|
||||||
|
"addl %%eax, %0 \n\t"
|
||||||
|
"addl %%ebx, %1 \n\t"
|
||||||
|
SIMPLE_CPY
|
||||||
|
"addl %%eax, %0 \n\t"
|
||||||
|
"addl %%ebx, %1 \n\t"
|
||||||
|
"decl temp0 \n\t"
|
||||||
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
|
"popl %1 \n\t"
|
||||||
|
"popl %0 \n\t"
|
||||||
|
: : "r" (src),
|
||||||
|
"r" (dst),
|
||||||
|
"r" (srcStride),
|
||||||
|
"r" (dstStride),
|
||||||
|
"m" (numLines>>2)
|
||||||
|
: "%eax", "%ebx"
|
||||||
|
);
|
||||||
|
#else
|
||||||
|
for(i=0; i<numLines; i++)
|
||||||
|
memcpy( &(dst[dstStride*i]),
|
||||||
|
&(src[srcStride*i]), BLOCK_SIZE);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -1619,33 +1791,50 @@ CPY
|
|||||||
* Filters array of bytes (Y or U or V values)
|
* Filters array of bytes (Y or U or V values)
|
||||||
*/
|
*/
|
||||||
void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
|
void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
|
||||||
QP_STORE_T QPs[], int QPStride, bool isColor, int mode)
|
QP_STORE_T QPs[], int QPStride, int isColor, int mode)
|
||||||
{
|
{
|
||||||
|
int x,y;
|
||||||
|
/* we need 64bit here otherwise we´ll going to have a problem
|
||||||
|
after watching a black picture for 5 hours*/
|
||||||
|
static uint64_t *yHistogram= NULL;
|
||||||
|
int black=0, white=255; // blackest black and whitest white in the picture
|
||||||
|
|
||||||
#ifdef TIMEING
|
#ifdef TIMEING
|
||||||
long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
|
long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
|
||||||
sumTime= rdtsc();
|
sumTime= rdtsc();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* we need 64bit here otherwise we´ll going to have a problem
|
|
||||||
after watching a black picture for 5 hours*/
|
|
||||||
static uint64_t *yHistogram= NULL;
|
|
||||||
if(!yHistogram)
|
if(!yHistogram)
|
||||||
{
|
{
|
||||||
yHistogram= new uint64_t[256];
|
int i;
|
||||||
for(int i=0; i<256; i++) yHistogram[i]= width*height/64/256;
|
yHistogram= (uint64_t*)malloc(8*256);
|
||||||
|
for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
|
||||||
}
|
}
|
||||||
|
|
||||||
int black=0, white=255; // blackest black and whitest white in the picture
|
|
||||||
if(!isColor)
|
if(!isColor)
|
||||||
{
|
{
|
||||||
uint64_t sum= 0;
|
uint64_t sum= 0;
|
||||||
for(int i=0; i<256; i++)
|
int i;
|
||||||
|
static int framenum= -1;
|
||||||
|
uint64_t maxClipped;
|
||||||
|
uint64_t clipped;
|
||||||
|
double scale;
|
||||||
|
|
||||||
|
framenum++;
|
||||||
|
if(framenum == 1) yHistogram[0]= width*height/64*15/256;
|
||||||
|
|
||||||
|
for(i=0; i<256; i++)
|
||||||
|
{
|
||||||
sum+= yHistogram[i];
|
sum+= yHistogram[i];
|
||||||
|
// printf("%d ", yHistogram[i]);
|
||||||
|
}
|
||||||
|
// printf("\n\n");
|
||||||
|
|
||||||
uint64_t maxClipped= (uint64_t)(sum * maxClippedThreshold);
|
/* we allways get a completly black picture first */
|
||||||
|
|
||||||
uint64_t clipped= sum;
|
maxClipped= (uint64_t)(sum * maxClippedThreshold);
|
||||||
|
|
||||||
|
clipped= sum;
|
||||||
for(black=255; black>0; black--)
|
for(black=255; black>0; black--)
|
||||||
{
|
{
|
||||||
if(clipped < maxClipped) break;
|
if(clipped < maxClipped) break;
|
||||||
@ -1665,9 +1854,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
packedYOffset|= packedYOffset<<16;
|
packedYOffset|= packedYOffset<<16;
|
||||||
packedYOffset|= packedYOffset<<8;
|
packedYOffset|= packedYOffset<<8;
|
||||||
|
|
||||||
double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
|
scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
|
||||||
|
|
||||||
packedYScale= uint16_t(scale*256.0 + 0.5);
|
packedYScale= (uint16_t)(scale*256.0 + 0.5);
|
||||||
packedYScale|= packedYScale<<32;
|
packedYScale|= packedYScale<<32;
|
||||||
packedYScale|= packedYScale<<16;
|
packedYScale|= packedYScale<<16;
|
||||||
}
|
}
|
||||||
@ -1677,10 +1866,10 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
packedYOffset= 0;
|
packedYOffset= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int x=0; x<width; x+=BLOCK_SIZE)
|
for(x=0; x<width; x+=BLOCK_SIZE)
|
||||||
blockCopy(dst + x, dstStride, src + x, srcStride);
|
blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
|
||||||
|
|
||||||
for(int y=0; y<height; y+=BLOCK_SIZE)
|
for(y=0; y<height; y+=BLOCK_SIZE)
|
||||||
{
|
{
|
||||||
//1% speedup if these are here instead of the inner loop
|
//1% speedup if these are here instead of the inner loop
|
||||||
uint8_t *srcBlock= &(src[y*srcStride]);
|
uint8_t *srcBlock= &(src[y*srcStride]);
|
||||||
@ -1690,8 +1879,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
|
|
||||||
// finish 1 block before the next otherwise we´ll might have a problem
|
// finish 1 block before the next otherwise we´ll might have a problem
|
||||||
// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
|
// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
|
||||||
for(int x=0; x<width; x+=BLOCK_SIZE)
|
for(x=0; x<width; x+=BLOCK_SIZE)
|
||||||
{
|
{
|
||||||
|
const int stride= dstStride;
|
||||||
int QP= isColor ?
|
int QP= isColor ?
|
||||||
QPs[(y>>3)*QPStride + (x>>3)]:
|
QPs[(y>>3)*QPStride + (x>>3)]:
|
||||||
(QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8;
|
(QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8;
|
||||||
@ -1707,7 +1897,6 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
const int stride= dstStride;
|
|
||||||
if(y + 12 < height)
|
if(y + 12 < height)
|
||||||
{
|
{
|
||||||
#ifdef MORE_TIMEING
|
#ifdef MORE_TIMEING
|
||||||
@ -1730,7 +1919,7 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
if(!isColor) yHistogram[ srcBlock[0] ]++;
|
if(!isColor) yHistogram[ srcBlock[0] ]++;
|
||||||
|
|
||||||
blockCopy(vertBlock + dstStride*2, dstStride,
|
blockCopy(vertBlock + dstStride*2, dstStride,
|
||||||
vertSrcBlock + srcStride*2, srcStride);
|
vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
|
||||||
|
|
||||||
|
|
||||||
#ifdef MORE_TIMEING
|
#ifdef MORE_TIMEING
|
||||||
@ -1742,7 +1931,7 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
{
|
{
|
||||||
if(mode & RK_FILTER)
|
if(mode & RK_FILTER)
|
||||||
vertRKFilter(vertBlock, stride, QP);
|
vertRKFilter(vertBlock, stride, QP);
|
||||||
else if(0)
|
else if(mode & X1_FILTER)
|
||||||
vertX1Filter(vertBlock, stride, QP);
|
vertX1Filter(vertBlock, stride, QP);
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -1762,12 +1951,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
blockCopy(vertBlock + dstStride*1, dstStride,
|
||||||
for(int i=2; i<BLOCK_SIZE/2+1; i++) // last 10x8 Block is copied allready so +2
|
vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
|
||||||
memcpy( &(vertBlock[dstStride*i]),
|
|
||||||
&(vertSrcBlock[srcStride*i]), BLOCK_SIZE);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
if(x - 8 >= 0 && x<width)
|
if(x - 8 >= 0 && x<width)
|
||||||
{
|
{
|
||||||
@ -1813,8 +1999,8 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
|
|||||||
sumTime= rdtsc() - sumTime;
|
sumTime= rdtsc() - sumTime;
|
||||||
if(!isColor)
|
if(!isColor)
|
||||||
printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
|
printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
|
||||||
int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000),
|
(int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
|
||||||
int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000)
|
(int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
|
||||||
, black, white);
|
, black, white);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user