You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-08-10 06:10:52 +02:00
VP8: 30% faster idct_mb
Take shortcuts based on statistically common situations. Add a 4-at-a-time idct_dc function (MMX and SSE2 versions), since rows of four DC-only DCT blocks are common. TODO: should this be tied more directly into the MB mode, given that the DC-level transform is only used for non-splitmv blocks? Originally committed as revision 24452 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
@@ -1186,13 +1186,16 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
|
static void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
|
||||||
VP8Macroblock *mb)
|
|
||||||
{
|
{
|
||||||
int x, y, nnz;
|
int x, y, ch, nnz;
|
||||||
|
|
||||||
if (mb->mode != MODE_I4x4)
|
if (mb->mode != MODE_I4x4) {
|
||||||
|
uint8_t *y_dst = dst[0];
|
||||||
for (y = 0; y < 4; y++) {
|
for (y = 0; y < 4; y++) {
|
||||||
|
uint32_t nnz = AV_RN32A(s->non_zero_count_cache[y]);
|
||||||
|
if (nnz) {
|
||||||
|
if (nnz&~0x01010101) {
|
||||||
for (x = 0; x < 4; x++) {
|
for (x = 0; x < 4; x++) {
|
||||||
nnz = s->non_zero_count_cache[y][x];
|
nnz = s->non_zero_count_cache[y][x];
|
||||||
if (nnz) {
|
if (nnz) {
|
||||||
@@ -1202,29 +1205,30 @@ static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_ds
|
|||||||
s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
|
s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
s->vp8dsp.vp8_idct_dc_add4(y_dst, s->block[y], s->linesize);
|
||||||
|
}
|
||||||
|
}
|
||||||
y_dst += 4*s->linesize;
|
y_dst += 4*s->linesize;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (ch = 0; ch < 2; ch++) {
|
||||||
|
if (AV_RN32A(s->non_zero_count_cache[4+ch])) {
|
||||||
|
uint8_t *ch_dst = dst[1+ch];
|
||||||
for (y = 0; y < 2; y++) {
|
for (y = 0; y < 2; y++) {
|
||||||
for (x = 0; x < 2; x++) {
|
for (x = 0; x < 2; x++) {
|
||||||
nnz = s->non_zero_count_cache[4][(y<<1)+x];
|
nnz = s->non_zero_count_cache[4+ch][(y<<1)+x];
|
||||||
if (nnz) {
|
if (nnz) {
|
||||||
if (nnz == 1)
|
if (nnz == 1)
|
||||||
s->vp8dsp.vp8_idct_dc_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
|
s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
|
||||||
else
|
else
|
||||||
s->vp8dsp.vp8_idct_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
|
s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
nnz = s->non_zero_count_cache[5][(y<<1)+x];
|
ch_dst += 4*s->uvlinesize;
|
||||||
if (nnz) {
|
|
||||||
if (nnz == 1)
|
|
||||||
s->vp8dsp.vp8_idct_dc_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
|
|
||||||
else
|
|
||||||
s->vp8dsp.vp8_idct_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
u_dst += 4*s->uvlinesize;
|
|
||||||
v_dst += 4*s->uvlinesize;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1511,7 +1515,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
|
|||||||
prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
|
prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
|
||||||
|
|
||||||
if (!mb->skip) {
|
if (!mb->skip) {
|
||||||
idct_mb(s, dst[0], dst[1], dst[2], mb);
|
idct_mb(s, dst, mb);
|
||||||
} else {
|
} else {
|
||||||
AV_ZERO64(s->left_nnz);
|
AV_ZERO64(s->left_nnz);
|
||||||
AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned
|
AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned
|
||||||
|
@@ -109,6 +109,25 @@ static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], int stride)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void vp8_idct_dc_add4_c(uint8_t *dst, DCTELEM block[4][16], int stride)
|
||||||
|
{
|
||||||
|
int i, j;
|
||||||
|
for (j = 0; j < 4; j++) {
|
||||||
|
uint8_t *pix = dst+j*4;
|
||||||
|
int dc = (block[j][0] + 4) >> 3;
|
||||||
|
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc;
|
||||||
|
block[j][0] = 0;
|
||||||
|
if (!dc)
|
||||||
|
continue;
|
||||||
|
for (i = 0; i < 4; i++) {
|
||||||
|
pix[0] = cm[pix[0]];
|
||||||
|
pix[1] = cm[pix[1]];
|
||||||
|
pix[2] = cm[pix[2]];
|
||||||
|
pix[3] = cm[pix[3]];
|
||||||
|
pix += stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// because I like only having two parameters to pass functions...
|
// because I like only having two parameters to pass functions...
|
||||||
#define LOAD_PIXELS\
|
#define LOAD_PIXELS\
|
||||||
@@ -463,6 +482,7 @@ av_cold void ff_vp8dsp_init(VP8DSPContext *dsp)
|
|||||||
dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
|
dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
|
||||||
dsp->vp8_idct_add = vp8_idct_add_c;
|
dsp->vp8_idct_add = vp8_idct_add_c;
|
||||||
dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
|
dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
|
||||||
|
dsp->vp8_idct_dc_add4 = vp8_idct_dc_add4_c;
|
||||||
|
|
||||||
dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c;
|
dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c;
|
||||||
dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c;
|
dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c;
|
||||||
|
@@ -33,6 +33,7 @@ typedef struct VP8DSPContext {
|
|||||||
void (*vp8_luma_dc_wht)(DCTELEM block[4][4][16], DCTELEM dc[16]);
|
void (*vp8_luma_dc_wht)(DCTELEM block[4][4][16], DCTELEM dc[16]);
|
||||||
void (*vp8_idct_add)(uint8_t *dst, DCTELEM block[16], int stride);
|
void (*vp8_idct_add)(uint8_t *dst, DCTELEM block[16], int stride);
|
||||||
void (*vp8_idct_dc_add)(uint8_t *dst, DCTELEM block[16], int stride);
|
void (*vp8_idct_dc_add)(uint8_t *dst, DCTELEM block[16], int stride);
|
||||||
|
void (*vp8_idct_dc_add4)(uint8_t *dst, DCTELEM block[4][16], int stride);
|
||||||
|
|
||||||
// loop filter applied to edges between macroblocks
|
// loop filter applied to edges between macroblocks
|
||||||
void (*vp8_v_loop_filter16y)(uint8_t *dst, int stride,
|
void (*vp8_v_loop_filter16y)(uint8_t *dst, int stride,
|
||||||
|
@@ -220,6 +220,8 @@ HVBILIN(ssse3, 8, 16, 16)
|
|||||||
|
|
||||||
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
|
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
|
||||||
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
|
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
|
||||||
|
extern void ff_vp8_idct_dc_add4_mmx(uint8_t *dst, DCTELEM block[4][16], int stride);
|
||||||
|
extern void ff_vp8_idct_dc_add4_sse2(uint8_t *dst, DCTELEM block[4][16], int stride);
|
||||||
extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
|
extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
|
||||||
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
|
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
|
||||||
extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
|
extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
|
||||||
@@ -283,6 +285,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
|
|||||||
#if HAVE_YASM
|
#if HAVE_YASM
|
||||||
if (mm_flags & FF_MM_MMX) {
|
if (mm_flags & FF_MM_MMX) {
|
||||||
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
|
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
|
||||||
|
c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_mmx;
|
||||||
c->vp8_idct_add = ff_vp8_idct_add_mmx;
|
c->vp8_idct_add = ff_vp8_idct_add_mmx;
|
||||||
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
|
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
|
||||||
c->put_vp8_epel_pixels_tab[0][0][0] =
|
c->put_vp8_epel_pixels_tab[0][0][0] =
|
||||||
@@ -351,6 +354,8 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (mm_flags & FF_MM_SSE2) {
|
if (mm_flags & FF_MM_SSE2) {
|
||||||
|
c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_sse2;
|
||||||
|
|
||||||
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
|
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
|
||||||
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
|
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
|
||||||
|
|
||||||
|
@@ -900,75 +900,148 @@ cglobal put_vp8_pixels16_sse, 5,5,2
|
|||||||
REP_RET
|
REP_RET
|
||||||
|
|
||||||
;-----------------------------------------------------------------------------
|
;-----------------------------------------------------------------------------
|
||||||
; IDCT functions:
|
|
||||||
;
|
|
||||||
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
|
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
|
||||||
;-----------------------------------------------------------------------------
|
;-----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
%macro ADD_DC 4
|
||||||
|
%4 m2, [r0+%3]
|
||||||
|
%4 m3, [r0+r2+%3]
|
||||||
|
%4 m4, [r1+%3]
|
||||||
|
%4 m5, [r1+r2+%3]
|
||||||
|
paddusb m2, %1
|
||||||
|
paddusb m3, %1
|
||||||
|
paddusb m4, %1
|
||||||
|
paddusb m5, %1
|
||||||
|
psubusb m2, %2
|
||||||
|
psubusb m3, %2
|
||||||
|
psubusb m4, %2
|
||||||
|
psubusb m5, %2
|
||||||
|
%4 [r0+%3], m2
|
||||||
|
%4 [r0+r2+%3], m3
|
||||||
|
%4 [r1+%3], m4
|
||||||
|
%4 [r1+r2+%3], m5
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
INIT_MMX
|
||||||
cglobal vp8_idct_dc_add_mmx, 3, 3
|
cglobal vp8_idct_dc_add_mmx, 3, 3
|
||||||
; load data
|
; load data
|
||||||
movd mm0, [r1]
|
movd m0, [r1]
|
||||||
|
|
||||||
; calculate DC
|
; calculate DC
|
||||||
paddw mm0, [pw_4]
|
paddw m0, [pw_4]
|
||||||
pxor mm1, mm1
|
pxor m1, m1
|
||||||
psraw mm0, 3
|
psraw m0, 3
|
||||||
movd [r1], mm1
|
movd [r1], m1
|
||||||
psubw mm1, mm0
|
psubw m1, m0
|
||||||
packuswb mm0, mm0
|
packuswb m0, m0
|
||||||
packuswb mm1, mm1
|
packuswb m1, m1
|
||||||
punpcklbw mm0, mm0
|
punpcklbw m0, m0
|
||||||
punpcklbw mm1, mm1
|
punpcklbw m1, m1
|
||||||
punpcklwd mm0, mm0
|
punpcklwd m0, m0
|
||||||
punpcklwd mm1, mm1
|
punpcklwd m1, m1
|
||||||
|
|
||||||
; add DC
|
; add DC
|
||||||
lea r1, [r0+r2*2]
|
lea r1, [r0+r2*2]
|
||||||
movd mm2, [r0]
|
ADD_DC m0, m1, 0, movh
|
||||||
movd mm3, [r0+r2]
|
|
||||||
movd mm4, [r1]
|
|
||||||
movd mm5, [r1+r2]
|
|
||||||
paddusb mm2, mm0
|
|
||||||
paddusb mm3, mm0
|
|
||||||
paddusb mm4, mm0
|
|
||||||
paddusb mm5, mm0
|
|
||||||
psubusb mm2, mm1
|
|
||||||
psubusb mm3, mm1
|
|
||||||
psubusb mm4, mm1
|
|
||||||
psubusb mm5, mm1
|
|
||||||
movd [r0], mm2
|
|
||||||
movd [r0+r2], mm3
|
|
||||||
movd [r1], mm4
|
|
||||||
movd [r1+r2], mm5
|
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
INIT_XMM
|
||||||
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
|
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
|
||||||
; load data
|
; load data
|
||||||
movd xmm0, [r1]
|
movd m0, [r1]
|
||||||
pxor xmm1, xmm1
|
pxor m1, m1
|
||||||
|
|
||||||
; calculate DC
|
; calculate DC
|
||||||
paddw xmm0, [pw_4]
|
paddw m0, [pw_4]
|
||||||
movd [r1], xmm1
|
movd [r1], m1
|
||||||
lea r1, [r0+r2*2]
|
lea r1, [r0+r2*2]
|
||||||
movd xmm2, [r0]
|
movd m2, [r0]
|
||||||
movd xmm3, [r0+r2]
|
movd m3, [r0+r2]
|
||||||
movd xmm4, [r1]
|
movd m4, [r1]
|
||||||
movd xmm5, [r1+r2]
|
movd m5, [r1+r2]
|
||||||
psraw xmm0, 3
|
psraw m0, 3
|
||||||
pshuflw xmm0, xmm0, 0
|
pshuflw m0, m0, 0
|
||||||
punpcklqdq xmm0, xmm0
|
punpcklqdq m0, m0
|
||||||
punpckldq xmm2, xmm3
|
punpckldq m2, m3
|
||||||
punpckldq xmm4, xmm5
|
punpckldq m4, m5
|
||||||
punpcklbw xmm2, xmm1
|
punpcklbw m2, m1
|
||||||
punpcklbw xmm4, xmm1
|
punpcklbw m4, m1
|
||||||
paddw xmm2, xmm0
|
paddw m2, m0
|
||||||
paddw xmm4, xmm0
|
paddw m4, m0
|
||||||
packuswb xmm2, xmm4
|
packuswb m2, m4
|
||||||
movd [r0], xmm2
|
movd [r0], m2
|
||||||
pextrd [r0+r2], xmm2, 1
|
pextrd [r0+r2], m2, 1
|
||||||
pextrd [r1], xmm2, 2
|
pextrd [r1], m2, 2
|
||||||
pextrd [r1+r2], xmm2, 3
|
pextrd [r1+r2], m2, 3
|
||||||
|
RET
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; void vp8_idct_dc_add4_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
INIT_MMX
|
||||||
|
cglobal vp8_idct_dc_add4_mmx, 3, 3
|
||||||
|
; load data
|
||||||
|
movd m0, [r1+32*0] ; A
|
||||||
|
movd m1, [r1+32*2] ; C
|
||||||
|
punpcklwd m0, [r1+32*1] ; A B
|
||||||
|
punpcklwd m1, [r1+32*3] ; C D
|
||||||
|
punpckldq m0, m1 ; A B C D
|
||||||
|
pxor m6, m6
|
||||||
|
|
||||||
|
; calculate DC
|
||||||
|
paddw m0, [pw_4]
|
||||||
|
movd [r1+32*0], m6
|
||||||
|
movd [r1+32*1], m6
|
||||||
|
movd [r1+32*2], m6
|
||||||
|
movd [r1+32*3], m6
|
||||||
|
psraw m0, 3
|
||||||
|
psubw m6, m0
|
||||||
|
packuswb m0, m0
|
||||||
|
packuswb m6, m6
|
||||||
|
punpcklbw m0, m0 ; AABBCCDD
|
||||||
|
punpcklbw m6, m6 ; AABBCCDD
|
||||||
|
movq m1, m0
|
||||||
|
movq m7, m6
|
||||||
|
punpcklbw m0, m0 ; AAAABBBB
|
||||||
|
punpckhbw m1, m1 ; CCCCDDDD
|
||||||
|
punpcklbw m6, m6 ; AAAABBBB
|
||||||
|
punpckhbw m7, m7 ; CCCCDDDD
|
||||||
|
|
||||||
|
; add DC
|
||||||
|
lea r1, [r0+r2*2]
|
||||||
|
ADD_DC m0, m6, 0, mova
|
||||||
|
ADD_DC m1, m7, 8, mova
|
||||||
|
RET
|
||||||
|
|
||||||
|
INIT_XMM
|
||||||
|
cglobal vp8_idct_dc_add4_sse2, 3, 3
|
||||||
|
; load data
|
||||||
|
movd m0, [r1+32*0] ; A
|
||||||
|
movd m1, [r1+32*2] ; C
|
||||||
|
punpcklwd m0, [r1+32*1] ; A B
|
||||||
|
punpcklwd m1, [r1+32*3] ; C D
|
||||||
|
punpckldq m0, m1 ; A B C D
|
||||||
|
pxor m1, m1
|
||||||
|
|
||||||
|
; calculate DC
|
||||||
|
paddw m0, [pw_4]
|
||||||
|
movd [r1+32*0], m1
|
||||||
|
movd [r1+32*1], m1
|
||||||
|
movd [r1+32*2], m1
|
||||||
|
movd [r1+32*3], m1
|
||||||
|
psraw m0, 3
|
||||||
|
psubw m1, m0
|
||||||
|
packuswb m0, m0
|
||||||
|
packuswb m1, m1
|
||||||
|
punpcklbw m0, m0
|
||||||
|
punpcklbw m1, m1
|
||||||
|
punpcklbw m0, m0
|
||||||
|
punpcklbw m1, m1
|
||||||
|
|
||||||
|
; add DC
|
||||||
|
lea r1, [r0+r2*2]
|
||||||
|
ADD_DC m0, m1, 0, mova
|
||||||
RET
|
RET
|
||||||
|
|
||||||
;-----------------------------------------------------------------------------
|
;-----------------------------------------------------------------------------
|
||||||
|
Reference in New Issue
Block a user