Mirror of https://github.com/FFmpeg/FFmpeg.git (synced 2025-02-09 14:14:39 +02:00)
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
~0.3% faster overall.

Originally committed as revision 24448 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent: b74f70d646
commit: c25c776708
@@ -835,8 +835,6 @@ static void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb
|
|||||||
int nnz_pred, nnz, nnz_total = 0;
|
int nnz_pred, nnz, nnz_total = 0;
|
||||||
int segment = s->segment;
|
int segment = s->segment;
|
||||||
|
|
||||||
s->dsp.clear_blocks((DCTELEM *)s->block);
|
|
||||||
|
|
||||||
if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
|
if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
|
||||||
AV_ZERO128(dc);
|
AV_ZERO128(dc);
|
||||||
AV_ZERO128(dc+8);
|
AV_ZERO128(dc+8);
|
||||||
|
@@ -69,6 +69,10 @@ static void vp8_idct_add_c(uint8_t *dst, DCTELEM block[16], int stride)
|
|||||||
t1 = block[0*4+i] - block[2*4+i];
|
t1 = block[0*4+i] - block[2*4+i];
|
||||||
t2 = MUL_35468(block[1*4+i]) - MUL_20091(block[3*4+i]);
|
t2 = MUL_35468(block[1*4+i]) - MUL_20091(block[3*4+i]);
|
||||||
t3 = MUL_20091(block[1*4+i]) + MUL_35468(block[3*4+i]);
|
t3 = MUL_20091(block[1*4+i]) + MUL_35468(block[3*4+i]);
|
||||||
|
block[0*4+i] = 0;
|
||||||
|
block[1*4+i] = 0;
|
||||||
|
block[2*4+i] = 0;
|
||||||
|
block[3*4+i] = 0;
|
||||||
|
|
||||||
tmp[i*4+0] = t0 + t3;
|
tmp[i*4+0] = t0 + t3;
|
||||||
tmp[i*4+1] = t1 + t2;
|
tmp[i*4+1] = t1 + t2;
|
||||||
@@ -94,6 +98,7 @@ static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], int stride)
|
|||||||
{
|
{
|
||||||
int i, dc = (block[0] + 4) >> 3;
|
int i, dc = (block[0] + 4) >> 3;
|
||||||
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc;
|
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc;
|
||||||
|
block[0] = 0;
|
||||||
|
|
||||||
for (i = 0; i < 4; i++) {
|
for (i = 0; i < 4; i++) {
|
||||||
dst[0] = cm[dst[0]];
|
dst[0] = cm[dst[0]];
|
||||||
|
@@ -222,6 +222,7 @@ extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
|
|||||||
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
|
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
|
||||||
extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
|
extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
|
||||||
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
|
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
|
||||||
|
extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
|
||||||
|
|
||||||
#define DECLARE_LOOP_FILTER(NAME)\
|
#define DECLARE_LOOP_FILTER(NAME)\
|
||||||
extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\
|
extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\
|
||||||
@@ -328,6 +329,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (mm_flags & FF_MM_SSE) {
|
if (mm_flags & FF_MM_SSE) {
|
||||||
|
c->vp8_idct_add = ff_vp8_idct_add_sse;
|
||||||
c->put_vp8_epel_pixels_tab[0][0][0] =
|
c->put_vp8_epel_pixels_tab[0][0][0] =
|
||||||
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
|
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
|
||||||
}
|
}
|
||||||
|
@@ -913,6 +913,7 @@ cglobal vp8_idct_dc_add_mmx, 3, 3
|
|||||||
paddw mm0, [pw_4]
|
paddw mm0, [pw_4]
|
||||||
pxor mm1, mm1
|
pxor mm1, mm1
|
||||||
psraw mm0, 3
|
psraw mm0, 3
|
||||||
|
movd [r1], mm1
|
||||||
psubw mm1, mm0
|
psubw mm1, mm0
|
||||||
packuswb mm0, mm0
|
packuswb mm0, mm0
|
||||||
packuswb mm1, mm1
|
packuswb mm1, mm1
|
||||||
@@ -944,11 +945,12 @@ cglobal vp8_idct_dc_add_mmx, 3, 3
|
|||||||
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
|
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
|
||||||
; load data
|
; load data
|
||||||
movd xmm0, [r1]
|
movd xmm0, [r1]
|
||||||
lea r1, [r0+r2*2]
|
|
||||||
pxor xmm1, xmm1
|
pxor xmm1, xmm1
|
||||||
|
|
||||||
; calculate DC
|
; calculate DC
|
||||||
paddw xmm0, [pw_4]
|
paddw xmm0, [pw_4]
|
||||||
|
movd [r1], xmm1
|
||||||
|
lea r1, [r0+r2*2]
|
||||||
movd xmm2, [r0]
|
movd xmm2, [r0]
|
||||||
movd xmm3, [r0+r2]
|
movd xmm3, [r0+r2]
|
||||||
movd xmm4, [r1]
|
movd xmm4, [r1]
|
||||||
@@ -1005,14 +1007,26 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
|
|||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
INIT_MMX
|
INIT_MMX
|
||||||
cglobal vp8_idct_add_mmx, 3, 3
|
%macro VP8_IDCT_ADD 1
|
||||||
|
cglobal vp8_idct_add_%1, 3, 3
|
||||||
; load block data
|
; load block data
|
||||||
movq m0, [r1]
|
movq m0, [r1+ 0]
|
||||||
movq m1, [r1+8]
|
movq m1, [r1+ 8]
|
||||||
movq m2, [r1+16]
|
movq m2, [r1+16]
|
||||||
movq m3, [r1+24]
|
movq m3, [r1+24]
|
||||||
movq m6, [pw_20091]
|
movq m6, [pw_20091]
|
||||||
movq m7, [pw_17734]
|
movq m7, [pw_17734]
|
||||||
|
%ifidn %1, sse
|
||||||
|
xorps xmm0, xmm0
|
||||||
|
movaps [r1+ 0], xmm0
|
||||||
|
movaps [r1+16], xmm0
|
||||||
|
%else
|
||||||
|
pxor m4, m4
|
||||||
|
movq [r1+ 0], m4
|
||||||
|
movq [r1+ 8], m4
|
||||||
|
movq [r1+16], m4
|
||||||
|
movq [r1+24], m4
|
||||||
|
%endif
|
||||||
|
|
||||||
; actual IDCT
|
; actual IDCT
|
||||||
VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
|
VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
|
||||||
@@ -1028,6 +1042,10 @@ cglobal vp8_idct_add_mmx, 3, 3
|
|||||||
STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
|
STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
|
||||||
|
|
||||||
RET
|
RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
VP8_IDCT_ADD mmx
|
||||||
|
VP8_IDCT_ADD sse
|
||||||
|
|
||||||
;-----------------------------------------------------------------------------
|
;-----------------------------------------------------------------------------
|
||||||
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
|
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user