1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-02-04 06:08:26 +02:00

Various VP8 x86 deblocking speedups

Add SSSE3 versions of the loop filter functions and slightly improve the SSE2 versions.
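The SSE2/SSSE3 horizontal (h) macroblock-edge (mbedge) loop filter functions are currently broken, so they are explicitly disabled (their assignments in ff_vp8dsp_init_x86 are commented out).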

Originally committed as revision 24403 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
Jason Garrett-Glaser 2010-07-21 22:11:03 +00:00
parent 8a810ccbba
commit 7dd224a42d
2 changed files with 105 additions and 90 deletions

View File

@ -223,64 +223,31 @@ extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride)
extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_v_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim); #define DECLARE_LOOP_FILTER(NAME)\
extern void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim); extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\
extern void ff_vp8_v_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim); extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\
extern void ff_vp8_h_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim); extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\
extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim); int e, int i, int hvt);\
extern void ff_vp8_h_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim); extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\
int e, int i, int hvt);\
extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\
int s, int e, int i, int hvt);\
extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\
int s, int e, int i, int hvt);\
extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\
int e, int i, int hvt);\
extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\
int e, int i, int hvt);\
extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\
int s, int e, int i, int hvt);\
extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\
int s, int e, int i, int hvt);
extern void ff_vp8_v_loop_filter16y_inner_mmx (uint8_t *dst, int stride, DECLARE_LOOP_FILTER(mmx)
int e, int i, int hvt); DECLARE_LOOP_FILTER(mmxext)
extern void ff_vp8_v_loop_filter16y_inner_mmxext(uint8_t *dst, int stride, DECLARE_LOOP_FILTER(sse2)
int e, int i, int hvt); DECLARE_LOOP_FILTER(ssse3)
extern void ff_vp8_v_loop_filter16y_inner_sse2 (uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_h_loop_filter16y_inner_mmx (uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_h_loop_filter16y_inner_mmxext(uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_h_loop_filter16y_inner_sse2 (uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_v_loop_filter8uv_inner_mmx (uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_v_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_v_loop_filter8uv_inner_sse2 (uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_h_loop_filter8uv_inner_mmx (uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_h_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_h_loop_filter8uv_inner_sse2 (uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_v_loop_filter16y_mbedge_mmx (uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_v_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_v_loop_filter16y_mbedge_sse2 (uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_h_loop_filter16y_mbedge_mmx (uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_h_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_h_loop_filter16y_mbedge_sse2 (uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_v_loop_filter8uv_mbedge_mmx (uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_v_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_v_loop_filter8uv_mbedge_sse2 (uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_h_loop_filter8uv_mbedge_mmx (uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_h_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_h_loop_filter8uv_mbedge_sse2 (uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
#endif #endif
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \ #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
@ -384,8 +351,8 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext; //c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext; //c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
} }
if (mm_flags & FF_MM_SSSE3) { if (mm_flags & FF_MM_SSSE3) {
@ -395,6 +362,19 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
VP8_BILINEAR_MC_FUNC(0, 16, ssse3); VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
VP8_BILINEAR_MC_FUNC(1, 8, ssse3); VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
VP8_BILINEAR_MC_FUNC(2, 4, ssse3); VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
//c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
//c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
} }
if (mm_flags & FF_MM_SSE4) { if (mm_flags & FF_MM_SSE4) {

View File

@ -1229,18 +1229,22 @@ cglobal vp8_luma_dc_wht_mmx, 2,3
movd [%7+%9*2], m%4 movd [%7+%9*2], m%4
%endmacro %endmacro
%macro SPLATB_REG 3 %macro SPLATB_REG 3-4
movd %1, %2 movd %1, %2
%ifidn %3, ssse3
pshufb %1, %4
%else
punpcklbw %1, %1 punpcklbw %1, %1
%if mmsize == 16 ; sse2 %if mmsize == 16 ; sse2
punpcklwd %1, %1 pshuflw %1, %1, 0x0
pshufd %1, %1, 0x0 punpcklqdq %1, %1
%elifidn %3, mmx %elifidn %3, mmx
punpcklwd %1, %1 punpcklwd %1, %1
punpckldq %1, %1 punpckldq %1, %1
%else ; mmxext %else ; mmxext
pshufw %1, %1, 0x0 pshufw %1, %1, 0x0
%endif %endif
%endif
%endmacro %endmacro
%macro SIMPLE_LOOPFILTER 3 %macro SIMPLE_LOOPFILTER 3
@ -1252,7 +1256,10 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3
%if mmsize == 8 ; mmx/mmxext %if mmsize == 8 ; mmx/mmxext
mov r3, 2 mov r3, 2
%endif %endif
SPLATB_REG m7, r2, %1 ; splat "flim" into register %ifidn %1, ssse3
pxor m0, m0
%endif
SPLATB_REG m7, r2, %1, m0 ; splat "flim" into register
; set up indexes to address 4 rows ; set up indexes to address 4 rows
mov r2, r1 mov r2, r1
@ -1398,6 +1405,8 @@ SIMPLE_LOOPFILTER mmxext, h, 6
INIT_XMM INIT_XMM
SIMPLE_LOOPFILTER sse2, v, 3 SIMPLE_LOOPFILTER sse2, v, 3
SIMPLE_LOOPFILTER sse2, h, 6 SIMPLE_LOOPFILTER sse2, h, 6
SIMPLE_LOOPFILTER ssse3, v, 3
SIMPLE_LOOPFILTER ssse3, h, 6
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
@ -1433,11 +1442,15 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%define stack_reg hev_thr_reg %define stack_reg hev_thr_reg
%endif %endif
%ifidn %1, ssse3
pxor m7, m7
%endif
%ifndef m8 ; mmx/mmxext or sse2 on x86-32 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
; splat function arguments ; splat function arguments
SPLATB_REG m0, E_reg, %1 ; E SPLATB_REG m0, E_reg, %1, m7 ; E
SPLATB_REG m1, I_reg, %1 ; I SPLATB_REG m1, I_reg, %1, m7 ; I
SPLATB_REG m2, hev_thr_reg, %1 ; hev_thresh SPLATB_REG m2, hev_thr_reg, %1, m7 ; hev_thresh
; align stack ; align stack
mov stack_reg, rsp ; backup stack pointer mov stack_reg, rsp ; backup stack pointer
@ -1470,9 +1483,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%define q0backup m8 %define q0backup m8
; splat function arguments ; splat function arguments
SPLATB_REG flim_E, E_reg, %1 ; E SPLATB_REG flim_E, E_reg, %1, m7 ; E
SPLATB_REG flim_I, I_reg, %1 ; I SPLATB_REG flim_I, I_reg, %1, m7 ; I
SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh SPLATB_REG hev_thr, hev_thr_reg, %1, m7 ; hev_thresh
%endif %endif
%if mmsize == 8 && %4 == 16 ; mmx/mmxext %if mmsize == 8 && %4 == 16 ; mmx/mmxext
@ -1884,15 +1897,15 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%endmacro %endmacro
INIT_MMX INIT_MMX
INNER_LOOPFILTER mmx, v, 6, 16, 8 INNER_LOOPFILTER mmx, v, 6, 16, 0
INNER_LOOPFILTER mmx, h, 6, 16, 8 INNER_LOOPFILTER mmx, h, 6, 16, 0
INNER_LOOPFILTER mmxext, v, 6, 16, 8 INNER_LOOPFILTER mmxext, v, 6, 16, 0
INNER_LOOPFILTER mmxext, h, 6, 16, 8 INNER_LOOPFILTER mmxext, h, 6, 16, 0
INNER_LOOPFILTER mmx, v, 6, 8, 8 INNER_LOOPFILTER mmx, v, 6, 8, 0
INNER_LOOPFILTER mmx, h, 6, 8, 8 INNER_LOOPFILTER mmx, h, 6, 8, 0
INNER_LOOPFILTER mmxext, v, 6, 8, 8 INNER_LOOPFILTER mmxext, v, 6, 8, 0
INNER_LOOPFILTER mmxext, h, 6, 8, 8 INNER_LOOPFILTER mmxext, h, 6, 8, 0
INIT_XMM INIT_XMM
INNER_LOOPFILTER sse2, v, 5, 16, 13 INNER_LOOPFILTER sse2, v, 5, 16, 13
@ -1904,6 +1917,15 @@ INNER_LOOPFILTER sse2, h, 6, 16, 13
INNER_LOOPFILTER sse2, v, 6, 8, 13 INNER_LOOPFILTER sse2, v, 6, 8, 13
INNER_LOOPFILTER sse2, h, 6, 8, 13 INNER_LOOPFILTER sse2, h, 6, 8, 13
INNER_LOOPFILTER ssse3, v, 5, 16, 13
%ifdef m8
INNER_LOOPFILTER ssse3, h, 5, 16, 13
%else
INNER_LOOPFILTER ssse3, h, 6, 16, 13
%endif
INNER_LOOPFILTER ssse3, v, 6, 8, 13
INNER_LOOPFILTER ssse3, h, 6, 8, 13
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
; int flimE, int flimI, int hev_thr); ; int flimE, int flimI, int hev_thr);
@ -1984,11 +2006,15 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%define stack_reg hev_thr_reg %define stack_reg hev_thr_reg
%endif %endif
%ifidn %1, ssse3
pxor m7, m7
%endif
%ifndef m8 ; mmx/mmxext or sse2 on x86-32 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
; splat function arguments ; splat function arguments
SPLATB_REG m0, E_reg, %1 ; E SPLATB_REG m0, E_reg, %1, m7 ; E
SPLATB_REG m1, I_reg, %1 ; I SPLATB_REG m1, I_reg, %1, m7 ; I
SPLATB_REG m2, hev_thr_reg, %1 ; hev_thresh SPLATB_REG m2, hev_thr_reg, %1, m7 ; hev_thresh
; align stack ; align stack
mov stack_reg, rsp ; backup stack pointer mov stack_reg, rsp ; backup stack pointer
@ -2028,9 +2054,9 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%define lim_sign m15 %define lim_sign m15
; splat function arguments ; splat function arguments
SPLATB_REG flim_E, E_reg, %1 ; E SPLATB_REG flim_E, E_reg, %1, m7 ; E
SPLATB_REG flim_I, I_reg, %1 ; I SPLATB_REG flim_I, I_reg, %1, m7 ; I
SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh SPLATB_REG hev_thr, hev_thr_reg, %1, m7 ; hev_thresh
%endif %endif
%if mmsize == 8 && %4 == 16 ; mmx/mmxext %if mmsize == 8 && %4 == 16 ; mmx/mmxext
@ -2521,15 +2547,15 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%endmacro %endmacro
INIT_MMX INIT_MMX
MBEDGE_LOOPFILTER mmx, v, 6, 16, 8 MBEDGE_LOOPFILTER mmx, v, 6, 16, 0
MBEDGE_LOOPFILTER mmx, h, 6, 16, 8 MBEDGE_LOOPFILTER mmx, h, 6, 16, 0
MBEDGE_LOOPFILTER mmxext, v, 6, 16, 8 MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
MBEDGE_LOOPFILTER mmxext, h, 6, 16, 8 MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
MBEDGE_LOOPFILTER mmx, v, 6, 8, 8 MBEDGE_LOOPFILTER mmx, v, 6, 8, 0
MBEDGE_LOOPFILTER mmx, h, 6, 8, 8 MBEDGE_LOOPFILTER mmx, h, 6, 8, 0
MBEDGE_LOOPFILTER mmxext, v, 6, 8, 8 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0
MBEDGE_LOOPFILTER mmxext, h, 6, 8, 8 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0
INIT_XMM INIT_XMM
MBEDGE_LOOPFILTER sse2, v, 5, 16, 16 MBEDGE_LOOPFILTER sse2, v, 5, 16, 16
@ -2540,3 +2566,12 @@ MBEDGE_LOOPFILTER sse2, h, 6, 16, 16
%endif %endif
MBEDGE_LOOPFILTER sse2, v, 6, 8, 16 MBEDGE_LOOPFILTER sse2, v, 6, 8, 16
MBEDGE_LOOPFILTER sse2, h, 6, 8, 16 MBEDGE_LOOPFILTER sse2, h, 6, 8, 16
MBEDGE_LOOPFILTER ssse3, v, 5, 16, 16
%ifdef m8
MBEDGE_LOOPFILTER ssse3, h, 5, 16, 16
%else
MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16
%endif
MBEDGE_LOOPFILTER ssse3, v, 6, 8, 16
MBEDGE_LOOPFILTER ssse3, h, 6, 8, 16