From dc5eec80851418f7165257224576518f19a2aabb Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje"
Date: Thu, 22 Jul 2010 19:59:34 +0000
Subject: [PATCH] Use pextrw for SSE4 mbedge filter result writing, speedup
 5-10cycles on CPUs supporting it.

Originally committed as revision 24437 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
Review notes (ignored by git-am, not part of the commit message):
- In MBEDGE_LOOPFILTER, %1 is the optimization name and %2 is the filter
  direction (h/v). Selecting the SSE4 code on %2 meant the pextrw path was
  never assembled; WRITE_8W is now passed %1 and the %ifidn tests use %1.
- The SSE4 form of WRITE_8W takes a pointer to the destination's 5th line as
  its 3rd argument. dst2_reg is passed in that position by both calls, so it
  is now set up before each call (the second setup previously wrote dst_reg,
  which the second call does not use).
- Indentation inside the hunks is reconstructed; apply with
  --ignore-whitespace if context lines do not match exactly.

 libavcodec/x86/vp8dsp-init.c |    4 ++++
 libavcodec/x86/vp8dsp.asm    |   38 +++++++++++++++++++++++++++++++++-----
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c
index 66ae884705..fad399fba8 100644
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -247,6 +247,7 @@ DECLARE_LOOP_FILTER(mmx)
 DECLARE_LOOP_FILTER(mmxext)
 DECLARE_LOOP_FILTER(sse2)
 DECLARE_LOOP_FILTER(ssse3)
+DECLARE_LOOP_FILTER(sse4)
 
 #endif
 
@@ -379,6 +380,9 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
 
     if (mm_flags & FF_MM_SSE4) {
         c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
+
+        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
+        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
     }
 #endif
 }
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index c9d7d383fb..843873167d 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -1932,10 +1932,24 @@ INNER_LOOPFILTER ssse3, h, 6, 8, 13
 
 ; write 4 or 8 words in the mmx/xmm registers as 8 lines
 ; 1 and 2 are the registers to write, this can be the same (for SSE2)
+; for pre-SSE4:
 ; 3 is a general-purpose register that we will clobber
+; for SSE4:
+; 3 is a pointer to the destination's 5th line
 ; 4 is a pointer to the destination's 4th line
-; 5 is -stride and +stride
-%macro WRITE_8W 6
+; 5/6 is -stride and +stride
+; 7 is optimization string
+%macro WRITE_8W 7
+%ifidn %7, sse4
+    pextrw    [%4+%5*4], %1, 0
+    pextrw    [%3+%5*4], %1, 1
+    pextrw    [%4+%5*2], %1, 2
+    pextrw    [%4+%5  ], %1, 3
+    pextrw    [%4     ], %1, 4
+    pextrw    [%3     ], %1, 5
+    pextrw    [%3+%6  ], %1, 6
+    pextrw    [%3+%6*2], %1, 7
+%else
     movd             %3, %1
 %if mmsize == 8
     punpckhdq        %1, %1
@@ -1974,6 +1988,7 @@ INNER_LOOPFILTER ssse3, h, 6, 8, 13
 %if mmsize == 8
     add              %4, %5
 %endif
+%endif
 %endmacro
 
 %macro MBEDGE_LOOPFILTER 5
@@ -2509,14 +2524,20 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %if mmsize == 8 ; mmx/mmxext (h)
     WRITE_4x2D       1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
     add         dst_reg, 4
-    WRITE_8W        m5, m6, dst2_reg, dst_reg,  mstride_reg, stride_reg
+    WRITE_8W        m5, m6, dst2_reg, dst_reg,  mstride_reg, stride_reg, %1
 %else ; sse2 (h)
     lea        dst8_reg, [dst8_reg+mstride_reg+1]
     WRITE_4x4D       1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
     lea         dst_reg, [dst2_reg+mstride_reg+4]
     lea        dst8_reg, [dst8_reg+mstride_reg+4]
-    WRITE_8W        m5, m5, dst2_reg, dst_reg,  mstride_reg, stride_reg
-    WRITE_8W        m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
+%ifidn %1, sse4
+    add        dst2_reg, 4                      ; dst2_reg = dst_reg + stride (5th line, same column)
+%endif
+    WRITE_8W        m5, m5, dst2_reg, dst_reg,  mstride_reg, stride_reg, %1
+%ifidn %1, sse4
+    lea        dst2_reg, [dst8_reg+ stride_reg] ; 5th line for the block addressed by dst8_reg
+%endif
+    WRITE_8W        m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg, %1
 %endif
 %endif
 
@@ -2574,3 +2595,10 @@ MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16
 %endif
 MBEDGE_LOOPFILTER ssse3, v, 6,  8, 16
 MBEDGE_LOOPFILTER ssse3, h, 6,  8, 16
+
+%ifdef m8
+MBEDGE_LOOPFILTER sse4,  h, 5, 16, 16
+%else
+MBEDGE_LOOPFILTER sse4,  h, 6, 16, 16
+%endif
+MBEDGE_LOOPFILTER sse4,  h, 6,  8, 16