From ab7d1c64c9aa9186acb1d988d020e59f2d3defce Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt Date: Wed, 1 Oct 2025 10:46:39 +0200 Subject: [PATCH] avcodec/x86/h263_loopfilter: Port loop filter to SSE2 Old benchmarks: h263dsp.h_loop_filter_c: 41.2 ( 1.00x) h263dsp.h_loop_filter_mmx: 39.5 ( 1.04x) h263dsp.v_loop_filter_c: 43.5 ( 1.00x) h263dsp.v_loop_filter_mmx: 16.9 ( 2.57x) New benchmarks: h263dsp.h_loop_filter_c: 41.6 ( 1.00x) h263dsp.h_loop_filter_sse2: 28.2 ( 1.48x) h263dsp.v_loop_filter_c: 42.4 ( 1.00x) h263dsp.v_loop_filter_sse2: 15.1 ( 2.81x) Signed-off-by: Andreas Rheinhardt --- libavcodec/x86/constants.c | 2 +- libavcodec/x86/constants.h | 2 +- libavcodec/x86/h263_loopfilter.asm | 167 ++++++++++++----------------- libavcodec/x86/h263dsp_init.c | 10 +- tests/checkasm/h263dsp.c | 2 +- 5 files changed, 78 insertions(+), 105 deletions(-) diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c index c5f3c6428e..1e2f5990e4 100644 --- a/libavcodec/x86/constants.c +++ b/libavcodec/x86/constants.c @@ -75,7 +75,7 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x808 0x8080808080808080ULL, 0x8080808080808080ULL }; DECLARE_ALIGNED(32, const ymm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL }; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FC) = { 0xFCFCFCFCFCFCFCFCULL, 0xFCFCFCFCFCFCFCFCULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL }; diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h index 4a55adb5b3..7d0bd975b9 100644 --- a/libavcodec/x86/constants.h +++ b/libavcodec/x86/constants.h @@ -56,8 +56,8 @@ extern const ymm_reg ff_pb_1; extern const ymm_reg ff_pb_2; extern const ymm_reg ff_pb_3; extern const ymm_reg ff_pb_80; +extern const xmm_reg ff_pb_FC; extern const ymm_reg ff_pb_FE; -extern const uint64_t ff_pb_FC; extern const xmm_reg ff_ps_neg; diff --git a/libavcodec/x86/h263_loopfilter.asm b/libavcodec/x86/h263_loopfilter.asm index 77c8cf154d..ebe76f01af 100644 --- a/libavcodec/x86/h263_loopfilter.asm +++ b/libavcodec/x86/h263_loopfilter.asm @@ -1,5 +1,5 @@ ;****************************************************************************** -;* MMX-optimized H.263 loop filter +;* SSE2-optimized H.263 loop filter ;* Copyright (c) 2003-2013 Michael Niedermayer ;* Copyright (c) 2013 Daniel Kang ;* @@ -22,7 +22,6 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA cextern pb_FC cextern h263_loop_filter_strength @@ -30,60 +29,45 @@ SECTION .text %macro H263_LOOP_FILTER 5 pxor m7, m7 - mova m0, [%1] - mova m1, [%1] - mova m2, [%4] - mova m3, [%4] + movq m0, [%1] + movq m6, [%4] + mova m5, m0 punpcklbw m0, m7 - punpckhbw m1, m7 + punpcklbw m6, m7 + psubw m0, m6 + movq m2, [%2] + movq m1, [%3] + mova m3, m2 + mova m4, m1 punpcklbw m2, m7 - punpckhbw m3, m7 - psubw m0, m2 - psubw m1, m3 - mova m2, [%2] - mova m3, [%2] - mova m4, [%3] - mova m5, [%3] - punpcklbw m2, m7 - punpckhbw m3, m7 - punpcklbw m4, m7 - punpckhbw m5, m7 - psubw m4, m2 - psubw m5, m3 - psllw m4, 2 - psllw m5, 2 - paddw m4, m0 - paddw m5, m1 + punpcklbw m1, m7 + psubw m1, m2 + psllw m1, 2 + paddw m1, m0 pxor m6, m6 - pcmpgtw m6, m4 - pcmpgtw m7, m5 - pxor m4, m6 - pxor m5, m7 - psubw m4, m6 - psubw m5, m7 - psrlw m4, 3 - psrlw m5, 3 - packuswb m4, m5 + pcmpgtw m6, m1 + pxor m1, m6 + psubw m1, m6 + psrlw m1, 3 + packuswb m1, m7 packsswb m6, m7 - pxor m7, m7 movd m2, %5 punpcklbw m2, m2 punpcklbw m2, m2 punpcklbw m2, m2 - psubusb m2, m4 - mova m3, m2 - psubusb m3, m4 - psubb m2, m3 - mova m3, [%2] - mova m4, [%3] + psubusb m2, m1 + mova m7, m2 + psubusb m7, m1 + psubb m2, m7 pxor m3, m6 pxor m4, m6 paddusb m3, m2 psubusb m4, m2 + pxor m7, m7 pxor m3, m6 pxor m4, m6 paddusb m2, m2 - packsswb m0, m1 + packsswb m0, m7 pcmpgtb m7, m0 pxor m0, m7 psubb m0, m7 @@ -94,22 +78,20 @@ SECTION .text psrlw m1, 2 pxor m1, m7 psubb m1, m7 - mova m5, [%1] - mova m6, [%4] + movq m6, [%4] psubb m5, m1 paddb m6, m1 %endmacro -INIT_MMX mmx -; void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale) -cglobal h263_v_loop_filter, 3,5 +INIT_XMM sse2 +; void ff_h263_v_loop_filter_sse2(uint8_t *src, int stride, int qscale) +cglobal h263_v_loop_filter, 3,5,8 movsxdifnidn r1, r1d movsxdifnidn r2, r2d - lea r4, [h263_loop_filter_strength] - movzx r3d, BYTE [r4+r2] - movsx r2, r3b - shl r2, 1 + lea r3, [h263_loop_filter_strength] + movzx r2d, BYTE [r3+r2] + shl r2d, 1 mov r3, r0 sub r3, r1 @@ -117,73 +99,64 @@ cglobal h263_v_loop_filter, 3,5 sub r4, r1 H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d - mova [r3], m3 - mova [r0], m4 - mova [r4], m5 - mova [r0+r1], m6 + movq [r3], m3 + movq [r0], m4 + movq [r4], m5 + movq [r0+r1], m6 RET %macro TRANSPOSE4X4 2 - movd m0, [%1] - movd m1, [%1+r1] - movd m2, [%1+r1*2] - movd m3, [%1+r3] - punpcklbw m0, m1 - punpcklbw m2, m3 - mova m1, m0 - punpcklwd m0, m2 - punpckhwd m1, m2 - movd [%2+ 0], m0 - punpckhdq m0, m0 - movd [%2+ 8], m0 - movd [%2+16], m1 - punpckhdq m1, m1 - movd [%2+24], m1 + movd %1, [%2] + movd m2, [%2+r1] + movd m3, [%2+r1*2] + movd m4, [%2+r3] + punpcklbw %1, m2 + punpcklbw m3, m4 + punpcklwd %1, m3 %endmacro -; void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale) -INIT_MMX mmx -cglobal h263_h_loop_filter, 3,5,0,32 +; void ff_h263_h_loop_filter_sse2(uint8_t *src, int stride, int qscale) +INIT_XMM sse2 +cglobal h263_h_loop_filter, 3,5,8,32 movsxdifnidn r1, r1d movsxdifnidn r2, r2d lea r4, [h263_loop_filter_strength] - movzx r3d, BYTE [r4+r2] - movsx r2, r3b - shl r2, 1 + movzx r2d, BYTE [r4+r2] + shl r2d, 1 sub r0, 2 lea r3, [r1*3] - - TRANSPOSE4X4 r0, rsp lea r4, [r0+r1*4] - TRANSPOSE4X4 r4, rsp+4 + + TRANSPOSE4X4 m0, r0 + TRANSPOSE4X4 m1, r4 + mova m2, m0 + punpckldq m0, m1 + mova [rsp], m0 + punpckhdq m2, m1 + mova [rsp+16], m2 H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d - mova m1, m5 - mova m0, m4 punpcklbw m5, m3 punpcklbw m4, m6 - punpckhbw m1, m3 - punpckhbw m0, m6 - mova m3, m5 - mova m6, m1 + mova m0, m5 punpcklwd m5, m4 - punpcklwd m1, m0 - punpckhwd m3, m4 - punpckhwd m6, m0 + punpckhwd m0, m4 movd [r0], m5 + movd [r4], m0 + pshufd m1, m5, 0x1 + pshufd m2, m0, 0x1 + movd [r0+r1*1], m1 + movd [r4+r1*1], m2 punpckhdq m5, m5 - movd [r0+r1*1], m5 - movd [r0+r1*2], m3 - punpckhdq m3, m3 - movd [r0+r3], m3 - movd [r4], m1 - punpckhdq m1, m1 - movd [r4+r1*1], m1 - movd [r4+r1*2], m6 - punpckhdq m6, m6 - movd [r4+r3], m6 + punpckhdq m0, m0 + movd [r0+r1*2], m5 + movd [r4+r1*2], m0 + punpckhdq m5, m5 + punpckhdq m0, m0 + movd [r0+r3], m5 + movd [r4+r3], m0 RET diff --git a/libavcodec/x86/h263dsp_init.c b/libavcodec/x86/h263dsp_init.c index ab81063233..3dd5d132e5 100644 --- a/libavcodec/x86/h263dsp_init.c +++ b/libavcodec/x86/h263dsp_init.c @@ -25,15 +25,15 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/h263dsp.h" -void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale); -void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale); +void ff_h263_h_loop_filter_sse2(uint8_t *src, int stride, int qscale); +void ff_h263_v_loop_filter_sse2(uint8_t *src, int stride, int qscale); av_cold void ff_h263dsp_init_x86(H263DSPContext *c) { int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMX(cpu_flags)) { - c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx; - c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx; + if (EXTERNAL_SSE2(cpu_flags)) { + c->h263_h_loop_filter = ff_h263_h_loop_filter_sse2; + c->h263_v_loop_filter = ff_h263_v_loop_filter_sse2; } } diff --git a/tests/checkasm/h263dsp.c b/tests/checkasm/h263dsp.c index 2d0957a90b..f99d376adc 100644 --- a/tests/checkasm/h263dsp.c +++ b/tests/checkasm/h263dsp.c @@ -34,7 +34,7 @@ static void check_loop_filter(char dim, filter func) LOCAL_ALIGNED_16(uint8_t, buf1, [32 * 32]); int qscale = rnd() % 32; - declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int); + declare_func(void, uint8_t *, int, int); for (size_t y = 0; y < 32; y++) for (size_t x = 0; x < 32; x++)