1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-10-06 05:47:18 +02:00

avcodec/x86/h263_loopfilter: Port loop filter to SSE2

Old benchmarks:
h263dsp.h_loop_filter_c:                                41.2 ( 1.00x)
h263dsp.h_loop_filter_mmx:                              39.5 ( 1.04x)
h263dsp.v_loop_filter_c:                                43.5 ( 1.00x)
h263dsp.v_loop_filter_mmx:                              16.9 ( 2.57x)

New benchmarks:
h263dsp.h_loop_filter_c:                                41.6 ( 1.00x)
h263dsp.h_loop_filter_sse2:                             28.2 ( 1.48x)
h263dsp.v_loop_filter_c:                                42.4 ( 1.00x)
h263dsp.v_loop_filter_sse2:                             15.1 ( 2.81x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-10-01 10:46:39 +02:00
committed by James Almer
parent a8a16c15c8
commit ab7d1c64c9
5 changed files with 78 additions and 105 deletions

View File

@@ -75,7 +75,7 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x808
0x8080808080808080ULL, 0x8080808080808080ULL };
/* Byte-pattern constants.
 * NOTE(review): this listing is a diff rendered without +/- markers, so
 * ff_pb_FC is defined twice below: once as an 8-byte-aligned uint64_t and once
 * as a 16-byte-aligned xmm_reg carrying the same 0xFC byte pattern in both
 * 64-bit halves. Presumably the uint64_t line is the removed one and the
 * xmm_reg line is its replacement; confirm against the real diff. */
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FC) = { 0xFCFCFCFCFCFCFCFCULL, 0xFCFCFCFCFCFCFCFCULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL };

View File

@@ -56,8 +56,8 @@ extern const ymm_reg ff_pb_1;
/* Extern declarations of shared byte-pattern constants.
 * NOTE(review): this listing is a diff rendered without +/- markers, so
 * ff_pb_FC is declared twice below with different types (xmm_reg and
 * uint64_t). Presumably the uint64_t declaration is the removed line and the
 * xmm_reg declaration is the added one; confirm against the real diff. */
extern const ymm_reg ff_pb_2;
extern const ymm_reg ff_pb_3;
extern const ymm_reg ff_pb_80;
extern const xmm_reg ff_pb_FC;
extern const ymm_reg ff_pb_FE;
extern const uint64_t ff_pb_FC;
extern const xmm_reg ff_ps_neg;

View File

@@ -1,5 +1,5 @@
;******************************************************************************
;* MMX-optimized H.263 loop filter
;* SSE2-optimized H.263 loop filter
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
@@ -22,7 +22,6 @@
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pb_FC
cextern h263_loop_filter_strength
@@ -30,60 +29,45 @@ SECTION .text
; H263_LOOP_FILTER line_a, line_b, line_c, line_d, strength
;   %1..%4  memory operands for the four pixel lines read by the filter
;   %5      operand holding the filter strength; it is loaded with movd and
;           its low byte is replicated by three punpcklbw steps
; Results stay in registers for the caller to store. The invocations in this
; listing write m5 back to the %1 location, m3 to %2, m4 to %3 and m6 to %4.
;
; NOTE(review): this listing is a diff rendered without +/- markers, so removed
; and added instructions are interleaved and the body is not runnable as shown.
; The "@@" line inside the body is a hunk header: instructions between the two
; hunks are not visible here. The lines that load with mova and also unpack the
; high half (punpckhbw, m4/m5 word pairs) appear to be the removed version; the
; lines that load with movq and unpack only the low half appear to be the
; replacement. Confirm against the real diff before relying on this.
%macro H263_LOOP_FILTER 5
; m7 = 0, used as the zero operand when unpacking bytes to words
pxor m7, m7
mova m0, [%1]
mova m1, [%1]
mova m2, [%4]
mova m3, [%4]
movq m0, [%1]
movq m6, [%4]
mova m5, m0
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m6, m7
psubw m0, m6
movq m2, [%2]
movq m1, [%3]
mova m3, m2
mova m4, m1
punpcklbw m2, m7
punpckhbw m3, m7
psubw m0, m2
psubw m1, m3
mova m2, [%2]
mova m3, [%2]
mova m4, [%3]
mova m5, [%3]
punpcklbw m2, m7
punpckhbw m3, m7
punpcklbw m4, m7
punpckhbw m5, m7
psubw m4, m2
psubw m5, m3
psllw m4, 2
psllw m5, 2
paddw m4, m0
paddw m5, m1
punpcklbw m1, m7
psubw m1, m2
psllw m1, 2
paddw m1, m0
pxor m6, m6
pcmpgtw m6, m4
pcmpgtw m7, m5
pxor m4, m6
pxor m5, m7
psubw m4, m6
psubw m5, m7
psrlw m4, 3
psrlw m5, 3
packuswb m4, m5
pcmpgtw m6, m1
pxor m1, m6
psubw m1, m6
psrlw m1, 3
packuswb m1, m7
packsswb m6, m7
pxor m7, m7
; replicate the low byte of %5 across the low eight bytes of m2
movd m2, %5
punpcklbw m2, m2
punpcklbw m2, m2
punpcklbw m2, m2
psubusb m2, m4
mova m3, m2
psubusb m3, m4
psubb m2, m3
mova m3, [%2]
mova m4, [%3]
psubusb m2, m1
mova m7, m2
psubusb m7, m1
psubb m2, m7
pxor m3, m6
pxor m4, m6
paddusb m3, m2
psubusb m4, m2
pxor m7, m7
pxor m3, m6
pxor m4, m6
paddusb m2, m2
packsswb m0, m1
packsswb m0, m7
pcmpgtb m7, m0
pxor m0, m7
psubb m0, m7
@@ -94,22 +78,20 @@ SECTION .text
psrlw m1, 2
pxor m1, m7
psubb m1, m7
mova m5, [%1]
mova m6, [%4]
movq m6, [%4]
psubb m5, m1
paddb m6, m1
%endmacro
; h263_v_loop_filter: loop filter applied across lines around src.
; Reads the byte h263_loop_filter_strength[qscale], doubles it, and runs
; H263_LOOP_FILTER on the lines addressed by r4, r3 (= r0 - r1), r0 and
; r0 + r1. The four result registers m3, m4, m5 and m6 are then stored to
; [r3], [r0], [r4] and [r0+r1] respectively.
; Presumably r0/r1/r2 hold src/stride/qscale in that order (x86inc argument
; numbering); confirm against x86inc.asm.
;
; NOTE(review): this listing is a diff rendered without +/- markers, so two
; INIT_*/cglobal headers, two table lookups and two sets of stores (mova and
; movq) appear together. The "@@" line is a hunk header; the instructions that
; initialise r4 before "sub r4, r1" are not visible here.
INIT_MMX mmx
; void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
cglobal h263_v_loop_filter, 3,5
INIT_XMM sse2
; void ff_h263_v_loop_filter_sse2(uint8_t *src, int stride, int qscale)
cglobal h263_v_loop_filter, 3,5,8
movsxdifnidn r1, r1d
movsxdifnidn r2, r2d
; fetch the strength byte for qscale from the table and double it
lea r4, [h263_loop_filter_strength]
movzx r3d, BYTE [r4+r2]
movsx r2, r3b
shl r2, 1
lea r3, [h263_loop_filter_strength]
movzx r2d, BYTE [r3+r2]
shl r2d, 1
; r3 = r0 - r1, the line directly before src
mov r3, r0
sub r3, r1
@@ -117,73 +99,64 @@ cglobal h263_v_loop_filter, 3,5
sub r4, r1
H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d
; write the filtered lines back (full-register mova and 8-byte movq variants)
mova [r3], m3
mova [r0], m4
mova [r4], m5
mova [r0+r1], m6
movq [r3], m3
movq [r0], m4
movq [r4], m5
movq [r0+r1], m6
RET
; TRANSPOSE4X4: gather 4 bytes from each of four lines and interleave them.
; Both bodies below address the four lines as base, base+r1, base+r1*2 and
; base+r3; the h263_h_loop_filter caller in this listing sets r3 = r1*3.
;
; NOTE(review): this listing is a diff rendered without +/- markers, so two
; alternative bodies appear back to back and the macro is not usable as shown.
; Presumably the first body is the removed version and the second its
; replacement; confirm against the real diff.
%macro TRANSPOSE4X4 2
; First body: %1 = source address, %2 = destination address.
; Interleaves by bytes, then by words, and stores four dwords at
; %2+0, %2+8, %2+16 and %2+24. Uses m0-m3.
movd m0, [%1]
movd m1, [%1+r1]
movd m2, [%1+r1*2]
movd m3, [%1+r3]
punpcklbw m0, m1
punpcklbw m2, m3
mova m1, m0
punpcklwd m0, m2
punpckhwd m1, m2
movd [%2+ 0], m0
punpckhdq m0, m0
movd [%2+ 8], m0
movd [%2+16], m1
punpckhdq m1, m1
movd [%2+24], m1
; Second body: %1 = destination register, %2 = source address.
; Interleaves by bytes, then by words, and leaves the result in %1 with no
; store to memory. Uses m2-m4 as temporaries.
movd %1, [%2]
movd m2, [%2+r1]
movd m3, [%2+r1*2]
movd m4, [%2+r3]
punpcklbw %1, m2
punpcklbw m3, m4
punpcklwd %1, m3
%endmacro
; h263_h_loop_filter: loop filter applied across columns around src.
; Visible steps: read the byte h263_loop_filter_strength[qscale] and double it;
; move r0 back by 2 bytes; transpose the 4 bytes at r0 on each of 8 lines
; (r0 + k*r1 and r4 + k*r1 for k = 0..3, with r4 = r0 + r1*4) into a buffer at
; rsp; run H263_LOOP_FILTER on the 8-byte rows at rsp, rsp+8, rsp+16 and
; rsp+24; then interleave the results and store 4 bytes back to each of the
; same 8 lines.
; Presumably r0/r1/r2 hold src/stride/qscale (x86inc argument numbering) and
; the trailing 32 in cglobal reserves the 32-byte buffer at rsp; confirm
; against x86inc.asm.
;
; NOTE(review): this listing is a diff rendered without +/- markers, so removed
; and added instructions are interleaved (two cglobal headers, two TRANSPOSE4X4
; call styles, two store sequences). It is not runnable as shown.
; void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
INIT_MMX mmx
cglobal h263_h_loop_filter, 3,5,0,32
; void ff_h263_h_loop_filter_sse2(uint8_t *src, int stride, int qscale)
INIT_XMM sse2
cglobal h263_h_loop_filter, 3,5,8,32
movsxdifnidn r1, r1d
movsxdifnidn r2, r2d
; fetch the strength byte for qscale from the table and double it
lea r4, [h263_loop_filter_strength]
movzx r3d, BYTE [r4+r2]
movsx r2, r3b
shl r2, 1
movzx r2d, BYTE [r4+r2]
shl r2d, 1
; start 2 bytes before src, so the 4 bytes read per line are src-2 .. src+1
sub r0, 2
; r3 = 3 * r1, the offset of the fourth line
lea r3, [r1*3]
TRANSPOSE4X4 r0, rsp
; r4 = address of the fifth line
lea r4, [r0+r1*4]
TRANSPOSE4X4 r4, rsp+4
TRANSPOSE4X4 m0, r0
TRANSPOSE4X4 m1, r4
; combine the two transposed halves and spill them to the buffer at rsp
mova m2, m0
punpckldq m0, m1
mova [rsp], m0
punpckhdq m2, m1
mova [rsp+16], m2
; filter the four 8-byte rows held in the buffer
H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d
; interleave the filtered rows (m5, m3, m4, m6) and write 4 bytes per line
mova m1, m5
mova m0, m4
punpcklbw m5, m3
punpcklbw m4, m6
punpckhbw m1, m3
punpckhbw m0, m6
mova m3, m5
mova m6, m1
mova m0, m5
punpcklwd m5, m4
punpcklwd m1, m0
punpckhwd m3, m4
punpckhwd m6, m0
punpckhwd m0, m4
movd [r0], m5
movd [r4], m0
pshufd m1, m5, 0x1
pshufd m2, m0, 0x1
movd [r0+r1*1], m1
movd [r4+r1*1], m2
punpckhdq m5, m5
movd [r0+r1*1], m5
movd [r0+r1*2], m3
punpckhdq m3, m3
movd [r0+r3], m3
movd [r4], m1
punpckhdq m1, m1
movd [r4+r1*1], m1
movd [r4+r1*2], m6
punpckhdq m6, m6
movd [r4+r3], m6
punpckhdq m0, m0
movd [r0+r1*2], m5
movd [r4+r1*2], m0
punpckhdq m5, m5
punpckhdq m0, m0
movd [r0+r3], m5
movd [r4+r3], m0
RET

View File

@@ -25,15 +25,15 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/h263dsp.h"
/* Prototypes of the assembly loop filters, followed by the x86 init hook.
 * ff_h263dsp_init_x86() reads the CPU flags with av_get_cpu_flags() and, when
 * the EXTERNAL_* check passes, stores the assembly functions into
 * c->h263_h_loop_filter and c->h263_v_loop_filter.
 *
 * NOTE(review): this listing is a diff rendered without +/- markers, so both
 * the _mmx and _sse2 prototypes and both the EXTERNAL_MMX and EXTERNAL_SSE2
 * branches appear together, which leaves the braces unbalanced as shown.
 * Presumably the _mmx lines are the removed ones and the _sse2 lines their
 * replacements; confirm against the real diff. */
void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_h_loop_filter_sse2(uint8_t *src, int stride, int qscale);
void ff_h263_v_loop_filter_sse2(uint8_t *src, int stride, int qscale);
av_cold void ff_h263dsp_init_x86(H263DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
if (EXTERNAL_SSE2(cpu_flags)) {
c->h263_h_loop_filter = ff_h263_h_loop_filter_sse2;
c->h263_v_loop_filter = ff_h263_v_loop_filter_sse2;
}
}

View File

@@ -34,7 +34,7 @@ static void check_loop_filter(char dim, filter func)
LOCAL_ALIGNED_16(uint8_t, buf1, [32 * 32]);
int qscale = rnd() % 32;
declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);
declare_func(void, uint8_t *, int, int);
for (size_t y = 0; y < 32; y++)
for (size_t x = 0; x < 32; x++)