1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-11-21 10:55:51 +02:00

avfilter/bwdif: add avx2 filter_line function

8-bit:
2.24x faster (1925±1.3 vs. 859±2.2 decicycles) compared with ssse3
10-bit:
2.00x faster (1703±1.7 vs. 853±2.0 decicycles) compared with ssse3
This commit is contained in:
James Darnley 2023-02-20 20:55:08 +01:00
parent a937723ca9
commit 073ec3b9da
2 changed files with 36 additions and 5 deletions

View File

@ -26,18 +26,22 @@
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
SECTION_RODATA 32
pw_coefhf: times 4 dw 1016, 5570
pw_coefhf1: times 8 dw -3801
pw_coefsp: times 4 dw 5077, -981
pw_splfdif: times 4 dw -768, 768
pw_coefhf: times 8 dw 1016, 5570
pw_coefhf1: times 16 dw -3801
pw_coefsp: times 8 dw 5077, -981
pw_splfdif: times 8 dw -768, 768
SECTION .text
; LOAD8 dst, src
; Load 8-bit samples from memory operand %2 into vector register %1,
; widened to 16-bit words.
;   %1 = destination vector register
;   %2 = memory source operand
%macro LOAD8 2
%if mmsize == 32
; 256-bit registers: pmovzxbw loads the bytes and zero-extends them to
; words in a single instruction.
pmovzxbw %1, %2
%else
; Narrower registers: load half a register of bytes, then interleave with
; m7 to widen to words.
; NOTE(review): this only zero-extends if m7 is all zeroes; m7 is not set
; up inside this macro - confirm the caller clears it.
movh %1, %2
punpcklbw %1, m7
%endif
%endmacro
%macro LOAD12 2
@ -45,8 +49,14 @@ SECTION .text
%endmacro
; DISP8
; Narrow the 16-bit words in m2 to bytes with unsigned saturation and store
; them at [dstq]. Takes no arguments; reads m2 and dstq.
%macro DISP8 0
%if mmsize == 32
; 256-bit path: packuswb works per 128-bit lane, so pull the upper lane of
; m2 out into xm1 (overwriting m1) and pack low lane + high lane into one
; 128-bit register, giving the bytes in their original order.
vextracti128 xm1, m2, 1
packuswb xm2, xm1
; Unaligned store of the full 128-bit result.
movu [dstq], xm2
%else
; Narrower path: pack m2 with itself and store only the low half of the
; register.
packuswb m2, m2
movh [dstq], m2
%endif
%endmacro
%macro DISP12 0
@ -244,8 +254,12 @@ cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \
prefs, mrefs, prefs2, mrefs2, \
prefs3, mrefs3, prefs4, \
mrefs4, parity, clip_max
%if mmsize == 32
vpbroadcastw m12, WORD clip_maxm
%else
movd m12, DWORD clip_maxm
SPLATW m12, m12, 0
%endif
%else
cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
prefs, mrefs, prefs2, mrefs2, \
@ -264,3 +278,8 @@ INIT_XMM ssse3
BWDIF
INIT_XMM sse2
BWDIF
; Emit the AVX2 variant by expanding BWDIF with 256-bit (YMM) registers.
; Only assembled when external AVX2 support is enabled and the target is
; x86-64.
; NOTE(review): the x86-64 restriction is presumably due to the number of
; vector registers the YMM code path needs - confirm against the BWDIF body.
%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
INIT_YMM avx2
BWDIF
%endif

View File

@ -32,6 +32,10 @@ void ff_bwdif_filter_line_ssse3(void *dst, void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int prefs2,
int mrefs2, int prefs3, int mrefs3, int prefs4,
int mrefs4, int parity, int clip_max);
/* AVX2 line filter, same signature as the SSSE3 variant declared above.
 * ff_bwdif_init_x86() installs it as bwdif->filter_line when
 * ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags) holds. */
void ff_bwdif_filter_line_avx2(void *dst, void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int prefs2,
int mrefs2, int prefs3, int mrefs3, int prefs4,
int mrefs4, int parity, int clip_max);
void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int prefs2,
@ -41,6 +45,10 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne
int w, int prefs, int mrefs, int prefs2,
int mrefs2, int prefs3, int mrefs3, int prefs4,
int mrefs4, int parity, int clip_max);
/* AVX2 line filter for the bit_depth <= 12 branch of ff_bwdif_init_x86(),
 * same signature as the 12-bit SSSE3 variant. Installed as
 * bwdif->filter_line when ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags) holds. */
void ff_bwdif_filter_line_12bit_avx2(void *dst, void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int prefs2,
int mrefs2, int prefs3, int mrefs3, int prefs4,
int mrefs4, int parity, int clip_max);
av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
{
@ -51,10 +59,14 @@ av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
bwdif->filter_line = ff_bwdif_filter_line_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_ssse3;
if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_avx2;
} else if (bit_depth <= 12) {
if (EXTERNAL_SSE2(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
}
}