mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-21 10:55:51 +02:00
avfilter/bwdif: add avx2 filter_line function
8-bit: 2.24x faster (1925±1.3 vs. 859±2.2 decicycles) compared with ssse3.
10-bit: 2.00x faster (1703±1.7 vs. 853±2.0 decicycles) compared with ssse3.
This commit is contained in:
parent
a937723ca9
commit
073ec3b9da
@ -26,18 +26,22 @@
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
SECTION_RODATA 32
|
||||
|
||||
pw_coefhf: times 4 dw 1016, 5570
|
||||
pw_coefhf1: times 8 dw -3801
|
||||
pw_coefsp: times 4 dw 5077, -981
|
||||
pw_splfdif: times 4 dw -768, 768
|
||||
pw_coefhf: times 8 dw 1016, 5570
|
||||
pw_coefhf1: times 16 dw -3801
|
||||
pw_coefsp: times 8 dw 5077, -981
|
||||
pw_splfdif: times 8 dw -768, 768
|
||||
|
||||
SECTION .text
|
||||
|
||||
; LOAD8 dst, src
;   %1 = destination vector register, %2 = source memory operand.
;   Loads 8-bit samples from %2 and widens them to 16-bit words in %1.
;   Two code paths are selected at assembly time on mmsize.
%macro LOAD8 2
|
||||
; 32-byte (YMM) registers: one instruction loads the bytes and
; zero-extends each of them to a word.
%if mmsize == 32
|
||||
pmovzxbw %1, %2
|
||||
; Narrower registers: load the low half of the register, then interleave
; its bytes with the bytes of m7.
; NOTE(review): this only zero-extends if m7 holds all zeros; m7 is set
; outside this macro -- confirm at the call sites.
%else
|
||||
movh %1, %2
|
||||
punpcklbw %1, m7
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro LOAD12 2
|
||||
@ -45,8 +49,14 @@ SECTION .text
|
||||
%endmacro
|
||||
|
||||
; DISP8 (no arguments)
;   Narrows the 16-bit words held in m2 to bytes with unsigned saturation
;   and stores them at [dstq].
;   Overwrites m2; the mmsize == 32 path additionally overwrites xm1.
%macro DISP8 0
|
||||
; 32-byte (YMM) registers: move the upper 128-bit lane of m2 into xm1 so
; both halves can be packed into a single XMM register.
%if mmsize == 32
|
||||
vextracti128 xm1, m2, 1
|
||||
; Words of the lower lane (xm2) become the low 8 bytes, words of the
; upper lane (xm1) the high 8 bytes.
packuswb xm2, xm1
|
||||
; Unaligned 16-byte store of the packed result.
movu [dstq], xm2
|
||||
; Narrower registers: pack m2 with itself, so the low half of m2 holds
; the saturated bytes, and store only that low half.
%else
|
||||
packuswb m2, m2
|
||||
movh [dstq], m2
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro DISP12 0
|
||||
@ -244,8 +254,12 @@ cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \
|
||||
prefs, mrefs, prefs2, mrefs2, \
|
||||
prefs3, mrefs3, prefs4, \
|
||||
mrefs4, parity, clip_max
|
||||
%if mmsize == 32
|
||||
vpbroadcastw m12, WORD clip_maxm
|
||||
%else
|
||||
movd m12, DWORD clip_maxm
|
||||
SPLATW m12, m12, 0
|
||||
%endif
|
||||
%else
|
||||
cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
|
||||
prefs, mrefs, prefs2, mrefs2, \
|
||||
@ -264,3 +278,8 @@ INIT_XMM ssse3
|
||||
BWDIF
|
||||
INIT_XMM sse2
|
||||
BWDIF
|
||||
|
||||
%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
|
||||
INIT_YMM avx2
|
||||
BWDIF
|
||||
%endif
|
||||
|
@ -32,6 +32,10 @@ void ff_bwdif_filter_line_ssse3(void *dst, void *prev, void *cur, void *next,
|
||||
int w, int prefs, int mrefs, int prefs2,
|
||||
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
||||
int mrefs4, int parity, int clip_max);
|
||||
void ff_bwdif_filter_line_avx2(void *dst, void *prev, void *cur, void *next,
|
||||
int w, int prefs, int mrefs, int prefs2,
|
||||
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
||||
int mrefs4, int parity, int clip_max);
|
||||
|
||||
void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur, void *next,
|
||||
int w, int prefs, int mrefs, int prefs2,
|
||||
@ -41,6 +45,10 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne
|
||||
int w, int prefs, int mrefs, int prefs2,
|
||||
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
||||
int mrefs4, int parity, int clip_max);
|
||||
void ff_bwdif_filter_line_12bit_avx2(void *dst, void *prev, void *cur, void *next,
|
||||
int w, int prefs, int mrefs, int prefs2,
|
||||
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
||||
int mrefs4, int parity, int clip_max);
|
||||
|
||||
av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
|
||||
{
|
||||
@ -51,10 +59,14 @@ av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
|
||||
bwdif->filter_line = ff_bwdif_filter_line_sse2;
|
||||
if (EXTERNAL_SSSE3(cpu_flags))
|
||||
bwdif->filter_line = ff_bwdif_filter_line_ssse3;
|
||||
if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
|
||||
bwdif->filter_line = ff_bwdif_filter_line_avx2;
|
||||
} else if (bit_depth <= 12) {
|
||||
if (EXTERNAL_SSE2(cpu_flags))
|
||||
bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
|
||||
if (EXTERNAL_SSSE3(cpu_flags))
|
||||
bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
|
||||
if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
|
||||
bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user