1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-08-04 22:03:09 +02:00

vf_bwdif: add AVX512 implementation

I also tried replacing some of the instructions with more elaborate ones
using masks, but found no performance gain significant enough to justify
maintaining two code paths. This implementation therefore merely replaces the
AVX2 implementation with drop-in AVX512 equivalents.

bwdif8_c:                                             6362.2 ( 1.00x)
bwdif8_sse2:                                          1004.9 ( 6.33x)
bwdif8_ssse3:                                          946.0 ( 6.73x)
bwdif8_avx2:                                           477.9 (13.31x)
bwdif8_avx512:                                         273.3 (23.28x)

bwdif10_c:                                            6341.5 ( 1.00x)
bwdif10_sse2:                                          872.4 ( 7.27x)
bwdif10_ssse3:                                         803.4 ( 7.89x)
bwdif10_avx2:                                          416.7 (15.22x)
bwdif10_avx512:                                        224.3 (28.27x)

Realtime test at 3840x2160 yuv420p:

avx2:   frame=20000 fps=3370 q=-0.0 Lsize=N/A time=00:06:40.00 bitrate=N/A speed=67.4x elapsed=0:00:05.93
avx512: frame=20000 fps=5077 q=-0.0 Lsize=N/A time=00:06:40.00 bitrate=N/A speed= 102x elapsed=0:00:03.93

Use of these functions is gated behind avx512icl so that they do not cause
downclocking on Skylake.
This commit is contained in:
Niklas Haas
2025-07-29 13:27:52 +02:00
committed by kierank
parent 586a1cd088
commit 7f00e24d70
2 changed files with 38 additions and 8 deletions

View File

@ -26,17 +26,17 @@
%include "libavutil/x86/x86util.asm" %include "libavutil/x86/x86util.asm"
SECTION_RODATA 32 SECTION_RODATA 64
pw_coefhf: times 8 dw 1016, 5570 pw_coefhf: times 16 dw 1016, 5570
pw_coefhf1: times 16 dw -3801 pw_coefhf1: times 32 dw -3801
pw_coefsp: times 8 dw 5077, -981 pw_coefsp: times 16 dw 5077, -981
pw_splfdif: times 8 dw -768, 768 pw_splfdif: times 16 dw -768, 768
SECTION .text SECTION .text
%macro LOAD8 2 %macro LOAD8 2
%if mmsize == 32 %if mmsize >= 32
pmovzxbw %1, %2 pmovzxbw %1, %2
%else %else
movh %1, %2 movh %1, %2
@ -49,7 +49,10 @@ SECTION .text
%endmacro %endmacro
%macro DISP8 0 %macro DISP8 0
%if mmsize == 32 %if mmsize == 64
pmaxsw m2, m7
vpmovuswb [dstq], m2
%elif mmsize == 32
vextracti128 xm1, m2, 1 vextracti128 xm1, m2, 1
packuswb xm2, xm1 packuswb xm2, xm1
movu [dstq], xm2 movu [dstq], xm2
@ -123,8 +126,13 @@ SECTION .text
mova m6, m7 mova m6, m7
psubw m6, m3 psubw m6, m3
pmaxsw m6, m5 pmaxsw m6, m5
%if cpuflag(avx512)
pcmpgtw k1, m2, m7
vpmovm2w m3, k1
%else
mova m3, m2 mova m3, m2
pcmpgtw m3, m7 pcmpgtw m3, m7
%endif
pand m6, m3 pand m6, m3
pmaxsw m2, m6 pmaxsw m2, m6
mova m11, m2 mova m11, m2
@ -169,7 +177,12 @@ SECTION .text
paddw m6, m5 paddw m6, m5
psubw m1, m4 psubw m1, m4
ABS1 m1, m4 ABS1 m1, m4
%if cpuflag(avx512)
pcmpgtw k1, m1, m9
vpmovm2w m1, k1
%else
pcmpgtw m1, m9 pcmpgtw m1, m9
%endif
mova m4, m1 mova m4, m1
punpcklwd m1, m4 punpcklwd m1, m4
punpckhwd m4, m4 punpckhwd m4, m4
@ -254,7 +267,7 @@ cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \
prefs, mrefs, prefs2, mrefs2, \ prefs, mrefs, prefs2, mrefs2, \
prefs3, mrefs3, prefs4, \ prefs3, mrefs3, prefs4, \
mrefs4, parity, clip_max mrefs4, parity, clip_max
%if mmsize == 32 %if mmsize >= 32
vpbroadcastw m12, WORD clip_maxm vpbroadcastw m12, WORD clip_maxm
%else %else
movd m12, DWORD clip_maxm movd m12, DWORD clip_maxm
@ -283,3 +296,8 @@ BWDIF
INIT_YMM avx2 INIT_YMM avx2
BWDIF BWDIF
%endif %endif
%if HAVE_AVX512_EXTERNAL && ARCH_X86_64
INIT_ZMM avx512icl
BWDIF
%endif

View File

@ -36,6 +36,10 @@ void ff_bwdif_filter_line_avx2(void *dst, const void *prev, const void *cur, con
int w, int prefs, int mrefs, int prefs2, int w, int prefs, int mrefs, int prefs2,
int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs2, int prefs3, int mrefs3, int prefs4,
int mrefs4, int parity, int clip_max); int mrefs4, int parity, int clip_max);
void ff_bwdif_filter_line_avx512icl(void *dst, const void *prev, const void *cur, const void *next,
int w, int prefs, int mrefs, int prefs2,
int mrefs2, int prefs3, int mrefs3, int prefs4,
int mrefs4, int parity, int clip_max);
void ff_bwdif_filter_line_12bit_sse2(void *dst, const void *prev, const void *cur, const void *next, void ff_bwdif_filter_line_12bit_sse2(void *dst, const void *prev, const void *cur, const void *next,
int w, int prefs, int mrefs, int prefs2, int w, int prefs, int mrefs, int prefs2,
@ -49,6 +53,10 @@ void ff_bwdif_filter_line_12bit_avx2(void *dst, const void *prev, const void *cu
int w, int prefs, int mrefs, int prefs2, int w, int prefs, int mrefs, int prefs2,
int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs2, int prefs3, int mrefs3, int prefs4,
int mrefs4, int parity, int clip_max); int mrefs4, int parity, int clip_max);
void ff_bwdif_filter_line_12bit_avx512icl(void *dst, const void *prev, const void *cur, const void *next,
int w, int prefs, int mrefs, int prefs2,
int mrefs2, int prefs3, int mrefs3, int prefs4,
int mrefs4, int parity, int clip_max);
av_cold void ff_bwdif_init_x86(BWDIFDSPContext *bwdif, int bit_depth) av_cold void ff_bwdif_init_x86(BWDIFDSPContext *bwdif, int bit_depth)
{ {
@ -61,6 +69,8 @@ av_cold void ff_bwdif_init_x86(BWDIFDSPContext *bwdif, int bit_depth)
bwdif->filter_line = ff_bwdif_filter_line_ssse3; bwdif->filter_line = ff_bwdif_filter_line_ssse3;
if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags)) if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_avx2; bwdif->filter_line = ff_bwdif_filter_line_avx2;
if (ARCH_X86_64 && EXTERNAL_AVX512ICL(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_avx512icl;
} else if (bit_depth <= 12) { } else if (bit_depth <= 12) {
if (EXTERNAL_SSE2(cpu_flags)) if (EXTERNAL_SSE2(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2; bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
@ -68,5 +78,7 @@ av_cold void ff_bwdif_init_x86(BWDIFDSPContext *bwdif, int bit_depth)
bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3; bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags)) if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2; bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
if (ARCH_X86_64 && EXTERNAL_AVX512ICL(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_12bit_avx512icl;
} }
} }