You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-08-04 22:03:09 +02:00
vf_bwdif: add AVX512 implementation
I also tried replacing some of the instructions with more elaborate ones using masks, but I found no performance gain significant enough to be worth maintaining two code paths, so this implementation merely swaps the instructions of the AVX2 implementation for drop-in AVX512 equivalents.

Benchmarks:
bwdif8_c: 6362.2 ( 1.00x)
bwdif8_sse2: 1004.9 ( 6.33x)
bwdif8_ssse3: 946.0 ( 6.73x)
bwdif8_avx2: 477.9 (13.31x)
bwdif8_avx512: 273.3 (23.28x)
bwdif10_c: 6341.5 ( 1.00x)
bwdif10_sse2: 872.4 ( 7.27x)
bwdif10_ssse3: 803.4 ( 7.89x)
bwdif10_avx2: 416.7 (15.22x)
bwdif10_avx512: 224.3 (28.27x)

Realtime test at 3840x2160 yuv420p:
avx2: frame=20000 fps=3370 q=-0.0 Lsize=N/A time=00:06:40.00 bitrate=N/A speed=67.4x elapsed=0:00:05.93
avx512: frame=20000 fps=5077 q=-0.0 Lsize=N/A time=00:06:40.00 bitrate=N/A speed= 102x elapsed=0:00:03.93

The use of this function is gated behind the avx512icl CPU flag so that it is not used on Skylake, where it would cause downclocking.
This commit is contained in:
@ -26,17 +26,17 @@
|
|||||||
|
|
||||||
%include "libavutil/x86/x86util.asm"
|
%include "libavutil/x86/x86util.asm"
|
||||||
|
|
||||||
SECTION_RODATA 32
|
SECTION_RODATA 64
|
||||||
|
|
||||||
pw_coefhf: times 8 dw 1016, 5570
|
pw_coefhf: times 16 dw 1016, 5570
|
||||||
pw_coefhf1: times 16 dw -3801
|
pw_coefhf1: times 32 dw -3801
|
||||||
pw_coefsp: times 8 dw 5077, -981
|
pw_coefsp: times 16 dw 5077, -981
|
||||||
pw_splfdif: times 8 dw -768, 768
|
pw_splfdif: times 16 dw -768, 768
|
||||||
|
|
||||||
SECTION .text
|
SECTION .text
|
||||||
|
|
||||||
%macro LOAD8 2
|
%macro LOAD8 2
|
||||||
%if mmsize == 32
|
%if mmsize >= 32
|
||||||
pmovzxbw %1, %2
|
pmovzxbw %1, %2
|
||||||
%else
|
%else
|
||||||
movh %1, %2
|
movh %1, %2
|
||||||
@ -49,7 +49,10 @@ SECTION .text
|
|||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro DISP8 0
|
%macro DISP8 0
|
||||||
%if mmsize == 32
|
%if mmsize == 64
|
||||||
|
pmaxsw m2, m7
|
||||||
|
vpmovuswb [dstq], m2
|
||||||
|
%elif mmsize == 32
|
||||||
vextracti128 xm1, m2, 1
|
vextracti128 xm1, m2, 1
|
||||||
packuswb xm2, xm1
|
packuswb xm2, xm1
|
||||||
movu [dstq], xm2
|
movu [dstq], xm2
|
||||||
@ -123,8 +126,13 @@ SECTION .text
|
|||||||
mova m6, m7
|
mova m6, m7
|
||||||
psubw m6, m3
|
psubw m6, m3
|
||||||
pmaxsw m6, m5
|
pmaxsw m6, m5
|
||||||
|
%if cpuflag(avx512)
|
||||||
|
pcmpgtw k1, m2, m7
|
||||||
|
vpmovm2w m3, k1
|
||||||
|
%else
|
||||||
mova m3, m2
|
mova m3, m2
|
||||||
pcmpgtw m3, m7
|
pcmpgtw m3, m7
|
||||||
|
%endif
|
||||||
pand m6, m3
|
pand m6, m3
|
||||||
pmaxsw m2, m6
|
pmaxsw m2, m6
|
||||||
mova m11, m2
|
mova m11, m2
|
||||||
@ -169,7 +177,12 @@ SECTION .text
|
|||||||
paddw m6, m5
|
paddw m6, m5
|
||||||
psubw m1, m4
|
psubw m1, m4
|
||||||
ABS1 m1, m4
|
ABS1 m1, m4
|
||||||
|
%if cpuflag(avx512)
|
||||||
|
pcmpgtw k1, m1, m9
|
||||||
|
vpmovm2w m1, k1
|
||||||
|
%else
|
||||||
pcmpgtw m1, m9
|
pcmpgtw m1, m9
|
||||||
|
%endif
|
||||||
mova m4, m1
|
mova m4, m1
|
||||||
punpcklwd m1, m4
|
punpcklwd m1, m4
|
||||||
punpckhwd m4, m4
|
punpckhwd m4, m4
|
||||||
@ -254,7 +267,7 @@ cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \
|
|||||||
prefs, mrefs, prefs2, mrefs2, \
|
prefs, mrefs, prefs2, mrefs2, \
|
||||||
prefs3, mrefs3, prefs4, \
|
prefs3, mrefs3, prefs4, \
|
||||||
mrefs4, parity, clip_max
|
mrefs4, parity, clip_max
|
||||||
%if mmsize == 32
|
%if mmsize >= 32
|
||||||
vpbroadcastw m12, WORD clip_maxm
|
vpbroadcastw m12, WORD clip_maxm
|
||||||
%else
|
%else
|
||||||
movd m12, DWORD clip_maxm
|
movd m12, DWORD clip_maxm
|
||||||
@ -283,3 +296,8 @@ BWDIF
|
|||||||
INIT_YMM avx2
|
INIT_YMM avx2
|
||||||
BWDIF
|
BWDIF
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
|
%if HAVE_AVX512_EXTERNAL && ARCH_X86_64
|
||||||
|
INIT_ZMM avx512icl
|
||||||
|
BWDIF
|
||||||
|
%endif
|
||||||
|
@ -36,6 +36,10 @@ void ff_bwdif_filter_line_avx2(void *dst, const void *prev, const void *cur, con
|
|||||||
int w, int prefs, int mrefs, int prefs2,
|
int w, int prefs, int mrefs, int prefs2,
|
||||||
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
||||||
int mrefs4, int parity, int clip_max);
|
int mrefs4, int parity, int clip_max);
|
||||||
|
void ff_bwdif_filter_line_avx512icl(void *dst, const void *prev, const void *cur, const void *next,
|
||||||
|
int w, int prefs, int mrefs, int prefs2,
|
||||||
|
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
||||||
|
int mrefs4, int parity, int clip_max);
|
||||||
|
|
||||||
void ff_bwdif_filter_line_12bit_sse2(void *dst, const void *prev, const void *cur, const void *next,
|
void ff_bwdif_filter_line_12bit_sse2(void *dst, const void *prev, const void *cur, const void *next,
|
||||||
int w, int prefs, int mrefs, int prefs2,
|
int w, int prefs, int mrefs, int prefs2,
|
||||||
@ -49,6 +53,10 @@ void ff_bwdif_filter_line_12bit_avx2(void *dst, const void *prev, const void *cu
|
|||||||
int w, int prefs, int mrefs, int prefs2,
|
int w, int prefs, int mrefs, int prefs2,
|
||||||
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
||||||
int mrefs4, int parity, int clip_max);
|
int mrefs4, int parity, int clip_max);
|
||||||
|
void ff_bwdif_filter_line_12bit_avx512icl(void *dst, const void *prev, const void *cur, const void *next,
|
||||||
|
int w, int prefs, int mrefs, int prefs2,
|
||||||
|
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
||||||
|
int mrefs4, int parity, int clip_max);
|
||||||
|
|
||||||
av_cold void ff_bwdif_init_x86(BWDIFDSPContext *bwdif, int bit_depth)
|
av_cold void ff_bwdif_init_x86(BWDIFDSPContext *bwdif, int bit_depth)
|
||||||
{
|
{
|
||||||
@ -61,6 +69,8 @@ av_cold void ff_bwdif_init_x86(BWDIFDSPContext *bwdif, int bit_depth)
|
|||||||
bwdif->filter_line = ff_bwdif_filter_line_ssse3;
|
bwdif->filter_line = ff_bwdif_filter_line_ssse3;
|
||||||
if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags))
|
if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags))
|
||||||
bwdif->filter_line = ff_bwdif_filter_line_avx2;
|
bwdif->filter_line = ff_bwdif_filter_line_avx2;
|
||||||
|
if (ARCH_X86_64 && EXTERNAL_AVX512ICL(cpu_flags))
|
||||||
|
bwdif->filter_line = ff_bwdif_filter_line_avx512icl;
|
||||||
} else if (bit_depth <= 12) {
|
} else if (bit_depth <= 12) {
|
||||||
if (EXTERNAL_SSE2(cpu_flags))
|
if (EXTERNAL_SSE2(cpu_flags))
|
||||||
bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
|
bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
|
||||||
@ -68,5 +78,7 @@ av_cold void ff_bwdif_init_x86(BWDIFDSPContext *bwdif, int bit_depth)
|
|||||||
bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
|
bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
|
||||||
if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags))
|
if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags))
|
||||||
bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
|
bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
|
||||||
|
if (ARCH_X86_64 && EXTERNAL_AVX512ICL(cpu_flags))
|
||||||
|
bwdif->filter_line = ff_bwdif_filter_line_12bit_avx512icl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user