# Patch summary: add an AVX-512 (avx512icl) instantiation of the bwdif line filter.
#
# NOTE: in this copy the line breaks of the diff have been flattened to spaces,
# so the hunks below must be re-wrapped before the patch can be applied.
#
# libavfilter/x86/vf_bwdif.asm
#   - SECTION_RODATA alignment goes from 32 to 64 and each pw_* constant table
#     doubles its repeat count (times 8 -> 16, times 16 -> 32).
#   - LOAD8 and the clip_max broadcast in bwdif_filter_line_12bit now test
#     `mmsize >= 32` instead of `mmsize == 32`, so the 64-byte build takes the
#     same pmovzxbw / vpbroadcastw path as the 32-byte build.
#   - DISP8 gains an `mmsize == 64` branch: pmaxsw m2, m7 followed by a
#     vpmovuswb store to [dstq].
#     NOTE(review): presumably m7 is zero here, making the pmaxsw a clamp of
#     negative words before the unsigned-saturating narrow -- m7's setup is
#     outside the visible hunks; confirm.
#   - Two pcmpgtw sites get a `%if cpuflag(avx512)` branch that compares into
#     mask register k1 and expands it back into a vector with vpmovm2w; the
#     `%else` branch keeps the existing in-register pcmpgtw.
#   - BWDIF is instantiated once more under INIT_ZMM avx512icl, guarded by
#     `HAVE_AVX512_EXTERNAL && ARCH_X86_64`.
#     NOTE(review): the C side dispatches on EXTERNAL_AVX512ICL; confirm the
#     assembler-side guard is the intended one for an avx512icl build.
#
# libavfilter/x86/vf_bwdif_init.c
#   - Declares ff_bwdif_filter_line_avx512icl and
#     ff_bwdif_filter_line_12bit_avx512icl with the same parameter list as the
#     existing avx2 prototypes.
#   - ff_bwdif_init_x86 assigns them to bwdif->filter_line when
#     `ARCH_X86_64 && EXTERNAL_AVX512ICL(cpu_flags)`; the check is placed after
#     the AVX2 one, so it overrides the AVX2 pointer when both match.
#
diff --git a/libavfilter/x86/vf_bwdif.asm b/libavfilter/x86/vf_bwdif.asm index c93b41ec48..f52be946ca 100644 --- a/libavfilter/x86/vf_bwdif.asm +++ b/libavfilter/x86/vf_bwdif.asm @@ -26,17 +26,17 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 -pw_coefhf: times 8 dw 1016, 5570 -pw_coefhf1: times 16 dw -3801 -pw_coefsp: times 8 dw 5077, -981 -pw_splfdif: times 8 dw -768, 768 +pw_coefhf: times 16 dw 1016, 5570 +pw_coefhf1: times 32 dw -3801 +pw_coefsp: times 16 dw 5077, -981 +pw_splfdif: times 16 dw -768, 768 SECTION .text %macro LOAD8 2 - %if mmsize == 32 + %if mmsize >= 32 pmovzxbw %1, %2 %else movh %1, %2 @@ -49,7 +49,10 @@ SECTION .text %endmacro %macro DISP8 0 - %if mmsize == 32 + %if mmsize == 64 + pmaxsw m2, m7 + vpmovuswb [dstq], m2 + %elif mmsize == 32 vextracti128 xm1, m2, 1 packuswb xm2, xm1 movu [dstq], xm2 @@ -123,8 +126,13 @@ SECTION .text mova m6, m7 psubw m6, m3 pmaxsw m6, m5 +%if cpuflag(avx512) + pcmpgtw k1, m2, m7 + vpmovm2w m3, k1 +%else mova m3, m2 pcmpgtw m3, m7 +%endif pand m6, m3 pmaxsw m2, m6 mova m11, m2 @@ -169,7 +177,12 @@ SECTION .text paddw m6, m5 psubw m1, m4 ABS1 m1, m4 +%if cpuflag(avx512) + pcmpgtw k1, m1, m9 + vpmovm2w m1, k1 +%else pcmpgtw m1, m9 +%endif mova m4, m1 punpcklwd m1, m4 punpckhwd m4, m4 @@ -254,7 +267,7 @@ cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \ prefs, mrefs, prefs2, mrefs2, \ prefs3, mrefs3, prefs4, \ mrefs4, parity, clip_max - %if mmsize == 32 + %if mmsize >= 32 vpbroadcastw m12, WORD clip_maxm %else movd m12, DWORD clip_maxm @@ -283,3 +296,8 @@ BWDIF INIT_YMM avx2 BWDIF %endif + +%if HAVE_AVX512_EXTERNAL && ARCH_X86_64 +INIT_ZMM avx512icl +BWDIF +%endif diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c index 69a70e3293..76b574b2a9 100644 --- a/libavfilter/x86/vf_bwdif_init.c +++ b/libavfilter/x86/vf_bwdif_init.c @@ -36,6 +36,10 @@ void ff_bwdif_filter_line_avx2(void *dst, const void *prev, const void *cur, con int w, int 
prefs, int mrefs, int prefs2, int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs4, int parity, int clip_max); +void ff_bwdif_filter_line_avx512icl(void *dst, const void *prev, const void *cur, const void *next, + int w, int prefs, int mrefs, int prefs2, + int mrefs2, int prefs3, int mrefs3, int prefs4, + int mrefs4, int parity, int clip_max); void ff_bwdif_filter_line_12bit_sse2(void *dst, const void *prev, const void *cur, const void *next, int w, int prefs, int mrefs, int prefs2, @@ -49,6 +53,10 @@ void ff_bwdif_filter_line_12bit_avx2(void *dst, const void *prev, const void *cu int w, int prefs, int mrefs, int prefs2, int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs4, int parity, int clip_max); +void ff_bwdif_filter_line_12bit_avx512icl(void *dst, const void *prev, const void *cur, const void *next, + int w, int prefs, int mrefs, int prefs2, + int mrefs2, int prefs3, int mrefs3, int prefs4, + int mrefs4, int parity, int clip_max); av_cold void ff_bwdif_init_x86(BWDIFDSPContext *bwdif, int bit_depth) { @@ -61,6 +69,8 @@ av_cold void ff_bwdif_init_x86(BWDIFDSPContext *bwdif, int bit_depth) bwdif->filter_line = ff_bwdif_filter_line_ssse3; if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags)) bwdif->filter_line = ff_bwdif_filter_line_avx2; + if (ARCH_X86_64 && EXTERNAL_AVX512ICL(cpu_flags)) + bwdif->filter_line = ff_bwdif_filter_line_avx512icl; } else if (bit_depth <= 12) { if (EXTERNAL_SSE2(cpu_flags)) bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2; @@ -68,5 +78,7 @@ av_cold void ff_bwdif_init_x86(BWDIFDSPContext *bwdif, int bit_depth) bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3; if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags)) bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2; + if (ARCH_X86_64 && EXTERNAL_AVX512ICL(cpu_flags)) + bwdif->filter_line = ff_bwdif_filter_line_12bit_avx512icl; } }