From 53a03b5c8c7d355bd353727115efc9977aa76f28 Mon Sep 17 00:00:00 2001 From: Martin Vignali Date: Sat, 17 Feb 2018 21:01:34 +0100 Subject: [PATCH] avfilter/x86/vf_blend : add 16 bit version for BLEND_SIMPLE, phoenix, difference for SSE and AVX2 (x86_64) --- libavfilter/x86/vf_blend.asm | 75 +++++++++++++++++++++++++++------ libavfilter/x86/vf_blend_init.c | 54 ++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 13 deletions(-) diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm index 680e266348..5d9a909192 100644 --- a/libavfilter/x86/vf_blend.asm +++ b/libavfilter/x86/vf_blend.asm @@ -36,10 +36,13 @@ pb_255: times 16 db 255 SECTION .text -%macro BLEND_INIT 2 +%macro BLEND_INIT 2-3 %if ARCH_X86_64 cglobal blend_%1, 6, 9, %2, top, top_linesize, bottom, bottom_linesize, dst, dst_linesize, width, end, x mov widthd, dword widthm + %if %0 == 3; 16 bit variant: the optional third macro argument was passed + add widthq, widthq ; double width for the 16 bit variant; doesn't compile on x86_32 + %endif %else cglobal blend_%1, 5, 7, %2, top, top_linesize, bottom, bottom_linesize, dst, end, x %define dst_linesizeq r5mp @@ -61,8 +64,8 @@ cglobal blend_%1, 5, 7, %2, top, top_linesize, bottom, bottom_linesize, dst, end REP_RET %endmacro -%macro BLEND_SIMPLE 2 -BLEND_INIT %1, 2 +%macro BLEND_SIMPLE 2-3 +BLEND_INIT %1, 2, %3 .nextrow: mov xq, widthq @@ -270,8 +273,9 @@ BLEND_INIT divide, 4 BLEND_END %endmacro -%macro PHOENIX 0 -BLEND_INIT phoenix, 4 +%macro PHOENIX 2-3 +; %1 function name, %2 b or w (suffix appended to pminu/pmaxu/psubus/paddus), %3 (opt) 1 if 16 bit +BLEND_INIT %1, 4, %3 VBROADCASTI128 m3, [pb_255] .nextrow: mov xq, widthq @@ -280,19 +284,19 @@ BLEND_INIT phoenix, 4 movu m0, [topq + xq] movu m1, [bottomq + xq] mova m2, m0 - pminub m0, m1 - pmaxub m1, m2 + pminu%2 m0, m1 + pmaxu%2 m1, m2 mova m2, m3 - psubusb m2, m1 - paddusb m2, m0 + psubus%2 m2, m1 + paddus%2 m2, m0 mova [dstq + xq], m2 add xq, mmsize jl .loop BLEND_END %endmacro -%macro BLEND_ABS 0 -BLEND_INIT difference, 5 +%macro DIFFERENCE 1-2 +BLEND_INIT %1, 5, %2 pxor m2, m2 .nextrow: mov xq, widthq @@ -300,6 +304,17
@@ BLEND_INIT difference, 5 .loop: movu m0, [topq + xq] movu m1, [bottomq + xq] +%if %0 == 2 ; 16 bit: zero-extend words to dwords, subtract, take the absolute value, repack with packusdw + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + punpckhwd m4, m1, m2 + punpcklwd m1, m2 + psubd m0, m1 + psubd m3, m4 + pabsd m0, m0 + pabsd m3, m3 + packusdw m0, m3 +%else punpckhbw m3, m0, m2 punpcklbw m0, m2 punpckhbw m4, m1, m2 @@ -308,11 +323,14 @@ BLEND_INIT difference, 5 psubw m3, m4 ABS2 m0, m3, m1, m4 packuswb m0, m3 +%endif mova [dstq + xq], m0 add xq, mmsize jl .loop BLEND_END +%endmacro +%macro BLEND_ABS 0 BLEND_INIT extremity, 8 pxor m2, m2 VBROADCASTI128 m4, [pw_255] @@ -378,14 +396,32 @@ BLEND_SCREEN AVERAGE GRAINMERGE HARDMIX -PHOENIX +PHOENIX phoenix, b +DIFFERENCE difference DIVIDE BLEND_ABS +%if ARCH_X86_64 +BLEND_SIMPLE addition_16, addusw, 1 +BLEND_SIMPLE and_16, and, 1 +BLEND_SIMPLE or_16, or, 1 +BLEND_SIMPLE subtract_16, subusw, 1 +BLEND_SIMPLE xor_16, xor, 1 +%endif + INIT_XMM ssse3 +DIFFERENCE difference BLEND_ABS +INIT_XMM sse4 +%if ARCH_X86_64 +BLEND_SIMPLE darken_16, minuw, 1 +BLEND_SIMPLE lighten_16, maxuw, 1 +PHOENIX phoenix_16, w, 1 +DIFFERENCE difference_16, 1 +%endif + %if HAVE_AVX2_EXTERNAL INIT_YMM avx2 BLEND_SIMPLE xor, xor @@ -401,7 +437,20 @@ BLEND_SCREEN AVERAGE GRAINMERGE HARDMIX -PHOENIX +PHOENIX phoenix, b +DIFFERENCE difference BLEND_ABS + +%if ARCH_X86_64 +BLEND_SIMPLE addition_16, addusw, 1 +BLEND_SIMPLE and_16, and, 1 +BLEND_SIMPLE darken_16, minuw, 1 +BLEND_SIMPLE lighten_16, maxuw, 1 +BLEND_SIMPLE or_16, or, 1 +BLEND_SIMPLE subtract_16, subusw, 1 +BLEND_SIMPLE xor_16, xor, 1 +PHOENIX phoenix_16, w, 1 +DIFFERENCE difference_16, 1 +%endif %endif diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c index c9c7a52ef9..0962f6d7fd 100644 --- a/libavfilter/x86/vf_blend_init.c +++ b/libavfilter/x86/vf_blend_init.c @@ -69,6 +69,27 @@ BLEND_FUNC(negation, sse2) BLEND_FUNC(negation, ssse3) BLEND_FUNC(negation, avx2) +#if ARCH_X86_64 +BLEND_FUNC(addition_16, sse2) +BLEND_FUNC(addition_16, avx2)
+BLEND_FUNC(and_16, sse2) +BLEND_FUNC(and_16, avx2) +BLEND_FUNC(darken_16, sse4) +BLEND_FUNC(darken_16, avx2) +BLEND_FUNC(difference_16, sse4) +BLEND_FUNC(difference_16, avx2) +BLEND_FUNC(lighten_16, sse4) +BLEND_FUNC(lighten_16, avx2) +BLEND_FUNC(or_16, sse2) +BLEND_FUNC(or_16, avx2) +BLEND_FUNC(phoenix_16, sse4) +BLEND_FUNC(phoenix_16, avx2) +BLEND_FUNC(subtract_16, sse2) +BLEND_FUNC(subtract_16, avx2) +BLEND_FUNC(xor_16, sse2) +BLEND_FUNC(xor_16, avx2) +#endif /* ARCH_X86_64 */ + av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) { int cpu_flags = av_get_cpu_flags(); @@ -125,5 +146,38 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) case BLEND_NEGATION: param->blend = ff_blend_negation_avx2; break; } } + } else { /* is_16bit: 16 bit variants (x86_64 only, opacity == 1) */ +#if ARCH_X86_64 + if (EXTERNAL_SSE2(cpu_flags) && param->opacity == 1) { + switch (param->mode) { + case BLEND_ADDITION: param->blend = ff_blend_addition_16_sse2; break; + case BLEND_AND: param->blend = ff_blend_and_16_sse2; break; + case BLEND_OR: param->blend = ff_blend_or_16_sse2; break; + case BLEND_SUBTRACT: param->blend = ff_blend_subtract_16_sse2; break; + case BLEND_XOR: param->blend = ff_blend_xor_16_sse2; break; + } + } + if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1) { + switch (param->mode) { + case BLEND_DARKEN: param->blend = ff_blend_darken_16_sse4; break; + case BLEND_DIFFERENCE: param->blend = ff_blend_difference_16_sse4; break; + case BLEND_LIGHTEN: param->blend = ff_blend_lighten_16_sse4; break; + case BLEND_PHOENIX: param->blend = ff_blend_phoenix_16_sse4; break; + } + } + if (EXTERNAL_AVX2_FAST(cpu_flags) && param->opacity == 1) { + switch (param->mode) { + case BLEND_ADDITION: param->blend = ff_blend_addition_16_avx2; break; + case BLEND_AND: param->blend = ff_blend_and_16_avx2; break; + case BLEND_DARKEN: param->blend = ff_blend_darken_16_avx2; break; + case BLEND_DIFFERENCE: param->blend = ff_blend_difference_16_avx2; break; + case BLEND_LIGHTEN: param->blend =
ff_blend_lighten_16_avx2; break; + case BLEND_OR: param->blend = ff_blend_or_16_avx2; break; + case BLEND_PHOENIX: param->blend = ff_blend_phoenix_16_avx2; break; + case BLEND_SUBTRACT: param->blend = ff_blend_subtract_16_avx2; break; + case BLEND_XOR: param->blend = ff_blend_xor_16_avx2; break; + } + } +#endif /* ARCH_X86_64 */ } }