1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-04-14 00:58:38 +02:00

avfilter/x86/vf_blend : add AVX2 version for each func except divide

and optimize average, grainextract, multiply, screen, grainmerge
This commit is contained in:
Martin Vignali 2018-01-17 20:59:58 +01:00
parent 4d95c6d5d7
commit 3a230ce5fa
2 changed files with 186 additions and 86 deletions

View File

@ -2,6 +2,8 @@
;* x86-optimized functions for blend filter ;* x86-optimized functions for blend filter
;* ;*
;* Copyright (C) 2015 Paul B Mahol ;* Copyright (C) 2015 Paul B Mahol
;* Copyright (C) 2018 Henrik Gramner
;* Copyright (C) 2018 Jokyo Images
;* ;*
;* This file is part of FFmpeg. ;* This file is part of FFmpeg.
;* ;*
@ -74,39 +76,36 @@ BLEND_INIT %1, 2
BLEND_END BLEND_END
%endmacro %endmacro
; GRAINEXTRACT: body of the grainextract blend, wrapped between BLEND_INIT and
; BLEND_END (both defined outside this view).
; NOTE(review): each line of this listing appears to hold the removed (left) and
; the added (right) text of the change side by side; the notes below follow the
; added (right-hand) instructions.
; Per byte the added code computes top + pw_128 - bottom on 16-bit words and
; packs the result back to bytes with unsigned saturation.
; pw_128 is defined outside this view - presumably 128 in every word (confirm).
INIT_XMM sse2 %macro GRAINEXTRACT 0
BLEND_SIMPLE xor, xor BLEND_INIT grainextract, 6
; m4 = 0: used as the high byte when widening bytes to words below.
BLEND_SIMPLE or, or pxor m4, m4
; m5 = pw_128 constant, loaded once before the row loop.
BLEND_SIMPLE and, and VBROADCASTI128 m5, [pw_128]
BLEND_SIMPLE addition, addusb
BLEND_SIMPLE subtract, subusb
BLEND_SIMPLE darken, minub
BLEND_SIMPLE lighten, maxub
BLEND_INIT grainextract, 4
pxor m2, m2
mova m3, [pw_128]
.nextrow: .nextrow:
; xq is the running byte offset within the row; the loop below exits once
; "add xq, mmsize" stops producing a negative value, so widthq is expected to be
; negative here (set up outside this view - confirm in BLEND_INIT).
mov xq, widthq mov xq, widthq
.loop: .loop:
; Added side: load a full vector from each input with unaligned loads.
movh m0, [topq + xq] movu m1, [topq + xq]
movh m1, [bottomq + xq] movu m3, [bottomq + xq]
; Added side: m0/m1 = low/high bytes of top as words, m2/m3 = same for bottom.
punpcklbw m0, m2 punpcklbw m0, m1, m4
punpcklbw m1, m2 punpckhbw m1, m4
paddw m0, m3 punpcklbw m2, m3, m4
psubw m0, m1 punpckhbw m3, m4
packuswb m0, m0
; Added side: top + pw_128 on both halves, then subtract bottom.
movh [dstq + xq], m0 paddw m0, m5
add xq, mmsize / 2 paddw m1, m5
psubw m0, m2
psubw m1, m3
; Recombine both halves into bytes (unsigned saturation) and store one full
; vector; the store uses mova, i.e. dstq + xq must be suitably aligned.
packuswb m0, m1
mova [dstq + xq], m0
; A full vector (mmsize bytes) is consumed per iteration on the added side.
add xq, mmsize
jl .loop jl .loop
BLEND_END BLEND_END
%endmacro
; MULTIPLY a, b, pw_1: word-wise approximation of a * b / 255.
;   %1 (a)    in: 16-bit operands, out: result
;   %2 (b)    in: 16-bit operands, clobbered (used as scratch)
;   %3 (pw_1) constant added before the shifts - presumably 1 per word (confirm)
; Sequence: x = a * b + %3; result = (x + (x >> 8)) >> 8.
; NOTE(review): each line of this listing appears to hold the removed (left) and
; the added (right) text side by side; the added side folds the former
; "mova + psrlw" pair into one three-operand psrlw.
%macro MULTIPLY 3 ; a, b, pw_1 %macro MULTIPLY 3 ; a, b, pw_1
pmullw %1, %2 ; xxxxxxxx a * b pmullw %1, %2 ; xxxxxxxx a * b
paddw %1, %3 paddw %1, %3
; %2 = x >> 8 (scratch copy of the biased product, shifted).
mova %2, %1 psrlw %2, %1, 8
psrlw %2, 8
paddw %1, %2 paddw %1, %2
psrlw %1, 8 ; 00xx00xx a * b / 255 psrlw %1, 8 ; 00xx00xx a * b / 255
%endmacro %endmacro
@ -118,92 +117,112 @@ BLEND_END
pxor %1, %4 ; 00xx00xx 255 - x / 255 pxor %1, %4 ; 00xx00xx 255 - x / 255
%endmacro %endmacro
; BLEND_MULTIPLY: body of the multiply blend, wrapped between BLEND_INIT and
; BLEND_END (both defined outside this view).
; NOTE(review): each line of this listing appears to hold the removed (left) and
; the added (right) text of the change side by side; the notes below follow the
; added (right-hand) instructions.
; The added code widens both inputs to words, applies the MULTIPLY macro to the
; low and the high halves, and packs the two halves back into one byte vector.
BLEND_INIT multiply, 4 %macro BLEND_MULTIPLY 0
pxor m2, m2 BLEND_INIT multiply, 6
; m4 = 0: high byte for the byte->word unpacks.
mova m3, [pw_1] pxor m4, m4
; m5 = pw_1 constant, passed to MULTIPLY as its third argument.
VBROADCASTI128 m5, [pw_1]
.nextrow: .nextrow:
; xq is the running byte offset within the row; the loop runs while the add at
; the bottom leaves xq negative (widthq presumably negative - confirm).
mov xq, widthq mov xq, widthq
.loop: .loop:
; word movu m1, [topq + xq]
; |--| movu m3, [bottomq + xq]
movh m0, [topq + xq] ; 0000xxxx punpcklbw m0, m1, m4
movh m1, [bottomq + xq] punpckhbw m1, m4
punpcklbw m0, m2 ; 00xx00xx punpcklbw m2, m3, m4
punpcklbw m1, m2 punpckhbw m3, m4
; Added side: low halves (m0 = top, m2 = bottom), then high halves (m1, m3);
; MULTIPLY leaves the result in its first argument and clobbers the second.
MULTIPLY m0, m1, m3 MULTIPLY m0, m2, m5
MULTIPLY m1, m3, m5
packuswb m0, m0 ; 0000xxxx
movh [dstq + xq], m0
add xq, mmsize / 2
; Added side: merge both halves to bytes and store a full vector with an
; aligned store, then advance by mmsize bytes.
packuswb m0, m1
mova [dstq + xq], m0
add xq, mmsize
jl .loop jl .loop
BLEND_END BLEND_END
%endmacro
; BLEND_SCREEN: body of the screen blend, wrapped between BLEND_INIT and
; BLEND_END (both defined outside this view).
; NOTE(review): each line of this listing appears to hold the removed (left) and
; the added (right) text of the change side by side; the notes below follow the
; added (right-hand) instructions.
; The added code widens both inputs to words, applies the SCREEN macro (its
; definition is only partly visible here) to the low and the high halves, and
; packs the two halves back into one byte vector.
BLEND_INIT screen, 5 %macro BLEND_SCREEN 0
pxor m2, m2 BLEND_INIT screen, 7
; m4 = 0: high byte for the byte->word unpacks.
mova m3, [pw_1] pxor m4, m4
mova m4, [pw_255]
; m5 = pw_1 and m6 = pw_255, passed to SCREEN as its third and fourth arguments.
VBROADCASTI128 m5, [pw_1]
VBROADCASTI128 m6, [pw_255]
.nextrow: .nextrow:
; xq is the running byte offset within the row; the loop runs while the add at
; the bottom leaves xq negative (widthq presumably negative - confirm).
mov xq, widthq mov xq, widthq
.loop: .loop:
movh m0, [topq + xq] ; 0000xxxx movu m1, [topq + xq]
movh m1, [bottomq + xq] movu m3, [bottomq + xq]
punpcklbw m0, m2 ; 00xx00xx punpcklbw m0, m1, m4
punpcklbw m1, m2 punpckhbw m1, m4
punpcklbw m2, m3, m4
punpckhbw m3, m4
; Added side: low halves (m0 = top, m2 = bottom), then high halves (m1, m3).
SCREEN m0, m1, m3, m4 SCREEN m0, m2, m5, m6
SCREEN m1, m3, m5, m6
packuswb m0, m0 ; 0000xxxx
movh [dstq + xq], m0
add xq, mmsize / 2
; Added side: merge both halves to bytes and store a full vector with an
; aligned store, then advance by mmsize bytes.
packuswb m0, m1
mova [dstq + xq], m0
add xq, mmsize
jl .loop jl .loop
BLEND_END BLEND_END
%endmacro
; AVERAGE: body of the average blend, wrapped between BLEND_INIT and BLEND_END
; (both defined outside this view).
; NOTE(review): lines that show two statements appear to hold the removed (left)
; and the added (right) text of the change side by side; the notes below follow
; the added (right-hand) instructions.
; pavgb rounds its byte average up; by inverting both inputs and the result the
; added code obtains the average rounded down, without widening to words.
%macro AVERAGE 0
BLEND_INIT average, 3 BLEND_INIT average, 3
; Added side: m2 = all ones (a register compared for equality with itself),
; used as the XOR mask that inverts every byte.
pxor m2, m2 pcmpeqb m2, m2
.nextrow:
; xq is the running byte offset within the row; the loop runs while the add at
; the bottom leaves xq negative (widthq presumably negative - confirm).
mov xq, widthq
.loop:
movu m0, [topq + xq]
movu m1, [bottomq + xq]
; Invert both inputs, average with round-up, invert the result.
pxor m0, m2
pxor m1, m2
pavgb m0, m1
pxor m0, m2
; Aligned store of one full vector, then advance by mmsize bytes.
mova [dstq + xq], m0
add xq, mmsize
jl .loop
BLEND_END
%endmacro
; GRAINMERGE: body of the grainmerge blend, wrapped between BLEND_INIT and
; BLEND_END (both defined outside this view).
; NOTE(review): lines that show two statements appear to hold the removed (left)
; and the added (right) text of the change side by side; the left-hand text in
; this span seems to be the removed word-based average loop followed by the
; removed grainmerge loop. The notes below follow the added (right-hand)
; instructions.
; Per byte the added code computes top + bottom - pw_128 on 16-bit words and
; packs the result back to bytes with unsigned saturation.
; pw_128 is defined outside this view - presumably 128 in every word (confirm).
%macro GRAINMERGE 0
BLEND_INIT grainmerge, 6
; m4 = 0 for the byte->word unpacks; m5 = pw_128 constant.
pxor m4, m4
VBROADCASTI128 m5, [pw_128]
.nextrow: .nextrow:
; xq is the running byte offset within the row; the loop runs while the add at
; the bottom leaves xq negative (widthq presumably negative - confirm).
mov xq, widthq mov xq, widthq
.loop: .loop:
movh m0, [topq + xq] movu m1, [topq + xq]
movh m1, [bottomq + xq] movu m3, [bottomq + xq]
; Added side: m0/m1 = low/high bytes of top as words, m2/m3 = same for bottom.
punpcklbw m0, m2 punpcklbw m0, m1, m4
punpcklbw m1, m2 punpckhbw m1, m4
paddw m0, m1 punpcklbw m2, m3, m4
psrlw m0, 1 punpckhbw m3, m4
packuswb m0, m0
; Added side: top + bottom on both halves, then subtract pw_128.
movh [dstq + xq], m0 paddw m0, m2
add xq, mmsize / 2 paddw m1, m3
jl .loop psubw m0, m5
BLEND_END psubw m1, m5
; Added side: merge both halves to bytes (unsigned saturation), aligned store
; of one full vector, advance by mmsize bytes.
BLEND_INIT grainmerge, 4 packuswb m0, m1
pxor m2, m2 mova [dstq + xq], m0
mova m3, [pw_128] add xq, mmsize
.nextrow:
mov xq, widthq
.loop:
movh m0, [topq + xq]
movh m1, [bottomq + xq]
punpcklbw m0, m2
punpcklbw m1, m2
paddw m0, m1
psubw m0, m3
packuswb m0, m0
movh [dstq + xq], m0
add xq, mmsize / 2
jl .loop jl .loop
BLEND_END BLEND_END
%endmacro
%macro HARDMIX 0
BLEND_INIT hardmix, 5 BLEND_INIT hardmix, 5
mova m2, [pb_255] VBROADCASTI128 m2, [pb_255]
mova m3, [pb_128] VBROADCASTI128 m3, [pb_128]
mova m4, [pb_127] VBROADCASTI128 m4, [pb_127]
.nextrow: .nextrow:
mov xq, widthq mov xq, widthq
@ -218,7 +237,9 @@ BLEND_INIT hardmix, 5
add xq, mmsize add xq, mmsize
jl .loop jl .loop
BLEND_END BLEND_END
%endmacro
%macro DIVIDE 0
BLEND_INIT divide, 4 BLEND_INIT divide, 4
pxor m2, m2 pxor m2, m2
mova m3, [ps_255] mova m3, [ps_255]
@ -247,9 +268,11 @@ BLEND_INIT divide, 4
jl .loop jl .loop
BLEND_END BLEND_END
%endmacro
%macro PHOENIX 0
BLEND_INIT phoenix, 4 BLEND_INIT phoenix, 4
mova m3, [pb_255] VBROADCASTI128 m3, [pb_255]
.nextrow: .nextrow:
mov xq, widthq mov xq, widthq
@ -266,6 +289,7 @@ BLEND_INIT phoenix, 4
add xq, mmsize add xq, mmsize
jl .loop jl .loop
BLEND_END BLEND_END
%endmacro
%macro BLEND_ABS 0 %macro BLEND_ABS 0
BLEND_INIT difference, 5 BLEND_INIT difference, 5
@ -291,7 +315,7 @@ BLEND_END
BLEND_INIT extremity, 8 BLEND_INIT extremity, 8
pxor m2, m2 pxor m2, m2
mova m4, [pw_255] VBROADCASTI128 m4, [pw_255]
.nextrow: .nextrow:
mov xq, widthq mov xq, widthq
@ -315,7 +339,7 @@ BLEND_END
BLEND_INIT negation, 8 BLEND_INIT negation, 8
pxor m2, m2 pxor m2, m2
mova m4, [pw_255] VBROADCASTI128 m4, [pw_255]
.nextrow: .nextrow:
mov xq, widthq mov xq, widthq
@ -341,6 +365,43 @@ BLEND_END
%endmacro %endmacro
; Instantiation of the blend macros, once per instruction set.
; NOTE(review): lines that show a statement twice appear to hold the removed
; (left) and the added (right) text of the change side by side.
; SSE2 set: every blend macro, including DIVIDE.
INIT_XMM sse2 INIT_XMM sse2
BLEND_SIMPLE xor, xor
BLEND_SIMPLE or, or
BLEND_SIMPLE and, and
BLEND_SIMPLE addition, addusb
BLEND_SIMPLE subtract, subusb
BLEND_SIMPLE darken, minub
BLEND_SIMPLE lighten, maxub
GRAINEXTRACT
BLEND_MULTIPLY
BLEND_SCREEN
AVERAGE
GRAINMERGE
HARDMIX
PHOENIX
DIVIDE
BLEND_ABS BLEND_ABS
; SSSE3 set: only BLEND_ABS is instantiated again.
INIT_XMM ssse3 INIT_XMM ssse3
BLEND_ABS BLEND_ABS
; AVX2 set: assembled only when HAVE_AVX2_EXTERNAL is set; same list as the
; SSE2 set except that DIVIDE is not instantiated.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
BLEND_SIMPLE xor, xor
BLEND_SIMPLE or, or
BLEND_SIMPLE and, and
BLEND_SIMPLE addition, addusb
BLEND_SIMPLE subtract, subusb
BLEND_SIMPLE darken, minub
BLEND_SIMPLE lighten, maxub
GRAINEXTRACT
BLEND_MULTIPLY
BLEND_SCREEN
AVERAGE
GRAINMERGE
HARDMIX
PHOENIX
BLEND_ABS
%endif

View File

@ -31,26 +31,43 @@ void ff_blend_##name##_##opt(const uint8_t *top, ptrdiff_t top_linesize, \
struct FilterParams *param, double *values, int starty); struct FilterParams *param, double *values, int starty);
/* Prototypes for the blend kernels, one per (mode, instruction set) pair.
 * BLEND_FUNC(name, opt) declares ff_blend_<name>_<opt>; the macro definition
 * starts above this span.
 * NOTE(review): lines that show a declaration twice appear to hold the removed
 * (left) and the added (right) text of the change side by side; lines with a
 * single avx2 declaration exist only on the added side. */
BLEND_FUNC(addition, sse2) BLEND_FUNC(addition, sse2)
BLEND_FUNC(addition, avx2)
BLEND_FUNC(grainmerge, sse2) BLEND_FUNC(grainmerge, sse2)
BLEND_FUNC(grainmerge, avx2)
BLEND_FUNC(average, sse2) BLEND_FUNC(average, sse2)
BLEND_FUNC(average, avx2)
BLEND_FUNC(and, sse2) BLEND_FUNC(and, sse2)
BLEND_FUNC(and, avx2)
BLEND_FUNC(darken, sse2) BLEND_FUNC(darken, sse2)
BLEND_FUNC(darken, avx2)
BLEND_FUNC(grainextract, sse2) BLEND_FUNC(grainextract, sse2)
BLEND_FUNC(grainextract, avx2)
BLEND_FUNC(multiply, sse2) BLEND_FUNC(multiply, sse2)
BLEND_FUNC(multiply, avx2)
BLEND_FUNC(screen, sse2) BLEND_FUNC(screen, sse2)
BLEND_FUNC(screen, avx2)
BLEND_FUNC(hardmix, sse2) BLEND_FUNC(hardmix, sse2)
BLEND_FUNC(hardmix, avx2)
/* divide is declared for sse2 only; no avx2 prototype follows it. */
BLEND_FUNC(divide, sse2) BLEND_FUNC(divide, sse2)
BLEND_FUNC(lighten, sse2) BLEND_FUNC(lighten, sse2)
BLEND_FUNC(lighten, avx2)
BLEND_FUNC(or, sse2) BLEND_FUNC(or, sse2)
BLEND_FUNC(or, avx2)
BLEND_FUNC(phoenix, sse2) BLEND_FUNC(phoenix, sse2)
BLEND_FUNC(phoenix, avx2)
BLEND_FUNC(subtract, sse2) BLEND_FUNC(subtract, sse2)
BLEND_FUNC(subtract, avx2)
BLEND_FUNC(xor, sse2) BLEND_FUNC(xor, sse2)
BLEND_FUNC(xor, avx2)
/* difference, extremity and negation additionally have ssse3 variants. */
BLEND_FUNC(difference, sse2) BLEND_FUNC(difference, sse2)
BLEND_FUNC(difference, ssse3) BLEND_FUNC(difference, ssse3)
BLEND_FUNC(difference, avx2)
BLEND_FUNC(extremity, sse2) BLEND_FUNC(extremity, sse2)
BLEND_FUNC(extremity, ssse3) BLEND_FUNC(extremity, ssse3)
BLEND_FUNC(extremity, avx2)
BLEND_FUNC(negation, sse2) BLEND_FUNC(negation, sse2)
BLEND_FUNC(negation, ssse3) BLEND_FUNC(negation, ssse3)
BLEND_FUNC(negation, avx2)
av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
{ {
@ -85,4 +102,26 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
case BLEND_NEGATION: param->blend = ff_blend_negation_ssse3; break; case BLEND_NEGATION: param->blend = ff_blend_negation_ssse3; break;
} }
} }
if (EXTERNAL_AVX2_FAST(cpu_flags) && param->opacity == 1 && !is_16bit) {
switch (param->mode) {
case BLEND_ADDITION: param->blend = ff_blend_addition_avx2; break;
case BLEND_GRAINMERGE: param->blend = ff_blend_grainmerge_avx2; break;
case BLEND_AND: param->blend = ff_blend_and_avx2; break;
case BLEND_AVERAGE: param->blend = ff_blend_average_avx2; break;
case BLEND_DARKEN: param->blend = ff_blend_darken_avx2; break;
case BLEND_GRAINEXTRACT: param->blend = ff_blend_grainextract_avx2; break;
case BLEND_HARDMIX: param->blend = ff_blend_hardmix_avx2; break;
case BLEND_LIGHTEN: param->blend = ff_blend_lighten_avx2; break;
case BLEND_MULTIPLY: param->blend = ff_blend_multiply_avx2; break;
case BLEND_OR: param->blend = ff_blend_or_avx2; break;
case BLEND_PHOENIX: param->blend = ff_blend_phoenix_avx2; break;
case BLEND_SCREEN: param->blend = ff_blend_screen_avx2; break;
case BLEND_SUBTRACT: param->blend = ff_blend_subtract_avx2; break;
case BLEND_XOR: param->blend = ff_blend_xor_avx2; break;
case BLEND_DIFFERENCE: param->blend = ff_blend_difference_avx2; break;
case BLEND_EXTREMITY: param->blend = ff_blend_extremity_avx2; break;
case BLEND_NEGATION: param->blend = ff_blend_negation_avx2; break;
}
}
} }