mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-04-14 00:58:38 +02:00
avfilter/x86/vf_blend : avfilter/x86/vf_blend : add AVX2 version for each func except divide
and optimize average, grainextract, multiply, screen, grain merge
This commit is contained in:
parent
4d95c6d5d7
commit
3a230ce5fa
@ -2,6 +2,8 @@
|
|||||||
;* x86-optimized functions for blend filter
|
;* x86-optimized functions for blend filter
|
||||||
;*
|
;*
|
||||||
;* Copyright (C) 2015 Paul B Mahol
|
;* Copyright (C) 2015 Paul B Mahol
|
||||||
|
;* Copyright (C) 2018 Henrik Gramner
|
||||||
|
;* Copyright (C) 2018 Jokyo Images
|
||||||
;*
|
;*
|
||||||
;* This file is part of FFmpeg.
|
;* This file is part of FFmpeg.
|
||||||
;*
|
;*
|
||||||
@ -74,39 +76,36 @@ BLEND_INIT %1, 2
|
|||||||
BLEND_END
|
BLEND_END
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
INIT_XMM sse2
|
%macro GRAINEXTRACT 0
|
||||||
BLEND_SIMPLE xor, xor
|
BLEND_INIT grainextract, 6
|
||||||
BLEND_SIMPLE or, or
|
pxor m4, m4
|
||||||
BLEND_SIMPLE and, and
|
VBROADCASTI128 m5, [pw_128]
|
||||||
BLEND_SIMPLE addition, addusb
|
|
||||||
BLEND_SIMPLE subtract, subusb
|
|
||||||
BLEND_SIMPLE darken, minub
|
|
||||||
BLEND_SIMPLE lighten, maxub
|
|
||||||
|
|
||||||
BLEND_INIT grainextract, 4
|
|
||||||
pxor m2, m2
|
|
||||||
mova m3, [pw_128]
|
|
||||||
.nextrow:
|
.nextrow:
|
||||||
mov xq, widthq
|
mov xq, widthq
|
||||||
|
|
||||||
.loop:
|
.loop:
|
||||||
movh m0, [topq + xq]
|
movu m1, [topq + xq]
|
||||||
movh m1, [bottomq + xq]
|
movu m3, [bottomq + xq]
|
||||||
punpcklbw m0, m2
|
punpcklbw m0, m1, m4
|
||||||
punpcklbw m1, m2
|
punpckhbw m1, m4
|
||||||
paddw m0, m3
|
punpcklbw m2, m3, m4
|
||||||
psubw m0, m1
|
punpckhbw m3, m4
|
||||||
packuswb m0, m0
|
|
||||||
movh [dstq + xq], m0
|
paddw m0, m5
|
||||||
add xq, mmsize / 2
|
paddw m1, m5
|
||||||
|
psubw m0, m2
|
||||||
|
psubw m1, m3
|
||||||
|
|
||||||
|
packuswb m0, m1
|
||||||
|
mova [dstq + xq], m0
|
||||||
|
add xq, mmsize
|
||||||
jl .loop
|
jl .loop
|
||||||
BLEND_END
|
BLEND_END
|
||||||
|
%endmacro
|
||||||
|
|
||||||
%macro MULTIPLY 3 ; a, b, pw_1
|
%macro MULTIPLY 3 ; a, b, pw_1
|
||||||
pmullw %1, %2 ; xxxxxxxx a * b
|
pmullw %1, %2 ; xxxxxxxx a * b
|
||||||
paddw %1, %3
|
paddw %1, %3
|
||||||
mova %2, %1
|
psrlw %2, %1, 8
|
||||||
psrlw %2, 8
|
|
||||||
paddw %1, %2
|
paddw %1, %2
|
||||||
psrlw %1, 8 ; 00xx00xx a * b / 255
|
psrlw %1, 8 ; 00xx00xx a * b / 255
|
||||||
%endmacro
|
%endmacro
|
||||||
@ -118,92 +117,112 @@ BLEND_END
|
|||||||
pxor %1, %4 ; 00xx00xx 255 - x / 255
|
pxor %1, %4 ; 00xx00xx 255 - x / 255
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
BLEND_INIT multiply, 4
|
%macro BLEND_MULTIPLY 0
|
||||||
pxor m2, m2
|
BLEND_INIT multiply, 6
|
||||||
mova m3, [pw_1]
|
pxor m4, m4
|
||||||
|
VBROADCASTI128 m5, [pw_1]
|
||||||
.nextrow:
|
.nextrow:
|
||||||
mov xq, widthq
|
mov xq, widthq
|
||||||
|
|
||||||
.loop:
|
.loop:
|
||||||
; word
|
movu m1, [topq + xq]
|
||||||
; |--|
|
movu m3, [bottomq + xq]
|
||||||
movh m0, [topq + xq] ; 0000xxxx
|
punpcklbw m0, m1, m4
|
||||||
movh m1, [bottomq + xq]
|
punpckhbw m1, m4
|
||||||
punpcklbw m0, m2 ; 00xx00xx
|
punpcklbw m2, m3, m4
|
||||||
punpcklbw m1, m2
|
punpckhbw m3, m4
|
||||||
|
|
||||||
MULTIPLY m0, m1, m3
|
MULTIPLY m0, m2, m5
|
||||||
|
MULTIPLY m1, m3, m5
|
||||||
packuswb m0, m0 ; 0000xxxx
|
|
||||||
movh [dstq + xq], m0
|
|
||||||
add xq, mmsize / 2
|
|
||||||
|
|
||||||
|
packuswb m0, m1
|
||||||
|
mova [dstq + xq], m0
|
||||||
|
add xq, mmsize
|
||||||
jl .loop
|
jl .loop
|
||||||
BLEND_END
|
BLEND_END
|
||||||
|
%endmacro
|
||||||
|
|
||||||
BLEND_INIT screen, 5
|
%macro BLEND_SCREEN 0
|
||||||
pxor m2, m2
|
BLEND_INIT screen, 7
|
||||||
mova m3, [pw_1]
|
pxor m4, m4
|
||||||
mova m4, [pw_255]
|
|
||||||
|
VBROADCASTI128 m5, [pw_1]
|
||||||
|
VBROADCASTI128 m6, [pw_255]
|
||||||
.nextrow:
|
.nextrow:
|
||||||
mov xq, widthq
|
mov xq, widthq
|
||||||
|
|
||||||
.loop:
|
.loop:
|
||||||
movh m0, [topq + xq] ; 0000xxxx
|
movu m1, [topq + xq]
|
||||||
movh m1, [bottomq + xq]
|
movu m3, [bottomq + xq]
|
||||||
punpcklbw m0, m2 ; 00xx00xx
|
punpcklbw m0, m1, m4
|
||||||
punpcklbw m1, m2
|
punpckhbw m1, m4
|
||||||
|
punpcklbw m2, m3, m4
|
||||||
|
punpckhbw m3, m4
|
||||||
|
|
||||||
SCREEN m0, m1, m3, m4
|
SCREEN m0, m2, m5, m6
|
||||||
|
SCREEN m1, m3, m5, m6
|
||||||
packuswb m0, m0 ; 0000xxxx
|
|
||||||
movh [dstq + xq], m0
|
|
||||||
add xq, mmsize / 2
|
|
||||||
|
|
||||||
|
packuswb m0, m1
|
||||||
|
mova [dstq + xq], m0
|
||||||
|
add xq, mmsize
|
||||||
jl .loop
|
jl .loop
|
||||||
BLEND_END
|
BLEND_END
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro AVERAGE 0
|
||||||
BLEND_INIT average, 3
|
BLEND_INIT average, 3
|
||||||
pxor m2, m2
|
pcmpeqb m2, m2
|
||||||
|
|
||||||
|
.nextrow:
|
||||||
|
mov xq, widthq
|
||||||
|
|
||||||
|
.loop:
|
||||||
|
movu m0, [topq + xq]
|
||||||
|
movu m1, [bottomq + xq]
|
||||||
|
pxor m0, m2
|
||||||
|
pxor m1, m2
|
||||||
|
pavgb m0, m1
|
||||||
|
pxor m0, m2
|
||||||
|
mova [dstq + xq], m0
|
||||||
|
add xq, mmsize
|
||||||
|
jl .loop
|
||||||
|
BLEND_END
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
|
||||||
|
%macro GRAINMERGE 0
|
||||||
|
BLEND_INIT grainmerge, 6
|
||||||
|
pxor m4, m4
|
||||||
|
|
||||||
|
VBROADCASTI128 m5, [pw_128]
|
||||||
.nextrow:
|
.nextrow:
|
||||||
mov xq, widthq
|
mov xq, widthq
|
||||||
|
|
||||||
.loop:
|
.loop:
|
||||||
movh m0, [topq + xq]
|
movu m1, [topq + xq]
|
||||||
movh m1, [bottomq + xq]
|
movu m3, [bottomq + xq]
|
||||||
punpcklbw m0, m2
|
punpcklbw m0, m1, m4
|
||||||
punpcklbw m1, m2
|
punpckhbw m1, m4
|
||||||
paddw m0, m1
|
punpcklbw m2, m3, m4
|
||||||
psrlw m0, 1
|
punpckhbw m3, m4
|
||||||
packuswb m0, m0
|
|
||||||
movh [dstq + xq], m0
|
paddw m0, m2
|
||||||
add xq, mmsize / 2
|
paddw m1, m3
|
||||||
jl .loop
|
psubw m0, m5
|
||||||
BLEND_END
|
psubw m1, m5
|
||||||
|
|
||||||
BLEND_INIT grainmerge, 4
|
packuswb m0, m1
|
||||||
pxor m2, m2
|
mova [dstq + xq], m0
|
||||||
mova m3, [pw_128]
|
add xq, mmsize
|
||||||
.nextrow:
|
|
||||||
mov xq, widthq
|
|
||||||
|
|
||||||
.loop:
|
|
||||||
movh m0, [topq + xq]
|
|
||||||
movh m1, [bottomq + xq]
|
|
||||||
punpcklbw m0, m2
|
|
||||||
punpcklbw m1, m2
|
|
||||||
paddw m0, m1
|
|
||||||
psubw m0, m3
|
|
||||||
packuswb m0, m0
|
|
||||||
movh [dstq + xq], m0
|
|
||||||
add xq, mmsize / 2
|
|
||||||
jl .loop
|
jl .loop
|
||||||
BLEND_END
|
BLEND_END
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro HARDMIX 0
|
||||||
BLEND_INIT hardmix, 5
|
BLEND_INIT hardmix, 5
|
||||||
mova m2, [pb_255]
|
VBROADCASTI128 m2, [pb_255]
|
||||||
mova m3, [pb_128]
|
VBROADCASTI128 m3, [pb_128]
|
||||||
mova m4, [pb_127]
|
VBROADCASTI128 m4, [pb_127]
|
||||||
.nextrow:
|
.nextrow:
|
||||||
mov xq, widthq
|
mov xq, widthq
|
||||||
|
|
||||||
@ -218,7 +237,9 @@ BLEND_INIT hardmix, 5
|
|||||||
add xq, mmsize
|
add xq, mmsize
|
||||||
jl .loop
|
jl .loop
|
||||||
BLEND_END
|
BLEND_END
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro DIVIDE 0
|
||||||
BLEND_INIT divide, 4
|
BLEND_INIT divide, 4
|
||||||
pxor m2, m2
|
pxor m2, m2
|
||||||
mova m3, [ps_255]
|
mova m3, [ps_255]
|
||||||
@ -247,9 +268,11 @@ BLEND_INIT divide, 4
|
|||||||
|
|
||||||
jl .loop
|
jl .loop
|
||||||
BLEND_END
|
BLEND_END
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro PHOENIX 0
|
||||||
BLEND_INIT phoenix, 4
|
BLEND_INIT phoenix, 4
|
||||||
mova m3, [pb_255]
|
VBROADCASTI128 m3, [pb_255]
|
||||||
.nextrow:
|
.nextrow:
|
||||||
mov xq, widthq
|
mov xq, widthq
|
||||||
|
|
||||||
@ -266,6 +289,7 @@ BLEND_INIT phoenix, 4
|
|||||||
add xq, mmsize
|
add xq, mmsize
|
||||||
jl .loop
|
jl .loop
|
||||||
BLEND_END
|
BLEND_END
|
||||||
|
%endmacro
|
||||||
|
|
||||||
%macro BLEND_ABS 0
|
%macro BLEND_ABS 0
|
||||||
BLEND_INIT difference, 5
|
BLEND_INIT difference, 5
|
||||||
@ -291,7 +315,7 @@ BLEND_END
|
|||||||
|
|
||||||
BLEND_INIT extremity, 8
|
BLEND_INIT extremity, 8
|
||||||
pxor m2, m2
|
pxor m2, m2
|
||||||
mova m4, [pw_255]
|
VBROADCASTI128 m4, [pw_255]
|
||||||
.nextrow:
|
.nextrow:
|
||||||
mov xq, widthq
|
mov xq, widthq
|
||||||
|
|
||||||
@ -315,7 +339,7 @@ BLEND_END
|
|||||||
|
|
||||||
BLEND_INIT negation, 8
|
BLEND_INIT negation, 8
|
||||||
pxor m2, m2
|
pxor m2, m2
|
||||||
mova m4, [pw_255]
|
VBROADCASTI128 m4, [pw_255]
|
||||||
.nextrow:
|
.nextrow:
|
||||||
mov xq, widthq
|
mov xq, widthq
|
||||||
|
|
||||||
@ -341,6 +365,43 @@ BLEND_END
|
|||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
INIT_XMM sse2
|
INIT_XMM sse2
|
||||||
|
BLEND_SIMPLE xor, xor
|
||||||
|
BLEND_SIMPLE or, or
|
||||||
|
BLEND_SIMPLE and, and
|
||||||
|
BLEND_SIMPLE addition, addusb
|
||||||
|
BLEND_SIMPLE subtract, subusb
|
||||||
|
BLEND_SIMPLE darken, minub
|
||||||
|
BLEND_SIMPLE lighten, maxub
|
||||||
|
GRAINEXTRACT
|
||||||
|
BLEND_MULTIPLY
|
||||||
|
BLEND_SCREEN
|
||||||
|
AVERAGE
|
||||||
|
GRAINMERGE
|
||||||
|
HARDMIX
|
||||||
|
PHOENIX
|
||||||
|
DIVIDE
|
||||||
|
|
||||||
BLEND_ABS
|
BLEND_ABS
|
||||||
|
|
||||||
INIT_XMM ssse3
|
INIT_XMM ssse3
|
||||||
BLEND_ABS
|
BLEND_ABS
|
||||||
|
|
||||||
|
%if HAVE_AVX2_EXTERNAL
|
||||||
|
INIT_YMM avx2
|
||||||
|
BLEND_SIMPLE xor, xor
|
||||||
|
BLEND_SIMPLE or, or
|
||||||
|
BLEND_SIMPLE and, and
|
||||||
|
BLEND_SIMPLE addition, addusb
|
||||||
|
BLEND_SIMPLE subtract, subusb
|
||||||
|
BLEND_SIMPLE darken, minub
|
||||||
|
BLEND_SIMPLE lighten, maxub
|
||||||
|
GRAINEXTRACT
|
||||||
|
BLEND_MULTIPLY
|
||||||
|
BLEND_SCREEN
|
||||||
|
AVERAGE
|
||||||
|
GRAINMERGE
|
||||||
|
HARDMIX
|
||||||
|
PHOENIX
|
||||||
|
|
||||||
|
BLEND_ABS
|
||||||
|
%endif
|
||||||
|
@ -31,26 +31,43 @@ void ff_blend_##name##_##opt(const uint8_t *top, ptrdiff_t top_linesize, \
|
|||||||
struct FilterParams *param, double *values, int starty);
|
struct FilterParams *param, double *values, int starty);
|
||||||
|
|
||||||
BLEND_FUNC(addition, sse2)
|
BLEND_FUNC(addition, sse2)
|
||||||
|
BLEND_FUNC(addition, avx2)
|
||||||
BLEND_FUNC(grainmerge, sse2)
|
BLEND_FUNC(grainmerge, sse2)
|
||||||
|
BLEND_FUNC(grainmerge, avx2)
|
||||||
BLEND_FUNC(average, sse2)
|
BLEND_FUNC(average, sse2)
|
||||||
|
BLEND_FUNC(average, avx2)
|
||||||
BLEND_FUNC(and, sse2)
|
BLEND_FUNC(and, sse2)
|
||||||
|
BLEND_FUNC(and, avx2)
|
||||||
BLEND_FUNC(darken, sse2)
|
BLEND_FUNC(darken, sse2)
|
||||||
|
BLEND_FUNC(darken, avx2)
|
||||||
BLEND_FUNC(grainextract, sse2)
|
BLEND_FUNC(grainextract, sse2)
|
||||||
|
BLEND_FUNC(grainextract, avx2)
|
||||||
BLEND_FUNC(multiply, sse2)
|
BLEND_FUNC(multiply, sse2)
|
||||||
|
BLEND_FUNC(multiply, avx2)
|
||||||
BLEND_FUNC(screen, sse2)
|
BLEND_FUNC(screen, sse2)
|
||||||
|
BLEND_FUNC(screen, avx2)
|
||||||
BLEND_FUNC(hardmix, sse2)
|
BLEND_FUNC(hardmix, sse2)
|
||||||
|
BLEND_FUNC(hardmix, avx2)
|
||||||
BLEND_FUNC(divide, sse2)
|
BLEND_FUNC(divide, sse2)
|
||||||
BLEND_FUNC(lighten, sse2)
|
BLEND_FUNC(lighten, sse2)
|
||||||
|
BLEND_FUNC(lighten, avx2)
|
||||||
BLEND_FUNC(or, sse2)
|
BLEND_FUNC(or, sse2)
|
||||||
|
BLEND_FUNC(or, avx2)
|
||||||
BLEND_FUNC(phoenix, sse2)
|
BLEND_FUNC(phoenix, sse2)
|
||||||
|
BLEND_FUNC(phoenix, avx2)
|
||||||
BLEND_FUNC(subtract, sse2)
|
BLEND_FUNC(subtract, sse2)
|
||||||
|
BLEND_FUNC(subtract, avx2)
|
||||||
BLEND_FUNC(xor, sse2)
|
BLEND_FUNC(xor, sse2)
|
||||||
|
BLEND_FUNC(xor, avx2)
|
||||||
BLEND_FUNC(difference, sse2)
|
BLEND_FUNC(difference, sse2)
|
||||||
BLEND_FUNC(difference, ssse3)
|
BLEND_FUNC(difference, ssse3)
|
||||||
|
BLEND_FUNC(difference, avx2)
|
||||||
BLEND_FUNC(extremity, sse2)
|
BLEND_FUNC(extremity, sse2)
|
||||||
BLEND_FUNC(extremity, ssse3)
|
BLEND_FUNC(extremity, ssse3)
|
||||||
|
BLEND_FUNC(extremity, avx2)
|
||||||
BLEND_FUNC(negation, sse2)
|
BLEND_FUNC(negation, sse2)
|
||||||
BLEND_FUNC(negation, ssse3)
|
BLEND_FUNC(negation, ssse3)
|
||||||
|
BLEND_FUNC(negation, avx2)
|
||||||
|
|
||||||
av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
|
av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
|
||||||
{
|
{
|
||||||
@ -85,4 +102,26 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
|
|||||||
case BLEND_NEGATION: param->blend = ff_blend_negation_ssse3; break;
|
case BLEND_NEGATION: param->blend = ff_blend_negation_ssse3; break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (EXTERNAL_AVX2_FAST(cpu_flags) && param->opacity == 1 && !is_16bit) {
|
||||||
|
switch (param->mode) {
|
||||||
|
case BLEND_ADDITION: param->blend = ff_blend_addition_avx2; break;
|
||||||
|
case BLEND_GRAINMERGE: param->blend = ff_blend_grainmerge_avx2; break;
|
||||||
|
case BLEND_AND: param->blend = ff_blend_and_avx2; break;
|
||||||
|
case BLEND_AVERAGE: param->blend = ff_blend_average_avx2; break;
|
||||||
|
case BLEND_DARKEN: param->blend = ff_blend_darken_avx2; break;
|
||||||
|
case BLEND_GRAINEXTRACT: param->blend = ff_blend_grainextract_avx2; break;
|
||||||
|
case BLEND_HARDMIX: param->blend = ff_blend_hardmix_avx2; break;
|
||||||
|
case BLEND_LIGHTEN: param->blend = ff_blend_lighten_avx2; break;
|
||||||
|
case BLEND_MULTIPLY: param->blend = ff_blend_multiply_avx2; break;
|
||||||
|
case BLEND_OR: param->blend = ff_blend_or_avx2; break;
|
||||||
|
case BLEND_PHOENIX: param->blend = ff_blend_phoenix_avx2; break;
|
||||||
|
case BLEND_SCREEN: param->blend = ff_blend_screen_avx2; break;
|
||||||
|
case BLEND_SUBTRACT: param->blend = ff_blend_subtract_avx2; break;
|
||||||
|
case BLEND_XOR: param->blend = ff_blend_xor_avx2; break;
|
||||||
|
case BLEND_DIFFERENCE: param->blend = ff_blend_difference_avx2; break;
|
||||||
|
case BLEND_EXTREMITY: param->blend = ff_blend_extremity_avx2; break;
|
||||||
|
case BLEND_NEGATION: param->blend = ff_blend_negation_avx2; break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user