1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-28 20:53:54 +02:00

libavfilter/x86/vf_gblur: add ff_verti_slice_avx2/512()

The new vertical slice with AVX2/512 acceleration can significantly
improve the performance of Gaussian Filter 2D.

Performance data:
ff_verti_slice_c: 32.57
ff_verti_slice_avx2: 476.19
ff_verti_slice_avx512: 833.33

Co-authored-by: Cheng Yanfei <yanfei.cheng@intel.com>
Co-authored-by: Jin Jun <jun.i.jin@intel.com>
Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
This commit is contained in:
Wu Jianhua 2021-08-04 10:06:13 +08:00 committed by Paul B Mahol
parent 4a5e24721c
commit 68a2722aee
4 changed files with 214 additions and 8 deletions

View File

@ -50,6 +50,8 @@ typedef struct GBlurContext {
float nuV; float nuV;
int nb_planes; int nb_planes;
void (*horiz_slice)(float *buffer, int width, int height, int steps, float nu, float bscale); void (*horiz_slice)(float *buffer, int width, int height, int steps, float nu, float bscale);
void (*verti_slice)(float *buffer, int width, int height, int slice_start, int slice_end, int steps,
float nu, float bscale);
void (*postscale_slice)(float *buffer, int length, float postscale, float min, float max); void (*postscale_slice)(float *buffer, int length, float postscale, float min, float max);
} GBlurContext; } GBlurContext;

View File

@ -138,6 +138,19 @@ static void do_vertical_columns(float *buffer, int width, int height,
} }
} }
/**
 * Plain C implementation of the vertical pass over one slice of columns.
 *
 * The slice [slice_start, slice_end) is cut into a leading part whose length
 * is a multiple of 8 and a trailing remainder. Both parts are forwarded to
 * do_vertical_columns(): the leading part with a column step of 8, the
 * remainder with a column step of 1.
 */
static void verti_slice_c(float *buffer, int width, int height,
                          int slice_start, int slice_end, int steps,
                          float nu, float boundaryscale)
{
    const int column_count = slice_end - slice_start;
    /* End of the part that can be handled 8 columns at a time. */
    const int vector_end   = slice_start + ((column_count >> 3) << 3);

    /* Bulk of the slice: 8 columns per step. */
    do_vertical_columns(buffer, width, height, slice_start, vector_end,
                        steps, nu, boundaryscale, 8);

    /* Leftover columns that do not fill a group of 8: one column per step. */
    do_vertical_columns(buffer, width, height, vector_end, slice_end,
                        steps, nu, boundaryscale, 1);
}
static int filter_vertically(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) static int filter_vertically(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{ {
GBlurContext *s = ctx->priv; GBlurContext *s = ctx->priv;
@ -150,16 +163,10 @@ static int filter_vertically(AVFilterContext *ctx, void *arg, int jobnr, int nb_
const int steps = s->steps; const int steps = s->steps;
const float nu = s->nuV; const float nu = s->nuV;
float *buffer = s->buffer; float *buffer = s->buffer;
int aligned_end;
aligned_end = slice_start + (((slice_end - slice_start) >> 3) << 3); s->verti_slice(buffer, width, height, slice_start, slice_end,
/* Filter vertically along columns (process 8 columns in each step) */ steps, nu, boundaryscale);
do_vertical_columns(buffer, width, height, slice_start, aligned_end,
steps, nu, boundaryscale, 8);
/* Filter un-aligned columns one by one */
do_vertical_columns(buffer, width, height, aligned_end, slice_end,
steps, nu, boundaryscale, 1);
return 0; return 0;
} }
@ -236,6 +243,7 @@ static int query_formats(AVFilterContext *ctx)
void ff_gblur_init(GBlurContext *s) void ff_gblur_init(GBlurContext *s)
{ {
s->horiz_slice = horiz_slice_c; s->horiz_slice = horiz_slice_c;
s->verti_slice = verti_slice_c;
s->postscale_slice = postscale_c; s->postscale_slice = postscale_c;
if (ARCH_X86) if (ARCH_X86)
ff_gblur_init_x86(s); ff_gblur_init_x86(s);

View File

@ -22,6 +22,43 @@
SECTION .text SECTION .text
; Vector register sizes in bytes. The macros below compare mmsize against
; AVX2_MMSIZE to choose between their AVX2 body and their AVX-512 body.
%xdefine AVX2_MMSIZE 32
%xdefine AVX512_MMSIZE 64
; MOVSXDIFNIDN name [, name ...]
; Variadic wrapper: for every argument name given, emit
;     movsxdifnidn <name>q, <name>d
; i.e. widen the 32-bit form of that named register into its 64-bit form.
; (movsxdifnidn itself is not defined in this file; presumably it comes from
; the shared x86 assembly support include.)
%macro MOVSXDIFNIDN 1-*
%rep %0
movsxdifnidn %1q, %1d
; advance to the next argument name
%rotate 1
%endrep
%endmacro
; PUSH_MASK vecmask, kmask, bits, tmp, scratch
; Turn the bitmask held in a general-purpose register into a lane mask.
;   %1  vector register that receives the mask (AVX2 path only)
;   %2  opmask register that receives the mask (non-AVX2 path only)
;   %3  32-bit register holding one bit per lane, lane 0 in bit 0;
;       shifted out and therefore destroyed on the AVX2 path
;   %4  32-bit scratch register (AVX2 path only)
;   %5  address of scratch memory, at least mmsize bytes (AVX2 path only)
%macro PUSH_MASK 5
%if mmsize == AVX2_MMSIZE
; one dword lane per 4 bytes of vector register
%assign %%n mmsize/4
%assign %%i 0
%rep %%n
; isolate the lowest remaining bit and expand it to 0 or 0xFFFFFFFF
mov %4, %3
and %4, 1
neg %4
; store that dword as lane %%i of the in-memory mask
mov dword [%5 + %%i*4], %4
; move on to the bit of the next lane
sar %3, 1
%assign %%i %%i+1
%endrep
; load the complete mask from scratch memory into the vector register
movu %1, [%5]
%else
; opmask registers take the bit pattern directly
kmovd %2, %3
%endif
%endmacro
; VMASKMOVPS dst, src, vecmask, kmask
; Masked move of dword lanes between a register and memory (either direction,
; depending on which of dst/src is the memory operand).
;   %1  destination
;   %2  source
;   %3  vector mask as produced by PUSH_MASK (used on the AVX2 path)
;   %4  opmask register as produced by PUSH_MASK (used on the other path)
; The non-AVX2 path copies the low 16 bits of %4 into k7 and uses k7 as the
; write mask, so k7 is clobbered.
%macro VMASKMOVPS 4
%if mmsize == AVX2_MMSIZE
vpmaskmovd %1, %3, %2
%else
kmovw k7, %4
vmovups %1{k7}, %2
%endif
%endmacro
; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, ; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps,
; float nu, float bscale) ; float nu, float bscale)
@ -232,3 +269,155 @@ POSTSCALE_SLICE
INIT_ZMM avx512 INIT_ZMM avx512
POSTSCALE_SLICE POSTSCALE_SLICE
%endif %endif
;*******************************************************************************
; void ff_verti_slice(float *buffer, int width, int height, int column_begin,
;                     int column_end, int steps, float nu, float bscale);
;
; Vertical pass over the columns [column_begin, column_end) of a float image
; stored with a row pitch of `width` floats. For every group of columns the
; following is repeated `steps` times:
;   - the top row is multiplied by bscale,
;   - going down, each row becomes  row[y] + nu * row[y-1],
;   - the bottom row is multiplied by bscale,
;   - going up, each row becomes    row[y] + nu * row[y+1].
; Full groups of mmsize/4 columns use unmasked vector loads/stores; the
; remaining columns are handled once with masked loads/stores.
;
; Register/stack usage:
;   m0 = nu (broadcast), m1 = bscale (broadcast), m2 = running value,
;   m3 = loaded row, m4 / k1 = tail lane mask (k7 is clobbered by VMASKMOVPS).
;   [rsp + 0x10] = saved x while xq is reused as scratch,
;   [rsp + 0x20] = scratch area PUSH_MASK uses to build the AVX2 mask.
;
; NOTE(review): every loop below tests its condition at the bottom, so the
; body runs at least once. With height < 2 the downward loop would still touch
; row 1, and with steps < 1 one step would still be applied - confirm that
; callers guarantee height >= 2 and steps >= 1.
;*******************************************************************************
%macro VERTI_SLICE 0
%if UNIX64
cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \
                                           steps, x, y, cwidth, step, ptr, stride
%else
cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \
                                           steps, nu, bscale, x, y, cwidth, step, \
                                           ptr, stride
%endif
%assign cols mmsize/4
%if WIN64
    ; nu and bscale are read through their memory forms here
    VBROADCASTSS m0, num
    VBROADCASTSS m1, bscalem
    DEFINE_ARGS buffer, width, height, cbegin, cend, \
                steps, x, y, cwidth, step, ptr, stride
%else
    VBROADCASTSS m0, xmm0 ; nu
    VBROADCASTSS m1, xmm1 ; bscale
%endif
    ; The int arguments are used as 64-bit registers below (address math and
    ; 64-bit compares), so they must be sign-extended on every ABI, not only
    ; on WIN64: the upper halves of 32-bit arguments are not guaranteed to be
    ; meaningful on entry.
    MOVSXDIFNIDN width, height, cbegin, cend, steps

    ; cwidth = number of columns in this slice
    mov     cwidthq, cendq
    sub     cwidthq, cbeginq
    ; stride = row pitch in bytes
    lea     strideq, [widthq * 4]

    xor     xq, xq ; x = 0
    ; Nothing to do for an empty (or inverted) column range. This has to be
    ; tested before the "fewer than cols" test, otherwise it can never trigger
    ; and an empty range would run the whole masked tail with an all-zero mask.
    cmp     cwidthq, 0x0
    jle .end_scalar
    ; fewer columns than one vector: only the masked tail is needed
    cmp     cwidthq, cols
    jl .x_scalar

    ; cwidth now holds the last x at which a full vector still fits
    sub     cwidthq, cols
.loop_x:
    xor     stepq, stepq
    .loop_step:
        ; ptr = buffer + x + column_begin;
        lea     ptrq, [xq + cbeginq]
        lea     ptrq, [bufferq + ptrq*4]

        ; ptr[cols-1:0] *= bscale;
        movu    m2, [ptrq]
        mulps   m2, m1
        movu    [ptrq], m2

        ; Filter downwards
        mov     yq, 1
        .loop_y_down:
            add     ptrq, strideq ; ptr += width
            movu    m3, [ptrq]
            FMULADD_PS m2, m2, m0, m3, m2 ; m2 = m2 * nu + row
            movu    [ptrq], m2

            inc     yq
            cmp     yq, heightq
            jl .loop_y_down

        ; scale the bottom row
        mulps   m2, m1
        movu    [ptrq], m2

        ; Filter upwards
        dec     yq
        .loop_y_up:
            sub     ptrq, strideq
            movu    m3, [ptrq]
            FMULADD_PS m2, m2, m0, m3, m2 ; m2 = m2 * nu + row
            movu    [ptrq], m2

            dec     yq
            cmp     yq, 0
            jg .loop_y_up

        inc     stepq
        cmp     stepq, stepsq
        jl .loop_step

    add     xq, cols
    cmp     xq, cwidthq
    jle .loop_x

    ; restore the real column count and stop if no columns are left over
    add     cwidthq, cols
    cmp     xq, cwidthq
    jge .end_scalar

.x_scalar:
    xor     stepq, stepq
    ; xq is needed as scratch while the mask is built; park x on the stack
    mov     qword [rsp + 0x10], xq
    ; remaining = cwidth - x; bitmask = (1 << remaining) - 1
    sub     cwidthq, xq
    mov     xq, 1
    shlx    cwidthq, xq, cwidthq
    sub     cwidthq, 1
    PUSH_MASK m4, k1, cwidthd, xd, rsp + 0x20
    mov     xq, qword [rsp + 0x10]

    .loop_step_scalar:
        ; ptr = buffer + x + column_begin;
        lea     ptrq, [xq + cbeginq]
        lea     ptrq, [bufferq + ptrq*4]

        ; scale the top row (masked lanes only)
        VMASKMOVPS m2, [ptrq], m4, k1
        mulps   m2, m1
        VMASKMOVPS [ptrq], m2, m4, k1

        ; Filter downwards
        mov     yq, 1
        .x_scalar_loop_y_down:
            add     ptrq, strideq
            VMASKMOVPS m3, [ptrq], m4, k1
            FMULADD_PS m2, m2, m0, m3, m2 ; m2 = m2 * nu + row
            VMASKMOVPS [ptrq], m2, m4, k1

            inc     yq
            cmp     yq, heightq
            jl .x_scalar_loop_y_down

        ; scale the bottom row (masked lanes only)
        mulps   m2, m1
        VMASKMOVPS [ptrq], m2, m4, k1

        ; Filter upwards
        dec     yq
        .x_scalar_loop_y_up:
            sub     ptrq, strideq
            VMASKMOVPS m3, [ptrq], m4, k1
            FMULADD_PS m2, m2, m0, m3, m2 ; m2 = m2 * nu + row
            VMASKMOVPS [ptrq], m2, m4, k1

            dec     yq
            cmp     yq, 0
            jg .x_scalar_loop_y_up

        inc     stepq
        cmp     stepq, stepsq
        jl .loop_step_scalar

.end_scalar:
    RET
%endmacro
; Expand VERTI_SLICE once per instruction set. Both expansions are limited to
; x86-64 builds, and each is additionally guarded by its HAVE_*_EXTERNAL
; switch (presumably set by the build configuration when the assembler
; supports that extension).
%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
; ymm registers, avx2 function suffix
INIT_YMM avx2
VERTI_SLICE
%endif
%if HAVE_AVX512_EXTERNAL
; zmm registers, avx512 function suffix
INIT_ZMM avx512
VERTI_SLICE
%endif
%endif

View File

@ -31,6 +31,11 @@ void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min,
void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max); void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
void ff_postscale_slice_avx512(float *ptr, int length, float postscale, float min, float max); void ff_postscale_slice_avx512(float *ptr, int length, float postscale, float min, float max);
/*
 * Assembly implementations of the vertical pass (same signature as the
 * GBlurContext.verti_slice callback). They filter the columns
 * [column_begin, column_end) of buffer, whose rows are width floats apart.
 */
void ff_verti_slice_avx2(float *buffer, int width, int height, int column_begin, int column_end,
int steps, float nu, float bscale);
void ff_verti_slice_avx512(float *buffer, int width, int height, int column_begin, int column_end,
int steps, float nu, float bscale);
av_cold void ff_gblur_init_x86(GBlurContext *s) av_cold void ff_gblur_init_x86(GBlurContext *s)
{ {
int cpu_flags = av_get_cpu_flags(); int cpu_flags = av_get_cpu_flags();
@ -47,9 +52,11 @@ av_cold void ff_gblur_init_x86(GBlurContext *s)
} }
if (EXTERNAL_AVX2(cpu_flags)) { if (EXTERNAL_AVX2(cpu_flags)) {
s->horiz_slice = ff_horiz_slice_avx2; s->horiz_slice = ff_horiz_slice_avx2;
s->verti_slice = ff_verti_slice_avx2;
} }
if (EXTERNAL_AVX512(cpu_flags)) { if (EXTERNAL_AVX512(cpu_flags)) {
s->postscale_slice = ff_postscale_slice_avx512; s->postscale_slice = ff_postscale_slice_avx512;
s->verti_slice = ff_verti_slice_avx512;
} }
#endif #endif
} }