mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-28 20:53:54 +02:00
libavfilter/x86/vf_gblur: add ff_verti_slice_avx2/512()
The new vertical slice function with AVX2/AVX-512 acceleration significantly improves the performance of the 2D Gaussian filter. Performance data: ff_verti_slice_c: 32.57; ff_verti_slice_avx2: 476.19; ff_verti_slice_avx512: 833.33. Co-authored-by: Cheng Yanfei <yanfei.cheng@intel.com> Co-authored-by: Jin Jun <jun.i.jin@intel.com> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
This commit is contained in:
parent
4a5e24721c
commit
68a2722aee
@ -50,6 +50,8 @@ typedef struct GBlurContext {
|
|||||||
float nuV;
|
float nuV;
|
||||||
int nb_planes;
|
int nb_planes;
|
||||||
void (*horiz_slice)(float *buffer, int width, int height, int steps, float nu, float bscale);
|
void (*horiz_slice)(float *buffer, int width, int height, int steps, float nu, float bscale);
|
||||||
|
void (*verti_slice)(float *buffer, int width, int height, int slice_start, int slice_end, int steps,
|
||||||
|
float nu, float bscale);
|
||||||
void (*postscale_slice)(float *buffer, int length, float postscale, float min, float max);
|
void (*postscale_slice)(float *buffer, int length, float postscale, float min, float max);
|
||||||
} GBlurContext;
|
} GBlurContext;
|
||||||
|
|
||||||
|
@ -138,6 +138,19 @@ static void do_vertical_columns(float *buffer, int width, int height,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Vertical blur pass over one slice of columns (plain C version).
 *
 * The slice is divided in two: a leading run of columns whose count is a
 * multiple of 8, filtered 8 columns per step, and the leftover columns,
 * filtered one at a time. Every other argument is handed through unchanged
 * to do_vertical_columns().
 */
static void verti_slice_c(float *buffer, int width, int height,
                          int slice_start, int slice_end, int steps,
                          float nu, float boundaryscale)
{
    const int slice_cols  = slice_end - slice_start;
    /* Slice width rounded down to a multiple of 8 columns */
    const int vector_cols = (slice_cols >> 3) << 3;
    const int split       = slice_start + vector_cols;

    /* Bulk of the slice: 8 columns per step */
    do_vertical_columns(buffer, width, height, slice_start, split,
                        steps, nu, boundaryscale, 8);

    /* Tail of the slice: remaining columns, one per step */
    do_vertical_columns(buffer, width, height, split, slice_end,
                        steps, nu, boundaryscale, 1);
}
|
||||||
|
|
||||||
static int filter_vertically(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
|
static int filter_vertically(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
|
||||||
{
|
{
|
||||||
GBlurContext *s = ctx->priv;
|
GBlurContext *s = ctx->priv;
|
||||||
@ -150,16 +163,10 @@ static int filter_vertically(AVFilterContext *ctx, void *arg, int jobnr, int nb_
|
|||||||
const int steps = s->steps;
|
const int steps = s->steps;
|
||||||
const float nu = s->nuV;
|
const float nu = s->nuV;
|
||||||
float *buffer = s->buffer;
|
float *buffer = s->buffer;
|
||||||
int aligned_end;
|
|
||||||
|
|
||||||
aligned_end = slice_start + (((slice_end - slice_start) >> 3) << 3);
|
s->verti_slice(buffer, width, height, slice_start, slice_end,
|
||||||
/* Filter vertically along columns (process 8 columns in each step) */
|
steps, nu, boundaryscale);
|
||||||
do_vertical_columns(buffer, width, height, slice_start, aligned_end,
|
|
||||||
steps, nu, boundaryscale, 8);
|
|
||||||
|
|
||||||
/* Filter un-aligned columns one by one */
|
|
||||||
do_vertical_columns(buffer, width, height, aligned_end, slice_end,
|
|
||||||
steps, nu, boundaryscale, 1);
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -236,6 +243,7 @@ static int query_formats(AVFilterContext *ctx)
|
|||||||
void ff_gblur_init(GBlurContext *s)
|
void ff_gblur_init(GBlurContext *s)
|
||||||
{
|
{
|
||||||
s->horiz_slice = horiz_slice_c;
|
s->horiz_slice = horiz_slice_c;
|
||||||
|
s->verti_slice = verti_slice_c;
|
||||||
s->postscale_slice = postscale_c;
|
s->postscale_slice = postscale_c;
|
||||||
if (ARCH_X86)
|
if (ARCH_X86)
|
||||||
ff_gblur_init_x86(s);
|
ff_gblur_init_x86(s);
|
||||||
|
@ -22,6 +22,43 @@
|
|||||||
|
|
||||||
SECTION .text
|
SECTION .text
|
||||||
|
|
||||||
|
; Constants compared against mmsize to select the AVX2 code path
; (see the "%if mmsize == AVX2_MMSIZE" tests in PUSH_MASK and VMASKMOVPS).
%xdefine AVX2_MMSIZE 32
|
||||||
|
%xdefine AVX512_MMSIZE 64
|
||||||
|
|
||||||
|
; MOVSXDIFNIDN name [, name ...]
; Emits "movsxdifnidn <name>q, <name>d" once for every argument given.
; movsxdifnidn itself is defined outside this file section; presumably it
; sign-extends the 32-bit register into the 64-bit one - TODO confirm.
%macro MOVSXDIFNIDN 1-*
|
||||||
|
; %0 is the number of arguments passed to the macro
%rep %0
|
||||||
|
movsxdifnidn %1q, %1d
|
||||||
|
; shift the argument list so that %1 names the next argument
%rotate 1
|
||||||
|
%endrep
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; PUSH_MASK vecmask, kmask, bits, tmp, scratch
; Turns a bit mask held in a general purpose register into a lane mask.
;   %1  vector register that receives the mask (AVX2 build only)
;   %2  opmask register that receives the mask (non-AVX2 build only)
;   %3  register holding one bit per dword lane, bit 0 = lane 0
;   %4  scratch register (AVX2 build only)
;   %5  address of scratch memory; mmsize bytes are written (AVX2 build only)
; In the AVX2 build %3 is shifted right once per lane, so its value is consumed.
%macro PUSH_MASK 5
|
||||||
|
%if mmsize == AVX2_MMSIZE
|
||||||
|
; number of dword lanes in one vector register
%assign %%n mmsize/4
|
||||||
|
%assign %%i 0
|
||||||
|
; unrolled at assembly time: one iteration per lane
%rep %%n
|
||||||
|
mov %4, %3
|
||||||
|
; isolate the bit belonging to the current lane
and %4, 1
|
||||||
|
; 0 stays 0, 1 becomes -1 (all bits set)
neg %4
|
||||||
|
mov dword [%5 + %%i*4], %4
|
||||||
|
; move the next lane's bit into bit 0
sar %3, 1
|
||||||
|
%assign %%i %%i+1
|
||||||
|
%endrep
|
||||||
|
; load the per-lane dwords built above as the vector mask
movu %1, [%5]
|
||||||
|
%else
|
||||||
|
; non-AVX2 build: the bit mask is used directly as an opmask
kmovd %2, %3
|
||||||
|
%endif
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; VMASKMOVPS dst, src, vecmask, kmask
; Masked move from %2 to %1.
;   AVX2 build:  vpmaskmovd using the vector mask %3 (%4 is unused).
;   otherwise:   copies the opmask %4 into k7 and performs a k7-masked
;                vmovups (%3 is unused; k7 is overwritten).
%macro VMASKMOVPS 4
|
||||||
|
%if mmsize == AVX2_MMSIZE
|
||||||
|
vpmaskmovd %1, %3, %2
|
||||||
|
%else
|
||||||
|
kmovw k7, %4
|
||||||
|
vmovups %1{k7}, %2
|
||||||
|
%endif
|
||||||
|
%endmacro
|
||||||
|
|
||||||
; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps,
|
; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps,
|
||||||
; float nu, float bscale)
|
; float nu, float bscale)
|
||||||
|
|
||||||
@ -232,3 +269,155 @@ POSTSCALE_SLICE
|
|||||||
INIT_ZMM avx512
|
INIT_ZMM avx512
|
||||||
POSTSCALE_SLICE
|
POSTSCALE_SLICE
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
|
|
||||||
|
;*******************************************************************************
|
||||||
|
; void ff_verti_slice(float *buffer, int width, int height, int column_begin,
|
||||||
|
; int column_end, int steps, float nu, float bscale);
|
||||||
|
;*******************************************************************************
|
||||||
|
; VERTI_SLICE: expands to the verti_slice function for the instruction set
; selected by the preceding INIT_YMM / INIT_ZMM.
;
; Arguments: buffer, width, height, cbegin, cend, steps in general purpose
; registers; nu and bscale are taken from xmm0/xmm1 on UNIX64 and from
; num/bscalem on WIN64.
;
; The columns [cbegin, cend) of buffer (rows are width floats apart) are
; processed in groups of cols = mmsize/4 columns; a final partial group is
; handled with masked loads/stores. For every group, `steps` times:
;   1. row 0 is multiplied by bscale
;   2. going down, each row becomes FMULADD_PS(previous result, nu, row)
;      (FMULADD_PS is defined elsewhere; presumably previous * nu + row)
;   3. the last row is multiplied by bscale
;   4. going up, the same recurrence is applied back to row 0
;
; Register use: m0 = nu, m1 = bscale, m2 = running row, m3 = loaded row,
; m4 / k1 = lane mask of the partial group.
; Stack use: [rsp + 0x10] saves x, [rsp + 0x20] is scratch for PUSH_MASK.
;
; NOTE(review): the row loops test y only after touching row 1, so the code
; looks like it expects height >= 2 - confirm that callers guarantee this.
%macro VERTI_SLICE 0
%if UNIX64
cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \
                                         steps, x, y, cwidth, step, ptr, stride
%else
cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \
                                         steps, nu, bscale, x, y, cwidth, step, \
                                         ptr, stride
%endif
%assign cols mmsize/4
%if WIN64
    VBROADCASTSS m0, num
    VBROADCASTSS m1, bscalem
    DEFINE_ARGS buffer, width, height, cbegin, cend, \
                steps, x, y, cwidth, step, ptr, stride
%else
    VBROADCASTSS m0, xmm0 ; nu
    VBROADCASTSS m1, xmm1 ; bscale
%endif
    ; The int arguments are used as 64-bit registers below, so they must be
    ; widened on every ABI, not only on WIN64.
    MOVSXDIFNIDN width, height, cbegin, cend, steps

    mov cwidthq, cendq
    sub cwidthq, cbeginq
    lea strideq, [widthq * 4]           ; row pitch in bytes

    xor xq, xq ; x = 0
    ; An empty (or inverted) column range has nothing to filter. This test has
    ; to come before the cols comparison, otherwise it can never be reached.
    cmp cwidthq, 0x0
    jle .end_scalar
    cmp cwidthq, cols
    jl .x_scalar

    ; loop while x <= cwidth - cols, i.e. while a full group still fits
    sub cwidthq, cols
.loop_x:
    xor stepq, stepq
    .loop_step:
        ; ptr = buffer + x + column_begin;
        lea ptrq, [xq + cbeginq]
        lea ptrq, [bufferq + ptrq*4]

        ; ptr[cols-1:0] *= bscale;
        movu m2, [ptrq]
        mulps m2, m1
        movu [ptrq], m2

        ; Filter downwards
        mov yq, 1
        .loop_y_down:
            add ptrq, strideq ; ptr += width
            movu m3, [ptrq]
            FMULADD_PS m2, m2, m0, m3, m2
            movu [ptrq], m2

            inc yq
            cmp yq, heightq
            jl .loop_y_down

        ; last row: ptr[cols-1:0] *= bscale;
        mulps m2, m1
        movu [ptrq], m2

        ; Filter upwards
        dec yq
        .loop_y_up:
            sub ptrq, strideq
            movu m3, [ptrq]
            FMULADD_PS m2, m2, m0, m3, m2
            movu [ptrq], m2

            dec yq
            cmp yq, 0
            jg .loop_y_up

        inc stepq
        cmp stepq, stepsq
        jl .loop_step

    add xq, cols
    cmp xq, cwidthq
    jle .loop_x

    ; restore the real column count and stop if no partial group is left
    add cwidthq, cols
    cmp xq, cwidthq
    jge .end_scalar

.x_scalar:
    xor stepq, stepq
    ; x doubles as scratch while the mask is built, so park it on the stack
    mov qword [rsp + 0x10], xq
    ; remaining = cwidth - x; mask bits = (1 << remaining) - 1
    sub cwidthq, xq
    mov xq, 1
    shlx cwidthq, xq, cwidthq
    sub cwidthq, 1
    PUSH_MASK m4, k1, cwidthd, xd, rsp + 0x20
    mov xq, qword [rsp + 0x10]

    .loop_step_scalar:
        ; ptr = buffer + x + column_begin;
        lea ptrq, [xq + cbeginq]
        lea ptrq, [bufferq + ptrq*4]

        ; first row *= bscale (masked lanes only)
        VMASKMOVPS m2, [ptrq], m4, k1
        mulps m2, m1
        VMASKMOVPS [ptrq], m2, m4, k1

        ; Filter downwards
        mov yq, 1
        .x_scalar_loop_y_down:
            add ptrq, strideq
            VMASKMOVPS m3, [ptrq], m4, k1
            FMULADD_PS m2, m2, m0, m3, m2
            VMASKMOVPS [ptrq], m2, m4, k1

            inc yq
            cmp yq, heightq
            jl .x_scalar_loop_y_down

        ; last row *= bscale (masked lanes only)
        mulps m2, m1
        VMASKMOVPS [ptrq], m2, m4, k1

        ; Filter upwards
        dec yq
        .x_scalar_loop_y_up:
            sub ptrq, strideq
            VMASKMOVPS m3, [ptrq], m4, k1
            FMULADD_PS m2, m2, m0, m3, m2
            VMASKMOVPS [ptrq], m2, m4, k1

            dec yq
            cmp yq, 0
            jg .x_scalar_loop_y_up

        inc stepq
        cmp stepq, stepsq
        jl .loop_step_scalar

.end_scalar:
    RET
%endmacro
|
||||||
|
|
||||||
|
; Expand VERTI_SLICE once per instruction set; only assembled for x86-64.
%if ARCH_X86_64
|
||||||
|
%if HAVE_AVX2_EXTERNAL
|
||||||
|
; ymm build of VERTI_SLICE
INIT_YMM avx2
|
||||||
|
VERTI_SLICE
|
||||||
|
%endif
|
||||||
|
|
||||||
|
%if HAVE_AVX512_EXTERNAL
|
||||||
|
; zmm build of VERTI_SLICE
INIT_ZMM avx512
|
||||||
|
VERTI_SLICE
|
||||||
|
%endif
|
||||||
|
%endif
|
||||||
|
@ -31,6 +31,11 @@ void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min,
|
|||||||
void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
|
void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
|
||||||
void ff_postscale_slice_avx512(float *ptr, int length, float postscale, float min, float max);
|
void ff_postscale_slice_avx512(float *ptr, int length, float postscale, float min, float max);
|
||||||
|
|
||||||
|
void ff_verti_slice_avx2(float *buffer, int width, int height, int column_begin, int column_end,
|
||||||
|
int steps, float nu, float bscale);
|
||||||
|
void ff_verti_slice_avx512(float *buffer, int width, int height, int column_begin, int column_end,
|
||||||
|
int steps, float nu, float bscale);
|
||||||
|
|
||||||
av_cold void ff_gblur_init_x86(GBlurContext *s)
|
av_cold void ff_gblur_init_x86(GBlurContext *s)
|
||||||
{
|
{
|
||||||
int cpu_flags = av_get_cpu_flags();
|
int cpu_flags = av_get_cpu_flags();
|
||||||
@ -47,9 +52,11 @@ av_cold void ff_gblur_init_x86(GBlurContext *s)
|
|||||||
}
|
}
|
||||||
if (EXTERNAL_AVX2(cpu_flags)) {
|
if (EXTERNAL_AVX2(cpu_flags)) {
|
||||||
s->horiz_slice = ff_horiz_slice_avx2;
|
s->horiz_slice = ff_horiz_slice_avx2;
|
||||||
|
s->verti_slice = ff_verti_slice_avx2;
|
||||||
}
|
}
|
||||||
if (EXTERNAL_AVX512(cpu_flags)) {
|
if (EXTERNAL_AVX512(cpu_flags)) {
|
||||||
s->postscale_slice = ff_postscale_slice_avx512;
|
s->postscale_slice = ff_postscale_slice_avx512;
|
||||||
|
s->verti_slice = ff_verti_slice_avx512;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user