libavfilter/x86/vf_gblur: add ff_verti_slice_avx2/512()

The new vertical slice implementations with AVX2/AVX-512 acceleration significantly improve the performance of the 2D Gaussian blur filter.

Performance data:
ff_verti_slice_c: 32.57
ff_verti_slice_avx2: 476.19
ff_verti_slice_avx512: 833.33

Co-authored-by: Cheng Yanfei <yanfei.cheng@intel.com>
Co-authored-by: Jin Jun <jun.i.jin@intel.com>
Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
2025-08-10 06:10:52 +02:00 · 2021-08-04 10:06:13 +08:00
parent 4a5e24721c
commit 68a2722aee
4 changed files with 214 additions and 8 deletions
--- a/libavfilter/gblur.h
+++ b/libavfilter/gblur.h
@@ -50,6 +50,8 @@ typedef struct GBlurContext {
    float nuV;
    int nb_planes;
    void (*horiz_slice)(float *buffer, int width, int height, int steps, float nu, float bscale);
    void (*verti_slice)(float *buffer, int width, int height, int slice_start, int slice_end, int steps,
                            float nu, float bscale);
    void (*postscale_slice)(float *buffer, int length, float postscale, float min, float max);
 } GBlurContext;
--- a/libavfilter/vf_gblur.c
+++ b/libavfilter/vf_gblur.c
@@ -138,6 +138,19 @@ static void do_vertical_columns(float *buffer, int width, int height,
    }
 }
 /**
  * Plain C implementation of the vertical blur pass for the columns
  * [slice_start, slice_end).
  *
  * The slice is split in two: a leading run whose length is a multiple of 8,
  * processed 8 columns at a time, and the remaining (fewer than 8) columns,
  * processed one at a time. Both parts are delegated to do_vertical_columns().
  */
 static void verti_slice_c(float *buffer, int width, int height,
                           int slice_start, int slice_end, int steps,
                           float nu, float boundaryscale)
 {
    const int span        = slice_end - slice_start;
    /* span rounded down to a multiple of 8 */
    const int batched_len = (span >> 3) << 3;
    const int tail_start  = slice_start + batched_len;

    /* Bulk of the slice: columns handled in groups of 8 */
    do_vertical_columns(buffer, width, height,
                        slice_start, tail_start,
                        steps, nu, boundaryscale, 8);

    /* Leftover columns that do not fill a group of 8 */
    do_vertical_columns(buffer, width, height,
                        tail_start, slice_end,
                        steps, nu, boundaryscale, 1);
 }
 static int filter_vertically(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
 {
    GBlurContext *s = ctx->priv;
@@ -150,16 +163,10 @@ static int filter_vertically(AVFilterContext *ctx, void *arg, int jobnr, int nb_
    const int steps = s->steps;
    const float nu = s->nuV;
    float *buffer = s->buffer;
    int aligned_end;
-    aligned_end = slice_start + (((slice_end - slice_start) >> 3) << 3);
+    s->verti_slice(buffer, width, height, slice_start, slice_end,
-    /* Filter vertically along columns (process 8 columns in each step) */
+                   steps, nu, boundaryscale);
    do_vertical_columns(buffer, width, height, slice_start, aligned_end,
                        steps, nu, boundaryscale, 8);
    /* Filter un-aligned columns one by one */
    do_vertical_columns(buffer, width, height, aligned_end, slice_end,
                        steps, nu, boundaryscale, 1);
    return 0;
 }
@@ -236,6 +243,7 @@ static int query_formats(AVFilterContext *ctx)
 void ff_gblur_init(GBlurContext *s)
 {
    s->horiz_slice = horiz_slice_c;
    s->verti_slice = verti_slice_c;
    s->postscale_slice = postscale_c;
    if (ARCH_X86)
        ff_gblur_init_x86(s);
--- a/libavfilter/x86/vf_gblur.asm
+++ b/libavfilter/x86/vf_gblur.asm
@@ -22,6 +22,43 @@
 SECTION .text
 ; Vector register sizes in bytes (YMM = 32, ZMM = 64). AVX2_MMSIZE is compared
 ; against mmsize by the macros below to pick the AVX2 code path; any other
 ; size takes the AVX-512 path.
 %xdefine AVX2_MMSIZE   32
 %xdefine AVX512_MMSIZE 64
 ; MOVSXDIFNIDN name [, name ...]
 ; Invokes movsxdifnidn once per listed argument name, with the q (64-bit)
 ; form of the name as destination and the d (32-bit) form as source.
 ; NOTE(review): movsxdifnidn itself is defined outside this excerpt
 ; (presumably by x86inc, as a sign-extending move) - confirm.
 %macro MOVSXDIFNIDN 1-*
    ; one iteration per macro argument
    %rep %0
        movsxdifnidn %1q, %1d
        ; make the next argument visible as %1
        %rotate 1
    %endrep
 %endmacro
 ; PUSH_MASK vmask, kmask, bits, tmp, mem
 ; Turns the bitmask held in %3 (bit 0 = lane 0, one bit per 32-bit lane) into
 ; the mask operand consumed by VMASKMOVPS.
 ;   %1  vector register that receives the mask        (AVX2 path only)
 ;   %2  opmask register that receives the mask        (non-AVX2 path only)
 ;   %3  32-bit GPR holding the bitmask; the AVX2 path shifts it right once
 ;       per lane, so its value is destroyed there
 ;   %4  32-bit scratch GPR                            (AVX2 path only)
 ;   %5  address of mmsize bytes of scratch memory     (AVX2 path only)
 %macro PUSH_MASK 5
 %if mmsize == AVX2_MMSIZE
    ; %%n = number of 32-bit lanes in a vector
    %assign %%n mmsize/4
    %assign %%i 0
    %rep %%n
        mov %4, %3
        and %4, 1 ; isolate the bit of lane %%i
        neg %4 ; 1 -> 0xFFFFFFFF (lane enabled), 0 -> 0 (lane disabled)
        mov dword [%5 + %%i*4], %4
        sar %3, 1 ; bring the next lane's bit down to bit 0
        %assign %%i %%i+1
    %endrep
    movu %1, [%5] ; load the expanded per-lane mask
 %else
    kmovd %2, %3
 %endif
 %endmacro
 ; VMASKMOVPS dst, src, vmask, kmask
 ; Masked move of 32-bit lanes from %2 to %1. VERTI_SLICE uses it both as a
 ; load (dst = register, src = memory) and as a store (dst = memory).
 ;   %3  vector mask as built by PUSH_MASK   (AVX2 path)
 ;   %4  opmask register as built by PUSH_MASK (non-AVX2 path)
 ; The non-AVX2 path copies %4 into k7 before the move, so k7 is clobbered.
 %macro VMASKMOVPS 4
 %if mmsize == AVX2_MMSIZE
    vpmaskmovd %1, %3, %2
 %else
    kmovw k7, %4
    vmovups %1{k7}, %2
 %endif
 %endmacro
 ; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps,
 ;                          float nu, float bscale)
@@ -232,3 +269,155 @@ POSTSCALE_SLICE
 INIT_ZMM avx512
 POSTSCALE_SLICE
 %endif
 ;*******************************************************************************
 ; void ff_verti_slice(float *buffer, int width, int height, int column_begin,
 ;                     int column_end, int steps, float nu, float bscale);
 ;
 ; Vertical pass over the columns [column_begin, column_end) of buffer, whose
 ; rows are width floats apart. mmsize/4 adjacent columns are processed per
 ; vector; a trailing partial group is handled with masked loads and stores.
 ; For every column group, repeated steps times:
 ;   row 0 *= bscale
 ;   downwards, rows 1..height-1:  row[y] = prev * nu + row[y]
 ;   last row *= bscale
 ;   upwards, rows height-2..0:    row[y] = prev * nu + row[y]
 ; where prev is the value just stored for the previously visited row.
 ; (This reads FMULADD_PS dst, a, b, c, tmp as dst = a * b + c; that macro is
 ; defined outside this excerpt - confirm.)
 ;
 ; An empty (or negative) column range returns immediately.
 ;
 ; NOTE(review): the step loops and both y loops test their condition after
 ; the body, so each runs at least once; this looks like it relies on
 ; height >= 2 and steps >= 1 - confirm against the callers.
 ;*******************************************************************************
 %macro VERTI_SLICE 0
 %if UNIX64
 cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \
                                         steps, x, y, cwidth, step, ptr, stride
 %else
 cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \
                                         steps, nu, bscale, x, y, cwidth, step, \
                                         ptr, stride
 %endif
 ; cols = number of 32-bit lanes, i.e. columns handled per vector
 %assign cols mmsize/4
 %if WIN64
    VBROADCASTSS m0, num
    VBROADCASTSS m1, bscalem
    DEFINE_ARGS buffer, width, height, cbegin, cend, \
                steps, x, y, cwidth, step, ptr, stride
 %else
    VBROADCASTSS m0, xmm0 ; nu
    VBROADCASTSS m1, xmm1 ; bscale
 %endif
    ; The int arguments are used through their 64-bit registers below, and the
    ; upper halves of 32-bit arguments are not guaranteed by the calling
    ; convention, so widen them on every ABI rather than on WIN64 only.
    MOVSXDIFNIDN width, height, cbegin, cend, steps
    ; cwidth = number of columns in this slice
    mov cwidthq, cendq
    sub cwidthq, cbeginq
    lea strideq, [widthq * 4] ; row pitch in bytes
    xor xq, xq ; x = 0
    ; Nothing to do for an empty slice. This test has to come before the
    ; comparison against cols: a width of 0 would otherwise take the jl
    ; branch and never reach it.
    cmp cwidthq, 0x0
    jle .end_scalar
    cmp cwidthq, cols
    jl .x_scalar
    sub cwidthq, cols ; largest x at which a full vector still fits
 .loop_x:
    xor stepq, stepq
    .loop_step:
        ; ptr = buffer + x + column_begin;
        lea ptrq, [xq + cbeginq]
        lea ptrq, [bufferq + ptrq*4]
        ; ptr[cols-1:0] *= bscale;
        movu m2, [ptrq]
        mulps m2, m1
        movu [ptrq], m2
        ; Filter downwards
        mov yq, 1
        .loop_y_down:
            add ptrq, strideq ; advance ptr by one row (width floats)
            movu m3, [ptrq]
            FMULADD_PS m2, m2, m0, m3, m2
            movu [ptrq], m2
            inc yq
            cmp yq, heightq
            jl .loop_y_down
        ; scale the last row by bscale before turning around
        mulps m2, m1
        movu [ptrq], m2
        ; Filter upwards
        dec yq
        .loop_y_up:
            sub ptrq, strideq ; step back by one row
            movu m3, [ptrq]
            FMULADD_PS m2, m2, m0, m3, m2
            movu [ptrq], m2
            dec yq
            cmp yq, 0
            jg .loop_y_up
        inc stepq
        cmp stepq, stepsq
        jl .loop_step
    add xq, cols
    cmp xq, cwidthq
    jle .loop_x
    add cwidthq, cols ; restore the real column count
    cmp xq, cwidthq
    jge .end_scalar ; no partial group left
 .x_scalar:
    xor stepq, stepq
    ; xd serves as PUSH_MASK scratch below, so park x on the stack meanwhile
    mov qword [rsp + 0x10], xq
    sub cwidthq, xq ; remaining columns (fewer than cols)
    mov xq, 1
    shlx cwidthq, xq, cwidthq
    sub cwidthq, 1 ; (1 << remaining) - 1: one set bit per remaining column
    PUSH_MASK m4, k1, cwidthd, xd, rsp + 0x20
    mov xq, qword [rsp + 0x10]
    .loop_step_scalar:
        ; ptr = buffer + x + column_begin;
        lea ptrq, [xq + cbeginq]
        lea ptrq, [bufferq + ptrq*4]
        ; same sequence as the full-vector path, restricted to the masked lanes
        VMASKMOVPS m2, [ptrq], m4, k1
        mulps m2, m1
        VMASKMOVPS [ptrq], m2, m4, k1
        ; Filter downwards
        mov yq, 1
        .x_scalar_loop_y_down:
            add ptrq, strideq
            VMASKMOVPS m3, [ptrq], m4, k1
            FMULADD_PS m2, m2, m0, m3, m2
            VMASKMOVPS [ptrq], m2, m4, k1
            inc yq
            cmp yq, heightq
            jl .x_scalar_loop_y_down
        mulps m2, m1
        VMASKMOVPS [ptrq], m2, m4, k1
        ; Filter upwards
        dec yq
        .x_scalar_loop_y_up:
            sub ptrq, strideq
            VMASKMOVPS m3, [ptrq], m4, k1
            FMULADD_PS m2, m2, m0, m3, m2
            VMASKMOVPS [ptrq], m2, m4, k1
            dec yq
            cmp yq, 0
            jg .x_scalar_loop_y_up
        inc stepq
        cmp stepq, stepsq
        jl .loop_step_scalar
 .end_scalar:
    RET
 %endmacro
 ; Instantiate VERTI_SLICE once per instruction set: with YMM registers for
 ; AVX2 and with ZMM registers for AVX-512. Only assembled for x86-64, and
 ; only for the instruction sets the build enables.
 %if ARCH_X86_64
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 VERTI_SLICE
 %endif
 %if HAVE_AVX512_EXTERNAL
 INIT_ZMM avx512
 VERTI_SLICE
 %endif
 %endif
--- a/libavfilter/x86/vf_gblur_init.c
+++ b/libavfilter/x86/vf_gblur_init.c
@@ -31,6 +31,11 @@ void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min,
 void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
 void ff_postscale_slice_avx512(float *ptr, int length, float postscale, float min, float max);
 /* Vertical blur pass over the columns [column_begin, column_end) of buffer;
  * AVX2 and AVX-512 variants implemented in x86/vf_gblur.asm. The signatures
  * match the verti_slice function pointer in GBlurContext. */
 void ff_verti_slice_avx2(float *buffer, int width, int height, int column_begin, int column_end,
                        int steps, float nu, float bscale);
 void ff_verti_slice_avx512(float *buffer, int width, int height, int column_begin, int column_end,
                        int steps, float nu, float bscale);
 av_cold void ff_gblur_init_x86(GBlurContext *s)
 {
    int cpu_flags = av_get_cpu_flags();
@@ -47,9 +52,11 @@ av_cold void ff_gblur_init_x86(GBlurContext *s)
    }
    if (EXTERNAL_AVX2(cpu_flags)) {
        s->horiz_slice = ff_horiz_slice_avx2;
        s->verti_slice = ff_verti_slice_avx2;
    }
    if (EXTERNAL_AVX512(cpu_flags)) {
        s->postscale_slice = ff_postscale_slice_avx512;
        s->verti_slice = ff_verti_slice_avx512;
    }
 #endif
 }