avfilter/vf_gblur: add x86 SIMD optimizations

The horizontal pass get ~2x performance with the patch under single thread. Tested overall performance using the command(avx2 enabled): ./ffmpeg -i 1080p.mp4 -vf gblur -f null /dev/null ./ffmpeg -i 1080p.mp4 -vf gblur=threads=1 -f null /dev/null For single thread, the fps improves from 43 to 60, about 40%. For multi-thread, the fps improves from 110 to 130, about 20%. Signed-off-by: Ruiling Song <ruiling.song@intel.com>
2025-07-11 14:30:22 +02:00 · 2019-05-15 17:54:10 +08:00
parent 5fc8d87ba6
commit 83f9da7768
5 changed files with 312 additions and 41 deletions
--- a/libavfilter/gblur.h
+++ b/libavfilter/gblur.h
@ -0,0 +1,55 @@
 /*
 * Copyright (c) 2011 Pascal Getreuer
 * Copyright (c) 2016 Paul B Mahol
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef AVFILTER_GBLUR_H
 #define AVFILTER_GBLUR_H
 #include "avfilter.h"
 typedef struct GBlurContext {
    const AVClass *class;
    float sigma;
    float sigmaV;
    int steps;
    int planes;
    int depth;
    int planewidth[4];
    int planeheight[4];
    float *buffer;
    float boundaryscale;
    float boundaryscaleV;
    float postscale;
    float postscaleV;
    float nu;
    float nuV;
    int nb_planes;
    void (*horiz_slice)(float *buffer, int width, int height, int steps, float nu, float bscale);
 } GBlurContext;
 void ff_gblur_init(GBlurContext *s);
 void ff_gblur_init_x86(GBlurContext *s);
 #endif
--- a/libavfilter/vf_gblur.c
+++ b/libavfilter/vf_gblur.c
@ -30,30 +30,10 @@
 #include "libavutil/pixdesc.h"
 #include "avfilter.h"
 #include "formats.h"
 #include "gblur.h"
 #include "internal.h"
 #include "video.h"
 typedef struct GBlurContext {
    const AVClass *class;
    float sigma;
    float sigmaV;
    int steps;
    int planes;
    int depth;
    int planewidth[4];
    int planeheight[4];
    float *buffer;
    float boundaryscale;
    float boundaryscaleV;
    float postscale;
    float postscaleV;
    float nu;
    float nuV;
    int nb_planes;
 } GBlurContext;
 #define OFFSET(x) offsetof(GBlurContext, x)
 #define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
@ -72,6 +52,28 @@ typedef struct ThreadData {
    int width;
 } ThreadData;
 static void horiz_slice_c(float *buffer, int width, int height, int steps,
                          float nu, float bscale)
 {
    int step, x, y;
    float *ptr;
    for (y = 0; y < height; y++) {
        for (step = 0; step < steps; step++) {
            ptr = buffer + width * y;
            ptr[0] *= bscale;
            /* Filter rightwards */
            for (x = 1; x < width; x++)
                ptr[x] += nu * ptr[x - 1];
            ptr[x = width - 1] *= bscale;
            /* Filter leftwards */
            for (; x > 0; x--)
                ptr[x - 1] += nu * ptr[x];
        }
    }
 }
 static int filter_horizontally(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
 {
    GBlurContext *s = ctx->priv;
@ -84,27 +86,10 @@ static int filter_horizontally(AVFilterContext *ctx, void *arg, int jobnr, int n
    const int steps = s->steps;
    const float nu = s->nu;
    float *buffer = s->buffer;
    int y, x, step;
    float *ptr;
    /* Filter horizontally along each row */
    for (y = slice_start; y < slice_end; y++) {
        for (step = 0; step < steps; step++) {
            ptr = buffer + width * y;
            ptr[0] *= boundaryscale;
            /* Filter rightwards */
            for (x = 1; x < width; x++)
                ptr[x] += nu * ptr[x - 1];
            ptr[x = width - 1] *= boundaryscale;
            /* Filter leftwards */
            for (; x > 0; x--)
                ptr[x - 1] += nu * ptr[x];
        }
    }
    s->horiz_slice(buffer + width * slice_start, width, slice_end - slice_start,
                   steps, nu, boundaryscale);
    emms_c();
    return 0;
 }
@ -231,6 +216,13 @@ static int query_formats(AVFilterContext *ctx)
    return ff_set_common_formats(ctx, ff_make_format_list(pix_fmts));
 }
 void ff_gblur_init(GBlurContext *s)
 {
    s->horiz_slice = horiz_slice_c;
    if (ARCH_X86_64)
        ff_gblur_init_x86(s);
 }
 static int config_input(AVFilterLink *inlink)
 {
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
@ -251,6 +243,7 @@ static int config_input(AVFilterLink *inlink)
    if (s->sigmaV < 0) {
        s->sigmaV = s->sigma;
    }
    ff_gblur_init(s);
    return 0;
 }
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@ -7,6 +7,7 @@ OBJS-$(CONFIG_BWDIF_FILTER)                  += x86/vf_bwdif_init.o
 OBJS-$(CONFIG_COLORSPACE_FILTER)             += x86/colorspacedsp_init.o
 OBJS-$(CONFIG_EQ_FILTER)                     += x86/vf_eq.o
 OBJS-$(CONFIG_FSPP_FILTER)                   += x86/vf_fspp_init.o
 OBJS-$(CONFIG_GBLUR_FILTER)                  += x86/vf_gblur_init.o
 OBJS-$(CONFIG_GRADFUN_FILTER)                += x86/vf_gradfun_init.o
 OBJS-$(CONFIG_FRAMERATE_FILTER)              += x86/vf_framerate_init.o
 OBJS-$(CONFIG_HFLIP_FILTER)                  += x86/vf_hflip_init.o
@ -41,6 +42,7 @@ X86ASM-OBJS-$(CONFIG_BWDIF_FILTER)           += x86/vf_bwdif.o
 X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER)      += x86/colorspacedsp.o
 X86ASM-OBJS-$(CONFIG_FRAMERATE_FILTER)       += x86/vf_framerate.o
 X86ASM-OBJS-$(CONFIG_FSPP_FILTER)            += x86/vf_fspp.o
 X86ASM-OBJS-$(CONFIG_GBLUR_FILTER)           += x86/vf_gblur.o
 X86ASM-OBJS-$(CONFIG_GRADFUN_FILTER)         += x86/vf_gradfun.o
 X86ASM-OBJS-$(CONFIG_HFLIP_FILTER)           += x86/vf_hflip.o
 X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER)          += x86/vf_hqdn3d.o
--- a/libavfilter/x86/vf_gblur.asm
+++ b/libavfilter/x86/vf_gblur.asm
@ -0,0 +1,185 @@
 ;*****************************************************************************
 ;* x86-optimized functions for gblur filter
 ;*
 ;* This file is part of FFmpeg.
 ;*
 ;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
 ;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
 ;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 %include "libavutil/x86/x86util.asm"
 SECTION .text
 ; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps,
 ;                          float nu, float bscale)
 %macro HORIZ_SLICE 0
 %if UNIX64
 cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, x, y, step, stride, remain
 %else
 cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, nu, bscale, x, y, step, stride, remain
 %endif
 %if WIN64
    movss m0, num
    movss m1, bscalem
    DEFINE_ARGS ptr, width, height, steps, x, y, step, stride, remain
 %endif
    movsxdifnidn widthq, widthd
    mulss m2, m0, m0 ; nu ^ 2
    mulss m3, m2, m0 ; nu ^ 3
    mulss m4, m3, m0 ; nu ^ 4
    xor   xq, xq
    xor   yd, yd
    mov   strideq, widthq
    ; stride = width * 4
    shl   strideq, 2
    ; w = w - ((w - 1) & 3)
    mov   remainq, widthq
    sub   remainq, 1
    and   remainq, 3
    sub   widthq, remainq
    shufps m0, m0, 0
    shufps m2, m2, 0
    shufps m3, m3, 0
    shufps m4, m4, 0
 .loop_y:
    xor   stepd, stepd
    .loop_step:
        ; p0 *= bscale
        mulss m5, m1, [ptrq + xq * 4]
        movss [ptrq + xq * 4], m5
        inc xq
        ; filter rightwards
        ; Here we are vectorizing the c version by 4
        ;    for (x = 1; x < width; x++)
        ;       ptr[x] += nu * ptr[x - 1];
        ;   let p0 stands for ptr[x-1], the data from last loop
        ;   and [p1,p2,p3,p4] be the vector data for this loop.
        ; Unrolling the loop, we get:
        ;   p1' = p1 + p0*nu
        ;   p2' = p2 + p1*nu + p0*nu^2
        ;   p3' = p3 + p2*nu + p1*nu^2 + p0*nu^3
        ;   p4' = p4 + p3*nu + p2*nu^2 + p1*nu^3 + p0*nu^4
        ; so we can do it in simd:
        ; [p1',p2',p3',p4'] = [p1,p2,p3,p4] + [p0,p1,p2,p3]*nu +
        ;                     [0,p0,p1,p2]*nu^2 + [0,0,p0,p1]*nu^3 +
        ;                     [0,0,0,p0]*nu^4
        .loop_x:
            movu m6, [ptrq + xq * 4]         ; s  = [p1,p2,p3,p4]
            pslldq m7, m6, 4                 ;      [0, p1,p2,p3]
            movss  m7, m5                    ;      [p0,p1,p2,p3]
            FMULADD_PS  m6, m7, m0, m6, m8   ; s += [p0,p1,p2,p3] * nu
            pslldq m7, 4                     ;      [0,p0,p1,p2]
            FMULADD_PS  m6, m7, m2, m6, m8   ; s += [0,p0,p1,p2]  * nu^2
            pslldq m7, 4
            FMULADD_PS  m6, m7, m3, m6, m8   ; s += [0,0,p0,p1]   * nu^3
            pslldq m7, 4
            FMULADD_PS  m6, m7, m4, m6, m8   ; s += [0,0,0,p0]    * nu^4
            movu [ptrq + xq * 4], m6
            shufps m5, m6, m6, q3333
            add xq, 4
            cmp xq, widthq
            jl .loop_x
        add widthq, remainq
        cmp xq, widthq
        je .end_scalar
        .loop_scalar:
            ; ptr[x] += nu * ptr[x-1]
            movss m5, [ptrq + 4*xq - 4]
            mulss m5, m0
            addss m5, [ptrq + 4*xq]
            movss [ptrq + 4*xq], m5
            inc xq
            cmp xq, widthq
            jl .loop_scalar
        .end_scalar:
            ; ptr[width - 1] *= bscale
            dec xq
            mulss m5, m1, [ptrq + 4*xq]
            movss [ptrq + 4*xq], m5
            shufps m5, m5, 0
        ; filter leftwards
        ;    for (; x > 0; x--)
        ;        ptr[x - 1] += nu * ptr[x];
        ; The idea here is basically the same as filter rightwards.
        ; But we need to take care as the data layout is different.
        ; Let p0 stands for the ptr[x], which is the data from last loop.
        ; The way we do it in simd as below:
        ; [p-4', p-3', p-2', p-1'] = [p-4, p-3, p-2, p-1]
        ;                          + [p-3, p-2, p-1, p0] * nu
        ;                          + [p-2, p-1, p0,  0]  * nu^2
        ;                          + [p-1, p0,  0,   0]  * nu^3
        ;                          + [p0,  0,   0,   0]  * nu^4
        .loop_x_back:
            sub xq, 4
            movu m6, [ptrq + xq * 4]      ; s = [p-4, p-3, p-2, p-1]
            psrldq m7, m6, 4              ;     [p-3, p-2, p-1, 0  ]
            blendps m7, m5, 0x8           ;     [p-3, p-2, p-1, p0 ]
            FMULADD_PS m6, m7, m0, m6, m8 ; s+= [p-3, p-2, p-1, p0 ] * nu
            psrldq m7, 4                  ;
            FMULADD_PS m6, m7, m2, m6, m8 ; s+= [p-2, p-1, p0,  0] * nu^2
            psrldq m7, 4
            FMULADD_PS m6, m7, m3, m6, m8 ; s+= [p-1, p0,   0,  0] * nu^3
            psrldq m7, 4
            FMULADD_PS m6, m7, m4, m6, m8 ; s+= [p0,  0,    0,  0] * nu^4
            movu [ptrq + xq * 4], m6
            shufps m5, m6, m6, 0          ; m5 = [p-4', p-4', p-4', p-4']
            cmp xq, remainq
            jg .loop_x_back
        cmp xq, 0
        je .end_scalar_back
        .loop_scalar_back:
            ; ptr[x-1] += nu * ptr[x]
            movss m5, [ptrq + 4*xq]
            mulss m5, m0
            addss m5, [ptrq + 4*xq - 4]
            movss [ptrq + 4*xq - 4], m5
            dec xq
            cmp xq, 0
            jg .loop_scalar_back
        .end_scalar_back:
        ; reset aligned width for next line
        sub widthq, remainq
        inc stepd
        cmp stepd, stepsd
        jl .loop_step
    add ptrq, strideq
    inc yd
    cmp yd, heightd
    jl .loop_y
    RET
 %endmacro
 %if ARCH_X86_64
 INIT_XMM sse4
 HORIZ_SLICE
 INIT_XMM avx2
 HORIZ_SLICE
 %endif
--- a/libavfilter/x86/vf_gblur_init.c
+++ b/libavfilter/x86/vf_gblur_init.c
@ -0,0 +1,36 @@
 /*
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
 #include "libavfilter/gblur.h"
 void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
 void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
 av_cold void ff_gblur_init_x86(GBlurContext *s)
 {
    int cpu_flags = av_get_cpu_flags();
    if (EXTERNAL_SSE4(cpu_flags))
        s->horiz_slice = ff_horiz_slice_sse4;
    if (EXTERNAL_AVX2(cpu_flags))
        s->horiz_slice = ff_horiz_slice_avx2;
 }