1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-08-04 22:03:09 +02:00

avfilter/x86/f_ebur128: add x86 AVX implementation

Processes two channels in parallel, using 128-bit XMM registers.

In theory, we could go up to YMM registers to process 4 channels, but this is
not a gain except for relatively high channel counts (e.g. 7.1), and also
complicates the sample load/store operations considerably.

I decided to only add an AVX variant, since the C code is not slow enough to
justify a separate function just for ancient CPUs.
This commit is contained in:
Niklas Haas
2025-06-12 19:48:33 +02:00
parent deab15e76a
commit 53e03ec8af
5 changed files with 206 additions and 5 deletions

View File

@ -579,6 +579,11 @@ static av_cold int init(AVFilterContext *ctx)
/* summary */
av_log(ctx, AV_LOG_VERBOSE, "EBU +%d scale\n", ebur128->meter);
ebur128->dsp.filter_channels = ff_ebur128_filter_channels_c;
#if ARCH_X86
ff_ebur128_init_x86(&ebur128->dsp);
#endif
return 0;
}
@ -692,7 +697,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
MOVE_TO_NEXT_CACHED_ENTRY(400);
MOVE_TO_NEXT_CACHED_ENTRY(3000);
ff_ebur128_filter_channels_c(dsp, &samples[idx_insample * nb_channels],
dsp->filter_channels(dsp, &samples[idx_insample * nb_channels],
&ebur128->i400.cache[bin_id_400 * nb_channels],
&ebur128->i3000.cache[bin_id_3000 * nb_channels],
ebur128->i400.sum, ebur128->i3000.sum,

View File

@ -22,6 +22,9 @@
#ifndef AVFILTER_F_EBUR128_H
#define AVFILTER_F_EBUR128_H
#include <assert.h>
#include <stddef.h>
typedef struct EBUR128Biquad {
double b0, b1, b2;
double a1, a2;
@ -35,8 +38,21 @@ typedef struct EBUR128DSPContext {
/* Cache of 3 samples for each channel */
double *y; /* after pre-filter */
double *z; /* after RLB-filter */
/* DSP functions */
void (*filter_channels)(const struct EBUR128DSPContext *dsp,
const double *samples,
double *cache_400, double *cache_3000,
double *sum_400, double *sum_3000,
int nb_channels);
} EBUR128DSPContext;
/* The x86 assembly addresses pre, rlb and y through hard-coded structure
 * offsets (two blocks of five doubles followed by the pointers), so pin the
 * C layout at compile time. */
static_assert(offsetof(EBUR128DSPContext, pre) == 0, "struct layout mismatch");
static_assert(offsetof(EBUR128DSPContext, rlb) == 5 * sizeof(double), "struct layout mismatch");
static_assert(offsetof(EBUR128DSPContext, y) == 10 * sizeof(double), "struct layout mismatch");
/* Replaces dsp->filter_channels with an x86-optimized version when the
 * running CPU supports one; otherwise leaves dsp untouched. */
void ff_ebur128_init_x86(EBUR128DSPContext *dsp);
/* Portable C implementation of the filter_channels callback; the filter's
 * init() installs it as the default before any arch-specific override. */
void ff_ebur128_filter_channels_c(const EBUR128DSPContext *, const double *,
double *, double *, double *, double *, int);

View File

@ -7,6 +7,7 @@ OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o
OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o
OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o
OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution_init.o
OBJS-$(CONFIG_EBUR128_FILTER) += x86/f_ebur128_init.o
OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq_init.o
OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o
OBJS-$(CONFIG_GBLUR_FILTER) += x86/vf_gblur_init.o
@ -52,6 +53,7 @@ X86ASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o
X86ASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o
X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o
X86ASM-OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution.o
X86ASM-OBJS-$(CONFIG_EBUR128_FILTER) += x86/f_ebur128.o
X86ASM-OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o
X86ASM-OBJS-$(CONFIG_FRAMERATE_FILTER) += x86/vf_framerate.o
X86ASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o

View File

@ -0,0 +1,143 @@
;*****************************************************************************
;* x86-optimized functions for ebur128 filter
;*
;* Copyright (C) 2025 Niklas Haas
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
%include "libavutil/x86/x86util.asm"
; Offsets of one set of biquad coefficients: five consecutive 64-bit slots
; in the order b0, b1, b2, a1, a2. Used below as displacements relative to
; DSP.pre and DSP.rlb.
struc Biquad
.b0 resq 1
.b1 resq 1
.b2 resq 1
.a1 resq 1
.a2 resq 1
endstruc
; Offsets into the DSP context as seen from assembly: two Biquad-sized
; coefficient blocks (5 qwords each) followed by two pointer-sized slots
; holding the y and z state pointers. The offsets of pre, rlb and y are
; checked against the C struct by static_assert in f_ebur128.h.
struc DSP
.pre resq 5
.rlb resq 5
.y resq 1
.z resq 1
endstruc
SECTION .text
; MOVNQ num, dst, src
; Moves num quadwords from src to dst: a scalar movsd (8 bytes) when num is
; 1, otherwise an unaligned packed movupd (full vector register width).
%macro MOVNQ 3 ; num, dst, src
%if %1 == 1
movsd %2, %3
%else
movupd %2, %3
%endif
%endmacro
; FILTER y0, y1, y2, x, b0, b1, b2, a1, a2, samples, num_channels
;
; Runs one biquad step for one or two channels at once (one channel per
; 64-bit lane).
;   y0, y1, y2  vector registers used for the three state values; on exit
;               y0 holds the new Y[0], i.e. the filter output
;   x           vector register holding the input; clobbered (it is reused
;               as scratch for a2 * Y[0])
;   b0..a2      vector registers holding the coefficients
;   samples     despite its name, a general-purpose register pointing at the
;               per-channel state, laid out as three doubles (24 bytes) per
;               channel; advanced past the processed channels on exit
;   num_channels  1 or 2; with 2, the second channel's state is loaded into
;               and stored from the high lanes
;
; The update below has the shape of a transposed direct form II biquad, with
; the output additionally written to slot 0 of the state:
; Y[0] := b0 * X + Y1
; Y[1] := b1 * X + Y2 - a1 * Y[0]
; Y[2] := b2 * X - a2 * Y[0]
; Load the previous Y1 / Y2 of the first channel into the low lanes.
movsd %1, [%10 + 8]
movsd %3, [%10 + 16]
%if %11 > 1
; Second channel's Y1 / Y2 live 24 bytes further on; put them in the high lanes.
movhpd %1, [%10 + 32]
movhpd %3, [%10 + 40]
%endif
; y0 = Y1 + b0 * x  (new Y[0])
mulpd %2, %5, %4
addpd %1, %2
; y2 = Y2 - a1 * Y[0]
mulpd %2, %8, %1
subpd %3, %2
; y1 = b1 * x + (Y2 - a1 * Y[0])  (new Y[1])
mulpd %2, %6, %4
addpd %2, %3
; y2 = b2 * x - a2 * Y[0]  (new Y[2]); x is overwritten as scratch here
mulpd %3, %7, %4
mulpd %4, %9, %1
subpd %3, %4
; Write the new state back: low lanes to the first channel ...
movsd [%10 + 0], %1
movsd [%10 + 8], %2
movsd [%10 + 16], %3
%if %11 > 1
; ... and high lanes to the second channel.
movhpd [%10 + 24], %1
movhpd [%10 + 32], %2
movhpd [%10 + 40], %3
%endif
; Step the state pointer over the channels just processed.
add %10, 24 * %11
%endmacro
; filter_channels num_channels
;
; Processes num_channels (1 or 2) consecutive channels of one input frame:
;   1. loads the input sample(s) from samplesq,
;   2. runs the first biquad (coefficients in m4-m8, state at r7q),
;   3. feeds its output into the second biquad (m9-m13, state at r8q),
;   4. squares the result and, for both the 400 and the 3000 window,
;      replaces the cached entry with it and adds the difference
;      (new - old) to the running sum.
; All six pointers (samplesq, r7q, r8q, cache*q, sum*q) are advanced past
; the processed channels. Clobbers m0-m3.
;
; Every memory access goes through MOVNQ so that its width matches
; num_channels. Packed memory operands on subpd/addpd would always read 16
; bytes, which for a single channel touches 8 bytes beyond the element that
; is actually being updated.
%macro filter_channels 1 ; num_channels
    MOVNQ %1, m3, [samplesq]
    add samplesq, 8 * %1
    ; first stage: m3 -> m0
    FILTER m0, m1, m2, m3, m4, m5, m6, m7, m8, r7q, %1
    ; second stage: m0 -> m3 (m0 is clobbered by the macro)
    FILTER m3, m1, m2, m0, m9, m10, m11, m12, m13, r8q, %1
    ; update sum and cache
    mulpd m3, m3
    MOVNQ %1, m0, [cache400q]
    MOVNQ %1, m1, [cache3000q]
    subpd m0, m3, m0
    subpd m1, m3, m1
    MOVNQ %1, [cache400q], m3
    MOVNQ %1, [cache3000q], m3
    add cache400q, 8 * %1
    add cache3000q, 8 * %1
    ; m2 and m3 are free again at this point; use them for the old sums
    MOVNQ %1, m2, [sum400q]
    MOVNQ %1, m3, [sum3000q]
    addpd m0, m2
    addpd m1, m3
    MOVNQ %1, [sum400q], m0
    MOVNQ %1, [sum3000q], m1
    add sum400q, 8 * %1
    add sum3000q, 8 * %1
%endmacro
%if ARCH_X86_64
; AVX implementation of the filter_channels DSP callback; arguments in
; order: dsp, samples, cache400, cache3000, sum400, sum3000, channels.
;
; Runs both biquad stages on one sample per channel and updates the 400 and
; 3000 caches and sums, two channels per iteration. If the channel count is
; odd, the first channel is handled on its own and the remaining ones in
; pairs. The pair loop tests its counter only after processing, so the
; channel count must be at least 1.
;
; Uses 9 general-purpose and 14 vector registers, hence 64-bit only.
INIT_XMM avx
cglobal ebur128_filter_channels, 7, 9, 14, dsp, samples, cache400, cache3000, sum400, sum3000, channels
    ; broadcast the first-stage coefficients to both lanes
    movddup m4, [dspq + DSP.pre + Biquad.b0]
    movddup m5, [dspq + DSP.pre + Biquad.b1]
    movddup m6, [dspq + DSP.pre + Biquad.b2]
    movddup m7, [dspq + DSP.pre + Biquad.a1]
    movddup m8, [dspq + DSP.pre + Biquad.a2]
    ; broadcast the second-stage coefficients to both lanes
    movddup m9, [dspq + DSP.rlb + Biquad.b0]
    movddup m10, [dspq + DSP.rlb + Biquad.b1]
    movddup m11, [dspq + DSP.rlb + Biquad.b2]
    movddup m12, [dspq + DSP.rlb + Biquad.a1]
    movddup m13, [dspq + DSP.rlb + Biquad.a2]
    ; per-channel filter state of the two stages
    mov r7q, [dspq + DSP.y]
    mov r8q, [dspq + DSP.z]
    ; handle odd channel count
    test channelsd, 1
    jnz .tail
.loop:
    filter_channels 2
    sub channelsd, 2
    jg .loop
    RET
.tail:
    filter_channels 1
    ; dec sets ZF itself, so no separate test is needed before the branch
    dec channelsd
    jnz .loop
    RET
%endif ; ARCH_X86_64

View File

@ -0,0 +1,35 @@
/*
* Copyright (c) 2018 Paul B Mahol
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavfilter/f_ebur128.h"
/* AVX routine implemented in x86 assembly; same contract as the
 * filter_channels callback of EBUR128DSPContext. */
void ff_ebur128_filter_channels_avx(const EBUR128DSPContext *dsp,
                                    const double *samples,
                                    double *cache_400, double *cache_3000,
                                    double *sum_400, double *sum_3000,
                                    int nb_channels);

/**
 * Install x86-specific DSP routines.
 *
 * On x86-64 builds running on a CPU that reports external AVX support, the
 * filter_channels callback is pointed at the AVX implementation. In every
 * other case dsp is left exactly as the caller set it up.
 *
 * @param dsp DSP context whose function pointers may be overridden
 */
av_cold void ff_ebur128_init_x86(EBUR128DSPContext *dsp)
{
    const int flags = av_get_cpu_flags();

    /* Keep ARCH_X86_64 first: on other builds the condition folds to a
     * constant and the reference to the AVX symbol is dropped. */
    if (ARCH_X86_64 && EXTERNAL_AVX(flags)) {
        dsp->filter_channels = ff_ebur128_filter_channels_avx;
    }
}