You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-08-04 22:03:09 +02:00
avfilter/x86/f_ebur128: add x86 AVX implementation
Processes two channels in parallel, using 128-bit XMM registers. In theory, we could go up to YMM registers to process 4 channels, but this is not a gain except for relatively high channel counts (e.g. 7.1), and it also complicates the sample load/store operations considerably. I decided to only add an AVX variant, since the C code is not slow enough to justify a separate function just for ancient CPUs.
This commit is contained in:
@ -579,6 +579,11 @@ static av_cold int init(AVFilterContext *ctx)
|
||||
/* summary */
|
||||
av_log(ctx, AV_LOG_VERBOSE, "EBU +%d scale\n", ebur128->meter);
|
||||
|
||||
ebur128->dsp.filter_channels = ff_ebur128_filter_channels_c;
|
||||
#if ARCH_X86
|
||||
ff_ebur128_init_x86(&ebur128->dsp);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -692,11 +697,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
|
||||
MOVE_TO_NEXT_CACHED_ENTRY(400);
|
||||
MOVE_TO_NEXT_CACHED_ENTRY(3000);
|
||||
|
||||
ff_ebur128_filter_channels_c(dsp, &samples[idx_insample * nb_channels],
|
||||
&ebur128->i400.cache[bin_id_400 * nb_channels],
|
||||
&ebur128->i3000.cache[bin_id_3000 * nb_channels],
|
||||
ebur128->i400.sum, ebur128->i3000.sum,
|
||||
nb_channels);
|
||||
dsp->filter_channels(dsp, &samples[idx_insample * nb_channels],
|
||||
&ebur128->i400.cache[bin_id_400 * nb_channels],
|
||||
&ebur128->i3000.cache[bin_id_3000 * nb_channels],
|
||||
ebur128->i400.sum, ebur128->i3000.sum,
|
||||
nb_channels);
|
||||
|
||||
#define FIND_PEAK(global, sp, ptype) do { \
|
||||
int ch; \
|
||||
|
@ -22,6 +22,9 @@
|
||||
#ifndef AVFILTER_F_EBUR128_H
|
||||
#define AVFILTER_F_EBUR128_H
|
||||
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
|
||||
typedef struct EBUR128Biquad {
|
||||
double b0, b1, b2;
|
||||
double a1, a2;
|
||||
@ -35,8 +38,21 @@ typedef struct EBUR128DSPContext {
|
||||
/* Cache of 3 samples for each channel */
|
||||
double *y; /* after pre-filter */
|
||||
double *z; /* after RLB-filter */
|
||||
|
||||
/* DSP functions */
|
||||
void (*filter_channels)(const struct EBUR128DSPContext *dsp,
|
||||
const double *samples,
|
||||
double *cache_400, double *cache_3000,
|
||||
double *sum_400, double *sum_3000,
|
||||
int nb_channels);
|
||||
} EBUR128DSPContext;
|
||||
|
||||
static_assert(offsetof(EBUR128DSPContext, pre) == 0, "struct layout mismatch");
|
||||
static_assert(offsetof(EBUR128DSPContext, rlb) == 5 * sizeof(double), "struct layout mismatch");
|
||||
static_assert(offsetof(EBUR128DSPContext, y) == 10 * sizeof(double), "struct layout mismatch");
|
||||
|
||||
void ff_ebur128_init_x86(EBUR128DSPContext *dsp);
|
||||
|
||||
void ff_ebur128_filter_channels_c(const EBUR128DSPContext *, const double *,
|
||||
double *, double *, double *, double *, int);
|
||||
|
||||
|
@ -7,6 +7,7 @@ OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o
|
||||
OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o
|
||||
OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o
|
||||
OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution_init.o
|
||||
OBJS-$(CONFIG_EBUR128_FILTER) += x86/f_ebur128_init.o
|
||||
OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq_init.o
|
||||
OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o
|
||||
OBJS-$(CONFIG_GBLUR_FILTER) += x86/vf_gblur_init.o
|
||||
@ -52,6 +53,7 @@ X86ASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o
|
||||
X86ASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o
|
||||
X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o
|
||||
X86ASM-OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution.o
|
||||
X86ASM-OBJS-$(CONFIG_EBUR128_FILTER) += x86/f_ebur128.o
|
||||
X86ASM-OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o
|
||||
X86ASM-OBJS-$(CONFIG_FRAMERATE_FILTER) += x86/vf_framerate.o
|
||||
X86ASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o
|
||||
|
143
libavfilter/x86/f_ebur128.asm
Normal file
143
libavfilter/x86/f_ebur128.asm
Normal file
@ -0,0 +1,143 @@
|
||||
;*****************************************************************************
|
||||
;* x86-optimized functions for ebur128 filter
|
||||
;*
|
||||
;* Copyright (C) 2025 Niklas Haas
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;*****************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
struc Biquad
|
||||
.b0 resq 1
|
||||
.b1 resq 1
|
||||
.b2 resq 1
|
||||
.a1 resq 1
|
||||
.a2 resq 1
|
||||
endstruc
|
||||
|
||||
struc DSP
|
||||
.pre resq 5
|
||||
.rlb resq 5
|
||||
.y resq 1
|
||||
.z resq 1
|
||||
endstruc
|
||||
|
||||
SECTION .text
|
||||
|
||||
%macro MOVNQ 3 ; num, dst, src
|
||||
%if %1 == 1
|
||||
movsd %2, %3
|
||||
%else
|
||||
movupd %2, %3
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro FILTER 11 ; y0, y1, y2, x, b0, b1, b2, a1, a2, samples, num_channels
|
||||
; Y[0] := b0 * X + Y1
|
||||
; Y[1] := b1 * X + Y2 - a1 * Y[0]
|
||||
; Y[2] := b2 * X - a2 * Y[0]
|
||||
movsd %1, [%10 + 8]
|
||||
movsd %3, [%10 + 16]
|
||||
%if %11 > 1
|
||||
movhpd %1, [%10 + 32]
|
||||
movhpd %3, [%10 + 40]
|
||||
%endif
|
||||
|
||||
mulpd %2, %5, %4
|
||||
addpd %1, %2
|
||||
|
||||
mulpd %2, %8, %1
|
||||
subpd %3, %2
|
||||
mulpd %2, %6, %4
|
||||
addpd %2, %3
|
||||
|
||||
mulpd %3, %7, %4
|
||||
mulpd %4, %9, %1
|
||||
subpd %3, %4
|
||||
|
||||
movsd [%10 + 0], %1
|
||||
movsd [%10 + 8], %2
|
||||
movsd [%10 + 16], %3
|
||||
%if %11 > 1
|
||||
movhpd [%10 + 24], %1
|
||||
movhpd [%10 + 32], %2
|
||||
movhpd [%10 + 40], %3
|
||||
%endif
|
||||
add %10, 24 * %11
|
||||
%endmacro
|
||||
|
||||
%macro filter_channels 1 ; num_channels
|
||||
MOVNQ %1, m3, [samplesq]
|
||||
add samplesq, 8 * %1
|
||||
|
||||
FILTER m0, m1, m2, m3, m4, m5, m6, m7, m8, r7q, %1
|
||||
FILTER m3, m1, m2, m0, m9, m10, m11, m12, m13, r8q, %1
|
||||
|
||||
; update sum and cache
|
||||
mulpd m3, m3
|
||||
subpd m0, m3, [cache400q]
|
||||
subpd m1, m3, [cache3000q]
|
||||
MOVNQ %1, [cache400q], m3
|
||||
MOVNQ %1, [cache3000q], m3
|
||||
add cache400q, 8 * %1
|
||||
add cache3000q, 8 * %1
|
||||
addpd m0, [sum400q]
|
||||
addpd m1, [sum3000q]
|
||||
MOVNQ %1, [sum400q], m0
|
||||
MOVNQ %1, [sum3000q], m1
|
||||
add sum400q, 8 * %1
|
||||
add sum3000q, 8 * %1
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_64
|
||||
|
||||
INIT_XMM avx
|
||||
cglobal ebur128_filter_channels, 7, 9, 14, dsp, samples, cache400, cache3000, sum400, sum3000, channels
|
||||
movddup m4, [dspq + DSP.pre + Biquad.b0]
|
||||
movddup m5, [dspq + DSP.pre + Biquad.b1]
|
||||
movddup m6, [dspq + DSP.pre + Biquad.b2]
|
||||
movddup m7, [dspq + DSP.pre + Biquad.a1]
|
||||
movddup m8, [dspq + DSP.pre + Biquad.a2]
|
||||
|
||||
movddup m9, [dspq + DSP.rlb + Biquad.b0]
|
||||
movddup m10, [dspq + DSP.rlb + Biquad.b1]
|
||||
movddup m11, [dspq + DSP.rlb + Biquad.b2]
|
||||
movddup m12, [dspq + DSP.rlb + Biquad.a1]
|
||||
movddup m13, [dspq + DSP.rlb + Biquad.a2]
|
||||
|
||||
mov r7q, [dspq + DSP.y]
|
||||
mov r8q, [dspq + DSP.z]
|
||||
|
||||
; handle odd channel count
|
||||
test channelsd, 1
|
||||
jnz .tail
|
||||
|
||||
.loop:
|
||||
filter_channels 2
|
||||
sub channelsd, 2
|
||||
jg .loop
|
||||
RET
|
||||
|
||||
.tail:
|
||||
filter_channels 1
|
||||
dec channelsd
|
||||
test channelsd, channelsd
|
||||
jnz .loop
|
||||
RET
|
||||
|
||||
%endif ; ARCH_X86_64
|
35
libavfilter/x86/f_ebur128_init.c
Normal file
35
libavfilter/x86/f_ebur128_init.c
Normal file
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Paul B Mahol
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavfilter/f_ebur128.h"
|
||||
|
||||
void ff_ebur128_filter_channels_avx(const EBUR128DSPContext *, const double *,
|
||||
double *, double *, double *, double *, int);
|
||||
|
||||
av_cold void ff_ebur128_init_x86(EBUR128DSPContext *dsp)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags))
|
||||
dsp->filter_channels = ff_ebur128_filter_channels_avx;
|
||||
}
|
Reference in New Issue
Block a user