2019-07-27 19:54:20 +02:00
|
|
|
/*
|
|
|
|
* This file is part of FFmpeg.
|
|
|
|
*
|
|
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef AVUTIL_TX_PRIV_H
|
|
|
|
#define AVUTIL_TX_PRIV_H
|
|
|
|
|
|
|
|
#include "tx.h"
|
|
|
|
#include "thread.h"
|
2020-05-27 14:54:38 +02:00
|
|
|
#include "mem_internal.h"
|
2019-07-27 19:54:20 +02:00
|
|
|
#include "avassert.h"
|
|
|
|
#include "attributes.h"
|
|
|
|
|
|
|
|
/*
 * Sample-type selection.
 *
 * Depending on which of TX_FLOAT, TX_DOUBLE or TX_INT32 is defined, this
 * provides:
 *   TX_NAME(x)  - x with the per-type suffix (_float, _double, _int32) pasted on
 *   SCALE_TYPE  - float for the float and int32 variants, double for double
 *   FFTSample   - scalar sample type
 *   FFTComplex  - complex sample type
 *
 * When none of the three is defined, only an opaque (void) FFTComplex is
 * declared; TX_NAME, SCALE_TYPE and FFTSample are left undefined.
 * NOTE(review): presumably the TX_* symbol is set by the template source that
 * includes this header - confirm against the including files.
 */
#ifdef TX_FLOAT
#define TX_NAME(x) x ## _float
#define SCALE_TYPE float
typedef float FFTSample;
typedef AVComplexFloat FFTComplex;
#elif defined(TX_DOUBLE)
#define TX_NAME(x) x ## _double
#define SCALE_TYPE double
typedef double FFTSample;
typedef AVComplexDouble FFTComplex;
#elif defined(TX_INT32)
#define TX_NAME(x) x ## _int32
#define SCALE_TYPE float
typedef int32_t FFTSample;
typedef AVComplexInt32 FFTComplex;
#else
typedef void FFTComplex;
#endif
|
|
|
|
|
|
|
|
#if defined(TX_FLOAT) || defined(TX_DOUBLE)
|
2020-02-09 01:13:28 +02:00
|
|
|
|
2021-04-10 03:45:03 +02:00
|
|
|
/*
 * Complex multiply (float/double variant):
 *   dre = are * bre - aim * bim
 *   dim = are * bim + aim * bre
 *
 * dre is stored before dim is evaluated, so dre must not name the same
 * object as any of the four inputs.
 */
#define CMUL(dre, dim, are, aim, bre, bim)      \
    do {                                        \
        (dre) = (are) * (bre) - (aim) * (bim);  \
        (dim) = (are) * (bim) + (aim) * (bre);  \
    } while (0)
|
2020-02-09 01:13:28 +02:00
|
|
|
|
2021-04-10 03:45:03 +02:00
|
|
|
/*
 * Float/double variant of the "subtracting" multiply:
 *   dre = are * bre - aim * bim
 *   dim = are * bim - aim * bre
 * It differs from CMUL only in the sign of the second term of dim.
 *
 * dre is stored before dim is evaluated, so dre must not name the same
 * object as any of the four inputs.
 */
#define SMUL(dre, dim, are, aim, bre, bim)      \
    do {                                        \
        (dre) = (are) * (bre) - (aim) * (bim);  \
        (dim) = (are) * (bim) - (aim) * (bre);  \
    } while (0)
|
|
|
|
|
2021-01-12 09:11:47 +02:00
|
|
|
/* Float/double configuration: expands to its argument unchanged (no
 * fixed-point conversion is applied in this branch). */
#define UNSCALE(x) (x)
|
2020-02-09 01:13:28 +02:00
|
|
|
/* Float/double configuration: expands to its argument unchanged (no
 * rounding or clipping is applied in this branch). */
#define RESCALE(x) (x)
|
|
|
|
|
|
|
|
/* Float/double configuration: plain sum of the two operands. */
#define FOLD(a, b) ((a) + (b))
|
|
|
|
|
|
|
|
#elif defined(TX_INT32)
|
|
|
|
|
|
|
|
/*
 * Complex multiply (int32 variant). Properly rounds the result.
 *
 * Each product is accumulated in 64 bits, 0x40000000 (half of 1 << 31) is
 * added for rounding, and the sum is shifted right by 31 before being
 * narrowed to int:
 *   dre = round((are * bre - aim * bim) / 2^31)
 *   dim = round((are * bim + aim * bre) / 2^31)
 *
 * dre is stored before dim is evaluated, so dre must not name the same
 * object as any of the four inputs. The macro declares a local named
 * `accu`; do not pass an expression that refers to a caller variable of
 * that name.
 */
#define CMUL(dre, dim, are, aim, bre, bim)             \
    do {                                               \
        int64_t accu;                                  \
        (accu)  = (int64_t)(bre) * (are);              \
        (accu) -= (int64_t)(bim) * (aim);              \
        (dre)   = (int)(((accu) + 0x40000000) >> 31);  \
        (accu)  = (int64_t)(bim) * (are);              \
        (accu) += (int64_t)(bre) * (aim);              \
        (dim)   = (int)(((accu) + 0x40000000) >> 31);  \
    } while (0)
|
|
|
|
|
2021-04-10 03:45:03 +02:00
|
|
|
/*
 * int32 variant of the "subtracting" multiply, rounded the same way as
 * CMUL (64-bit accumulate, add 0x40000000, shift right by 31):
 *   dre = round((are * bre - aim * bim) / 2^31)
 *   dim = round((are * bim - aim * bre) / 2^31)
 *
 * dre is stored before dim is evaluated, so dre must not name the same
 * object as any of the four inputs. The macro declares a local named
 * `accu`; do not pass an expression that refers to a caller variable of
 * that name.
 */
#define SMUL(dre, dim, are, aim, bre, bim)             \
    do {                                               \
        int64_t accu;                                  \
        (accu)  = (int64_t)(bre) * (are);              \
        (accu) -= (int64_t)(bim) * (aim);              \
        (dre)   = (int)(((accu) + 0x40000000) >> 31);  \
        (accu)  = (int64_t)(bim) * (are);              \
        (accu) -= (int64_t)(bre) * (aim);              \
        (dim)   = (int)(((accu) + 0x40000000) >> 31);  \
    } while (0)
|
|
|
|
|
2021-01-12 09:11:47 +02:00
|
|
|
/* int32 configuration: converts a fixed-point value to double by dividing
 * by 2^31. The argument is parenthesised so that the cast applies to the
 * whole expression passed in, not just its first operand. */
#define UNSCALE(x) ((double)(x)/2147483648.0)
|
2021-01-09 21:41:25 +02:00
|
|
|
/* int32 configuration: scales by 2^31, rounds to the nearest integer and
 * clips to the int32 range. llrintf() is used rather than lrintf() because
 * `long` may be only 32 bits wide, in which case a scaled value of 2^31 or
 * more would be out of range before av_clip64() could clip it. */
#define RESCALE(x) (av_clip64(llrintf((x) * 2147483648.0), INT32_MIN, INT32_MAX))
|
2020-02-09 01:13:28 +02:00
|
|
|
|
|
|
|
/* int32 configuration: adds x and y plus a rounding term of 32, then shifts
 * right by 6. y is cast to unsigned so the addition is carried out in
 * unsigned arithmetic; the sum is converted back to int before the shift. */
#define FOLD(x, y) ((int)((x) + (unsigned)(y) + 32) >> 6)
|
|
|
|
|
2019-07-27 19:54:20 +02:00
|
|
|
#endif
|
|
|
|
|
2021-04-10 03:45:03 +02:00
|
|
|
/*
 * Butterfly: x = a - b, y = a + b.
 *
 * x is stored before y is evaluated, so x must not name the same object as
 * a or b (y may). The output arguments are parenthesised like those of
 * CMUL/SMUL so that lvalue expressions can be passed safely.
 */
#define BF(x, y, a, b)        \
    do {                      \
        (x) = (a) - (b);      \
        (y) = (a) + (b);      \
    } while (0)
|
|
|
|
|
2019-07-27 19:54:20 +02:00
|
|
|
/*
 * Struct form of CMUL: multiplies the complex values a and b (accessed
 * through their .re/.im members) and stores the product in c.
 * CMUL writes c.re before reading the inputs for c.im, so c must not be
 * the same object as a or b.
 */
#define CMUL3(c, a, b)                                         \
    CMUL((c).re, (c).im, (a).re, (a).im, (b).re, (b).im)
|
|
|
|
|
2021-04-10 03:45:03 +02:00
|
|
|
/*
 * Declares the table TX_NAME(ff_cos_<size>) holding size/4 + 1 FFTSample
 * entries, using DECLARE_ALIGNED with an alignment argument of 32.
 * `size` is token-pasted into the identifier, so it must be a plain integer
 * literal.
 */
#define COSTABLE(size)                                                     \
    DECLARE_ALIGNED(32, FFTSample, TX_NAME(ff_cos_##size))[(size)/4 + 1]
|
2019-07-27 19:54:20 +02:00
|
|
|
|
|
|
|
/* Used by asm, reorder with care */
|
|
|
|
struct AVTXContext {
|
lavu/tx: support in-place FFT transforms
This commit adds support for in-place FFT transforms. Since our
internal transforms were all in-place anyway, this only changes
the permutation on the input.
Unfortunately, research papers were of no help here. All focused
on dry hardware implementations, where permutes are free, or on
software implementations where binary bloat is of no concern so
storing dozen times the transforms for each permutation and version
is not considered bad practice.
Still, for a pure C implementation, it's only around 28% slower
than the multi-megabyte FFTW3 in unaligned mode.
Unlike a closed permutation like with PFA, split-radix FFT bit-reversals
contain multiple NOPs, multiple simple swaps, and a few chained swaps,
so regular single-loop single-state permute loops were not possible.
Instead, we filter out parts of the input indices which are redundant.
This allows for a single branch, and with some clever AVX512 asm,
could possibly be SIMD'd without refactoring.
The inplace_idx array is guaranteed to never be larger than the
revtab array, and in practice only requires around log2(len) entries.
The power-of-two MDCTs can be done in-place as well. And it's
possible to eliminate a copy in the compound MDCTs too, however
it'll be slower than doing them out of place, and we'd need to dirty
the input array.
2021-02-10 18:58:22 +02:00
|
|
|
int n; /* Non-power-of-two part */
|
|
|
|
int m; /* Power-of-two part */
|
|
|
|
int inv; /* Is inverse */
|
2019-07-27 19:54:20 +02:00
|
|
|
int type; /* Type */
|
lavu/tx: support in-place FFT transforms
This commit adds support for in-place FFT transforms. Since our
internal transforms were all in-place anyway, this only changes
the permutation on the input.
Unfortunately, research papers were of no help here. All focused
on dry hardware implementations, where permutes are free, or on
software implementations where binary bloat is of no concern so
storing dozen times the transforms for each permutation and version
is not considered bad practice.
Still, for a pure C implementation, it's only around 28% slower
than the multi-megabyte FFTW3 in unaligned mode.
Unlike a closed permutation like with PFA, split-radix FFT bit-reversals
contain multiple NOPs, multiple simple swaps, and a few chained swaps,
so regular single-loop single-state permute loops were not possible.
Instead, we filter out parts of the input indices which are redundant.
This allows for a single branch, and with some clever AVX512 asm,
could possibly be SIMD'd without refactoring.
The inplace_idx array is guaranteed to never be larger than the
revtab array, and in practice only requires around log2(len) entries.
The power-of-two MDCTs can be done in-place as well. And it's
possible to eliminate a copy in the compound MDCTs too, however
it'll be slower than doing them out of place, and we'd need to dirty
the input array.
2021-02-10 18:58:22 +02:00
|
|
|
uint64_t flags; /* Flags */
|
2021-01-12 09:11:47 +02:00
|
|
|
double scale; /* Scale */
|
2019-07-27 19:54:20 +02:00
|
|
|
|
|
|
|
FFTComplex *exptab; /* MDCT exptab */
|
2021-04-10 03:45:03 +02:00
|
|
|
FFTComplex *tmp; /* Temporary buffer needed for all compound transforms */
|
2019-07-27 19:54:20 +02:00
|
|
|
int *pfatab; /* Input/Output mapping for compound transforms */
|
|
|
|
int *revtab; /* Input mapping for power of two transforms */
|
lavu/tx: support in-place FFT transforms
This commit adds support for in-place FFT transforms. Since our
internal transforms were all in-place anyway, this only changes
the permutation on the input.
Unfortunately, research papers were of no help here. All focused
on dry hardware implementations, where permutes are free, or on
software implementations where binary bloat is of no concern so
storing dozen times the transforms for each permutation and version
is not considered bad practice.
Still, for a pure C implementation, it's only around 28% slower
than the multi-megabyte FFTW3 in unaligned mode.
Unlike a closed permutation like with PFA, split-radix FFT bit-reversals
contain multiple NOPs, multiple simple swaps, and a few chained swaps,
so regular single-loop single-state permute loops were not possible.
Instead, we filter out parts of the input indices which are redundant.
This allows for a single branch, and with some clever AVX512 asm,
could possibly be SIMD'd without refactoring.
The inplace_idx array is guaranteed to never be larger than the
revtab array, and in practice only requires around log2(len) entries.
The power-of-two MDCTs can be done in-place as well. And it's
possible to eliminate a copy in the compound MDCTs too, however
it'll be slower than doing them out of place, and we'd need to dirty
the input array.
2021-02-10 18:58:22 +02:00
|
|
|
int *inplace_idx; /* Required indices to revtab for in-place transforms */
|
2021-04-10 03:55:37 +02:00
|
|
|
|
2021-04-10 03:53:38 +02:00
|
|
|
int *revtab_c; /* Revtab for only the C transforms, needed because
|
|
|
|
* checkasm makes us reuse the same context. */
|
|
|
|
|
2021-04-10 03:55:37 +02:00
|
|
|
av_tx_fn top_tx; /* Used for computing transforms derived from other
|
|
|
|
* transforms, like full-length iMDCTs and RDFTs.
|
|
|
|
* NOTE: Do NOT use this to mix assembly with C code. */
|
2019-07-27 19:54:20 +02:00
|
|
|
};
|
|
|
|
|
2021-04-10 03:45:03 +02:00
|
|
|
/*
 * Checks if type is an MDCT.
 * NOTE(review): presumably returns non-zero for MDCT types and zero
 * otherwise - confirm against the definition.
 */
int ff_tx_type_is_mdct(enum AVTXType type);
|
2021-04-10 03:45:03 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Generates the PFA permutation table into AVTXContext->pfatab. The end table
|
|
|
|
* is appended to the start table.
|
|
|
|
*/
|
2019-07-27 19:54:20 +02:00
|
|
|
int ff_tx_gen_compound_mapping(AVTXContext *s);
|
2021-04-10 03:45:03 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Generates a standard-ish (slightly modified) Split-Radix revtab into
|
|
|
|
* AVTXContext->revtab
|
|
|
|
*/
|
2021-02-27 05:11:04 +02:00
|
|
|
int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup);
|
2021-04-10 03:45:03 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Generates an index into AVTXContext->inplace_idx that if followed in the
|
|
|
|
* specific order, allows the revtab to be done in-place. AVTXContext->revtab
|
|
|
|
* must already exist.
|
|
|
|
*/
|
2021-04-10 03:53:38 +02:00
|
|
|
int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s, int *revtab);
|
2019-07-27 19:54:20 +02:00
|
|
|
|
2021-04-10 03:52:31 +02:00
|
|
|
/*
 * This generates a parity-based revtab of length len and direction inv.
 *
 * Parity means even and odd complex numbers will be split, e.g. the even
 * coefficients will come first, after which the odd coefficients will be
 * placed. For example, a 4-point transform's coefficients after reordering:
 * z[0].re, z[0].im, z[2].re, z[2].im, z[1].re, z[1].im, z[3].re, z[3].im
 *
 * The basis argument is the length of the largest non-composite transform
 * supported, and also implies that the basis/2 transform is supported as well,
 * as the split-radix algorithm requires it to be.
 *
 * The dual_stride argument indicates that both the basis, as well as the
 * basis/2 transforms support doing two transforms at once, and the coefficients
 * will be interleaved between each pair in a split-radix like so (stride == 2):
 * tx1[0], tx1[2], tx2[0], tx2[2], tx1[1], tx1[3], tx2[1], tx2[3]
 * A non-zero number switches this on, with the value indicating the stride
 * (how many values of 1 transform to put first before switching to the other).
 * Must be a power of two or 0. Must be less than the basis.
 * Value will be clipped to the transform size, so for a basis of 16 and a
 * dual_stride of 8, dual 8-point transforms will be laid out as if dual_stride
 * was set to 4.
 * Usually you'll set this to half the complex numbers that fit in a single
 * register or 0. This allows to reuse SSE functions as dual-transform
 * functions in AVX mode.
 *
 * If length is smaller than basis/2 this function will not do anything.
 */
void ff_tx_gen_split_radix_parity_revtab(int *revtab, int len, int inv,
                                         int basis, int dual_stride);
|
|
|
|
|
2021-04-10 03:45:03 +02:00
|
|
|
/* Templated init functions */
|
2019-07-27 19:54:20 +02:00
|
|
|
int ff_tx_init_mdct_fft_float(AVTXContext *s, av_tx_fn *tx,
|
|
|
|
enum AVTXType type, int inv, int len,
|
|
|
|
const void *scale, uint64_t flags);
|
|
|
|
int ff_tx_init_mdct_fft_double(AVTXContext *s, av_tx_fn *tx,
|
|
|
|
enum AVTXType type, int inv, int len,
|
|
|
|
const void *scale, uint64_t flags);
|
2020-02-09 01:13:28 +02:00
|
|
|
int ff_tx_init_mdct_fft_int32(AVTXContext *s, av_tx_fn *tx,
|
|
|
|
enum AVTXType type, int inv, int len,
|
|
|
|
const void *scale, uint64_t flags);
|
2019-07-27 19:54:20 +02:00
|
|
|
|
|
|
|
typedef struct CosTabsInitOnce {
|
|
|
|
void (*func)(void);
|
|
|
|
AVOnce control;
|
|
|
|
} CosTabsInitOnce;
|
|
|
|
|
lavu/x86: add FFT assembly
This commit adds a pure x86 assembly SIMD version of the FFT in libavutil/tx.
The design of this pure assembly FFT is pretty unconventional.
On the lowest level, instead of splitting the complex numbers into
real and imaginary parts, we keep complex numbers together but split
them in terms of parity. This saves a number of shuffles in each transform,
but more importantly, it splits each transform into two independent
paths, which we process using separate registers in parallel.
This allows us to keep all units saturated and lets us use all available
registers to avoid dependencies.
Moreover, it allows us to double the granularity of our per-load permutation,
skipping many expensive lookups and allowing us to use just 4 loads per register,
rather than 8, or in case FMA3 (and by extension, AVX2), use the vgatherdpd
instruction, which is at least as fast as 4 separate loads on old hardware,
and quite a bit faster on modern CPUs.
Higher up, we go for a bottom-up construction of large transforms, foregoing
the traditional per-transform call-return recursion chains. Instead, we always
start at the bottom-most basis transform (in this case, a 32-point transform),
and continue constructing larger and larger transforms until we return to the
top-most transform.
This way, we only touch the stack 3 times per a complete target transform:
once for the 1/2 length transform and two times for the 1/4 length transform.
The combination algorithm we use is a standard Split-Radix algorithm,
as used in our C code. Although a version with less operations exists
(Steven G. Johnson and Matteo Frigo's "A modified split-radix FFT with fewer
arithmetic operations", IEEE Trans. Signal Process. 55 (1), 111–119 (2007),
which is the one FFTW uses), it only has 2% less operations and requires at least 4x
the binary code (due to it needing 4 different paths to do a single transform).
That version also has other issues which prevent it from being implemented
with SIMD code as efficiently, which makes it lose the marginal gains it offered,
and cannot be performed bottom-up, requiring many recursive call-return chains,
whose overhead adds up.
We go through a lot of effort to minimize load/stores by keeping as much in
registers in between constructing transforms. This saves us around 32 cycles,
on paper, but in reality a lot more due to load/store aliasing (a load from a
memory location cannot be issued while there's a store pending, and there are
only so many (2 for Zen 3) load/store units in a CPU).
Also, we interleave coefficients during the last stage to save on a store+load
per register.
Each of the smallest, basis transforms (4, 8 and 16-point in our case)
has been extremely optimized. Our 8-point transform is barely 20 instructions
in total, beating our old implementation 8-point transform by 1 instruction.
Our 2x8-point transform is 23 instructions, beating our old implementation by
6 instructions and needing 50% fewer cycles. Our 16-point transform's combination
code takes slightly more instructions than our old implementation, but makes up
for it by requiring a lot less arithmetic operations.
Overall, the transform was optimized for the timings of Zen 3, which at the
time of writing has the most IPC from all documented CPUs. Shuffles were
preferred over arithmetic operations due to their 1/0.5 latency/throughput.
On average, this code is 30% faster than our old libavcodec implementation.
It's able to trade blows with the previously-untouchable FFTW on small transforms,
and due to its tiny size and better prediction, outdoes FFTW on larger transforms
by 11% on the largest currently supported size.
2021-04-10 03:54:22 +02:00
|
|
|
/* x86-specific init hook for the float transforms; receives the context and
 * a pointer to the transform function.
 * NOTE(review): presumably overrides *tx with an assembly implementation
 * when one is available - confirm against the x86 sources. */
void ff_tx_init_float_x86(AVTXContext *s, av_tx_fn *tx);
|
|
|
|
|
2019-07-27 19:54:20 +02:00
|
|
|
#endif /* AVUTIL_TX_PRIV_H */
|