/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_TX_PRIV_H
#define AVUTIL_TX_PRIV_H

#include "tx.h"

#include <math.h>
#include <stddef.h>

#include "attributes.h"
#include "avassert.h"
#include "mem.h"
#include "mem_internal.h"
#include "thread.h"

/*
 * Sample-format selection.
 *
 * Which of TX_FLOAT, TX_DOUBLE or TX_INT32 is defined decides:
 *   - TX_NAME(x):  the suffix pasted onto x (_float, _double or _int32);
 *   - SCALE_TYPE:  float for the float and int32 variants, double for the
 *                  double variant;
 *   - FFTSample / FFTComplex: the scalar and complex sample types.
 *
 * When none of them is defined only FFTComplex is declared, as void, so that
 * the FFTComplex pointers in struct AVTXContext still have a type.
 */
#ifdef TX_FLOAT
#define TX_NAME(x) x ## _float
#define SCALE_TYPE float
typedef float FFTSample;
typedef AVComplexFloat FFTComplex;
#elif defined(TX_DOUBLE)
#define TX_NAME(x) x ## _double
#define SCALE_TYPE double
typedef double FFTSample;
typedef AVComplexDouble FFTComplex;
#elif defined(TX_INT32)
#define TX_NAME(x) x ## _int32
#define SCALE_TYPE float
typedef int32_t FFTSample;
typedef AVComplexInt32 FFTComplex;
#else
typedef void FFTComplex;
#endif

/*
 * Per-format arithmetic helpers.
 *
 * CMUL:    complex multiply, (dre + i*dim) = (are + i*aim) * (bre + i*bim).
 * SMUL:    same real part as CMUL; the imaginary part is
 *          are*bim - aim*bre (the second product is subtracted).
 * UNSCALE: sample value -> real value.
 * RESCALE: real value -> sample value.
 * FOLD:    sum of two values (with rounding and a downshift for int32).
 *
 * In CMUL and SMUL dre is stored before dim is computed, so dre must not
 * name the same object as any of the inputs.
 */
#if defined(TX_FLOAT) || defined(TX_DOUBLE)

#define CMUL(dre, dim, are, aim, bre, bim) do {                                \
        (dre) = (are) * (bre) - (aim) * (bim);                                 \
        (dim) = (are) * (bim) + (aim) * (bre);                                 \
    } while (0)

#define SMUL(dre, dim, are, aim, bre, bim) do {                                \
        (dre) = (are) * (bre) - (aim) * (bim);                                 \
        (dim) = (are) * (bim) - (aim) * (bre);                                 \
    } while (0)

#define UNSCALE(x) (x)
#define RESCALE(x) (x)

#define FOLD(a, b) ((a) + (b))

#elif defined(TX_INT32)

/*
 * Properly rounds the result: products are accumulated in 64 bits and
 * 0x40000000 (half of 1 << 31) is added before the shift down by 31.
 */
#define CMUL(dre, dim, are, aim, bre, bim) do {                                \
        int64_t accu;                                                          \
        (accu)  = (int64_t)(bre) * (are);                                      \
        (accu) -= (int64_t)(bim) * (aim);                                      \
        (dre)   = (int)(((accu) + 0x40000000) >> 31);                          \
        (accu)  = (int64_t)(bim) * (are);                                      \
        (accu) += (int64_t)(bre) * (aim);                                      \
        (dim)   = (int)(((accu) + 0x40000000) >> 31);                          \
    } while (0)

#define SMUL(dre, dim, are, aim, bre, bim) do {                                \
        int64_t accu;                                                          \
        (accu)  = (int64_t)(bre) * (are);                                      \
        (accu) -= (int64_t)(bim) * (aim);                                      \
        (dre)   = (int)(((accu) + 0x40000000) >> 31);                          \
        (accu)  = (int64_t)(bim) * (are);                                      \
        (accu) -= (int64_t)(bre) * (aim);                                      \
        (dim)   = (int)(((accu) + 0x40000000) >> 31);                          \
    } while (0)

/* Divide by 2^31. The argument is parenthesised so that the cast and the
 * division apply to the whole expression, e.g. UNSCALE(a + b). */
#define UNSCALE(x) ((double)(x) / 2147483648.0)

/* Multiply by 2^31, round and clip to the int32 range. llrintf() is used
 * because its long long result can hold 2^31 before the clip; the long
 * returned by lrintf() cannot where long is 32 bits wide.
 * The product is converted to float for the llrintf() call. */
#define RESCALE(x) (av_clip64(llrintf((x) * 2147483648.0), INT32_MIN, INT32_MAX))

/* The sum is formed in unsigned arithmetic, 32 (half of 1 << 6) is added
 * for rounding, and the result is converted to int before the shift by 6. */
#define FOLD(x, y) ((int)((x) + (unsigned)(y) + 32) >> 6)

#endif

/*
 * Butterfly: x = a - b, y = a + b.
 *
 * x is stored before the sum is evaluated, so x must not name the same
 * object as a or b. y may, since a and b are not read again after y is
 * written.
 */
#define BF(x, y, a, b) do {                                                    \
        (x) = (a) - (b);                                                       \
        (y) = (a) + (b);                                                       \
    } while (0)

/* Complex multiply c = a * b on values with .re/.im members, expanded
 * through CMUL. */
#define CMUL3(c, a, b)                                                         \
    CMUL((c).re, (c).im, (a).re, (a).im, (b).re, (b).im)

/* Declares the array TX_NAME(ff_cos_<size>) of size/2 FFTSample entries
 * through DECLARE_ALIGNED with an alignment argument of 32. */
#define COSTABLE(size)                                                         \
    DECLARE_ALIGNED(32, FFTSample, TX_NAME(ff_cos_##size))[(size) / 2]

/* Used by asm, reorder with care */
|
|
|
|
struct AVTXContext {
|
lavu/tx: support in-place FFT transforms
This commit adds support for in-place FFT transforms. Since our
internal transforms were all in-place anyway, this only changes
the permutation on the input.
Unfortunately, research papers were of no help here. All focused
on dry hardware implementations, where permutes are free, or on
software implementations where binary bloat is of no concern so
storing dozen times the transforms for each permutation and version
is not considered bad practice.
Still, for a pure C implementation, it's only around 28% slower
than the multi-megabyte FFTW3 in unaligned mode.
Unlike a closed permutation like with PFA, split-radix FFT bit-reversals
contain multiple NOPs, multiple simple swaps, and a few chained swaps,
so regular single-loop single-state permute loops were not possible.
Instead, we filter out parts of the input indices which are redundant.
This allows for a single branch, and with some clever AVX512 asm,
could possibly be SIMD'd without refactoring.
The inplace_idx array is guaranteed to never be larger than the
revtab array, and in practice only requires around log2(len) entries.
The power-of-two MDCTs can be done in-place as well. And it's
possible to eliminate a copy in the compound MDCTs too, however
it'll be slower than doing them out of place, and we'd need to dirty
the input array.
2021-02-10 17:58:22 +01:00
|
|
|
int n; /* Non-power-of-two part */
|
|
|
|
int m; /* Power-of-two part */
|
|
|
|
int inv; /* Is inverse */
|
2019-07-27 18:54:20 +01:00
|
|
|
int type; /* Type */
|
lavu/tx: support in-place FFT transforms
This commit adds support for in-place FFT transforms. Since our
internal transforms were all in-place anyway, this only changes
the permutation on the input.
Unfortunately, research papers were of no help here. All focused
on dry hardware implementations, where permutes are free, or on
software implementations where binary bloat is of no concern so
storing dozen times the transforms for each permutation and version
is not considered bad practice.
Still, for a pure C implementation, it's only around 28% slower
than the multi-megabyte FFTW3 in unaligned mode.
Unlike a closed permutation like with PFA, split-radix FFT bit-reversals
contain multiple NOPs, multiple simple swaps, and a few chained swaps,
so regular single-loop single-state permute loops were not possible.
Instead, we filter out parts of the input indices which are redundant.
This allows for a single branch, and with some clever AVX512 asm,
could possibly be SIMD'd without refactoring.
The inplace_idx array is guaranteed to never be larger than the
revtab array, and in practice only requires around log2(len) entries.
The power-of-two MDCTs can be done in-place as well. And it's
possible to eliminate a copy in the compound MDCTs too, however
it'll be slower than doing them out of place, and we'd need to dirty
the input array.
2021-02-10 17:58:22 +01:00
|
|
|
uint64_t flags; /* Flags */
|
2021-01-12 08:11:47 +01:00
|
|
|
double scale; /* Scale */
|
2019-07-27 18:54:20 +01:00
|
|
|
|
|
|
|
FFTComplex *exptab; /* MDCT exptab */
|
|
|
|
FFTComplex *tmp; /* Temporary buffer needed for all compound transforms */
|
|
|
|
int *pfatab; /* Input/Output mapping for compound transforms */
|
|
|
|
int *revtab; /* Input mapping for power of two transforms */
|
lavu/tx: support in-place FFT transforms
This commit adds support for in-place FFT transforms. Since our
internal transforms were all in-place anyway, this only changes
the permutation on the input.
Unfortunately, research papers were of no help here. All focused
on dry hardware implementations, where permutes are free, or on
software implementations where binary bloat is of no concern so
storing dozen times the transforms for each permutation and version
is not considered bad practice.
Still, for a pure C implementation, it's only around 28% slower
than the multi-megabyte FFTW3 in unaligned mode.
Unlike a closed permutation like with PFA, split-radix FFT bit-reversals
contain multiple NOPs, multiple simple swaps, and a few chained swaps,
so regular single-loop single-state permute loops were not possible.
Instead, we filter out parts of the input indices which are redundant.
This allows for a single branch, and with some clever AVX512 asm,
could possibly be SIMD'd without refactoring.
The inplace_idx array is guaranteed to never be larger than the
revtab array, and in practice only requires around log2(len) entries.
The power-of-two MDCTs can be done in-place as well. And it's
possible to eliminate a copy in the compound MDCTs too, however
it'll be slower than doing them out of place, and we'd need to dirty
the input array.
2021-02-10 17:58:22 +01:00
|
|
|
int *inplace_idx; /* Required indices to revtab for in-place transforms */
|
2019-07-27 18:54:20 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
/* Shared functions */
|
2020-02-08 23:13:28 +00:00
|
|
|
int ff_tx_type_is_mdct(enum AVTXType type);
|
2019-07-27 18:54:20 +01:00
|
|
|
int ff_tx_gen_compound_mapping(AVTXContext *s);
|
2021-02-27 04:11:04 +01:00
|
|
|
int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup);
|
lavu/tx: support in-place FFT transforms
This commit adds support for in-place FFT transforms. Since our
internal transforms were all in-place anyway, this only changes
the permutation on the input.
Unfortunately, research papers were of no help here. All focused
on dry hardware implementations, where permutes are free, or on
software implementations where binary bloat is of no concern so
storing dozen times the transforms for each permutation and version
is not considered bad practice.
Still, for a pure C implementation, it's only around 28% slower
than the multi-megabyte FFTW3 in unaligned mode.
Unlike a closed permutation like with PFA, split-radix FFT bit-reversals
contain multiple NOPs, multiple simple swaps, and a few chained swaps,
so regular single-loop single-state permute loops were not possible.
Instead, we filter out parts of the input indices which are redundant.
This allows for a single branch, and with some clever AVX512 asm,
could possibly be SIMD'd without refactoring.
The inplace_idx array is guaranteed to never be larger than the
revtab array, and in practice only requires around log2(len) entries.
The power-of-two MDCTs can be done in-place as well. And it's
possible to eliminate a copy in the compound MDCTs too, however
it'll be slower than doing them out of place, and we'd need to dirty
the input array.
2021-02-10 17:58:22 +01:00
|
|
|
int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s);
|
2019-07-27 18:54:20 +01:00
|
|
|
|
|
|
|
/* Also used by SIMD init */
/**
 * Recursively compute the permuted index of position i for a transform of
 * length n.
 *
 * At each level the bit n/2 of i is tested: if it is clear, the result is
 * twice the permutation of i at length n/2; otherwise bit n/4 of i (together
 * with inverse) selects between four times the permutation at length n/4
 * plus one or minus one.
 *
 * @param i       index to permute; only its bits below n are examined
 * @param n       transform length (presumably a power of two - confirm)
 * @param inverse compared against a 0/1 value, so pass exactly 0 or 1
 * @return the permuted index. It is NOT reduced to [0, n): for example
 *         i = 2, n = 4, inverse = 0 yields -1 and i = 3 yields 5, so the
 *         caller has to wrap the value itself.
 */
static inline int split_radix_permutation(int i, int n, int inverse)
{
    int m;

    if (n <= 2)
        return i & 1;

    m = n >> 1;
    if (!(i & m))
        return split_radix_permutation(i, m, inverse) * 2;

    m >>= 1;
    if (inverse == !(i & m))
        return split_radix_permutation(i, m, inverse) * 4 + 1;
    else
        return split_radix_permutation(i, m, inverse) * 4 - 1;
}

/* Templated functions */
|
|
|
|
int ff_tx_init_mdct_fft_float(AVTXContext *s, av_tx_fn *tx,
|
|
|
|
enum AVTXType type, int inv, int len,
|
|
|
|
const void *scale, uint64_t flags);
|
|
|
|
int ff_tx_init_mdct_fft_double(AVTXContext *s, av_tx_fn *tx,
|
|
|
|
enum AVTXType type, int inv, int len,
|
|
|
|
const void *scale, uint64_t flags);
|
2020-02-08 23:13:28 +00:00
|
|
|
int ff_tx_init_mdct_fft_int32(AVTXContext *s, av_tx_fn *tx,
|
|
|
|
enum AVTXType type, int inv, int len,
|
|
|
|
const void *scale, uint64_t flags);
|
2019-07-27 18:54:20 +01:00
|
|
|
|
|
|
|
typedef struct CosTabsInitOnce {
|
|
|
|
void (*func)(void);
|
|
|
|
AVOnce control;
|
|
|
|
} CosTabsInitOnce;
|
|
|
|
|
|
|
|
#endif /* AVUTIL_TX_PRIV_H */