mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-05-29 21:47:48 +02:00
lavu/tx: rewrite internal code as a tree-based codelet constructor
This commit rewrites the internal transform code into a constructor that stitches transforms (codelets). This allows transforms to reuse arbitrary parts of other transforms, and allows transforms to be stacked onto one another (such as a full iMDCT using a half-iMDCT which in turn uses an FFT). It also permits each step to be individually replaced by assembly or a custom implementation (such as an ASIC).
This commit is contained in:
parent
c14976be04
commit
ef4bd81615
594
libavutil/tx.c
594
libavutil/tx.c
@ -16,19 +16,16 @@
|
|||||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "cpu.h"
|
||||||
|
#include "qsort.h"
|
||||||
|
#include "bprint.h"
|
||||||
|
|
||||||
#include "tx_priv.h"
|
#include "tx_priv.h"
|
||||||
|
|
||||||
int ff_tx_type_is_mdct(enum AVTXType type)
|
#define TYPE_IS(type, x) \
|
||||||
{
|
(((x) == AV_TX_FLOAT_ ## type) || \
|
||||||
switch (type) {
|
((x) == AV_TX_DOUBLE_ ## type) || \
|
||||||
case AV_TX_FLOAT_MDCT:
|
((x) == AV_TX_INT32_ ## type))
|
||||||
case AV_TX_DOUBLE_MDCT:
|
|
||||||
case AV_TX_INT32_MDCT:
|
|
||||||
return 1;
|
|
||||||
default:
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Calculates the modular multiplicative inverse */
|
/* Calculates the modular multiplicative inverse */
|
||||||
static av_always_inline int mulinv(int n, int m)
|
static av_always_inline int mulinv(int n, int m)
|
||||||
@ -42,22 +39,26 @@ static av_always_inline int mulinv(int n, int m)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Guaranteed to work for any n, m where gcd(n, m) == 1 */
|
/* Guaranteed to work for any n, m where gcd(n, m) == 1 */
|
||||||
int ff_tx_gen_compound_mapping(AVTXContext *s)
|
int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m)
|
||||||
{
|
{
|
||||||
int *in_map, *out_map;
|
int *in_map, *out_map;
|
||||||
const int n = s->n;
|
const int inv = s->inv;
|
||||||
const int m = s->m;
|
const int len = n*m; /* Will not be equal to s->len for MDCTs */
|
||||||
const int inv = s->inv;
|
const int mdct = TYPE_IS(MDCT, s->type);
|
||||||
const int len = n*m;
|
int m_inv, n_inv;
|
||||||
const int m_inv = mulinv(m, n);
|
|
||||||
const int n_inv = mulinv(n, m);
|
|
||||||
const int mdct = ff_tx_type_is_mdct(s->type);
|
|
||||||
|
|
||||||
if (!(s->pfatab = av_malloc(2*len*sizeof(*s->pfatab))))
|
/* Make sure the numbers are coprime */
|
||||||
|
if (av_gcd(n, m) != 1)
|
||||||
|
return AVERROR(EINVAL);
|
||||||
|
|
||||||
|
m_inv = mulinv(m, n);
|
||||||
|
n_inv = mulinv(n, m);
|
||||||
|
|
||||||
|
if (!(s->map = av_malloc(2*len*sizeof(*s->map))))
|
||||||
return AVERROR(ENOMEM);
|
return AVERROR(ENOMEM);
|
||||||
|
|
||||||
in_map = s->pfatab;
|
in_map = s->map;
|
||||||
out_map = s->pfatab + n*m;
|
out_map = s->map + len;
|
||||||
|
|
||||||
/* Ruritanian map for input, CRT map for output, can be swapped */
|
/* Ruritanian map for input, CRT map for output, can be swapped */
|
||||||
for (int j = 0; j < m; j++) {
|
for (int j = 0; j < m; j++) {
|
||||||
@ -92,48 +93,50 @@ int ff_tx_gen_compound_mapping(AVTXContext *s)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline int split_radix_permutation(int i, int m, int inverse)
|
static inline int split_radix_permutation(int i, int len, int inv)
|
||||||
{
|
{
|
||||||
m >>= 1;
|
len >>= 1;
|
||||||
if (m <= 1)
|
if (len <= 1)
|
||||||
return i & 1;
|
return i & 1;
|
||||||
if (!(i & m))
|
if (!(i & len))
|
||||||
return split_radix_permutation(i, m, inverse) * 2;
|
return split_radix_permutation(i, len, inv) * 2;
|
||||||
m >>= 1;
|
len >>= 1;
|
||||||
return split_radix_permutation(i, m, inverse) * 4 + 1 - 2*(!(i & m) ^ inverse);
|
return split_radix_permutation(i, len, inv) * 4 + 1 - 2*(!(i & len) ^ inv);
|
||||||
}
|
}
|
||||||
|
|
||||||
int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup)
|
int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup)
|
||||||
{
|
{
|
||||||
const int m = s->m, inv = s->inv;
|
int len = s->len;
|
||||||
|
|
||||||
if (!(s->revtab = av_malloc(s->m*sizeof(*s->revtab))))
|
if (!(s->map = av_malloc(len*sizeof(*s->map))))
|
||||||
return AVERROR(ENOMEM);
|
|
||||||
if (!(s->revtab_c = av_malloc(m*sizeof(*s->revtab_c))))
|
|
||||||
return AVERROR(ENOMEM);
|
return AVERROR(ENOMEM);
|
||||||
|
|
||||||
/* Default */
|
if (invert_lookup) {
|
||||||
for (int i = 0; i < m; i++) {
|
for (int i = 0; i < s->len; i++)
|
||||||
int k = -split_radix_permutation(i, m, inv) & (m - 1);
|
s->map[i] = -split_radix_permutation(i, len, s->inv) & (len - 1);
|
||||||
if (invert_lookup)
|
} else {
|
||||||
s->revtab[i] = s->revtab_c[i] = k;
|
for (int i = 0; i < s->len; i++)
|
||||||
else
|
s->map[-split_radix_permutation(i, len, s->inv) & (len - 1)] = i;
|
||||||
s->revtab[i] = s->revtab_c[k] = i;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s, int *revtab)
|
int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s)
|
||||||
{
|
{
|
||||||
int nb_inplace_idx = 0;
|
int *src_map, out_map_idx = 0, len = s->len;
|
||||||
|
|
||||||
if (!(s->inplace_idx = av_malloc(s->m*sizeof(*s->inplace_idx))))
|
if (!s->sub || !s->sub->map)
|
||||||
|
return AVERROR(EINVAL);
|
||||||
|
|
||||||
|
if (!(s->map = av_mallocz(len*sizeof(*s->map))))
|
||||||
return AVERROR(ENOMEM);
|
return AVERROR(ENOMEM);
|
||||||
|
|
||||||
|
src_map = s->sub->map;
|
||||||
|
|
||||||
/* The first coefficient is always already in-place */
|
/* The first coefficient is always already in-place */
|
||||||
for (int src = 1; src < s->m; src++) {
|
for (int src = 1; src < s->len; src++) {
|
||||||
int dst = revtab[src];
|
int dst = src_map[src];
|
||||||
int found = 0;
|
int found = 0;
|
||||||
|
|
||||||
if (dst <= src)
|
if (dst <= src)
|
||||||
@ -143,48 +146,53 @@ int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s, int *revtab)
|
|||||||
* and if so, skips it, since to fully permute a loop we must only
|
* and if so, skips it, since to fully permute a loop we must only
|
||||||
* enter it once. */
|
* enter it once. */
|
||||||
do {
|
do {
|
||||||
for (int j = 0; j < nb_inplace_idx; j++) {
|
for (int j = 0; j < out_map_idx; j++) {
|
||||||
if (dst == s->inplace_idx[j]) {
|
if (dst == s->map[j]) {
|
||||||
found = 1;
|
found = 1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dst = revtab[dst];
|
dst = src_map[dst];
|
||||||
} while (dst != src && !found);
|
} while (dst != src && !found);
|
||||||
|
|
||||||
if (!found)
|
if (!found)
|
||||||
s->inplace_idx[nb_inplace_idx++] = src;
|
s->map[out_map_idx++] = src;
|
||||||
}
|
}
|
||||||
|
|
||||||
s->inplace_idx[nb_inplace_idx++] = 0;
|
s->map[out_map_idx++] = 0;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void parity_revtab_generator(int *revtab, int n, int inv, int offset,
|
static void parity_revtab_generator(int *revtab, int n, int inv, int offset,
|
||||||
int is_dual, int dual_high, int len,
|
int is_dual, int dual_high, int len,
|
||||||
int basis, int dual_stride)
|
int basis, int dual_stride, int inv_lookup)
|
||||||
{
|
{
|
||||||
len >>= 1;
|
len >>= 1;
|
||||||
|
|
||||||
if (len <= basis) {
|
if (len <= basis) {
|
||||||
int k1, k2, *even, *odd, stride;
|
int k1, k2, stride, even_idx, odd_idx;
|
||||||
|
|
||||||
is_dual = is_dual && dual_stride;
|
is_dual = is_dual && dual_stride;
|
||||||
dual_high = is_dual & dual_high;
|
dual_high = is_dual & dual_high;
|
||||||
stride = is_dual ? FFMIN(dual_stride, len) : 0;
|
stride = is_dual ? FFMIN(dual_stride, len) : 0;
|
||||||
|
|
||||||
even = &revtab[offset + dual_high*(stride - 2*len)];
|
even_idx = offset + dual_high*(stride - 2*len);
|
||||||
odd = &even[len + (is_dual && !dual_high)*len + dual_high*len];
|
odd_idx = even_idx + len + (is_dual && !dual_high)*len + dual_high*len;
|
||||||
|
|
||||||
for (int i = 0; i < len; i++) {
|
for (int i = 0; i < len; i++) {
|
||||||
k1 = -split_radix_permutation(offset + i*2 + 0, n, inv) & (n - 1);
|
k1 = -split_radix_permutation(offset + i*2 + 0, n, inv) & (n - 1);
|
||||||
k2 = -split_radix_permutation(offset + i*2 + 1, n, inv) & (n - 1);
|
k2 = -split_radix_permutation(offset + i*2 + 1, n, inv) & (n - 1);
|
||||||
*even++ = k1;
|
if (inv_lookup) {
|
||||||
*odd++ = k2;
|
revtab[even_idx++] = k1;
|
||||||
|
revtab[odd_idx++] = k2;
|
||||||
|
} else {
|
||||||
|
revtab[k1] = even_idx++;
|
||||||
|
revtab[k2] = odd_idx++;
|
||||||
|
}
|
||||||
if (stride && !((i + 1) % stride)) {
|
if (stride && !((i + 1) % stride)) {
|
||||||
even += stride;
|
even_idx += stride;
|
||||||
odd += stride;
|
odd_idx += stride;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -192,22 +200,52 @@ static void parity_revtab_generator(int *revtab, int n, int inv, int offset,
|
|||||||
}
|
}
|
||||||
|
|
||||||
parity_revtab_generator(revtab, n, inv, offset,
|
parity_revtab_generator(revtab, n, inv, offset,
|
||||||
0, 0, len >> 0, basis, dual_stride);
|
0, 0, len >> 0, basis, dual_stride, inv_lookup);
|
||||||
parity_revtab_generator(revtab, n, inv, offset + (len >> 0),
|
parity_revtab_generator(revtab, n, inv, offset + (len >> 0),
|
||||||
1, 0, len >> 1, basis, dual_stride);
|
1, 0, len >> 1, basis, dual_stride, inv_lookup);
|
||||||
parity_revtab_generator(revtab, n, inv, offset + (len >> 0) + (len >> 1),
|
parity_revtab_generator(revtab, n, inv, offset + (len >> 0) + (len >> 1),
|
||||||
1, 1, len >> 1, basis, dual_stride);
|
1, 1, len >> 1, basis, dual_stride, inv_lookup);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ff_tx_gen_split_radix_parity_revtab(int *revtab, int len, int inv,
|
int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int invert_lookup,
|
||||||
int basis, int dual_stride)
|
int basis, int dual_stride)
|
||||||
{
|
{
|
||||||
|
int len = s->len;
|
||||||
|
int inv = s->inv;
|
||||||
|
|
||||||
|
if (!(s->map = av_mallocz(len*sizeof(*s->map))))
|
||||||
|
return AVERROR(ENOMEM);
|
||||||
|
|
||||||
basis >>= 1;
|
basis >>= 1;
|
||||||
if (len < basis)
|
if (len < basis)
|
||||||
return;
|
return AVERROR(EINVAL);
|
||||||
|
|
||||||
av_assert0(!dual_stride || !(dual_stride & (dual_stride - 1)));
|
av_assert0(!dual_stride || !(dual_stride & (dual_stride - 1)));
|
||||||
av_assert0(dual_stride <= basis);
|
av_assert0(dual_stride <= basis);
|
||||||
parity_revtab_generator(revtab, len, inv, 0, 0, 0, len, basis, dual_stride);
|
parity_revtab_generator(s->map, len, inv, 0, 0, 0, len,
|
||||||
|
basis, dual_stride, invert_lookup);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void reset_ctx(AVTXContext *s)
|
||||||
|
{
|
||||||
|
if (!s)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (s->sub)
|
||||||
|
for (int i = 0; i < s->nb_sub; i++)
|
||||||
|
reset_ctx(&s->sub[i]);
|
||||||
|
|
||||||
|
if (s->cd_self->uninit)
|
||||||
|
s->cd_self->uninit(s);
|
||||||
|
|
||||||
|
av_freep(&s->sub);
|
||||||
|
av_freep(&s->map);
|
||||||
|
av_freep(&s->exp);
|
||||||
|
av_freep(&s->tmp);
|
||||||
|
|
||||||
|
memset(s, 0, sizeof(*s));
|
||||||
}
|
}
|
||||||
|
|
||||||
av_cold void av_tx_uninit(AVTXContext **ctx)
|
av_cold void av_tx_uninit(AVTXContext **ctx)
|
||||||
@ -215,53 +253,401 @@ av_cold void av_tx_uninit(AVTXContext **ctx)
|
|||||||
if (!(*ctx))
|
if (!(*ctx))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
av_free((*ctx)->pfatab);
|
reset_ctx(*ctx);
|
||||||
av_free((*ctx)->exptab);
|
|
||||||
av_free((*ctx)->revtab);
|
|
||||||
av_free((*ctx)->revtab_c);
|
|
||||||
av_free((*ctx)->inplace_idx);
|
|
||||||
av_free((*ctx)->tmp);
|
|
||||||
|
|
||||||
av_freep(ctx);
|
av_freep(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static av_cold int ff_tx_null_init(AVTXContext *s, const FFTXCodelet *cd,
|
||||||
|
uint64_t flags, FFTXCodeletOptions *opts,
|
||||||
|
int len, int inv, const void *scale)
|
||||||
|
{
|
||||||
|
/* Can only handle one sample+type to one sample+type transforms */
|
||||||
|
if (TYPE_IS(MDCT, s->type))
|
||||||
|
return AVERROR(EINVAL);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Null transform when the length is 1 */
|
||||||
|
static void ff_tx_null(AVTXContext *s, void *_out, void *_in, ptrdiff_t stride)
|
||||||
|
{
|
||||||
|
memcpy(_out, _in, stride);
|
||||||
|
}
|
||||||
|
|
||||||
|
static const FFTXCodelet ff_tx_null_def = {
|
||||||
|
.name = "null",
|
||||||
|
.function = ff_tx_null,
|
||||||
|
.type = TX_TYPE_ANY,
|
||||||
|
.flags = AV_TX_UNALIGNED | FF_TX_ALIGNED |
|
||||||
|
FF_TX_OUT_OF_PLACE | AV_TX_INPLACE,
|
||||||
|
.factors[0] = TX_FACTOR_ANY,
|
||||||
|
.min_len = 1,
|
||||||
|
.max_len = 1,
|
||||||
|
.init = ff_tx_null_init,
|
||||||
|
.cpu_flags = FF_TX_CPU_FLAGS_ALL,
|
||||||
|
.prio = FF_TX_PRIO_MAX,
|
||||||
|
};
|
||||||
|
|
||||||
|
static const FFTXCodelet * const ff_tx_null_list[] = {
|
||||||
|
&ff_tx_null_def,
|
||||||
|
NULL,
|
||||||
|
};
|
||||||
|
|
||||||
|
static void print_flags(AVBPrint *bp, uint64_t f)
|
||||||
|
{
|
||||||
|
int prev = 0;
|
||||||
|
const char *sep = ", ";
|
||||||
|
av_bprintf(bp, "flags: [");
|
||||||
|
if ((f & FF_TX_ALIGNED) && ++prev)
|
||||||
|
av_bprintf(bp, "aligned");
|
||||||
|
if ((f & AV_TX_UNALIGNED) && ++prev)
|
||||||
|
av_bprintf(bp, "%sunaligned", prev > 1 ? sep : "");
|
||||||
|
if ((f & AV_TX_INPLACE) && ++prev)
|
||||||
|
av_bprintf(bp, "%sinplace", prev > 1 ? sep : "");
|
||||||
|
if ((f & FF_TX_OUT_OF_PLACE) && ++prev)
|
||||||
|
av_bprintf(bp, "%sout_of_place", prev > 1 ? sep : "");
|
||||||
|
if ((f & FF_TX_FORWARD_ONLY) && ++prev)
|
||||||
|
av_bprintf(bp, "%sfwd_only", prev > 1 ? sep : "");
|
||||||
|
if ((f & FF_TX_INVERSE_ONLY) && ++prev)
|
||||||
|
av_bprintf(bp, "%sinv_only", prev > 1 ? sep : "");
|
||||||
|
if ((f & FF_TX_PRESHUFFLE) && ++prev)
|
||||||
|
av_bprintf(bp, "%spreshuf", prev > 1 ? sep : "");
|
||||||
|
if ((f & AV_TX_FULL_IMDCT) && ++prev)
|
||||||
|
av_bprintf(bp, "%simdct_full", prev > 1 ? sep : "");
|
||||||
|
av_bprintf(bp, "]");
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_type(AVBPrint *bp, enum AVTXType type)
|
||||||
|
{
|
||||||
|
av_bprintf(bp, "%s",
|
||||||
|
type == TX_TYPE_ANY ? "any" :
|
||||||
|
type == AV_TX_FLOAT_FFT ? "fft_float" :
|
||||||
|
type == AV_TX_FLOAT_MDCT ? "mdct_float" :
|
||||||
|
type == AV_TX_DOUBLE_FFT ? "fft_double" :
|
||||||
|
type == AV_TX_DOUBLE_MDCT ? "mdct_double" :
|
||||||
|
type == AV_TX_INT32_FFT ? "fft_int32" :
|
||||||
|
type == AV_TX_INT32_MDCT ? "mdct_int32" :
|
||||||
|
"unknown");
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_cd_info(const FFTXCodelet *cd, int prio, int print_prio)
|
||||||
|
{
|
||||||
|
AVBPrint bp = { 0 };
|
||||||
|
av_bprint_init(&bp, 0, AV_BPRINT_SIZE_AUTOMATIC);
|
||||||
|
|
||||||
|
av_bprintf(&bp, "%s - type: ", cd->name);
|
||||||
|
|
||||||
|
print_type(&bp, cd->type);
|
||||||
|
|
||||||
|
av_bprintf(&bp, ", len: ");
|
||||||
|
if (cd->min_len != cd->max_len)
|
||||||
|
av_bprintf(&bp, "[%i, ", cd->min_len);
|
||||||
|
|
||||||
|
if (cd->max_len == TX_LEN_UNLIMITED)
|
||||||
|
av_bprintf(&bp, "∞");
|
||||||
|
else
|
||||||
|
av_bprintf(&bp, "%i", cd->max_len);
|
||||||
|
|
||||||
|
av_bprintf(&bp, "%s, factors: [", cd->min_len != cd->max_len ? "]" : "");
|
||||||
|
for (int i = 0; i < TX_MAX_SUB; i++) {
|
||||||
|
if (i && cd->factors[i])
|
||||||
|
av_bprintf(&bp, ", ");
|
||||||
|
if (cd->factors[i] == TX_FACTOR_ANY)
|
||||||
|
av_bprintf(&bp, "any");
|
||||||
|
else if (cd->factors[i])
|
||||||
|
av_bprintf(&bp, "%i", cd->factors[i]);
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
av_bprintf(&bp, "], ");
|
||||||
|
print_flags(&bp, cd->flags);
|
||||||
|
|
||||||
|
if (print_prio)
|
||||||
|
av_bprintf(&bp, ", prio: %i", prio);
|
||||||
|
|
||||||
|
av_log(NULL, AV_LOG_VERBOSE, "%s\n", bp.str);
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef struct TXCodeletMatch {
|
||||||
|
const FFTXCodelet *cd;
|
||||||
|
int prio;
|
||||||
|
} TXCodeletMatch;
|
||||||
|
|
||||||
|
static int cmp_matches(TXCodeletMatch *a, TXCodeletMatch *b)
|
||||||
|
{
|
||||||
|
return FFDIFFSIGN(b->prio, a->prio);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* We want all factors to completely cover the length */
|
||||||
|
static inline int check_cd_factors(const FFTXCodelet *cd, int len)
|
||||||
|
{
|
||||||
|
int all_flag = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < TX_MAX_SUB; i++) {
|
||||||
|
int factor = cd->factors[i];
|
||||||
|
|
||||||
|
/* Conditions satisfied */
|
||||||
|
if (len == 1)
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
/* No more factors */
|
||||||
|
if (!factor) {
|
||||||
|
break;
|
||||||
|
} else if (factor == TX_FACTOR_ANY) {
|
||||||
|
all_flag = 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (factor == 2) { /* Fast path */
|
||||||
|
int bits_2 = ff_ctz(len);
|
||||||
|
if (!bits_2)
|
||||||
|
return 0; /* Factor not supported */
|
||||||
|
|
||||||
|
len >>= bits_2;
|
||||||
|
} else {
|
||||||
|
int res = len % factor;
|
||||||
|
if (res)
|
||||||
|
return 0; /* Factor not supported */
|
||||||
|
|
||||||
|
while (!res) {
|
||||||
|
len /= factor;
|
||||||
|
res = len % factor;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return all_flag || (len == 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
|
||||||
|
uint64_t flags, FFTXCodeletOptions *opts,
|
||||||
|
int len, int inv, const void *scale)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
AVTXContext *sub = NULL;
|
||||||
|
TXCodeletMatch *cd_tmp, *cd_matches = NULL;
|
||||||
|
unsigned int cd_matches_size = 0;
|
||||||
|
int nb_cd_matches = 0;
|
||||||
|
AVBPrint bp = { 0 };
|
||||||
|
|
||||||
|
/* Array of all compiled codelet lists. Order is irrelevant. */
|
||||||
|
const FFTXCodelet * const * const codelet_list[] = {
|
||||||
|
ff_tx_codelet_list_float_c,
|
||||||
|
ff_tx_codelet_list_double_c,
|
||||||
|
ff_tx_codelet_list_int32_c,
|
||||||
|
ff_tx_null_list,
|
||||||
|
#if ARCH_X86
|
||||||
|
ff_tx_codelet_list_float_x86,
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
int codelet_list_num = FF_ARRAY_ELEMS(codelet_list);
|
||||||
|
|
||||||
|
/* We still accept functions marked with SLOW, even if the CPU is
|
||||||
|
* marked with the same flag, but we give them lower priority. */
|
||||||
|
const int cpu_flags = av_get_cpu_flags();
|
||||||
|
const int slow_mask = AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE3SLOW |
|
||||||
|
AV_CPU_FLAG_ATOM | AV_CPU_FLAG_SSSE3SLOW |
|
||||||
|
AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER;
|
||||||
|
|
||||||
|
/* Flags the transform wants */
|
||||||
|
uint64_t req_flags = flags;
|
||||||
|
|
||||||
|
/* Unaligned codelets are compatible with the aligned flag */
|
||||||
|
if (req_flags & FF_TX_ALIGNED)
|
||||||
|
req_flags |= AV_TX_UNALIGNED;
|
||||||
|
|
||||||
|
/* If either flag is set, both are okay, so don't check for an exact match */
|
||||||
|
if ((req_flags & AV_TX_INPLACE) && (req_flags & FF_TX_OUT_OF_PLACE))
|
||||||
|
req_flags &= ~(AV_TX_INPLACE | FF_TX_OUT_OF_PLACE);
|
||||||
|
if ((req_flags & FF_TX_ALIGNED) && (req_flags & AV_TX_UNALIGNED))
|
||||||
|
req_flags &= ~(FF_TX_ALIGNED | AV_TX_UNALIGNED);
|
||||||
|
|
||||||
|
/* Flags the codelet may require to be present */
|
||||||
|
uint64_t inv_req_mask = AV_TX_FULL_IMDCT | FF_TX_PRESHUFFLE;
|
||||||
|
|
||||||
|
/* Loop through all codelets in all codelet lists to find matches
|
||||||
|
* to the requirements */
|
||||||
|
while (codelet_list_num--) {
|
||||||
|
const FFTXCodelet * const * list = codelet_list[codelet_list_num];
|
||||||
|
const FFTXCodelet *cd = NULL;
|
||||||
|
|
||||||
|
while ((cd = *list++)) {
|
||||||
|
int max_factor = 0;
|
||||||
|
|
||||||
|
/* Check if the type matches */
|
||||||
|
if (cd->type != TX_TYPE_ANY && type != cd->type)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* Check direction for non-orthogonal codelets */
|
||||||
|
if (((cd->flags & FF_TX_FORWARD_ONLY) && inv) ||
|
||||||
|
((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* Check if the requested flags match from both sides */
|
||||||
|
if (((req_flags & cd->flags) != (req_flags)) ||
|
||||||
|
((inv_req_mask & cd->flags) != (req_flags & inv_req_mask)))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* Check if length is supported */
|
||||||
|
if ((len < cd->min_len) || (cd->max_len != -1 && (len > cd->max_len)))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* Check if the CPU supports the required ISA */
|
||||||
|
if (!(!cd->cpu_flags || (cpu_flags & (cd->cpu_flags & ~slow_mask))))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* Check for factors */
|
||||||
|
if (!check_cd_factors(cd, len))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* Realloc array and append */
|
||||||
|
cd_tmp = av_fast_realloc(cd_matches, &cd_matches_size,
|
||||||
|
sizeof(*cd_tmp) * (nb_cd_matches + 1));
|
||||||
|
if (!cd_tmp) {
|
||||||
|
av_free(cd_matches);
|
||||||
|
return AVERROR(ENOMEM);
|
||||||
|
}
|
||||||
|
|
||||||
|
cd_matches = cd_tmp;
|
||||||
|
cd_matches[nb_cd_matches].cd = cd;
|
||||||
|
cd_matches[nb_cd_matches].prio = cd->prio;
|
||||||
|
|
||||||
|
/* If the CPU has a SLOW flag, and the instruction is also flagged
|
||||||
|
* as being slow for such, reduce its priority */
|
||||||
|
if ((cpu_flags & cd->cpu_flags) & slow_mask)
|
||||||
|
cd_matches[nb_cd_matches].prio -= 64;
|
||||||
|
|
||||||
|
/* Prioritize aligned-only codelets */
|
||||||
|
if ((cd->flags & FF_TX_ALIGNED) && !(cd->flags & AV_TX_UNALIGNED))
|
||||||
|
cd_matches[nb_cd_matches].prio += 64;
|
||||||
|
|
||||||
|
/* Codelets for specific lengths are generally faster */
|
||||||
|
if ((len == cd->min_len) && (len == cd->max_len))
|
||||||
|
cd_matches[nb_cd_matches].prio += 64;
|
||||||
|
|
||||||
|
/* Forward-only or inverse-only transforms are generally better */
|
||||||
|
if ((cd->flags & (FF_TX_FORWARD_ONLY | FF_TX_INVERSE_ONLY)))
|
||||||
|
cd_matches[nb_cd_matches].prio += 64;
|
||||||
|
|
||||||
|
/* Larger factors are generally better */
|
||||||
|
for (int i = 0; i < TX_MAX_SUB; i++)
|
||||||
|
max_factor = FFMAX(cd->factors[i], max_factor);
|
||||||
|
if (max_factor)
|
||||||
|
cd_matches[nb_cd_matches].prio += 16*max_factor;
|
||||||
|
|
||||||
|
nb_cd_matches++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* No matches found */
|
||||||
|
if (!nb_cd_matches)
|
||||||
|
return AVERROR(ENOSYS);
|
||||||
|
|
||||||
|
/* Sort the list */
|
||||||
|
AV_QSORT(cd_matches, nb_cd_matches, TXCodeletMatch, cmp_matches);
|
||||||
|
|
||||||
|
/* Print debugging info */
|
||||||
|
av_bprint_init(&bp, 0, AV_BPRINT_SIZE_AUTOMATIC);
|
||||||
|
av_bprintf(&bp, "For transform of length %i, %s, ", len,
|
||||||
|
inv ? "inverse" : "forward");
|
||||||
|
print_type(&bp, type);
|
||||||
|
av_bprintf(&bp, ", ");
|
||||||
|
print_flags(&bp, flags);
|
||||||
|
av_bprintf(&bp, ", found %i matches:", nb_cd_matches);
|
||||||
|
av_log(NULL, AV_LOG_VERBOSE, "%s\n", bp.str);
|
||||||
|
|
||||||
|
for (int i = 0; i < nb_cd_matches; i++) {
|
||||||
|
av_log(NULL, AV_LOG_VERBOSE, " %i: ", i + 1);
|
||||||
|
print_cd_info(cd_matches[i].cd, cd_matches[i].prio, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!s->sub)
|
||||||
|
s->sub = sub = av_mallocz(TX_MAX_SUB*sizeof(*sub));
|
||||||
|
|
||||||
|
/* Attempt to initialize each */
|
||||||
|
for (int i = 0; i < nb_cd_matches; i++) {
|
||||||
|
const FFTXCodelet *cd = cd_matches[i].cd;
|
||||||
|
AVTXContext *sctx = &s->sub[s->nb_sub];
|
||||||
|
|
||||||
|
sctx->len = len;
|
||||||
|
sctx->inv = inv;
|
||||||
|
sctx->type = type;
|
||||||
|
sctx->flags = flags;
|
||||||
|
sctx->cd_self = cd;
|
||||||
|
|
||||||
|
s->fn[s->nb_sub] = cd->function;
|
||||||
|
s->cd[s->nb_sub] = cd;
|
||||||
|
|
||||||
|
ret = 0;
|
||||||
|
if (cd->init)
|
||||||
|
ret = cd->init(sctx, cd, flags, opts, len, inv, scale);
|
||||||
|
|
||||||
|
if (ret >= 0) {
|
||||||
|
s->nb_sub++;
|
||||||
|
goto end;
|
||||||
|
}
|
||||||
|
|
||||||
|
s->fn[s->nb_sub] = NULL;
|
||||||
|
s->cd[s->nb_sub] = NULL;
|
||||||
|
|
||||||
|
reset_ctx(sctx);
|
||||||
|
if (ret == AVERROR(ENOMEM))
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
av_free(sub);
|
||||||
|
|
||||||
|
if (ret >= 0)
|
||||||
|
ret = AVERROR(ENOSYS);
|
||||||
|
|
||||||
|
end:
|
||||||
|
av_free(cd_matches);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_tx_structure(AVTXContext *s, int depth)
|
||||||
|
{
|
||||||
|
const FFTXCodelet *cd = s->cd_self;
|
||||||
|
|
||||||
|
for (int i = 0; i <= depth; i++)
|
||||||
|
av_log(NULL, AV_LOG_VERBOSE, " ");
|
||||||
|
|
||||||
|
print_cd_info(cd, cd->prio, 0);
|
||||||
|
|
||||||
|
for (int i = 0; i < s->nb_sub; i++)
|
||||||
|
print_tx_structure(&s->sub[i], depth + 1);
|
||||||
|
}
|
||||||
|
|
||||||
av_cold int av_tx_init(AVTXContext **ctx, av_tx_fn *tx, enum AVTXType type,
|
av_cold int av_tx_init(AVTXContext **ctx, av_tx_fn *tx, enum AVTXType type,
|
||||||
int inv, int len, const void *scale, uint64_t flags)
|
int inv, int len, const void *scale, uint64_t flags)
|
||||||
{
|
{
|
||||||
int err;
|
int ret;
|
||||||
AVTXContext *s = av_mallocz(sizeof(*s));
|
AVTXContext tmp = { 0 };
|
||||||
if (!s)
|
const double default_scale_d = 1.0;
|
||||||
return AVERROR(ENOMEM);
|
const float default_scale_f = 1.0f;
|
||||||
|
|
||||||
switch (type) {
|
if (!len || type >= AV_TX_NB || !ctx || !tx)
|
||||||
case AV_TX_FLOAT_FFT:
|
return AVERROR(EINVAL);
|
||||||
case AV_TX_FLOAT_MDCT:
|
|
||||||
if ((err = ff_tx_init_mdct_fft_float(s, tx, type, inv, len, scale, flags)))
|
|
||||||
goto fail;
|
|
||||||
if (ARCH_X86)
|
|
||||||
ff_tx_init_float_x86(s, tx);
|
|
||||||
break;
|
|
||||||
case AV_TX_DOUBLE_FFT:
|
|
||||||
case AV_TX_DOUBLE_MDCT:
|
|
||||||
if ((err = ff_tx_init_mdct_fft_double(s, tx, type, inv, len, scale, flags)))
|
|
||||||
goto fail;
|
|
||||||
break;
|
|
||||||
case AV_TX_INT32_FFT:
|
|
||||||
case AV_TX_INT32_MDCT:
|
|
||||||
if ((err = ff_tx_init_mdct_fft_int32(s, tx, type, inv, len, scale, flags)))
|
|
||||||
goto fail;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
err = AVERROR(EINVAL);
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
|
|
||||||
*ctx = s;
|
if (!(flags & AV_TX_UNALIGNED))
|
||||||
|
flags |= FF_TX_ALIGNED;
|
||||||
|
if (!(flags & AV_TX_INPLACE))
|
||||||
|
flags |= FF_TX_OUT_OF_PLACE;
|
||||||
|
|
||||||
return 0;
|
if (!scale && ((type == AV_TX_FLOAT_MDCT) || (type == AV_TX_INT32_MDCT)))
|
||||||
|
scale = &default_scale_f;
|
||||||
|
else if (!scale && (type == AV_TX_DOUBLE_MDCT))
|
||||||
|
scale = &default_scale_d;
|
||||||
|
|
||||||
fail:
|
ret = ff_tx_init_subtx(&tmp, type, flags, NULL, len, inv, scale);
|
||||||
av_tx_uninit(&s);
|
if (ret < 0)
|
||||||
*tx = NULL;
|
return ret;
|
||||||
return err;
|
|
||||||
|
*ctx = &tmp.sub[0];
|
||||||
|
*tx = tmp.fn[0];
|
||||||
|
|
||||||
|
av_log(NULL, AV_LOG_VERBOSE, "Transform tree:\n");
|
||||||
|
print_tx_structure(*ctx, 0);
|
||||||
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -52,6 +52,7 @@ enum AVTXType {
|
|||||||
* Standard MDCT with a sample data type of float, double or int32_t,
|
* Standard MDCT with a sample data type of float, double or int32_t,
|
||||||
* respectively. For the float and int32 variants, the scale type is
|
* respectively. For the float and int32 variants, the scale type is
|
||||||
* 'float', while for the double variant, it's 'double'.
|
* 'float', while for the double variant, it's 'double'.
|
||||||
|
* If scale is NULL, 1.0 will be used as a default.
|
||||||
*
|
*
|
||||||
* Length is the frame size, not the window size (which is 2x frame).
|
* Length is the frame size, not the window size (which is 2x frame).
|
||||||
* For forward transforms, the stride specifies the spacing between each
|
* For forward transforms, the stride specifies the spacing between each
|
||||||
@ -67,6 +68,9 @@ enum AVTXType {
|
|||||||
AV_TX_FLOAT_MDCT = 1,
|
AV_TX_FLOAT_MDCT = 1,
|
||||||
AV_TX_DOUBLE_MDCT = 3,
|
AV_TX_DOUBLE_MDCT = 3,
|
||||||
AV_TX_INT32_MDCT = 5,
|
AV_TX_INT32_MDCT = 5,
|
||||||
|
|
||||||
|
/* Not part of the API, do not use */
|
||||||
|
AV_TX_NB,
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -25,36 +25,48 @@
|
|||||||
#include "attributes.h"
|
#include "attributes.h"
|
||||||
|
|
||||||
#ifdef TX_FLOAT
|
#ifdef TX_FLOAT
|
||||||
#define TX_NAME(x) x ## _float
|
#define TX_TAB(x) x ## _float
|
||||||
|
#define TX_NAME(x) x ## _float_c
|
||||||
|
#define TX_NAME_STR(x) x "_float_c"
|
||||||
|
#define TX_TYPE(x) AV_TX_FLOAT_ ## x
|
||||||
|
#define MULT(x, m) ((x) * (m))
|
||||||
#define SCALE_TYPE float
|
#define SCALE_TYPE float
|
||||||
typedef float FFTSample;
|
typedef float TXSample;
|
||||||
typedef AVComplexFloat FFTComplex;
|
typedef AVComplexFloat TXComplex;
|
||||||
#elif defined(TX_DOUBLE)
|
#elif defined(TX_DOUBLE)
|
||||||
#define TX_NAME(x) x ## _double
|
#define TX_TAB(x) x ## _double
|
||||||
|
#define TX_NAME(x) x ## _double_c
|
||||||
|
#define TX_NAME_STR(x) x "_double_c"
|
||||||
|
#define TX_TYPE(x) AV_TX_DOUBLE_ ## x
|
||||||
|
#define MULT(x, m) ((x) * (m))
|
||||||
#define SCALE_TYPE double
|
#define SCALE_TYPE double
|
||||||
typedef double FFTSample;
|
typedef double TXSample;
|
||||||
typedef AVComplexDouble FFTComplex;
|
typedef AVComplexDouble TXComplex;
|
||||||
#elif defined(TX_INT32)
|
#elif defined(TX_INT32)
|
||||||
#define TX_NAME(x) x ## _int32
|
#define TX_TAB(x) x ## _int32
|
||||||
|
#define TX_NAME(x) x ## _int32_c
|
||||||
|
#define TX_NAME_STR(x) x "_int32_c"
|
||||||
|
#define TX_TYPE(x) AV_TX_INT32_ ## x
|
||||||
|
#define MULT(x, m) (((((int64_t)(x)) * (int64_t)(m)) + 0x40000000) >> 31)
|
||||||
#define SCALE_TYPE float
|
#define SCALE_TYPE float
|
||||||
typedef int32_t FFTSample;
|
typedef int32_t TXSample;
|
||||||
typedef AVComplexInt32 FFTComplex;
|
typedef AVComplexInt32 TXComplex;
|
||||||
#else
|
#else
|
||||||
typedef void FFTComplex;
|
typedef void TXComplex;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(TX_FLOAT) || defined(TX_DOUBLE)
|
#if defined(TX_FLOAT) || defined(TX_DOUBLE)
|
||||||
|
|
||||||
#define CMUL(dre, dim, are, aim, bre, bim) \
|
#define CMUL(dre, dim, are, aim, bre, bim) \
|
||||||
do { \
|
do { \
|
||||||
(dre) = (are) * (bre) - (aim) * (bim); \
|
(dre) = (are) * (bre) - (aim) * (bim); \
|
||||||
(dim) = (are) * (bim) + (aim) * (bre); \
|
(dim) = (are) * (bim) + (aim) * (bre); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define SMUL(dre, dim, are, aim, bre, bim) \
|
#define SMUL(dre, dim, are, aim, bre, bim) \
|
||||||
do { \
|
do { \
|
||||||
(dre) = (are) * (bre) - (aim) * (bim); \
|
(dre) = (are) * (bre) - (aim) * (bim); \
|
||||||
(dim) = (are) * (bim) - (aim) * (bre); \
|
(dim) = (are) * (bim) - (aim) * (bre); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define UNSCALE(x) (x)
|
#define UNSCALE(x) (x)
|
||||||
@ -65,91 +77,167 @@ typedef void FFTComplex;
|
|||||||
#elif defined(TX_INT32)
|
#elif defined(TX_INT32)
|
||||||
|
|
||||||
/* Properly rounds the result */
|
/* Properly rounds the result */
|
||||||
#define CMUL(dre, dim, are, aim, bre, bim) \
|
#define CMUL(dre, dim, are, aim, bre, bim) \
|
||||||
do { \
|
do { \
|
||||||
int64_t accu; \
|
int64_t accu; \
|
||||||
(accu) = (int64_t)(bre) * (are); \
|
(accu) = (int64_t)(bre) * (are); \
|
||||||
(accu) -= (int64_t)(bim) * (aim); \
|
(accu) -= (int64_t)(bim) * (aim); \
|
||||||
(dre) = (int)(((accu) + 0x40000000) >> 31); \
|
(dre) = (int)(((accu) + 0x40000000) >> 31); \
|
||||||
(accu) = (int64_t)(bim) * (are); \
|
(accu) = (int64_t)(bim) * (are); \
|
||||||
(accu) += (int64_t)(bre) * (aim); \
|
(accu) += (int64_t)(bre) * (aim); \
|
||||||
(dim) = (int)(((accu) + 0x40000000) >> 31); \
|
(dim) = (int)(((accu) + 0x40000000) >> 31); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define SMUL(dre, dim, are, aim, bre, bim) \
|
#define SMUL(dre, dim, are, aim, bre, bim) \
|
||||||
do { \
|
do { \
|
||||||
int64_t accu; \
|
int64_t accu; \
|
||||||
(accu) = (int64_t)(bre) * (are); \
|
(accu) = (int64_t)(bre) * (are); \
|
||||||
(accu) -= (int64_t)(bim) * (aim); \
|
(accu) -= (int64_t)(bim) * (aim); \
|
||||||
(dre) = (int)(((accu) + 0x40000000) >> 31); \
|
(dre) = (int)(((accu) + 0x40000000) >> 31); \
|
||||||
(accu) = (int64_t)(bim) * (are); \
|
(accu) = (int64_t)(bim) * (are); \
|
||||||
(accu) -= (int64_t)(bre) * (aim); \
|
(accu) -= (int64_t)(bre) * (aim); \
|
||||||
(dim) = (int)(((accu) + 0x40000000) >> 31); \
|
(dim) = (int)(((accu) + 0x40000000) >> 31); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define UNSCALE(x) ((double)x/2147483648.0)
|
#define UNSCALE(x) ((double)(x)/2147483648.0)
|
||||||
#define RESCALE(x) (av_clip64(lrintf((x) * 2147483648.0), INT32_MIN, INT32_MAX))
|
#define RESCALE(x) (av_clip64(lrintf((x) * 2147483648.0), INT32_MIN, INT32_MAX))
|
||||||
|
|
||||||
#define FOLD(x, y) ((int)((x) + (unsigned)(y) + 32) >> 6)
|
#define FOLD(x, y) ((int32_t)((x) + (unsigned)(y) + 32) >> 6)
|
||||||
|
|
||||||
#endif
|
#endif /* TX_INT32 */
|
||||||
|
|
||||||
#define BF(x, y, a, b) \
|
#define BF(x, y, a, b) \
|
||||||
do { \
|
do { \
|
||||||
x = (a) - (b); \
|
x = (a) - (b); \
|
||||||
y = (a) + (b); \
|
y = (a) + (b); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define CMUL3(c, a, b) \
|
#define CMUL3(c, a, b) CMUL((c).re, (c).im, (a).re, (a).im, (b).re, (b).im)
|
||||||
CMUL((c).re, (c).im, (a).re, (a).im, (b).re, (b).im)
|
|
||||||
|
|
||||||
#define COSTABLE(size) \
|
/* Codelet flags, used to pick codelets. Must be a superset of enum AVTXFlags,
|
||||||
DECLARE_ALIGNED(32, FFTSample, TX_NAME(ff_cos_##size))[size/4 + 1]
|
* but if it runs out of bits, it can be made separate. */
|
||||||
|
typedef enum FFTXCodeletFlags {
|
||||||
|
FF_TX_OUT_OF_PLACE = (1ULL << 63), /* Can be OR'd with AV_TX_INPLACE */
|
||||||
|
FF_TX_ALIGNED = (1ULL << 62), /* Cannot be OR'd with AV_TX_UNALIGNED */
|
||||||
|
FF_TX_PRESHUFFLE = (1ULL << 61), /* Codelet expects permuted coeffs */
|
||||||
|
FF_TX_INVERSE_ONLY = (1ULL << 60), /* For non-orthogonal inverse-only transforms */
|
||||||
|
FF_TX_FORWARD_ONLY = (1ULL << 59), /* For non-orthogonal forward-only transforms */
|
||||||
|
} FFTXCodeletFlags;
|
||||||
|
|
||||||
|
typedef enum FFTXCodeletPriority {
|
||||||
|
FF_TX_PRIO_BASE = 0, /* Baseline priority */
|
||||||
|
|
||||||
|
/* For SIMD, set base prio to the register size in bits and increment in
|
||||||
|
* steps of 64 depending on faster/slower features, like FMA. */
|
||||||
|
|
||||||
|
FF_TX_PRIO_MIN = -131072, /* For naive implementations */
|
||||||
|
FF_TX_PRIO_MAX = 32768, /* For custom implementations/ASICs */
|
||||||
|
} FFTXCodeletPriority;
|
||||||
|
|
||||||
|
/* Codelet options */
|
||||||
|
typedef struct FFTXCodeletOptions {
|
||||||
|
int invert_lookup; /* If codelet is flagged as FF_TX_PRESHUFFLE,
|
||||||
|
invert the lookup direction for the map generated */
|
||||||
|
} FFTXCodeletOptions;
|
||||||
|
|
||||||
|
/* Maximum amount of subtransform functions, subtransforms and factors. Arbitrary. */
|
||||||
|
#define TX_MAX_SUB 4
|
||||||
|
|
||||||
|
typedef struct FFTXCodelet {
|
||||||
|
const char *name; /* Codelet name, for debugging */
|
||||||
|
av_tx_fn function; /* Codelet function, != NULL */
|
||||||
|
enum AVTXType type; /* Type of codelet transform */
|
||||||
|
#define TX_TYPE_ANY INT32_MAX /* Special type to allow all types */
|
||||||
|
|
||||||
|
uint64_t flags; /* A combination of AVTXFlags and FFTXCodeletFlags
|
||||||
|
* that describe the codelet's properties. */
|
||||||
|
|
||||||
|
int factors[TX_MAX_SUB]; /* Length factors */
|
||||||
|
#define TX_FACTOR_ANY -1 /* When used alone, signals that the codelet
|
||||||
|
* supports all factors. Otherwise, if other
|
||||||
|
* factors are present, it signals that whatever
|
||||||
|
* remains will be supported, as long as the
|
||||||
|
* other factors are a component of the length */
|
||||||
|
|
||||||
|
int min_len; /* Minimum length of transform, must be >= 1 */
|
||||||
|
int max_len; /* Maximum length of transform */
|
||||||
|
#define TX_LEN_UNLIMITED -1 /* Special length value to permit all lengths */
|
||||||
|
|
||||||
|
int (*init)(AVTXContext *s, /* Optional callback for current context initialization. */
|
||||||
|
const struct FFTXCodelet *cd,
|
||||||
|
uint64_t flags,
|
||||||
|
FFTXCodeletOptions *opts,
|
||||||
|
int len, int inv,
|
||||||
|
const void *scale);
|
||||||
|
|
||||||
|
int (*uninit)(AVTXContext *s); /* Optional callback for uninitialization. */
|
||||||
|
|
||||||
|
int cpu_flags; /* CPU flags. If any negative flags like
|
||||||
|
* SLOW are present, will avoid picking.
|
||||||
|
* 0x0 to signal it's a C codelet */
|
||||||
|
#define FF_TX_CPU_FLAGS_ALL 0x0 /* Special CPU flag for C */
|
||||||
|
|
||||||
|
int prio; /* < 0 = least, 0 = no pref, > 0 = prefer */
|
||||||
|
} FFTXCodelet;
|
||||||
|
|
||||||
/* Used by asm, reorder with care */
|
|
||||||
struct AVTXContext {
|
struct AVTXContext {
|
||||||
int n; /* Non-power-of-two part */
|
/* Fields the root transform and subtransforms use or may use.
|
||||||
int m; /* Power-of-two part */
|
* NOTE: This section is used by assembly, do not reorder or change */
|
||||||
int inv; /* Is inverse */
|
int len; /* Length of the transform */
|
||||||
int type; /* Type */
|
int inv; /* If transform is inverse */
|
||||||
uint64_t flags; /* Flags */
|
int *map; /* Lookup table(s) */
|
||||||
double scale; /* Scale */
|
TXComplex *exp; /* Any non-pre-baked multiplication factors needed */
|
||||||
|
TXComplex *tmp; /* Temporary buffer, if needed */
|
||||||
|
|
||||||
FFTComplex *exptab; /* MDCT exptab */
|
AVTXContext *sub; /* Subtransform context(s), if needed */
|
||||||
FFTComplex *tmp; /* Temporary buffer needed for all compound transforms */
|
av_tx_fn fn[TX_MAX_SUB]; /* Function(s) for the subtransforms */
|
||||||
int *pfatab; /* Input/Output mapping for compound transforms */
|
int nb_sub; /* Number of subtransforms.
|
||||||
int *revtab; /* Input mapping for power of two transforms */
|
* The reason all of these are set here
|
||||||
int *inplace_idx; /* Required indices to revtab for in-place transforms */
|
* rather than in each separate context
|
||||||
|
* is to eliminate extra pointer
|
||||||
|
* dereferences. */
|
||||||
|
|
||||||
int *revtab_c; /* Revtab for only the C transforms, needed because
|
/* Fields mainly useful/applicable for the root transform or initialization.
|
||||||
* checkasm makes us reuse the same context. */
|
* Fields below are not used by assembly code. */
|
||||||
|
const FFTXCodelet *cd[TX_MAX_SUB]; /* Subtransform codelets */
|
||||||
av_tx_fn top_tx; /* Used for computing transforms derived from other
|
const FFTXCodelet *cd_self; /* Codelet for the current context */
|
||||||
* transforms, like full-length iMDCTs and RDFTs.
|
enum AVTXType type; /* Type of transform */
|
||||||
* NOTE: Do NOT use this to mix assembly with C code. */
|
uint64_t flags; /* A combination of AVTXFlags
|
||||||
|
and FFTXCodeletFlags flags
|
||||||
|
used when creating */
|
||||||
|
float scale_f;
|
||||||
|
double scale_d;
|
||||||
|
void *opaque; /* Free to use by implementations */
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Checks if type is an MDCT */
|
/* Create a subtransform in the current context with the given parameters.
|
||||||
int ff_tx_type_is_mdct(enum AVTXType type);
|
* The flags parameter from FFTXCodelet.init() should be preserved as much
|
||||||
|
* as that's possible.
|
||||||
|
* MUST be called during the init() callback of each codelet. */
|
||||||
|
int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
|
||||||
|
uint64_t flags, FFTXCodeletOptions *opts,
|
||||||
|
int len, int inv, const void *scale);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Generates the PFA permutation table into AVTXContext->pfatab. The end table
|
* Generates the PFA permutation table into AVTXContext->map. The end table
|
||||||
* is appended to the start table.
|
* is appended to the start table.
|
||||||
*/
|
*/
|
||||||
int ff_tx_gen_compound_mapping(AVTXContext *s);
|
int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Generates a standard-ish (slightly modified) Split-Radix revtab into
|
* Generates a standard-ish (slightly modified) Split-Radix revtab into
|
||||||
* AVTXContext->revtab
|
* AVTXContext->map. Invert lookup changes how the mapping needs to be applied.
|
||||||
|
* If it's set to 0, it has to be applied like out[map[i]] = in[i], otherwise
|
||||||
|
* if it's set to 1, has to be applied as out[i] = in[map[i]]
|
||||||
*/
|
*/
|
||||||
int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup);
|
int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Generates an index into AVTXContext->inplace_idx that if followed in the
|
* Generates an index into AVTXContext->map that if followed in the
|
||||||
* specific order, allows the revtab to be done in-place. AVTXContext->revtab
|
* specific order, allows the revtab to be done in-place. The sub-transform
|
||||||
* must already exist.
|
* and its map should already be initialized.
|
||||||
*/
|
*/
|
||||||
int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s, int *revtab);
|
int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This generates a parity-based revtab of length len and direction inv.
|
* This generates a parity-based revtab of length len and direction inv.
|
||||||
@ -179,25 +267,26 @@ int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s, int *revtab);
|
|||||||
*
|
*
|
||||||
* If length is smaller than basis/2 this function will not do anything.
|
* If length is smaller than basis/2 this function will not do anything.
|
||||||
*/
|
*/
|
||||||
void ff_tx_gen_split_radix_parity_revtab(int *revtab, int len, int inv,
|
int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int invert_lookup,
|
||||||
int basis, int dual_stride);
|
int basis, int dual_stride);
|
||||||
|
|
||||||
/* Templated init functions */
|
/* Typed init function to initialize shared tables. Will initialize all tables
|
||||||
int ff_tx_init_mdct_fft_float(AVTXContext *s, av_tx_fn *tx,
|
* for all factors of a length. */
|
||||||
enum AVTXType type, int inv, int len,
|
void ff_tx_init_tabs_float (int len);
|
||||||
const void *scale, uint64_t flags);
|
void ff_tx_init_tabs_double(int len);
|
||||||
int ff_tx_init_mdct_fft_double(AVTXContext *s, av_tx_fn *tx,
|
void ff_tx_init_tabs_int32 (int len);
|
||||||
enum AVTXType type, int inv, int len,
|
|
||||||
const void *scale, uint64_t flags);
|
|
||||||
int ff_tx_init_mdct_fft_int32(AVTXContext *s, av_tx_fn *tx,
|
|
||||||
enum AVTXType type, int inv, int len,
|
|
||||||
const void *scale, uint64_t flags);
|
|
||||||
|
|
||||||
typedef struct CosTabsInitOnce {
|
/* Typed init function to initialize an MDCT exptab in a context. */
|
||||||
void (*func)(void);
|
int ff_tx_mdct_gen_exp_float (AVTXContext *s);
|
||||||
AVOnce control;
|
int ff_tx_mdct_gen_exp_double(AVTXContext *s);
|
||||||
} CosTabsInitOnce;
|
int ff_tx_mdct_gen_exp_int32 (AVTXContext *s);
|
||||||
|
|
||||||
void ff_tx_init_float_x86(AVTXContext *s, av_tx_fn *tx);
|
/* Lists of codelets */
|
||||||
|
extern const FFTXCodelet * const ff_tx_codelet_list_float_c [];
|
||||||
|
extern const FFTXCodelet * const ff_tx_codelet_list_float_x86 [];
|
||||||
|
|
||||||
|
extern const FFTXCodelet * const ff_tx_codelet_list_double_c [];
|
||||||
|
|
||||||
|
extern const FFTXCodelet * const ff_tx_codelet_list_int32_c [];
|
||||||
|
|
||||||
#endif /* AVUTIL_TX_PRIV_H */
|
#endif /* AVUTIL_TX_PRIV_H */
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -31,6 +31,8 @@
|
|||||||
|
|
||||||
%include "x86util.asm"
|
%include "x86util.asm"
|
||||||
|
|
||||||
|
%define private_prefix ff_tx
|
||||||
|
|
||||||
%if ARCH_X86_64
|
%if ARCH_X86_64
|
||||||
%define ptr resq
|
%define ptr resq
|
||||||
%else
|
%else
|
||||||
@ -39,25 +41,22 @@
|
|||||||
|
|
||||||
%assign i 16
|
%assign i 16
|
||||||
%rep 14
|
%rep 14
|
||||||
cextern cos_ %+ i %+ _float ; ff_cos_i_float...
|
cextern tab_ %+ i %+ _float ; ff_tab_i_float...
|
||||||
%assign i (i << 1)
|
%assign i (i << 1)
|
||||||
%endrep
|
%endrep
|
||||||
|
|
||||||
struc AVTXContext
|
struc AVTXContext
|
||||||
.n: resd 1 ; Non-power-of-two part
|
.len: resd 1 ; Length
|
||||||
.m: resd 1 ; Power-of-two part
|
.inv resd 1 ; Inverse flag
|
||||||
.inv: resd 1 ; Is inverse
|
.map: ptr 1 ; Lookup table(s)
|
||||||
.type: resd 1 ; Type
|
.exp: ptr 1 ; Exponentiation factors
|
||||||
.flags: resq 1 ; Flags
|
.tmp: ptr 1 ; Temporary data
|
||||||
.scale: resq 1 ; Scale
|
|
||||||
|
|
||||||
.exptab: ptr 1 ; MDCT exptab
|
.sub: ptr 1 ; Subcontexts
|
||||||
.tmp: ptr 1 ; Temporary buffer needed for all compound transforms
|
.fn: ptr 4 ; Subcontext functions
|
||||||
.pfatab: ptr 1 ; Input/Output mapping for compound transforms
|
.nb_sub: resd 1 ; Subcontext count
|
||||||
.revtab: ptr 1 ; Input mapping for power of two transforms
|
|
||||||
.inplace_idx: ptr 1 ; Required indices to revtab for in-place transforms
|
|
||||||
|
|
||||||
.top_tx ptr 1 ; Used for transforms derived from other transforms
|
; Everything else is inaccessible
|
||||||
endstruc
|
endstruc
|
||||||
|
|
||||||
SECTION_RODATA 32
|
SECTION_RODATA 32
|
||||||
@ -485,8 +484,8 @@ SECTION .text
|
|||||||
movaps [outq + 10*mmsize], tx1_o0
|
movaps [outq + 10*mmsize], tx1_o0
|
||||||
movaps [outq + 14*mmsize], tx2_o0
|
movaps [outq + 14*mmsize], tx2_o0
|
||||||
|
|
||||||
movaps tw_e, [cos_64_float + mmsize]
|
movaps tw_e, [tab_64_float + mmsize]
|
||||||
vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7 - mmsize], 0x23
|
vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23
|
||||||
|
|
||||||
movaps m0, [outq + 1*mmsize]
|
movaps m0, [outq + 1*mmsize]
|
||||||
movaps m1, [outq + 3*mmsize]
|
movaps m1, [outq + 3*mmsize]
|
||||||
@ -710,8 +709,7 @@ FFT4 inv, 1
|
|||||||
|
|
||||||
INIT_XMM sse3
|
INIT_XMM sse3
|
||||||
cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
|
cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
|
||||||
mov ctxq, [ctxq + AVTXContext.revtab]
|
mov ctxq, [ctxq + AVTXContext.map]
|
||||||
|
|
||||||
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
|
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
|
||||||
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
|
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
|
||||||
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
|
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
|
||||||
@ -733,8 +731,7 @@ cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
|
|||||||
|
|
||||||
INIT_YMM avx
|
INIT_YMM avx
|
||||||
cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
|
cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
|
||||||
mov ctxq, [ctxq + AVTXContext.revtab]
|
mov ctxq, [ctxq + AVTXContext.map]
|
||||||
|
|
||||||
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
|
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
|
||||||
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
|
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
|
||||||
|
|
||||||
@ -754,7 +751,7 @@ cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
|
|||||||
%macro FFT16_FN 1
|
%macro FFT16_FN 1
|
||||||
INIT_YMM %1
|
INIT_YMM %1
|
||||||
cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
|
cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
|
||||||
mov ctxq, [ctxq + AVTXContext.revtab]
|
mov ctxq, [ctxq + AVTXContext.map]
|
||||||
|
|
||||||
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
|
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
|
||||||
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
|
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
|
||||||
@ -786,7 +783,7 @@ FFT16_FN fma3
|
|||||||
%macro FFT32_FN 1
|
%macro FFT32_FN 1
|
||||||
INIT_YMM %1
|
INIT_YMM %1
|
||||||
cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
|
cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
|
||||||
mov ctxq, [ctxq + AVTXContext.revtab]
|
mov ctxq, [ctxq + AVTXContext.map]
|
||||||
|
|
||||||
LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m9
|
LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m9
|
||||||
LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m10, m11
|
LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m10, m11
|
||||||
@ -800,8 +797,8 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
|
|||||||
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m12, m13
|
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m12, m13
|
||||||
LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m14, m15
|
LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m14, m15
|
||||||
|
|
||||||
movaps m8, [cos_32_float]
|
movaps m8, [tab_32_float]
|
||||||
vperm2f128 m9, m9, [cos_32_float + 4*8 - 4*7], 0x23
|
vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23
|
||||||
|
|
||||||
FFT16 m0, m1, m2, m3, m10, m11, m12, m13
|
FFT16 m0, m1, m2, m3, m10, m11, m12, m13
|
||||||
|
|
||||||
@ -858,8 +855,8 @@ ALIGN 16
|
|||||||
POP lenq
|
POP lenq
|
||||||
sub outq, (%1*4) + (%1*2) + (%1/2)
|
sub outq, (%1*4) + (%1*2) + (%1/2)
|
||||||
|
|
||||||
lea rtabq, [cos_ %+ %1 %+ _float]
|
lea rtabq, [tab_ %+ %1 %+ _float]
|
||||||
lea itabq, [cos_ %+ %1 %+ _float + %1 - 4*7]
|
lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7]
|
||||||
|
|
||||||
%if %0 > 1
|
%if %0 > 1
|
||||||
cmp tgtq, %1
|
cmp tgtq, %1
|
||||||
@ -883,9 +880,9 @@ ALIGN 16
|
|||||||
|
|
||||||
%macro FFT_SPLIT_RADIX_FN 1
|
%macro FFT_SPLIT_RADIX_FN 1
|
||||||
INIT_YMM %1
|
INIT_YMM %1
|
||||||
cglobal split_radix_fft_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
|
cglobal fft_sr_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
|
||||||
movsxd lenq, dword [lutq + AVTXContext.m]
|
movsxd lenq, dword [lutq + AVTXContext.len]
|
||||||
mov lutq, [lutq + AVTXContext.revtab]
|
mov lutq, [lutq + AVTXContext.map]
|
||||||
mov tgtq, lenq
|
mov tgtq, lenq
|
||||||
|
|
||||||
; Bottom-most/32-point transform ===============================================
|
; Bottom-most/32-point transform ===============================================
|
||||||
@ -903,8 +900,8 @@ ALIGN 16
|
|||||||
LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m12, m13
|
LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m12, m13
|
||||||
LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m14, m15
|
LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m14, m15
|
||||||
|
|
||||||
movaps m8, [cos_32_float]
|
movaps m8, [tab_32_float]
|
||||||
vperm2f128 m9, m9, [cos_32_float + 32 - 4*7], 0x23
|
vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23
|
||||||
|
|
||||||
FFT16 m0, m1, m2, m3, m10, m11, m12, m13
|
FFT16 m0, m1, m2, m3, m10, m11, m12, m13
|
||||||
|
|
||||||
@ -961,8 +958,8 @@ ALIGN 16
|
|||||||
|
|
||||||
FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o
|
FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o
|
||||||
|
|
||||||
movaps tw_e, [cos_64_float]
|
movaps tw_e, [tab_64_float]
|
||||||
vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7], 0x23
|
vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23
|
||||||
|
|
||||||
add lutq, (mmsize/2)*8
|
add lutq, (mmsize/2)*8
|
||||||
cmp tgtq, 64
|
cmp tgtq, 64
|
||||||
@ -989,8 +986,8 @@ ALIGN 16
|
|||||||
POP lenq
|
POP lenq
|
||||||
sub outq, 24*mmsize
|
sub outq, 24*mmsize
|
||||||
|
|
||||||
lea rtabq, [cos_128_float]
|
lea rtabq, [tab_128_float]
|
||||||
lea itabq, [cos_128_float + 128 - 4*7]
|
lea itabq, [tab_128_float + 128 - 4*7]
|
||||||
|
|
||||||
cmp tgtq, 128
|
cmp tgtq, 128
|
||||||
je .deinterleave
|
je .deinterleave
|
||||||
@ -1016,8 +1013,8 @@ ALIGN 16
|
|||||||
POP lenq
|
POP lenq
|
||||||
sub outq, 48*mmsize
|
sub outq, 48*mmsize
|
||||||
|
|
||||||
lea rtabq, [cos_256_float]
|
lea rtabq, [tab_256_float]
|
||||||
lea itabq, [cos_256_float + 256 - 4*7]
|
lea itabq, [tab_256_float + 256 - 4*7]
|
||||||
|
|
||||||
cmp tgtq, 256
|
cmp tgtq, 256
|
||||||
je .deinterleave
|
je .deinterleave
|
||||||
@ -1044,8 +1041,8 @@ ALIGN 16
|
|||||||
POP lenq
|
POP lenq
|
||||||
sub outq, 96*mmsize
|
sub outq, 96*mmsize
|
||||||
|
|
||||||
lea rtabq, [cos_512_float]
|
lea rtabq, [tab_512_float]
|
||||||
lea itabq, [cos_512_float + 512 - 4*7]
|
lea itabq, [tab_512_float + 512 - 4*7]
|
||||||
|
|
||||||
cmp tgtq, 512
|
cmp tgtq, 512
|
||||||
je .deinterleave
|
je .deinterleave
|
||||||
@ -1079,8 +1076,8 @@ ALIGN 16
|
|||||||
POP lenq
|
POP lenq
|
||||||
sub outq, 192*mmsize
|
sub outq, 192*mmsize
|
||||||
|
|
||||||
lea rtabq, [cos_1024_float]
|
lea rtabq, [tab_1024_float]
|
||||||
lea itabq, [cos_1024_float + 1024 - 4*7]
|
lea itabq, [tab_1024_float + 1024 - 4*7]
|
||||||
|
|
||||||
cmp tgtq, 1024
|
cmp tgtq, 1024
|
||||||
je .deinterleave
|
je .deinterleave
|
||||||
@ -1160,8 +1157,8 @@ FFT_SPLIT_RADIX_DEF 131072
|
|||||||
vextractf128 [outq + 13*mmsize + 0], tw_e, 1
|
vextractf128 [outq + 13*mmsize + 0], tw_e, 1
|
||||||
vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1
|
vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1
|
||||||
|
|
||||||
movaps tw_e, [cos_64_float + mmsize]
|
movaps tw_e, [tab_64_float + mmsize]
|
||||||
vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7 - mmsize], 0x23
|
vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23
|
||||||
|
|
||||||
movaps m0, [outq + 1*mmsize]
|
movaps m0, [outq + 1*mmsize]
|
||||||
movaps m1, [outq + 3*mmsize]
|
movaps m1, [outq + 3*mmsize]
|
||||||
|
@ -21,86 +21,106 @@
|
|||||||
#include "libavutil/attributes.h"
|
#include "libavutil/attributes.h"
|
||||||
#include "libavutil/x86/cpu.h"
|
#include "libavutil/x86/cpu.h"
|
||||||
|
|
||||||
void ff_fft2_float_sse3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
|
#include "config.h"
|
||||||
void ff_fft4_inv_float_sse2 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
|
|
||||||
void ff_fft4_fwd_float_sse2 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
|
|
||||||
void ff_fft8_float_sse3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
|
|
||||||
void ff_fft8_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
|
|
||||||
void ff_fft16_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
|
|
||||||
void ff_fft16_float_fma3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
|
|
||||||
void ff_fft32_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
|
|
||||||
void ff_fft32_float_fma3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
|
|
||||||
|
|
||||||
void ff_split_radix_fft_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
|
#define DECL_INIT_FN(basis, interleave) \
|
||||||
void ff_split_radix_fft_float_avx2(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
|
static av_cold int \
|
||||||
|
ff_tx_fft_sr_codelet_init_b ##basis## _i ##interleave## _x86 \
|
||||||
av_cold void ff_tx_init_float_x86(AVTXContext *s, av_tx_fn *tx)
|
(AVTXContext *s, \
|
||||||
{
|
const FFTXCodelet *cd, \
|
||||||
int cpu_flags = av_get_cpu_flags();
|
uint64_t flags, \
|
||||||
int gen_revtab = 0, basis, revtab_interleave;
|
FFTXCodeletOptions *opts, \
|
||||||
|
int len, int inv, \
|
||||||
if (s->flags & AV_TX_UNALIGNED)
|
const void *scale) \
|
||||||
return;
|
{ \
|
||||||
|
const int inv_lookup = opts ? opts->invert_lookup : 1; \
|
||||||
if (ff_tx_type_is_mdct(s->type))
|
ff_tx_init_tabs_float(len); \
|
||||||
return;
|
return ff_tx_gen_split_radix_parity_revtab(s, inv_lookup, \
|
||||||
|
basis, interleave); \
|
||||||
#define TXFN(fn, gentab, sr_basis, interleave) \
|
|
||||||
do { \
|
|
||||||
*tx = fn; \
|
|
||||||
gen_revtab = gentab; \
|
|
||||||
basis = sr_basis; \
|
|
||||||
revtab_interleave = interleave; \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
if (s->n == 1) {
|
|
||||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
|
||||||
if (s->m == 4 && s->inv)
|
|
||||||
TXFN(ff_fft4_inv_float_sse2, 0, 0, 0);
|
|
||||||
else if (s->m == 4)
|
|
||||||
TXFN(ff_fft4_fwd_float_sse2, 0, 0, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (EXTERNAL_SSE3(cpu_flags)) {
|
|
||||||
if (s->m == 2)
|
|
||||||
TXFN(ff_fft2_float_sse3, 0, 0, 0);
|
|
||||||
else if (s->m == 8)
|
|
||||||
TXFN(ff_fft8_float_sse3, 1, 8, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (EXTERNAL_AVX_FAST(cpu_flags)) {
|
|
||||||
if (s->m == 8)
|
|
||||||
TXFN(ff_fft8_float_avx, 1, 8, 0);
|
|
||||||
else if (s->m == 16)
|
|
||||||
TXFN(ff_fft16_float_avx, 1, 8, 2);
|
|
||||||
#if ARCH_X86_64
|
|
||||||
else if (s->m == 32)
|
|
||||||
TXFN(ff_fft32_float_avx, 1, 8, 2);
|
|
||||||
else if (s->m >= 64 && s->m <= 131072 && !(s->flags & AV_TX_INPLACE))
|
|
||||||
TXFN(ff_split_radix_fft_float_avx, 1, 8, 2);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
if (EXTERNAL_FMA3_FAST(cpu_flags)) {
|
|
||||||
if (s->m == 16)
|
|
||||||
TXFN(ff_fft16_float_fma3, 1, 8, 2);
|
|
||||||
#if ARCH_X86_64
|
|
||||||
else if (s->m == 32)
|
|
||||||
TXFN(ff_fft32_float_fma3, 1, 8, 2);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
#if ARCH_X86_64
|
|
||||||
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
|
||||||
if (s->m >= 64 && s->m <= 131072 && !(s->flags & AV_TX_INPLACE))
|
|
||||||
TXFN(ff_split_radix_fft_float_avx2, 1, 8, 2);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
if (gen_revtab)
|
|
||||||
ff_tx_gen_split_radix_parity_revtab(s->revtab, s->m, s->inv, basis,
|
|
||||||
revtab_interleave);
|
|
||||||
|
|
||||||
#undef TXFN
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define ff_tx_fft_sr_codelet_init_b0_i0_x86 NULL
|
||||||
|
DECL_INIT_FN(8, 0)
|
||||||
|
DECL_INIT_FN(8, 2)
|
||||||
|
|
||||||
|
#define DECL_SR_CD_DEF(fn_name, len, init_fn, fn_prio, cpu, fn_flags) \
|
||||||
|
void ff_tx_ ##fn_name(AVTXContext *s, void *out, void *in, ptrdiff_t stride); \
|
||||||
|
static const FFTXCodelet ff_tx_ ##fn_name## _def = { \
|
||||||
|
.name = #fn_name, \
|
||||||
|
.function = ff_tx_ ##fn_name, \
|
||||||
|
.type = TX_TYPE(FFT), \
|
||||||
|
.flags = FF_TX_OUT_OF_PLACE | FF_TX_ALIGNED | fn_flags, \
|
||||||
|
.factors[0] = 2, \
|
||||||
|
.min_len = len, \
|
||||||
|
.max_len = len, \
|
||||||
|
.init = ff_tx_fft_sr_codelet_init_ ##init_fn## _x86, \
|
||||||
|
.cpu_flags = AV_CPU_FLAG_ ##cpu, \
|
||||||
|
.prio = fn_prio, \
|
||||||
|
};
|
||||||
|
|
||||||
|
DECL_SR_CD_DEF(fft2_float_sse3, 2, b0_i0, 128, SSE3, AV_TX_INPLACE)
|
||||||
|
DECL_SR_CD_DEF(fft4_fwd_float_sse2, 4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY)
|
||||||
|
DECL_SR_CD_DEF(fft4_inv_float_sse2, 4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY)
|
||||||
|
DECL_SR_CD_DEF(fft8_float_sse3, 8, b8_i0, 128, SSE3, AV_TX_INPLACE)
|
||||||
|
DECL_SR_CD_DEF(fft8_float_avx, 8, b8_i0, 256, AVX, AV_TX_INPLACE)
|
||||||
|
DECL_SR_CD_DEF(fft16_float_avx, 16, b8_i2, 256, AVX, AV_TX_INPLACE)
|
||||||
|
DECL_SR_CD_DEF(fft16_float_fma3, 16, b8_i2, 288, FMA3, AV_TX_INPLACE)
|
||||||
|
|
||||||
|
#if ARCH_X86_64
|
||||||
|
DECL_SR_CD_DEF(fft32_float_avx, 32, b8_i2, 256, AVX, AV_TX_INPLACE)
|
||||||
|
DECL_SR_CD_DEF(fft32_float_fma3, 32, b8_i2, 288, FMA3, AV_TX_INPLACE)
|
||||||
|
|
||||||
|
void ff_tx_fft_sr_float_avx(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
|
||||||
|
const FFTXCodelet ff_tx_fft_sr_float_avx_def = {
|
||||||
|
.name = "fft_sr_float_avx",
|
||||||
|
.function = ff_tx_fft_sr_float_avx,
|
||||||
|
.type = TX_TYPE(FFT),
|
||||||
|
.flags = FF_TX_ALIGNED | FF_TX_OUT_OF_PLACE,
|
||||||
|
.factors[0] = 2,
|
||||||
|
.min_len = 64,
|
||||||
|
.max_len = 131072,
|
||||||
|
.init = ff_tx_fft_sr_codelet_init_b8_i2_x86,
|
||||||
|
.cpu_flags = AV_CPU_FLAG_AVX,
|
||||||
|
.prio = 256,
|
||||||
|
};
|
||||||
|
|
||||||
|
#if HAVE_AVX2_EXTERNAL
|
||||||
|
void ff_tx_fft_sr_float_avx2(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
|
||||||
|
const FFTXCodelet ff_tx_fft_sr_float_avx2_def = {
|
||||||
|
.name = "fft_sr_float_avx2",
|
||||||
|
.function = ff_tx_fft_sr_float_avx2,
|
||||||
|
.type = TX_TYPE(FFT),
|
||||||
|
.flags = FF_TX_ALIGNED | FF_TX_OUT_OF_PLACE,
|
||||||
|
.factors[0] = 2,
|
||||||
|
.min_len = 64,
|
||||||
|
.max_len = 131072,
|
||||||
|
.init = ff_tx_fft_sr_codelet_init_b8_i2_x86,
|
||||||
|
.cpu_flags = AV_CPU_FLAG_AVX2,
|
||||||
|
.prio = 288,
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
|
||||||
|
/* Split-Radix codelets */
|
||||||
|
&ff_tx_fft2_float_sse3_def,
|
||||||
|
&ff_tx_fft4_fwd_float_sse2_def,
|
||||||
|
&ff_tx_fft4_inv_float_sse2_def,
|
||||||
|
&ff_tx_fft8_float_sse3_def,
|
||||||
|
&ff_tx_fft8_float_avx_def,
|
||||||
|
&ff_tx_fft16_float_avx_def,
|
||||||
|
&ff_tx_fft16_float_fma3_def,
|
||||||
|
|
||||||
|
#if ARCH_X86_64
|
||||||
|
&ff_tx_fft32_float_avx_def,
|
||||||
|
&ff_tx_fft32_float_fma3_def,
|
||||||
|
|
||||||
|
/* Standalone transforms */
|
||||||
|
&ff_tx_fft_sr_float_avx_def,
|
||||||
|
#if HAVE_AVX2_EXTERNAL
|
||||||
|
&ff_tx_fft_sr_float_avx2_def,
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NULL,
|
||||||
|
};
|
||||||
|
Loading…
x
Reference in New Issue
Block a user