mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-21 10:55:51 +02:00
lavu/tx: generalize PFA FFTs
This commit permits any stacking of FFTs of any size.
This commit is contained in:
parent
7f019e7758
commit
8547123f3b
@ -232,7 +232,8 @@ struct AVTXContext {
|
||||
int len; /* Length of the transform */
|
||||
int inv; /* If transform is inverse */
|
||||
int *map; /* Lookup table(s) */
|
||||
TXComplex *exp; /* Any non-pre-baked multiplication factors needed */
|
||||
TXComplex *exp; /* Any non-pre-baked multiplication factors,
|
||||
* or extra temporary buffer */
|
||||
TXComplex *tmp; /* Temporary buffer, if needed */
|
||||
|
||||
AVTXContext *sub; /* Subtransform context(s), if needed */
|
||||
|
@ -949,74 +949,182 @@ static av_cold int TX_NAME(ff_tx_fft_pfa_init)(AVTXContext *s,
|
||||
int len, int inv,
|
||||
const void *scale)
|
||||
{
|
||||
int ret;
|
||||
int sub_len = len / cd->factors[0];
|
||||
FFTXCodeletOptions sub_opts = { .invert_lookup = 0 };
|
||||
int ret, *tmp, ps = flags & FF_TX_PRESHUFFLE;
|
||||
FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
|
||||
size_t extra_tmp_len = 0;
|
||||
int len_list[TX_MAX_DECOMPOSITIONS];
|
||||
|
||||
flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
|
||||
flags |= AV_TX_INPLACE; /* in-place */
|
||||
if ((ret = ff_tx_decompose_length(len_list, TX_TYPE(FFT), len, inv)) < 0)
|
||||
return ret;
|
||||
|
||||
/* Two iterations to test both orderings. */
|
||||
for (int i = 0; i < ret; i++) {
|
||||
int len1 = len_list[i];
|
||||
int len2 = len / len1;
|
||||
|
||||
/* Our ptwo transforms don't support striding the output. */
|
||||
if (len2 & (len2 - 1))
|
||||
FFSWAP(int, len1, len2);
|
||||
|
||||
ff_tx_clear_ctx(s);
|
||||
|
||||
/* First transform */
|
||||
sub_opts.map_dir = FF_TX_MAP_GATHER;
|
||||
flags &= ~AV_TX_INPLACE;
|
||||
flags |= FF_TX_OUT_OF_PLACE;
|
||||
flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
|
||||
ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
|
||||
len1, inv, scale);
|
||||
|
||||
if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
|
||||
sub_len, inv, scale)))
|
||||
if (ret == AVERROR(ENOMEM)) {
|
||||
return ret;
|
||||
} else if (ret < 0) { /* Try again without a preshuffle flag */
|
||||
flags &= ~FF_TX_PRESHUFFLE;
|
||||
ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
|
||||
len1, inv, scale);
|
||||
if (ret == AVERROR(ENOMEM))
|
||||
return ret;
|
||||
else if (ret < 0)
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Second transform. */
|
||||
sub_opts.map_dir = FF_TX_MAP_SCATTER;
|
||||
flags |= FF_TX_PRESHUFFLE;
|
||||
retry:
|
||||
flags &= ~FF_TX_OUT_OF_PLACE;
|
||||
flags |= AV_TX_INPLACE;
|
||||
ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
|
||||
len2, inv, scale);
|
||||
|
||||
if (ret == AVERROR(ENOMEM)) {
|
||||
return ret;
|
||||
} else if (ret < 0) { /* Try again with an out-of-place transform */
|
||||
flags |= FF_TX_OUT_OF_PLACE;
|
||||
flags &= ~AV_TX_INPLACE;
|
||||
ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
|
||||
len2, inv, scale);
|
||||
if (ret == AVERROR(ENOMEM)) {
|
||||
return ret;
|
||||
} else if (ret < 0) {
|
||||
if (flags & FF_TX_PRESHUFFLE) { /* Retry again without a preshuf flag */
|
||||
flags &= ~FF_TX_PRESHUFFLE;
|
||||
goto retry;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Success */
|
||||
break;
|
||||
}
|
||||
|
||||
/* If nothing was sucessful, error out */
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
/* Generate PFA map */
|
||||
if ((ret = ff_tx_gen_compound_mapping(s, opts, 0,
|
||||
cd->factors[0], sub_len)))
|
||||
s->sub[0].len, s->sub[1].len)))
|
||||
return ret;
|
||||
|
||||
if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
|
||||
return AVERROR(ENOMEM);
|
||||
|
||||
TX_TAB(ff_tx_init_tabs)(len / sub_len);
|
||||
/* Flatten input map */
|
||||
tmp = (int *)s->tmp;
|
||||
for (int k = 0; k < len; k += s->sub[0].len) {
|
||||
memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp));
|
||||
for (int i = 0; i < s->sub[0].len; i++)
|
||||
s->map[k + i] = tmp[s->sub[0].map[i]];
|
||||
}
|
||||
|
||||
/* Only allocate extra temporary memory if we need it */
|
||||
if (!(s->sub[1].flags & AV_TX_INPLACE))
|
||||
extra_tmp_len = len;
|
||||
else if (!ps)
|
||||
extra_tmp_len = s->sub[0].len;
|
||||
|
||||
if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp))))
|
||||
return AVERROR(ENOMEM);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define DECL_COMP_FFT(N) \
|
||||
static void TX_NAME(ff_tx_fft_pfa_##N##xM)(AVTXContext *s, void *_out, \
|
||||
void *_in, ptrdiff_t stride) \
|
||||
{ \
|
||||
const int m = s->sub->len; \
|
||||
const int *in_map = s->map, *out_map = in_map + s->len; \
|
||||
const int *sub_map = s->sub->map; \
|
||||
TXComplex *in = _in; \
|
||||
TXComplex *out = _out; \
|
||||
TXComplex fft##N##in[N]; \
|
||||
\
|
||||
for (int i = 0; i < m; i++) { \
|
||||
for (int j = 0; j < N; j++) \
|
||||
fft##N##in[j] = in[in_map[i*N + j]]; \
|
||||
fft##N(s->tmp + sub_map[i], fft##N##in, m); \
|
||||
} \
|
||||
\
|
||||
for (int i = 0; i < N; i++) \
|
||||
s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \
|
||||
\
|
||||
for (int i = 0; i < N*m; i++) \
|
||||
out[i] = s->tmp[out_map[i]]; \
|
||||
} \
|
||||
\
|
||||
static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_##N##xM_def) = { \
|
||||
.name = TX_NAME_STR("fft_pfa_" #N "xM"), \
|
||||
.function = TX_NAME(ff_tx_fft_pfa_##N##xM), \
|
||||
.type = TX_TYPE(FFT), \
|
||||
.flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE, \
|
||||
.factors = { N, TX_FACTOR_ANY }, \
|
||||
.nb_factors = 2, \
|
||||
.min_len = N*2, \
|
||||
.max_len = TX_LEN_UNLIMITED, \
|
||||
.init = TX_NAME(ff_tx_fft_pfa_init), \
|
||||
.cpu_flags = FF_TX_CPU_FLAGS_ALL, \
|
||||
.prio = FF_TX_PRIO_BASE, \
|
||||
static void TX_NAME(ff_tx_fft_pfa)(AVTXContext *s, void *_out,
|
||||
void *_in, ptrdiff_t stride)
|
||||
{
|
||||
const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
|
||||
const int *in_map = s->map, *out_map = in_map + l;
|
||||
const int *sub_map = s->sub[1].map;
|
||||
TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
|
||||
TXComplex *in = _in, *out = _out;
|
||||
|
||||
stride /= sizeof(*out);
|
||||
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < n; j++)
|
||||
s->exp[j] = in[in_map[i*n + j]];
|
||||
s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], s->exp, m*sizeof(TXComplex));
|
||||
}
|
||||
|
||||
for (int i = 0; i < n; i++)
|
||||
s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
|
||||
|
||||
for (int i = 0; i < l; i++)
|
||||
out[i*stride] = tmp1[out_map[i]];
|
||||
}
|
||||
|
||||
static void TX_NAME(ff_tx_fft_pfa_ns)(AVTXContext *s, void *_out,
|
||||
void *_in, ptrdiff_t stride)
|
||||
{
|
||||
const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
|
||||
const int *in_map = s->map, *out_map = in_map + l;
|
||||
const int *sub_map = s->sub[1].map;
|
||||
TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
|
||||
TXComplex *in = _in, *out = _out;
|
||||
|
||||
stride /= sizeof(*out);
|
||||
|
||||
for (int i = 0; i < m; i++)
|
||||
s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], &in[i*n], m*sizeof(TXComplex));
|
||||
|
||||
for (int i = 0; i < n; i++)
|
||||
s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
|
||||
|
||||
for (int i = 0; i < l; i++)
|
||||
out[i*stride] = tmp1[out_map[i]];
|
||||
}
|
||||
|
||||
static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_def) = {
|
||||
.name = TX_NAME_STR("fft_pfa"),
|
||||
.function = TX_NAME(ff_tx_fft_pfa),
|
||||
.type = TX_TYPE(FFT),
|
||||
.flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE,
|
||||
.factors = { 7, 5, 3, 2, TX_FACTOR_ANY },
|
||||
.nb_factors = 2,
|
||||
.min_len = 2*3,
|
||||
.max_len = TX_LEN_UNLIMITED,
|
||||
.init = TX_NAME(ff_tx_fft_pfa_init),
|
||||
.cpu_flags = FF_TX_CPU_FLAGS_ALL,
|
||||
.prio = FF_TX_PRIO_BASE,
|
||||
};
|
||||
|
||||
DECL_COMP_FFT(3)
|
||||
DECL_COMP_FFT(5)
|
||||
DECL_COMP_FFT(7)
|
||||
DECL_COMP_FFT(9)
|
||||
DECL_COMP_FFT(15)
|
||||
static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_ns_def) = {
|
||||
.name = TX_NAME_STR("fft_pfa_ns"),
|
||||
.function = TX_NAME(ff_tx_fft_pfa_ns),
|
||||
.type = TX_TYPE(FFT),
|
||||
.flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |
|
||||
FF_TX_PRESHUFFLE,
|
||||
.factors = { 7, 5, 3, 2, TX_FACTOR_ANY },
|
||||
.nb_factors = 2,
|
||||
.min_len = 2*3,
|
||||
.max_len = TX_LEN_UNLIMITED,
|
||||
.init = TX_NAME(ff_tx_fft_pfa_init),
|
||||
.cpu_flags = FF_TX_CPU_FLAGS_ALL,
|
||||
.prio = FF_TX_PRIO_BASE,
|
||||
};
|
||||
|
||||
static av_cold int TX_NAME(ff_tx_mdct_naive_init)(AVTXContext *s,
|
||||
const FFTXCodelet *cd,
|
||||
@ -1683,11 +1791,8 @@ const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = {
|
||||
&TX_NAME(ff_tx_fft_def),
|
||||
&TX_NAME(ff_tx_fft_inplace_def),
|
||||
&TX_NAME(ff_tx_fft_inplace_small_def),
|
||||
&TX_NAME(ff_tx_fft_pfa_3xM_def),
|
||||
&TX_NAME(ff_tx_fft_pfa_5xM_def),
|
||||
&TX_NAME(ff_tx_fft_pfa_7xM_def),
|
||||
&TX_NAME(ff_tx_fft_pfa_9xM_def),
|
||||
&TX_NAME(ff_tx_fft_pfa_15xM_def),
|
||||
&TX_NAME(ff_tx_fft_pfa_def),
|
||||
&TX_NAME(ff_tx_fft_pfa_ns_def),
|
||||
&TX_NAME(ff_tx_fft_naive_def),
|
||||
&TX_NAME(ff_tx_fft_naive_small_def),
|
||||
&TX_NAME(ff_tx_mdct_fwd_def),
|
||||
|
Loading…
Reference in New Issue
Block a user