diff --git a/libavutil/tx_priv.h b/libavutil/tx_priv.h index f11061d051..8a79cf0dd3 100644 --- a/libavutil/tx_priv.h +++ b/libavutil/tx_priv.h @@ -232,7 +232,8 @@ struct AVTXContext { int len; /* Length of the transform */ int inv; /* If transform is inverse */ int *map; /* Lookup table(s) */ - TXComplex *exp; /* Any non-pre-baked multiplication factors needed */ + TXComplex *exp; /* Any non-pre-baked multiplication factors, + * or extra temporary buffer */ TXComplex *tmp; /* Temporary buffer, if needed */ AVTXContext *sub; /* Subtransform context(s), if needed */ diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c index 38ab517f66..59295e06ca 100644 --- a/libavutil/tx_template.c +++ b/libavutil/tx_template.c @@ -949,74 +949,182 @@ static av_cold int TX_NAME(ff_tx_fft_pfa_init)(AVTXContext *s, int len, int inv, const void *scale) { - int ret; - int sub_len = len / cd->factors[0]; - FFTXCodeletOptions sub_opts = { .invert_lookup = 0 }; + int ret, *tmp, ps = flags & FF_TX_PRESHUFFLE; + FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER }; + size_t extra_tmp_len = 0; + int len_list[TX_MAX_DECOMPOSITIONS]; - flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ - flags |= AV_TX_INPLACE; /* in-place */ - flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ + if ((ret = ff_tx_decompose_length(len_list, TX_TYPE(FFT), len, inv)) < 0) + return ret; - if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, - sub_len, inv, scale))) + /* Two iterations to test both orderings. */ + for (int i = 0; i < ret; i++) { + int len1 = len_list[i]; + int len2 = len / len1; + + /* Our ptwo transforms don't support striding the output. 
*/ + if (len2 & (len2 - 1)) + FFSWAP(int, len1, len2); + + ff_tx_clear_ctx(s); + + /* First transform */ + sub_opts.map_dir = FF_TX_MAP_GATHER; + flags &= ~AV_TX_INPLACE; + flags |= FF_TX_OUT_OF_PLACE; + flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ + ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, + len1, inv, scale); + + if (ret == AVERROR(ENOMEM)) { + return ret; + } else if (ret < 0) { /* Try again without a preshuffle flag */ + flags &= ~FF_TX_PRESHUFFLE; + ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, + len1, inv, scale); + if (ret == AVERROR(ENOMEM)) + return ret; + else if (ret < 0) + continue; + } + + /* Second transform. */ + sub_opts.map_dir = FF_TX_MAP_SCATTER; + flags |= FF_TX_PRESHUFFLE; +retry: + flags &= ~FF_TX_OUT_OF_PLACE; + flags |= AV_TX_INPLACE; + ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, + len2, inv, scale); + + if (ret == AVERROR(ENOMEM)) { + return ret; + } else if (ret < 0) { /* Try again with an out-of-place transform */ + flags |= FF_TX_OUT_OF_PLACE; + flags &= ~AV_TX_INPLACE; + ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, + len2, inv, scale); + if (ret == AVERROR(ENOMEM)) { + return ret; + } else if (ret < 0) { + if (flags & FF_TX_PRESHUFFLE) { /* Retry again without a preshuf flag */ + flags &= ~FF_TX_PRESHUFFLE; + goto retry; + } else { + continue; + } + } + } + + /* Success */ + break; + } + + /* If nothing was successful, error out */ + if (ret < 0) return ret; /* Generate PFA map */ if ((ret = ff_tx_gen_compound_mapping(s, opts, 0, - cd->factors[0], sub_len))) + s->sub[0].len, s->sub[1].len))) return ret; if (!(s->tmp = av_malloc(len*sizeof(*s->tmp)))) return AVERROR(ENOMEM); - TX_TAB(ff_tx_init_tabs)(len / sub_len); + /* Flatten input map */ + tmp = (int *)s->tmp; + for (int k = 0; k < len; k += s->sub[0].len) { + memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp)); + for (int i = 0; i < s->sub[0].len; i++) + s->map[k + i] = tmp[s->sub[0].map[i]]; + } + +
/* Only allocate extra temporary memory if we need it */ + if (!(s->sub[1].flags & AV_TX_INPLACE)) + extra_tmp_len = len; + else if (!ps) + extra_tmp_len = s->sub[0].len; + + if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp)))) + return AVERROR(ENOMEM); return 0; } -#define DECL_COMP_FFT(N) \ -static void TX_NAME(ff_tx_fft_pfa_##N##xM)(AVTXContext *s, void *_out, \ - void *_in, ptrdiff_t stride) \ -{ \ - const int m = s->sub->len; \ - const int *in_map = s->map, *out_map = in_map + s->len; \ - const int *sub_map = s->sub->map; \ - TXComplex *in = _in; \ - TXComplex *out = _out; \ - TXComplex fft##N##in[N]; \ - \ - for (int i = 0; i < m; i++) { \ - for (int j = 0; j < N; j++) \ - fft##N##in[j] = in[in_map[i*N + j]]; \ - fft##N(s->tmp + sub_map[i], fft##N##in, m); \ - } \ - \ - for (int i = 0; i < N; i++) \ - s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \ - \ - for (int i = 0; i < N*m; i++) \ - out[i] = s->tmp[out_map[i]]; \ -} \ - \ -static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_##N##xM_def) = { \ - .name = TX_NAME_STR("fft_pfa_" #N "xM"), \ - .function = TX_NAME(ff_tx_fft_pfa_##N##xM), \ - .type = TX_TYPE(FFT), \ - .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE, \ - .factors = { N, TX_FACTOR_ANY }, \ - .nb_factors = 2, \ - .min_len = N*2, \ - .max_len = TX_LEN_UNLIMITED, \ - .init = TX_NAME(ff_tx_fft_pfa_init), \ - .cpu_flags = FF_TX_CPU_FLAGS_ALL, \ - .prio = FF_TX_PRIO_BASE, \ +static void TX_NAME(ff_tx_fft_pfa)(AVTXContext *s, void *_out, + void *_in, ptrdiff_t stride) +{ + const int n = s->sub[0].len, m = s->sub[1].len, l = s->len; + const int *in_map = s->map, *out_map = in_map + l; + const int *sub_map = s->sub[1].map; + TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? 
s->tmp : s->exp; + TXComplex *in = _in, *out = _out; + + stride /= sizeof(*out); + + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) + s->exp[j] = in[in_map[i*n + j]]; + s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], s->exp, m*sizeof(TXComplex)); + } + + for (int i = 0; i < n; i++) + s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex)); + + for (int i = 0; i < l; i++) + out[i*stride] = tmp1[out_map[i]]; +} + +static void TX_NAME(ff_tx_fft_pfa_ns)(AVTXContext *s, void *_out, + void *_in, ptrdiff_t stride) +{ + const int n = s->sub[0].len, m = s->sub[1].len, l = s->len; + const int *in_map = s->map, *out_map = in_map + l; + const int *sub_map = s->sub[1].map; + TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp; + TXComplex *in = _in, *out = _out; + + stride /= sizeof(*out); + + for (int i = 0; i < m; i++) + s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], &in[i*n], m*sizeof(TXComplex)); + + for (int i = 0; i < n; i++) + s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex)); + + for (int i = 0; i < l; i++) + out[i*stride] = tmp1[out_map[i]]; +} + +static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_def) = { + .name = TX_NAME_STR("fft_pfa"), + .function = TX_NAME(ff_tx_fft_pfa), + .type = TX_TYPE(FFT), + .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE, + .factors = { 7, 5, 3, 2, TX_FACTOR_ANY }, + .nb_factors = 2, + .min_len = 2*3, + .max_len = TX_LEN_UNLIMITED, + .init = TX_NAME(ff_tx_fft_pfa_init), + .cpu_flags = FF_TX_CPU_FLAGS_ALL, + .prio = FF_TX_PRIO_BASE, }; -DECL_COMP_FFT(3) -DECL_COMP_FFT(5) -DECL_COMP_FFT(7) -DECL_COMP_FFT(9) -DECL_COMP_FFT(15) +static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_ns_def) = { + .name = TX_NAME_STR("fft_pfa_ns"), + .function = TX_NAME(ff_tx_fft_pfa_ns), + .type = TX_TYPE(FFT), + .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | + FF_TX_PRESHUFFLE, + .factors = { 7, 5, 3, 2, TX_FACTOR_ANY }, + .nb_factors = 2, + .min_len = 2*3, + .max_len = TX_LEN_UNLIMITED, + .init = 
TX_NAME(ff_tx_fft_pfa_init), + .cpu_flags = FF_TX_CPU_FLAGS_ALL, + .prio = FF_TX_PRIO_BASE, +}; static av_cold int TX_NAME(ff_tx_mdct_naive_init)(AVTXContext *s, const FFTXCodelet *cd, @@ -1683,11 +1791,8 @@ const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = { &TX_NAME(ff_tx_fft_def), &TX_NAME(ff_tx_fft_inplace_def), &TX_NAME(ff_tx_fft_inplace_small_def), - &TX_NAME(ff_tx_fft_pfa_3xM_def), - &TX_NAME(ff_tx_fft_pfa_5xM_def), - &TX_NAME(ff_tx_fft_pfa_7xM_def), - &TX_NAME(ff_tx_fft_pfa_9xM_def), - &TX_NAME(ff_tx_fft_pfa_15xM_def), + &TX_NAME(ff_tx_fft_pfa_def), + &TX_NAME(ff_tx_fft_pfa_ns_def), &TX_NAME(ff_tx_fft_naive_def), &TX_NAME(ff_tx_fft_naive_small_def), &TX_NAME(ff_tx_mdct_fwd_def),