lavu/tx: rewrite internal code as a tree-based codelet constructor

This commit rewrites the internal transform code into a constructor that stitches transforms (codelets) together. This allows transforms to reuse arbitrary parts of other transforms, and allows transforms to be stacked onto one another (such as a full iMDCT using a half-iMDCT, which in turn uses an FFT). It also permits each step to be individually replaced by assembly or a custom implementation (such as an ASIC).
2025-08-15 14:13:16 +02:00 · 2022-01-20 07:14:46 +01:00
parent c14976be04
commit ef4bd81615
6 changed files with 1698 additions and 881 deletions
--- a/libavutil/tx.c
+++ b/libavutil/tx.c
@@ -16,19 +16,16 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include "cpu.h"
 #include "qsort.h"
 #include "bprint.h"
 #include "tx_priv.h"
-int ff_tx_type_is_mdct(enum AVTXType type)
+#define TYPE_IS(type, x)               \
-{
+    (((x) == AV_TX_FLOAT_ ## type)  || \
-    switch (type) {
+     ((x) == AV_TX_DOUBLE_ ## type) || \
-    case AV_TX_FLOAT_MDCT:
+     ((x) == AV_TX_INT32_ ## type))
    case AV_TX_DOUBLE_MDCT:
    case AV_TX_INT32_MDCT:
        return 1;
    default:
        return 0;
    }
 }
 /* Calculates the modular multiplicative inverse */
 static av_always_inline int mulinv(int n, int m)
@@ -42,22 +39,26 @@ static av_always_inline int mulinv(int n, int m)
 }
 /* Guaranteed to work for any n, m where gcd(n, m) == 1 */
-int ff_tx_gen_compound_mapping(AVTXContext *s)
+int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m)
 {
    int *in_map, *out_map;
-    const int n     = s->n;
+    const int inv = s->inv;
-    const int m     = s->m;
+    const int len = n*m;    /* Will not be equal to s->len for MDCTs */
-    const int inv   = s->inv;
+    const int mdct = TYPE_IS(MDCT, s->type);
-    const int len   = n*m;
+    int m_inv, n_inv;
    const int m_inv = mulinv(m, n);
    const int n_inv = mulinv(n, m);
    const int mdct  = ff_tx_type_is_mdct(s->type);
-    if (!(s->pfatab = av_malloc(2*len*sizeof(*s->pfatab))))
+    /* Make sure the numbers are coprime */
    if (av_gcd(n, m) != 1)
        return AVERROR(EINVAL);
    m_inv = mulinv(m, n);
    n_inv = mulinv(n, m);
    if (!(s->map = av_malloc(2*len*sizeof(*s->map))))
        return AVERROR(ENOMEM);
-    in_map  = s->pfatab;
+    in_map  = s->map;
-    out_map = s->pfatab + n*m;
+    out_map = s->map + len;
    /* Ruritanian map for input, CRT map for output, can be swapped */
    for (int j = 0; j < m; j++) {
@@ -92,48 +93,50 @@ int ff_tx_gen_compound_mapping(AVTXContext *s)
    return 0;
 }
-static inline int split_radix_permutation(int i, int m, int inverse)
+static inline int split_radix_permutation(int i, int len, int inv)
 {
-    m >>= 1;
+    len >>= 1;
-    if (m <= 1)
+    if (len <= 1)
        return i & 1;
-    if (!(i & m))
+    if (!(i & len))
-        return split_radix_permutation(i, m, inverse) * 2;
+        return split_radix_permutation(i, len, inv) * 2;
-    m >>= 1;
+    len >>= 1;
-    return split_radix_permutation(i, m, inverse) * 4 + 1 - 2*(!(i & m) ^ inverse);
+    return split_radix_permutation(i, len, inv) * 4 + 1 - 2*(!(i & len) ^ inv);
 }
 int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup)
 {
-    const int m = s->m, inv = s->inv;
+    int len = s->len;
-    if (!(s->revtab = av_malloc(s->m*sizeof(*s->revtab))))
+    if (!(s->map = av_malloc(len*sizeof(*s->map))))
        return AVERROR(ENOMEM);
    if (!(s->revtab_c = av_malloc(m*sizeof(*s->revtab_c))))
        return AVERROR(ENOMEM);
-    /* Default */
+    if (invert_lookup) {
-    for (int i = 0; i < m; i++) {
+        for (int i = 0; i < s->len; i++)
-        int k = -split_radix_permutation(i, m, inv) & (m - 1);
+            s->map[i] = -split_radix_permutation(i, len, s->inv) & (len - 1);
-        if (invert_lookup)
+    } else {
-            s->revtab[i] = s->revtab_c[i] = k;
+        for (int i = 0; i < s->len; i++)
-        else
+            s->map[-split_radix_permutation(i, len, s->inv) & (len - 1)] = i;
            s->revtab[i] = s->revtab_c[k] = i;
    }
    return 0;
 }
-int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s, int *revtab)
+int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s)
 {
-    int nb_inplace_idx = 0;
+    int *src_map, out_map_idx = 0, len = s->len;
-    if (!(s->inplace_idx = av_malloc(s->m*sizeof(*s->inplace_idx))))
+    if (!s->sub || !s->sub->map)
        return AVERROR(EINVAL);
    if (!(s->map = av_mallocz(len*sizeof(*s->map))))
        return AVERROR(ENOMEM);
    src_map = s->sub->map;
    /* The first coefficient is always already in-place */
-    for (int src = 1; src < s->m; src++) {
+    for (int src = 1; src < s->len; src++) {
-        int dst = revtab[src];
+        int dst = src_map[src];
        int found = 0;
        if (dst <= src)
@@ -143,48 +146,53 @@ int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s, int *revtab)
         * and if so, skips it, since to fully permute a loop we must only
         * enter it once. */
        do {
-            for (int j = 0; j < nb_inplace_idx; j++) {
+            for (int j = 0; j < out_map_idx; j++) {
-                if (dst == s->inplace_idx[j]) {
+                if (dst == s->map[j]) {
                    found = 1;
                    break;
                }
            }
-            dst = revtab[dst];
+            dst = src_map[dst];
        } while (dst != src && !found);
        if (!found)
-            s->inplace_idx[nb_inplace_idx++] = src;
+            s->map[out_map_idx++] = src;
    }
-    s->inplace_idx[nb_inplace_idx++] = 0;
+    s->map[out_map_idx++] = 0;
    return 0;
 }
 static void parity_revtab_generator(int *revtab, int n, int inv, int offset,
                                    int is_dual, int dual_high, int len,
-                                    int basis, int dual_stride)
+                                    int basis, int dual_stride, int inv_lookup)
 {
    len >>= 1;
    if (len <= basis) {
-        int k1, k2, *even, *odd, stride;
+        int k1, k2, stride, even_idx, odd_idx;
        is_dual = is_dual && dual_stride;
        dual_high = is_dual & dual_high;
        stride = is_dual ? FFMIN(dual_stride, len) : 0;
-        even = &revtab[offset + dual_high*(stride - 2*len)];
+        even_idx = offset + dual_high*(stride - 2*len);
-        odd  = &even[len + (is_dual && !dual_high)*len + dual_high*len];
+        odd_idx  = even_idx + len + (is_dual && !dual_high)*len + dual_high*len;
        for (int i = 0; i < len; i++) {
            k1 = -split_radix_permutation(offset + i*2 + 0, n, inv) & (n - 1);
            k2 = -split_radix_permutation(offset + i*2 + 1, n, inv) & (n - 1);
-            *even++ = k1;
+            if (inv_lookup) {
-            *odd++  = k2;
+                revtab[even_idx++] = k1;
                revtab[odd_idx++]  = k2;
            } else {
                revtab[k1] = even_idx++;
                revtab[k2] = odd_idx++;
            }
            if (stride && !((i + 1) % stride)) {
-                even += stride;
+                even_idx += stride;
-                odd  += stride;
+                odd_idx  += stride;
            }
        }
@@ -192,22 +200,52 @@ static void parity_revtab_generator(int *revtab, int n, int inv, int offset,
    }
    parity_revtab_generator(revtab, n, inv, offset,
-                            0, 0, len >> 0, basis, dual_stride);
+                            0, 0, len >> 0, basis, dual_stride, inv_lookup);
    parity_revtab_generator(revtab, n, inv, offset + (len >> 0),
-                            1, 0, len >> 1, basis, dual_stride);
+                            1, 0, len >> 1, basis, dual_stride, inv_lookup);
    parity_revtab_generator(revtab, n, inv, offset + (len >> 0) + (len >> 1),
-                            1, 1, len >> 1, basis, dual_stride);
+                            1, 1, len >> 1, basis, dual_stride, inv_lookup);
 }
-void ff_tx_gen_split_radix_parity_revtab(int *revtab, int len, int inv,
+int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int invert_lookup,
-                                         int basis, int dual_stride)
+                                        int basis, int dual_stride)
 {
    int len = s->len;
    int inv = s->inv;
    if (!(s->map = av_mallocz(len*sizeof(*s->map))))
        return AVERROR(ENOMEM);
    basis >>= 1;
    if (len < basis)
-        return;
+        return AVERROR(EINVAL);
    av_assert0(!dual_stride || !(dual_stride & (dual_stride - 1)));
    av_assert0(dual_stride <= basis);
-    parity_revtab_generator(revtab, len, inv, 0, 0, 0, len, basis, dual_stride);
+    parity_revtab_generator(s->map, len, inv, 0, 0, 0, len,
                            basis, dual_stride, invert_lookup);
    return 0;
 }
 /**
  * Recursively tear down a transform context and everything attached to it.
  *
  * Each of the s->nb_sub sub-contexts is reset first, then the codelet's own
  * uninit() callback (if it has one) is run, the sub-context array and the
  * map/exp/tmp buffers are freed, and finally the structure is zeroed.
  * Passing NULL is a no-op.
  */
 static void reset_ctx(AVTXContext *s)
 {
    if (!s)
        return;
    if (s->sub)
        for (int i = 0; i < s->nb_sub; i++)
            reset_ctx(&s->sub[i]);
    /* cd_self is NULL on a context that never had a codelet assigned
     * (e.g. a zero-initialized one), so check it before dereferencing */
    if (s->cd_self && s->cd_self->uninit)
        s->cd_self->uninit(s);
    av_freep(&s->sub);
    av_freep(&s->map);
    av_freep(&s->exp);
    av_freep(&s->tmp);
    /* Leave the context in a clean, reusable state */
    memset(s, 0, sizeof(*s));
 }
 av_cold void av_tx_uninit(AVTXContext **ctx)
@@ -215,53 +253,401 @@ av_cold void av_tx_uninit(AVTXContext **ctx)
    if (!(*ctx))
        return;
-    av_free((*ctx)->pfatab);
+    reset_ctx(*ctx);
    av_free((*ctx)->exptab);
    av_free((*ctx)->revtab);
    av_free((*ctx)->revtab_c);
    av_free((*ctx)->inplace_idx);
    av_free((*ctx)->tmp);
    av_freep(ctx);
 }
 /* Init callback of the null codelet. Nothing is allocated or configured here;
  * MDCT types are refused with EINVAL because the null transform can only map
  * one sample of a given type onto one sample of the same type. */
 static av_cold int ff_tx_null_init(AVTXContext *s, const FFTXCodelet *cd,
                                   uint64_t flags, FFTXCodeletOptions *opts,
                                   int len, int inv, const void *scale)
 {
    const int is_mdct = TYPE_IS(MDCT, s->type);
    return is_mdct ? AVERROR(EINVAL) : 0;
 }
 /* Null transform when the length is 1.
  * Copies exactly `stride` bytes from _in to _out; the context s is not used.
  * memcpy() is used, so _in and _out must not overlap. */
 static void ff_tx_null(AVTXContext *s, void *_out, void *_in, ptrdiff_t stride)
 {
    memcpy(_out, _in, stride);
 }
 /* Codelet descriptor for the null transform: any type, any factor, but only
  * a length of exactly 1 (min_len == max_len == 1). It advertises both
  * alignment flags and both placement flags, and uses the maximum priority
  * constant. */
 static const FFTXCodelet ff_tx_null_def = {
    .name       = "null",
    .function   = ff_tx_null,
    .type       = TX_TYPE_ANY,
    .flags      = AV_TX_UNALIGNED | FF_TX_ALIGNED |
                  FF_TX_OUT_OF_PLACE | AV_TX_INPLACE,
    .factors[0] = TX_FACTOR_ANY,
    .min_len    = 1,
    .max_len    = 1,
    .init       = ff_tx_null_init,
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
    .prio       = FF_TX_PRIO_MAX,
 };
 /* NULL-terminated codelet list containing only the null codelet; it is one
  * of the lists scanned by ff_tx_init_subtx() */
 static const FFTXCodelet * const ff_tx_null_list[] = {
    &ff_tx_null_def,
    NULL,
 };
 /* Appends "flags: [a, b, ...]" to bp, naming every known flag bit set in f.
  * Names are emitted in a fixed order and separated by ", ". */
 static void print_flags(AVBPrint *bp, uint64_t f)
 {
    /* Number of names written so far; non-zero means a separator is needed */
    int nb_printed = 0;
    const char *sep = ", ";
    av_bprintf(bp, "flags: [");
    if (f & FF_TX_ALIGNED) {
        av_bprintf(bp, "aligned");
        nb_printed++;
    }
    if (f & AV_TX_UNALIGNED)
        av_bprintf(bp, "%sunaligned", nb_printed++ ? sep : "");
    if (f & AV_TX_INPLACE)
        av_bprintf(bp, "%sinplace", nb_printed++ ? sep : "");
    if (f & FF_TX_OUT_OF_PLACE)
        av_bprintf(bp, "%sout_of_place", nb_printed++ ? sep : "");
    if (f & FF_TX_FORWARD_ONLY)
        av_bprintf(bp, "%sfwd_only", nb_printed++ ? sep : "");
    if (f & FF_TX_INVERSE_ONLY)
        av_bprintf(bp, "%sinv_only", nb_printed++ ? sep : "");
    if (f & FF_TX_PRESHUFFLE)
        av_bprintf(bp, "%spreshuf", nb_printed++ ? sep : "");
    if (f & AV_TX_FULL_IMDCT)
        av_bprintf(bp, "%simdct_full", nb_printed++ ? sep : "");
    av_bprintf(bp, "]");
 }
 /* Appends a short textual name for the transform type to bp; values that
  * match none of the known types are printed as "unknown". */
 static void print_type(AVBPrint *bp, enum AVTXType type)
 {
    const char *name = "unknown";
    if (type == TX_TYPE_ANY)
        name = "any";
    else if (type == AV_TX_FLOAT_FFT)
        name = "fft_float";
    else if (type == AV_TX_FLOAT_MDCT)
        name = "mdct_float";
    else if (type == AV_TX_DOUBLE_FFT)
        name = "fft_double";
    else if (type == AV_TX_DOUBLE_MDCT)
        name = "mdct_double";
    else if (type == AV_TX_INT32_FFT)
        name = "fft_int32";
    else if (type == AV_TX_INT32_MDCT)
        name = "mdct_int32";
    av_bprintf(bp, "%s", name);
 }
 /* Logs a one-line description of a codelet at verbose level: name, type,
  * supported length (or [min, max] range), factor list and flags. The given
  * prio is appended only when print_prio is non-zero. */
 static void print_cd_info(const FFTXCodelet *cd, int prio, int print_prio)
 {
    AVBPrint bp = { 0 };
    /* A single number is printed for fixed-length codelets, a range otherwise */
    const int is_range = cd->min_len != cd->max_len;
    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_AUTOMATIC);
    av_bprintf(&bp, "%s - type: ", cd->name);
    print_type(&bp, cd->type);
    av_bprintf(&bp, ", len: ");
    if (is_range)
        av_bprintf(&bp, "[%i, ", cd->min_len);
    if (cd->max_len != TX_LEN_UNLIMITED)
        av_bprintf(&bp, "%i", cd->max_len);
    else
        av_bprintf(&bp, "∞");
    av_bprintf(&bp, "%s, factors: [", is_range ? "]" : "");
    for (int idx = 0; idx < TX_MAX_SUB; idx++) {
        const int factor = cd->factors[idx];
        const int is_any = factor == TX_FACTOR_ANY;
        /* A zero entry terminates the factor list */
        if (!is_any && !factor)
            break;
        if (idx && factor)
            av_bprintf(&bp, ", ");
        if (is_any)
            av_bprintf(&bp, "any");
        else
            av_bprintf(&bp, "%i", factor);
    }
    av_bprintf(&bp, "], ");
    print_flags(&bp, cd->flags);
    if (print_prio)
        av_bprintf(&bp, ", prio: %i", prio);
    av_log(NULL, AV_LOG_VERBOSE, "%s\n", bp.str);
 }
 /* One candidate codelet found by ff_tx_init_subtx() */
 typedef struct TXCodeletMatch {
    const FFTXCodelet *cd; /* The matching codelet */
    int prio;              /* Sort key: starts as cd->prio, then adjusted by
                            * the heuristics in ff_tx_init_subtx() */
 } TXCodeletMatch;
 /* Sort comparator that orders matches by descending priority: returns a
  * positive value when b outranks a, negative when a outranks b, 0 on a tie. */
 static int cmp_matches(TXCodeletMatch *a, TXCodeletMatch *b)
 {
    const int prio_a = a->prio;
    const int prio_b = b->prio;
    return (prio_b > prio_a) - (prio_b < prio_a);
 }
 /* Checks whether the codelet's factor list completely covers len.
  * Every listed factor is divided out of len as many times as it goes; a
  * listed factor that does not divide len at all rejects the codelet.
  * Returns 1 if len is reduced to 1 or the list contains TX_FACTOR_ANY,
  * 0 otherwise. */
 static inline int check_cd_factors(const FFTXCodelet *cd, int len)
 {
    int any_seen = 0;
    for (int idx = 0; idx < TX_MAX_SUB; idx++) {
        const int f = cd->factors[idx];
        /* Nothing left to cover */
        if (len == 1)
            return 1;
        /* End of the factor list */
        if (!f)
            break;
        if (f == TX_FACTOR_ANY) {
            any_seen = 1;
            continue;
        }
        if (f == 2) {
            /* Fast path: strip all powers of two in one go */
            const int shift = ff_ctz(len);
            if (!shift)
                return 0; /* Factor not supported */
            len >>= shift;
            continue;
        }
        if (len % f)
            return 0; /* Factor not supported */
        do {
            len /= f;
        } while (!(len % f));
    }
    return any_seen || (len == 1);
 }
 /**
  * Find, rank and initialize a codelet for the requested transform and attach
  * it to s as its next sub-transform.
  *
  * All compiled-in codelet lists are scanned for codelets whose type,
  * direction, flags, length range, CPU flags and factors fit the request.
  * The matches are sorted by a heuristically adjusted priority and their
  * init() callbacks are tried in that order until one succeeds. On success
  * the codelet and its function are stored in s->cd[] / s->fn[] and
  * s->nb_sub is incremented.
  *
  * @param s     parent context; its sub-context array is allocated on demand
  * @param type  transform type to look for
  * @param flags requested flags
  * @param opts  options forwarded to the codelet's init()
  * @param len   transform length
  * @param inv   non-zero for an inverse transform
  * @param scale scale pointer forwarded to the codelet's init()
  * @return a non-negative value on success; a negative AVERROR code otherwise:
  *         ENOSYS when nothing matched, ENOMEM on allocation failure, or the
  *         error returned by the last init() attempted
  */
 av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
                             uint64_t flags, FFTXCodeletOptions *opts,
                             int len, int inv, const void *scale)
 {
    int ret = 0;
    /* Non-NULL only if this call allocated s->sub */
    AVTXContext *sub = NULL;
    TXCodeletMatch *cd_tmp, *cd_matches = NULL;
    unsigned int cd_matches_size = 0;
    int nb_cd_matches = 0;
    AVBPrint bp = { 0 };
    /* Array of all compiled codelet lists. Order is irrelevant. */
    const FFTXCodelet * const * const codelet_list[] = {
        ff_tx_codelet_list_float_c,
        ff_tx_codelet_list_double_c,
        ff_tx_codelet_list_int32_c,
        ff_tx_null_list,
 #if ARCH_X86
        ff_tx_codelet_list_float_x86,
 #endif
    };
    int codelet_list_num = FF_ARRAY_ELEMS(codelet_list);
    /* We still accept functions marked with SLOW, even if the CPU is
     * marked with the same flag, but we give them lower priority. */
    const int cpu_flags = av_get_cpu_flags();
    const int slow_mask = AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE3SLOW  |
                          AV_CPU_FLAG_ATOM     | AV_CPU_FLAG_SSSE3SLOW |
                          AV_CPU_FLAG_AVXSLOW  | AV_CPU_FLAG_SLOW_GATHER;
    /* Flags the codelet may require to be present */
    const uint64_t inv_req_mask = AV_TX_FULL_IMDCT | FF_TX_PRESHUFFLE;
    /* Flags the transform wants */
    uint64_t req_flags = flags;
    /* Unaligned codelets are compatible with the aligned flag */
    if (req_flags & FF_TX_ALIGNED)
        req_flags |= AV_TX_UNALIGNED;
    /* If either flag is set, both are okay, so don't check for an exact match */
    if ((req_flags & AV_TX_INPLACE) && (req_flags & FF_TX_OUT_OF_PLACE))
        req_flags &= ~(AV_TX_INPLACE | FF_TX_OUT_OF_PLACE);
    if ((req_flags & FF_TX_ALIGNED) && (req_flags & AV_TX_UNALIGNED))
        req_flags &= ~(FF_TX_ALIGNED | AV_TX_UNALIGNED);
    /* Loop through all codelets in all codelet lists to find matches
     * to the requirements */
    while (codelet_list_num--) {
        const FFTXCodelet * const * list = codelet_list[codelet_list_num];
        const FFTXCodelet *cd = NULL;
        while ((cd = *list++)) {
            int max_factor = 0;
            /* Check if the type matches */
            if (cd->type != TX_TYPE_ANY && type != cd->type)
                continue;
            /* Check direction for non-orthogonal codelets */
            if (((cd->flags & FF_TX_FORWARD_ONLY) && inv) ||
                ((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv))
                continue;
            /* Check if the requested flags match from both sides */
            if (((req_flags    & cd->flags) != (req_flags)) ||
                ((inv_req_mask & cd->flags) != (req_flags & inv_req_mask)))
                continue;
            /* Check if length is supported */
            if ((len < cd->min_len) ||
                (cd->max_len != TX_LEN_UNLIMITED && (len > cd->max_len)))
                continue;
            /* Check if the CPU supports the required ISA */
            if (!(!cd->cpu_flags || (cpu_flags & (cd->cpu_flags & ~slow_mask))))
                continue;
            /* Check for factors */
            if (!check_cd_factors(cd, len))
                continue;
            /* Realloc array and append */
            cd_tmp = av_fast_realloc(cd_matches, &cd_matches_size,
                                     sizeof(*cd_tmp) * (nb_cd_matches + 1));
            if (!cd_tmp) {
                av_free(cd_matches);
                return AVERROR(ENOMEM);
            }
            cd_matches                     = cd_tmp;
            cd_matches[nb_cd_matches].cd   = cd;
            cd_matches[nb_cd_matches].prio = cd->prio;
            /* If the CPU has a SLOW flag, and the instruction is also flagged
             * as being slow for such, reduce its priority */
            if ((cpu_flags & cd->cpu_flags) & slow_mask)
                cd_matches[nb_cd_matches].prio -= 64;
            /* Prioritize aligned-only codelets */
            if ((cd->flags & FF_TX_ALIGNED) && !(cd->flags & AV_TX_UNALIGNED))
                cd_matches[nb_cd_matches].prio += 64;
            /* Codelets for specific lengths are generally faster */
            if ((len == cd->min_len) && (len == cd->max_len))
                cd_matches[nb_cd_matches].prio += 64;
            /* Forward-only or inverse-only transforms are generally better */
            if ((cd->flags & (FF_TX_FORWARD_ONLY | FF_TX_INVERSE_ONLY)))
                cd_matches[nb_cd_matches].prio += 64;
            /* Larger factors are generally better */
            for (int i = 0; i < TX_MAX_SUB; i++)
                max_factor = FFMAX(cd->factors[i], max_factor);
            if (max_factor)
                cd_matches[nb_cd_matches].prio += 16*max_factor;
            nb_cd_matches++;
        }
    }
    /* No matches found (cd_matches is still NULL here, nothing to free) */
    if (!nb_cd_matches)
        return AVERROR(ENOSYS);
    /* Sort the list */
    AV_QSORT(cd_matches, nb_cd_matches, TXCodeletMatch, cmp_matches);
    /* Print debugging info */
    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_AUTOMATIC);
    av_bprintf(&bp, "For transform of length %i, %s, ", len,
               inv ? "inverse" : "forward");
    print_type(&bp, type);
    av_bprintf(&bp, ", ");
    print_flags(&bp, flags);
    av_bprintf(&bp, ", found %i matches:", nb_cd_matches);
    av_log(NULL, AV_LOG_VERBOSE, "%s\n", bp.str);
    for (int i = 0; i < nb_cd_matches; i++) {
        av_log(NULL, AV_LOG_VERBOSE, "    %i: ", i + 1);
        print_cd_info(cd_matches[i].cd, cd_matches[i].prio, 1);
    }
    /* Allocate the sub-context array on first use; without the NULL check
     * the loop below would dereference a NULL s->sub on allocation failure */
    if (!s->sub) {
        s->sub = sub = av_mallocz(TX_MAX_SUB*sizeof(*sub));
        if (!sub) {
            ret = AVERROR(ENOMEM);
            goto end;
        }
    }
    /* Attempt to initialize each */
    for (int i = 0; i < nb_cd_matches; i++) {
        const FFTXCodelet *cd = cd_matches[i].cd;
        AVTXContext *sctx = &s->sub[s->nb_sub];
        sctx->len        = len;
        sctx->inv        = inv;
        sctx->type       = type;
        sctx->flags      = flags;
        sctx->cd_self    = cd;
        s->fn[s->nb_sub] = cd->function;
        s->cd[s->nb_sub] = cd;
        ret = 0;
        if (cd->init)
            ret = cd->init(sctx, cd, flags, opts, len, inv, scale);
        if (ret >= 0) {
            s->nb_sub++;
            goto end;
        }
        s->fn[s->nb_sub] = NULL;
        s->cd[s->nb_sub] = NULL;
        reset_ctx(sctx);
        /* Trying further codelets is pointless when out of memory */
        if (ret == AVERROR(ENOMEM))
            break;
    }
    /* Release the sub-context array only if this call allocated it, and
     * clear s->sub so that a later reset_ctx(s) does not free it again */
    if (sub)
        av_freep(&s->sub);
    if (ret >= 0)
        ret = AVERROR(ENOSYS);
 end:
    av_free(cd_matches);
    return ret;
 }
 /* Logs the transform tree rooted at s at verbose level: this node's codelet
  * info indented by depth + 1 levels, followed by each sub-transform one
  * level deeper. */
 static void print_tx_structure(AVTXContext *s, int depth)
 {
    const FFTXCodelet *self = s->cd_self;
    int indent = depth + 1;
    while (indent-- > 0)
        av_log(NULL, AV_LOG_VERBOSE, "    ");
    print_cd_info(self, self->prio, 0);
    for (int child = 0; child < s->nb_sub; child++)
        print_tx_structure(s->sub + child, depth + 1);
 }
 av_cold int av_tx_init(AVTXContext **ctx, av_tx_fn *tx, enum AVTXType type,
                       int inv, int len, const void *scale, uint64_t flags)
 {
-    int err;
+    int ret;
-    AVTXContext *s = av_mallocz(sizeof(*s));
+    AVTXContext tmp = { 0 };
-    if (!s)
+    const double default_scale_d = 1.0;
-        return AVERROR(ENOMEM);
+    const float  default_scale_f = 1.0f;
-    switch (type) {
+    if (!len || type >= AV_TX_NB || !ctx || !tx)
-    case AV_TX_FLOAT_FFT:
+        return AVERROR(EINVAL);
    case AV_TX_FLOAT_MDCT:
        if ((err = ff_tx_init_mdct_fft_float(s, tx, type, inv, len, scale, flags)))
            goto fail;
        if (ARCH_X86)
            ff_tx_init_float_x86(s, tx);
        break;
    case AV_TX_DOUBLE_FFT:
    case AV_TX_DOUBLE_MDCT:
        if ((err = ff_tx_init_mdct_fft_double(s, tx, type, inv, len, scale, flags)))
            goto fail;
        break;
    case AV_TX_INT32_FFT:
    case AV_TX_INT32_MDCT:
        if ((err = ff_tx_init_mdct_fft_int32(s, tx, type, inv, len, scale, flags)))
            goto fail;
        break;
    default:
        err = AVERROR(EINVAL);
        goto fail;
    }
-    *ctx = s;
+    if (!(flags & AV_TX_UNALIGNED))
        flags |= FF_TX_ALIGNED;
    if (!(flags & AV_TX_INPLACE))
        flags |= FF_TX_OUT_OF_PLACE;
-    return 0;
+    if (!scale && ((type == AV_TX_FLOAT_MDCT) || (type == AV_TX_INT32_MDCT)))
        scale = &default_scale_f;
    else if (!scale && (type == AV_TX_DOUBLE_MDCT))
        scale = &default_scale_d;
-fail:
+    ret = ff_tx_init_subtx(&tmp, type, flags, NULL, len, inv, scale);
-    av_tx_uninit(&s);
+    if (ret < 0)
-    *tx = NULL;
+        return ret;
-    return err;
+
    *ctx = &tmp.sub[0];
    *tx  = tmp.fn[0];
    av_log(NULL, AV_LOG_VERBOSE, "Transform tree:\n");
    print_tx_structure(*ctx, 0);
    return ret;
 }
--- a/libavutil/tx.h
+++ b/libavutil/tx.h
@@ -52,6 +52,7 @@ enum AVTXType {
     * Standard MDCT with a sample data type of float, double or int32_t,
     * respectively. For the float and int32 variants, the scale type is
     * 'float', while for the double variant, it's 'double'.
     * If scale is NULL, 1.0 will be used as a default.
     *
     * Length is the frame size, not the window size (which is 2x frame).
     * For forward transforms, the stride specifies the spacing between each
@@ -67,6 +68,9 @@ enum AVTXType {
    AV_TX_FLOAT_MDCT  = 1,
    AV_TX_DOUBLE_MDCT = 3,
    AV_TX_INT32_MDCT  = 5,
    /* Not part of the API, do not use */
    AV_TX_NB,
 };
 /**
--- a/libavutil/tx_priv.h
+++ b/libavutil/tx_priv.h
@@ -25,36 +25,48 @@
 #include "attributes.h"
 #ifdef TX_FLOAT
-#define TX_NAME(x) x ## _float
+#define TX_TAB(x) x ## _float
 #define TX_NAME(x) x ## _float_c
 #define TX_NAME_STR(x) x "_float_c"
 #define TX_TYPE(x) AV_TX_FLOAT_ ## x
 #define MULT(x, m) ((x) * (m))
 #define SCALE_TYPE float
-typedef float FFTSample;
+typedef float TXSample;
-typedef AVComplexFloat FFTComplex;
+typedef AVComplexFloat TXComplex;
 #elif defined(TX_DOUBLE)
-#define TX_NAME(x) x ## _double
+#define TX_TAB(x) x ## _double
 #define TX_NAME(x) x ## _double_c
 #define TX_NAME_STR(x) x "_double_c"
 #define TX_TYPE(x) AV_TX_DOUBLE_ ## x
 #define MULT(x, m) ((x) * (m))
 #define SCALE_TYPE double
-typedef double FFTSample;
+typedef double TXSample;
-typedef AVComplexDouble FFTComplex;
+typedef AVComplexDouble TXComplex;
 #elif defined(TX_INT32)
-#define TX_NAME(x) x ## _int32
+#define TX_TAB(x) x ## _int32
 #define TX_NAME(x) x ## _int32_c
 #define TX_NAME_STR(x) x "_int32_c"
 #define TX_TYPE(x) AV_TX_INT32_ ## x
 #define MULT(x, m) (((((int64_t)(x)) * (int64_t)(m)) + 0x40000000) >> 31)
 #define SCALE_TYPE float
-typedef int32_t FFTSample;
+typedef int32_t TXSample;
-typedef AVComplexInt32 FFTComplex;
+typedef AVComplexInt32 TXComplex;
 #else
-typedef void FFTComplex;
+typedef void TXComplex;
 #endif
 #if defined(TX_FLOAT) || defined(TX_DOUBLE)
-#define CMUL(dre, dim, are, aim, bre, bim)                                     \
+#define CMUL(dre, dim, are, aim, bre, bim)      \
-    do {                                                                       \
+    do {                                        \
-        (dre) = (are) * (bre) - (aim) * (bim);                                 \
+        (dre) = (are) * (bre) - (aim) * (bim);  \
-        (dim) = (are) * (bim) + (aim) * (bre);                                 \
+        (dim) = (are) * (bim) + (aim) * (bre);  \
    } while (0)
-#define SMUL(dre, dim, are, aim, bre, bim)                                     \
+#define SMUL(dre, dim, are, aim, bre, bim)      \
-    do {                                                                       \
+    do {                                        \
-        (dre) = (are) * (bre) - (aim) * (bim);                                 \
+        (dre) = (are) * (bre) - (aim) * (bim);  \
-        (dim) = (are) * (bim) - (aim) * (bre);                                 \
+        (dim) = (are) * (bim) - (aim) * (bre);  \
    } while (0)
 #define UNSCALE(x) (x)
@@ -65,91 +77,167 @@ typedef void FFTComplex;
 #elif defined(TX_INT32)
 /* Properly rounds the result */
-#define CMUL(dre, dim, are, aim, bre, bim)                                     \
+#define CMUL(dre, dim, are, aim, bre, bim)             \
-    do {                                                                       \
+    do {                                               \
-        int64_t accu;                                                          \
+        int64_t accu;                                  \
-        (accu)  = (int64_t)(bre) * (are);                                      \
+        (accu)  = (int64_t)(bre) * (are);              \
-        (accu) -= (int64_t)(bim) * (aim);                                      \
+        (accu) -= (int64_t)(bim) * (aim);              \
-        (dre)   = (int)(((accu) + 0x40000000) >> 31);                          \
+        (dre)   = (int)(((accu) + 0x40000000) >> 31);  \
-        (accu)  = (int64_t)(bim) * (are);                                      \
+        (accu)  = (int64_t)(bim) * (are);              \
-        (accu) += (int64_t)(bre) * (aim);                                      \
+        (accu) += (int64_t)(bre) * (aim);              \
-        (dim)   = (int)(((accu) + 0x40000000) >> 31);                          \
+        (dim)   = (int)(((accu) + 0x40000000) >> 31);  \
    } while (0)
-#define SMUL(dre, dim, are, aim, bre, bim)                                     \
+#define SMUL(dre, dim, are, aim, bre, bim)             \
-    do {                                                                       \
+    do {                                               \
-        int64_t accu;                                                          \
+        int64_t accu;                                  \
-        (accu)  = (int64_t)(bre) * (are);                                      \
+        (accu)  = (int64_t)(bre) * (are);              \
-        (accu) -= (int64_t)(bim) * (aim);                                      \
+        (accu) -= (int64_t)(bim) * (aim);              \
-        (dre)   = (int)(((accu) + 0x40000000) >> 31);                          \
+        (dre)   = (int)(((accu) + 0x40000000) >> 31);  \
-        (accu)  = (int64_t)(bim) * (are);                                      \
+        (accu)  = (int64_t)(bim) * (are);              \
-        (accu) -= (int64_t)(bre) * (aim);                                      \
+        (accu) -= (int64_t)(bre) * (aim);              \
-        (dim)   = (int)(((accu) + 0x40000000) >> 31);                          \
+        (dim)   = (int)(((accu) + 0x40000000) >> 31);  \
    } while (0)
-#define UNSCALE(x) ((double)x/2147483648.0)
+#define UNSCALE(x) ((double)(x)/2147483648.0)
 /* Converts a floating-point value to Q31 fixed point: scales it by 2^31,
  * rounds to the nearest integer and clips the result to the int32_t range.
  * llrintf() is used rather than lrintf() because long may be only 32 bits
  * wide, in which case the rounded value could overflow before av_clip64()
  * gets a chance to clip it. */
 #define RESCALE(x) (av_clip64(llrintf((x) * 2147483648.0), INT32_MIN, INT32_MAX))
-#define FOLD(x, y) ((int)((x) + (unsigned)(y) + 32) >> 6)
+#define FOLD(x, y) ((int32_t)((x) + (unsigned)(y) + 32) >> 6)
-#endif
+#endif /* TX_INT32 */
-#define BF(x, y, a, b)                                                         \
+#define BF(x, y, a, b)  \
-    do {                                                                       \
+    do {                \
-        x = (a) - (b);                                                         \
+        x = (a) - (b);  \
-        y = (a) + (b);                                                         \
+        y = (a) + (b);  \
    } while (0)
-#define CMUL3(c, a, b)                                                         \
+#define CMUL3(c, a, b) CMUL((c).re, (c).im, (a).re, (a).im, (b).re, (b).im)
    CMUL((c).re, (c).im, (a).re, (a).im, (b).re, (b).im)
-#define COSTABLE(size)                                                         \
+/* Codelet flags, used to pick codelets. Must be a superset of enum AVTXFlags,
-    DECLARE_ALIGNED(32, FFTSample, TX_NAME(ff_cos_##size))[size/4 + 1]
+ * but if it runs out of bits, it can be made separate. */
 /* The flag values occupy bits 59..63 and therefore do not fit in an int.
  * ISO C requires enumeration constants to be representable as int, so the
  * flags are plain 64-bit macros and the type is a 64-bit integer, matching
  * the uint64_t flags fields they get combined into. */
 typedef uint64_t FFTXCodeletFlags;
 #define FF_TX_OUT_OF_PLACE  (1ULL << 63) /* Can be OR'd with AV_TX_INPLACE             */
 #define FF_TX_ALIGNED       (1ULL << 62) /* Cannot be OR'd with AV_TX_UNALIGNED        */
 #define FF_TX_PRESHUFFLE    (1ULL << 61) /* Codelet expects permuted coeffs            */
 #define FF_TX_INVERSE_ONLY  (1ULL << 60) /* For non-orthogonal inverse-only transforms */
 #define FF_TX_FORWARD_ONLY  (1ULL << 59) /* For non-orthogonal forward-only transforms */
 typedef enum FFTXCodeletPriority {
    /* Lower bound, meant for naive implementations */
    FF_TX_PRIO_MIN  = -131072,
    /* Baseline priority. SIMD codelets start from their register size in
     * bits and move in steps of 64 for faster/slower features, like FMA. */
    FF_TX_PRIO_BASE = 0,
    /* Upper bound, meant for custom implementations/ASICs */
    FF_TX_PRIO_MAX  = 32768,
 } FFTXCodeletPriority;
 /* Codelet options */
 /* Per-call options handed to a codelet's init() callback. */
 typedef struct FFTXCodeletOptions {
    int invert_lookup;     /* If codelet is flagged as FF_TX_PRESHUFFLE,
                              invert the lookup direction for the map generated */
 } FFTXCodeletOptions;
 /* Maximum amount of subtransform functions, subtransforms and factors. Arbitrary. */
 #define TX_MAX_SUB 4
 /* Static description of one transform implementation (codelet): what it
  * computes, which lengths and flags it supports, how to set it up, and how
  * strongly it should be preferred. Codelets are gathered in NULL-terminated
  * ff_tx_codelet_list_* arrays. */
 typedef struct FFTXCodelet {
    const char    *name;          /* Codelet name, for debugging */
    av_tx_fn       function;      /* Codelet function, != NULL */
    enum AVTXType  type;          /* Type of codelet transform */
 #define TX_TYPE_ANY INT32_MAX     /* Special type to allow all types */
    uint64_t flags;               /* A combination of AVTXFlags and FFTXCodeletFlags
                                   * that describe the codelet's properties. */
    int factors[TX_MAX_SUB];      /* Length factors */
 #define TX_FACTOR_ANY -1          /* When used alone, signals that the codelet
                                   * supports all factors. Otherwise, if other
                                   * factors are present, it signals that whatever
                                   * remains will be supported, as long as the
                                   * other factors are a component of the length */
    int min_len;                  /* Minimum length of transform, must be >= 1 */
    int max_len;                  /* Maximum length of transform */
 #define TX_LEN_UNLIMITED -1       /* Special length value to permit all lengths */
    int (*init)(AVTXContext *s,   /* Optional callback for current context initialization.
                                   * May be NULL if the codelet needs no setup. */
                const struct FFTXCodelet *cd,
                uint64_t flags,
                FFTXCodeletOptions *opts,
                int len, int inv,
                const void *scale);
    int (*uninit)(AVTXContext *s); /* Optional callback for uninitialization. */
    int cpu_flags;                 /* CPU flags. If any negative flags like
                                    * SLOW are present, will avoid picking.
                                    * 0x0 to signal it's a C codelet */
 #define FF_TX_CPU_FLAGS_ALL 0x0    /* Special CPU flag for C */
    int prio;                      /* < 0 = least, 0 = no pref, > 0 = prefer;
                                    * see FFTXCodeletPriority for reference values */
 } FFTXCodelet;
 /* Used by asm, reorder with care */
 struct AVTXContext {
-    int n;              /* Non-power-of-two part */
+    /* Fields the root transform and subtransforms use or may use.
-    int m;              /* Power-of-two part */
+     * NOTE: This section is used by assembly, do not reorder or change */
-    int inv;            /* Is inverse */
+    int                len;             /* Length of the transform */
-    int type;           /* Type */
+    int                inv;             /* If transform is inverse */
-    uint64_t flags;     /* Flags */
+    int               *map;             /* Lookup table(s) */
-    double scale;       /* Scale */
+    TXComplex         *exp;             /* Any non-pre-baked multiplication factors needed */
    TXComplex         *tmp;             /* Temporary buffer, if needed */
-    FFTComplex *exptab; /* MDCT exptab */
+    AVTXContext       *sub;             /* Subtransform context(s), if needed */
-    FFTComplex    *tmp; /* Temporary buffer needed for all compound transforms */
+    av_tx_fn           fn[TX_MAX_SUB];  /* Function(s) for the subtransforms */
-    int        *pfatab; /* Input/Output mapping for compound transforms */
+    int                nb_sub;          /* Number of subtransforms.
-    int        *revtab; /* Input mapping for power of two transforms */
+                                         * The reason all of these are set here
-    int   *inplace_idx; /* Required indices to revtab for in-place transforms */
+                                         * rather than in each separate context
                                         * is to eliminate extra pointer
                                         * dereferences. */
-    int      *revtab_c; /* Revtab for only the C transforms, needed because
+    /* Fields mainly useful/applicable for the root transform or initialization.
-                         * checkasm makes us reuse the same context. */
+     * Fields below are not used by assembly code. */
-
+    const FFTXCodelet *cd[TX_MAX_SUB];  /* Subtransform codelets */
-    av_tx_fn    top_tx; /* Used for computing transforms derived from other
+    const FFTXCodelet *cd_self;         /* Codelet for the current context */
-                         * transforms, like full-length iMDCTs and RDFTs.
+    enum AVTXType      type;            /* Type of transform */
-                         * NOTE: Do NOT use this to mix assembly with C code. */
+    uint64_t           flags;           /* A combination of AVTXFlags
                                           and FFTXCodeletFlags flags
                                           used when creating */
    float              scale_f;
    double             scale_d;
    void              *opaque;          /* Free to use by implementations */
 };
-/* Checks if type is an MDCT */
+/* Create a subtransform in the current context with the given parameters.
-int ff_tx_type_is_mdct(enum AVTXType type);
+ * The flags parameter from FFTXCodelet.init() should be preserved as much
 * as that's possible.
 * MUST be called during the init() callback of each codelet. */
 int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
                     uint64_t flags, FFTXCodeletOptions *opts,
                     int len, int inv, const void *scale);
 /*
 * Generates the PFA permutation table into AVTXContext->map. The end table
 * is appended to the start table.
 */
-int ff_tx_gen_compound_mapping(AVTXContext *s);
+int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m);
 /*
 * Generates a standard-ish (slightly modified) Split-Radix revtab into
- * AVTXContext->revtab
+ * AVTXContext->map. Invert lookup changes how the mapping needs to be applied.
 * If it's set to 0, it has to be applied like out[map[i]] = in[i], otherwise
 * if it's set to 1, has to be applied as out[i] = in[map[i]]
 */
 int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup);
 /*
 * Generates an index into AVTXContext->inplace_idx that if followed in the
- * specific order,  allows the revtab to be done in-place. AVTXContext->revtab
+ * specific order, allows the revtab to be done in-place. The sub-transform
- * must already exist.
+ * and its map should already be initialized.
 */
-int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s, int *revtab);
+int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s);
 /*
 * This generates a parity-based revtab of length len and direction inv.
@@ -179,25 +267,26 @@ int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s, int *revtab);
 *
 * If length is smaller than basis/2 this function will not do anything.
 */
-void ff_tx_gen_split_radix_parity_revtab(int *revtab, int len, int inv,
+int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int invert_lookup,
-                                         int basis, int dual_stride);
+                                        int basis, int dual_stride);
-/* Templated init functions */
+/* Typed init function to initialize shared tables. Will initialize all tables
-int ff_tx_init_mdct_fft_float(AVTXContext *s, av_tx_fn *tx,
+ * for all factors of a length. */
-                              enum AVTXType type, int inv, int len,
+void ff_tx_init_tabs_float (int len);
-                              const void *scale, uint64_t flags);
+void ff_tx_init_tabs_double(int len);
-int ff_tx_init_mdct_fft_double(AVTXContext *s, av_tx_fn *tx,
+void ff_tx_init_tabs_int32 (int len);
                               enum AVTXType type, int inv, int len,
                               const void *scale, uint64_t flags);
 int ff_tx_init_mdct_fft_int32(AVTXContext *s, av_tx_fn *tx,
                              enum AVTXType type, int inv, int len,
                              const void *scale, uint64_t flags);
-typedef struct CosTabsInitOnce {
+/* Typed init function to initialize an MDCT exptab in a context. */
-    void (*func)(void);
+int  ff_tx_mdct_gen_exp_float (AVTXContext *s);
-    AVOnce control;
+int  ff_tx_mdct_gen_exp_double(AVTXContext *s);
-} CosTabsInitOnce;
+int  ff_tx_mdct_gen_exp_int32 (AVTXContext *s);
-void ff_tx_init_float_x86(AVTXContext *s, av_tx_fn *tx);
+/* Lists of codelets */
 extern const FFTXCodelet * const ff_tx_codelet_list_float_c       [];
 extern const FFTXCodelet * const ff_tx_codelet_list_float_x86     [];
 extern const FFTXCodelet * const ff_tx_codelet_list_double_c      [];
 extern const FFTXCodelet * const ff_tx_codelet_list_int32_c       [];
 #endif /* AVUTIL_TX_PRIV_H */
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -31,6 +31,8 @@
 %include "x86util.asm"
 %define private_prefix ff_tx
 %if ARCH_X86_64
 %define ptr resq
 %else
@@ -39,25 +41,22 @@
 %assign i 16
 %rep 14
-cextern cos_ %+ i %+ _float ; ff_cos_i_float...
+cextern tab_ %+ i %+ _float ; ff_tab_i_float...
 %assign i (i << 1)
 %endrep
 struc AVTXContext
-    .n:           resd 1 ; Non-power-of-two part
+    .len:          resd 1 ; Length
-    .m:           resd 1 ; Power-of-two part
+    .inv           resd 1 ; Inverse flag
-    .inv:         resd 1 ; Is inverse
+    .map:           ptr 1 ; Lookup table(s)
-    .type:        resd 1 ; Type
+    .exp:           ptr 1 ; Exponentiation factors
-    .flags:       resq 1 ; Flags
+    .tmp:           ptr 1 ; Temporary data
    .scale:       resq 1 ; Scale
-    .exptab:       ptr 1 ; MDCT exptab
+    .sub:           ptr 1 ; Subcontexts
-    .tmp:          ptr 1 ; Temporary buffer needed for all compound transforms
+    .fn:            ptr 4 ; Subcontext functions
-    .pfatab:       ptr 1 ; Input/Output mapping for compound transforms
+    .nb_sub:       resd 1 ; Subcontext count
    .revtab:       ptr 1 ; Input mapping for power of two transforms
    .inplace_idx:  ptr 1 ; Required indices to revtab for in-place transforms
-    .top_tx        ptr 1 ;  Used for transforms derived from other transforms
+    ; Everything else is inaccessible
 endstruc
 SECTION_RODATA 32
@@ -485,8 +484,8 @@ SECTION .text
    movaps [outq + 10*mmsize], tx1_o0
    movaps [outq + 14*mmsize], tx2_o0
-    movaps tw_e,           [cos_64_float + mmsize]
+    movaps tw_e,           [tab_64_float + mmsize]
-    vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7 - mmsize], 0x23
+    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23
    movaps m0, [outq +  1*mmsize]
    movaps m1, [outq +  3*mmsize]
@@ -710,8 +709,7 @@ FFT4 inv, 1
 INIT_XMM sse3
 cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
-    mov ctxq, [ctxq + AVTXContext.revtab]
+    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
@@ -733,8 +731,7 @@ cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
 INIT_YMM avx
 cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
-    mov ctxq, [ctxq + AVTXContext.revtab]
+    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
@@ -754,7 +751,7 @@ cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
 %macro FFT16_FN 1
 INIT_YMM %1
 cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
-    mov ctxq, [ctxq + AVTXContext.revtab]
+    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
@@ -786,7 +783,7 @@ FFT16_FN fma3
 %macro FFT32_FN 1
 INIT_YMM %1
 cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
-    mov ctxq, [ctxq + AVTXContext.revtab]
+    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq,  m8,  m9
    LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m10, m11
@@ -800,8 +797,8 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m12, m13
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m14, m15
-    movaps m8,         [cos_32_float]
+    movaps m8,         [tab_32_float]
-    vperm2f128 m9, m9, [cos_32_float + 4*8 - 4*7], 0x23
+    vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23
    FFT16 m0, m1, m2, m3, m10, m11, m12, m13
@@ -858,8 +855,8 @@ ALIGN 16
    POP lenq
    sub outq, (%1*4) + (%1*2) + (%1/2)
-    lea rtabq, [cos_ %+ %1 %+ _float]
+    lea rtabq, [tab_ %+ %1 %+ _float]
-    lea itabq, [cos_ %+ %1 %+ _float + %1 - 4*7]
+    lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7]
 %if %0 > 1
    cmp tgtq, %1
@@ -883,9 +880,9 @@ ALIGN 16
 %macro FFT_SPLIT_RADIX_FN 1
 INIT_YMM %1
-cglobal split_radix_fft_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
+cglobal fft_sr_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
-    movsxd lenq, dword [lutq + AVTXContext.m]
+    movsxd lenq, dword [lutq + AVTXContext.len]
-    mov lutq, [lutq + AVTXContext.revtab]
+    mov lutq, [lutq + AVTXContext.map]
    mov tgtq, lenq
 ; Bottom-most/32-point transform ===============================================
@@ -903,8 +900,8 @@ ALIGN 16
    LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m12, m13
    LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m14, m15
-    movaps m8,         [cos_32_float]
+    movaps m8,         [tab_32_float]
-    vperm2f128 m9, m9, [cos_32_float + 32 - 4*7], 0x23
+    vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23
    FFT16 m0, m1, m2, m3, m10, m11, m12, m13
@@ -961,8 +958,8 @@ ALIGN 16
    FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o
-    movaps tw_e,           [cos_64_float]
+    movaps tw_e,           [tab_64_float]
-    vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7], 0x23
+    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23
    add lutq, (mmsize/2)*8
    cmp tgtq, 64
@@ -989,8 +986,8 @@ ALIGN 16
    POP lenq
    sub outq, 24*mmsize
-    lea rtabq, [cos_128_float]
+    lea rtabq, [tab_128_float]
-    lea itabq, [cos_128_float + 128 - 4*7]
+    lea itabq, [tab_128_float + 128 - 4*7]
    cmp tgtq, 128
    je .deinterleave
@@ -1016,8 +1013,8 @@ ALIGN 16
    POP lenq
    sub outq, 48*mmsize
-    lea rtabq, [cos_256_float]
+    lea rtabq, [tab_256_float]
-    lea itabq, [cos_256_float + 256 - 4*7]
+    lea itabq, [tab_256_float + 256 - 4*7]
    cmp tgtq, 256
    je .deinterleave
@@ -1044,8 +1041,8 @@ ALIGN 16
    POP lenq
    sub outq, 96*mmsize
-    lea rtabq, [cos_512_float]
+    lea rtabq, [tab_512_float]
-    lea itabq, [cos_512_float + 512 - 4*7]
+    lea itabq, [tab_512_float + 512 - 4*7]
    cmp tgtq, 512
    je .deinterleave
@@ -1079,8 +1076,8 @@ ALIGN 16
    POP lenq
    sub outq, 192*mmsize
-    lea rtabq, [cos_1024_float]
+    lea rtabq, [tab_1024_float]
-    lea itabq, [cos_1024_float + 1024 - 4*7]
+    lea itabq, [tab_1024_float + 1024 - 4*7]
    cmp tgtq, 1024
    je .deinterleave
@@ -1160,8 +1157,8 @@ FFT_SPLIT_RADIX_DEF 131072
    vextractf128 [outq + 13*mmsize +  0], tw_e,   1
    vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1
-    movaps tw_e,           [cos_64_float + mmsize]
+    movaps tw_e,           [tab_64_float + mmsize]
-    vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7 - mmsize], 0x23
+    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23
    movaps m0, [outq +  1*mmsize]
    movaps m1, [outq +  3*mmsize]
--- a/libavutil/x86/tx_float_init.c
+++ b/libavutil/x86/tx_float_init.c
@@ -21,86 +21,106 @@
 #include "libavutil/attributes.h"
 #include "libavutil/x86/cpu.h"
-void ff_fft2_float_sse3     (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
+#include "config.h"
 void ff_fft4_inv_float_sse2 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
 void ff_fft4_fwd_float_sse2 (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
 void ff_fft8_float_sse3     (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
 void ff_fft8_float_avx      (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
 void ff_fft16_float_avx     (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
 void ff_fft16_float_fma3    (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
 void ff_fft32_float_avx     (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
 void ff_fft32_float_fma3    (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
-void ff_split_radix_fft_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride);
+#define DECL_INIT_FN(basis, interleave)                                        \
-void ff_split_radix_fft_float_avx2(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
+static av_cold int                                                             \
-
+    ff_tx_fft_sr_codelet_init_b ##basis## _i ##interleave## _x86               \
-av_cold void ff_tx_init_float_x86(AVTXContext *s, av_tx_fn *tx)
+    (AVTXContext *s,                                                           \
-{
+     const FFTXCodelet *cd,                                                    \
-    int cpu_flags = av_get_cpu_flags();
+     uint64_t flags,                                                           \
-    int gen_revtab = 0, basis, revtab_interleave;
+     FFTXCodeletOptions *opts,                                                 \
-
+     int len, int inv,                                                         \
-    if (s->flags & AV_TX_UNALIGNED)
+     const void *scale)                                                        \
-        return;
+{                                                                              \
-
+    const int inv_lookup = opts ? opts->invert_lookup : 1;                     \
-    if (ff_tx_type_is_mdct(s->type))
+    ff_tx_init_tabs_float(len);                                                \
-        return;
+    return ff_tx_gen_split_radix_parity_revtab(s, inv_lookup,                  \
-
+                                               basis, interleave);             \
 #define TXFN(fn, gentab, sr_basis, interleave) \
    do {                                       \
        *tx = fn;                              \
        gen_revtab = gentab;                   \
        basis = sr_basis;                      \
        revtab_interleave = interleave;        \
    } while (0)
    if (s->n == 1) {
        if (EXTERNAL_SSE2(cpu_flags)) {
            if (s->m == 4 && s->inv)
                TXFN(ff_fft4_inv_float_sse2, 0, 0, 0);
            else if (s->m == 4)
                TXFN(ff_fft4_fwd_float_sse2, 0, 0, 0);
        }
        if (EXTERNAL_SSE3(cpu_flags)) {
            if (s->m == 2)
                TXFN(ff_fft2_float_sse3, 0, 0, 0);
            else if (s->m == 8)
                TXFN(ff_fft8_float_sse3, 1, 8, 0);
        }
        if (EXTERNAL_AVX_FAST(cpu_flags)) {
            if (s->m == 8)
                TXFN(ff_fft8_float_avx, 1, 8, 0);
            else if (s->m == 16)
                TXFN(ff_fft16_float_avx, 1, 8, 2);
 #if ARCH_X86_64
            else if (s->m == 32)
                TXFN(ff_fft32_float_avx, 1, 8, 2);
            else if (s->m >= 64 && s->m <= 131072 && !(s->flags & AV_TX_INPLACE))
                TXFN(ff_split_radix_fft_float_avx, 1, 8, 2);
 #endif
        }
        if (EXTERNAL_FMA3_FAST(cpu_flags)) {
            if (s->m == 16)
                TXFN(ff_fft16_float_fma3, 1, 8, 2);
 #if ARCH_X86_64
            else if (s->m == 32)
                TXFN(ff_fft32_float_fma3, 1, 8, 2);
 #endif
        }
 #if ARCH_X86_64
        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
            if (s->m >= 64 && s->m <= 131072 && !(s->flags & AV_TX_INPLACE))
                TXFN(ff_split_radix_fft_float_avx2, 1, 8, 2);
        }
 #endif
    }
    if (gen_revtab)
        ff_tx_gen_split_radix_parity_revtab(s->revtab, s->m, s->inv, basis,
                                            revtab_interleave);
 #undef TXFN
 }
 #define ff_tx_fft_sr_codelet_init_b0_i0_x86 NULL
 DECL_INIT_FN(8, 0)
 DECL_INIT_FN(8, 2)
 /* Declares the entry point ff_tx_<fn_name> and defines the matching static
  * codelet descriptor ff_tx_<fn_name>_def for a fixed-length FFT:
  *   fn_name  - function name without the ff_tx_ prefix, also used as .name
  *   len      - the only supported length (both min_len and max_len)
  *   init_fn  - suffix selecting ff_tx_fft_sr_codelet_init_<init_fn>_x86
  *   fn_prio  - codelet priority
  *   cpu      - AV_CPU_FLAG_ suffix stored in .cpu_flags
  *   fn_flags - extra flags OR'd on top of FF_TX_OUT_OF_PLACE | FF_TX_ALIGNED
  * Value arguments are parenthesized so that expression arguments cannot
  * regroup with the surrounding operators. */
 #define DECL_SR_CD_DEF(fn_name, len, init_fn, fn_prio, cpu, fn_flags)          \
 void ff_tx_ ##fn_name(AVTXContext *s, void *out, void *in, ptrdiff_t stride);  \
 static const FFTXCodelet ff_tx_ ##fn_name## _def = {                           \
    .name       = #fn_name,                                                    \
    .function   = ff_tx_ ##fn_name,                                            \
    .type       = TX_TYPE(FFT),                                                \
    .flags      = FF_TX_OUT_OF_PLACE | FF_TX_ALIGNED | (fn_flags),             \
    .factors[0] = 2,                                                           \
    .min_len    = (len),                                                       \
    .max_len    = (len),                                                       \
    .init       = ff_tx_fft_sr_codelet_init_ ##init_fn## _x86,                 \
    .cpu_flags  = AV_CPU_FLAG_ ##cpu,                                          \
    .prio       = (fn_prio),                                                   \
 };
 DECL_SR_CD_DEF(fft2_float_sse3,      2, b0_i0, 128, SSE3, AV_TX_INPLACE)
 DECL_SR_CD_DEF(fft4_fwd_float_sse2,  4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY)
 DECL_SR_CD_DEF(fft4_inv_float_sse2,  4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY)
 DECL_SR_CD_DEF(fft8_float_sse3,      8, b8_i0, 128, SSE3, AV_TX_INPLACE)
 DECL_SR_CD_DEF(fft8_float_avx,       8, b8_i0, 256, AVX,  AV_TX_INPLACE)
 DECL_SR_CD_DEF(fft16_float_avx,     16, b8_i2, 256, AVX,  AV_TX_INPLACE)
 DECL_SR_CD_DEF(fft16_float_fma3,    16, b8_i2, 288, FMA3, AV_TX_INPLACE)
 #if ARCH_X86_64
 DECL_SR_CD_DEF(fft32_float_avx,     32, b8_i2, 256, AVX,  AV_TX_INPLACE)
 DECL_SR_CD_DEF(fft32_float_fma3,    32, b8_i2, 288, FMA3, AV_TX_INPLACE)
 /* AVX split-radix FFT codelet covering lengths from 64 up to 131072.
  * AV_TX_INPLACE is not among its flags. */
 void ff_tx_fft_sr_float_avx(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
 const FFTXCodelet ff_tx_fft_sr_float_avx_def = {
    /* What it is and how it is entered */
    .name       = "fft_sr_float_avx",
    .type       = TX_TYPE(FFT),
    .function   = ff_tx_fft_sr_float_avx,
    .init       = ff_tx_fft_sr_codelet_init_b8_i2_x86,
    /* Which transforms it accepts */
    .factors[0] = 2,
    .min_len    = 64,
    .max_len    = 131072,
    .flags      = FF_TX_OUT_OF_PLACE | FF_TX_ALIGNED,
    /* When it gets picked */
    .cpu_flags  = AV_CPU_FLAG_AVX,
    .prio       = 256,
 };
 #if HAVE_AVX2_EXTERNAL
 /* AVX2 split-radix FFT codelet covering lengths from 64 up to 131072.
  * Shares its init callback with the AVX version but carries a higher
  * priority value (288 vs. 256). */
 void ff_tx_fft_sr_float_avx2(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
 const FFTXCodelet ff_tx_fft_sr_float_avx2_def = {
    /* What it is and how it is entered */
    .name       = "fft_sr_float_avx2",
    .type       = TX_TYPE(FFT),
    .function   = ff_tx_fft_sr_float_avx2,
    .init       = ff_tx_fft_sr_codelet_init_b8_i2_x86,
    /* Which transforms it accepts */
    .factors[0] = 2,
    .min_len    = 64,
    .max_len    = 131072,
    .flags      = FF_TX_OUT_OF_PLACE | FF_TX_ALIGNED,
    /* When it gets picked */
    .cpu_flags  = AV_CPU_FLAG_AVX2,
    .prio       = 288,
 };
 #endif
 #endif
 /* NULL-terminated list of every x86 float codelet defined in this file.
  * The 32-point and length-generic split-radix entries are only present on
  * x86-64, and the AVX2 entry additionally requires HAVE_AVX2_EXTERNAL,
  * mirroring the guards around the corresponding definitions. */
 const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
    /* Split-Radix codelets */
    &ff_tx_fft2_float_sse3_def,
    &ff_tx_fft4_fwd_float_sse2_def,
    &ff_tx_fft4_inv_float_sse2_def,
    &ff_tx_fft8_float_sse3_def,
    &ff_tx_fft8_float_avx_def,
    &ff_tx_fft16_float_avx_def,
    &ff_tx_fft16_float_fma3_def,
 #if ARCH_X86_64
    &ff_tx_fft32_float_avx_def,
    &ff_tx_fft32_float_fma3_def,
    /* Standalone transforms */
    &ff_tx_fft_sr_float_avx_def,
 #if HAVE_AVX2_EXTERNAL
    &ff_tx_fft_sr_float_avx2_def,
 #endif
 #endif
    /* List terminator */
    NULL,
 };