diff --git a/libavutil/tx_priv.h b/libavutil/tx_priv.h
index 11892df4be..28c8435c83 100644
--- a/libavutil/tx_priv.h
+++ b/libavutil/tx_priv.h
@@ -29,6 +29,8 @@
 #define TX_NAME(x) x ## _float_c
 #define TX_NAME_STR(x) x "_float_c"
 #define TX_TYPE(x) AV_TX_FLOAT_ ## x
+#define TX_FN_NAME(fn, suffix) ff_tx_ ## fn ## _float_ ## suffix
+#define TX_FN_NAME_STR(fn, suffix) #fn "_float_" #suffix
 #define MULT(x, m) ((x) * (m))
 #define SCALE_TYPE float
 typedef float TXSample;
@@ -38,6 +40,8 @@ typedef AVComplexFloat TXComplex;
 #define TX_NAME(x) x ## _double_c
 #define TX_NAME_STR(x) x "_double_c"
 #define TX_TYPE(x) AV_TX_DOUBLE_ ## x
+#define TX_FN_NAME(fn, suffix) ff_tx_ ## fn ## _double_ ## suffix
+#define TX_FN_NAME_STR(fn, suffix) #fn "_double_" #suffix
 #define MULT(x, m) ((x) * (m))
 #define SCALE_TYPE double
 typedef double TXSample;
@@ -47,6 +51,8 @@ typedef AVComplexDouble TXComplex;
 #define TX_NAME(x) x ## _int32_c
 #define TX_NAME_STR(x) x "_int32_c"
 #define TX_TYPE(x) AV_TX_INT32_ ## x
+#define TX_FN_NAME(fn, suffix) ff_tx_ ## fn ## _int32_ ## suffix
+#define TX_FN_NAME_STR(fn, suffix) #fn "_int32_" #suffix
 #define MULT(x, m) (((((int64_t)(x)) * (int64_t)(m)) + 0x40000000) >> 31)
 #define SCALE_TYPE float
 typedef int32_t TXSample;
@@ -55,6 +61,37 @@ typedef AVComplexInt32 TXComplex;
 typedef void TXComplex;
 #endif
 
+/* Declare the prototype of the transform function ff_tx_<fn>_<type>_<suffix>.
+ * The expansion already ends in ';', so invocations need no trailing semicolon. */
+#define TX_DECL_FN(fn, suffix) \
+    void TX_FN_NAME(fn, suffix)(AVTXContext *s, void *o, void *i, ptrdiff_t st);
+
+/* Expand to a pointer to a constant FFTXCodelet compound literal describing
+ * the function TX_FN_NAME(fn, suffix); .name is "<fn>_<type>_<suffix>".
+ *   tx_type          - transform type, expanded through TX_TYPE()
+ *   len_min, len_max - stored in min_len/max_len
+ *   f1, f2           - stored in factors[]
+ *   p                - stored in prio
+ *   init_fn          - stored in init
+ *   cf               - CPU flag name without the AV_CPU_FLAG_ prefix
+ *   cd_flags         - extra flags, ORed with FF_TX_ALIGNED | FF_TX_OUT_OF_PLACE
+ *   cf2              - extra CPU flag bits, ORed with AV_CPU_FLAG_<cf>
+ * cd_flags and cf2 are parenthesized so any expression can be passed safely. */
+#define TX_DEF(fn, tx_type, len_min, len_max, f1, f2, \
+               p, init_fn, suffix, cf, cd_flags, cf2) \
+    &(const FFTXCodelet){ \
+        .name = TX_FN_NAME_STR(fn, suffix), \
+        .function = TX_FN_NAME(fn, suffix), \
+        .type = TX_TYPE(tx_type), \
+        .flags = FF_TX_ALIGNED | FF_TX_OUT_OF_PLACE | (cd_flags), \
+        .factors = { f1, f2 }, \
+        .min_len = len_min, \
+        .max_len = len_max, \
+        .init = init_fn, \
+        .cpu_flags = (cf2) | AV_CPU_FLAG_ ## cf, \
+        .prio = p, \
+    }
+
 #if defined(TX_FLOAT) || defined(TX_DOUBLE)
 
 #define CMUL(dre, dim, are, aim, bre, bim) \
diff 
--git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c
index 3a87711def..e60faf7fa6 100644
--- a/libavutil/x86/tx_float_init.c
+++ b/libavutil/x86/tx_float_init.c
@@ -23,19 +23,35 @@
 
 #include "config.h"
 
-/* These versions already do what we need them to do. */
-#define ff_tx_fft2_ns_float_sse3 ff_tx_fft2_float_sse3
-#define ff_tx_fft4_ns_float_sse2 ff_tx_fft4_fwd_float_sse2
+/* Prototypes of the transform functions referenced by the codelet list below.
+ * The preshuffled 2- and 4-point entries reuse fft2 and fft4_fwd. */
+TX_DECL_FN(fft2, sse3)
+TX_DECL_FN(fft4_fwd, sse2)
+TX_DECL_FN(fft4_inv, sse2)
+TX_DECL_FN(fft8, sse3)
+TX_DECL_FN(fft8_ns, sse3)
+TX_DECL_FN(fft8, avx)
+TX_DECL_FN(fft8_ns, avx)
+TX_DECL_FN(fft16, avx)
+TX_DECL_FN(fft16_ns, avx)
+TX_DECL_FN(fft16, fma3)
+TX_DECL_FN(fft16_ns, fma3)
+TX_DECL_FN(fft32, avx)
+TX_DECL_FN(fft32_ns, avx)
+TX_DECL_FN(fft32, fma3)
+TX_DECL_FN(fft32_ns, fma3)
+TX_DECL_FN(fft_sr, avx)
+TX_DECL_FN(fft_sr_ns, avx)
+TX_DECL_FN(fft_sr, avx2)
+TX_DECL_FN(fft_sr_ns, avx2)
 
 #define DECL_INIT_FN(basis, interleave) \
-static av_cold int \
-    ff_tx_fft_sr_codelet_init_b ##basis## _i ##interleave## _x86 \
-    (AVTXContext *s, \
-    const FFTXCodelet *cd, \
-    uint64_t flags, \
-    FFTXCodeletOptions *opts, \
-    int len, int inv, \
-    const void *scale) \
+static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \
+                                               const FFTXCodelet *cd, \
+                                               uint64_t flags, \
+                                               FFTXCodeletOptions *opts, \
+                                               int len, int inv, \
+                                               const void *scale) \
 { \
     const int inv_lookup = opts ? opts->invert_lookup : 1; \
     ff_tx_init_tabs_float(len); \
@@ -46,95 +62,37 @@ static av_cold int \
                                                     basis, interleave); \
 }
 
-#define ff_tx_fft_sr_codelet_init_b0_i0_x86 NULL
 DECL_INIT_FN(8, 0)
 DECL_INIT_FN(8, 2)
 
-#define DECL_CD_DEF(fn, t, min, max, f1, f2, i, p, c, f) \
-void ff_tx_ ##fn(AVTXContext *s, void *out, void *in, ptrdiff_t stride); \
-static const FFTXCodelet ff_tx_ ##fn## _def = { \
-    .name = #fn, \
-    .function = ff_tx_ ##fn, \
-    .type = TX_TYPE(t), \
-    .flags = FF_TX_ALIGNED | f, \
-    .factors = { f1, f2 }, \
-    .min_len = min, \
-    .max_len = max, \
-    .init = ff_tx_ ##i## _x86, \
-    .cpu_flags = c, \
-    .prio = p, \
-};
-
-#define DECL_SR_CD_DEF(fn_name, len, init_fn, fn_prio, cpu, fn_flags) \
-    DECL_CD_DEF(fn_name, FFT, len, len, 2, 0, \
-                fft_sr_codelet_init_ ##init_fn, fn_prio, \
-                AV_CPU_FLAG_ ##cpu, FF_TX_OUT_OF_PLACE | fn_flags)
-
-DECL_SR_CD_DEF(fft2_float_sse3, 2, b0_i0, 128, SSE3, AV_TX_INPLACE)
-DECL_SR_CD_DEF(fft2_ns_float_sse3, 2, b8_i0, 192, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
-DECL_SR_CD_DEF(fft4_fwd_float_sse2, 4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY)
-DECL_SR_CD_DEF(fft4_inv_float_sse2, 4, b0_i0, 128, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY)
-DECL_SR_CD_DEF(fft4_ns_float_sse2, 4, b8_i0, 192, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
-DECL_SR_CD_DEF(fft8_float_sse3, 8, b8_i0, 128, SSE3, AV_TX_INPLACE)
-DECL_SR_CD_DEF(fft8_ns_float_sse3, 8, b8_i0, 192, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
-DECL_SR_CD_DEF(fft8_float_avx, 8, b8_i0, 256, AVX, AV_TX_INPLACE)
-DECL_SR_CD_DEF(fft8_ns_float_avx, 8, b8_i0, 320, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
-DECL_SR_CD_DEF(fft16_float_avx, 16, b8_i2, 256, AVX, AV_TX_INPLACE)
-DECL_SR_CD_DEF(fft16_ns_float_avx, 16, b8_i2, 320, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
-DECL_SR_CD_DEF(fft16_float_fma3, 16, b8_i2, 288, FMA3, AV_TX_INPLACE)
-DECL_SR_CD_DEF(fft16_ns_float_fma3, 16, b8_i2, 352, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
-
-#if ARCH_X86_64
-DECL_SR_CD_DEF(fft32_float_avx, 32, b8_i2, 256, AVX, AV_TX_INPLACE)
-DECL_SR_CD_DEF(fft32_ns_float_avx, 32, b8_i2, 320, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
-DECL_SR_CD_DEF(fft32_float_fma3, 32, b8_i2, 288, FMA3, AV_TX_INPLACE)
-DECL_SR_CD_DEF(fft32_ns_float_fma3, 32, b8_i2, 352, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE)
-
-DECL_CD_DEF(fft_sr_float_avx, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2,
-            256, AV_CPU_FLAG_AVX,
-            FF_TX_OUT_OF_PLACE)
-
-DECL_CD_DEF(fft_sr_ns_float_avx, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2,
-            320, AV_CPU_FLAG_AVX,
-            FF_TX_OUT_OF_PLACE | AV_TX_INPLACE | FF_TX_PRESHUFFLE)
-
-#if HAVE_AVX2_EXTERNAL
-DECL_CD_DEF(fft_sr_float_avx2, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2,
-            288, AV_CPU_FLAG_AVX2 | AV_CPU_FLAG_AVXSLOW,
-            FF_TX_OUT_OF_PLACE)
-
-DECL_CD_DEF(fft_sr_ns_float_avx2, FFT, 64, 131072, 2, 0, fft_sr_codelet_init_b8_i2,
-            352, AV_CPU_FLAG_AVX2 | AV_CPU_FLAG_AVXSLOW,
-            FF_TX_OUT_OF_PLACE | AV_TX_INPLACE | FF_TX_PRESHUFFLE)
-#endif
-#endif
-
 const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
-    &ff_tx_fft2_float_sse3_def,
-    &ff_tx_fft2_ns_float_sse3_def,
-    &ff_tx_fft4_fwd_float_sse2_def,
-    &ff_tx_fft4_inv_float_sse2_def,
-    &ff_tx_fft4_ns_float_sse2_def,
-    &ff_tx_fft8_float_sse3_def,
-    &ff_tx_fft8_ns_float_sse3_def,
-    &ff_tx_fft8_float_avx_def,
-    &ff_tx_fft8_ns_float_avx_def,
-    &ff_tx_fft16_float_avx_def,
-    &ff_tx_fft16_ns_float_avx_def,
-    &ff_tx_fft16_float_fma3_def,
-    &ff_tx_fft16_ns_float_fma3_def,
+    TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, sse3, SSE3, AV_TX_INPLACE, 0),
+    TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
+    TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0),
+    TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
+    TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0),
+    TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, b8_i0, sse3, SSE3, AV_TX_INPLACE, 0),
+    TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
+    TX_DEF(fft8, FFT, 8, 8, 2, 0, 256, b8_i0, avx, AVX, AV_TX_INPLACE, 0),
+    TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
+    TX_DEF(fft16, FFT, 16, 16, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, 0),
+    TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
+    TX_DEF(fft16, FFT, 16, 16, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, 0),
+    TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
 #if ARCH_X86_64
-    &ff_tx_fft32_float_avx_def,
-    &ff_tx_fft32_ns_float_avx_def,
-    &ff_tx_fft32_float_fma3_def,
-    &ff_tx_fft32_ns_float_fma3_def,
-
-    &ff_tx_fft_sr_float_avx_def,
-    &ff_tx_fft_sr_ns_float_avx_def,
+    TX_DEF(fft32, FFT, 32, 32, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, 0),
+    TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
+    TX_DEF(fft32, FFT, 32, 32, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, 0),
+    TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
+    /* The AVX split-radix codelets are listed whenever ARCH_X86_64 is set;
+     * only the AVX2 variants below depend on HAVE_AVX2_EXTERNAL. */
+    TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 256, b8_i2, avx, AVX, 0, 0),
+    TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
 #if HAVE_AVX2_EXTERNAL
-    &ff_tx_fft_sr_float_avx2_def,
-    &ff_tx_fft_sr_ns_float_avx2_def,
+    TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 288, b8_i2, avx2, AVX2, 0, AV_CPU_FLAG_AVXSLOW),
+    TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
+           AV_CPU_FLAG_AVXSLOW),
 #endif
 #endif