diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm index 58ec657116..def61d86c1 100644 --- a/libavutil/x86/tx_float.asm +++ b/libavutil/x86/tx_float.asm @@ -100,7 +100,7 @@ SECTION .text ; %6 - temporary register (for avx only) -; %7 - temporary register (for avx only, enables vgatherdpd (AVX2) if FMA3 is set) +; %7 - temporary register (for avx only, enables vgatherdpd if AVX2 is set) %macro LOAD64_LUT 5-7 -%if %0 > 6 && cpuflag(fma3) +%if %0 > 6 && cpuflag(avx2) pcmpeqd %6, %6 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25 movapd xmm%7, [%3 + %4] ; float mov since vgatherdpd is a float instruction vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args @@ -1208,5 +1208,5 @@ FFT_SPLIT_RADIX_DEF 131072 %if ARCH_X86_64 FFT_SPLIT_RADIX_FN avx -FFT_SPLIT_RADIX_FN fma3 +FFT_SPLIT_RADIX_FN avx2 %endif diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c index 993933317c..8b77a5f29f 100644 --- a/libavutil/x86/tx_float_init.c +++ b/libavutil/x86/tx_float_init.c @@ -32,7 +32,7 @@ void ff_fft32_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stri void ff_fft32_float_fma3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride); void ff_split_radix_fft_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride); -void ff_split_radix_fft_float_fma3(AVTXContext *s, void *out, void *in, ptrdiff_t stride); +void ff_split_radix_fft_float_avx2(AVTXContext *s, void *out, void *in, ptrdiff_t stride); av_cold void ff_tx_init_float_x86(AVTXContext *s, av_tx_fn *tx) { @@ -87,10 +87,15 @@ av_cold void ff_tx_init_float_x86(AVTXContext *s, av_tx_fn *tx) #if ARCH_X86_64 else if (s->m == 32) TXFN(ff_fft32_float_fma3, 1, 8, 2); - else if (s->m >= 64 && s->m <= 131072 && !(s->flags & AV_TX_INPLACE)) - TXFN(ff_split_radix_fft_float_fma3, 1, 8, 2); #endif } + +#if ARCH_X86_64 + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + if (s->m >= 64 && s->m <= 131072 && !(s->flags & AV_TX_INPLACE)) + TXFN(ff_split_radix_fft_float_avx2, 1, 8, 2); + } +#endif } if (gen_revtab)