mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-03 05:10:03 +02:00
27cffd16aa
When the SLOW_GATHER flag was added to the AVX2 version, this made FMA3-features not enabled on Zen CPUs. As FMA3 adds 6-7% across all platforms that support it, in the interest of saving space, this commit removes the AVX version and replaces it with an FMA3 version. The only CPUs affected are Sandy Bridge and Bulldozer, which have AVX support, but no FMA3 support. In the future, if there's a demand for it, a version of the function duplicated for AVX can be added.
1295 lines
40 KiB
NASM
1295 lines
40 KiB
NASM
;******************************************************************************
|
|
;* Copyright (c) Lynne
|
|
;*
|
|
;* This file is part of FFmpeg.
|
|
;*
|
|
;* FFmpeg is free software; you can redistribute it and/or
|
|
;* modify it under the terms of the GNU Lesser General Public
|
|
;* License as published by the Free Software Foundation; either
|
|
;* version 2.1 of the License, or (at your option) any later version.
|
|
;*
|
|
;* FFmpeg is distributed in the hope that it will be useful,
|
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
;* Lesser General Public License for more details.
|
|
;*
|
|
;* You should have received a copy of the GNU Lesser General Public
|
|
;* License along with FFmpeg; if not, write to the Free Software
|
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
;******************************************************************************
|
|
|
|
; Open `doc/transforms.md` to see the code upon which the transforms here were
|
|
; based, and compare.
|
|
|
|
; TODO:
|
|
; carry over registers from smaller transforms to save on ~8 loads/stores
|
|
; check if vinsertf could be faster than vperm2f128 for duplication
|
|
; even faster FFT8 (current one is very #instructions optimized)
|
|
; replace some xors with blends + addsubs?
|
|
; replace some shuffles with vblends?
|
|
; avx512 split-radix
|
|
|
|
%include "libavutil/x86/x86util.asm"
|
|
|
|
%define private_prefix ff_tx
|
|
|
|
%if ARCH_X86_64
|
|
%define ptr resq
|
|
%else
|
|
%define ptr resd
|
|
%endif
|
|
|
|
%assign i 16
|
|
%rep 14
|
|
cextern tab_ %+ i %+ _float ; ff_tab_i_float...
|
|
%assign i (i << 1)
|
|
%endrep
|
|
|
|
; Field offsets of the transform context as seen from assembly.
; `ptr` is defined above as resq on x86-64 and resd otherwise, so pointer
; fields take the native pointer width.
; NOTE(review): presumably this must match the leading fields of the C-side
; AVTXContext - confirm against the C definition when changing either.
struc AVTXContext
    .len:          resd 1 ; Length
    .inv:          resd 1 ; Inverse flag
    .map:           ptr 1 ; Lookup table(s)
    .exp:           ptr 1 ; Exponentiation factors
    .tmp:           ptr 1 ; Temporary data

    .sub:           ptr 1 ; Subcontexts
    .fn:            ptr 4 ; Subcontext functions
    .nb_sub:       resd 1 ; Subcontext count

    ; Everything else is inaccessible
endstruc
|
|
|
|
SECTION_RODATA 32
|
|
|
|
%define POS 0x00000000
|
|
%define NEG 0x80000000
|
|
|
|
%define M_SQRT1_2 0.707106781186547524401
|
|
%define COS16_1 0.92387950420379638671875
|
|
%define COS16_3 0.3826834261417388916015625
|
|
|
|
d8_mult_odd: dd M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, \
|
|
M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2
|
|
|
|
s8_mult_odd: dd 1.0, 1.0, -1.0, 1.0, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
|
|
s8_perm_even: dd 1, 3, 0, 2, 1, 3, 2, 0
|
|
s8_perm_odd1: dd 3, 3, 1, 1, 1, 1, 3, 3
|
|
s8_perm_odd2: dd 1, 2, 0, 3, 1, 0, 0, 1
|
|
|
|
s16_mult_even: dd 1.0, 1.0, M_SQRT1_2, M_SQRT1_2, 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2
|
|
s16_mult_odd1: dd COS16_1, COS16_1, COS16_3, COS16_3, COS16_1, -COS16_1, COS16_3, -COS16_3
|
|
s16_mult_odd2: dd COS16_3, -COS16_3, COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1
|
|
s16_perm: dd 0, 1, 2, 3, 1, 0, 3, 2
|
|
|
|
mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG
|
|
mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG
|
|
mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS
|
|
mask_mpmppmpm: dd NEG, POS, NEG, POS, POS, NEG, POS, NEG
|
|
mask_pmmppmmp: dd POS, NEG, NEG, POS, POS, NEG, NEG, POS
|
|
mask_pmpmpmpm: times 4 dd POS, NEG
|
|
|
|
SECTION .text
|
|
|
|
; Load complex values (64 bits) via a lookup table
|
|
; %1 - output register
|
|
; %2 - GPR of base input memory address
|
|
; %3 - GPR of LUT (int32_t indices) address
|
|
; %4 - LUT offset
|
|
; %5 - temporary GPR (only used if vgather is not used)
|
|
; %6 - temporary register (for avx only)
|
|
; %7 - temporary register (for avx only, passing it enables vgatherdpd if the AVX2 cpuflag is set)
|
|
%macro LOAD64_LUT 5-7
%if %0 > 6 && cpuflag(avx2)
    ; Gather path: only taken when the 7th argument is supplied and the
    ; current cpuflags include AVX2.
    pcmpeqd  %6, %6            ; all-ones gather mask; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
    movapd   xmm%7, [%3 + %4]  ; load the indices; float mov since vgatherdpd is a float instruction
    vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args
%else
    ; Scalar path: fetch each int32 index into %5 and load the 64-bit value
    ; it points to. Indices +0/+4 fill the low/high half of xmm%1; when
    ; mmsize == 32, indices +8/+12 fill xmm%6, which then becomes the upper
    ; 128 bits of %1.
    mov      %5d, [%3 + %4 + 0]
    movsd  xmm%1, [%2 + %5q*8]
%if mmsize == 32
    mov      %5d, [%3 + %4 + 8]
    movsd  xmm%6, [%2 + %5q*8]
%endif
    mov      %5d, [%3 + %4 + 4]
    movhps xmm%1, [%2 + %5q*8]
%if mmsize == 32
    mov      %5d, [%3 + %4 + 12]
    movhps xmm%6, [%2 + %5q*8]
    vinsertf128 %1, %1, xmm%6, 1
%endif
%endif
%endmacro
|
|
|
|
; Single 2-point in-place complex FFT (will do 2 transforms at once in AVX mode)
|
|
; %1 - coefficients (r0.reim, r1.reim)
|
|
; %2 - temporary
|
|
; In: %1 = coefficients, %2 = scratch (clobbered). Out: result in %1.
%macro FFT2 2
    shufps   %2, %1, %1, q3322
    shufps   %1, %1, %1, q1100

    addsubps %1, %1, %2        ; subtract in even lanes, add in odd lanes

    shufps   %1, %1, %1, q2031 ; put the results back in output order
%endmacro
|
|
|
|
; Single 4-point in-place complex FFT (will do 2 transforms at once in [AVX] mode)
|
|
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
|
|
; %2 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
|
|
; %3 - temporary
|
|
; In: %1 = even coefficients, %2 = odd coefficients, %3 = scratch (clobbered).
; Out: %1 = even results, %2 = odd results.
%macro FFT4 3
    subps  %3, %1, %2         ; r1234, [r5678]
    addps  %1, %1, %2         ; t1234, [t5678]

    shufps %2, %1, %3, q1010  ; t12, r12
    shufps %1, %1, %3, q2332  ; t34, r43

    subps  %3, %2, %1         ; a34, b32
    addps  %2, %2, %1         ; a12, b14

    shufps %1, %2, %3, q1010  ; a1234 even

    shufps %2, %2, %3, q2332  ; b1423
    shufps %2, %2, %2, q1320  ; b1234 odd
%endmacro
|
|
|
|
; Single/Dual 8-point in-place complex FFT (will do 2 transforms in [AVX] mode)
|
|
; %1 - even coefficients (a0.reim, a2.reim, [b0.reim, b2.reim])
|
|
; %2 - even coefficients (a4.reim, a6.reim, [b4.reim, b6.reim])
|
|
; %3 - odd coefficients (a1.reim, a3.reim, [b1.reim, b3.reim])
|
|
; %4 - odd coefficients (a5.reim, a7.reim, [b5.reim, b7.reim])
|
|
; %5 - temporary
|
|
; %6 - temporary
|
|
; In: %1, %2 = even coefficients, %3, %4 = odd coefficients,
;     %5, %6 = scratch (clobbered).
; Out: %1, %2 = even results (w, h), %3, %4 = odd results (o, u).
; Reads the constants mask_pmmppmmp and d8_mult_odd from memory.
%macro FFT8 6
    addps    %5, %1, %3               ; q1-8
    addps    %6, %2, %4               ; k1-8

    subps    %1, %1, %3               ; r1-8
    subps    %2, %2, %4               ; j1-8

    shufps   %4, %1, %1, q2323        ; r4343
    shufps   %3, %5, %6, q3032        ; q34, k14

    shufps   %1, %1, %1, q1010        ; r1212
    shufps   %5, %5, %6, q1210        ; q12, k32

    xorps    %4, %4, [mask_pmmppmmp]  ; r4343 * pmmp
    addps    %6, %5, %3               ; s12, g12

    mulps    %2, %2, [d8_mult_odd]    ; r8 * d8_mult_odd
    subps    %5, %5, %3               ; s34, g43

    addps    %3, %1, %4               ; z1234
    unpcklpd %1, %6, %5               ; s1234

    shufps   %4, %2, %2, q2301        ; j2143
    shufps   %6, %6, %5, q2332        ; g1234

    addsubps %2, %2, %4               ; l2143
    shufps   %5, %2, %2, q0123        ; l3412
    addsubps %5, %5, %2               ; t1234

    subps    %2, %1, %6               ; h1234 even
    subps    %4, %3, %5               ; u1234 odd

    addps    %1, %1, %6               ; w1234 even
    addps    %3, %3, %5               ; o1234 odd
%endmacro
|
|
|
|
; Single 8-point in-place complex FFT in 20 instructions
|
|
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
|
|
; %2 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
|
|
; %3 - temporary
|
|
; %4 - temporary
|
|
; In: %1 = even coefficients, %2 = odd coefficients,
;     %3, %4 = scratch (clobbered).
; Out: %1 = even results (w), %2 = odd results (u).
; Reads the s8_* permutation/multiplier tables and the mask_mmmmpppm and
; mask_ppmpmmpm sign masks from memory.
%macro FFT8_AVX 4
    subps      %3, %1, %2               ; r1234, r5678
    addps      %1, %1, %2               ; q1234, q5678

    vpermilps  %2, %3, [s8_perm_odd1]   ; r4422, r6688
    shufps     %4, %1, %1, q3322        ; q1122, q5566

    movsldup   %3, %3                   ; r1133, r5577
    shufps     %1, %1, %1, q1100        ; q3344, q7788

    addsubps   %3, %3, %2               ; z1234, z5678
    addsubps   %1, %1, %4               ; s3142, s7586

    mulps      %3, %3, [s8_mult_odd]    ; z * s8_mult_odd
    vpermilps  %1, %1, [s8_perm_even]   ; s1234, s5687 !

    shufps     %2, %3, %3, q2332        ; junk, z7887
    xorps      %4, %1, [mask_mmmmpppm]  ; e1234, e5687 !

    vpermilps  %3, %3, [s8_perm_odd2]   ; z2314, z6556
    vperm2f128 %1, %1, %4, 0x03         ; e5687, s1234

    addsubps   %2, %2, %3               ; junk, t5678
    subps      %1, %1, %4               ; w1234, w5678 even

    vperm2f128 %2, %2, %2, 0x11         ; t5678, t5678
    vperm2f128 %3, %3, %3, 0x00         ; z2314, z2314

    xorps      %2, %2, [mask_ppmpmmpm]  ; t * ppmpmmpm
    addps      %2, %3, %2               ; u1234, u5678 odd
%endmacro
|
|
|
|
; Single 16-point in-place complex FFT
|
|
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
|
|
; %2 - even coefficients (r8.reim, r10.reim, r12.reim, r14.reim)
|
|
; %3 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
|
|
; %4 - odd coefficients (r9.reim, r11.reim, r13.reim, r15.reim)
|
|
; %5, %6 - temporary
|
|
; %7, %8 - temporary (optional)
|
|
; Runs FFT4 on the odd inputs (%3, %4), FFT8_AVX on the even inputs (%1, %2),
; then combines them.
; The optional registers only change where constants come from:
;   8 args: mask_mpmppmpm is preloaded into %8 and s16_perm into %7
;   7 args: s16_perm is preloaded into %7, the mask is read from memory
;   6 args: both are read from memory, and %5 doubles as FFT8_AVX scratch
; Out: %1/%2 = even part 1/2, %3/%4 = odd part 1/2. %5 and %6 (and %7, %8 if
; given) are clobbered.
%macro FFT16 6-8
    FFT4       %3, %4, %5
%if %0 > 7
    FFT8_AVX   %1, %2, %6, %7
    movaps     %8, [mask_mpmppmpm]
    movaps     %7, [s16_perm]
%define mask   %8
%define perm   %7
%elif %0 > 6
    FFT8_AVX   %1, %2, %6, %7
    movaps     %7, [s16_perm]
%define mask   [mask_mpmppmpm]
%define perm   %7
%else
    FFT8_AVX   %1, %2, %6, %5
%define mask   [mask_mpmppmpm]
%define perm   [s16_perm]
%endif
    xorps      %5, %5, %5                ; 0

    shufps     %6, %4, %4, q2301         ; z12.imre, z13.imre...
    shufps     %5, %5, %3, q2301         ; 0, 0, z8.imre...

    mulps      %4, %4, [s16_mult_odd1]   ; z.reim * costab
    xorps      %5, %5, [mask_mppmmpmp]
%if cpuflag(fma3)
    fmaddps    %6, %6, [s16_mult_odd2], %4 ; s[8..15]
    addps      %5, %3, %5                ; s[0...7]
%else
    mulps      %6, %6, [s16_mult_odd2]   ; z.imre * costab

    addps      %5, %3, %5                ; s[0...7]
    addps      %6, %4, %6                ; s[8..15]
%endif
    mulps      %5, %5, [s16_mult_even]   ; s[0...7]*costab

    xorps      %4, %6, mask              ; s[8..15]*mpmppmpm
    xorps      %3, %5, mask              ; s[0...7]*mpmppmpm

    vperm2f128 %4, %4, %4, 0x01          ; s[12..15, 8..11]
    vperm2f128 %3, %3, %3, 0x01          ; s[4..7, 0..3]

    addps      %6, %6, %4                ; y56, u56, y34, u34
    addps      %5, %5, %3                ; w56, x56, w34, x34

    vpermilps  %6, %6, perm              ; y56, u56, y43, u43
    vpermilps  %5, %5, perm              ; w56, x56, w43, x43

    subps      %4, %2, %6                ; odd part 2
    addps      %3, %2, %6                ; odd part 1

    subps      %2, %1, %5                ; even part 2
    addps      %1, %1, %5                ; even part 1
%undef mask
%undef perm
%endmacro
|
|
|
|
; Combines m0...m8 (tx1[even, even, odd, odd], tx2,3[even], tx2,3[odd]) coeffs
|
|
; Uses all 16 registers.
|
|
; Output is slightly permuted such that tx2,3's coefficients are interleaved
|
|
; on a 2-point basis (look at `doc/transforms.md`)
|
|
; Parameters, as used by the body:
;   %1        - flag: when nonzero and mmsize == 32, %6..%9 are first
;               re-laned with vperm2f128 before being combined
;   %2, %3    - m0 even, m1 even          (updated in place)
;   %4, %5    - m0 odd,  m1 odd           (updated in place)
;   %6, %7    - m2,3 even ([01] and [23]) (replaced with the outputs)
;   %8, %9    - m2,3 odd  ([01] and [23]) (replaced with the outputs)
;   %10, %11  - cos / wim twiddle registers (clobbered)
;   %12..%17  - temporaries (clobbered)
%macro SPLIT_RADIX_COMBINE 17
%if %1 && mmsize == 32
    vperm2f128 %14, %6, %7, 0x20         ; m2[0], m2[1], m3[0], m3[1] even
    vperm2f128 %16, %9, %8, 0x20         ; m2[0], m2[1], m3[0], m3[1] odd
    vperm2f128 %15, %6, %7, 0x31         ; m2[2], m2[3], m3[2], m3[3] even
    vperm2f128 %17, %9, %8, 0x31         ; m2[2], m2[3], m3[2], m3[3] odd
%endif

    shufps     %12, %10, %10, q2200      ; cos00224466
    shufps     %13, %11, %11, q1133      ; wim77553311
    movshdup   %10, %10                  ; cos11335577
    shufps     %11, %11, %11, q0022      ; wim66442200

%if %1 && mmsize == 32
    shufps     %6, %14, %14, q2301       ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre even
    shufps     %8, %16, %16, q2301       ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre odd
    shufps     %7, %15, %15, q2301       ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre even
    shufps     %9, %17, %17, q2301       ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre odd

    mulps      %14, %14, %13             ; m2[0123]reim * wim7531 even
    mulps      %16, %16, %11             ; m2[0123]reim * wim7531 odd
    mulps      %15, %15, %13             ; m3[0123]reim * wim7531 even
    mulps      %17, %17, %11             ; m3[0123]reim * wim7531 odd
%else
    mulps      %14, %6, %13              ; m2,3[01]reim * wim7531 even
    mulps      %16, %8, %11              ; m2,3[01]reim * wim7531 odd
    mulps      %15, %7, %13              ; m2,3[23]reim * wim7531 even
    mulps      %17, %9, %11              ; m2,3[23]reim * wim7531 odd
    ; reorder the multiplies to save movs reg, reg in the %if above
    shufps     %6, %6, %6, q2301         ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps     %8, %8, %8, q2301         ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre odd
    shufps     %7, %7, %7, q2301         ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
    shufps     %9, %9, %9, q2301         ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre odd
%endif

%if cpuflag(fma3) ; 11 - 5 = 6 instructions saved through FMA!
    fmaddsubps %6, %6, %12, %14          ; w[0..8] even
    fmaddsubps %8, %8, %10, %16          ; w[0..8] odd
    fmsubaddps %7, %7, %12, %15          ; j[0..8] even
    fmsubaddps %9, %9, %10, %17          ; j[0..8] odd
    movaps     %13, [mask_pmpmpmpm]      ; "subaddps? pfft, who needs that!"
%else
    mulps      %6, %6, %12               ; m2,3[01]imre * cos0246
    mulps      %8, %8, %10               ; m2,3[01]imre * cos0246
    movaps     %13, [mask_pmpmpmpm]      ; "subaddps? pfft, who needs that!"
    mulps      %7, %7, %12               ; m2,3[23]reim * cos0246
    mulps      %9, %9, %10               ; m2,3[23]reim * cos0246
    addsubps   %6, %6, %14               ; w[0..8]
    addsubps   %8, %8, %16               ; w[0..8]
    xorps      %15, %15, %13             ; +-m2,3[23]imre * wim7531
    xorps      %17, %17, %13             ; +-m2,3[23]imre * wim7531
    addps      %7, %7, %15               ; j[0..8]
    addps      %9, %9, %17               ; j[0..8]
%endif

    addps      %14, %6, %7               ; t10235476 even
    addps      %16, %8, %9               ; t10235476 odd
    subps      %15, %6, %7               ; +-r[0..7] even
    subps      %17, %8, %9               ; +-r[0..7] odd

    shufps     %14, %14, %14, q2301      ; t[0..7] even
    shufps     %16, %16, %16, q2301      ; t[0..7] odd
    xorps      %15, %15, %13             ; r[0..7] even
    xorps      %17, %17, %13             ; r[0..7] odd

    subps      %6, %2, %14               ; m2,3[01] even
    subps      %8, %4, %16               ; m2,3[01] odd
    subps      %7, %3, %15               ; m2,3[23] even
    subps      %9, %5, %17               ; m2,3[23] odd

    addps      %2, %2, %14               ; m0 even
    addps      %4, %4, %16               ; m0 odd
    addps      %3, %3, %15               ; m1 even
    addps      %5, %5, %17               ; m1 odd
%endmacro
|
|
|
|
; Same as above, only does one parity at a time, takes 3 temporary registers,
|
|
; however, if the twiddles aren't needed after this, the registers they use
|
|
; can be used as any of the temporary registers.
|
|
; Parameters, as used by the body:
;   %1       - parity: nonzero selects the even twiddles (cos0246 / wim7531),
;              zero selects the odd ones (cos1357 / wim6420)
;   %2, %3   - m0, m1                   (updated in place)
;   %4, %5   - m2,3[01] and m2,3[23]    (replaced with the outputs)
;   %6, %7   - cos / wim twiddle registers (read only)
;   %8..%10  - temporaries (clobbered)
; The "even" wording in the per-line comments describes the %1 != 0 case.
%macro SPLIT_RADIX_COMBINE_HALF 10
%if %1
    shufps     %8, %6, %6, q2200         ; cos00224466
    shufps     %9, %7, %7, q1133         ; wim77553311
%else
    shufps     %8, %6, %6, q3311         ; cos11335577
    shufps     %9, %7, %7, q0022         ; wim66442200
%endif

    mulps      %10, %4, %9               ; m2,3[01]reim * wim7531 even
    mulps      %9, %9, %5                ; m2,3[23]reim * wim7531 even

    shufps     %4, %4, %4, q2301         ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps     %5, %5, %5, q2301         ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even

%if cpuflag(fma3)
    fmaddsubps %4, %4, %8, %10           ; w[0..8] even
    fmsubaddps %5, %5, %8, %9            ; j[0..8] even
    movaps     %10, [mask_pmpmpmpm]
%else
    mulps      %4, %4, %8                ; m2,3[01]imre * cos0246
    mulps      %5, %5, %8                ; m2,3[23]reim * cos0246
    addsubps   %4, %4, %10               ; w[0..8]
    movaps     %10, [mask_pmpmpmpm]
    xorps      %9, %9, %10               ; +-m2,3[23]imre * wim7531
    addps      %5, %5, %9                ; j[0..8]
%endif

    addps      %8, %4, %5                ; t10235476
    subps      %9, %4, %5                ; +-r[0..7]

    shufps     %8, %8, %8, q2301         ; t[0..7]
    xorps      %9, %9, %10               ; r[0..7]

    subps      %4, %2, %8                ; m2,3[01]
    subps      %5, %3, %9                ; m2,3[23]

    addps      %2, %2, %8                ; m0
    addps      %3, %3, %9                ; m1
%endmacro
|
|
|
|
; Same as above, tries REALLY hard to use 2 temporary registers.
|
|
; Parameters, as used by the body:
;   %1       - parity: nonzero selects the even twiddles (cos0246 / wim7531),
;              zero selects the odd ones (cos1357 / wim6420)
;   %2, %3   - m0, m1                   (updated in place)
;   %4, %5   - m2,3[01] and m2,3[23]    (replaced with the outputs)
;   %6, %7   - cos / wim twiddle registers (read only)
;   %8, %9   - temporaries (clobbered)
; With only two temporaries the wim shuffle is computed twice and
; mask_pmpmpmpm is applied straight from memory.
; The "even" wording in the per-line comments describes the %1 != 0 case.
%macro SPLIT_RADIX_COMBINE_LITE 9
%if %1
    shufps     %8, %6, %6, q2200         ; cos00224466
    shufps     %9, %7, %7, q1133         ; wim77553311
%else
    shufps     %8, %6, %6, q3311         ; cos11335577
    shufps     %9, %7, %7, q0022         ; wim66442200
%endif

    mulps      %9, %9, %4                ; m2,3[01]reim * wim7531 even
    shufps     %4, %4, %4, q2301         ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even

%if cpuflag(fma3)
    fmaddsubps %4, %4, %8, %9            ; w[0..8] even
%else
    mulps      %4, %4, %8                ; m2,3[01]imre * cos0246
    addsubps   %4, %4, %9                ; w[0..8]
%endif

    ; %9 was consumed above, so rebuild the wim shuffle for the second half
%if %1
    shufps     %9, %7, %7, q1133         ; wim77553311
%else
    shufps     %9, %7, %7, q0022         ; wim66442200
%endif

    mulps      %9, %9, %5                ; m2,3[23]reim * wim7531 even
    shufps     %5, %5, %5, q2301         ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
%if cpuflag(fma3)
    fmsubaddps %5, %5, %8, %9            ; j[0..8] even
%else
    mulps      %5, %5, %8                ; m2,3[23]reim * cos0246
    xorps      %9, %9, [mask_pmpmpmpm]   ; +-m2,3[23]imre * wim7531
    addps      %5, %5, %9                ; j[0..8]
%endif

    addps      %8, %4, %5                ; t10235476
    subps      %9, %4, %5                ; +-r[0..7]

    shufps     %8, %8, %8, q2301         ; t[0..7]
    xorps      %9, %9, [mask_pmpmpmpm]   ; r[0..7]

    subps      %4, %2, %8                ; m2,3[01]
    subps      %5, %3, %9                ; m2,3[23]

    addps      %2, %2, %8                ; m0
    addps      %3, %3, %9                ; m1
%endmacro
|
|
|
|
; 64-point combine step. Takes no arguments: it relies on the caller's
; register aliases (tx1_*, tx2_*, tw_e, tw_o, tmp1, tmp2), on m0..m3, and on
; outq pointing at the output block.
; First pass: combines the *_e0 / *_o0 halves with the twiddles already in
; tw_e / tw_o and stores them to the even-numbered mmsize slots of outq.
; Second pass: loads the next twiddles from tab_64_float, reloads m0..m3 from
; the odd-numbered slots 1/3/5/7 and combines them with the *_e1 / *_o1
; halves, storing to the odd-numbered slots.
%macro SPLIT_RADIX_COMBINE_64 0
    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2

    movaps [outq +  0*mmsize], m0
    movaps [outq +  4*mmsize], m1
    movaps [outq +  8*mmsize], tx1_e0
    movaps [outq + 12*mmsize], tx2_e0

    ; m0 has been stored above, so it can serve as the third temporary here
    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, m0

    movaps [outq +  2*mmsize], m2
    movaps [outq +  6*mmsize], m3
    movaps [outq + 10*mmsize], tx1_o0
    movaps [outq + 14*mmsize], tx2_o0

    movaps tw_e,           [tab_64_float + mmsize]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23

    movaps m0, [outq +  1*mmsize]
    movaps m1, [outq +  3*mmsize]
    movaps m2, [outq +  5*mmsize]
    movaps m3, [outq +  7*mmsize]

    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
                           tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers

    movaps [outq +  1*mmsize], m0
    movaps [outq +  3*mmsize], m1
    movaps [outq +  5*mmsize], m2
    movaps [outq +  7*mmsize], m3

    movaps [outq +  9*mmsize], tx1_e1
    movaps [outq + 11*mmsize], tx1_o1
    movaps [outq + 13*mmsize], tx2_e1
    movaps [outq + 15*mmsize], tx2_o1
%endmacro
|
|
|
|
; Perform a single even/odd split radix combination with loads and stores
|
|
; The _4 indicates this is a quarter of the iterations required to complete a full
|
|
; combine loop
|
|
; %1 must contain len*2, %2 must contain len*4, %3 must contain len*6
|
|
; Parameters, as used by the body:
;   %1, %2, %3 - byte offsets from outq of the second, third and fourth
;                input groups
;   %4         - output slot index (in units of mmsize)
;   %5         - twiddle index (in units of mmsize; added to rtabq,
;                subtracted from itabq)
;   %6         - extra byte offset applied to every outq access
;   %7, %8     - extra byte offsets applied to the rtabq / itabq loads
; Uses m0..m7 for data, m8/m9 for twiddles and m10..m15 as temporaries.
; Results are written back to the addresses they were loaded from.
%macro SPLIT_RADIX_LOAD_COMBINE_4 8
    movaps m8,         [rtabq + (%5)*mmsize + %7]
    vperm2f128 m9, m9, [itabq - (%5)*mmsize + %8], 0x23

    movaps m0, [outq +      (0 + %4)*mmsize + %6]
    movaps m2, [outq +      (2 + %4)*mmsize + %6]
    movaps m1, [outq + %1 + (0 + %4)*mmsize + %6]
    movaps m3, [outq + %1 + (2 + %4)*mmsize + %6]

    movaps m4, [outq + %2 + (0 + %4)*mmsize + %6]
    movaps m6, [outq + %2 + (2 + %4)*mmsize + %6]
    movaps m5, [outq + %3 + (0 + %4)*mmsize + %6]
    movaps m7, [outq + %3 + (2 + %4)*mmsize + %6]

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15

    movaps [outq +      (0 + %4)*mmsize + %6], m0
    movaps [outq +      (2 + %4)*mmsize + %6], m2
    movaps [outq + %1 + (0 + %4)*mmsize + %6], m1
    movaps [outq + %1 + (2 + %4)*mmsize + %6], m3

    movaps [outq + %2 + (0 + %4)*mmsize + %6], m4
    movaps [outq + %2 + (2 + %4)*mmsize + %6], m6
    movaps [outq + %3 + (0 + %4)*mmsize + %6], m5
    movaps [outq + %3 + (2 + %4)*mmsize + %6], m7
%endmacro
|
|
|
|
; Expands four SPLIT_RADIX_LOAD_COMBINE_4 steps: output slots 0, 1, 4, 5
; paired with twiddle indices 0, 1, 2, 3.
;   %1 - passed as the first group offset; 2*%1 is passed as the second
;   %2 - passed as the third group offset
;   %3 - optional extra outq byte offset  (default 0)
;   %4 - optional extra rtabq byte offset (default 0)
;   %5 - optional extra itabq byte offset (default 0)
; Note: offset_c / offset_r / offset_i stay defined after the macro expands.
%macro SPLIT_RADIX_LOAD_COMBINE_FULL 2-5
%if %0 > 2
%define offset_c %3
%else
%define offset_c 0
%endif
%if %0 > 3
%define offset_r %4
%else
%define offset_r 0
%endif
%if %0 > 4
%define offset_i %5
%else
%define offset_i 0
%endif

    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 0, 0, offset_c, offset_r, offset_i
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 1, 1, offset_c, offset_r, offset_i
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 4, 2, offset_c, offset_r, offset_i
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 5, 3, offset_c, offset_r, offset_i
%endmacro
|
|
|
|
; Perform a single even/odd split radix combination with loads, deinterleaves and
|
|
; stores. The _2 indicates this is a half of the iterations required to complete
|
|
; a full combine+deinterleave loop
|
|
; %3 must contain len*2, %4 must contain len*4, %5 must contain len*6
|
|
; Parameters, as used by the body:
;   %1         - output slot index (in units of mmsize)
;   %2         - twiddle index (in units of mmsize)
;   %3, %4, %5 - byte offsets from outq of the second, third and fourth
;                input groups
;   %6         - extra byte offset applied to every outq access
; Runs two SPLIT_RADIX_COMBINE steps (slots %1+0 and %1+1). The first step's
; high halves are held in m10..m13 and written out only after the second
; step's inputs have been loaded from the same addresses (the "conflict"
; stores below). Uses m0..m15.
%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 6
    movaps m8,         [rtabq + (0 + %2)*mmsize]
    vperm2f128 m9, m9, [itabq - (0 + %2)*mmsize], 0x23

    movaps m0, [outq +      (0 + 0 + %1)*mmsize + %6]
    movaps m2, [outq +      (2 + 0 + %1)*mmsize + %6]
    movaps m1, [outq + %3 + (0 + 0 + %1)*mmsize + %6]
    movaps m3, [outq + %3 + (2 + 0 + %1)*mmsize + %6]

    movaps m4, [outq + %4 + (0 + 0 + %1)*mmsize + %6]
    movaps m6, [outq + %4 + (2 + 0 + %1)*mmsize + %6]
    movaps m5, [outq + %5 + (0 + 0 + %1)*mmsize + %6]
    movaps m7, [outq + %5 + (2 + 0 + %1)*mmsize + %6]

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15

    unpckhpd m10, m0, m2
    unpckhpd m11, m1, m3
    unpckhpd m12, m4, m6
    unpckhpd m13, m5, m7
    unpcklpd m0, m0, m2
    unpcklpd m1, m1, m3
    unpcklpd m4, m4, m6
    unpcklpd m5, m5, m7

    ; Low 128-bit lanes can be written immediately
    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 +  0], m0,  0
    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 + 16], m10, 0
    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 +  0], m1,  0
    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 16], m11, 0

    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 +  0], m4,  0
    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 16], m12, 0
    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 +  0], m5,  0
    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 16], m13, 0

    ; Pack the remaining high lanes into m10..m13 for the deferred stores
    vperm2f128 m10, m10, m0, 0x13
    vperm2f128 m11, m11, m1, 0x13
    vperm2f128 m12, m12, m4, 0x13
    vperm2f128 m13, m13, m5, 0x13

    movaps m8,         [rtabq + (1 + %2)*mmsize]
    vperm2f128 m9, m9, [itabq - (1 + %2)*mmsize], 0x23

    movaps m0, [outq +      (0 + 1 + %1)*mmsize + %6]
    movaps m2, [outq +      (2 + 1 + %1)*mmsize + %6]
    movaps m1, [outq + %3 + (0 + 1 + %1)*mmsize + %6]
    movaps m3, [outq + %3 + (2 + 1 + %1)*mmsize + %6]

    movaps [outq +      (0 + 1 + %1)*mmsize + %6], m10 ; m0 conflict
    movaps [outq + %3 + (0 + 1 + %1)*mmsize + %6], m11 ; m1 conflict

    movaps m4, [outq + %4 + (0 + 1 + %1)*mmsize + %6]
    movaps m6, [outq + %4 + (2 + 1 + %1)*mmsize + %6]
    movaps m5, [outq + %5 + (0 + 1 + %1)*mmsize + %6]
    movaps m7, [outq + %5 + (2 + 1 + %1)*mmsize + %6]

    movaps [outq + %4 + (0 + 1 + %1)*mmsize + %6], m12 ; m4 conflict
    movaps [outq + %5 + (0 + 1 + %1)*mmsize + %6], m13 ; m5 conflict

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    unpcklpd m8,  m0, m2
    unpcklpd m9,  m1, m3
    unpcklpd m10, m4, m6
    unpcklpd m11, m5, m7
    unpckhpd m0,  m0, m2
    unpckhpd m1,  m1, m3
    unpckhpd m4,  m4, m6
    unpckhpd m5,  m5, m7

    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 +  0], m8,  0
    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 + 16], m0,  0
    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 +  0], m8,  1
    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 + 16], m0,  1

    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 +  0], m9,  0
    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 16], m1,  0
    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 +  0], m9,  1
    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 16], m1,  1

    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 +  0], m10, 0
    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 16], m4,  0
    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 +  0], m10, 1
    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 16], m4,  1

    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 +  0], m11, 0
    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 + 16], m5,  0
    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 +  0], m11, 1
    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 16], m5,  1
%endmacro
|
|
|
|
; Expands two SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 steps: output slots 0 and 4,
; paired with twiddle indices 0 and 2.
;   %1 - passed as the first group offset; %1*2 is passed as the second
;   %2 - passed as the third group offset
;   %3 - optional extra outq byte offset (default 0)
; Note: `offset` stays defined after the macro expands.
%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL 2-3
%if %0 > 2
%define offset %3
%else
%define offset 0
%endif
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 0, 0, %1, %1*2, %2, offset
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 4, 2, %1, %1*2, %2, offset
%endmacro
|
|
|
|
; 2-point transform entry point.
; Arguments: ctx, out, in, stride - only in and out are used here.
; Loads one XMM register from [inq] with an aligned load, runs FFT2 on it
; and stores the result to [outq] with an aligned store.
INIT_XMM sse3
cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
    movaps m0, [inq]
    FFT2 m0, m1
    movaps [outq], m0
    RET
|
|
|
|
; Emits a 4-point transform entry point named fft4_<%1>_float.
;   %1 - name infix (instantiated below with fwd and inv)
;   %2 - nonzero to apply the extra input shuffle before the transform
;        (used for the inv instantiation)
; This two-argument FFT4 is a separate overload from the three-argument FFT4
; butterfly macro it invokes; they differ only in parameter count.
%macro FFT4 2
INIT_XMM sse2
cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]

%if %2
    shufps m2, m1, m0, q3210
    shufps m0, m0, m1, q3210
    movaps m1, m2
%endif

    FFT4 m0, m1, m2

    ; Interleave the two result registers into output order
    unpcklpd m2, m0, m1
    unpckhpd m0, m0, m1

    movaps [outq + 0*mmsize], m2
    movaps [outq + 1*mmsize], m0

    RET
%endmacro
|
|
|
|
FFT4 fwd, 0
|
|
FFT4 inv, 1
|
|
|
|
; Emits an SSE3 8-point transform entry point named fft8_<%1>.
;   %1 - name suffix
;   %2 - nonzero: load the input through the lookup table at
;        AVTXContext.map (ctxq is overwritten with the table pointer);
;        zero: load the input linearly with aligned loads
; The results are stored with movups, i.e. unaligned stores.
%macro FFT8_SSE_FN 2
INIT_XMM sse3
cglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp
%if %2
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq
%else
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%endif

    FFT8 m0, m1, m2, m3, m4, m5

    ; Interleave the even and odd result registers into output order
    unpcklpd m4, m0, m3
    unpcklpd m5, m1, m2
    unpckhpd m0, m0, m3
    unpckhpd m1, m1, m2

    movups [outq + 0*mmsize], m4
    movups [outq + 1*mmsize], m0
    movups [outq + 2*mmsize], m5
    movups [outq + 3*mmsize], m1

    RET
%endmacro
|
|
|
|
FFT8_SSE_FN float, 1
|
|
FFT8_SSE_FN ns_float, 0
|
|
|
|
; Emits an AVX 8-point transform entry point named fft8_<%1>.
;   %1 - name suffix
;   %2 - nonzero: load the input through the lookup table at
;        AVTXContext.map (ctxq is overwritten with the table pointer);
;        zero: load the input linearly with aligned loads
%macro FFT8_AVX_FN 2
INIT_YMM avx
cglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp
%if %2
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
%else
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
%endif

    FFT8_AVX m0, m1, m2, m3

    unpcklpd m2, m0, m1
    unpckhpd m0, m0, m1

    ; Around 2% faster than 2x vperm2f128 + 2x movapd
    vextractf128 [outq + 16*0], m2, 0
    vextractf128 [outq + 16*1], m0, 0
    vextractf128 [outq + 16*2], m2, 1
    vextractf128 [outq + 16*3], m0, 1

    RET
%endmacro
|
|
|
|
FFT8_AVX_FN float, 1
|
|
FFT8_AVX_FN ns_float, 0
|
|
|
|
; Emits a 16-point transform entry point named fft16_<%2>.
;   %1 - cpuflags handed to INIT_YMM (instantiated below with avx and fma3)
;   %2 - name suffix
;   %3 - nonzero: load the input linearly with aligned loads;
;        zero: load the input through the lookup table at AVTXContext.map
;        (ctxq is overwritten with the table pointer)
; Note: this flag has the opposite polarity to the %2 flag of FFT8_SSE_FN
; and FFT8_AVX_FN, where nonzero selects the lookup-table path.
%macro FFT16_FN 3
INIT_YMM %1
cglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp
%if %3
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7
%endif

    FFT16 m0, m1, m2, m3, m4, m5, m6, m7

    ; Interleave the even and odd result registers into output order
    unpcklpd m5, m1, m3
    unpcklpd m4, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd m0, m0, m2

    vextractf128 [outq + 16*0], m4, 0
    vextractf128 [outq + 16*1], m0, 0
    vextractf128 [outq + 16*2], m4, 1
    vextractf128 [outq + 16*3], m0, 1
    vextractf128 [outq + 16*4], m5, 0
    vextractf128 [outq + 16*5], m1, 0
    vextractf128 [outq + 16*6], m5, 1
    vextractf128 [outq + 16*7], m1, 1

    RET
%endmacro
|
|
|
|
FFT16_FN avx, float, 0
|
|
FFT16_FN avx, ns_float, 1
|
|
FFT16_FN fma3, float, 0
|
|
FFT16_FN fma3, ns_float, 1
|
|
|
|
; Emits a 32-point transform entry point named fft32_<%2>.
;   %1 - cpuflags handed to INIT_YMM (instantiated below with avx and fma3)
;   %2 - name suffix
;   %3 - nonzero: load the input linearly with aligned loads;
;        zero: load the input through the lookup table at AVTXContext.map
;        (ctxq is overwritten with the table pointer)
; Runs FFT8 on inputs 4..7 and FFT16 on inputs 0..3, then merges them with
; SPLIT_RADIX_COMBINE using the twiddles from tab_32_float. Uses all 16
; vector registers; the instantiations below are guarded by ARCH_X86_64.
%macro FFT32_FN 3
INIT_YMM %1
cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
%if %3
    movaps m4, [inq + 4*mmsize]
    movaps m5, [inq + 5*mmsize]
    movaps m6, [inq + 6*mmsize]
    movaps m7, [inq + 7*mmsize]
%else
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8,  m12
    LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m9,  m13
    LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m10, m14
    LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m11, m15
%endif

    FFT8 m4, m5, m6, m7, m8, m9

%if %3
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8,  m12
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m9,  m13
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m10, m14
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m11, m15
%endif

    ; Twiddles for the combine step: m8 from the start of the table, m9 from
    ; an offset near its end via a lane-swapping vperm2f128
    movaps m8,         [tab_32_float]
    vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23

    FFT16 m0, m1, m2, m3, m10, m11, m12, m13

    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    ; Interleave the even and odd result registers into output order
    unpcklpd  m9, m1, m3
    unpcklpd m10, m5, m7
    unpcklpd  m8, m0, m2
    unpcklpd m11, m4, m6
    unpckhpd  m1, m1, m3
    unpckhpd  m5, m5, m7
    unpckhpd  m0, m0, m2
    unpckhpd  m4, m4, m6

    vextractf128 [outq + 16* 0],  m8, 0
    vextractf128 [outq + 16* 1],  m0, 0
    vextractf128 [outq + 16* 2],  m8, 1
    vextractf128 [outq + 16* 3],  m0, 1
    vextractf128 [outq + 16* 4],  m9, 0
    vextractf128 [outq + 16* 5],  m1, 0
    vextractf128 [outq + 16* 6],  m9, 1
    vextractf128 [outq + 16* 7],  m1, 1

    vextractf128 [outq + 16* 8], m11, 0
    vextractf128 [outq + 16* 9],  m4, 0
    vextractf128 [outq + 16*10], m11, 1
    vextractf128 [outq + 16*11],  m4, 1
    vextractf128 [outq + 16*12], m10, 0
    vextractf128 [outq + 16*13],  m5, 0
    vextractf128 [outq + 16*14], m10, 1
    vextractf128 [outq + 16*15],  m5, 1

    RET
%endmacro
|
|
|
|
%if ARCH_X86_64
|
|
FFT32_FN avx, float, 0
|
|
FFT32_FN avx, ns_float, 1
|
|
FFT32_FN fma3, float, 0
|
|
FFT32_FN fma3, ns_float, 1
|
|
%endif
|
|
|
|
; Emits the local label .<%1>pt and the code for one split-radix level.
;   %1 - value used in the label name, in the tab_<%1>_float symbol and in
;        the offset arithmetic below
;   %2 - optional label to jump to when lenq is greater than %1; when it is
;        omitted, only the two sub-calls and the table setup are emitted
; Expects the register names lenq, outq, tmpq, rtabq, itabq and tgtq, and
; the labels .32pt and .deinterleave, to be provided by the enclosing
; function.
%macro FFT_SPLIT_RADIX_DEF 1-2
ALIGN 16
.%1 %+ pt:
    PUSH lenq                  ; lenq is temporarily replaced for the sub-calls
    mov lenq, (%1/4)

    add outq, (%1*4) - (%1/1)
    call .32pt

    add outq, (%1*2) - (%1/2) ; the synth loops also increment outq
    call .32pt

    POP lenq
    sub outq, (%1*4) + (%1*2) + (%1/2)

    lea rtabq, [tab_ %+ %1 %+ _float]
    lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7]

%if %0 > 1
    cmp tgtq, %1
    je .deinterleave

    mov tmpq, %1

.synth_ %+ %1:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*%1, 6*%1, 0, 0, 0
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 4*mmsize         ; sets the flags tested by the jg below
    jg .synth_ %+ %1

    cmp lenq, %1
    jg %2 ; can't do math here, nasm doesn't get it
    ret
%endif
%endmacro
|
|
|
|
;-------------------------------------------------------------------------------
; FFT_SPLIT_RADIX_FN cpu, suffix, linear_input
;   %1 - instruction set name handed to INIT_YMM
;   %2 - suffix of the emitted function name (fft_sr_<%2>)
;   %3 - nonzero: input vectors are read linearly from inq with aligned loads
;                 and inq is advanced by 8*mmsize per 32-point block
;        zero:    input vectors are fetched with LOAD64_LUT through the table
;                 in lutq, and lutq is advanced by (mmsize/2)*8 instead
;
; Named registers (declared by cglobal, in this order):
;   lutq  - on entry the context pointer: AVTXContext.len and AVTXContext.map
;           are read through it, after which it holds the map pointer.
;           .deinterleave finally reuses it as a plain offset (6*len).
;   outq  - output pointer; moved forwards/backwards by the levels below
;   inq   - input pointer
;   lenq  - length of the transform currently being assembled; temporarily
;           replaced by a quarter of the level size around the nested calls
;   tmpq  - scratch / loop counter
;   itabq - descending table pointer, starts at tab_N_float + N - 4*7
;   rtabq - ascending table pointer, starts at tab_N_float
;   tgtq  - copy of the initial lenq; only compared against in the code
;           visible here
;
; Layout: the body is a ladder of levels (.32pt, .64pt, .128pt, ...). After a
; level has finished it does one of three things:
;   * tgtq equals the level size -> jump to .deinterleave, which leaves the
;     function through RET (x86inc exit macro);
;   * lenq is larger than the level size -> fall through / jump to the next
;     level;
;   * otherwise -> plain `ret`, which returns to one of the `call .32pt`
;     sites of an enclosing level.
; From .128pt upwards a level saves lenq, sets it to a quarter of its own
; size and calls .32pt twice at two output offsets before combining.
;
; NOTE(review): the plain `ret` at the end of .32pt would also be reached by a
; top-level call with len == 32; presumably the C side never dispatches such a
; length to this function - confirm against the codelet registration.
;-------------------------------------------------------------------------------
%macro FFT_SPLIT_RADIX_FN 3
INIT_YMM %1
cglobal fft_sr_ %+ %2, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
    movsxd lenq, dword [lutq + AVTXContext.len] ; 32-bit length, sign-extended
    mov lutq, [lutq + AVTXContext.map]          ; context pointer no longer needed
    mov tgtq, lenq                              ; remember the top-level length

; Bottom-most/32-point transform ===============================================
ALIGN 16
.32pt:
    ; Input vectors 4..7 go through FFT8 first.
    ; (The two trailing register arguments of LOAD64_LUT are presumably
    ; temporaries; the macro is defined outside this section.)
%if %3
    movaps m4, [inq + 4*mmsize]
    movaps m5, [inq + 5*mmsize]
    movaps m6, [inq + 6*mmsize]
    movaps m7, [inq + 7*mmsize]
%else
    LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq,  m8, m12
    LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq,  m9, m13
    LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m10, m14
    LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m11, m15
%endif

    FFT8 m4, m5, m6, m7, m8, m9

    ; Input vectors 0..3 feed the FFT16 below.
%if %3
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq,  m8, m12
    LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq,  m9, m13
    LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m10, m14
    LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m11, m15
%endif

    ; m8 = first vector of tab_32_float.
    ; m9 = the vector at tab_32_float + 32 - 4*7 with its 128-bit lanes
    ;      swapped: selector 0x23 takes the low lane from the upper half of the
    ;      memory operand and the high lane from its lower half, so the m9
    ;      source operand does not contribute to the result.
    movaps m8,         [tab_32_float]
    vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23

    FFT16 m0, m1, m2, m3, m10, m11, m12, m13

    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    ; The odd-numbered result registers are always written out.
    movaps [outq + 1*mmsize], m1
    movaps [outq + 3*mmsize], m3
    movaps [outq + 5*mmsize], m5
    movaps [outq + 7*mmsize], m7

    ; Step the input side past the eight vectors consumed above.
%if %3
    add inq, 8*mmsize
%else
    add lutq, (mmsize/2)*8
%endif
    cmp lenq, 32
    jg .64pt

    ; Only a plain 32-point result stores the even-numbered registers; when
    ; continuing to .64pt they are not written here and stay in registers.
    movaps [outq + 0*mmsize], m0
    movaps [outq + 2*mmsize], m2
    movaps [outq + 4*mmsize], m4
    movaps [outq + 6*mmsize], m6

    ret

; 64-point transform ===========================================================
ALIGN 16
.64pt:
; Helper defines, these make it easier to track what's happening
%define tx1_e0 m4
%define tx1_e1 m5
%define tx1_o0 m6
%define tx1_o1 m7
%define tx2_e0 m8
%define tx2_e1 m9
%define tx2_o0 m10
%define tx2_o1 m11
%define tw_e m12
%define tw_o m13
%define tmp1 m14
%define tmp2 m15

    ; SWAP is the x86inc assemble-time register rename: no instruction is
    ; emitted, but from this point of the source text on m4/m1 and m6/m3 name
    ; each other's former registers.
    SWAP m4, m1
    SWAP m6, m3

    ; First of the two 16-point transforms of this level.
%if %3
    movaps tx1_e0, [inq + 0*mmsize]
    movaps tx1_e1, [inq + 1*mmsize]
    movaps tx1_o0, [inq + 2*mmsize]
    movaps tx1_o1, [inq + 3*mmsize]
%else
    LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tmp1
    LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tw_o, tmp2
    LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tmp1
    LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tw_o, tmp2
%endif

    FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1

    ; Second 16-point transform.
%if %3
    movaps tx2_e0, [inq + 4*mmsize]
    movaps tx2_e1, [inq + 5*mmsize]
    movaps tx2_o0, [inq + 6*mmsize]
    movaps tx2_o1, [inq + 7*mmsize]
%else
    LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tw_e, tmp1
    LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_o, tmp2
    LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tw_e, tmp1
    LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_o, tmp2
%endif

    FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o

    ; Same table loading pattern as in .32pt, now from tab_64_float.
    movaps tw_e,           [tab_64_float]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23

%if %3
    add inq, 8*mmsize
%else
    add lutq, (mmsize/2)*8
%endif
    ; A top-level 64-point transform is finished by .64pt_deint (reached via
    ; .deinterleave), which works on the registers set up above.
    cmp tgtq, 64
    je .deinterleave

    SPLIT_RADIX_COMBINE_64

    cmp lenq, 64
    jg .128pt
    ret

; 128-point transform ==========================================================
ALIGN 16
.128pt:
    ; lenq is saved because the nested calls must stop (ret) at 32 points.
    PUSH lenq
    mov lenq, 32

    add outq, 16*mmsize
    call .32pt

    add outq, 8*mmsize
    call .32pt

    POP lenq
    sub outq, 24*mmsize                 ; 16 + 8: back to the start of this level

    lea rtabq, [tab_128_float]
    lea itabq, [tab_128_float + 128 - 4*7]

    cmp tgtq, 128
    je .deinterleave

    SPLIT_RADIX_LOAD_COMBINE_FULL 2*128, 6*128

    cmp lenq, 128
    jg .256pt
    ret

; 256-point transform ==========================================================
ALIGN 16
.256pt:
    PUSH lenq
    mov lenq, 64

    add outq, 32*mmsize
    call .32pt

    add outq, 16*mmsize
    call .32pt

    POP lenq
    sub outq, 48*mmsize                 ; 32 + 16

    lea rtabq, [tab_256_float]
    lea itabq, [tab_256_float + 256 - 4*7]

    cmp tgtq, 256
    je .deinterleave

    ; Two unrolled combine steps; the second one passes explicit offsets
    ; instead of moving outq/rtabq/itabq.
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256, 8*mmsize, 4*mmsize, -4*mmsize

    cmp lenq, 256
    jg .512pt
    ret

; 512-point transform ==========================================================
ALIGN 16
.512pt:
    PUSH lenq
    mov lenq, 128

    add outq, 64*mmsize
    call .32pt

    add outq, 32*mmsize
    call .32pt

    POP lenq
    sub outq, 96*mmsize                 ; 64 + 32

    lea rtabq, [tab_512_float]
    lea itabq, [tab_512_float + 512 - 4*7]

    cmp tgtq, 512
    je .deinterleave

    mov tmpq, 4                         ; four combine steps

    ; Unlike the unrolled levels above, this loop moves the pointers itself,
    ; so outq is left 4 * 8*mmsize = 32*mmsize further on when it finishes.
.synth_512:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*512, 6*512
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 1
    jg .synth_512

    cmp lenq, 512
    jg .1024pt
    ret

; 1024-point transform =========================================================
ALIGN 16
.1024pt:
    PUSH lenq
    mov lenq, 256

    ; This level is only entered by falling out of .synth_512, which left outq
    ; advanced by 32*mmsize. The three adjustments below net to
    ; 96 + 64 - 192 = -32 (in units of mmsize) and cancel that advance.
    add outq, 96*mmsize
    call .32pt

    add outq, 64*mmsize
    call .32pt

    POP lenq
    sub outq, 192*mmsize

    lea rtabq, [tab_1024_float]
    lea itabq, [tab_1024_float + 1024 - 4*7]

    cmp tgtq, 1024
    je .deinterleave

    mov tmpq, 8                         ; eight combine steps

.synth_1024:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*1024, 6*1024
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 1
    jg .synth_1024

    cmp lenq, 1024
    jg .2048pt
    ret

; 2048 to 131072-point transforms ==============================================
; The second argument is the label of the next larger level; the last level
; has none.
FFT_SPLIT_RADIX_DEF 2048,  .4096pt
FFT_SPLIT_RADIX_DEF 4096,  .8192pt
FFT_SPLIT_RADIX_DEF 8192,  .16384pt
FFT_SPLIT_RADIX_DEF 16384, .32768pt
FFT_SPLIT_RADIX_DEF 32768, .65536pt
FFT_SPLIT_RADIX_DEF 65536, .131072pt
FFT_SPLIT_RADIX_DEF 131072

;===============================================================================
; Final synthesis + deinterleaving code
;===============================================================================
.deinterleave:
    cmp lenq, 64
    je .64pt_deint

    ; tmpq = 2*len and lutq = 4*len + 2*len = 6*len: the same pair of offsets
    ; the fixed-size levels pass as constants (2*N, 6*N). lutq is overwritten
    ; here, so the lookup table pointer is gone from this point on.
    imul tmpq, lenq, 2
    lea lutq, [4*lenq + tmpq]

    ; lenq doubles as the loop counter: it drops by 4*mmsize per step and the
    ; loop runs for as long as the result stays positive.
.synth_deinterleave:
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL tmpq, lutq
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub lenq, 4*mmsize
    jg .synth_deinterleave

    RET

; 64-point deinterleave which only has to load 4 registers =====================
.64pt_deint:
    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, tw_e

    ; Interleave 64-bit elements of each register pair.
    unpcklpd tmp1, m0, m2
    unpcklpd tmp2, m1, m3
    unpcklpd tw_o, tx1_e0, tx1_o0
    unpcklpd tw_e, tx2_e0, tx2_o0
    unpckhpd m0, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd tx1_e0, tx1_e0, tx1_o0
    unpckhpd tx2_e0, tx2_e0, tx2_o0

    ; Only the low lanes of tmp1/m0 and tmp2/m1 are stored for now.
    vextractf128 [outq +  0*mmsize +  0], tmp1,   0
    vextractf128 [outq +  0*mmsize + 16], m0,     0
    vextractf128 [outq +  4*mmsize +  0], tmp2,   0
    vextractf128 [outq +  4*mmsize + 16], m1,     0

    vextractf128 [outq +  8*mmsize +  0], tw_o,   0
    vextractf128 [outq +  8*mmsize + 16], tx1_e0, 0
    vextractf128 [outq +  9*mmsize +  0], tw_o,   1
    vextractf128 [outq +  9*mmsize + 16], tx1_e0, 1

    ; Selector 0x31 packs the two high lanes into one register. They are
    ; stored to outq + 1*mmsize / 5*mmsize further down, after the current
    ; contents of those slots have been loaded.
    vperm2f128 tmp1, tmp1, m0, 0x31
    vperm2f128 tmp2, tmp2, m1, 0x31

    vextractf128 [outq + 12*mmsize +  0], tw_e,   0
    vextractf128 [outq + 12*mmsize + 16], tx2_e0, 0
    vextractf128 [outq + 13*mmsize +  0], tw_e,   1
    vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1

    ; Table vectors for the second half: one mmsize further in from each end
    ; than the pair loaded in .64pt.
    movaps tw_e,           [tab_64_float + mmsize]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23

    ; Read back the four vectors that .32pt stored at the odd slots ...
    movaps m0, [outq +  1*mmsize]
    movaps m1, [outq +  3*mmsize]
    movaps m2, [outq +  5*mmsize]
    movaps m3, [outq +  7*mmsize]

    ; ... and only then overwrite slots 1 and 5 with the packed high lanes.
    movaps [outq +  1*mmsize], tmp1
    movaps [outq +  5*mmsize], tmp2

    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
                           tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers

    unpcklpd tmp1, m0, m1
    unpcklpd tmp2, m2, m3
    unpcklpd tw_e, tx1_e1, tx1_o1
    unpcklpd tw_o, tx2_e1, tx2_o1
    unpckhpd m0, m0, m1
    unpckhpd m2, m2, m3
    unpckhpd tx1_e1, tx1_e1, tx1_o1
    unpckhpd tx2_e1, tx2_e1, tx2_o1

    ; Remaining output slots: 2-3, 6-7, 10-11 and 14-15.
    vextractf128 [outq +  2*mmsize +  0], tmp1,   0
    vextractf128 [outq +  2*mmsize + 16], m0,     0
    vextractf128 [outq +  3*mmsize +  0], tmp1,   1
    vextractf128 [outq +  3*mmsize + 16], m0,     1

    vextractf128 [outq +  6*mmsize +  0], tmp2,   0
    vextractf128 [outq +  6*mmsize + 16], m2,     0
    vextractf128 [outq +  7*mmsize +  0], tmp2,   1
    vextractf128 [outq +  7*mmsize + 16], m2,     1

    vextractf128 [outq + 10*mmsize +  0], tw_e,   0
    vextractf128 [outq + 10*mmsize + 16], tx1_e1, 0
    vextractf128 [outq + 11*mmsize +  0], tw_e,   1
    vextractf128 [outq + 11*mmsize + 16], tx1_e1, 1

    vextractf128 [outq + 14*mmsize +  0], tw_o,   0
    vextractf128 [outq + 14*mmsize + 16], tx2_e1, 0
    vextractf128 [outq + 15*mmsize +  0], tw_o,   1
    vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1

    RET
%endmacro
|
|
|
|
; Instantiations of the split-radix FFT. Everything here is x86-64 only.
; Each instruction set gets two functions:
;   suffix float,    third argument 0 - input is fetched through LOAD64_LUT
;   suffix ns_float, third argument 1 - input is read linearly with movaps
; The AVX2 pair is additionally built only when HAVE_AVX2_EXTERNAL is set.
%if ARCH_X86_64
FFT_SPLIT_RADIX_FN fma3, float,    0
FFT_SPLIT_RADIX_FN fma3, ns_float, 1
%if HAVE_AVX2_EXTERNAL
FFT_SPLIT_RADIX_FN avx2, float,    0
FFT_SPLIT_RADIX_FN avx2, ns_float, 1
%endif
%endif
|