mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-21 10:55:51 +02:00
bbe95f7353
From x86inc: > On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either > a branch or a branch target. So switch to a 2-byte form of ret in that case. > We can automatically detect "follows a branch", but not a branch target. > (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) x86inc can automatically determine whether to use REP_RET rather than REP in most of these cases, so impact is minimal. Additionally, a few REP_RETs were used unnecessary, despite the return being nowhere near a branch. The only CPUs affected were AMD K10s, made between 2007 and 2011, 16 years ago and 12 years ago, respectively. In the future, everyone involved with x86inc should consider dropping REP_RETs altogether.
839 lines
20 KiB
NASM
839 lines
20 KiB
NASM
;******************************************************************************
|
|
;* FFT transform with SSE/AVX optimizations
|
|
;* Copyright (c) 2008 Loren Merritt
|
|
;* Copyright (c) 2011 Vitor Sessak
|
|
;*
|
|
;* This algorithm (though not any of the implementation details) is
|
|
;* based on libdjbfft by D. J. Bernstein.
|
|
;*
|
|
;* This file is part of FFmpeg.
|
|
;*
|
|
;* FFmpeg is free software; you can redistribute it and/or
|
|
;* modify it under the terms of the GNU Lesser General Public
|
|
;* License as published by the Free Software Foundation; either
|
|
;* version 2.1 of the License, or (at your option) any later version.
|
|
;*
|
|
;* FFmpeg is distributed in the hope that it will be useful,
|
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
;* Lesser General Public License for more details.
|
|
;*
|
|
;* You should have received a copy of the GNU Lesser General Public
|
|
;* License along with FFmpeg; if not, write to the Free Software
|
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
;******************************************************************************
|
|
|
|
; These functions are not individually interchangeable with the C versions.
|
|
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
|
|
; in blocks as conventient to the vector size.
|
|
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
|
|
|
|
%include "libavutil/x86/x86util.asm"
|
|
|
|
%if ARCH_X86_64
|
|
%define pointer resq
|
|
%else
|
|
%define pointer resd
|
|
%endif
|
|
|
|
struc FFTContext
|
|
.nbits: resd 1
|
|
.reverse: resd 1
|
|
.revtab: pointer 1
|
|
.tmpbuf: pointer 1
|
|
.mdctsize: resd 1
|
|
.mdctbits: resd 1
|
|
.tcos: pointer 1
|
|
.tsin: pointer 1
|
|
.fftperm: pointer 1
|
|
.fftcalc: pointer 1
|
|
.imdctcalc:pointer 1
|
|
.imdcthalf:pointer 1
|
|
endstruc
|
|
|
|
SECTION_RODATA 32
|
|
|
|
%define M_SQRT1_2 0.70710678118654752440
|
|
%define M_COS_PI_1_8 0.923879532511287
|
|
%define M_COS_PI_3_8 0.38268343236509
|
|
|
|
ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
|
|
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
|
|
|
|
ps_root2: times 8 dd M_SQRT1_2
|
|
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
|
|
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
|
|
|
|
perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
|
|
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
|
|
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
|
|
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
|
|
ps_m1p1: dd 1<<31, 0
|
|
|
|
cextern ps_neg
|
|
|
|
%assign i 16
|
|
%rep 14
|
|
cextern cos_ %+ i
|
|
%assign i i<<1
|
|
%endrep
|
|
|
|
%if ARCH_X86_64
|
|
%define pointer dq
|
|
%else
|
|
%define pointer dd
|
|
%endif
|
|
|
|
%macro IF0 1+
|
|
%endmacro
|
|
%macro IF1 1+
|
|
%1
|
|
%endmacro
|
|
|
|
SECTION .text
|
|
|
|
; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
|
|
; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
|
|
; %3, %4, %5 tmp
|
|
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
|
|
; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
|
|
%macro T8_AVX 5
|
|
vsubps %5, %1, %2 ; v = %1 - %2
|
|
vaddps %3, %1, %2 ; w = %1 + %2
|
|
vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
|
|
vpermilps %2, %2, [perm1]
|
|
vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
|
|
vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
|
|
vsubps %4, %5, %1 ; s = r - q
|
|
vaddps %1, %5, %1 ; u = r + q
|
|
vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
|
|
vshufps %5, %4, %1, 0xbb
|
|
vshufps %3, %4, %1, 0xee
|
|
vperm2f128 %3, %3, %5, 0x13
|
|
vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
|
|
vshufps %2, %1, %4, 0xdd
|
|
vshufps %1, %1, %4, 0x88
|
|
vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
|
|
vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
|
|
vsubps %5, %1, %3
|
|
vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
|
|
vsubps %2, %4, %1 ; %2 = v - w
|
|
vaddps %1, %4, %1 ; %1 = v + w
|
|
%endmacro
|
|
|
|
; In SSE mode do one fft4 transforms
|
|
; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
|
|
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
|
|
;
|
|
; In AVX mode do two fft4 transforms
|
|
; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
|
|
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
|
|
%macro T4_SSE 3
|
|
subps %3, %1, %2 ; {t3,t4,-t8,t7}
|
|
addps %1, %1, %2 ; {t1,t2,t6,t5}
|
|
xorps %3, %3, [ps_p1p1m1p1]
|
|
shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
|
|
shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
|
|
subps %3, %1, %2 ; {r2,i2,r3,i3}
|
|
addps %1, %1, %2 ; {r0,i0,r1,i1}
|
|
shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
|
|
shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
|
|
%endmacro
|
|
|
|
; In SSE mode do one FFT8
|
|
; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
|
|
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7}
|
|
;
|
|
; In AVX mode do two FFT8
|
|
; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
|
|
; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
|
|
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
|
|
; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
|
|
%macro T8_SSE 6
|
|
addps %6, %3, %4 ; {t1,t2,t3,t4}
|
|
subps %3, %3, %4 ; {r5,i5,r7,i7}
|
|
shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
|
|
mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
|
|
mulps %4, %4, [ps_root2]
|
|
addps %3, %3, %4 ; {t8,t7,ta,t9}
|
|
shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
|
|
shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
|
|
subps %3, %6, %4 ; {t6,t5,tc,tb}
|
|
addps %6, %6, %4 ; {t1,t2,t9,ta}
|
|
shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
|
|
shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
|
|
subps %3, %1, %6 ; {r4,r5,r6,r7}
|
|
addps %1, %1, %6 ; {r0,r1,r2,r3}
|
|
subps %4, %2, %5 ; {i4,i5,i6,i7}
|
|
addps %2, %2, %5 ; {i0,i1,i2,i3}
|
|
%endmacro
|
|
|
|
%macro INTERL 5
|
|
%if cpuflag(avx)
|
|
vunpckhps %3, %2, %1
|
|
vunpcklps %2, %2, %1
|
|
vextractf128 %4(%5), %2, 0
|
|
vextractf128 %4 %+ H(%5), %3, 0
|
|
vextractf128 %4(%5 + 1), %2, 1
|
|
vextractf128 %4 %+ H(%5 + 1), %3, 1
|
|
%elif cpuflag(sse)
|
|
mova %3, %2
|
|
unpcklps %2, %1
|
|
unpckhps %3, %1
|
|
mova %4(%5), %2
|
|
mova %4(%5+1), %3
|
|
%endif
|
|
%endmacro
|
|
|
|
; scheduled for cpu-bound sizes
|
|
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
|
|
IF%1 mova m4, Z(4)
|
|
IF%1 mova m5, Z(5)
|
|
mova m0, %2 ; wre
|
|
mova m1, %3 ; wim
|
|
mulps m2, m4, m0 ; r2*wre
|
|
IF%1 mova m6, Z2(6)
|
|
mulps m3, m5, m1 ; i2*wim
|
|
IF%1 mova m7, Z2(7)
|
|
mulps m4, m4, m1 ; r2*wim
|
|
mulps m5, m5, m0 ; i2*wre
|
|
addps m2, m2, m3 ; r2*wre + i2*wim
|
|
mulps m3, m1, m7 ; i3*wim
|
|
subps m5, m5, m4 ; i2*wre - r2*wim
|
|
mulps m1, m1, m6 ; r3*wim
|
|
mulps m4, m0, m6 ; r3*wre
|
|
mulps m0, m0, m7 ; i3*wre
|
|
subps m4, m4, m3 ; r3*wre - i3*wim
|
|
mova m3, Z(0)
|
|
addps m0, m0, m1 ; i3*wre + r3*wim
|
|
subps m1, m4, m2 ; t3
|
|
addps m4, m4, m2 ; t5
|
|
subps m3, m3, m4 ; r2
|
|
addps m4, m4, Z(0) ; r0
|
|
mova m6, Z(2)
|
|
mova Z(4), m3
|
|
mova Z(0), m4
|
|
subps m3, m5, m0 ; t4
|
|
subps m4, m6, m3 ; r3
|
|
addps m3, m3, m6 ; r1
|
|
mova Z2(6), m4
|
|
mova Z(2), m3
|
|
mova m2, Z(3)
|
|
addps m3, m5, m0 ; t6
|
|
subps m2, m2, m1 ; i3
|
|
mova m7, Z(1)
|
|
addps m1, m1, Z(3) ; i1
|
|
mova Z2(7), m2
|
|
mova Z(3), m1
|
|
subps m4, m7, m3 ; i2
|
|
addps m3, m3, m7 ; i0
|
|
mova Z(5), m4
|
|
mova Z(1), m3
|
|
%endmacro
|
|
|
|
; scheduled to avoid store->load aliasing
|
|
%macro PASS_BIG 1 ; (!interleave)
|
|
mova m4, Z(4) ; r2
|
|
mova m5, Z(5) ; i2
|
|
mova m0, [wq] ; wre
|
|
mova m1, [wq+o1q] ; wim
|
|
mulps m2, m4, m0 ; r2*wre
|
|
mova m6, Z2(6) ; r3
|
|
mulps m3, m5, m1 ; i2*wim
|
|
mova m7, Z2(7) ; i3
|
|
mulps m4, m4, m1 ; r2*wim
|
|
mulps m5, m5, m0 ; i2*wre
|
|
addps m2, m2, m3 ; r2*wre + i2*wim
|
|
mulps m3, m1, m7 ; i3*wim
|
|
mulps m1, m1, m6 ; r3*wim
|
|
subps m5, m5, m4 ; i2*wre - r2*wim
|
|
mulps m4, m0, m6 ; r3*wre
|
|
mulps m0, m0, m7 ; i3*wre
|
|
subps m4, m4, m3 ; r3*wre - i3*wim
|
|
mova m3, Z(0)
|
|
addps m0, m0, m1 ; i3*wre + r3*wim
|
|
subps m1, m4, m2 ; t3
|
|
addps m4, m4, m2 ; t5
|
|
subps m3, m3, m4 ; r2
|
|
addps m4, m4, Z(0) ; r0
|
|
mova m6, Z(2)
|
|
mova Z(4), m3
|
|
mova Z(0), m4
|
|
subps m3, m5, m0 ; t4
|
|
subps m4, m6, m3 ; r3
|
|
addps m3, m3, m6 ; r1
|
|
IF%1 mova Z2(6), m4
|
|
IF%1 mova Z(2), m3
|
|
mova m2, Z(3)
|
|
addps m5, m5, m0 ; t6
|
|
subps m2, m2, m1 ; i3
|
|
mova m7, Z(1)
|
|
addps m1, m1, Z(3) ; i1
|
|
IF%1 mova Z2(7), m2
|
|
IF%1 mova Z(3), m1
|
|
subps m6, m7, m5 ; i2
|
|
addps m5, m5, m7 ; i0
|
|
IF%1 mova Z(5), m6
|
|
IF%1 mova Z(1), m5
|
|
%if %1==0
|
|
INTERL m1, m3, m7, Z, 2
|
|
INTERL m2, m4, m0, Z2, 6
|
|
|
|
mova m1, Z(0)
|
|
mova m2, Z(4)
|
|
|
|
INTERL m5, m1, m3, Z, 0
|
|
INTERL m6, m2, m7, Z, 4
|
|
%endif
|
|
%endmacro
|
|
|
|
%define Z(x) [r0+mmsize*x]
|
|
%define Z2(x) [r0+mmsize*x]
|
|
%define ZH(x) [r0+mmsize*x+mmsize/2]
|
|
|
|
INIT_YMM avx
|
|
|
|
%if HAVE_AVX_EXTERNAL
|
|
align 16
|
|
fft8_avx:
|
|
mova m0, Z(0)
|
|
mova m1, Z(1)
|
|
T8_AVX m0, m1, m2, m3, m4
|
|
mova Z(0), m0
|
|
mova Z(1), m1
|
|
ret
|
|
|
|
|
|
align 16
|
|
fft16_avx:
|
|
mova m2, Z(2)
|
|
mova m3, Z(3)
|
|
T4_SSE m2, m3, m7
|
|
|
|
mova m0, Z(0)
|
|
mova m1, Z(1)
|
|
T8_AVX m0, m1, m4, m5, m7
|
|
|
|
mova m4, [ps_cos16_1]
|
|
mova m5, [ps_cos16_2]
|
|
vmulps m6, m2, m4
|
|
vmulps m7, m3, m5
|
|
vaddps m7, m7, m6
|
|
vmulps m2, m2, m5
|
|
vmulps m3, m3, m4
|
|
vsubps m3, m3, m2
|
|
vblendps m2, m7, m3, 0xf0
|
|
vperm2f128 m3, m7, m3, 0x21
|
|
vaddps m4, m2, m3
|
|
vsubps m2, m3, m2
|
|
vperm2f128 m2, m2, m2, 0x01
|
|
vsubps m3, m1, m2
|
|
vaddps m1, m1, m2
|
|
vsubps m5, m0, m4
|
|
vaddps m0, m0, m4
|
|
vextractf128 Z(0), m0, 0
|
|
vextractf128 ZH(0), m1, 0
|
|
vextractf128 Z(1), m0, 1
|
|
vextractf128 ZH(1), m1, 1
|
|
vextractf128 Z(2), m5, 0
|
|
vextractf128 ZH(2), m3, 0
|
|
vextractf128 Z(3), m5, 1
|
|
vextractf128 ZH(3), m3, 1
|
|
ret
|
|
|
|
align 16
|
|
fft32_avx:
|
|
call fft16_avx
|
|
|
|
mova m0, Z(4)
|
|
mova m1, Z(5)
|
|
|
|
T4_SSE m0, m1, m4
|
|
|
|
mova m2, Z(6)
|
|
mova m3, Z(7)
|
|
|
|
T8_SSE m0, m1, m2, m3, m4, m6
|
|
; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
|
|
; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
|
|
|
|
vperm2f128 m4, m0, m2, 0x20
|
|
vperm2f128 m5, m1, m3, 0x20
|
|
vperm2f128 m6, m0, m2, 0x31
|
|
vperm2f128 m7, m1, m3, 0x31
|
|
|
|
PASS_SMALL 0, [cos_32], [cos_32+32]
|
|
|
|
ret
|
|
|
|
fft32_interleave_avx:
|
|
call fft32_avx
|
|
mov r2d, 32
|
|
.deint_loop:
|
|
mova m2, Z(0)
|
|
mova m3, Z(1)
|
|
vunpcklps m0, m2, m3
|
|
vunpckhps m1, m2, m3
|
|
vextractf128 Z(0), m0, 0
|
|
vextractf128 ZH(0), m1, 0
|
|
vextractf128 Z(1), m0, 1
|
|
vextractf128 ZH(1), m1, 1
|
|
add r0, mmsize*2
|
|
sub r2d, mmsize/4
|
|
jg .deint_loop
|
|
ret
|
|
|
|
%endif
|
|
|
|
INIT_XMM sse
|
|
|
|
align 16
|
|
fft4_avx:
|
|
fft4_sse:
|
|
mova m0, Z(0)
|
|
mova m1, Z(1)
|
|
T4_SSE m0, m1, m2
|
|
mova Z(0), m0
|
|
mova Z(1), m1
|
|
ret
|
|
|
|
align 16
|
|
fft8_sse:
|
|
mova m0, Z(0)
|
|
mova m1, Z(1)
|
|
T4_SSE m0, m1, m2
|
|
mova m2, Z(2)
|
|
mova m3, Z(3)
|
|
T8_SSE m0, m1, m2, m3, m4, m5
|
|
mova Z(0), m0
|
|
mova Z(1), m1
|
|
mova Z(2), m2
|
|
mova Z(3), m3
|
|
ret
|
|
|
|
align 16
|
|
fft16_sse:
|
|
mova m0, Z(0)
|
|
mova m1, Z(1)
|
|
T4_SSE m0, m1, m2
|
|
mova m2, Z(2)
|
|
mova m3, Z(3)
|
|
T8_SSE m0, m1, m2, m3, m4, m5
|
|
mova m4, Z(4)
|
|
mova m5, Z(5)
|
|
mova Z(0), m0
|
|
mova Z(1), m1
|
|
mova Z(2), m2
|
|
mova Z(3), m3
|
|
T4_SSE m4, m5, m6
|
|
mova m6, Z2(6)
|
|
mova m7, Z2(7)
|
|
T4_SSE m6, m7, m0
|
|
PASS_SMALL 0, [cos_16], [cos_16+16]
|
|
ret
|
|
|
|
|
|
%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
|
|
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
|
|
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
|
|
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
|
|
|
|
%macro DECL_PASS 2+ ; name, payload
|
|
align 16
|
|
%1:
|
|
DEFINE_ARGS zc, w, n, o1, o3
|
|
lea o3q, [nq*3]
|
|
lea o1q, [nq*8]
|
|
shl o3q, 4
|
|
.loop:
|
|
%2
|
|
add zcq, mmsize*2
|
|
add wq, mmsize
|
|
sub nd, mmsize/8
|
|
jg .loop
|
|
rep ret
|
|
%endmacro
|
|
|
|
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
|
|
lea r2, [dispatch_tab%1]
|
|
mov r2, [r2 + (%2q-2)*gprsize]
|
|
%ifdef PIC
|
|
lea r3, [$$]
|
|
add r2, r3
|
|
%endif
|
|
call r2
|
|
%endmacro ; FFT_DISPATCH
|
|
|
|
INIT_YMM avx
|
|
|
|
%if HAVE_AVX_EXTERNAL
|
|
DECL_PASS pass_avx, PASS_BIG 1
|
|
DECL_PASS pass_interleave_avx, PASS_BIG 0
|
|
|
|
cglobal fft_calc, 2,5,8
|
|
mov r3d, [r0 + FFTContext.nbits]
|
|
mov r0, r1
|
|
mov r1, r3
|
|
FFT_DISPATCH _interleave %+ SUFFIX, r1
|
|
RET
|
|
|
|
%endif
|
|
|
|
INIT_XMM sse
|
|
|
|
DECL_PASS pass_sse, PASS_BIG 1
|
|
DECL_PASS pass_interleave_sse, PASS_BIG 0
|
|
|
|
INIT_XMM sse
|
|
cglobal fft_calc, 2,5,8
|
|
mov r3d, [r0 + FFTContext.nbits]
|
|
PUSH r1
|
|
PUSH r3
|
|
mov r0, r1
|
|
mov r1, r3
|
|
FFT_DISPATCH _interleave %+ SUFFIX, r1
|
|
POP rcx
|
|
POP r4
|
|
cmp rcx, 3+(mmsize/16)
|
|
jg .end
|
|
mov r2, -1
|
|
add rcx, 3
|
|
shl r2, cl
|
|
sub r4, r2
|
|
.loop:
|
|
movaps xmm0, [r4 + r2]
|
|
movaps xmm1, xmm0
|
|
unpcklps xmm0, [r4 + r2 + 16]
|
|
unpckhps xmm1, [r4 + r2 + 16]
|
|
movaps [r4 + r2], xmm0
|
|
movaps [r4 + r2 + 16], xmm1
|
|
add r2, mmsize*2
|
|
jl .loop
|
|
.end:
|
|
RET
|
|
|
|
cglobal fft_permute, 2,7,1
|
|
mov r4, [r0 + FFTContext.revtab]
|
|
mov r5, [r0 + FFTContext.tmpbuf]
|
|
mov ecx, [r0 + FFTContext.nbits]
|
|
mov r2, 1
|
|
shl r2, cl
|
|
xor r0, r0
|
|
%if ARCH_X86_32
|
|
mov r1, r1m
|
|
%endif
|
|
.loop:
|
|
movaps xmm0, [r1 + 8*r0]
|
|
movzx r6, word [r4 + 2*r0]
|
|
movzx r3, word [r4 + 2*r0 + 2]
|
|
movlps [r5 + 8*r6], xmm0
|
|
movhps [r5 + 8*r3], xmm0
|
|
add r0, 2
|
|
cmp r0, r2
|
|
jl .loop
|
|
shl r2, 3
|
|
add r1, r2
|
|
add r5, r2
|
|
neg r2
|
|
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
|
|
.loopcopy:
|
|
movaps xmm0, [r5 + r2]
|
|
movaps xmm1, [r5 + r2 + 16]
|
|
movaps [r1 + r2], xmm0
|
|
movaps [r1 + r2 + 16], xmm1
|
|
add r2, 32
|
|
jl .loopcopy
|
|
RET
|
|
|
|
INIT_XMM sse
|
|
cglobal imdct_calc, 3,5,3
|
|
mov r3d, [r0 + FFTContext.mdctsize]
|
|
mov r4, [r0 + FFTContext.imdcthalf]
|
|
add r1, r3
|
|
PUSH r3
|
|
PUSH r1
|
|
%if ARCH_X86_32
|
|
push r2
|
|
push r1
|
|
push r0
|
|
%else
|
|
sub rsp, 8+32*WIN64 ; allocate win64 shadow space
|
|
%endif
|
|
call r4
|
|
%if ARCH_X86_32
|
|
add esp, 12
|
|
%else
|
|
add rsp, 8+32*WIN64
|
|
%endif
|
|
POP r1
|
|
POP r3
|
|
lea r0, [r1 + 2*r3]
|
|
mov r2, r3
|
|
sub r3, mmsize
|
|
neg r2
|
|
mova m2, [ps_neg]
|
|
.loop:
|
|
mova m0, [r1 + r3]
|
|
mova m1, [r0 + r2]
|
|
shufps m0, m0, 0x1b
|
|
shufps m1, m1, 0x1b
|
|
xorps m0, m2
|
|
mova [r0 + r3], m1
|
|
mova [r1 + r2], m0
|
|
sub r3, mmsize
|
|
add r2, mmsize
|
|
jl .loop
|
|
RET
|
|
|
|
%ifdef PIC
|
|
%define SECTION_REL - $$
|
|
%else
|
|
%define SECTION_REL
|
|
%endif
|
|
|
|
%macro DECL_FFT 1-2 ; nbits, suffix
|
|
%ifidn %0, 1
|
|
%xdefine fullsuffix SUFFIX
|
|
%else
|
|
%xdefine fullsuffix %2 %+ SUFFIX
|
|
%endif
|
|
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
|
|
%if %1>=5
|
|
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
|
|
%endif
|
|
%if %1>=6
|
|
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
|
|
%endif
|
|
|
|
%assign n 1<<%1
|
|
%rep 18-%1
|
|
%assign n2 n/2
|
|
%assign n4 n/4
|
|
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
|
|
|
|
align 16
|
|
fft %+ n %+ fullsuffix:
|
|
call fft %+ n2 %+ SUFFIX
|
|
add r0, n*4 - (n&(-2<<%1))
|
|
call fft %+ n4 %+ SUFFIX
|
|
add r0, n*2 - (n2&(-2<<%1))
|
|
call fft %+ n4 %+ SUFFIX
|
|
sub r0, n*6 + (n2&(-2<<%1))
|
|
lea r1, [cos_ %+ n]
|
|
mov r2d, n4/2
|
|
jmp pass %+ fullsuffix
|
|
|
|
%assign n n*2
|
|
%endrep
|
|
%undef n
|
|
|
|
align 8
|
|
dispatch_tab %+ fullsuffix: pointer list_of_fft
|
|
%endmacro ; DECL_FFT
|
|
|
|
%if HAVE_AVX_EXTERNAL
|
|
INIT_YMM avx
|
|
DECL_FFT 6
|
|
DECL_FFT 6, _interleave
|
|
%endif
|
|
INIT_XMM sse
|
|
DECL_FFT 5
|
|
DECL_FFT 5, _interleave
|
|
|
|
INIT_XMM sse
|
|
%undef mulps
|
|
%undef addps
|
|
%undef subps
|
|
%undef unpcklps
|
|
%undef unpckhps
|
|
|
|
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
|
|
movaps xmm0, [%3+%2*4]
|
|
movaps xmm1, [%3+%1*4-0x10]
|
|
movaps xmm2, xmm0
|
|
shufps xmm0, xmm1, 0x88
|
|
shufps xmm1, xmm2, 0x77
|
|
movlps xmm4, [%4+%2*2]
|
|
movlps xmm5, [%5+%2*2+0x0]
|
|
movhps xmm4, [%4+%1*2-0x8]
|
|
movhps xmm5, [%5+%1*2-0x8]
|
|
movaps xmm2, xmm0
|
|
movaps xmm3, xmm1
|
|
mulps xmm0, xmm5
|
|
mulps xmm1, xmm4
|
|
mulps xmm2, xmm4
|
|
mulps xmm3, xmm5
|
|
subps xmm1, xmm0
|
|
addps xmm2, xmm3
|
|
movaps xmm0, xmm1
|
|
unpcklps xmm1, xmm2
|
|
unpckhps xmm0, xmm2
|
|
%endmacro
|
|
|
|
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
|
|
mulps m6, %3, [%5+%1]
|
|
mulps m7, %2, [%5+%1]
|
|
mulps %2, %2, [%6+%1]
|
|
mulps %3, %3, [%6+%1]
|
|
subps %2, %2, m6
|
|
addps %3, %3, m7
|
|
%endmacro
|
|
|
|
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
|
|
.post:
|
|
%if cpuflag(avx)
|
|
vmovaps ymm1, [%3+%1*2]
|
|
vmovaps ymm0, [%3+%1*2+0x20]
|
|
vmovaps ymm3, [%3+%2*2]
|
|
vmovaps ymm2, [%3+%2*2+0x20]
|
|
|
|
CMUL %1, ymm0, ymm1, %3, %4, %5
|
|
CMUL %2, ymm2, ymm3, %3, %4, %5
|
|
vshufps ymm1, ymm1, ymm1, 0x1b
|
|
vshufps ymm3, ymm3, ymm3, 0x1b
|
|
vperm2f128 ymm1, ymm1, ymm1, 0x01
|
|
vperm2f128 ymm3, ymm3, ymm3, 0x01
|
|
vunpcklps ymm6, ymm2, ymm1
|
|
vunpckhps ymm4, ymm2, ymm1
|
|
vunpcklps ymm7, ymm0, ymm3
|
|
vunpckhps ymm5, ymm0, ymm3
|
|
|
|
vextractf128 [%3+%1*2], ymm7, 0
|
|
vextractf128 [%3+%1*2+0x10], ymm5, 0
|
|
vextractf128 [%3+%1*2+0x20], ymm7, 1
|
|
vextractf128 [%3+%1*2+0x30], ymm5, 1
|
|
|
|
vextractf128 [%3+%2*2], ymm6, 0
|
|
vextractf128 [%3+%2*2+0x10], ymm4, 0
|
|
vextractf128 [%3+%2*2+0x20], ymm6, 1
|
|
vextractf128 [%3+%2*2+0x30], ymm4, 1
|
|
sub %2, 0x20
|
|
add %1, 0x20
|
|
jl .post
|
|
%else
|
|
movaps xmm1, [%3+%1*2]
|
|
movaps xmm0, [%3+%1*2+0x10]
|
|
CMUL %1, xmm0, xmm1, %3, %4, %5
|
|
movaps xmm5, [%3+%2*2]
|
|
movaps xmm4, [%3+%2*2+0x10]
|
|
CMUL %2, xmm4, xmm5, %3, %4, %5
|
|
shufps xmm1, xmm1, 0x1b
|
|
shufps xmm5, xmm5, 0x1b
|
|
movaps xmm6, xmm4
|
|
unpckhps xmm4, xmm1
|
|
unpcklps xmm6, xmm1
|
|
movaps xmm2, xmm0
|
|
unpcklps xmm0, xmm5
|
|
unpckhps xmm2, xmm5
|
|
movaps [%3+%2*2], xmm6
|
|
movaps [%3+%2*2+0x10], xmm4
|
|
movaps [%3+%1*2], xmm0
|
|
movaps [%3+%1*2+0x10], xmm2
|
|
sub %2, 0x10
|
|
add %1, 0x10
|
|
jl .post
|
|
%endif
|
|
%endmacro
|
|
|
|
%macro DECL_IMDCT 0
|
|
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
|
|
%if ARCH_X86_64
|
|
%define rrevtab r7
|
|
%define rtcos r8
|
|
%define rtsin r9
|
|
%else
|
|
%define rrevtab r6
|
|
%define rtsin r6
|
|
%define rtcos r5
|
|
%endif
|
|
mov r3d, [r0+FFTContext.mdctsize]
|
|
add r2, r3
|
|
shr r3, 1
|
|
mov rtcos, [r0+FFTContext.tcos]
|
|
mov rtsin, [r0+FFTContext.tsin]
|
|
add rtcos, r3
|
|
add rtsin, r3
|
|
%if ARCH_X86_64 == 0
|
|
push rtcos
|
|
push rtsin
|
|
%endif
|
|
shr r3, 1
|
|
mov rrevtab, [r0+FFTContext.revtab]
|
|
add rrevtab, r3
|
|
%if ARCH_X86_64 == 0
|
|
push rrevtab
|
|
%endif
|
|
|
|
sub r3, 4
|
|
%if ARCH_X86_64
|
|
xor r4, r4
|
|
sub r4, r3
|
|
%endif
|
|
.pre:
|
|
%if ARCH_X86_64 == 0
|
|
;unspill
|
|
xor r4, r4
|
|
sub r4, r3
|
|
mov rtcos, [esp+8]
|
|
mov rtsin, [esp+4]
|
|
%endif
|
|
|
|
PREROTATER r4, r3, r2, rtcos, rtsin
|
|
%if ARCH_X86_64
|
|
movzx r5, word [rrevtab+r4-4]
|
|
movzx r6, word [rrevtab+r4-2]
|
|
movzx r10, word [rrevtab+r3]
|
|
movzx r11, word [rrevtab+r3+2]
|
|
movlps [r1+r5 *8], xmm0
|
|
movhps [r1+r6 *8], xmm0
|
|
movlps [r1+r10*8], xmm1
|
|
movhps [r1+r11*8], xmm1
|
|
add r4, 4
|
|
%else
|
|
mov r6, [esp]
|
|
movzx r5, word [r6+r4-4]
|
|
movzx r4, word [r6+r4-2]
|
|
movlps [r1+r5*8], xmm0
|
|
movhps [r1+r4*8], xmm0
|
|
movzx r5, word [r6+r3]
|
|
movzx r4, word [r6+r3+2]
|
|
movlps [r1+r5*8], xmm1
|
|
movhps [r1+r4*8], xmm1
|
|
%endif
|
|
sub r3, 4
|
|
jns .pre
|
|
|
|
mov r5, r0
|
|
mov r6, r1
|
|
mov r0, r1
|
|
mov r1d, [r5+FFTContext.nbits]
|
|
|
|
FFT_DISPATCH SUFFIX, r1
|
|
|
|
mov r0d, [r5+FFTContext.mdctsize]
|
|
add r6, r0
|
|
shr r0, 1
|
|
%if ARCH_X86_64 == 0
|
|
%define rtcos r2
|
|
%define rtsin r3
|
|
mov rtcos, [esp+8]
|
|
mov rtsin, [esp+4]
|
|
%endif
|
|
neg r0
|
|
mov r1, -mmsize
|
|
sub r1, r0
|
|
POSROTATESHUF r0, r1, r6, rtcos, rtsin
|
|
%if ARCH_X86_64 == 0
|
|
add esp, 12
|
|
%endif
|
|
RET
|
|
%endmacro
|
|
|
|
DECL_IMDCT
|
|
|
|
INIT_YMM avx
|
|
|
|
%if HAVE_AVX_EXTERNAL
|
|
DECL_IMDCT
|
|
%endif
|