mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
x86: fft: convert sse inline asm to yasm
This commit is contained in:
parent
8123e0901f
commit
8299260470
@ -39,7 +39,6 @@ YASM-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o
|
||||
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o
|
||||
YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o
|
||||
YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o
|
||||
YASM-OBJS-FFT-$(HAVE_SSE) += x86/fft_sse.o
|
||||
YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \
|
||||
$(YASM-OBJS-FFT-yes)
|
||||
YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
|
||||
|
@ -45,6 +45,10 @@ struc FFTContext
|
||||
.mdctbits: resd 1
|
||||
.tcos: pointer 1
|
||||
.tsin: pointer 1
|
||||
.fftperm: pointer 1
|
||||
.fftcalc: pointer 1
|
||||
.imdctcalc:pointer 1
|
||||
.imdcthalf:pointer 1
|
||||
endstruc
|
||||
|
||||
SECTION_RODATA
|
||||
@ -65,6 +69,7 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
|
||||
; Constant tables. Order and sizes are kept exactly as they were so the
; placement of every label inside the section is unchanged.
perm2:               dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2:    dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
; The masks below hold 1<<31 (only the top bit of a dword set) or 0 per lane.
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
; Four lanes of 1<<31; imdct_calc loads this and applies it with xorps.
ps_m1m1m1m1:         times 4 dd 1<<31
ps_m1p1:             dd 1<<31, 0
|
||||
|
||||
%assign i 16
|
||||
@ -532,6 +537,16 @@ DEFINE_ARGS z, w, n, o1, o3
|
||||
rep ret
|
||||
%endmacro
|
||||
|
||||
; FFT_DISPATCH <table suffix>, <register holding nbits>
;   %1  suffix appended to "dispatch_tab" to select the table
;   %2  register whose value indexes the table; entry (value - 2) is used
; Fetches that entry and calls it. r2 is always overwritten, and r3 as well
; when PIC is defined.
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea     r2, [dispatch_tab%1]            ; r2 = address of the table
    mov     r2, [r2 + (%2q-2)*gprsize]      ; r2 = table[%2 - 2]
%ifdef PIC
    lea     r3, [$$]                        ; PIC: add the section start address to the entry
    add     r2, r3
%endif
    call    r2
%endmacro ; FFT_DISPATCH
|
||||
|
||||
INIT_YMM avx
|
||||
|
||||
%if HAVE_AVX
|
||||
@ -548,6 +563,14 @@ INIT_YMM avx
|
||||
|
||||
DECL_PASS pass_avx, PASS_BIG 1
|
||||
DECL_PASS pass_interleave_avx, PASS_BIG 0
|
||||
|
||||
; fft_calc for the AVX build. It takes the two arguments of the C routine it
; replaces, ff_fft_calc_avx(FFTContext *s, FFTComplex *z):
;   r0 = s, r1 = z
; It re-packs them as (r0 = z, r1 = s->nbits) and hands over to the
; interleaving dispatch table for the current instruction-set suffix.
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]    ; fetch nbits before r0 is overwritten
    mov     r0, r1                          ; r0 = z
    mov     r1, r3                          ; r1 = nbits
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET
|
||||
|
||||
%endif
|
||||
|
||||
INIT_XMM sse
|
||||
@ -565,6 +588,112 @@ INIT_XMM sse
|
||||
DECL_PASS pass_sse, PASS_BIG 1
|
||||
DECL_PASS pass_interleave_sse, PASS_BIG 0
|
||||
|
||||
; fft_calc for the SSE build; replaces the C routine
; ff_fft_calc_sse(FFTContext *s, FFTComplex *z).
;   in:  r0 = s, r1 = z
; Step 1: call the interleaving dispatch table with (r0 = z, r1 = nbits).
; Step 2: only when nbits <= 4, run one more pass over the buffer that
;         replaces every 32-byte pair of vectors by its unpcklps/unpckhps
;         interleave (same loop as the former inline asm, guarded there by
;         "n <= 16").
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    PUSH    r1                              ; keep z across the dispatch call
    PUSH    r3                              ; keep nbits across the dispatch call
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP     rcx                             ; rcx = nbits (needed in cl for the shift)
    POP     r4                              ; r4  = z
    cmp     rcx, 4
    jg      .end                            ; larger transforms skip the extra pass
    mov     r2, -1
    add     rcx, 3
    shl     r2, cl                          ; r2 = -(8 << nbits): negative byte size
    sub     r4, r2                          ; r4 = z + byte size (end of buffer)
.loop:                                      ; r2 runs from -size up to 0
    movaps  xmm0, [r4 + r2]
    movaps  xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps  [r4 + r2], xmm0
    movaps  [r4 + r2 + 16], xmm1
    add     r2, 32
    jl      .loop
.end:
    REP_RET
|
||||
|
||||
cextern_naked memcpy
|
||||
|
||||
; fft_permute; replaces the C routine
; ff_fft_permute_sse(FFTContext *s, FFTComplex *z).
;   in:  r0 = s, r1 = z
; Pass 1 scatters the 8-byte elements of z into s->tmpbuf at the positions
;        given by the 16-bit entries of s->revtab, two elements per step.
; Pass 2 copies tmpbuf back over z.
;
; Pass 2 used to be a call to the external memcpy. That needed separate
; argument set-up for three ABIs and, on Win64, was issued without the
; 32-byte shadow space the callee is entitled to. It is now a plain inline
; loop, so no external call and no ABI-specific code remain.
; The byte count is 8 << nbits. The copy moves 16 bytes per step, the same
; granularity pass 1 already uses, so it assumes nbits >= 1 (the dispatch
; tables in this file index with nbits - 2).
cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
    mov     r5,  [r0 + FFTContext.tmpbuf]
    mov     ecx, [r0 + FFTContext.nbits]    ; shift count has to be in cl
    mov     r2, 1
    shl     r2, cl                          ; r2 = n = 1 << nbits
    xor     r0, r0                          ; r0 = element index
%if ARCH_X86_32
    mov     r1, r1m                         ; reload z from its stack slot (presumably because
                                            ; writing ecx above overwrote r1 -- confirm)
%endif
.loop:
    movaps  xmm0, [r1 + 8*r0]               ; two consecutive elements of z
    movzx   r6, word [r4 + 2*r0]            ; destination index of the first
    movzx   r3, word [r4 + 2*r0 + 2]        ; destination index of the second
    movlps  [r5 + 8*r6], xmm0
    movhps  [r5 + 8*r3], xmm0
    add     r0, 2
    cmp     r0, r2
    jl      .loop
    shl     r2, 3                           ; r2 = size in bytes
    add     r1, r2                          ; r1 = end of z
    add     r5, r2                          ; r5 = end of tmpbuf
    neg     r2                              ; r2 runs from -size up to 0
.loopcopy:
    movups  xmm0, [r5 + r2]                 ; no alignment assumed for tmpbuf
    movaps  [r1 + r2], xmm0                 ; z is already accessed with movaps above
    add     r2, 16
    jl      .loopcopy
    REP_RET
|
||||
|
||||
; imdct_calc; replaces the C routine
; ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input).
;   in:  r0 = s, r1 = output, r2 = input
; Step 1: call s->imdcthalf(s, output + mdctsize bytes, input).
; Step 2: walk two 16-byte cursors towards each other, one backwards from
;         the top of the block at r1 and one forwards through the block
;         ending at r0. Both vectors are element-reversed; the one read via
;         r1 is also sign-flipped, and the two are stored swapped.
cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]
    mov     r4,  [r0 + FFTContext.imdcthalf]
    add     r1,  r3                         ; r1 = output + mdctsize (byte offset)
    PUSH    r3                              ; keep mdctsize across the call
    PUSH    r1                              ; keep the adjusted output pointer
%if ARCH_X86_32
    push    r2                              ; cdecl arguments, right to left
    push    r1
    push    r0
%else
    ; 8 bytes of padding as before, plus the 32-byte shadow space that the
    ; Win64 calling convention requires the caller to reserve. Adding 32
    ; does not change the 16-byte alignment of rsp at the call.
    sub     rsp, 8+32*WIN64
%endif
    call    r4
%if ARCH_X86_32
    add     esp, 12
%else
    add     rsp, 8+32*WIN64
%endif
    POP     r1
    POP     r3
    lea     r0, [r1 + 2*r3]                 ; r0 = r1 + 2*mdctsize
    mov     r2, r3
    sub     r3, 16                          ; r3 = mdctsize - 16, counts down
    neg     r2                              ; r2 = -mdctsize, counts up to 0
    movaps  xmm2, [ps_m1m1m1m1]             ; sign-bit mask for all four lanes
.loop:
    movaps  xmm0, [r1 + r3]
    movaps  xmm1, [r0 + r2]
    shufps  xmm0, xmm0, 0x1b                ; reverse the four elements
    shufps  xmm1, xmm1, 0x1b
    xorps   xmm0, xmm2                      ; flip the sign of every element
    movaps  [r0 + r3], xmm1
    movaps  [r1 + r2], xmm0
    sub     r3, 16
    add     r2, 16
    jl      .loop
    REP_RET
|
||||
|
||||
INIT_MMX 3dnow
|
||||
%define mulps pfmul
|
||||
%define addps pfadd
|
||||
@ -582,16 +711,6 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
|
||||
%define SECTION_REL
|
||||
%endif
|
||||
|
||||
; FFT_DISPATCH <table suffix>, <register holding nbits>
;   %1  suffix appended to "dispatch_tab" to select the table
;   %2  register whose value indexes the table; entry (value - 2) is used
; Fetches that entry and calls it. r2 is always overwritten, and r3 as well
; when PIC is defined.
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea     r2, [dispatch_tab%1]            ; r2 = address of the table
    mov     r2, [r2 + (%2q-2)*gprsize]      ; r2 = table[%2 - 2]
%ifdef PIC
    lea     r3, [$$]                        ; PIC: add the section start address to the entry
    add     r2, r3
%endif
    call    r2
%endmacro ; FFT_DISPATCH
|
||||
|
||||
%macro DECL_FFT 1-2 ; nbits, suffix
|
||||
%ifidn %0, 1
|
||||
%xdefine fullsuffix SUFFIX
|
||||
|
@ -1,110 +0,0 @@
|
||||
/*
|
||||
* FFT/MDCT transform with SSE optimizations
|
||||
* Copyright (c) 2008 Loren Merritt
|
||||
*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
* Libav is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Libav is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with Libav; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/x86_cpu.h"
|
||||
#include "libavcodec/dsputil.h"
|
||||
#include "fft.h"
|
||||
#include "config.h"
|
||||
|
||||
DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
|
||||
{ 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
|
||||
|
||||
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
|
||||
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
|
||||
void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
|
||||
|
||||
#if HAVE_AVX
/* Run the AVX interleaving transform on z, sized by s->nbits. */
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
{
    const int nbits = s->nbits;

    ff_fft_dispatch_interleave_avx(z, nbits);
}
#endif
|
||||
|
||||
/* Run the SSE interleaving transform on z. For transforms of at most 16
 * points an extra pass follows that replaces each 32-byte pair of vectors
 * by its unpcklps/unpckhps interleave. */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    const int count = 1 << s->nbits;

    ff_fft_dispatch_interleave_sse(z, s->nbits);

    if (count > 16)
        return;

    {
        /* Negative byte offset from the end of the buffer, counted up to 0. */
        x86_reg offset = -8*count;
        __asm__ volatile(
            "1: \n"
            "movaps (%0,%1), %%xmm0 \n"
            "movaps %%xmm0, %%xmm1 \n"
            "unpcklps 16(%0,%1), %%xmm0 \n"
            "unpckhps 16(%0,%1), %%xmm1 \n"
            "movaps %%xmm0, (%0,%1) \n"
            "movaps %%xmm1, 16(%0,%1) \n"
            "add $32, %0 \n"
            "jl 1b \n"
            :"+r"(offset)
            :"r"(z+count)
            :"memory"
        );
    }
}
|
||||
|
||||
/* Reorder z according to s->revtab: elements are scattered two at a time
 * into s->tmp_buf, then the whole buffer is copied back over z. */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    const int count = 1 << s->nbits;
    int idx = 0;

    while (idx < count) {
        /* One 16-byte load covers z[idx] and z[idx+1]; the low and high
         * halves are stored to their separate destinations. */
        __asm__ volatile(
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            :"=m"(s->tmp_buf[s->revtab[idx]]),
             "=m"(s->tmp_buf[s->revtab[idx+1]])
            :"m"(z[idx])
        );
        idx += 2;
    }
    memcpy(z, s->tmp_buf, count*sizeof(FFTComplex));
}
|
||||
|
||||
/* Full IMDCT built on the half transform: s->imdct_half() fills the middle
 * of the output (starting at output + size/4); the asm loop then walks two
 * 16-byte cursors towards each other, reversing both vectors, flipping the
 * sign of one of them and storing the pair swapped. */
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    x86_reg fwd, back;
    long size    = s->mdct_size;
    long quarter = size >> 2;

    s->imdct_half(s, output + quarter, input);

    fwd  = -size;       /* byte offset, counts up to 0 */
    back = size-16;     /* byte offset, counts down    */
    __asm__ volatile(
        "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"
        "1: \n"
        "movaps (%2,%1), %%xmm0 \n"
        "movaps (%3,%0), %%xmm1 \n"
        "shufps $0x1b, %%xmm0, %%xmm0 \n"
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "xorps %%xmm7, %%xmm0 \n"
        "movaps %%xmm1, (%3,%1) \n"
        "movaps %%xmm0, (%2,%0) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        :"+r"(fwd), "+r"(back)
        :"r"(output+quarter), "r"(output+quarter*3)
        XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7")
    );
}
|
Loading…
Reference in New Issue
Block a user