mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
x86: fft: convert sse inline asm to yasm
This commit is contained in:
parent
8123e0901f
commit
8299260470
@ -39,7 +39,6 @@ YASM-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o
|
|||||||
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o
|
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o
|
||||||
YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o
|
YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o
|
||||||
YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o
|
YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o
|
||||||
YASM-OBJS-FFT-$(HAVE_SSE) += x86/fft_sse.o
|
|
||||||
YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \
|
YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \
|
||||||
$(YASM-OBJS-FFT-yes)
|
$(YASM-OBJS-FFT-yes)
|
||||||
YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
|
YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
|
||||||
|
@ -45,6 +45,10 @@ struc FFTContext
|
|||||||
.mdctbits: resd 1
|
.mdctbits: resd 1
|
||||||
.tcos: pointer 1
|
.tcos: pointer 1
|
||||||
.tsin: pointer 1
|
.tsin: pointer 1
|
||||||
|
.fftperm: pointer 1
|
||||||
|
.fftcalc: pointer 1
|
||||||
|
.imdctcalc:pointer 1
|
||||||
|
.imdcthalf:pointer 1
|
||||||
endstruc
|
endstruc
|
||||||
|
|
||||||
SECTION_RODATA
|
SECTION_RODATA
|
||||||
@ -65,6 +69,7 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
|
|||||||
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
|
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
|
||||||
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
|
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
|
||||||
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
|
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
|
||||||
|
ps_m1m1m1m1: times 4 dd 1<<31
|
||||||
ps_m1p1: dd 1<<31, 0
|
ps_m1p1: dd 1<<31, 0
|
||||||
|
|
||||||
%assign i 16
|
%assign i 16
|
||||||
@ -532,6 +537,16 @@ DEFINE_ARGS z, w, n, o1, o3
|
|||||||
rep ret
|
rep ret
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
|
; FFT_DISPATCH suffix, index
;   Calls one entry of the table named dispatch_tab<suffix>.
;   The entry used is number (index - 2); entries are gprsize bytes wide.
;   In PIC builds the address of the start of the current section ($$) is
;   added to the loaded entry before the call.
;   NOTE(review): that suggests the table stores section-relative offsets
;   under PIC -- confirm against the dispatch_tab definition.
;   The macro itself writes r2 (and r3 under PIC); the clobber count in the
;   comment below presumably includes what the called routine touches.
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
|
||||||
|
lea r2, [dispatch_tab%1] ; r2 = address of the dispatch table
|
||||||
|
mov r2, [r2 + (%2q-2)*gprsize] ; r2 = table entry number (index - 2)
|
||||||
|
%ifdef PIC
|
||||||
|
lea r3, [$$] ; r3 = address of the start of this section
|
||||||
|
add r2, r3 ; rebase the loaded entry onto the section start
|
||||||
|
%endif
|
||||||
|
call r2 ; indirect call to the selected routine
|
||||||
|
%endmacro ; FFT_DISPATCH
|
||||||
|
|
||||||
INIT_YMM avx
|
INIT_YMM avx
|
||||||
|
|
||||||
%if HAVE_AVX
|
%if HAVE_AVX
|
||||||
@ -548,6 +563,14 @@ INIT_YMM avx
|
|||||||
|
|
||||||
DECL_PASS pass_avx, PASS_BIG 1
|
DECL_PASS pass_avx, PASS_BIG 1
|
||||||
DECL_PASS pass_interleave_avx, PASS_BIG 0
|
DECL_PASS pass_interleave_avx, PASS_BIG 0
|
||||||
|
|
||||||
|
; fft_calc (AVX build): r0 = FFTContext pointer, r1 = data pointer z.
; Reads nbits from the context and hands (z, nbits) to the interleaving
; dispatcher for the current instruction-set suffix.
cglobal fft_calc, 2,5,8
|
||||||
|
mov r3d, [r0 + FFTContext.nbits] ; r3 = s->nbits
|
||||||
|
mov r0, r1 ; first argument of the dispatched routine = z
|
||||||
|
mov r1, r3 ; second argument = nbits
|
||||||
|
; FFT_DISPATCH uses r2 (and r3 under PIC) as scratch, then calls the table entry.
FFT_DISPATCH _interleave %+ SUFFIX, r1
|
||||||
|
REP_RET
|
||||||
|
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
INIT_XMM sse
|
INIT_XMM sse
|
||||||
@ -565,6 +588,112 @@ INIT_XMM sse
|
|||||||
DECL_PASS pass_sse, PASS_BIG 1
|
DECL_PASS pass_sse, PASS_BIG 1
|
||||||
DECL_PASS pass_interleave_sse, PASS_BIG 0
|
DECL_PASS pass_interleave_sse, PASS_BIG 0
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------
; fft_calc (SSE build)
; In:  r0 = FFTContext pointer (only FFTContext.nbits is read)
;      r1 = z, the complex data buffer (8 bytes per element)
; Runs the interleaving dispatcher on (z, nbits).  For nbits <= 4
; (at most 16 elements) it then makes one extra pass over the buffer that
; interleaves each pair of consecutive 16-byte vectors with
; unpcklps/unpckhps.
; movaps is used throughout, so z must be 16-byte aligned.
;-----------------------------------------------------------------------
cglobal fft_calc, 2,5,8
    mov      r3d, [r0 + FFTContext.nbits]
    PUSH     r1                         ; z and nbits must survive the dispatched call
    PUSH     r3
    mov      r0, r1                     ; dispatched routine gets (z, nbits)
    mov      r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP      rcx                        ; rcx = nbits; cl is needed as the shift count
    POP      r4                         ; r4  = z
    cmp      rcx, 4
    jg       .end                       ; more than 16 elements: no extra pass
    mov      r2, -1
    add      rcx, 3
    shl      r2, cl                     ; r2 = -(8 << nbits) = minus the buffer size in bytes
    sub      r4, r2                     ; r4 = one past the end of the buffer
.loop:                                  ; r2 walks from -size up to 0
    movaps   xmm0, [r4 + r2]
    movaps   xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2], xmm0
    movaps   [r4 + r2 + 16], xmm1
    add      r2, 32                     ; two vectors per iteration
    jl       .loop
.end:
    REP_RET
|
||||||
|
|
||||||
|
cextern_naked memcpy
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------
; fft_permute
; In:  r0 = FFTContext pointer (revtab, tmpbuf and nbits are read)
;      r1 = z, the complex data buffer (8 bytes per element)
; Scatters z[i] to tmpbuf[revtab[i]] two elements at a time, then copies
; tmpbuf back over z with memcpy.  revtab entries are read as 16-bit words.
; The context fields are fetched before ecx is written because ecx
; may alias r0 or r1 depending on the target; on x86_32 z is reloaded from
; its stack slot afterwards.
; NOTE(review): the non-tail-call path does "call memcpy" without
; reserving Win64 shadow space -- verify against the cglobal prologue.
;-----------------------------------------------------------------------
cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
    mov     r5,  [r0 + FFTContext.tmpbuf]
    mov     ecx, [r0 + FFTContext.nbits]
%if ARCH_X86_32
    mov     r1, r1m                     ; reload z from the argument slot
%endif
    mov     r2, 1
    shl     r2, cl                      ; r2 = element count (1 << nbits)
    xor     r0, r0                      ; r0 = element index
.scatter:
    movzx   r6, word [r4 + 2*r0]        ; destination index for element r0
    movzx   r3, word [r4 + 2*r0 + 2]    ; destination index for element r0+1
    movaps  xmm0, [r1 + 8*r0]           ; load two source elements
    movlps  [r5 + 8*r6], xmm0
    movhps  [r5 + 8*r3], xmm0
    add     r0, 2
    cmp     r0, r2
    jl      .scatter
    shl     r2, 3                       ; byte count for memcpy
%if ARCH_X86_64
    mov     r0, r1                      ; memcpy(z, tmpbuf, bytes)
    mov     r1, r5
%else
    push    r2
    push    r5
    push    r1
%endif
%if ARCH_X86_64 && WIN64 == 0
    jmp     memcpy                      ; tail call
%else
    call    memcpy
%if ARCH_X86_32
    add     esp, 12                     ; drop the three pushed arguments
%endif
    REP_RET
%endif
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------
; imdct_calc
; In:  r0 = FFTContext pointer (mdctsize and imdcthalf are read)
;      r1 = output buffer
;      r2 = input buffer
; Calls the context's imdcthalf routine with (context, output + mdctsize
; bytes, input), then fills the rest of the output: each 16-byte vector is
; element-reversed (shufps 0x1b) and mirrored to the opposite side, with
; the sign bits flipped for the vectors written below the midpoint.
; movaps is used throughout, so output must be 16-byte aligned.
;-----------------------------------------------------------------------
cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]
    mov     r4,  [r0 + FFTContext.imdcthalf]
    add     r1,  r3                     ; r1 = output + mdctsize bytes
    PUSH    r3                          ; both values are needed after the call
    PUSH    r1
%if ARCH_X86_32
    push    r2                          ; stack arguments: (context, r1, input)
    push    r1
    push    r0
%else
    ; 8 bytes of padding as before, plus the 32 bytes of shadow space the
    ; Win64 ABI requires the caller to reserve; unchanged on other targets.
    sub     rsp, 8+32*WIN64
%endif
    call    r4
%if ARCH_X86_32
    add     esp, 12
%else
    add     rsp, 8+32*WIN64
%endif
    POP     r1
    POP     r3
    lea     r0, [r1 + 2*r3]             ; r0 = output + 3*mdctsize bytes
    mov     r2, r3
    sub     r3, 16                      ; r3 = descending offset, starts at size-16
    neg     r2                          ; r2 = ascending offset, starts at -size
    movaps  xmm2, [ps_m1m1m1m1]         ; sign-bit mask for all four lanes
.loop:
    movaps  xmm0, [r1 + r3]
    movaps  xmm1, [r0 + r2]
    shufps  xmm0, xmm0, 0x1b            ; reverse the four elements
    shufps  xmm1, xmm1, 0x1b
    xorps   xmm0, xmm2                  ; negate
    movaps  [r0 + r3], xmm1
    movaps  [r1 + r2], xmm0
    sub     r3, 16
    add     r2, 16
    jl      .loop                       ; until the ascending offset reaches 0
    REP_RET
|
||||||
|
|
||||||
INIT_MMX 3dnow
|
INIT_MMX 3dnow
|
||||||
%define mulps pfmul
|
%define mulps pfmul
|
||||||
%define addps pfadd
|
%define addps pfadd
|
||||||
@ -582,16 +711,6 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
|
|||||||
%define SECTION_REL
|
%define SECTION_REL
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
; FFT_DISPATCH suffix, index
;   Looks up entry number (index - 2) in the table dispatch_tab<suffix>,
;   whose entries are gprsize bytes wide, and calls it.  In PIC builds the
;   address of the start of the current section ($$) is added to the entry
;   first.  r2 is used as scratch, and r3 as well under PIC.
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
|
|
||||||
lea r2, [dispatch_tab%1] ; r2 = address of the dispatch table
|
|
||||||
mov r2, [r2 + (%2q-2)*gprsize] ; r2 = selected table entry
|
|
||||||
%ifdef PIC
|
|
||||||
lea r3, [$$] ; r3 = address of the start of this section
|
|
||||||
add r2, r3 ; rebase the entry onto the section start
|
|
||||||
%endif
|
|
||||||
call r2 ; indirect call to the selected routine
|
|
||||||
%endmacro ; FFT_DISPATCH
|
|
||||||
|
|
||||||
%macro DECL_FFT 1-2 ; nbits, suffix
|
%macro DECL_FFT 1-2 ; nbits, suffix
|
||||||
%ifidn %0, 1
|
%ifidn %0, 1
|
||||||
%xdefine fullsuffix SUFFIX
|
%xdefine fullsuffix SUFFIX
|
||||||
|
@ -1,110 +0,0 @@
|
|||||||
/*
|
|
||||||
* FFT/MDCT transform with SSE optimizations
|
|
||||||
* Copyright (c) 2008 Loren Merritt
|
|
||||||
*
|
|
||||||
* This file is part of Libav.
|
|
||||||
*
|
|
||||||
* Libav is free software; you can redistribute it and/or
|
|
||||||
* modify it under the terms of the GNU Lesser General Public
|
|
||||||
* License as published by the Free Software Foundation; either
|
|
||||||
* version 2.1 of the License, or (at your option) any later version.
|
|
||||||
*
|
|
||||||
* Libav is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
* Lesser General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU Lesser General Public
|
|
||||||
* License along with Libav; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "libavutil/x86_cpu.h"
|
|
||||||
#include "libavcodec/dsputil.h"
|
|
||||||
#include "fft.h"
|
|
||||||
#include "config.h"
|
|
||||||
|
|
||||||
DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
|
|
||||||
{ 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
|
|
||||||
|
|
||||||
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
|
|
||||||
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
|
|
||||||
void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
|
|
||||||
|
|
||||||
#if HAVE_AVX
|
|
||||||
/* AVX FFT entry point: forwards z and the context's nbits to the
 * interleaving AVX dispatcher. */
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
{
    const int nbits = s->nbits;

    ff_fft_dispatch_interleave_avx(z, nbits);
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
 * SSE FFT entry point.
 *
 * Runs the interleaving SSE dispatcher on z.  When the transform has at most
 * 16 points, an extra pass interleaves every pair of consecutive 16-byte
 * vectors in z with unpcklps/unpckhps.  The byte offset is computed as
 * -8 * count, so the pass relies on FFTComplex being 8 bytes wide.
 *
 * NOTE(review): the asm writes xmm0/xmm1 but only "memory" is listed as a
 * clobber -- confirm whether that is intentional.
 */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    const int count = 1 << s->nbits;

    ff_fft_dispatch_interleave_sse(z, s->nbits);

    if (count <= 16) {
        FFTComplex *const end = z + count;
        x86_reg pos = -8*count;     /* negative byte offset from end, runs up to 0 */

        __asm__ volatile(
            "1: \n"
            "movaps (%0,%1), %%xmm0 \n"
            "movaps %%xmm0, %%xmm1 \n"
            "unpcklps 16(%0,%1), %%xmm0 \n"
            "unpckhps 16(%0,%1), %%xmm1 \n"
            "movaps %%xmm0, (%0,%1) \n"
            "movaps %%xmm1, 16(%0,%1) \n"
            "add $32, %0 \n"
            "jl 1b \n"
            :"+r"(pos)
            :"r"(end)
            :"memory"
        );
    }
}
|
|
||||||
|
|
||||||
/*
 * Reorders z according to s->revtab.
 *
 * Elements are handled in pairs: one 16-byte load fetches z[idx] and
 * z[idx+1], and the low and high halves are stored to
 * tmp_buf[revtab[idx]] and tmp_buf[revtab[idx+1]].  The permuted buffer
 * is then copied back over z.
 */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    const int count = 1 << s->nbits;
    int idx = 0;

    while (idx < count) {
        __asm__ volatile(
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            :"=m"(s->tmp_buf[s->revtab[idx]]),
             "=m"(s->tmp_buf[s->revtab[idx+1]])
            :"m"(z[idx])
        );
        idx += 2;
    }

    memcpy(z, s->tmp_buf, count*sizeof(FFTComplex));
}
|
|
||||||
|
|
||||||
/*
 * Full inverse MDCT built on top of the context's half transform.
 *
 * s->imdct_half() writes its result starting at output + mdct_size/4.
 * The loop then walks two byte offsets in opposite directions (one up from
 * -mdct_size to 0, one down from mdct_size-16).  Each 16-byte vector is
 * element-reversed with shufps 0x1b and stored at the mirrored position;
 * the vectors taken from the lower base pointer are also sign-flipped
 * with ff_m1m1m1m1.
 */
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    long size    = s->mdct_size;
    long quarter = size >> 2;
    FFTSample *lo = output + quarter;
    FFTSample *hi = output + quarter*3;
    x86_reg up, down;

    s->imdct_half(s, lo, input);

    up   = -size;
    down = size-16;
    __asm__ volatile(
        "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"
        "1: \n"
        "movaps (%2,%1), %%xmm0 \n"
        "movaps (%3,%0), %%xmm1 \n"
        "shufps $0x1b, %%xmm0, %%xmm0 \n"
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "xorps %%xmm7, %%xmm0 \n"
        "movaps %%xmm1, (%3,%1) \n"
        "movaps %%xmm0, (%2,%0) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        :"+r"(up), "+r"(down)
        :"r"(lo), "r"(hi)
        XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7")
    );
}
|
|
Loading…
Reference in New Issue
Block a user