1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-23 12:43:46 +02:00

x86: fft: convert sse inline asm to yasm

This commit is contained in:
Mans Rullgard 2012-06-23 19:08:11 +01:00
parent 8123e0901f
commit 8299260470
3 changed files with 129 additions and 121 deletions

View File

@ -39,7 +39,6 @@ YASM-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o
 YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o
 YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o
 YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o
-YASM-OBJS-FFT-$(HAVE_SSE) += x86/fft_sse.o
 YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \
 $(YASM-OBJS-FFT-yes)
 YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \

View File

@ -45,6 +45,10 @@ struc FFTContext
.mdctbits: resd 1 .mdctbits: resd 1
.tcos: pointer 1 .tcos: pointer 1
.tsin: pointer 1 .tsin: pointer 1
.fftperm: pointer 1
.fftcalc: pointer 1
.imdctcalc:pointer 1
.imdcthalf:pointer 1
endstruc endstruc
SECTION_RODATA SECTION_RODATA
@ -65,6 +69,7 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1m1m1m1: times 4 dd 1<<31
ps_m1p1: dd 1<<31, 0 ps_m1p1: dd 1<<31, 0
%assign i 16 %assign i 16
@ -532,6 +537,16 @@ DEFINE_ARGS z, w, n, o1, o3
rep ret rep ret
%endmacro %endmacro
; FFT_DISPATCH table_suffix, index_reg
;   %1  text appended to "dispatch_tab" to name the table that is used
;   %2  register name; its q-suffixed form supplies the table index
; Fetches entry (%2 - 2) of dispatch_tab%1 and calls it.  Within the macro
; itself r2 is overwritten, and r3 as well when PIC is defined.
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
lea r2, [dispatch_tab%1]
; entries are gprsize bytes wide; the first entry corresponds to %2 == 2
mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
; with PIC the loaded value has the section start ($$) added before use --
; presumably the table stores section-relative offsets; confirm at the table
; definition
lea r3, [$$]
add r2, r3
%endif
call r2
%endmacro ; FFT_DISPATCH
INIT_YMM avx INIT_YMM avx
%if HAVE_AVX %if HAVE_AVX
@ -548,6 +563,14 @@ INIT_YMM avx
DECL_PASS pass_avx, PASS_BIG 1 DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0 DECL_PASS pass_interleave_avx, PASS_BIG 0
; fft_calc(s, z), AVX build of the entry point.
;   r0 = s (FFTContext pointer), r1 = z (data buffer)
; Rearranges the arguments to (z, nbits) and hands over to the interleaving
; dispatch table selected by the current SUFFIX.
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]    ; fetch nbits before r0 is reused
    mov     r0,  r1                         ; first dispatcher argument:  z
    mov     r1,  r3                         ; second dispatcher argument: nbits
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET
%endif %endif
INIT_XMM sse INIT_XMM sse
@ -565,6 +588,112 @@ INIT_XMM sse
DECL_PASS pass_sse, PASS_BIG 1 DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0 DECL_PASS pass_interleave_sse, PASS_BIG 0
; fft_calc(s, z), SSE build of the entry point.
;   r0 = s (FFTContext pointer), r1 = z (data buffer)
; Calls the interleaving dispatcher with (z, nbits).  When nbits <= 4 an
; additional pass walks the whole buffer in 32-byte steps and interleaves
; each pair of 16-byte vectors with unpcklps/unpckhps.
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    PUSH    r1                              ; z must survive the dispatch call
    PUSH    r3                              ; nbits must survive the dispatch call
    mov     r0,  r1
    mov     r1,  r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP     rcx                             ; rcx = nbits (cl is the shift count below)
    POP     r4                              ; r4  = z
    cmp     rcx, 4
    jg      .end                            ; nbits > 4: no extra pass
    mov     r2,  -1
    add     rcx, 3
    shl     r2,  cl                         ; r2 = -(8 << nbits), negative byte offset
    sub     r4,  r2                         ; r4 = z + (8 << nbits), one past the data
.loop:                                      ; r2 counts up from -(8 << nbits) to 0
    movaps  xmm0, [r4 + r2]
    movaps  xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps  [r4 + r2],      xmm0
    movaps  [r4 + r2 + 16], xmm1
    add     r2,  32
    jl      .loop
.end:
    REP_RET
cextern_naked memcpy

; fft_permute(s, z)
;   r0 = s (FFTContext pointer), r1 = z (data buffer)
; Scatters the (1 << nbits) 8-byte elements of z into s->tmpbuf, using the
; 16-bit entries of s->revtab as destination indices, then copies tmpbuf back
; over z with memcpy.
cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
    mov     r5,  [r0 + FFTContext.tmpbuf]
    mov     ecx, [r0 + FFTContext.nbits]
    mov     r2,  1
    shl     r2,  cl                         ; r2 = n = 1 << nbits
    xor     r0,  r0                         ; r0 = element index i
%if ARCH_X86_32
    mov     r1,  r1m                        ; reload z from its argument slot
                                            ; (presumably r1 aliases ecx here and
                                            ; was overwritten above -- confirm)
%endif
.loop:
    movaps  xmm0, [r1 + 8*r0]               ; elements i and i+1
    movzx   r6,  word [r4 + 2*r0]           ; revtab[i]
    movzx   r3,  word [r4 + 2*r0 + 2]       ; revtab[i+1]
    movlps  [r5 + 8*r6], xmm0               ; tmpbuf[revtab[i]]   = z[i]
    movhps  [r5 + 8*r3], xmm0               ; tmpbuf[revtab[i+1]] = z[i+1]
    add     r0,  2
    cmp     r0,  r2
    jl      .loop
    shl     r2,  3                          ; r2 = n * 8 = number of bytes to copy
%if ARCH_X86_64
    ; memcpy(z, tmpbuf, bytes); r2 is left untouched as the length, which
    ; relies on r2 being the third-argument register -- confirm in x86inc
    mov     r0,  r1
    mov     r1,  r5
%else
    push    r2
    push    r5
    push    r1
%endif
%if ARCH_X86_64 && WIN64 == 0
    jmp     memcpy                          ; tail call, memcpy returns to our caller
%else
%if WIN64
    ; The Microsoft x64 ABI requires the caller to reserve 32 bytes of
    ; shadow space for every call; without it the callee's home area
    ; overlaps this function's own stack contents.
    ; NOTE(review): 32 bytes leave the 16-byte alignment of rsp unchanged;
    ; whether rsp is 16-byte aligned at this call depends on what the
    ; cglobal prologue pushed -- confirm against x86inc.
    sub     rsp, 32
%endif
    call    memcpy
%if ARCH_X86_32
    add     esp, 12                         ; drop the three pushed arguments
%endif
%if WIN64
    add     rsp, 32
%endif
    REP_RET
%endif
; imdct_calc(s, output, input)
;   r0 = s (FFTContext pointer), r1 = output, r2 = input
; Calls s->imdcthalf(s, output + mdctsize bytes, input), then fills the two
; outer parts of the output from the part just written: each 16-byte vector
; is copied with its four elements reversed, and the copy stored below the
; centre part additionally has its sign bits flipped.
cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]
    mov     r4,  [r0 + FFTContext.imdcthalf]
    add     r1,  r3                         ; r1 = output + mdctsize (byte offset)
    PUSH    r3                              ; keep mdctsize across the call
    PUSH    r1                              ; keep adjusted output across the call
%if ARCH_X86_32
    push    r2                              ; stack arguments: input,
    push    r1                              ;                  adjusted output,
    push    r0                              ;                  s
%else
    ; 8 bytes of padding as before, plus the 32 bytes of shadow space the
    ; Win64 ABI requires the caller to provide; without it the callee's home
    ; area overlaps the saved r1/r3 and this function's return address.
    sub     rsp, 8+32*WIN64
%endif
    call    r4
%if ARCH_X86_32
    add     esp, 12
%else
    add     rsp, 8+32*WIN64
%endif
    POP     r1                              ; r1 = output + mdctsize
    POP     r3                              ; r3 = mdctsize
    lea     r0,  [r1 + 2*r3]                ; r0 = output + 3*mdctsize
    mov     r2,  r3
    sub     r3,  16                         ; r3 runs downwards from mdctsize-16
    neg     r2                              ; r2 runs upwards from -mdctsize to 0
    movaps  xmm2, [ps_m1m1m1m1]             ; sign-bit mask for all four lanes
.loop:
    movaps  xmm0, [r1 + r3]
    movaps  xmm1, [r0 + r2]
    shufps  xmm0, xmm0, 0x1b                ; reverse the four elements
    shufps  xmm1, xmm1, 0x1b
    xorps   xmm0, xmm2                      ; flip signs
    movaps  [r0 + r3], xmm1
    movaps  [r1 + r2], xmm0
    sub     r3,  16
    add     r2,  16
    jl      .loop                           ; flags come from the add to r2
    REP_RET
INIT_MMX 3dnow INIT_MMX 3dnow
%define mulps pfmul %define mulps pfmul
%define addps pfadd %define addps pfadd
@ -582,16 +711,6 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define SECTION_REL %define SECTION_REL
%endif %endif
; FFT_DISPATCH table_suffix, index_reg
; Loads entry (%2 - 2) of dispatch_tab%1 into r2 and calls it; when PIC is
; defined the section start ($$) is added to the loaded value first, using r3.
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
lea r2, [dispatch_tab%1]
mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
lea r3, [$$]
add r2, r3
%endif
call r2
%endmacro ; FFT_DISPATCH
%macro DECL_FFT 1-2 ; nbits, suffix %macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1 %ifidn %0, 1
%xdefine fullsuffix SUFFIX %xdefine fullsuffix SUFFIX

View File

@ -1,110 +0,0 @@
/*
* FFT/MDCT transform with SSE optimizations
* Copyright (c) 2008 Loren Merritt
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "fft.h"
#include "config.h"
/* Four 32-bit words with only bit 31 set; ff_imdct_calc_sse loads this
 * into xmm7 and XORs it into a vector. */
DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
{ 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
/* Dispatchers called by the wrappers in this file; they are only declared
 * here, not defined. */
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
#if HAVE_AVX
/* AVX entry point: passes the buffer z and the context's nbits value on to
 * the AVX interleaving dispatcher. */
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
{
    const int nbits = s->nbits;

    ff_fft_dispatch_interleave_avx(z, nbits);
}
#endif
/* SSE entry point: runs the interleaving dispatcher on z and, when the
 * transform has at most 16 points, makes one more pass over the buffer. */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
int n = 1 << s->nbits;
ff_fft_dispatch_interleave_sse(z, s->nbits);
if(n <= 16) {
/* i is a negative byte offset relative to z+n and is advanced by 32 per
 * iteration until it is no longer negative. */
x86_reg i = -8*n;
/* Each iteration loads 16 bytes, interleaves them with the following
 * 16 bytes (unpcklps/unpckhps) and stores both results back in place.
 * NOTE(review): xmm0 and xmm1 are modified but do not appear in the
 * clobber list (only "memory" does) -- confirm whether that is intended. */
__asm__ volatile(
"1: \n"
"movaps (%0,%1), %%xmm0 \n"
"movaps %%xmm0, %%xmm1 \n"
"unpcklps 16(%0,%1), %%xmm0 \n"
"unpckhps 16(%0,%1), %%xmm1 \n"
"movaps %%xmm0, (%0,%1) \n"
"movaps %%xmm1, 16(%0,%1) \n"
"add $32, %0 \n"
"jl 1b \n"
:"+r"(i)
:"r"(z+n)
:"memory"
);
}
}
/* Reorders z through s->tmp_buf: the element at index i is written to
 * tmp_buf[revtab[i]], two elements per iteration, and the result is then
 * copied back over z. */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
int n = 1 << s->nbits;
int i;
for(i=0; i<n; i+=2) {
/* One 16-byte load starting at z[i]; the low half is stored to
 * tmp_buf[revtab[i]] and the high half to tmp_buf[revtab[i+1]].
 * NOTE(review): the input operand names only z[i] although movaps reads
 * 16 bytes, and xmm0 is not listed as clobbered -- confirm. */
__asm__ volatile(
"movaps %2, %%xmm0 \n"
"movlps %%xmm0, %0 \n"
"movhps %%xmm0, %1 \n"
:"=m"(s->tmp_buf[s->revtab[i]]),
"=m"(s->tmp_buf[s->revtab[i+1]])
:"m"(z[i])
);
}
memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
}
/* Calls s->imdct_half() on output + n/4 (n = s->mdct_size), then fills the
 * regions on either side of that part from the data just written. */
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
x86_reg j, k;
long n = s->mdct_size;
long n4 = n >> 2;
s->imdct_half(s, output + n4, input);
/* j and k are used as byte offsets inside the asm: j counts up from -n
 * until it is no longer negative, k counts down from n-16, both in steps
 * of 16. */
j = -n;
k = n-16;
/* Per iteration: load 16 bytes at (output+n4)+k and at (output+3*n4)+j,
 * reverse the four 32-bit elements of each (shufps $0x1b), XOR the first
 * with the ff_m1m1m1m1 mask held in xmm7, then store the second to
 * (output+3*n4)+k and the first to (output+n4)+j. */
__asm__ volatile(
"movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"
"1: \n"
"movaps (%2,%1), %%xmm0 \n"
"movaps (%3,%0), %%xmm1 \n"
"shufps $0x1b, %%xmm0, %%xmm0 \n"
"shufps $0x1b, %%xmm1, %%xmm1 \n"
"xorps %%xmm7, %%xmm0 \n"
"movaps %%xmm1, (%3,%1) \n"
"movaps %%xmm0, (%2,%0) \n"
"sub $16, %1 \n"
"add $16, %0 \n"
"jl 1b \n"
:"+r"(j), "+r"(k)
:"r"(output+n4), "r"(output+n4*3)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7")
);
}