You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-08-15 14:13:16 +02:00
Revert "x86: fft: convert sse inline asm to yasm"
This reverts commit 8299260470.
It breaks shared builds on x86_64.
This commit is contained in:
@@ -43,6 +43,7 @@ YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o
|
|||||||
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o
|
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o
|
||||||
YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o
|
YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o
|
||||||
YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o
|
YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o
|
||||||
|
YASM-OBJS-FFT-$(HAVE_SSE) += x86/fft_sse.o
|
||||||
YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \
|
YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \
|
||||||
$(YASM-OBJS-FFT-yes)
|
$(YASM-OBJS-FFT-yes)
|
||||||
|
|
||||||
|
@@ -47,10 +47,6 @@ struc FFTContext
|
|||||||
.mdctbits: resd 1
|
.mdctbits: resd 1
|
||||||
.tcos: pointer 1
|
.tcos: pointer 1
|
||||||
.tsin: pointer 1
|
.tsin: pointer 1
|
||||||
.fftperm: pointer 1
|
|
||||||
.fftcalc: pointer 1
|
|
||||||
.imdctcalc:pointer 1
|
|
||||||
.imdcthalf:pointer 1
|
|
||||||
endstruc
|
endstruc
|
||||||
|
|
||||||
%define M_SQRT1_2 0.70710678118654752440
|
%define M_SQRT1_2 0.70710678118654752440
|
||||||
@@ -69,7 +65,6 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
|
|||||||
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
|
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
|
||||||
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
|
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
|
||||||
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
|
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
|
||||||
ps_m1m1m1m1: times 4 dd 1<<31
|
|
||||||
ps_m1p1: dd 1<<31, 0
|
ps_m1p1: dd 1<<31, 0
|
||||||
|
|
||||||
%assign i 16
|
%assign i 16
|
||||||
@@ -538,16 +533,6 @@ DEFINE_ARGS z, w, n, o1, o3
|
|||||||
rep ret
|
rep ret
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
|
|
||||||
lea r2, [dispatch_tab%1]
|
|
||||||
mov r2, [r2 + (%2q-2)*gprsize]
|
|
||||||
%ifdef PIC
|
|
||||||
lea r3, [$$]
|
|
||||||
add r2, r3
|
|
||||||
%endif
|
|
||||||
call r2
|
|
||||||
%endmacro ; FFT_DISPATCH
|
|
||||||
|
|
||||||
INIT_YMM avx
|
INIT_YMM avx
|
||||||
|
|
||||||
%if HAVE_AVX
|
%if HAVE_AVX
|
||||||
@@ -564,14 +549,6 @@ INIT_YMM avx
|
|||||||
|
|
||||||
DECL_PASS pass_avx, PASS_BIG 1
|
DECL_PASS pass_avx, PASS_BIG 1
|
||||||
DECL_PASS pass_interleave_avx, PASS_BIG 0
|
DECL_PASS pass_interleave_avx, PASS_BIG 0
|
||||||
|
|
||||||
cglobal fft_calc, 2,5,8
|
|
||||||
mov r3d, [r0 + FFTContext.nbits]
|
|
||||||
mov r0, r1
|
|
||||||
mov r1, r3
|
|
||||||
FFT_DISPATCH _interleave %+ SUFFIX, r1
|
|
||||||
REP_RET
|
|
||||||
|
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
INIT_XMM sse
|
INIT_XMM sse
|
||||||
@@ -589,112 +566,6 @@ INIT_XMM sse
|
|||||||
DECL_PASS pass_sse, PASS_BIG 1
|
DECL_PASS pass_sse, PASS_BIG 1
|
||||||
DECL_PASS pass_interleave_sse, PASS_BIG 0
|
DECL_PASS pass_interleave_sse, PASS_BIG 0
|
||||||
|
|
||||||
cglobal fft_calc, 2,5,8
|
|
||||||
mov r3d, [r0 + FFTContext.nbits]
|
|
||||||
PUSH r1
|
|
||||||
PUSH r3
|
|
||||||
mov r0, r1
|
|
||||||
mov r1, r3
|
|
||||||
FFT_DISPATCH _interleave %+ SUFFIX, r1
|
|
||||||
POP rcx
|
|
||||||
POP r4
|
|
||||||
cmp rcx, 4
|
|
||||||
jg .end
|
|
||||||
mov r2, -1
|
|
||||||
add rcx, 3
|
|
||||||
shl r2, cl
|
|
||||||
sub r4, r2
|
|
||||||
.loop
|
|
||||||
movaps xmm0, [r4 + r2]
|
|
||||||
movaps xmm1, xmm0
|
|
||||||
unpcklps xmm0, [r4 + r2 + 16]
|
|
||||||
unpckhps xmm1, [r4 + r2 + 16]
|
|
||||||
movaps [r4 + r2], xmm0
|
|
||||||
movaps [r4 + r2 + 16], xmm1
|
|
||||||
add r2, 32
|
|
||||||
jl .loop
|
|
||||||
.end:
|
|
||||||
REP_RET
|
|
||||||
|
|
||||||
cextern_naked memcpy
|
|
||||||
|
|
||||||
cglobal fft_permute, 2,7,1
|
|
||||||
mov r4, [r0 + FFTContext.revtab]
|
|
||||||
mov r5, [r0 + FFTContext.tmpbuf]
|
|
||||||
mov ecx, [r0 + FFTContext.nbits]
|
|
||||||
mov r2, 1
|
|
||||||
shl r2, cl
|
|
||||||
xor r0, r0
|
|
||||||
%if ARCH_X86_32
|
|
||||||
mov r1, r1m
|
|
||||||
%endif
|
|
||||||
.loop:
|
|
||||||
movaps xmm0, [r1 + 8*r0]
|
|
||||||
movzx r6, word [r4 + 2*r0]
|
|
||||||
movzx r3, word [r4 + 2*r0 + 2]
|
|
||||||
movlps [r5 + 8*r6], xmm0
|
|
||||||
movhps [r5 + 8*r3], xmm0
|
|
||||||
add r0, 2
|
|
||||||
cmp r0, r2
|
|
||||||
jl .loop
|
|
||||||
shl r2, 3
|
|
||||||
%if ARCH_X86_64
|
|
||||||
mov r0, r1
|
|
||||||
mov r1, r5
|
|
||||||
%else
|
|
||||||
push r2
|
|
||||||
push r5
|
|
||||||
push r1
|
|
||||||
%endif
|
|
||||||
%if ARCH_X86_64 && WIN64 == 0
|
|
||||||
jmp memcpy
|
|
||||||
%else
|
|
||||||
call memcpy
|
|
||||||
%if ARCH_X86_32
|
|
||||||
add esp, 12
|
|
||||||
%endif
|
|
||||||
REP_RET
|
|
||||||
%endif
|
|
||||||
|
|
||||||
cglobal imdct_calc, 3,5,3
|
|
||||||
mov r3d, [r0 + FFTContext.mdctsize]
|
|
||||||
mov r4, [r0 + FFTContext.imdcthalf]
|
|
||||||
add r1, r3
|
|
||||||
PUSH r3
|
|
||||||
PUSH r1
|
|
||||||
%if ARCH_X86_32
|
|
||||||
push r2
|
|
||||||
push r1
|
|
||||||
push r0
|
|
||||||
%else
|
|
||||||
sub rsp, 8
|
|
||||||
%endif
|
|
||||||
call r4
|
|
||||||
%if ARCH_X86_32
|
|
||||||
add esp, 12
|
|
||||||
%else
|
|
||||||
add rsp, 8
|
|
||||||
%endif
|
|
||||||
POP r1
|
|
||||||
POP r3
|
|
||||||
lea r0, [r1 + 2*r3]
|
|
||||||
mov r2, r3
|
|
||||||
sub r3, 16
|
|
||||||
neg r2
|
|
||||||
movaps xmm2, [ps_m1m1m1m1]
|
|
||||||
.loop:
|
|
||||||
movaps xmm0, [r1 + r3]
|
|
||||||
movaps xmm1, [r0 + r2]
|
|
||||||
shufps xmm0, xmm0, 0x1b
|
|
||||||
shufps xmm1, xmm1, 0x1b
|
|
||||||
xorps xmm0, xmm2
|
|
||||||
movaps [r0 + r3], xmm1
|
|
||||||
movaps [r1 + r2], xmm0
|
|
||||||
sub r3, 16
|
|
||||||
add r2, 16
|
|
||||||
jl .loop
|
|
||||||
REP_RET
|
|
||||||
|
|
||||||
INIT_MMX 3dnow
|
INIT_MMX 3dnow
|
||||||
%define mulps pfmul
|
%define mulps pfmul
|
||||||
%define addps pfadd
|
%define addps pfadd
|
||||||
@@ -712,6 +583,16 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
|
|||||||
%define SECTION_REL
|
%define SECTION_REL
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
|
; FFT_DISPATCH suffix, reg
;   %1 = suffix pasted onto "dispatch_tab" to select the table to use
;   %2 = register whose qword form (%2q) indexes that table; the index is
;        biased by 2 and scaled by gprsize
;        (presumably this is nbits -- confirm against the macro's callers)
; Loads one table entry into r2 and calls through it. r2 and r3 are used as
; scratch here; the callee's own register usage is summarised by the
; existing "clobbers" note on the %macro line.
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
|
||||||
|
lea r2, [dispatch_tab%1] ; r2 = address of the selected table
|
||||||
|
mov r2, [r2 + (%2q-2)*gprsize] ; r2 = table entry number (%2 - 2)
|
||||||
|
%ifdef PIC
|
||||||
|
lea r3, [$$] ; r3 = start address of the current section
|
||||||
|
add r2, r3 ; under PIC the entry is added to the section start to form the target
|
||||||
|
%endif
|
||||||
|
call r2 ; indirect call to the selected routine
|
||||||
|
%endmacro ; FFT_DISPATCH
|
||||||
|
|
||||||
%macro DECL_FFT 1-2 ; nbits, suffix
|
%macro DECL_FFT 1-2 ; nbits, suffix
|
||||||
%ifidn %0, 1
|
%ifidn %0, 1
|
||||||
%xdefine fullsuffix SUFFIX
|
%xdefine fullsuffix SUFFIX
|
||||||
|
110
libavcodec/x86/fft_sse.c
Normal file
110
libavcodec/x86/fft_sse.c
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
/*
|
||||||
|
* FFT/MDCT transform with SSE optimizations
|
||||||
|
* Copyright (c) 2008 Loren Merritt
|
||||||
|
*
|
||||||
|
* This file is part of Libav.
|
||||||
|
*
|
||||||
|
* Libav is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Libav is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with Libav; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "libavutil/x86_cpu.h"
|
||||||
|
#include "libavcodec/dsputil.h"
|
||||||
|
#include "fft.h"
|
||||||
|
#include "config.h"
|
||||||
|
|
||||||
|
/* Four 32-bit words, each with only bit 31 set. ff_imdct_calc_sse() in this
 * file loads the table through MANGLE() and XORs it into a vector (xorps).
 * The leading 16 is presumably the requested alignment -- confirm against
 * the definition of DECLARE_ASM_CONST. */
DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
|
||||||
|
{ 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
|
||||||
|
|
||||||
|
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
|
||||||
|
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
|
||||||
|
void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
|
||||||
|
|
||||||
|
#if HAVE_AVX
|
||||||
|
/**
 * AVX FFT entry point: forwards the buffer, together with the size exponent
 * stored in the context, to ff_fft_dispatch_interleave_avx().
 *
 * @param s FFT context; only s->nbits is read here
 * @param z buffer passed unchanged to the dispatch routine
 */
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
{
    const int log2_len = s->nbits;

    ff_fft_dispatch_interleave_avx(z, log2_len);
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/**
 * SSE FFT entry point.
 *
 * Calls ff_fft_dispatch_interleave_sse() on z. When the transform has at
 * most 16 points (1 << s->nbits <= 16) it then makes one extra pass over
 * the buffer, replacing every pair of adjacent 16-byte vectors with their
 * unpcklps / unpckhps interleave.
 *
 * @param s FFT context; only s->nbits is read here
 * @param z buffer to transform; accessed with movaps, so it has to be
 *          16-byte aligned
 */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    const int log2_len = s->nbits;
    const int len      = 1 << log2_len;
    x86_reg offset;

    ff_fft_dispatch_interleave_sse(z, log2_len);

    /* Larger transforms need no fix-up pass. */
    if (len > 16)
        return;

    /* Negative byte offset from the end pointer (z+len), stepped up to 0 in
     * 32-byte increments. The factor 8 assumes sizeof(FFTComplex) == 8 --
     * TODO confirm against the FFTComplex definition. */
    offset = -8*len;

    /* NOTE(review): the template overwrites xmm0 and xmm1, but the clobber
     * list names only "memory" -- confirm whether the XMM registers should
     * be declared as well. */
    __asm__ volatile(
        "1: \n"
        "movaps (%0,%1), %%xmm0 \n"
        "movaps %%xmm0, %%xmm1 \n"
        "unpcklps 16(%0,%1), %%xmm0 \n"
        "unpckhps 16(%0,%1), %%xmm1 \n"
        "movaps %%xmm0, (%0,%1) \n"
        "movaps %%xmm1, 16(%0,%1) \n"
        "add $32, %0 \n"
        "jl 1b \n"
        :"+r"(offset)
        :"r"(z+len)
        :"memory"
    );
}
|
||||||
|
|
||||||
|
/**
 * Reorder the 1 << s->nbits elements of z according to s->revtab.
 *
 * Elements are handled two at a time: one 16-byte load fetches z[i] and
 * z[i+1], the low half is stored to s->tmp_buf[s->revtab[i]] and the high
 * half to s->tmp_buf[s->revtab[i+1]]. The permuted scratch buffer is then
 * copied back over z.
 *
 * @param s FFT context; reads nbits and revtab, writes tmp_buf
 * @param z buffer to permute in place; read with movaps, so it has to be
 *          16-byte aligned
 */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;
    int i;

    for (i = 0; i < n; i += 2) {
        __asm__ volatile(
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            :"=m"(s->tmp_buf[s->revtab[i]]),
             "=m"(s->tmp_buf[s->revtab[i+1]])
            /* movaps reads 16 bytes starting at z[i], i.e. z[i] and z[i+1];
             * z[i+1] is listed as an input (unused in the template) so the
             * compiler knows both elements are read. */
            :"m"(z[i]),
             "m"(z[i+1])
            /* xmm0 is overwritten by the template; declare it the same way
             * ff_imdct_calc_sse() declares its XMM registers. */
            XMM_CLOBBERS_ONLY("%xmm0")
        );
    }
    memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
}
|
||||||
|
|
||||||
|
/**
 * Full inverse MDCT built on top of the context's half-size routine.
 *
 * s->imdct_half() is called with the destination output + n/4, where
 * n = s->mdct_size. The SSE loop that follows walks two 16-byte cursors in
 * opposite directions (j upwards from -n to 0, k downwards from n - 16):
 *   - the vector at (output + n/4)   + k is lane-reversed, has all four
 *     sign bits flipped, and is stored at (output + n/4)   + j;
 *   - the vector at (output + 3*n/4) + j is lane-reversed and stored at
 *     (output + 3*n/4) + k.
 * j and k are byte offsets added to those FFTSample pointers.
 *
 * @param s      context; reads mdct_size and calls imdct_half
 * @param output destination buffer; accessed with movaps, so it has to be
 *               16-byte aligned
 * @param input  forwarded unchanged to s->imdct_half()
 */
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    const long n       = s->mdct_size;
    const long quarter = n >> 2;
    FFTSample *lo = output + quarter;
    FFTSample *hi = output + quarter*3;
    x86_reg j = -n;
    x86_reg k = n-16;

    s->imdct_half(s, lo, input);

    /* NOTE(review): the template stores through %2 and %3, yet no "memory"
     * clobber or memory output operand is declared -- confirm whether one
     * is needed here. */
    __asm__ volatile(
        "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"
        "1: \n"
        "movaps (%2,%1), %%xmm0 \n"
        "movaps (%3,%0), %%xmm1 \n"
        "shufps $0x1b, %%xmm0, %%xmm0 \n"
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "xorps %%xmm7, %%xmm0 \n"
        "movaps %%xmm1, (%3,%1) \n"
        "movaps %%xmm0, (%2,%0) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        :"+r"(j), "+r"(k)
        :"r"(lo), "r"(hi)
        XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7")
    );
}
|
Reference in New Issue
Block a user