diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 6602cceea6..6464739d03 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -39,7 +39,6 @@ YASM-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o -YASM-OBJS-FFT-$(HAVE_SSE) += x86/fft_sse.o YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \ $(YASM-OBJS-FFT-yes) YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \ diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm index b60d8b0a47..1cacfb7bd6 100644 --- a/libavcodec/x86/fft_mmx.asm +++ b/libavcodec/x86/fft_mmx.asm @@ -45,6 +45,10 @@ struc FFTContext .mdctbits: resd 1 .tcos: pointer 1 .tsin: pointer 1 + .fftperm: pointer 1 + .fftcalc: pointer 1 + .imdctcalc:pointer 1 + .imdcthalf:pointer 1 endstruc SECTION_RODATA @@ -65,6 +69,7 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 +ps_m1m1m1m1: times 4 dd 1<<31 ps_m1p1: dd 1<<31, 0 %assign i 16 @@ -532,6 +537,16 @@ DEFINE_ARGS z, w, n, o1, o3 rep ret %endmacro +%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs + lea r2, [dispatch_tab%1] + mov r2, [r2 + (%2q-2)*gprsize] +%ifdef PIC + lea r3, [$$] + add r2, r3 +%endif + call r2 +%endmacro ; FFT_DISPATCH + INIT_YMM avx %if HAVE_AVX @@ -548,6 +563,14 @@ INIT_YMM avx DECL_PASS pass_avx, PASS_BIG 1 DECL_PASS pass_interleave_avx, PASS_BIG 0 + +cglobal fft_calc, 2,5,8 + mov r3d, [r0 + FFTContext.nbits] + mov r0, r1 + mov r1, r3 + FFT_DISPATCH _interleave %+ SUFFIX, r1 + REP_RET + %endif INIT_XMM sse @@ -565,6 +588,112 @@ INIT_XMM sse DECL_PASS pass_sse, PASS_BIG 1 DECL_PASS pass_interleave_sse, PASS_BIG 0 +cglobal fft_calc, 2,5,8 + mov r3d, [r0 + FFTContext.nbits] + PUSH r1 + 
PUSH r3 + mov r0, r1 + mov r1, r3 + FFT_DISPATCH _interleave %+ SUFFIX, r1 + POP rcx + POP r4 + cmp rcx, 4 + jg .end + mov r2, -1 + add rcx, 3 + shl r2, cl + sub r4, r2 +.loop: + movaps xmm0, [r4 + r2] + movaps xmm1, xmm0 + unpcklps xmm0, [r4 + r2 + 16] + unpckhps xmm1, [r4 + r2 + 16] + movaps [r4 + r2], xmm0 + movaps [r4 + r2 + 16], xmm1 + add r2, 32 + jl .loop +.end: + REP_RET + +cextern_naked memcpy + +cglobal fft_permute, 2,7,1 + mov r4, [r0 + FFTContext.revtab] + mov r5, [r0 + FFTContext.tmpbuf] + mov ecx, [r0 + FFTContext.nbits] + mov r2, 1 + shl r2, cl + xor r0, r0 +%if ARCH_X86_32 + mov r1, r1m +%endif +.loop: + movaps xmm0, [r1 + 8*r0] + movzx r6, word [r4 + 2*r0] + movzx r3, word [r4 + 2*r0 + 2] + movlps [r5 + 8*r6], xmm0 + movhps [r5 + 8*r3], xmm0 + add r0, 2 + cmp r0, r2 + jl .loop + shl r2, 3 +%if ARCH_X86_64 + mov r0, r1 + mov r1, r5 +%else + push r2 + push r5 + push r1 +%endif +%if ARCH_X86_64 && WIN64 == 0 + jmp memcpy +%else + call memcpy +%if ARCH_X86_32 + add esp, 12 +%endif + REP_RET +%endif + +cglobal imdct_calc, 3,5,3 + mov r3d, [r0 + FFTContext.mdctsize] + mov r4, [r0 + FFTContext.imdcthalf] + add r1, r3 + PUSH r3 + PUSH r1 +%if ARCH_X86_32 + push r2 + push r1 + push r0 +%else + sub rsp, 8+32*WIN64 ; allocate win64 shadow space +%endif + call r4 +%if ARCH_X86_32 + add esp, 12 +%else + add rsp, 8+32*WIN64 +%endif + POP r1 + POP r3 + lea r0, [r1 + 2*r3] + mov r2, r3 + sub r3, 16 + neg r2 + movaps xmm2, [ps_m1m1m1m1] +.loop: + movaps xmm0, [r1 + r3] + movaps xmm1, [r0 + r2] + shufps xmm0, xmm0, 0x1b + shufps xmm1, xmm1, 0x1b + xorps xmm0, xmm2 + movaps [r0 + r3], xmm1 + movaps [r1 + r2], xmm0 + sub r3, 16 + add r2, 16 + jl .loop + REP_RET + INIT_MMX 3dnow %define mulps pfmul %define addps pfadd @@ -582,16 +711,6 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0 %define SECTION_REL %endif -%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs - lea r2, [dispatch_tab%1] - mov r2, [r2 + (%2q-2)*gprsize] -%ifdef PIC - lea r3, [$$] - add r2, r3 -%endif - call r2 -%endmacro ; FFT_DISPATCH - %macro DECL_FFT
1-2 ; nbits, suffix %ifidn %0, 1 %xdefine fullsuffix SUFFIX diff --git a/libavcodec/x86/fft_sse.c b/libavcodec/x86/fft_sse.c deleted file mode 100644 index 13b992f47a..0000000000 --- a/libavcodec/x86/fft_sse.c +++ /dev/null @@ -1,110 +0,0 @@ -/* - * FFT/MDCT transform with SSE optimizations - * Copyright (c) 2008 Loren Merritt - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86_cpu.h" -#include "libavcodec/dsputil.h" -#include "fft.h" -#include "config.h" - -DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] = - { 1U << 31, 1U << 31, 1U << 31, 1U << 31 }; - -void ff_fft_dispatch_sse(FFTComplex *z, int nbits); -void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits); -void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits); - -#if HAVE_AVX -void ff_fft_calc_avx(FFTContext *s, FFTComplex *z) -{ - ff_fft_dispatch_interleave_avx(z, s->nbits); -} -#endif - -void ff_fft_calc_sse(FFTContext *s, FFTComplex *z) -{ - int n = 1 << s->nbits; - - ff_fft_dispatch_interleave_sse(z, s->nbits); - - if(n <= 16) { - x86_reg i = -8*n; - __asm__ volatile( - "1: \n" - "movaps (%0,%1), %%xmm0 \n" - "movaps %%xmm0, %%xmm1 \n" - "unpcklps 16(%0,%1), %%xmm0 \n" - "unpckhps 16(%0,%1), %%xmm1 \n" - "movaps %%xmm0, (%0,%1) \n" - "movaps %%xmm1, 
16(%0,%1) \n" - "add $32, %0 \n" - "jl 1b \n" - :"+r"(i) - :"r"(z+n) - :"memory" - ); - } -} - -void ff_fft_permute_sse(FFTContext *s, FFTComplex *z) -{ - int n = 1 << s->nbits; - int i; - for(i=0; i<n; i+=2) { - __asm__ volatile( - "movaps %2, %%xmm0 \n" - "movlps %%xmm0, %0 \n" - "movhps %%xmm0, %1 \n" - :"=m"(s->tmp_buf[s->revtab[i]]), - "=m"(s->tmp_buf[s->revtab[i+1]]) - :"m"(z[i]) - ); - } - memcpy(z, s->tmp_buf, n*sizeof(FFTComplex)); -} - -void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input) -{ - x86_reg j, k; - long n = s->mdct_size; - long n4 = n >> 2; - - s->imdct_half(s, output + n4, input); - - j = -n; - k = n-16; - __asm__ volatile( - "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n" - "1: \n" - "movaps (%2,%1), %%xmm0 \n" - "movaps (%3,%0), %%xmm1 \n" - "shufps $0x1b, %%xmm0, %%xmm0 \n" - "shufps $0x1b, %%xmm1, %%xmm1 \n" - "xorps %%xmm7, %%xmm0 \n" - "movaps %%xmm1, (%3,%1) \n" - "movaps %%xmm0, (%2,%0) \n" - "sub $16, %1 \n" - "add $16, %0 \n" - "jl 1b \n" - :"+r"(j), "+r"(k) - :"r"(output+n4), "r"(output+n4*3) - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7") - ); -}