FFmpeg/libavcodec/x86/fft_sse.c

/*
 * FFT/MDCT transform with SSE optimizations
 * Copyright (c) 2008 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "fft.h"
#include "config.h"

/* Four 32-bit lanes with only the top bit set. ff_imdct_calc_sse() loads
 * this into an XMM register and XORs it with packed data, which flips the
 * most significant bit of every lane.
 * NOTE(review): the first macro argument is presumably the required
 * alignment in bytes (needed for movaps) -- confirm against the macro. */
DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
    { 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U };

/* Transform kernels defined outside this file (presumably in the external
 * assembly sources -- confirm). Each takes the complex buffer z and
 * log2 of the transform size. */

/* Declared here but not called from the code in this file. */
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
/* Called by ff_fft_calc_sse(). */
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
/* Called by ff_fft_calc_avx(); only referenced when HAVE_AVX is set. */
void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);

#if HAVE_AVX
/**
 * Run the AVX transform on z in place.
 *
 * Thin wrapper: forwards the buffer and the context's nbits to the
 * AVX dispatch routine. Only built when HAVE_AVX is enabled.
 *
 * @param s transform context; only s->nbits is read here
 * @param z buffer handed to the dispatch routine
 */
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
{
    const int nbits = s->nbits;

    ff_fft_dispatch_interleave_avx(z, nbits);
}
#endif

/**
 * Run the SSE transform on z in place.
 *
 * After the dispatch routine has run, transforms of at most 16 points get
 * an extra pass that walks the buffer in 32-byte steps and interleaves the
 * floats of each pair of adjacent 16-byte vectors (unpcklps/unpckhps).
 * NOTE(review): presumably the dispatch routine leaves small transforms
 * un-interleaved and this pass completes the job -- confirm against the
 * assembly implementation.
 *
 * @param s transform context; only s->nbits is read here
 * @param z buffer of 1 << s->nbits complex values, 16-byte aligned
 *          (movaps is used on it)
 */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    const int nbits = s->nbits;
    const int n     = 1 << nbits;
    x86_reg offset;

    ff_fft_dispatch_interleave_sse(z, nbits);

    if (n > 16) {
        return;
    }

    /* Negative byte offset relative to z+n; counts up to zero so the loop
     * can terminate on the flags set by "add".
     * Assumes sizeof(FFTComplex) == 8 -- TODO confirm. */
    offset = -8*n;
    __asm__ volatile(
        "1: \n"
        "movaps     (%0,%1), %%xmm0 \n"
        "movaps      %%xmm0, %%xmm1 \n"
        "unpcklps 16(%0,%1), %%xmm0 \n"
        "unpckhps 16(%0,%1), %%xmm1 \n"
        "movaps      %%xmm0,   (%0,%1) \n"
        "movaps      %%xmm1, 16(%0,%1) \n"
        "add $32, %0 \n"
        "jl 1b \n"
        :"+r"(offset)
        :"r"(z+n)
        :"memory"
    );
}

/**
 * Reorder the elements of z according to s->revtab.
 *
 * Elements are handled two at a time: one 16-byte load fetches z[i] and
 * z[i+1], and the low and high halves are scattered to
 * s->tmp_buf[s->revtab[i]] and s->tmp_buf[s->revtab[i+1]]. The permuted
 * scratch buffer is then copied back over z.
 *
 * @param s transform context; reads nbits and revtab, overwrites tmp_buf
 * @param z buffer of 1 << s->nbits complex values, permuted in place;
 *          must be 16-byte aligned because movaps is used on it
 */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;
    int i;
    for(i=0; i<n; i+=2) {
        __asm__ volatile(
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            :"=m"(s->tmp_buf[s->revtab[i]]),
             "=m"(s->tmp_buf[s->revtab[i+1]])
            /* movaps reads 16 bytes starting at z[i], i.e. z[i+1] as well.
             * z[i+1] is listed as an input (unused in the template) so the
             * compiler knows its value must be in memory before the asm. */
            :"m"(z[i]), "m"(z[i+1])
        );
    }
    memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
}

/**
 * Compute a full inverse MDCT from the half transform.
 *
 * s->imdct_half() is called to write its result starting at output + n/4.
 * The SSE loop then fills the regions on either side of that block from
 * the block itself, 16 bytes at a time:
 *  - vectors read from the lower part of the block (walking downwards from
 *    output + n/4 + n - 16 bytes) are element-reversed, XORed with
 *    ff_m1m1m1m1 (top bit of each lane flipped, i.e. negated for IEEE
 *    floats) and stored ascending from (output + n/4) - n bytes;
 *  - vectors read ascending from (output + 3*n/4) - n bytes are
 *    element-reversed and stored descending from
 *    output + 3*n/4 + n - 16 bytes.
 * NOTE(review): with 4-byte FFTSample these are the first and last quarters
 * of an n-sample output buffer -- confirm sizeof(FFTSample).
 *
 * @param s      transform context; reads mdct_size and calls imdct_half
 * @param output destination buffer, 16-byte aligned (movaps is used on it)
 * @param input  coefficients, passed through to s->imdct_half()
 */
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    x86_reg j, k;
    long n = s->mdct_size;
    long n4 = n >> 2;

    s->imdct_half(s, output + n4, input);

    /* Byte offsets: j counts up from -n to 0 (loop ends when "add" stops
     * producing a negative result), k counts down from n-16. */
    j = -n;
    k = n-16;
    __asm__ volatile(
        "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"
        "1: \n"
        "movaps       (%2,%1), %%xmm0 \n"
        "movaps       (%3,%0), %%xmm1 \n"
        "shufps $0x1b, %%xmm0, %%xmm0 \n"
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "xorps         %%xmm7, %%xmm0 \n"
        "movaps        %%xmm1, (%3,%1) \n"
        "movaps        %%xmm0, (%2,%0) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        :"+r"(j), "+r"(k)
        :"r"(output+n4), "r"(output+n4*3)
        /* The asm loads from and stores to output through register
         * operands, which the compiler cannot see; the "memory" clobber
         * tells it that memory is both read and modified here. */
        :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm7",) "memory"
    );
}
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 02:34:08 +02:00			`/*`
			`* FFT/MDCT transform with SSE optimizations`
optimize imdct_half: remove tmp buffer. skip fft reinterleave pass, leaving data in a format more convenient for simd. merge post-rotate with post-reorder. Originally committed as revision 14700 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 03:33:34 +03:00			`* Copyright (c) 2008 Loren Merritt`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 02:34:08 +02:00			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 18:30:46 +03:00			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 02:34:08 +02:00			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 18:30:46 +03:00			`* version 2.1 of the License, or (at your option) any later version.`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 02:34:08 +02:00			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 18:30:46 +03:00			`* FFmpeg is distributed in the hope that it will be useful,`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 02:34:08 +02:00			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 18:30:46 +03:00			`* License along with FFmpeg; if not, write to the Free Software`
Update licensing information: The FSF changed postal address. Originally committed as revision 4842 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-01-13 00:43:26 +02:00			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 02:34:08 +02:00			`*/`
Use full path for #includes from another directory. Originally committed as revision 13098 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-05-09 14:56:36 +03:00
			`#include "libavutil/x86_cpu.h"`
			`#include "libavcodec/dsputil.h"`
Move per-arch fft init bits into the corresponding subdirs Originally committed as revision 19864 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-09-16 00:14:14 +03:00			`#include "fft.h"`
x86: Add appropriate ifdefs around certain AVX functions. nasm versions prior to 2.09 have trouble assembling some of our AVX code. Protect these sections by preprocessor macros to allow compilation to pass. 2011-05-27 22:18:12 +03:00			`#include "config.h"`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 02:34:08 +02:00
fft: avoid a signed overflow As a signed integer, 1<<31 overflows, so force it to unsigned. Signed-off-by: Alex Converse <alex.converse@gmail.com> 2011-09-20 04:32:09 +03:00			`DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =`
			`{ 1U << 31, 1U << 31, 1U << 31, 1U << 31 };`
sse implementation of imdct. patch mostly by Zuxy Meng (zuxy dot meng at gmail dot com) Originally committed as revision 6311 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-09-21 19:37:39 +03:00
split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 03:26:58 +03:00			`void ff_fft_dispatch_sse(FFTComplex *z, int nbits);`
			`void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);`
Add AVX FFT implementation. Signed-off-by: Reinhard Tartler <siretart@tauware.de> 2011-04-25 12:39:01 +03:00			`void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);`

Fix compilation with YASM/NASM versions not supporting AVX. 2011-05-26 20:44:39 +03:00			`#if HAVE_AVX`
Add AVX FFT implementation. Signed-off-by: Reinhard Tartler <siretart@tauware.de> 2011-04-25 12:39:01 +03:00			`void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)`
			`{`
			`ff_fft_dispatch_interleave_avx(z, s->nbits);`
			`}`
Fix compilation with YASM/NASM versions not supporting AVX. 2011-05-26 20:44:39 +03:00			`#endif`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 02:34:08 +02:00
fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>) Originally committed as revision 2882 to svn://svn.ffmpeg.org/ffmpeg/trunk 2004-03-13 23:43:24 +02:00			`void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 02:34:08 +02:00			`{`
split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 03:26:58 +03:00			`int n = 1 << s->nbits;`

			`ff_fft_dispatch_interleave_sse(z, s->nbits);`

			`if(n <= 16) {`
			`x86_reg i = -8*n;`
Convert asm keyword into __asm__. Neither the asm() nor the __asm__() keyword is part of the C99 standard, but while GCC accepts the former in C89 syntax, it is not accepted in C99 unless GNU extensions are turned on (with -fasm). The latter form is accepted in any syntax as an extension (without requiring further command-line options). Sun Studio C99 compiler also does not accept asm() while accepting __asm__(), albeit reporting warnings that it's not valid C99 syntax. Originally committed as revision 15627 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-10-16 16:34:09 +03:00			`__asm__ volatile(`
split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 03:26:58 +03:00			`"1: \n"`
			`"movaps (%0,%1), %%xmm0 \n"`
			`"movaps %%xmm0, %%xmm1 \n"`
			`"unpcklps 16(%0,%1), %%xmm0 \n"`
			`"unpckhps 16(%0,%1), %%xmm1 \n"`
			`"movaps %%xmm0, (%0,%1) \n"`
			`"movaps %%xmm1, 16(%0,%1) \n"`
			`"add $32, %0 \n"`
			`"jl 1b \n"`
			`:"+r"(i)`
			`:"r"(z+n)`
			`:"memory"`
			`);`
			`}`
			`}`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 02:34:08 +02:00
split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 03:26:58 +03:00			`void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)`
			`{`
			`int n = 1 << s->nbits;`
			`int i;`
			`for(i=0; i<n; i+=2) {`
Convert asm keyword into __asm__. Neither the asm() nor the __asm__() keyword is part of the C99 standard, but while GCC accepts the former in C89 syntax, it is not accepted in C99 unless GNU extensions are turned on (with -fasm). The latter form is accepted in any syntax as an extension (without requiring further command-line options). Sun Studio C99 compiler also does not accept asm() while accepting __asm__(), albeit reporting warnings that it's not valid C99 syntax. Originally committed as revision 15627 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-10-16 16:34:09 +03:00			`__asm__ volatile(`
split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 03:26:58 +03:00			`"movaps %2, %%xmm0 \n"`
			`"movlps %%xmm0, %0 \n"`
			`"movhps %%xmm0, %1 \n"`
			`:"=m"(s->tmp_buf[s->revtab[i]]),`
			`"=m"(s->tmp_buf[s->revtab[i+1]])`
			`:"m"(z[i])`
			`);`
			`}`
			`memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 02:34:08 +02:00			`}`
added define for builtins use - inverse fix by Romain Dolbeau Originally committed as revision 1410 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 19:41:43 +02:00
Merge FFTContext and MDCTContext Originally committed as revision 19931 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-09-20 20:30:20 +03:00			`void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)`
exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-13 18:03:58 +03:00			`{`
			`x86_reg j, k;`
imdct/x86: Use "s->mdct_size" instead of "1 << s->mdct_bits". It generates smaller cleaner code. Originally committed as revision 24887 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-08-23 18:51:09 +03:00			`long n = s->mdct_size;`
optimize imdct_half: remove tmp buffer. skip fft reinterleave pass, leaving data in a format more convenient for simd. merge post-rotate with post-reorder. Originally committed as revision 14700 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 03:33:34 +03:00			`long n4 = n >> 2;`
exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-13 18:03:58 +03:00
Add AVX FFT implementation. Signed-off-by: Reinhard Tartler <siretart@tauware.de> 2011-04-25 12:39:01 +03:00			`s->imdct_half(s, output + n4, input);`
exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-13 18:03:58 +03:00
			`j = -n;`
			`k = n-16;`
Convert asm keyword into __asm__. Neither the asm() nor the __asm__() keyword is part of the C99 standard, but while GCC accepts the former in C89 syntax, it is not accepted in C99 unless GNU extensions are turned on (with -fasm). The latter form is accepted in any syntax as an extension (without requiring further command-line options). Sun Studio C99 compiler also does not accept asm() while accepting __asm__(), albeit reporting warnings that it's not valid C99 syntax. Originally committed as revision 15627 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-10-16 16:34:09 +03:00			`__asm__ volatile(`
Fix ff_imdct_calc_sse() on gcc-4.6 Gcc 4.6 only preserves the first value when using an array with an "m" constraint. Signed-off-by: Mans Rullgard <mans@mansr.com> (cherry picked from commit 770c410fbb8e1b87ce8ad7f3d7eddaa55e2b8295) 2011-01-30 11:04:41 +02:00			`"movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"`
optimize imdct_half: remove tmp buffer. skip fft reinterleave pass, leaving data in a format more convenient for simd. merge post-rotate with post-reorder. Originally committed as revision 14700 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 03:33:34 +03:00			`"1: \n"`
			`"movaps (%2,%1), %%xmm0 \n"`
			`"movaps (%3,%0), %%xmm1 \n"`
			`"shufps $0x1b, %%xmm0, %%xmm0 \n"`
			`"shufps $0x1b, %%xmm1, %%xmm1 \n"`
			`"xorps %%xmm7, %%xmm0 \n"`
			`"movaps %%xmm1, (%3,%1) \n"`
			`"movaps %%xmm0, (%2,%0) \n"`
			`"sub $16, %1 \n"`
			`"add $16, %0 \n"`
			`"jl 1b \n"`
exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-13 18:03:58 +03:00			`:"+r"(j), "+r"(k)`
Fix ff_imdct_calc_sse() on gcc-4.6 Gcc 4.6 only preserves the first value when using an array with an "m" constraint. Signed-off-by: Mans Rullgard <mans@mansr.com> (cherry picked from commit 770c410fbb8e1b87ce8ad7f3d7eddaa55e2b8295) 2011-01-30 11:04:41 +02:00			`:"r"(output+n4), "r"(output+n4*3)`
fft: mark xmm registers as clobbered in ff_imdct_calc_sse Originally committed as revision 25363 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-10-06 04:27:02 +03:00			`XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7")`
exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-13 18:03:58 +03:00			`);`
			`}`