FFmpeg/libavutil/x86/float_dsp_init.c

/*
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/cpu.h"
#include "libavutil/float_dsp.h"
#include "cpu.h"
#include "asm.h"

/*
 * Prototypes for the SIMD kernels that ff_float_dsp_init_x86() installs
 * under its EXTERNAL_*() checks. Their bodies are not part of this file
 * (presumably the accompanying x86 assembly source -- confirm).
 */

/* Candidates for AVFloatDSPContext.vector_fmul. */
void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
                        int len);
void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
                        int len);

/* Candidates for AVFloatDSPContext.vector_fmac_scalar. */
void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
                               int len);
void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
                               int len);

/* Candidate for AVFloatDSPContext.vector_fmul_scalar (no AVX variant here). */
void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
                               int len);

/* Candidates for AVFloatDSPContext.vector_dmul_scalar. */
void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
                                double mul, int len);
void ff_vector_dmul_scalar_avx(double *dst, const double *src,
                               double mul, int len);

#if HAVE_6REGS
/**
 * Overlap/window two float vectors using 3DNow!ext inline assembly.
 *
 * With i running over [0, len) and j = len - 1 - i, the loop computes
 *     dst[i]       = src0[i] * win[len + j] - src1[j] * win[i]
 *     dst[len + j] = src0[i] * win[i]       + src1[j] * win[len + j]
 * two floats (one 64-bit MMX register) from each end per iteration.
 *
 * @param dst  output, 2 * len floats are written
 * @param src0 first input, len floats are read front to back
 * @param src1 second input, len floats are read back to front
 * @param win  window, 2 * len floats are read
 * @param len  number of floats in src0/src1; the loop body runs before the
 *             exit test and advances by 2 floats, so this has to be a
 *             positive multiple of 2
 */
static void vector_fmul_window_3dnowext(float *dst, const float *src0,
                                        const float *src1, const float *win,
                                        int len)
{
    /* Byte offsets relative to the "+ len" bases passed below:
     * i walks upwards from -len floats to 0, j walks downwards from the
     * last float pair. */
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 8;
    __asm__ volatile (
        "1:                             \n"
        "pswapd (%5, %1), %%mm1         \n" // win[len + j], pair swapped
        "movq   (%5, %0), %%mm0         \n" // win[len + i]
        "pswapd (%4, %1), %%mm5         \n" // src1[j], pair swapped
        "movq   (%3, %0), %%mm4         \n" // src0[len + i]
        "movq      %%mm0, %%mm2         \n"
        "movq      %%mm1, %%mm3         \n"
        "pfmul     %%mm4, %%mm2         \n" // src0[len + i] * win[len + i]
        "pfmul     %%mm5, %%mm3         \n" // src1[j]       * win[len + j]
        "pfmul     %%mm4, %%mm1         \n" // src0[len + i] * win[len + j]
        "pfmul     %%mm5, %%mm0         \n" // src1[j]       * win[len + i]
        "pfadd     %%mm3, %%mm2         \n"
        "pfsub     %%mm0, %%mm1         \n"
        "pswapd    %%mm2, %%mm2         \n" // back to memory order for the j side
        "movq      %%mm1, (%2, %0)      \n"
        "movq      %%mm2, (%2, %1)      \n"
        "sub          $8, %1            \n"
        "add          $8, %0            \n"
        "jl           1b                \n" // loop while i is still negative
        "femms                          \n" // leave MMX state before returning
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
        /* The buffers are only passed as addresses, so the compiler must be
         * told explicitly that the asm loads from and stores to memory. */
        : "memory"
    );
}

/**
 * Overlap/window two float vectors using SSE inline assembly.
 *
 * With i running over [0, len) and j = len - 1 - i, the loop computes
 *     dst[i]       = src0[i] * win[len + j] - src1[j] * win[i]
 *     dst[len + j] = src0[i] * win[i]       + src1[j] * win[len + j]
 * four floats (one XMM register) from each end per iteration.
 *
 * All loads and stores use movaps, so every accessed address has to be
 * 16-byte aligned.
 *
 * @param dst  output, 2 * len floats are written
 * @param src0 first input, len floats are read front to back
 * @param src1 second input, len floats are read back to front
 * @param win  window, 2 * len floats are read
 * @param len  number of floats in src0/src1; the loop body runs before the
 *             exit test and advances by 4 floats, so this has to be a
 *             positive multiple of 4
 */
static void vector_fmul_window_sse(float *dst, const float *src0,
                                   const float *src1, const float *win, int len)
{
    /* Byte offsets relative to the "+ len" bases passed below:
     * i walks upwards from -len floats to 0, j walks downwards from the
     * last group of four floats. */
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 16;
    __asm__ volatile (
        "1:                             \n"
        "movaps      (%5, %1), %%xmm1   \n" // win[len + j]
        "movaps      (%5, %0), %%xmm0   \n" // win[len + i]
        "movaps      (%4, %1), %%xmm5   \n" // src1[j]
        "movaps      (%3, %0), %%xmm4   \n" // src0[len + i]
        "shufps $0x1b, %%xmm1, %%xmm1   \n" // 0x1b reverses the four lanes
        "shufps $0x1b, %%xmm5, %%xmm5   \n"
        "movaps        %%xmm0, %%xmm2   \n"
        "movaps        %%xmm1, %%xmm3   \n"
        "mulps         %%xmm4, %%xmm2   \n" // src0[len + i] * win[len + i]
        "mulps         %%xmm5, %%xmm3   \n" // src1[j]       * win[len + j]
        "mulps         %%xmm4, %%xmm1   \n" // src0[len + i] * win[len + j]
        "mulps         %%xmm5, %%xmm0   \n" // src1[j]       * win[len + i]
        "addps         %%xmm3, %%xmm2   \n"
        "subps         %%xmm0, %%xmm1   \n"
        "shufps $0x1b, %%xmm2, %%xmm2   \n" // back to memory order for the j side
        "movaps        %%xmm1, (%2, %0) \n"
        "movaps        %%xmm2, (%2, %1) \n"
        "sub              $16, %1       \n"
        "add              $16, %0       \n"
        "jl                1b           \n" // loop while i is still negative
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
        /* The buffers are only passed as addresses, so the compiler must be
         * told explicitly that the asm loads from and stores to memory.
         * NOTE(review): xmm0-xmm5 are written but not listed as clobbers;
         * confirm whether the build provides a portable way to declare them. */
        : "memory"
    );
}
#endif /* HAVE_6REGS */

/**
 * Install the x86 implementations available on the running CPU.
 *
 * The CPU flags are fetched once with av_get_cpu_flags(); every check that
 * matches overwrites the corresponding function pointer(s) in fdsp. The
 * checks are ordered so that a later match replaces an earlier one for the
 * same pointer. Pointers without a matching check are left untouched.
 *
 * @param fdsp context whose function pointers are updated in place
 */
void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
    const int cpu_flags = av_get_cpu_flags();

#if HAVE_6REGS
    /* Inline-asm window kernels: SSE is tested last and therefore wins
     * over 3DNow!ext when both checks succeed. */
    if (INLINE_AMD3DNOWEXT(cpu_flags))
        fdsp->vector_fmul_window = vector_fmul_window_3dnowext;
    if (INLINE_SSE(cpu_flags))
        fdsp->vector_fmul_window = vector_fmul_window_sse;
#endif

    /* Externally implemented kernels. */
    if (EXTERNAL_SSE(cpu_flags)) {
        fdsp->vector_fmul        = ff_vector_fmul_sse;
        fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
        fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;
    }

    if (EXTERNAL_SSE2(cpu_flags))
        fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;

    /* AVX replaces the SSE/SSE2 choices above; vector_fmul_scalar has no
     * AVX counterpart here and keeps whatever was set before. */
    if (EXTERNAL_AVX(cpu_flags)) {
        fdsp->vector_fmul        = ff_vector_fmul_avx;
        fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
        fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;
    }
}
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 19:58:41 +03:00			`/*`
			`* This file is part of Libav.`
			`*`
			`* Libav is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* Libav is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with Libav; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`#include "config.h"`

			`#include "libavutil/cpu.h"`
			`#include "libavutil/float_dsp.h"`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 20:01:05 +03:00			`#include "cpu.h"`
lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-07 07:47:30 +03:00			`#include "asm.h"`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 19:58:41 +03:00
			`extern void ff_vector_fmul_sse(float dst, const float src0, const float *src1,`
			`int len);`
			`extern void ff_vector_fmul_avx(float dst, const float src0, const float *src1,`
			`int len);`

float_dsp: add x86-optimized functions for vector_fmac_scalar() 2012-06-09 06:20:59 +03:00			`extern void ff_vector_fmac_scalar_sse(float dst, const float src, float mul,`
			`int len);`
			`extern void ff_vector_fmac_scalar_avx(float dst, const float src, float mul,`
			`int len);`

x86: float_dsp: add SSE version of vector_fmul_scalar() 2012-09-23 01:41:25 +03:00			`extern void ff_vector_fmul_scalar_sse(float dst, const float src, float mul,`
			`int len);`

float_dsp: add vector_dmul_scalar() to multiply a vector of doubles Include x86-optimized versions for SSE2 and AVX. 2012-09-24 22:00:53 +03:00			`extern void ff_vector_dmul_scalar_sse2(double dst, const double src,`
			`double mul, int len);`
			`extern void ff_vector_dmul_scalar_avx(double dst, const double src,`
			`double mul, int len);`

lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-07 07:47:30 +03:00			`#if HAVE_6REGS`
			`static void vector_fmul_window_3dnowext(float dst, const float src0,`
			`const float src1, const float win,`
			`int len)`
			`{`
			`x86_reg i = -len * 4;`
			`x86_reg j = len * 4 - 8;`
			`__asm__ volatile (`
			`"1: \n"`
			`"pswapd (%5, %1), %%mm1 \n"`
			`"movq (%5, %0), %%mm0 \n"`
			`"pswapd (%4, %1), %%mm5 \n"`
			`"movq (%3, %0), %%mm4 \n"`
			`"movq %%mm0, %%mm2 \n"`
			`"movq %%mm1, %%mm3 \n"`
			`"pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]`
			`"pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j]`
			`"pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]`
			`"pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i]`
			`"pfadd %%mm3, %%mm2 \n"`
			`"pfsub %%mm0, %%mm1 \n"`
			`"pswapd %%mm2, %%mm2 \n"`
			`"movq %%mm1, (%2, %0) \n"`
			`"movq %%mm2, (%2, %1) \n"`
			`"sub $8, %1 \n"`
			`"add $8, %0 \n"`
			`"jl 1b \n"`
			`"femms \n"`
			`: "+r"(i), "+r"(j)`
			`: "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)`
			`);`
			`}`

			`static void vector_fmul_window_sse(float dst, const float src0,`
			`const float src1, const float win, int len)`
			`{`
			`x86_reg i = -len * 4;`
			`x86_reg j = len * 4 - 16;`
			`__asm__ volatile (`
			`"1: \n"`
			`"movaps (%5, %1), %%xmm1 \n"`
			`"movaps (%5, %0), %%xmm0 \n"`
			`"movaps (%4, %1), %%xmm5 \n"`
			`"movaps (%3, %0), %%xmm4 \n"`
			`"shufps $0x1b, %%xmm1, %%xmm1 \n"`
			`"shufps $0x1b, %%xmm5, %%xmm5 \n"`
			`"movaps %%xmm0, %%xmm2 \n"`
			`"movaps %%xmm1, %%xmm3 \n"`
			`"mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]`
			`"mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j]`
			`"mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]`
			`"mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i]`
			`"addps %%xmm3, %%xmm2 \n"`
			`"subps %%xmm0, %%xmm1 \n"`
			`"shufps $0x1b, %%xmm2, %%xmm2 \n"`
			`"movaps %%xmm1, (%2, %0) \n"`
			`"movaps %%xmm2, (%2, %1) \n"`
			`"sub $16, %1 \n"`
			`"add $16, %0 \n"`
			`"jl 1b \n"`
			`: "+r"(i), "+r"(j)`
			`: "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)`
			`);`
			`}`
			`#endif /* HAVE_6REGS */`

Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 19:58:41 +03:00			`void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)`
			`{`
			`int mm_flags = av_get_cpu_flags();`

lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-07 07:47:30 +03:00			`#if HAVE_6REGS`
			`if (INLINE_AMD3DNOWEXT(mm_flags)) {`
			`fdsp->vector_fmul_window = vector_fmul_window_3dnowext;`
			`}`
			`if (INLINE_SSE(mm_flags)) {`
			`fdsp->vector_fmul_window = vector_fmul_window_sse;`
			`}`
			`#endif`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 20:01:05 +03:00			`if (EXTERNAL_SSE(mm_flags)) {`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 19:58:41 +03:00			`fdsp->vector_fmul = ff_vector_fmul_sse;`
float_dsp: add x86-optimized functions for vector_fmac_scalar() 2012-06-09 06:20:59 +03:00			`fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;`
x86: float_dsp: add SSE version of vector_fmul_scalar() 2012-09-23 01:41:25 +03:00			`fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 19:58:41 +03:00			`}`
float_dsp: add vector_dmul_scalar() to multiply a vector of doubles Include x86-optimized versions for SSE2 and AVX. 2012-09-24 22:00:53 +03:00			`if (EXTERNAL_SSE2(mm_flags)) {`
			`fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;`
			`}`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 20:01:05 +03:00			`if (EXTERNAL_AVX(mm_flags)) {`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 19:58:41 +03:00			`fdsp->vector_fmul = ff_vector_fmul_avx;`
float_dsp: add x86-optimized functions for vector_fmac_scalar() 2012-06-09 06:20:59 +03:00			`fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;`
float_dsp: add vector_dmul_scalar() to multiply a vector of doubles Include x86-optimized versions for SSE2 and AVX. 2012-09-24 22:00:53 +03:00			`fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 19:58:41 +03:00			`}`
			`}`