FFmpeg/libavcodec/x86/ac3dsp_init.c

/*
 * x86-optimized AC-3 DSP utils
 * Copyright (c) 2011 Justin Ruggles
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "dsputil_mmx.h"
#include "libavcodec/ac3.h"
#include "libavcodec/ac3dsp.h"

/*
 * Prototypes for the per-instruction-set AC-3 DSP routines.
 * None of them is defined in this file; ff_ac3dsp_init_x86() below stores
 * their addresses in AC3DSPContext after the matching EXTERNAL_*() CPU
 * check succeeds. The name suffix identifies the instruction set the
 * routine is selected for.
 */

/* Candidates for AC3DSPContext.ac3_exponent_min. */
extern void ff_ac3_exponent_min_mmx   (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
extern void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);

/* Candidates for AC3DSPContext.ac3_max_msb_abs_int16. */
extern int ff_ac3_max_msb_abs_int16_mmx  (const int16_t *src, int len);
extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
extern int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);
extern int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len);

/* Candidates for AC3DSPContext.ac3_lshift_int16. */
extern void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);

/* Candidates for AC3DSPContext.ac3_rshift_int32. */
extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);

/* Candidates for AC3DSPContext.float_to_fixed24 (the 3DNow! one is skipped
 * by the init function when bit_exact is requested). */
extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
extern void ff_float_to_fixed24_sse  (int32_t *dst, const float *src, unsigned int len);
extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);

/* Candidate for AC3DSPContext.compute_mantissa_size. */
extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);

/* Candidates for AC3DSPContext.extract_exponents. */
extern void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
extern void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);

/*
 * Workaround for the Intel compiler on 32-bit x86: the inline asm below uses
 * too many operands for it. Forcing HAVE_7REGS to 0 compiles out every
 * "#if HAVE_SSE_INLINE && HAVE_7REGS" section in this file, i.e. the inline
 * SSE downmix and its registration in ff_ac3dsp_init_x86().
 */
#if ARCH_X86_32 && defined(__INTEL_COMPILER)
#       undef HAVE_7REGS
#       define HAVE_7REGS 0
#endif

#if HAVE_SSE_INLINE && HAVE_7REGS

/*
 * Token selectors passed as the mono/stereo arguments of MIX5() and
 * MIX_MISC(): IF1 keeps the wrapped asm string in the instruction stream,
 * IF0 expands to nothing and so drops it.
 */
#define IF1(x) x
#define IF0(x)

/*
 * MIX5(mono, stereo): in-place downmix of exactly five channels using only
 * three distinct coefficients. Exactly one of the two arguments is expected
 * to be IF1 and the other IF0.
 *
 * Expands to a statement that uses these names from the expansion site:
 *   i       - intptr_t byte offset, read and modified ("+&r"); the loop adds
 *             16 per iteration and repeats while the result is negative
 *   matrix  - coefficient table; floats at byte offsets 0, 8 and 24 are read,
 *             i.e. matrix[0][0], matrix[1][0] and matrix[3][0] for a
 *             float (*)[2] table
 *   samples - array of five channel pointers
 *   len     - added to each channel pointer so that (pointer + i) walks the
 *             channel from its start when i starts at -len * sizeof(float)
 *
 * Setup: the three coefficients are broadcast to all lanes of xmm5
 * (offset 0), xmm6 (offset 8) and xmm7 (offset 24).
 *
 * Per iteration (four floats of every channel):
 *   xmm0 = ch0 * xmm5, xmm1 = ch1 * xmm6, xmm2 = ch2 * xmm5,
 *   xmm3 = ch3 * xmm7, xmm4 = ch4 * xmm7
 *   stereo: ch0 <- xmm0 + xmm1 + xmm3,  ch1 <- xmm2 + xmm1 + xmm4
 *   mono:   ch0 <- (xmm0 + xmm3) + (xmm2 + xmm1 + xmm4)
 *
 * The macro applies the offset-0 coefficient to both ch0 and ch2 and the
 * offset-24 coefficient to both ch3 and ch4 without looking at any other
 * matrix entry, so the caller has to make sure the matrix really has that
 * shape before using it.
 *
 * All loads/stores are movaps, which requires 16-byte aligned addresses.
 * The loop body executes before the first test of i, so it always runs at
 * least once.
 */
#define MIX5(mono, stereo)                                      \
    __asm__ volatile (                                          \
        "movss           0(%1), %%xmm5          \n"             \
        "movss           8(%1), %%xmm6          \n"             \
        "movss          24(%1), %%xmm7          \n"             \
        "shufps     $0, %%xmm5, %%xmm5          \n"             \
        "shufps     $0, %%xmm6, %%xmm6          \n"             \
        "shufps     $0, %%xmm7, %%xmm7          \n"             \
        "1:                                     \n"             \
        "movaps       (%0, %2), %%xmm0          \n"             \
        "movaps       (%0, %3), %%xmm1          \n"             \
        "movaps       (%0, %4), %%xmm2          \n"             \
        "movaps       (%0, %5), %%xmm3          \n"             \
        "movaps       (%0, %6), %%xmm4          \n"             \
        "mulps          %%xmm5, %%xmm0          \n"             \
        "mulps          %%xmm6, %%xmm1          \n"             \
        "mulps          %%xmm5, %%xmm2          \n"             \
        "mulps          %%xmm7, %%xmm3          \n"             \
        "mulps          %%xmm7, %%xmm4          \n"             \
 stereo("addps          %%xmm1, %%xmm0          \n")            \
        "addps          %%xmm1, %%xmm2          \n"             \
        "addps          %%xmm3, %%xmm0          \n"             \
        "addps          %%xmm4, %%xmm2          \n"             \
   mono("addps          %%xmm2, %%xmm0          \n")            \
        "movaps         %%xmm0, (%0, %2)        \n"             \
 stereo("movaps         %%xmm2, (%0, %3)        \n")            \
        "add               $16, %0              \n"             \
        "jl                 1b                  \n"             \
        : "+&r"(i)                                              \
        : "r"(matrix),                                          \
          "r"(samples[0] + len),                                \
          "r"(samples[1] + len),                                \
          "r"(samples[2] + len),                                \
          "r"(samples[3] + len),                                \
          "r"(samples[4] + len)                                 \
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",      \
                      "%xmm4", "%xmm5", "%xmm6", "%xmm7",)      \
         "memory"                                               \
    );

/*
 * MIX_MISC(stereo): generic in-place downmix for an arbitrary matrix.
 * Pass IF1 to produce two output channels, IF0 to produce one.
 *
 * Expands to a statement that uses these names from the expansion site:
 *   i           - intptr_t byte offset, read and modified; +16 per outer
 *                 iteration, loop repeats while the result is negative
 *   j, k, m     - intptr_t scratch outputs (pointer, channel counter, pointer)
 *   matrix_simd - per-channel coefficients, 32 bytes per channel: four copies
 *                 of the first coefficient followed by four copies of the
 *                 second one
 *   samp        - array of channel pointers, each already advanced to the
 *                 end of its channel
 *   in_ch       - number of input channels
 *
 * Operand map: %0=i, %1=j, %2=k, %3=m, %4=matrix_simd + in_ch,
 * %5=-4 * (in_ch - 1), %6=samp + in_ch, %7=sizeof(float *),
 * %8=sizeof(float *) / 4.
 *
 * Outer loop, once per group of four samples:
 *   - k starts at -4 * (in_ch - 1); m = samp[0]
 *   - xmm0 (and xmm1 when stereo) = channel 0 samples times xmm4 (xmm5)
 *   - inner loop over channels 1 .. in_ch - 1, k stepping by 4 up to 0:
 *       j = channel pointer; samples times matrix_simd[ch] first (second)
 *       coefficient vector are accumulated into xmm0 (xmm1)
 *   - xmm0 is stored to channel 0; when stereo, xmm1 is stored to channel 1
 *
 * xmm4 and xmm5 are NOT loaded here: they must already contain the
 * broadcast first-row coefficients, as left behind by the asm statement in
 * ac3_downmix_sse() that fills matrix_simd.
 *
 * NOTE(review): no xmm register is named in the clobber list, and the
 * xmm4/xmm5 hand-over between two separate asm statements is not expressed
 * to the compiler - confirm this is acceptable for the supported compilers.
 * NOTE(review): the inner loop body runs before k is tested, so in_ch >= 2
 * appears to be assumed.
 */
#define MIX_MISC(stereo)                                        \
    __asm__ volatile (                                          \
        "mov              %5, %2            \n"                 \
        "1:                                 \n"                 \
        "mov -%c7(%6, %2, %c8), %3          \n"                 \
        "movaps     (%3, %0), %%xmm0        \n"                 \
 stereo("movaps       %%xmm0, %%xmm1        \n")                \
        "mulps        %%xmm4, %%xmm0        \n"                 \
 stereo("mulps        %%xmm5, %%xmm1        \n")                \
        "2:                                 \n"                 \
        "mov   (%6, %2, %c8), %1            \n"                 \
        "movaps     (%1, %0), %%xmm2        \n"                 \
 stereo("movaps       %%xmm2, %%xmm3        \n")                \
        "mulps   (%4, %2, 8), %%xmm2        \n"                 \
 stereo("mulps 16(%4, %2, 8), %%xmm3        \n")                \
        "addps        %%xmm2, %%xmm0        \n"                 \
 stereo("addps        %%xmm3, %%xmm1        \n")                \
        "add              $4, %2            \n"                 \
        "jl               2b                \n"                 \
        "mov              %5, %2            \n"                 \
 stereo("mov   (%6, %2, %c8), %1            \n")                \
        "movaps       %%xmm0, (%3, %0)      \n"                 \
 stereo("movaps       %%xmm1, (%1, %0)      \n")                \
        "add             $16, %0            \n"                 \
        "jl               1b                \n"                 \
        : "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m)                \
        : "r"(matrix_simd + in_ch),                             \
          "g"((intptr_t) - 4 * (in_ch - 1)),                    \
          "r"(samp + in_ch),                                    \
          "i"(sizeof(float *)), "i"(sizeof(float *)/4)          \
        : "memory"                                              \
    );

/**
 * Downmix in_ch planar float channels to one or two channels, in place,
 * using inline SSE.
 *
 * The result is written over samples[0] (and samples[1] when out_ch == 2).
 *
 * Three code paths exist:
 *  - 5 -> 2 with a "symmetric" matrix: MIX5 stereo variant;
 *  - 5 -> 1 with matching front and matching surround coefficients:
 *    MIX5 mono variant;
 *  - anything else: the matrix is expanded into 4-wide vectors and the
 *    generic MIX_MISC loop is used.
 *
 * The fast-path tests compare the coefficients bit-for-bit through an
 * integer view of the matrix, so e.g. +0.0f and -0.0f are treated as
 * different and simply select the generic path.
 *
 * @param samples channel pointers; every load/store is movaps, so each
 *                channel must be 16-byte aligned
 * @param matrix  matrix[ch][0] / matrix[ch][1] are the contributions of input
 *                channel ch to the first / second output channel
 * @param out_ch  2 selects the stereo variants, any other value the mono ones
 * @param in_ch   number of input channels
 * @param len     samples per channel; every loop handles 4 samples per
 *                iteration and runs its body before testing the counter, so
 *                a positive multiple of 4 is required
 */
static void ac3_downmix_sse(float **samples, float (*matrix)[2],
                            int out_ch, int in_ch, int len)
{
    /* Integer alias of the coefficients, used only for bitwise comparisons. */
    int (*matrix_cmp)[2] = (int(*)[2])matrix;
    intptr_t i, j, k, m;

    /* Negative byte offset from the end of each channel, counted up to 0. */
    i = -len * sizeof(float);
    if (in_ch == 5 && out_ch == 2 &&
        !(matrix_cmp[0][1] | matrix_cmp[2][0]   |
          matrix_cmp[3][1] | matrix_cmp[4][0]   |
          (matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
          (matrix_cmp[0][0] ^ matrix_cmp[2][1]) |
          /* MIX5 scales channel 3 and channel 4 with the same coefficient
           * (matrix[3][0]); without this term a matrix whose two surround
           * gains differ would be mixed with the wrong gain on channel 4. */
          (matrix_cmp[3][0] ^ matrix_cmp[4][1]))) {
        MIX5(IF0, IF1);
    } else if (in_ch == 5 && out_ch == 1 &&
               matrix_cmp[0][0] == matrix_cmp[2][0] &&
               matrix_cmp[3][0] == matrix_cmp[4][0]) {
        MIX5(IF1, IF0);
    } else {
        DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
        float *samp[AC3_MAX_CHANNELS];

        /* MIX_MISC indexes the channels backwards from their end. */
        for (j = 0; j < in_ch; j++)
            samp[j] = samples[j] + len;

        /*
         * Broadcast every coefficient pair into matrix_simd, walking the
         * matrix from the last channel down to channel 0. The final
         * iteration leaves the channel-0 coefficients in xmm4/xmm5, which
         * MIX_MISC then uses without reloading them.
         */
        j = 2 * in_ch * sizeof(float);
        __asm__ volatile (
            "1:                                 \n"
            "sub             $8, %0             \n"
            "movss     (%2, %0), %%xmm4         \n"
            "movss    4(%2, %0), %%xmm5         \n"
            "shufps          $0, %%xmm4, %%xmm4 \n"
            "shufps          $0, %%xmm5, %%xmm5 \n"
            "movaps      %%xmm4,   (%1, %0, 4)  \n"
            "movaps      %%xmm5, 16(%1, %0, 4)  \n"
            "jg              1b                 \n"
            : "+&r"(j)
            : "r"(matrix_simd), "r"(matrix)
            : "memory"
        );
        if (out_ch == 2) {
            MIX_MISC(IF1);
        } else {
            MIX_MISC(IF0);
        }
    }
}

#endif /* HAVE_SSE_INLINE && HAVE_7REGS */

/**
 * Override the function pointers of an AC-3 DSP context with the x86
 * implementations usable on the running CPU.
 *
 * The instruction sets are probed in a fixed order (MMX, 3DNow!, MMXEXT,
 * SSE, SSE2, SSSE3); whenever several of them provide the same routine the
 * one probed last wins.
 *
 * @param c         context to update
 * @param bit_exact when non-zero the 3DNow! float_to_fixed24 is not installed
 */
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
    const int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        c->ac3_exponent_min      = ff_ac3_exponent_min_mmx;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
        c->ac3_lshift_int16      = ff_ac3_lshift_int16_mmx;
        c->ac3_rshift_int32      = ff_ac3_rshift_int32_mmx;
    }

    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
        c->extract_exponents = ff_ac3_extract_exponents_3dnow;
        if (!bit_exact)
            c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->ac3_exponent_min      = ff_ac3_exponent_min_mmxext;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
    }

    if (EXTERNAL_SSE(cpu_flags))
        c->float_to_fixed24 = ff_float_to_fixed24_sse;

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->ac3_exponent_min      = ff_ac3_exponent_min_sse2;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
        c->float_to_fixed24      = ff_float_to_fixed24_sse2;
        c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
        c->extract_exponents     = ff_ac3_extract_exponents_sse2;
        /* The SSE2 shifts are skipped on CPUs flagged as slow for SSE2;
         * whatever was selected above stays in place there. */
        if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
            c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
            c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
        /* CPUs flagged as Atom keep the previously selected routine. */
        if (!(cpu_flags & AV_CPU_FLAG_ATOM))
            c->extract_exponents = ff_ac3_extract_exponents_ssse3;
    }

#if HAVE_SSE_INLINE && HAVE_7REGS
    /* The downmix is inline asm, hence the INLINE_ rather than EXTERNAL_ check. */
    if (INLINE_SSE(cpu_flags))
        c->downmix = ac3_downmix_sse;
#endif
}
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-10 19:20:36 +02:00			`/*`
			`* x86-optimized AC-3 DSP utils`
			`* Copyright (c) 2011 Justin Ruggles`
			`*`
			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* FFmpeg is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with FFmpeg; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`#include "libavutil/mem.h"`
x86: rename libavutil/x86_cpu.h to libavutil/x86/asm.h This puts x86-specific things in the x86/ subdirectory where they belong. Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-08-08 15:51:52 +03:00			`#include "libavutil/x86/asm.h"`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 20:01:05 +03:00			`#include "libavutil/x86/cpu.h"`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-10 19:20:36 +02:00			`#include "dsputil_mmx.h"`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`#include "libavcodec/ac3.h"`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-10 19:20:36 +02:00			`#include "libavcodec/ac3dsp.h"`

			`extern void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs);`
			`extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);`
			`extern void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs);`

ac3dsp: simplify x86 versions of ac3_max_msb_abs_int16 Simplifies the code by using cpuflags and a new macro. Also fixes the invalid use of the MMX2 pshufw operation in the MMX-only function. 2012-05-14 22:56:39 +03:00			`extern int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len);`
x86: mmx2 ---> mmxext in asm constructs 2012-07-10 01:04:18 +03:00			`extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);`
ac3dsp: simplify x86 versions of ac3_max_msb_abs_int16 Simplifies the code by using cpuflags and a new macro. Also fixes the invalid use of the MMX2 pshufw operation in the MMX-only function. 2012-05-14 22:56:39 +03:00			`extern int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);`
			`extern int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len);`
ac3enc: Add x86-optimized function to speed up log2_tab(). AC3DSPContext.ac3_max_msb_abs_int16() finds the maximum MSB of the absolute value of each element in an array of int16_t. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-13 21:49:50 +02:00
ac3enc: add SIMD-optimized shifting functions for use with the fixed-point AC3 encoder. 2011-03-11 23:45:01 +02:00			`extern void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);`
			`extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);`

			`extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);`
			`extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);`

ac3enc: add float_to_fixed24() with x86-optimized versions to AC3DSPContext and use in scale_coefficients() for the floating-point AC-3 encoder. 2011-03-16 04:29:04 +02:00			`extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);`
			`extern void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len);`
			`extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);`

ac3enc: modify mantissa bit counting to keep bap counts for all values of bap instead of just 0 to 4. This does all the actual bit counting as a final step. 2011-05-26 22:53:25 +03:00			`extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);`

ac3dsp: add x86-optimized versions of ac3dsp.extract_exponents(). 2011-07-01 00:48:44 +03:00			`extern void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);`
			`extern void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);`
			`extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);`

x86/ac3dsp_init: try to workaround ICC failure. The asm code is not valid for older compilers as it uses too many operands, ICC on x86_32 seems affected by this. This patch disables the affected code for ICC on x86_32 and should make it compileable again. A better fix would be to use fewer operands or to change this code to yasm, later is being worked on AFAIK so this is a temporary solution. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2012-12-20 19:09:09 +03:00			`#if ARCH_X86_32 && defined(__INTEL_COMPILER)`
			`# undef HAVE_7REGS`
			`# define HAVE_7REGS 0`
			`#endif`

ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`#if HAVE_SSE_INLINE && HAVE_7REGS`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00
			`#define IF1(x) x`
			`#define IF0(x)`

			`#define MIX5(mono, stereo) \`
			`__asm__ volatile ( \`
ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`"movss 0(%1), %%xmm5 \n" \`
			`"movss 8(%1), %%xmm6 \n" \`
			`"movss 24(%1), %%xmm7 \n" \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`"shufps $0, %%xmm5, %%xmm5 \n" \`
			`"shufps $0, %%xmm6, %%xmm6 \n" \`
			`"shufps $0, %%xmm7, %%xmm7 \n" \`
			`"1: \n" \`
ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`"movaps (%0, %2), %%xmm0 \n" \`
			`"movaps (%0, %3), %%xmm1 \n" \`
			`"movaps (%0, %4), %%xmm2 \n" \`
			`"movaps (%0, %5), %%xmm3 \n" \`
			`"movaps (%0, %6), %%xmm4 \n" \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`"mulps %%xmm5, %%xmm0 \n" \`
			`"mulps %%xmm6, %%xmm1 \n" \`
			`"mulps %%xmm5, %%xmm2 \n" \`
			`"mulps %%xmm7, %%xmm3 \n" \`
			`"mulps %%xmm7, %%xmm4 \n" \`
			`stereo("addps %%xmm1, %%xmm0 \n") \`
			`"addps %%xmm1, %%xmm2 \n" \`
			`"addps %%xmm3, %%xmm0 \n" \`
			`"addps %%xmm4, %%xmm2 \n" \`
			`mono("addps %%xmm2, %%xmm0 \n") \`
ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`"movaps %%xmm0, (%0, %2) \n" \`
			`stereo("movaps %%xmm2, (%0, %3) \n") \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`"add $16, %0 \n" \`
			`"jl 1b \n" \`
			`: "+&r"(i) \`
ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`: "r"(matrix), \`
			`"r"(samples[0] + len), \`
			`"r"(samples[1] + len), \`
			`"r"(samples[2] + len), \`
			`"r"(samples[3] + len), \`
			`"r"(samples[4] + len) \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \`
			`"%xmm4", "%xmm5", "%xmm6", "%xmm7",) \`
			`"memory" \`
			`);`

			`#define MIX_MISC(stereo) \`
			`__asm__ volatile ( \`
ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`"mov %5, %2 \n" \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`"1: \n" \`
ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`"mov -%c7(%6, %2, %c8), %3 \n" \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`"movaps (%3, %0), %%xmm0 \n" \`
			`stereo("movaps %%xmm0, %%xmm1 \n") \`
			`"mulps %%xmm4, %%xmm0 \n" \`
			`stereo("mulps %%xmm5, %%xmm1 \n") \`
			`"2: \n" \`
ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`"mov (%6, %2, %c8), %1 \n" \`
			`"movaps (%1, %0), %%xmm2 \n" \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`stereo("movaps %%xmm2, %%xmm3 \n") \`
ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`"mulps (%4, %2, 8), %%xmm2 \n" \`
			`stereo("mulps 16(%4, %2, 8), %%xmm3 \n") \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`"addps %%xmm2, %%xmm0 \n" \`
			`stereo("addps %%xmm3, %%xmm1 \n") \`
ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`"add $4, %2 \n" \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`"jl 2b \n" \`
ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`"mov %5, %2 \n" \`
			`stereo("mov (%6, %2, %c8), %1 \n") \`
			`"movaps %%xmm0, (%3, %0) \n" \`
			`stereo("movaps %%xmm1, (%1, %0) \n") \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`"add $16, %0 \n" \`
			`"jl 1b \n" \`
ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`: "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m) \`
			`: "r"(matrix_simd + in_ch), \`
			`"g"((intptr_t) - 4 * (in_ch - 1)), \`
			`"r"(samp + in_ch), \`
			`"i"(sizeof(float *)), "i"(sizeof(float *)/4) \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`: "memory" \`
			`);`

ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`static void ac3_downmix_sse(float **samples, float (*matrix)[2],`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`int out_ch, int in_ch, int len)`
			`{`
			`int (*matrix_cmp)[2] = (int(*)[2])matrix;`
ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`intptr_t i, j, k, m;`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00
			`i = -len * sizeof(float);`
			`if (in_ch == 5 && out_ch == 2 &&`
			`!(matrix_cmp[0][1] | matrix_cmp[2][0] |`
			`matrix_cmp[3][1] | matrix_cmp[4][0] |`
			`(matrix_cmp[1][0] ^ matrix_cmp[1][1]) |`
			`(matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {`
			`MIX5(IF0, IF1);`
			`} else if (in_ch == 5 && out_ch == 1 &&`
			`matrix_cmp[0][0] == matrix_cmp[2][0] &&`
			`matrix_cmp[3][0] == matrix_cmp[4][0]) {`
			`MIX5(IF1, IF0);`
			`} else {`
			`DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];`
ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`float *samp[AC3_MAX_CHANNELS];`

			`for (j = 0; j < in_ch; j++)`
			`samp[j] = samples[j] + len;`

ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`j = 2 * in_ch * sizeof(float);`
			`__asm__ volatile (`
			`"1: \n"`
			`"sub $8, %0 \n"`
			`"movss (%2, %0), %%xmm4 \n"`
			`"movss 4(%2, %0), %%xmm5 \n"`
			`"shufps $0, %%xmm4, %%xmm4 \n"`
			`"shufps $0, %%xmm5, %%xmm5 \n"`
			`"movaps %%xmm4, (%1, %0, 4) \n"`
			`"movaps %%xmm5, 16(%1, %0, 4) \n"`
			`"jg 1b \n"`
			`: "+&r"(j)`
			`: "r"(matrix_simd), "r"(matrix)`
			`: "memory"`
			`);`
			`if (out_ch == 2) {`
			`MIX_MISC(IF1);`
			`} else {`
			`MIX_MISC(IF0);`
			`}`
			`}`
			`}`

ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`#endif /* HAVE_SSE_INLINE && HAVE_7REGS */`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00
ac3enc: add float_to_fixed24() with x86-optimized versions to AC3DSPContext and use in scale_coefficients() for the floating-point AC-3 encoder. 2011-03-16 04:29:04 +02:00			`av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-10 19:20:36 +02:00			`{`
			`int mm_flags = av_get_cpu_flags();`

x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 20:01:05 +03:00			`if (EXTERNAL_MMX(mm_flags)) {`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-10 19:20:36 +02:00			`c->ac3_exponent_min = ff_ac3_exponent_min_mmx;`
ac3enc: Add x86-optimized function to speed up log2_tab(). AC3DSPContext.ac3_max_msb_abs_int16() finds the maximum MSB of the absolute value of each element in an array of int16_t. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-13 21:49:50 +02:00			`c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;`
ac3enc: add SIMD-optimized shifting functions for use with the fixed-point AC3 encoder. 2011-03-11 23:45:01 +02:00			`c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;`
			`c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-10 19:20:36 +02:00			`}`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 20:01:05 +03:00			`if (EXTERNAL_AMD3DNOW(mm_flags)) {`
ac3dsp: add x86-optimized versions of ac3dsp.extract_exponents(). 2011-07-01 00:48:44 +03:00			`c->extract_exponents = ff_ac3_extract_exponents_3dnow;`
ac3enc: add float_to_fixed24() with x86-optimized versions to AC3DSPContext and use in scale_coefficients() for the floating-point AC-3 encoder. 2011-03-16 04:29:04 +02:00			`if (!bit_exact) {`
			`c->float_to_fixed24 = ff_float_to_fixed24_3dnow;`
			`}`
			`}`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 20:01:05 +03:00			`if (EXTERNAL_MMXEXT(mm_flags)) {`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-10 19:20:36 +02:00			`c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;`
x86: mmx2 ---> mmxext in asm constructs 2012-07-10 01:04:18 +03:00			`c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-10 19:20:36 +02:00			`}`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 20:01:05 +03:00			`if (EXTERNAL_SSE(mm_flags)) {`
ac3enc: add float_to_fixed24() with x86-optimized versions to AC3DSPContext and use in scale_coefficients() for the floating-point AC-3 encoder. 2011-03-16 04:29:04 +02:00			`c->float_to_fixed24 = ff_float_to_fixed24_sse;`
			`}`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 20:01:05 +03:00			`if (EXTERNAL_SSE2(mm_flags)) {`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-10 19:20:36 +02:00			`c->ac3_exponent_min = ff_ac3_exponent_min_sse2;`
ac3enc: Add x86-optimized function to speed up log2_tab(). AC3DSPContext.ac3_max_msb_abs_int16() finds the maximum MSB of the absolute value of each element in an array of int16_t. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-13 21:49:50 +02:00			`c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;`
ac3enc: add float_to_fixed24() with x86-optimized versions to AC3DSPContext and use in scale_coefficients() for the floating-point AC-3 encoder. 2011-03-16 04:29:04 +02:00			`c->float_to_fixed24 = ff_float_to_fixed24_sse2;`
ac3enc: modify mantissa bit counting to keep bap counts for all values of bap instead of just 0 to 4. This does all the actual bit counting as a final step. 2011-05-26 22:53:25 +03:00			`c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;`
ac3dsp: add x86-optimized versions of ac3dsp.extract_exponents(). 2011-07-01 00:48:44 +03:00			`c->extract_exponents = ff_ac3_extract_exponents_sse2;`
ac3enc: add SIMD-optimized shifting functions for use with the fixed-point AC3 encoder. 2011-03-11 23:45:01 +02:00			`if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {`
			`c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;`
			`c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;`
			`}`
ac3enc: Add x86-optimized function to speed up log2_tab(). AC3DSPContext.ac3_max_msb_abs_int16() finds the maximum MSB of the absolute value of each element in an array of int16_t. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-13 21:49:50 +02:00			`}`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 20:01:05 +03:00			`if (EXTERNAL_SSSE3(mm_flags)) {`
ac3enc: Add x86-optimized function to speed up log2_tab(). AC3DSPContext.ac3_max_msb_abs_int16() finds the maximum MSB of the absolute value of each element in an array of int16_t. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-13 21:49:50 +02:00			`c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;`
ac3dsp: add x86-optimized versions of ac3dsp.extract_exponents(). 2011-07-01 00:48:44 +03:00			`if (!(mm_flags & AV_CPU_FLAG_ATOM)) {`
			`c->extract_exponents = ff_ac3_extract_exponents_ssse3;`
			`}`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-10 19:20:36 +02:00			`}`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00
ac3dec: make downmix() take array of pointers to channel data 2012-09-13 21:35:18 +03:00			`#if HAVE_SSE_INLINE && HAVE_7REGS`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-09-11 19:55:11 +03:00			`if (INLINE_SSE(mm_flags)) {`
			`c->downmix = ac3_downmix_sse;`
			`}`
x86: ac3dsp: Only refer to the ac3_downmix_sse symbol if it has been declared This fixes building without inline assembly. Signed-off-by: Martin Storsjö <martin@martin.st> 2012-09-13 11:18:25 +03:00			`#endif`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 2011-02-10 19:20:36 +02:00			`}`