FFmpeg/libavcodec/arm/dca.h

/*
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_ARM_DCA_H
#define AVCODEC_ARM_DCA_H

#include <stdint.h>

#include "config.h"
#include "libavcodec/mathops.h"

#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) && !CONFIG_THUMB

/*
 * NOTE(review): the self-referential define presumably lets the generic DCA
 * code detect with #ifdef that an ARM-specific version exists -- confirm
 * against the file that includes this header.
 */
#define decode_blockcodes decode_blockcodes
/**
 * Split two block codes into four values each, using ARMv6 multiplies
 * instead of divisions.
 *
 * Each code goes through the same four-step chain; one step is:
 *     q     = high 32 bits of (code * ff_inverse[levels])      (smmul)
 *     value = code + q * (-levels) - ((levels - 1) >> 1)       (smlabb, sub)
 *     code  = q
 * The values derived from code1 are stored to values[0..3], those derived
 * from code2 to values[4..7].
 *
 * smlabb multiplies only the bottom signed 16 bits of its two factors, so
 * the sequence relies on every quotient and on -levels fitting in 16 bits.
 * NOTE(review): ff_inverse is declared outside this file; presumably it is a
 * table of 32-bit fixed-point reciprocals -- confirm in libavcodec/mathops.h.
 *
 * @param code1  first block code
 * @param code2  second block code
 * @param levels divisor for each step; also used as index into ff_inverse
 * @param values output array, eight int32_t entries are written
 * @return bitwise OR of the two final (fourth) quotients.
 *         NOTE(review): presumably non-zero flags an invalid code -- confirm
 *         with the caller.
 */
static inline int decode_blockcodes(int code1, int code2, int levels,
                                    int32_t *values)
{
    /* Scratch for the intermediate quotients: v0-v2 serve the code1 chain,
     * v3-v5 the code2 chain. Their values are not used after the asm. */
    int32_t v0, v1, v2, v3, v4, v5;

    /*
     * Operand map (all twelve are register operands):
     *   %0-%2  v0-v2          %3-%5  v3-v5
     *   %6     code1 (in/out) %7     code2 (in/out)
     *   %8     levels - 1     %9     -levels
     *   %10    ff_inverse[levels]
     *   %11    values
     * Instructions for the two chains alternate line by line. The store
     * offsets are byte offsets: #0..#12 for code1, #16..#28 for code2.
     */
    __asm__ ("smmul   %0,  %6,  %10           \n"
             "smmul   %3,  %7,  %10           \n"
             "smlabb  %6,  %0,  %9,  %6       \n"
             "smlabb  %7,  %3,  %9,  %7       \n"
             "smmul   %1,  %0,  %10           \n"
             "smmul   %4,  %3,  %10           \n"
             "sub     %6,  %6,  %8,  lsr #1   \n"
             "sub     %7,  %7,  %8,  lsr #1   \n"
             "smlabb  %0,  %1,  %9,  %0       \n"
             "smlabb  %3,  %4,  %9,  %3       \n"
             "smmul   %2,  %1,  %10           \n"
             "smmul   %5,  %4,  %10           \n"
             "str     %6,  [%11, #0]          \n"
             "str     %7,  [%11, #16]         \n"
             "sub     %0,  %0,  %8,  lsr #1   \n"
             "sub     %3,  %3,  %8,  lsr #1   \n"
             "smlabb  %1,  %2,  %9,  %1       \n"
             "smlabb  %4,  %5,  %9,  %4       \n"
             /* %6/%7 were stored above and are free again: they now receive
              * the fourth quotient of each chain, which is the return value. */
             "smmul   %6,  %2,  %10           \n"
             "smmul   %7,  %5,  %10           \n"
             "str     %0,  [%11, #4]          \n"
             "str     %3,  [%11, #20]         \n"
             "sub     %1,  %1,  %8,  lsr #1   \n"
             "sub     %4,  %4,  %8,  lsr #1   \n"
             "smlabb  %2,  %6,  %9,  %2       \n"
             "smlabb  %5,  %7,  %9,  %5       \n"
             "str     %1,  [%11, #8]          \n"
             "str     %4,  [%11, #24]         \n"
             "sub     %2,  %2,  %8,  lsr #1   \n"
             "sub     %5,  %5,  %8,  lsr #1   \n"
             "str     %2,  [%11, #12]         \n"
             "str     %5,  [%11, #28]         \n"
             /* Early-clobber outputs: the scratch registers are written
              * while inputs are still live, so they must not share a
              * register with any input. */
             : "=&r"(v0), "=&r"(v1), "=&r"(v2),
               "=&r"(v3), "=&r"(v4), "=&r"(v5),
               "+&r"(code1), "+&r"(code2)
             : "r"(levels - 1), "r"(-levels),
               "r"(ff_inverse[levels]), "r"(values)
             /* The str instructions write through values, which is passed
              * only as a register; the clobber tells the compiler so. */
             : "memory");

    /* code1 and code2 now hold the final quotient of their chain. */
    return code1 | code2;
}

#endif

#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y

#define int8x8_fmul_int32 int8x8_fmul_int32
static inline void int8x8_fmul_int32(float *dst, const int8_t *src, int scale)
{
    __asm__ ("vcvt.f32.s32 %2,  %2,  #4         \n"
             "vld1.8       {d0},     [%1,:64]   \n"
             "vmovl.s8     q0,  d0              \n"
             "vmovl.s16    q1,  d1              \n"
             "vmovl.s16    q0,  d0              \n"
             "vcvt.f32.s32 q0,  q0              \n"
             "vcvt.f32.s32 q1,  q1              \n"
             "vmul.f32     q0,  q0,  %y2        \n"
             "vmul.f32     q1,  q1,  %y2        \n"
             "vst1.32      {q0-q1},  [%m0,:128] \n"
             : "=Um"(*(float (*)[8])dst)
             : "r"(src), "x"(scale)
             : "d0", "d1", "d2", "d3");
}

#endif

#endif /* AVCODEC_ARM_DCA_H */
dca: NEON optimised high freq VQ decoding Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-09-28 23:53:03 +03:00			`/*`
			`* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>`
			`*`
Reinstate proper FFmpeg license for all files. 2013-08-16 00:12:51 +03:00			`* This file is part of FFmpeg.`
dca: NEON optimised high freq VQ decoding Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-09-28 23:53:03 +03:00			`*`
Reinstate proper FFmpeg license for all files. 2013-08-16 00:12:51 +03:00			`* FFmpeg is free software; you can redistribute it and/or`
dca: NEON optimised high freq VQ decoding Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-09-28 23:53:03 +03:00			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
Reinstate proper FFmpeg license for all files. 2013-08-16 00:12:51 +03:00			`* FFmpeg is distributed in the hope that it will be useful,`
dca: NEON optimised high freq VQ decoding Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-09-28 23:53:03 +03:00			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
Reinstate proper FFmpeg license for all files. 2013-08-16 00:12:51 +03:00			`* License along with FFmpeg; if not, write to the Free Software`
dca: NEON optimised high freq VQ decoding Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-09-28 23:53:03 +03:00			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`#ifndef AVCODEC_ARM_DCA_H`
			`#define AVCODEC_ARM_DCA_H`

			`#include <stdint.h>`
Fix a number of incorrect intmath.h #includes. 2013-02-26 00:39:46 +03:00
dca: NEON optimised high freq VQ decoding Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-09-28 23:53:03 +03:00			`#include "config.h"`
Fix a number of incorrect intmath.h #includes. 2013-02-26 00:39:46 +03:00			`#include "libavcodec/mathops.h"`
dca: ARMv6 optimised decode_blockcode() This is a hand-tuned version of the code with impossible parts of the FASTDIV function omitted. 2-5% faster overall on Cortex-A8. Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-10-23 19:39:49 +03:00
Merge commit '637606de2d2e0af0a9fa2f23f943765d7d7c5cd5' * commit '637606de2d2e0af0a9fa2f23f943765d7d7c5cd5': configure: arm: make _inline arch ext symbols depend on inline_asm arm: use HAVE*_INLINE/EXTERNAL macros for conditional compilation Conflicts: configure libavcodec/arm/dca.h Merged-by: Michael Niedermayer <michaelni@gmx.at> 2012-12-08 16:19:55 +03:00			`#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) && !CONFIG_THUMB`
dca: ARMv6 optimised decode_blockcode() This is a hand-tuned version of the code with impossible parts of the FASTDIV function omitted. 2-5% faster overall on Cortex-A8. Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-10-23 19:39:49 +03:00
			`#define decode_blockcodes decode_blockcodes`
			`static inline int decode_blockcodes(int code1, int code2, int levels,`
fmtconvert: int32_t input to int32_to_float_fmul_scalar It was previously declared as int. Does not change fate results for x86. Conflicts: libavcodec/ppc/fmtconvert_altivec.c Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2012-12-28 00:33:51 +03:00			`int32_t *values)`
dca: ARMv6 optimised decode_blockcode() This is a hand-tuned version of the code with impossible parts of the FASTDIV function omitted. 2-5% faster overall on Cortex-A8. Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-10-23 19:39:49 +03:00			`{`
fmtconvert: int32_t input to int32_to_float_fmul_scalar It was previously declared as int. Does not change fate results for x86. Conflicts: libavcodec/ppc/fmtconvert_altivec.c Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2012-12-28 00:33:51 +03:00			`int32_t v0, v1, v2, v3, v4, v5;`
dca: ARMv6 optimised decode_blockcode() This is a hand-tuned version of the code with impossible parts of the FASTDIV function omitted. 2-5% faster overall on Cortex-A8. Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-10-23 19:39:49 +03:00
Fix compilation on ARM with android gcc 4.7 With the current code it fails due to running out of registers. So code the store offsets manually into the assembler instead. Passes "make fate-dts". Signed-off-by: Reimar Döffinger <Reimar.Doeffinger@gmx.de> 2013-03-16 15:36:20 +03:00			`__asm__ ("smmul %0, %6, %10 \n"`
			`"smmul %3, %7, %10 \n"`
			`"smlabb %6, %0, %9, %6 \n"`
			`"smlabb %7, %3, %9, %7 \n"`
			`"smmul %1, %0, %10 \n"`
			`"smmul %4, %3, %10 \n"`
			`"sub %6, %6, %8, lsr #1 \n"`
			`"sub %7, %7, %8, lsr #1 \n"`
			`"smlabb %0, %1, %9, %0 \n"`
			`"smlabb %3, %4, %9, %3 \n"`
			`"smmul %2, %1, %10 \n"`
			`"smmul %5, %4, %10 \n"`
			`"str %6, [%11, #0] \n"`
			`"str %7, [%11, #16] \n"`
			`"sub %0, %0, %8, lsr #1 \n"`
			`"sub %3, %3, %8, lsr #1 \n"`
			`"smlabb %1, %2, %9, %1 \n"`
			`"smlabb %4, %5, %9, %4 \n"`
			`"smmul %6, %2, %10 \n"`
			`"smmul %7, %5, %10 \n"`
			`"str %0, [%11, #4] \n"`
			`"str %3, [%11, #20] \n"`
			`"sub %1, %1, %8, lsr #1 \n"`
			`"sub %4, %4, %8, lsr #1 \n"`
			`"smlabb %2, %6, %9, %2 \n"`
			`"smlabb %5, %7, %9, %5 \n"`
			`"str %1, [%11, #8] \n"`
			`"str %4, [%11, #24] \n"`
			`"sub %2, %2, %8, lsr #1 \n"`
			`"sub %5, %5, %8, lsr #1 \n"`
			`"str %2, [%11, #12] \n"`
			`"str %5, [%11, #28] \n"`
			`: "=&r"(v0), "=&r"(v1), "=&r"(v2),`
dca: ARMv6 optimised decode_blockcode() This is a hand-tuned version of the code with impossible parts of the FASTDIV function omitted. 2-5% faster overall on Cortex-A8. Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-10-23 19:39:49 +03:00			`"=&r"(v3), "=&r"(v4), "=&r"(v5),`
			`"+&r"(code1), "+&r"(code2)`
Fix compilation on ARM with android gcc 4.7 With the current code it fails due to running out of registers. So code the store offsets manually into the assembler instead. Passes "make fate-dts". Signed-off-by: Reimar Döffinger <Reimar.Doeffinger@gmx.de> 2013-03-16 15:36:20 +03:00			`: "r"(levels - 1), "r"(-levels),`
			`"r"(ff_inverse[levels]), "r"(values)`
			`: "memory");`
dca: ARMv6 optimised decode_blockcode() This is a hand-tuned version of the code with impossible parts of the FASTDIV function omitted. 2-5% faster overall on Cortex-A8. Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-10-23 19:39:49 +03:00
			`return code1 | code2;`
			`}`

			`#endif`
dca: NEON optimised high freq VQ decoding Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-09-28 23:53:03 +03:00
arm: use HAVE*_INLINE/EXTERNAL macros for conditional compilation These macros reflect the actual capabilities required here. Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-12-01 19:41:39 +03:00			`#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y`
dca: NEON optimised high freq VQ decoding Signed-off-by: Mans Rullgard <mans@mansr.com> 2011-09-28 23:53:03 +03:00
			`#define int8x8_fmul_int32 int8x8_fmul_int32`
			`static inline void int8x8_fmul_int32(float *dst, const int8_t *src, int scale)`
			`{`
			`__asm__ ("vcvt.f32.s32 %2, %2, #4 \n"`
			`"vld1.8 {d0}, [%1,:64] \n"`
			`"vmovl.s8 q0, d0 \n"`
			`"vmovl.s16 q1, d1 \n"`
			`"vmovl.s16 q0, d0 \n"`
			`"vcvt.f32.s32 q0, q0 \n"`
			`"vcvt.f32.s32 q1, q1 \n"`
			`"vmul.f32 q0, q0, %y2 \n"`
			`"vmul.f32 q1, q1, %y2 \n"`
			`"vst1.32 {q0-q1}, [%m0,:128] \n"`
			`: "=Um"(*(float (*)[8])dst)`
			`: "r"(src), "x"(scale)`
			`: "d0", "d1", "d2", "d3");`
			`}`

			`#endif`

			`#endif /* AVCODEC_ARM_DCA_H */`