diff --git a/libavcodec/liba52/a52_internal.h b/libavcodec/liba52/a52_internal.h index a158227699..5235704ffe 100644 --- a/libavcodec/liba52/a52_internal.h +++ b/libavcodec/liba52/a52_internal.h @@ -118,6 +118,8 @@ void a52_upmix (sample_t * samples, int acmod, int output); void a52_imdct_init (uint32_t mm_accel); void a52_imdct_256 (sample_t * data, sample_t * delay, sample_t bias); void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias); +//extern void (* a52_imdct_256) (sample_t data[], sample_t delay[], sample_t bias); +//extern void (* a52_imdct_512) (sample_t data[], sample_t delay[], sample_t bias); #define ROUND(x) ((int)((x) + ((x) > 0 ? 0.5 : -0.5))) diff --git a/libavcodec/liba52/a52_util.h b/libavcodec/liba52/a52_util.h new file mode 100644 index 0000000000..121393ec19 --- /dev/null +++ b/libavcodec/liba52/a52_util.h @@ -0,0 +1,32 @@ +/* + * a52_util.h + * Copyright (C) 2000-2003 Michel Lespinasse + * Copyright (C) 1999-2000 Aaron Holtzman + * + * This file is part of a52dec, a free ATSC A-52 stream decoder. + * See http://liba52.sourceforge.net/ for updates. + * + * a52dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * a52dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef A52_UTIL_H +#define A52_UTIL_H +/* CRC-16 of a data block (table-driven, implemented in crc.c). */ +uint16_t a52_crc16_block(uint8_t *data,uint32_t num_bytes); +/* a52_resample_init() picks a converter for the given flags and channel count, stores it in a52_resample and returns it (NULL if none exists); a52_resample() then converts the float samples in _f to 16-bit samples in s16 and returns how many it wrote. */ +void* a52_resample_init(uint32_t mm_accel,int flags,int chans); +extern int (* a52_resample) (float * _f, int16_t * s16); + +#endif /* A52_UTIL_H */ diff --git a/libavcodec/liba52/crc.c b/libavcodec/liba52/crc.c new file mode 100644 index 0000000000..6698155bd4 --- /dev/null +++ b/libavcodec/liba52/crc.c @@ -0,0 +1,73 @@ +/* + * crc.c + * + * Copyright (C) Aaron Holtzman - May 1999 + * + * This file is part of ac3dec, a free Dolby AC-3 stream decoder. + * + * ac3dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * ac3dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include <stdlib.h> +#include <stdio.h> +#include <inttypes.h> + +static const uint16_t crc_lut[256] = +{ + 0x0000,0x8005,0x800f,0x000a,0x801b,0x001e,0x0014,0x8011, + 0x8033,0x0036,0x003c,0x8039,0x0028,0x802d,0x8027,0x0022, + 0x8063,0x0066,0x006c,0x8069,0x0078,0x807d,0x8077,0x0072, + 0x0050,0x8055,0x805f,0x005a,0x804b,0x004e,0x0044,0x8041, + 0x80c3,0x00c6,0x00cc,0x80c9,0x00d8,0x80dd,0x80d7,0x00d2, + 0x00f0,0x80f5,0x80ff,0x00fa,0x80eb,0x00ee,0x00e4,0x80e1, + 0x00a0,0x80a5,0x80af,0x00aa,0x80bb,0x00be,0x00b4,0x80b1, + 0x8093,0x0096,0x009c,0x8099,0x0088,0x808d,0x8087,0x0082, + 0x8183,0x0186,0x018c,0x8189,0x0198,0x819d,0x8197,0x0192, + 0x01b0,0x81b5,0x81bf,0x01ba,0x81ab,0x01ae,0x01a4,0x81a1, + 0x01e0,0x81e5,0x81ef,0x01ea,0x81fb,0x01fe,0x01f4,0x81f1, + 0x81d3,0x01d6,0x01dc,0x81d9,0x01c8,0x81cd,0x81c7,0x01c2, + 0x0140,0x8145,0x814f,0x014a,0x815b,0x015e,0x0154,0x8151, + 0x8173,0x0176,0x017c,0x8179,0x0168,0x816d,0x8167,0x0162, + 0x8123,0x0126,0x012c,0x8129,0x0138,0x813d,0x8137,0x0132, + 0x0110,0x8115,0x811f,0x011a,0x810b,0x010e,0x0104,0x8101, + 0x8303,0x0306,0x030c,0x8309,0x0318,0x831d,0x8317,0x0312, + 0x0330,0x8335,0x833f,0x033a,0x832b,0x032e,0x0324,0x8321, + 0x0360,0x8365,0x836f,0x036a,0x837b,0x037e,0x0374,0x8371, + 0x8353,0x0356,0x035c,0x8359,0x0348,0x834d,0x8347,0x0342, + 0x03c0,0x83c5,0x83cf,0x03ca,0x83db,0x03de,0x03d4,0x83d1, + 0x83f3,0x03f6,0x03fc,0x83f9,0x03e8,0x83ed,0x83e7,0x03e2, + 0x83a3,0x03a6,0x03ac,0x83a9,0x03b8,0x83bd,0x83b7,0x03b2, + 0x0390,0x8395,0x839f,0x039a,0x838b,0x038e,0x0384,0x8381, + 0x0280,0x8285,0x828f,0x028a,0x829b,0x029e,0x0294,0x8291, + 0x82b3,0x02b6,0x02bc,0x82b9,0x02a8,0x82ad,0x82a7,0x02a2, + 0x82e3,0x02e6,0x02ec,0x82e9,0x02f8,0x82fd,0x82f7,0x02f2, + 0x02d0,0x82d5,0x82df,0x02da,0x82cb,0x02ce,0x02c4,0x82c1, + 0x8243,0x0246,0x024c,0x8249,0x0258,0x825d,0x8257,0x0252, + 0x0270,0x8275,0x827f,0x027a,0x826b,0x026e,0x0264,0x8261, + 0x0220,0x8225,0x822f,0x022a,0x823b,0x023e,0x0234,0x8231, + 0x8213,0x0216,0x021c,0x8219,0x0208,0x820d,0x8207,0x0202 +}; +/* Table-driven CRC-16: folds num_bytes bytes of data, one crc_lut lookup per byte, into a 16-bit state that starts at 0 and returns the final state (0 for an empty block). */ +uint16_t 
a52_crc16_block(uint8_t *data,uint32_t num_bytes) +{ + uint32_t i; + uint16_t state=0; + + for(i=0;i<num_bytes;i++) + state = crc_lut[data[i] ^ (state>>8)] ^ (state<<8); + + return state; +} diff --git a/libavcodec/liba52/mm_accel.h b/libavcodec/liba52/mm_accel.h index 25258c3683..8afbd354cd 100644 --- a/libavcodec/liba52/mm_accel.h +++ b/libavcodec/liba52/mm_accel.h @@ -31,6 +31,11 @@ #define MM_ACCEL_X86_MMX 0x80000000 #define MM_ACCEL_X86_3DNOW 0x40000000 #define MM_ACCEL_X86_MMXEXT 0x20000000 +#define MM_ACCEL_X86_SSE 0x10000000 +#define MM_ACCEL_X86_3DNOWEXT 0x08000000 + +/* PPC accelerations */ +#define MM_ACCEL_PPC_ALTIVEC 0x00010000 uint32_t mm_accel (void); diff --git a/libavcodec/liba52/resample.c b/libavcodec/liba52/resample.c new file mode 100644 index 0000000000..284cbbe78d --- /dev/null +++ b/libavcodec/liba52/resample.c @@ -0,0 +1,45 @@ + +// a52_resample_init should find the requested converter (from type flags -> +// given number of channels) and set up some function pointers... + +// a52_resample() should do the conversion. 
+ +#include <inttypes.h> +#include <stdio.h> +#include "a52.h" +#include "mm_accel.h" +#include "config.h" +#include "../libpostproc/mangle.h" + +int (* a52_resample) (float * _f, int16_t * s16)=NULL; + +#include "resample_c.c" + +#ifdef ARCH_X86 +#include "resample_mmx.c" +#endif +/* Picks a converter for (flags, chans): the MMX one when ARCH_X86 is defined and mm_accel has MM_ACCEL_X86_MMX, otherwise the C one. Stores the choice in a52_resample and returns it; prints a notice on first selection. Returns NULL (after an error message) when no converter matches. */ +void* a52_resample_init(uint32_t mm_accel,int flags,int chans){ +void* tmp; + +#ifdef ARCH_X86 + if(mm_accel&MM_ACCEL_X86_MMX){ + tmp=a52_resample_MMX(flags,chans); + if(tmp){ + if(a52_resample==NULL) fprintf(stderr, "Using MMX optimized resampler\n"); + a52_resample=tmp; + return tmp; + } + } +#endif + + tmp=a52_resample_C(flags,chans); + if(tmp){ + if(a52_resample==NULL) fprintf(stderr, "No accelerated resampler found\n"); + a52_resample=tmp; + return tmp; + } + + fprintf(stderr, "Unimplemented resampler for mode 0x%X -> %d channels conversion - Contact MPlayer developers!\n", flags, chans); + return NULL; +} diff --git a/libavcodec/liba52/resample_c.c b/libavcodec/liba52/resample_c.c new file mode 100644 index 0000000000..a618ec6e9e --- /dev/null +++ b/libavcodec/liba52/resample_c.c @@ -0,0 +1,183 @@ +// this code is based on a52dec/libao/audio_out_oss.c +/* convert(): takes the int32 bit pattern of a float sample, subtracts 0x43c00000 (the bits of 384.0f) and clamps to the int16 range. NOTE(review): presumably the decoder adds a 384.0 bias upstream - confirm against the caller. */ +static inline int16_t convert (int32_t i) +{ + if (i > 0x43c07fff) + return 32767; + else if (i < 0x43bf8000) + return -32768; + else + return i - 0x43c00000; +} + +static int a52_resample_MONO_to_5_C(float * _f, int16_t * s16){ + int i; + int32_t * f = (int32_t *) _f; + for (i = 0; i < 256; i++) { + s16[5*i] = s16[5*i+1] = s16[5*i+2] = s16[5*i+3] = 0; + s16[5*i+4] = convert (f[i]); + } + return 5*256; +} + +static int a52_resample_MONO_to_1_C(float * _f, int16_t * s16){ + int i; + int32_t * f = (int32_t *) _f; + for (i = 0; i < 256; i++) { + s16[i] = convert (f[i]); + } + return 1*256; +} + +static int a52_resample_STEREO_to_2_C(float * _f, int16_t * s16){ + int i; + int32_t * f = (int32_t *) _f; + for (i = 0; i < 256; i++) { + s16[2*i] = convert (f[i]); + s16[2*i+1] = convert (f[i+256]); + } + return 2*256; +} + +static int 
a52_resample_3F_to_5_C(float * _f, int16_t * s16){ + int i; + int32_t * f = (int32_t *) _f; + for (i = 0; i < 256; i++) { + s16[5*i] = convert (f[i]); + s16[5*i+1] = convert (f[i+512]); + s16[5*i+2] = s16[5*i+3] = 0; + s16[5*i+4] = convert (f[i+256]); + } + return 5*256; +} + +static int a52_resample_2F_2R_to_4_C(float * _f, int16_t * s16){ + int i; + int32_t * f = (int32_t *) _f; + for (i = 0; i < 256; i++) { + s16[4*i] = convert (f[i]); + s16[4*i+1] = convert (f[i+256]); + s16[4*i+2] = convert (f[i+512]); + s16[4*i+3] = convert (f[i+768]); + } + return 4*256; +} + +static int a52_resample_3F_2R_to_5_C(float * _f, int16_t * s16){ + int i; + int32_t * f = (int32_t *) _f; + for (i = 0; i < 256; i++) { + s16[5*i] = convert (f[i]); + s16[5*i+1] = convert (f[i+512]); + s16[5*i+2] = convert (f[i+768]); + s16[5*i+3] = convert (f[i+1024]); + s16[5*i+4] = convert (f[i+256]); + } + return 5*256; +} + +static int a52_resample_MONO_LFE_to_6_C(float * _f, int16_t * s16){ + int i; + int32_t * f = (int32_t *) _f; + for (i = 0; i < 256; i++) { + s16[6*i] = s16[6*i+1] = s16[6*i+2] = s16[6*i+3] = 0; + s16[6*i+4] = convert (f[i+256]); + s16[6*i+5] = convert (f[i]); + } + return 6*256; +} + +static int a52_resample_STEREO_LFE_to_6_C(float * _f, int16_t * s16){ + int i; + int32_t * f = (int32_t *) _f; + for (i = 0; i < 256; i++) { + s16[6*i] = convert (f[i+256]); + s16[6*i+1] = convert (f[i+512]); + s16[6*i+2] = s16[6*i+3] = s16[6*i+4] = 0; + s16[6*i+5] = convert (f[i]); + } + return 6*256; +} + +static int a52_resample_3F_LFE_to_6_C(float * _f, int16_t * s16){ + int i; + int32_t * f = (int32_t *) _f; + for (i = 0; i < 256; i++) { + s16[6*i] = convert (f[i+256]); + s16[6*i+1] = convert (f[i+768]); + s16[6*i+2] = s16[6*i+3] = 0; + s16[6*i+4] = convert (f[i+512]); + s16[6*i+5] = convert (f[i]); + } + return 6*256; +} + +static int a52_resample_2F_2R_LFE_to_6_C(float * _f, int16_t * s16){ + int i; + int32_t * f = (int32_t *) _f; + for (i = 0; i < 256; i++) { + s16[6*i] = convert 
(f[i+256]); + s16[6*i+1] = convert (f[i+512]); + s16[6*i+2] = convert (f[i+768]); + s16[6*i+3] = convert (f[i+1024]); + s16[6*i+4] = 0; + s16[6*i+5] = convert (f[i]); + } + return 6*256; +} + +static int a52_resample_3F_2R_LFE_to_6_C(float * _f, int16_t * s16){ + int i; + int32_t * f = (int32_t *) _f; + for (i = 0; i < 256; i++) { + s16[6*i] = convert (f[i+256]); + s16[6*i+1] = convert (f[i+768]); + s16[6*i+2] = convert (f[i+1024]); + s16[6*i+3] = convert (f[i+1280]); + s16[6*i+4] = convert (f[i+512]); + s16[6*i+5] = convert (f[i]); + } + return 6*256; +} + +/* Returns the C converter matching flags and the output channel count ch, or NULL when that combination has no implementation. */ +static void* a52_resample_C(int flags, int ch){ + switch (flags) { + case A52_MONO: + if(ch==5) return a52_resample_MONO_to_5_C; + if(ch==1) return a52_resample_MONO_to_1_C; + break; + case A52_CHANNEL: + case A52_STEREO: + case A52_DOLBY: + if(ch==2) return a52_resample_STEREO_to_2_C; + break; + case A52_3F: + if(ch==5) return a52_resample_3F_to_5_C; + break; + case A52_2F2R: + if(ch==4) return a52_resample_2F_2R_to_4_C; + break; + case A52_3F2R: + if(ch==5) return a52_resample_3F_2R_to_5_C; + break; + case A52_MONO | A52_LFE: + if(ch==6) return a52_resample_MONO_LFE_to_6_C; + break; + case A52_CHANNEL | A52_LFE: + case A52_STEREO | A52_LFE: + case A52_DOLBY | A52_LFE: + if(ch==6) return a52_resample_STEREO_LFE_to_6_C; + break; + case A52_3F | A52_LFE: + if(ch==6) return a52_resample_3F_LFE_to_6_C; + break; + case A52_2F2R | A52_LFE: + if(ch==6) return a52_resample_2F_2R_LFE_to_6_C; + break; + case A52_3F2R | A52_LFE: + if(ch==6) return a52_resample_3F_2R_LFE_to_6_C; + break; + } + return NULL; +} diff --git a/libavcodec/liba52/resample_mmx.c b/libavcodec/liba52/resample_mmx.c new file mode 100644 index 0000000000..a4079798f7 --- /dev/null +++ b/libavcodec/liba52/resample_mmx.c @@ -0,0 +1,518 @@ + +// MMX optimizations from Michael Niedermayer (michaelni@gmx.at) (under GPL) + +/* optimization TODO / NOTES + movntq is slightly faster (0.5% with the current test.c benchmark) + (but that's just test.c so that 
needs to be tested in reality) + and it would mean (C / MMX2 / MMX / 3DNOW) versions +*/ + +static uint64_t __attribute__((aligned(8))) magicF2W= 0x43c0000043c00000LL; +static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000LL; +static uint64_t __attribute__((aligned(8))) wm0101= 0x0000FFFF0000FFFFLL; +static uint64_t __attribute__((aligned(8))) wm1100= 0xFFFFFFFF00000000LL; +/* MMX mono to 5-channel converter: each loop pass takes four input samples (A B C D), subtracts magicF2W, packs them to int16 with saturation, and writes 20 output words in which only every fifth word (s16[5*i+4]) carries a sample; the rest are zero. */ +static int a52_resample_MONO_to_5_MMX(float * _f, int16_t * s16){ + int32_t * f = (int32_t *) _f; + asm volatile( + "movl $-512, %%esi \n\t" + "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "movq "MANGLE(wm1100)", %%mm3 \n\t" + "movq "MANGLE(wm0101)", %%mm4 \n\t" + "movq "MANGLE(wm1010)", %%mm5 \n\t" + "pxor %%mm6, %%mm6 \n\t" + "1: \n\t" + "movq (%1, %%esi, 2), %%mm0 \n\t" + "movq 8(%1, %%esi, 2), %%mm1 \n\t" + "leal (%%esi, %%esi, 4), %%edi \n\t" + "psubd %%mm7, %%mm0 \n\t" + "psubd %%mm7, %%mm1 \n\t" + "packssdw %%mm1, %%mm0 \n\t" + "movq %%mm0, %%mm1 \n\t" + "pand %%mm4, %%mm0 \n\t" + "pand %%mm5, %%mm1 \n\t" + "movq %%mm6, (%0, %%edi) \n\t" // 0 0 0 0 + "movd %%mm0, 8(%0, %%edi) \n\t" // A 0 + "pand %%mm3, %%mm0 \n\t" + "movd %%mm6, 12(%0, %%edi) \n\t" // 0 0 + "movd %%mm1, 16(%0, %%edi) \n\t" // 0 B + "pand %%mm3, %%mm1 \n\t" + "movd %%mm6, 20(%0, %%edi) \n\t" // 0 0 + "movq %%mm0, 24(%0, %%edi) \n\t" // 0 0 C 0 + "movq %%mm1, 32(%0, %%edi) \n\t" // 0 0 0 D + "addl $8, %%esi \n\t" + " jnz 1b \n\t" + "emms \n\t" + :: "r" (s16+1280), "r" (f+256) + :"%esi", "%edi", "memory" + ); + return 5*256; +} + +static int a52_resample_STEREO_to_2_MMX(float * _f, int16_t * s16){ + int32_t * f = (int32_t *) _f; +/* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it +#ifdef HAVE_SSE + asm volatile( + "movl $-1024, %%esi \n\t" + "1: \n\t" + "cvtps2pi (%1, %%esi), %%mm0 \n\t" + "cvtps2pi 1024(%1, %%esi), %%mm2\n\t" + "movq %%mm0, %%mm1 \n\t" + "punpcklwd %%mm2, %%mm0 \n\t" + "punpckhwd %%mm2, %%mm1 \n\t" + "movq %%mm0, (%0, %%esi) \n\t" + "movq %%mm1, 
8(%0, %%esi) \n\t" + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + "emms \n\t" + :: "r" (s16+512), "r" (f+256) + :"%esi", "memory" + );*/ + asm volatile( + "movl $-1024, %%esi \n\t" + "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "1: \n\t" + "movq (%1, %%esi), %%mm0 \n\t" + "movq 8(%1, %%esi), %%mm1 \n\t" + "movq 1024(%1, %%esi), %%mm2 \n\t" + "movq 1032(%1, %%esi), %%mm3 \n\t" + "psubd %%mm7, %%mm0 \n\t" + "psubd %%mm7, %%mm1 \n\t" + "psubd %%mm7, %%mm2 \n\t" + "psubd %%mm7, %%mm3 \n\t" + "packssdw %%mm1, %%mm0 \n\t" + "packssdw %%mm3, %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "punpcklwd %%mm2, %%mm0 \n\t" + "punpckhwd %%mm2, %%mm1 \n\t" + "movq %%mm0, (%0, %%esi) \n\t" + "movq %%mm1, 8(%0, %%esi) \n\t" + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + "emms \n\t" + :: "r" (s16+512), "r" (f+256) + :"%esi", "memory" + ); + return 2*256; +} + +static int a52_resample_3F_to_5_MMX(float * _f, int16_t * s16){ + int32_t * f = (int32_t *) _f; + asm volatile( + "movl $-1024, %%esi \n\t" + "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "pxor %%mm6, %%mm6 \n\t" + "movq %%mm7, %%mm5 \n\t" + "punpckldq %%mm6, %%mm5 \n\t" + "1: \n\t" + "movd (%1, %%esi), %%mm0 \n\t" + "punpckldq 2048(%1, %%esi), %%mm0\n\t" + "movd 1024(%1, %%esi), %%mm1 \n\t" + "punpckldq 4(%1, %%esi), %%mm1 \n\t" + "movd 2052(%1, %%esi), %%mm2 \n\t" + "movq %%mm7, %%mm3 \n\t" + "punpckldq 1028(%1, %%esi), %%mm3\n\t" + "movd 8(%1, %%esi), %%mm4 \n\t" + "punpckldq 2056(%1, %%esi), %%mm4\n\t" + "leal (%%esi, %%esi, 4), %%edi \n\t" + "sarl $1, %%edi \n\t" + "psubd %%mm7, %%mm0 \n\t" + "psubd %%mm7, %%mm1 \n\t" + "psubd %%mm5, %%mm2 \n\t" + "psubd %%mm7, %%mm3 \n\t" + "psubd %%mm7, %%mm4 \n\t" + "packssdw %%mm6, %%mm0 \n\t" + "packssdw %%mm2, %%mm1 \n\t" + "packssdw %%mm4, %%mm3 \n\t" + "movq %%mm0, (%0, %%edi) \n\t" + "movq %%mm1, 8(%0, %%edi) \n\t" + "movq %%mm3, 16(%0, %%edi) \n\t" + + "movd 1032(%1, %%esi), %%mm1 \n\t" + "punpckldq 12(%1, %%esi), %%mm1\n\t" + "movd 2060(%1, %%esi), %%mm2 \n\t" + "movq %%mm7, %%mm3 \n\t" + "punpckldq 
1036(%1, %%esi), %%mm3\n\t" + "pxor %%mm0, %%mm0 \n\t" + "psubd %%mm7, %%mm1 \n\t" + "psubd %%mm5, %%mm2 \n\t" + "psubd %%mm7, %%mm3 \n\t" + "packssdw %%mm1, %%mm0 \n\t" + "packssdw %%mm3, %%mm2 \n\t" + "movq %%mm0, 24(%0, %%edi) \n\t" + "movq %%mm2, 32(%0, %%edi) \n\t" + + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + "emms \n\t" + :: "r" (s16+1280), "r" (f+256) + :"%esi", "%edi", "memory" + ); + return 5*256; +} + +static int a52_resample_2F_2R_to_4_MMX(float * _f, int16_t * s16){ + int32_t * f = (int32_t *) _f; + asm volatile( + "movl $-1024, %%esi \n\t" + "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "1: \n\t" + "movq (%1, %%esi), %%mm0 \n\t" + "movq 8(%1, %%esi), %%mm1 \n\t" + "movq 1024(%1, %%esi), %%mm2 \n\t" + "movq 1032(%1, %%esi), %%mm3 \n\t" + "psubd %%mm7, %%mm0 \n\t" + "psubd %%mm7, %%mm1 \n\t" + "psubd %%mm7, %%mm2 \n\t" + "psubd %%mm7, %%mm3 \n\t" + "packssdw %%mm1, %%mm0 \n\t" + "packssdw %%mm3, %%mm2 \n\t" + "movq 2048(%1, %%esi), %%mm3 \n\t" + "movq 2056(%1, %%esi), %%mm4 \n\t" + "movq 3072(%1, %%esi), %%mm5 \n\t" + "movq 3080(%1, %%esi), %%mm6 \n\t" + "psubd %%mm7, %%mm3 \n\t" + "psubd %%mm7, %%mm4 \n\t" + "psubd %%mm7, %%mm5 \n\t" + "psubd %%mm7, %%mm6 \n\t" + "packssdw %%mm4, %%mm3 \n\t" + "packssdw %%mm6, %%mm5 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm3, %%mm4 \n\t" + "punpcklwd %%mm2, %%mm0 \n\t" + "punpckhwd %%mm2, %%mm1 \n\t" + "punpcklwd %%mm5, %%mm3 \n\t" + "punpckhwd %%mm5, %%mm4 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm5 \n\t" + "punpckldq %%mm3, %%mm0 \n\t" + "punpckhdq %%mm3, %%mm2 \n\t" + "punpckldq %%mm4, %%mm1 \n\t" + "punpckhdq %%mm4, %%mm5 \n\t" + "movq %%mm0, (%0, %%esi,2) \n\t" + "movq %%mm2, 8(%0, %%esi,2) \n\t" + "movq %%mm1, 16(%0, %%esi,2) \n\t" + "movq %%mm5, 24(%0, %%esi,2) \n\t" + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + "emms \n\t" + :: "r" (s16+1024), "r" (f+256) + :"%esi", "memory" + ); + return 4*256; +} + +static int a52_resample_3F_2R_to_5_MMX(float * _f, int16_t * s16){ + int32_t * f = (int32_t *) _f; + asm 
volatile( + "movl $-1024, %%esi \n\t" + "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "1: \n\t" + "movd (%1, %%esi), %%mm0 \n\t" + "punpckldq 2048(%1, %%esi), %%mm0\n\t" + "movd 3072(%1, %%esi), %%mm1 \n\t" + "punpckldq 4096(%1, %%esi), %%mm1\n\t" + "movd 1024(%1, %%esi), %%mm2 \n\t" + "punpckldq 4(%1, %%esi), %%mm2 \n\t" + "movd 2052(%1, %%esi), %%mm3 \n\t" + "punpckldq 3076(%1, %%esi), %%mm3\n\t" + "movd 4100(%1, %%esi), %%mm4 \n\t" + "punpckldq 1028(%1, %%esi), %%mm4\n\t" + "movd 8(%1, %%esi), %%mm5 \n\t" + "punpckldq 2056(%1, %%esi), %%mm5\n\t" + "leal (%%esi, %%esi, 4), %%edi \n\t" + "sarl $1, %%edi \n\t" + "psubd %%mm7, %%mm0 \n\t" + "psubd %%mm7, %%mm1 \n\t" + "psubd %%mm7, %%mm2 \n\t" + "psubd %%mm7, %%mm3 \n\t" + "psubd %%mm7, %%mm4 \n\t" + "psubd %%mm7, %%mm5 \n\t" + "packssdw %%mm1, %%mm0 \n\t" + "packssdw %%mm3, %%mm2 \n\t" + "packssdw %%mm5, %%mm4 \n\t" + "movq %%mm0, (%0, %%edi) \n\t" + "movq %%mm2, 8(%0, %%edi) \n\t" + "movq %%mm4, 16(%0, %%edi) \n\t" + + "movd 3080(%1, %%esi), %%mm0 \n\t" + "punpckldq 4104(%1, %%esi), %%mm0\n\t" + "movd 1032(%1, %%esi), %%mm1 \n\t" + "punpckldq 12(%1, %%esi), %%mm1\n\t" + "movd 2060(%1, %%esi), %%mm2 \n\t" + "punpckldq 3084(%1, %%esi), %%mm2\n\t" + "movd 4108(%1, %%esi), %%mm3 \n\t" + "punpckldq 1036(%1, %%esi), %%mm3\n\t" + "psubd %%mm7, %%mm0 \n\t" + "psubd %%mm7, %%mm1 \n\t" + "psubd %%mm7, %%mm2 \n\t" + "psubd %%mm7, %%mm3 \n\t" + "packssdw %%mm1, %%mm0 \n\t" + "packssdw %%mm3, %%mm2 \n\t" + "movq %%mm0, 24(%0, %%edi) \n\t" + "movq %%mm2, 32(%0, %%edi) \n\t" + + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + "emms \n\t" + :: "r" (s16+1280), "r" (f+256) + :"%esi", "%edi", "memory" + ); + return 5*256; +} + +static int a52_resample_MONO_LFE_to_6_MMX(float * _f, int16_t * s16){ + int32_t * f = (int32_t *) _f; + asm volatile( + "movl $-1024, %%esi \n\t" + "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "pxor %%mm6, %%mm6 \n\t" + "1: \n\t" + "movq 1024(%1, %%esi), %%mm0 \n\t" + "movq 1032(%1, %%esi), %%mm1 \n\t" + "movq (%1, %%esi), 
%%mm2 \n\t" + "movq 8(%1, %%esi), %%mm3 \n\t" + "psubd %%mm7, %%mm0 \n\t" + "psubd %%mm7, %%mm1 \n\t" + "psubd %%mm7, %%mm2 \n\t" + "psubd %%mm7, %%mm3 \n\t" + "packssdw %%mm1, %%mm0 \n\t" + "packssdw %%mm3, %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "punpcklwd %%mm2, %%mm0 \n\t" + "punpckhwd %%mm2, %%mm1 \n\t" + "leal (%%esi, %%esi, 2), %%edi \n\t" + "movq %%mm6, (%0, %%edi) \n\t" + "movd %%mm0, 8(%0, %%edi) \n\t" + "punpckhdq %%mm0, %%mm0 \n\t" + "movq %%mm6, 12(%0, %%edi) \n\t" + "movd %%mm0, 20(%0, %%edi) \n\t" + "movq %%mm6, 24(%0, %%edi) \n\t" + "movd %%mm1, 32(%0, %%edi) \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movq %%mm6, 36(%0, %%edi) \n\t" + "movd %%mm1, 44(%0, %%edi) \n\t" + "addl $16, %%esi \n\t" + " jnz 1b \n\t" + "emms \n\t" + :: "r" (s16+1536), "r" (f+256) + :"%esi", "%edi", "memory" + ); + return 6*256; +} + +static int a52_resample_STEREO_LFE_to_6_MMX(float * _f, int16_t * s16){ + int32_t * f = (int32_t *) _f; + asm volatile( + "movl $-1024, %%esi \n\t" + "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "pxor %%mm6, %%mm6 \n\t" + "1: \n\t" + "movq 1024(%1, %%esi), %%mm0 \n\t" + "movq 2048(%1, %%esi), %%mm1 \n\t" + "movq (%1, %%esi), %%mm5 \n\t" + "psubd %%mm7, %%mm0 \n\t" + "psubd %%mm7, %%mm1 \n\t" + "psubd %%mm7, %%mm5 \n\t" + "leal (%%esi, %%esi, 2), %%edi \n\t" + + "pxor %%mm4, %%mm4 \n\t" + "packssdw %%mm5, %%mm0 \n\t" // FfAa + "packssdw %%mm4, %%mm1 \n\t" // 00Bb + "punpckhwd %%mm0, %%mm4 \n\t" // F0f0 + "punpcklwd %%mm1, %%mm0 \n\t" // BAba + "movq %%mm0, %%mm1 \n\t" // BAba + "punpckldq %%mm4, %%mm3 \n\t" // f0XX + "punpckldq %%mm6, %%mm0 \n\t" // 00ba + "punpckhdq %%mm1, %%mm3 \n\t" // BAf0 + + "movq %%mm0, (%0, %%edi) \n\t" // 00ba + "punpckhdq %%mm4, %%mm0 \n\t" // F000 + "movq %%mm3, 8(%0, %%edi) \n\t" // BAf0 + "movq %%mm0, 16(%0, %%edi) \n\t" // F000 + "addl $8, %%esi \n\t" + " jnz 1b \n\t" + "emms \n\t" + :: "r" (s16+1536), "r" (f+256) + :"%esi", "%edi", "memory" + ); + return 6*256; +} + +static int a52_resample_3F_LFE_to_6_MMX(float * _f, 
int16_t * s16){ + int32_t * f = (int32_t *) _f; + asm volatile( + "movl $-1024, %%esi \n\t" + "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "pxor %%mm6, %%mm6 \n\t" + "1: \n\t" + "movq 1024(%1, %%esi), %%mm0 \n\t" + "movq 3072(%1, %%esi), %%mm1 \n\t" + "movq 2048(%1, %%esi), %%mm4 \n\t" + "movq (%1, %%esi), %%mm5 \n\t" + "psubd %%mm7, %%mm0 \n\t" + "psubd %%mm7, %%mm1 \n\t" + "psubd %%mm7, %%mm4 \n\t" + "psubd %%mm7, %%mm5 \n\t" + "leal (%%esi, %%esi, 2), %%edi \n\t" + + "packssdw %%mm4, %%mm0 \n\t" // EeAa + "packssdw %%mm5, %%mm1 \n\t" // FfBb + "movq %%mm0, %%mm2 \n\t" // EeAa + "punpcklwd %%mm1, %%mm0 \n\t" // BAba + "punpckhwd %%mm1, %%mm2 \n\t" // FEfe + "movq %%mm0, %%mm1 \n\t" // BAba + "punpckldq %%mm6, %%mm0 \n\t" // 00ba + "punpckhdq %%mm1, %%mm1 \n\t" // BABA + + "movq %%mm0, (%0, %%edi) \n\t" + "punpckhdq %%mm2, %%mm0 \n\t" // FE00 + "punpckldq %%mm1, %%mm2 \n\t" // BAfe + "movq %%mm2, 8(%0, %%edi) \n\t" + "movq %%mm0, 16(%0, %%edi) \n\t" + "addl $8, %%esi \n\t" + " jnz 1b \n\t" + "emms \n\t" + :: "r" (s16+1536), "r" (f+256) + :"%esi", "%edi", "memory" + ); + return 6*256; +} + +static int a52_resample_2F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){ + int32_t * f = (int32_t *) _f; + asm volatile( + "movl $-1024, %%esi \n\t" + "movq "MANGLE(magicF2W)", %%mm7 \n\t" +// "pxor %%mm6, %%mm6 \n\t" + "1: \n\t" + "movq 1024(%1, %%esi), %%mm0 \n\t" + "movq 2048(%1, %%esi), %%mm1 \n\t" + "movq 3072(%1, %%esi), %%mm2 \n\t" + "movq 4096(%1, %%esi), %%mm3 \n\t" + "movq (%1, %%esi), %%mm5 \n\t" + "psubd %%mm7, %%mm0 \n\t" + "psubd %%mm7, %%mm1 \n\t" + "psubd %%mm7, %%mm2 \n\t" + "psubd %%mm7, %%mm3 \n\t" + "psubd %%mm7, %%mm5 \n\t" + "leal (%%esi, %%esi, 2), %%edi \n\t" + + "packssdw %%mm2, %%mm0 \n\t" // CcAa + "packssdw %%mm3, %%mm1 \n\t" // DdBb + "packssdw %%mm5, %%mm5 \n\t" // FfFf + "movq %%mm0, %%mm2 \n\t" // CcAa + "punpcklwd %%mm1, %%mm0 \n\t" // BAba + "punpckhwd %%mm1, %%mm2 \n\t" // DCdc + "pxor %%mm4, %%mm4 \n\t" // 0000 + "punpcklwd %%mm5, %%mm4 \n\t" // F0f0 
+ "movq %%mm0, %%mm1 \n\t" // BAba + "movq %%mm4, %%mm3 \n\t" // F0f0 + "punpckldq %%mm2, %%mm0 \n\t" // dcba + "punpckhdq %%mm1, %%mm1 \n\t" // BABA + "punpckldq %%mm1, %%mm4 \n\t" // BAf0 + "punpckhdq %%mm3, %%mm2 \n\t" // F0DC + + "movq %%mm0, (%0, %%edi) \n\t" + "movq %%mm4, 8(%0, %%edi) \n\t" + "movq %%mm2, 16(%0, %%edi) \n\t" + "addl $8, %%esi \n\t" + " jnz 1b \n\t" + "emms \n\t" + :: "r" (s16+1536), "r" (f+256) + :"%esi", "%edi", "memory" + ); + return 6*256; +} + +static int a52_resample_3F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){ + int32_t * f = (int32_t *) _f; + asm volatile( + "movl $-1024, %%esi \n\t" + "movq "MANGLE(magicF2W)", %%mm7 \n\t" +// "pxor %%mm6, %%mm6 \n\t" + "1: \n\t" + "movq 1024(%1, %%esi), %%mm0 \n\t" + "movq 3072(%1, %%esi), %%mm1 \n\t" + "movq 4096(%1, %%esi), %%mm2 \n\t" + "movq 5120(%1, %%esi), %%mm3 \n\t" + "movq 2048(%1, %%esi), %%mm4 \n\t" + "movq (%1, %%esi), %%mm5 \n\t" + "psubd %%mm7, %%mm0 \n\t" + "psubd %%mm7, %%mm1 \n\t" + "psubd %%mm7, %%mm2 \n\t" + "psubd %%mm7, %%mm3 \n\t" + "psubd %%mm7, %%mm4 \n\t" + "psubd %%mm7, %%mm5 \n\t" + "leal (%%esi, %%esi, 2), %%edi \n\t" + + "packssdw %%mm2, %%mm0 \n\t" // CcAa + "packssdw %%mm3, %%mm1 \n\t" // DdBb + "packssdw %%mm4, %%mm4 \n\t" // EeEe + "packssdw %%mm5, %%mm5 \n\t" // FfFf + "movq %%mm0, %%mm2 \n\t" // CcAa + "punpcklwd %%mm1, %%mm0 \n\t" // BAba + "punpckhwd %%mm1, %%mm2 \n\t" // DCdc + "punpcklwd %%mm5, %%mm4 \n\t" // FEfe + "movq %%mm0, %%mm1 \n\t" // BAba + "movq %%mm4, %%mm3 \n\t" // FEfe + "punpckldq %%mm2, %%mm0 \n\t" // dcba + "punpckhdq %%mm1, %%mm1 \n\t" // BABA + "punpckldq %%mm1, %%mm4 \n\t" // BAfe + "punpckhdq %%mm3, %%mm2 \n\t" // FEDC + + "movq %%mm0, (%0, %%edi) \n\t" + "movq %%mm4, 8(%0, %%edi) \n\t" + "movq %%mm2, 16(%0, %%edi) \n\t" + "addl $8, %%esi \n\t" + " jnz 1b \n\t" + "emms \n\t" + :: "r" (s16+1536), "r" (f+256) + :"%esi", "%edi", "memory" + ); + return 6*256; +} + + +static void* a52_resample_MMX(int flags, int ch){ + switch (flags) { + case 
A52_MONO: + if(ch==5) return a52_resample_MONO_to_5_MMX; + break; + case A52_CHANNEL: + case A52_STEREO: + case A52_DOLBY: + if(ch==2) return a52_resample_STEREO_to_2_MMX; + break; + case A52_3F: + if(ch==5) return a52_resample_3F_to_5_MMX; + break; + case A52_2F2R: + if(ch==4) return a52_resample_2F_2R_to_4_MMX; + break; + case A52_3F2R: + if(ch==5) return a52_resample_3F_2R_to_5_MMX; + break; + case A52_MONO | A52_LFE: + if(ch==6) return a52_resample_MONO_LFE_to_6_MMX; + break; + case A52_CHANNEL | A52_LFE: + case A52_STEREO | A52_LFE: + case A52_DOLBY | A52_LFE: + if(ch==6) return a52_resample_STEREO_LFE_to_6_MMX; + break; + case A52_3F | A52_LFE: + if(ch==6) return a52_resample_3F_LFE_to_6_MMX; + break; + case A52_2F2R | A52_LFE: + if(ch==6) return a52_resample_2F_2R_LFE_to_6_MMX; + break; + case A52_3F2R | A52_LFE: + if(ch==6) return a52_resample_3F_2R_LFE_to_6_MMX; + break; + } + return NULL; +} + +