From 32f8fb8ecf8178b9c9ec8d7152f1fdd8537f7f3a Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Sun, 24 Apr 2011 17:50:17 -0400 Subject: [PATCH] Add float_interleave() to FmtConvertContext with x86-optimized versions. Partially based on patches by clsid2 in ffdshow-tryout. ff_float_interleave6() x86 improvements by Loren Merrit. --- libavcodec/fmtconvert.c | 20 +++++ libavcodec/fmtconvert.h | 9 ++ libavcodec/x86/fmtconvert.asm | 141 ++++++++++++++++++++++++++++++++ libavcodec/x86/fmtconvert_mmx.c | 30 +++++++ 4 files changed, 200 insertions(+) diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c index e9707555af..58fece70b2 100644 --- a/libavcodec/fmtconvert.c +++ b/libavcodec/fmtconvert.c @@ -56,11 +56,31 @@ static void float_to_int16_interleave_c(int16_t *dst, const float **src, } } +void ff_float_interleave_c(float *dst, const float **src, unsigned int len, + int channels) +{ + int j, c; + unsigned int i; + if (channels == 2) { + for (i = 0; i < len; i++) { + dst[2*i] = src[0][i]; + dst[2*i+1] = src[1][i]; + } + } else if (channels == 1 && len < INT_MAX / sizeof(float)) { + memcpy(dst, src[0], len * sizeof(float)); + } else { + for (c = 0; c < channels; c++) + for (i = 0, j = c; i < len; i++, j += channels) + dst[j] = src[c][i]; + } +} + av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx) { c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; c->float_to_int16 = float_to_int16_c; c->float_to_int16_interleave = float_to_int16_interleave_c; + c->float_interleave = ff_float_interleave_c; if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx); if (HAVE_ALTIVEC) ff_fmt_convert_init_altivec(c, avctx); diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h index e0afee47e1..d7741135b7 100644 --- a/libavcodec/fmtconvert.h +++ b/libavcodec/fmtconvert.h @@ -68,8 +68,17 @@ typedef struct FmtConvertContext { */ void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels); + + /** + * Convert an array of interleaved float to multiple arrays of float. + */ + void (*float_interleave)(float *dst, const float **src, unsigned int len, + int channels); } FmtConvertContext; +void ff_float_interleave_c(float *dst, const float **src, unsigned int len, + int channels); + void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx); void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx); diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm index 5cd8f6c596..e023b48322 100644 --- a/libavcodec/x86/fmtconvert.asm +++ b/libavcodec/x86/fmtconvert.asm @@ -20,6 +20,7 @@ ;****************************************************************************** %include "x86inc.asm" +%include "x86util.asm" section .text align=16 @@ -89,3 +90,143 @@ FLOAT_TO_INT16_INTERLEAVE6 3dnow %undef pswapd FLOAT_TO_INT16_INTERLEAVE6 3dn2 %undef cvtps2pi + +;----------------------------------------------------------------------------- +; void ff_float_interleave6(float *dst, const float **src, unsigned int len); +;----------------------------------------------------------------------------- + +%macro BUTTERFLYPS 3 + movaps m%3, m%1 + unpcklps m%1, m%2 + unpckhps m%3, m%2 + SWAP %2, %3 +%endmacro + +%macro FLOAT_INTERLEAVE6 2 +cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5 +%ifdef ARCH_X86_64 + %define lend r10d + mov lend, r2d +%else + %define lend dword r2m +%endif + mov src1q, [srcq+1*gprsize] + mov src2q, [srcq+2*gprsize] + mov src3q, [srcq+3*gprsize] + mov src4q, [srcq+4*gprsize] + mov src5q, [srcq+5*gprsize] + mov srcq, [srcq] + sub src1q, srcq + sub src2q, srcq + sub src3q, srcq + sub src4q, srcq + sub src5q, srcq +.loop: +%ifidn %1, sse + movaps m0, [srcq] + movaps m1, [srcq+src1q] + movaps m2, [srcq+src2q] + movaps m3, [srcq+src3q] + movaps m4, [srcq+src4q] + movaps m5, [srcq+src5q] + + BUTTERFLYPS 0, 1, 6 + BUTTERFLYPS 2, 3, 6 + BUTTERFLYPS 4, 5, 6 + + movaps m6, m4 + shufps m4, m0, 0xe4 + movlhps m0, m2 + movhlps m6, m2 + movaps [dstq ], m0 + movaps [dstq+16], m4 + movaps [dstq+32], m6 + + movaps m6, m5 + shufps m5, m1, 0xe4 + movlhps m1, m3 + movhlps m6, m3 + movaps [dstq+48], m1 + movaps [dstq+64], m5 + movaps [dstq+80], m6 +%else ; mmx + movq m0, [srcq] + movq m1, [srcq+src1q] + movq m2, [srcq+src2q] + movq m3, [srcq+src3q] + movq m4, [srcq+src4q] + movq m5, [srcq+src5q] + + SBUTTERFLY dq, 0, 1, 6 + SBUTTERFLY dq, 2, 3, 6 + SBUTTERFLY dq, 4, 5, 6 + movq [dstq ], m0 + movq [dstq+ 8], m2 + movq [dstq+16], m4 + movq [dstq+24], m1 + movq [dstq+32], m3 + movq [dstq+40], m5 +%endif + add srcq, mmsize + add dstq, mmsize*6 + sub lend, mmsize/4 + jg .loop +%ifidn %1, mmx + emms +%endif + REP_RET +%endmacro + +INIT_MMX +FLOAT_INTERLEAVE6 mmx, 0 +INIT_XMM +FLOAT_INTERLEAVE6 sse, 7 + +;----------------------------------------------------------------------------- +; void ff_float_interleave2(float *dst, const float **src, unsigned int len); +;----------------------------------------------------------------------------- + +%macro FLOAT_INTERLEAVE2 2 +cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1 + mov src1q, [srcq+gprsize] + mov srcq, [srcq ] + sub src1q, srcq +.loop + MOVPS m0, [srcq ] + MOVPS m1, [srcq+src1q ] + MOVPS m3, [srcq +mmsize] + MOVPS m4, [srcq+src1q+mmsize] + + MOVPS m2, m0 + PUNPCKLDQ m0, m1 + PUNPCKHDQ m2, m1 + + MOVPS m1, m3 + PUNPCKLDQ m3, m4 + PUNPCKHDQ m1, m4 + + MOVPS [dstq ], m0 + MOVPS [dstq+1*mmsize], m2 + MOVPS [dstq+2*mmsize], m3 + MOVPS [dstq+3*mmsize], m1 + + add srcq, mmsize*2 + add dstq, mmsize*4 + sub lend, mmsize/2 + jg .loop +%ifidn %1, mmx + emms +%endif + REP_RET +%endmacro + +INIT_MMX +%define MOVPS movq +%define PUNPCKLDQ punpckldq +%define PUNPCKHDQ punpckhdq +FLOAT_INTERLEAVE2 mmx, 0 +INIT_XMM +%define MOVPS movaps +%define PUNPCKLDQ unpcklps +%define PUNPCKHDQ unpckhps +FLOAT_INTERLEAVE2 sse, 5 diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c index 847bd80fcd..61a4272a69 100644 --- a/libavcodec/x86/fmtconvert_mmx.c +++ b/libavcodec/x86/fmtconvert_mmx.c @@ -235,11 +235,40 @@ static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long float_to_int16_interleave_3dnow(dst, src, len, channels); } +void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len); +void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len); + +void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len); +void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len); + +static void float_interleave_mmx(float *dst, const float **src, + unsigned int len, int channels) +{ + if (channels == 2) { + ff_float_interleave2_mmx(dst, src, len); + } else if (channels == 6) + ff_float_interleave6_mmx(dst, src, len); + else + ff_float_interleave_c(dst, src, len, channels); +} + +static void float_interleave_sse(float *dst, const float **src, + unsigned int len, int channels) +{ + if (channels == 2) { + ff_float_interleave2_sse(dst, src, len); + } else if (channels == 6) + ff_float_interleave6_sse(dst, src, len); + else + ff_float_interleave_c(dst, src, len, channels); +} + void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) { int mm_flags = av_get_cpu_flags(); if (mm_flags & AV_CPU_FLAG_MMX) { + c->float_interleave = float_interleave_mmx; if(mm_flags & AV_CPU_FLAG_3DNOW){ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ @@ -256,6 +285,7 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; c->float_to_int16 = float_to_int16_sse; c->float_to_int16_interleave = float_to_int16_interleave_sse; + c->float_interleave = float_interleave_sse; } if(mm_flags & AV_CPU_FLAG_SSE2){ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;