From ad507d7907457e678900bac132122ba7be4644cb Mon Sep 17 00:00:00 2001
From: Christophe Gisquet
Date: Fri, 14 Feb 2014 15:03:07 +0000
Subject: [PATCH] x86: dcadsp: implement SSE lfe_fir

Results for Arrandale/Windows:
32: 1670 -> 316
64:  728 -> 298

Signed-off-by: Janne Grunau
---
 libavcodec/x86/dcadsp.asm    | 104 ++++++++++++++++++++++++++++++++++++
 libavcodec/x86/dcadsp_init.c |   4 ++
 2 files changed, 108 insertions(+)

diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 214f5146aa..de827d7676 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -88,3 +88,107 @@ INT8X8_FMUL_INT32
 
 INIT_XMM sse4
 INT8X8_FMUL_INT32
+
+; Shared inner loop of the LFE FIR. Each iteration multiplies two consecutive
+; coefficient rows by the input vector(s), reduces both products horizontally
+; and stores the two sums as adjacent floats at [outq + count].
+; %1=v0/v1  %2=in1  %3=in2
+;   %1 == 0: cf0q stays fixed, rows are addressed via OFFSET = NUM_COEF*count
+;   %1 == 1: OFFSET is 0 and cf0q moves back two rows per iteration
+;   %3 is optional; when present a second group of mmsize bytes per row is used
+%macro FIR_LOOP 2-3
+.loop%1:
+%define va          m1
+%define vb          m2
+%if %1
+%define OFFSET      0
+%else
+%define OFFSET      NUM_COEF*count
+%endif
+; for v0, incrementing and for v1, decrementing
+    mova        va, [cf0q + OFFSET]
+    mova        vb, [cf0q + OFFSET + 4*NUM_COEF]
+%if %0 == 3
+    mova        m4, [cf0q + OFFSET + mmsize]
+    mova        m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
+%endif
+    mulps       va, %2
+    mulps       vb, %2
+%if %0 == 3
+    mulps       m4, %3
+    mulps       m0, %3
+    addps       va, m4
+    addps       vb, m0
+%endif
+    ; va = va1 va2 va3 va4
+    ; vb = vb1 vb2 vb3 vb4
+%if %1
+    ; v1 pass: exchange the rows so that the later row is stored first
+    SWAP        va, vb
+%endif
+    mova        m4, va
+    unpcklps    va, vb ; va3 vb3 va4 vb4
+    unpckhps    m4, vb ; va1 vb1 va2 vb2
+    addps       m4, va ; va1+3 vb1+3 va2+4 vb2+4
+    movhlps     vb, m4 ; va1+3 vb1+3
+    addps       vb, m4 ; low two lanes: sum(va) sum(vb)
+    ; Store the low 64 bits with movlps: under INIT_XMM x86inc maps movh to
+    ; movq, which is an SSE2 instruction, while this code is assembled and
+    ; dispatched for plain SSE.
+    movlps  [outq + count], vb
+%if %1
+    sub       cf0q, 8*NUM_COEF
+%endif
+    add      count, 8
+    jl   .loop%1
+%endmacro
+
+; void dca_lfe_fir(float *out, float *in, float *coefs)
+; %1 selects the variant: 0 -> NUM_COEF = 8, NUM_OUT = 32
+;                         1 -> NUM_COEF = 4, NUM_OUT = 64
+; Writes 2*NUM_OUT floats to out, reads the NUM_COEF floats ending at in[0]
+; and NUM_COEF*NUM_OUT floats from coefs (loaded with mova, i.e. aligned).
+%macro DCA_LFE_FIR 1
+cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
+%define IN1       m3
+%define IN2       m5
+; the in register doubles as the loop counter once the taps have been loaded
+%define count     inq
+%define NUM_COEF  4*(2-%1)
+%define NUM_OUT   32*(%1+1)
+
+    ; load the input taps and reverse their order for the v0 pass
+    movu     IN1, [inq + 4 - 1*mmsize]
+    shufps   IN1, IN1, q0123
+%if %1 == 0
+    movu     IN2, [inq + 4 - 2*mmsize]
+    shufps   IN2, IN2, q0123
+%endif
+
+    mov    count, -4*NUM_OUT
+    add     cf0q, 4*NUM_COEF*NUM_OUT
+    add     outq, 4*NUM_OUT
+    ; compute v0 first
+%if %1 == 0
+    FIR_LOOP   0, IN1, IN2
+%else
+    FIR_LOOP   0, IN1
+%endif
+    ; the v1 pass uses the taps in memory order again
+    shufps   IN1, IN1, q0123
+    mov    count, -4*NUM_OUT
+    ; cf1 already correctly positioned
+    add     outq, 4*NUM_OUT          ; outq now at out2
+    sub     cf0q, 8*NUM_COEF
+%if %1 == 0
+    shufps   IN2, IN2, q0123
+    FIR_LOOP   1, IN2, IN1
+%else
+    FIR_LOOP   1, IN1
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse
+DCA_LFE_FIR 0
+DCA_LFE_FIR 1
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 976d8a3ba3..c234bd29f5 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -26,6 +26,8 @@
 void ff_int8x8_fmul_int32_sse(float *dst, const int8_t *src, int scale);
 void ff_int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale);
 void ff_int8x8_fmul_int32_sse4(float *dst, const int8_t *src, int scale);
+void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
+void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
 
 av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
 {
@@ -35,6 +37,8 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
 #if ARCH_X86_32
         s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse;
 #endif
+        s->lfe_fir[0]        = ff_dca_lfe_fir0_sse;
+        s->lfe_fir[1]        = ff_dca_lfe_fir1_sse;
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {