/*
 * Patch summary: add an FMA3 build of the DCA LFE FIR routine (variant 0).
 *
 * libavcodec/x86/dcadsp.asm
 *   - Inside the existing "%if %0 == 3" section, a "%if cpuflag(fma3)" branch
 *     issues two fmaddps instructions (va/m4/%3 and vb/m0/%3) in place of the
 *     mulps + addps pairs, which remain as the %else fallback.
 *     NOTE(review): presumably fmaddps computes va = m4 * %3 + va, i.e. the
 *     same result as the fallback up to rounding; confirm the operand order
 *     against the project's fmaddps macro. Unlike the fallback, this branch
 *     does not write m4/m0.
 *   - After the two SSE instantiations, "DCA_LFE_FIR 0" is instantiated again
 *     under "INIT_XMM fma3", guarded by HAVE_FMA3_EXTERNAL. Only variant 0
 *     gets an fma3 instantiation in this patch; variant 1 does not.
 *
 * libavcodec/x86/dcadsp_init.c
 *   - Declares ff_dca_lfe_fir0_fma3 with the same parameter list as the
 *     ff_dca_lfe_fir0_sse / ff_dca_lfe_fir1_sse prototypes.
 *   - In ff_dcadsp_init_x86, after the EXTERNAL_SSE4 block, assigns
 *     s->lfe_fir[0] = ff_dca_lfe_fir0_fma3 when EXTERNAL_FMA3(cpu_flags).
 *     Presumably this overrides an earlier SSE assignment made outside the
 *     visible hunk context; verify against the full function.
 *
 * NOTE(review): the patch text below has its line breaks collapsed onto a
 * single line; restore the original per-line hunk layout before applying.
 */
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm index e49d63d8ad..2758daa6e4 100644 --- a/libavcodec/x86/dcadsp.asm +++ b/libavcodec/x86/dcadsp.asm @@ -132,10 +132,15 @@ DECODE_HF mulps va, %2 mulps vb, %2 %if %0 == 3 +%if cpuflag(fma3) + fmaddps va, m4, %3, va + fmaddps vb, m0, %3, vb +%else mulps m4, %3 mulps m0, %3 addps va, m4 addps vb, m0 +%endif %endif ; va = va1 va2 va3 va4 ; vb = vb1 vb2 vb3 vb4 @@ -198,6 +203,10 @@ cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0 INIT_XMM sse DCA_LFE_FIR 0 DCA_LFE_FIR 1 +%if HAVE_FMA3_EXTERNAL +INIT_XMM fma3 +DCA_LFE_FIR 0 +%endif %macro SETZERO 1 %if cpuflag(sse2) && notcpuflag(avx) diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c index 77f4398377..bb86c26037 100644 --- a/libavcodec/x86/dcadsp_init.c +++ b/libavcodec/x86/dcadsp_init.c @@ -34,6 +34,7 @@ void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end); void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs); void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs); +void ff_dca_lfe_fir0_fma3(float *out, const float *in, const float *coefs); av_cold void ff_dcadsp_init_x86(DCADSPContext *s) { @@ -54,6 +55,10 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s) if (EXTERNAL_SSE4(cpu_flags)) { s->decode_hf = ff_decode_hf_sse4; } + + if (EXTERNAL_FMA3(cpu_flags)) { + s->lfe_fir[0] = ff_dca_lfe_fir0_fma3; + } }