From 54a51d384055a771ba1eeef3c2f399bd03fa2663 Mon Sep 17 00:00:00 2001
From: James Darnley
Date: Tue, 12 Aug 2014 23:22:03 +0200
Subject: [PATCH] lavc/flacenc: partially unroll loop in flac_enc_lpc_16

It now does 12 samples per iteration, up from 4.

From 1.8 to 3.2 times faster again. 3.6 to 5.7 times faster overall.
Runtime is reduced by a further 2 to 18%. Overall runtime reduced by 4
to 50%.

Same conditions as before apply.

Signed-off-by: Michael Niedermayer
---
 libavcodec/flacenc.c            |  2 +-
 libavcodec/x86/flac_dsp_gpl.asm | 26 +++++++++++++++++++++-----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c
index f37bab8f3e..3b72888966 100644
--- a/libavcodec/flacenc.c
+++ b/libavcodec/flacenc.c
@@ -80,7 +80,7 @@ typedef struct FlacSubframe {
     int shift;
     RiceContext rc;
     int32_t samples[FLAC_MAX_BLOCKSIZE];
-    int32_t residual[FLAC_MAX_BLOCKSIZE+3];
+    int32_t residual[FLAC_MAX_BLOCKSIZE+11];
 } FlacSubframe;
 
 typedef struct FlacFrame {
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index 1f28be132a..cedf0837a7 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -26,13 +26,13 @@ SECTION_TEXT
 
 INIT_XMM sse4
 %if ARCH_X86_64
-    cglobal flac_enc_lpc_16, 5, 7, 4, 0, res, smp, len, order, coefs
+    cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
     DECLARE_REG_TMP 5, 6
     %define length r2d
 
     movsxd orderq, orderd
 %else
-    cglobal flac_enc_lpc_16, 5, 6, 4, 0, res, smp, len, order, coefs
+    cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
     DECLARE_REG_TMP 2, 5
     %define length r2mp
 %endif
@@ -59,6 +59,8 @@ neg orderq
 
 .looplen:
     pxor m0, m0
+    pxor m4, m4
+    pxor m6, m6
     mov posj, orderq
     xor negj, negj
 
@@ -66,20 +68,34 @@ neg orderq
     movd m2, [coefsq+posj*4] ; c = coefs[j]
     SPLATD m2
     movu m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+    movu m5, [smpq+negj*4-4+mmsize]
+    movu m7, [smpq+negj*4-4+mmsize*2]
     pmulld m1, m2
+    pmulld m5, m2
+    pmulld m7, m2
     paddd m0, m1 ; p += c * s
+    paddd m4, m5
+    paddd m6, m7
 
     dec negj
     inc posj
     jnz .looporder
 
     psrad m0, m3 ; p >>= shift
+    psrad m4, m3
+    psrad m6, m3
     movu m1, [smpq]
+    movu m5, [smpq+mmsize]
+    movu m7, [smpq+mmsize*2]
     psubd m1, m0 ; smp[i] - p
+    psubd m5, m4
+    psubd m7, m6
     movu [resq], m1 ; res[i] = smp[i] - (p >> shift)
+    movu [resq+mmsize], m5
+    movu [resq+mmsize*2], m7
 
-    add resq, mmsize
-    add smpq, mmsize
-    sub length, mmsize/4
+    add resq, 3*mmsize
+    add smpq, 3*mmsize
+    sub length, (3*mmsize)/4
     jg .looplen
 RET