From 54a51d384055a771ba1eeef3c2f399bd03fa2663 Mon Sep 17 00:00:00 2001
From: James Darnley <james.darnley@gmail.com>
Date: Tue, 12 Aug 2014 23:22:03 +0200
Subject: [PATCH] lavc/flacenc: partially unroll loop in flac_enc_lpc_16

It now does 12 samples per iteration, up from 4.

This makes the function a further 1.8 to 3.2 times faster, or 3.6 to 5.7
times faster overall.  Runtime is reduced by a further 2 to 18%, for an
overall reduction of 4 to 50%.

Same conditions as before apply.

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
---
 libavcodec/flacenc.c            |  2 +-
 libavcodec/x86/flac_dsp_gpl.asm | 26 +++++++++++++++++++++-----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c
index f37bab8f3e..3b72888966 100644
--- a/libavcodec/flacenc.c
+++ b/libavcodec/flacenc.c
@@ -80,7 +80,7 @@ typedef struct FlacSubframe {
     int shift;
     RiceContext rc;
     int32_t samples[FLAC_MAX_BLOCKSIZE];
-    int32_t residual[FLAC_MAX_BLOCKSIZE+3];
+    int32_t residual[FLAC_MAX_BLOCKSIZE+11];
 } FlacSubframe;
 
 typedef struct FlacFrame {
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index 1f28be132a..cedf0837a7 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -26,13 +26,13 @@ SECTION_TEXT
 
 INIT_XMM sse4
 %if ARCH_X86_64
-    cglobal flac_enc_lpc_16, 5, 7, 4, 0, res, smp, len, order, coefs
+    cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
     DECLARE_REG_TMP 5, 6
     %define length r2d
 
     movsxd orderq, orderd
 %else
-    cglobal flac_enc_lpc_16, 5, 6, 4, 0, res, smp, len, order, coefs
+    cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
     DECLARE_REG_TMP 2, 5
     %define length r2mp
 %endif
@@ -59,6 +59,8 @@ neg  orderq
 
 .looplen:
     pxor m0,   m0
+    pxor m4,   m4
+    pxor m6,   m6
     mov  posj, orderq
     xor  negj, negj
 
@@ -66,20 +68,34 @@ neg  orderq
         movd   m2, [coefsq+posj*4] ; c = coefs[j]
         SPLATD m2
         movu   m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+        movu   m5, [smpq+negj*4-4+mmsize]
+        movu   m7, [smpq+negj*4-4+mmsize*2]
         pmulld m1,  m2
+        pmulld m5,  m2
+        pmulld m7,  m2
         paddd  m0,  m1             ; p += c * s
+        paddd  m4,  m5
+        paddd  m6,  m7
 
         dec    negj
         inc    posj
     jnz .looporder
 
     psrad  m0,     m3              ; p >>= shift
+    psrad  m4,     m3
+    psrad  m6,     m3
     movu   m1,    [smpq]
+    movu   m5,    [smpq+mmsize]
+    movu   m7,    [smpq+mmsize*2]
     psubd  m1,     m0              ; smp[i] - p
+    psubd  m5,     m4
+    psubd  m7,     m6
     movu  [resq],  m1              ; res[i] = smp[i] - (p >> shift)
+    movu  [resq+mmsize], m5
+    movu  [resq+mmsize*2], m7
 
-    add resq,   mmsize
-    add smpq,   mmsize
-    sub length, mmsize/4
+    add resq,    3*mmsize
+    add smpq,    3*mmsize
+    sub length, (3*mmsize)/4
 jg .looplen
 RET