Unroll encode_residual_lpc(). Speedup varies between 1.2x and 1.8x depending on LPC order.

Originally committed as revision 10596 to svn://svn.ffmpeg.org/ffmpeg/trunk
2025-07-11 14:30:22 +02:00 · 2007-09-27 02:42:00 +00:00
parent 6b19786b11
commit dc44d4ad64
2 changed files with 84 additions and 3 deletions
--- a/libavcodec/flacenc.c
+++ b/libavcodec/flacenc.c
@ -834,15 +834,83 @@ static void encode_residual_fixed(int32_t *res, const int32_t *smp, int n,
    }
 }
 /*
  * One step of the interleaved two-sample prediction used by
  * encode_residual_lpc_unrolled().
  *
  * Relies on these names being in scope at the expansion site:
  *   smp, coefs, i  - sample buffer, coefficient array, current index
  *   c              - coefficient carried over from the previous step
  *   p0, p1         - running predictions for samples i and i+1
  *
  * The sample smp[i-(x)+1] is loaded once and used twice: with the carried
  * coefficient for p1, then with the next coefficient coefs[(x)-2] for p0.
  * The macro expands to a braced block, so it is used without a trailing
  * semicolon.
  */
 #define LPC1(x) {\
    int s = smp[i-(x)+1];\
    p1 += c*s;\
    c = coefs[(x)-2];\
    p0 += c*s;\
 }
 /*
  * Compute LPC residuals for samples order..n-1, two samples per iteration.
  *
  * res    - output residuals; only res[order] .. res[n-1] are written here
  * smp    - input samples; smp[0] .. smp[n-1] are read
  * n      - number of samples
  * order  - prediction order; coefs[0] .. coefs[order-1] are read
  * coefs  - predictor coefficients, coefs[k] multiplies smp[i-k-1]
  * shift  - right shift applied to each prediction before subtraction
  * big    - nonzero selects the switch that covers orders 9..32,
  *          zero selects the one that covers orders 2..8
  *
  * Order 1 matches no case label in either switch; the statements around
  * the switch already cover that single coefficient.
  *
  * NOTE(review): callers are expected to pass 1 <= order <= 32 and to set
  * big exactly when order > 8; other combinations match no case label and
  * give a wrong prediction (and order < 1 reads before coefs[0]).
  *
  * When n - order is odd the last sample has no partner. It is handled by a
  * separate single-sample step so that smp[n] is never read and res[n] is
  * never written.
  */
 static av_always_inline void encode_residual_lpc_unrolled(
    int32_t *res, const int32_t *smp, int n,
    int order, const int32_t *coefs, int shift, int big)
 {
    int i;
    /* Paired pass: runs only while both i and i+1 are valid indices. */
    for(i=order; i<n-1; i+=2) {
        int c = coefs[order-1];
        int p0 = c * smp[i-order];
        int p1 = 0;
        /* Every case below falls through on purpose: entering at
         * "case order" executes the steps for order, order-1, ..., 2. */
        if(big) {
            switch(order) {
                case 32: LPC1(32)
                case 31: LPC1(31)
                case 30: LPC1(30)
                case 29: LPC1(29)
                case 28: LPC1(28)
                case 27: LPC1(27)
                case 26: LPC1(26)
                case 25: LPC1(25)
                case 24: LPC1(24)
                case 23: LPC1(23)
                case 22: LPC1(22)
                case 21: LPC1(21)
                case 20: LPC1(20)
                case 19: LPC1(19)
                case 18: LPC1(18)
                case 17: LPC1(17)
                case 16: LPC1(16)
                case 15: LPC1(15)
                case 14: LPC1(14)
                case 13: LPC1(13)
                case 12: LPC1(12)
                case 11: LPC1(11)
                case 10: LPC1(10)
                case  9: LPC1( 9)
                         LPC1( 8)
                         LPC1( 7)
                         LPC1( 6)
                         LPC1( 5)
                         LPC1( 4)
                         LPC1( 3)
                         LPC1( 2)
            }
        } else {
            switch(order) {
                case  8: LPC1( 8)
                case  7: LPC1( 7)
                case  6: LPC1( 6)
                case  5: LPC1( 5)
                case  4: LPC1( 4)
                case  3: LPC1( 3)
                case  2: LPC1( 2)
            }
        }
        /* c now holds coefs[0]; it supplies the newest term of p1. */
        p1 += c * smp[i];
        res[i  ] = smp[i  ] - (p0 >> shift);
        res[i+1] = smp[i+1] - (p1 >> shift);
    }
    /* Odd tail: at most one sample is left, predicted on its own. */
    if(i<n) {
        int j;
        int p0 = 0;
        for(j=order; j>0; j--) {
            p0 += coefs[j-1] * smp[i-j];
        }
        res[i] = smp[i] - (p0 >> shift);
    }
 }
 static void encode_residual_lpc(int32_t *res, const int32_t *smp, int n,
                                int order, const int32_t *coefs, int shift)
 {
-    int i, j;
+    int i;
    for(i=0; i<order; i++) {
        res[i] = smp[i];
    }
 #ifdef CONFIG_SMALL
    for(i=order; i<n; i+=2) {
        int j;
        int32_t c = coefs[0];
        int32_t p0 = 0, p1 = c*smp[i];
        for(j=1; j<order; j++) {
@ -855,6 +923,19 @@ static void encode_residual_lpc(int32_t *res, const int32_t *smp, int n,
        res[i+0] = smp[i+0] - (p0 >> shift);
        res[i+1] = smp[i+1] - (p1 >> shift);
    }
 #else
    switch(order) {
        case  1: encode_residual_lpc_unrolled(res, smp, n, 1, coefs, shift, 0); break;
        case  2: encode_residual_lpc_unrolled(res, smp, n, 2, coefs, shift, 0); break;
        case  3: encode_residual_lpc_unrolled(res, smp, n, 3, coefs, shift, 0); break;
        case  4: encode_residual_lpc_unrolled(res, smp, n, 4, coefs, shift, 0); break;
        case  5: encode_residual_lpc_unrolled(res, smp, n, 5, coefs, shift, 0); break;
        case  6: encode_residual_lpc_unrolled(res, smp, n, 6, coefs, shift, 0); break;
        case  7: encode_residual_lpc_unrolled(res, smp, n, 7, coefs, shift, 0); break;
        case  8: encode_residual_lpc_unrolled(res, smp, n, 8, coefs, shift, 0); break;
        default: encode_residual_lpc_unrolled(res, smp, n, order, coefs, shift, 1); break;
    }
 #endif
 }
 static int encode_residual(FlacEncodeContext *ctx, int ch)
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@ -670,7 +670,7 @@ static const AVOption options[]={
 {"context", "context model", OFFSET(context_model), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E},
 {"slice_flags", NULL, OFFSET(slice_flags), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
 {"xvmc_acceleration", NULL, OFFSET(xvmc_acceleration), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
-{"mbd", "macroblock decision algorithm (high quality mode)", OFFSET(mb_decision), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E, "mbd"},
+{"mbd", "macroblock decision algorithm (high quality mode)", OFFSET(mb_decision), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|A|E, "mbd"},
 {"simple", "use mbcmp (default)", 0, FF_OPT_TYPE_CONST, FF_MB_DECISION_SIMPLE, INT_MIN, INT_MAX, V|E, "mbd"},
 {"bits", "use fewest bits", 0, FF_OPT_TYPE_CONST, FF_MB_DECISION_BITS, INT_MIN, INT_MAX, V|E, "mbd"},
 {"rd", "use best rate distortion", 0, FF_OPT_TYPE_CONST, FF_MB_DECISION_RD, INT_MIN, INT_MAX, V|E, "mbd"},