vp3: Use full transpose for all IDCTs

This way, the special IDCT permutations are no longer needed. This is similar to how H264 does it, and removes the dsputil dependency imposed by the scantable code.

Also remove the unused type == 0 cases from the plain C version of the idct.

Signed-off-by: Martin Storsjö <martin@martin.st>
2024-12-23 12:43:46 +02:00 · 2013-03-12 07:28:12 -07:00 · 2013-03-12 07:28:12 -07:00 · 015821229f
commit 015821229f
parent 5941978e71
12 changed files with 169 additions and 151 deletions
--- a/libavcodec/arm/vp3dsp_init_arm.c
+++ b/libavcodec/arm/vp3dsp_init_arm.c
@ -41,6 +41,5 @@ av_cold void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags)
        c->idct_dc_add   = ff_vp3_idct_dc_add_neon;
        c->v_loop_filter = ff_vp3_v_loop_filter_neon;
        c->h_loop_filter = ff_vp3_h_loop_filter_neon;
-        c->idct_perm     = FF_TRANSPOSE_IDCT_PERM;
    }
 }
--- a/libavcodec/bfin/vp3_bfin.c
+++ b/libavcodec/bfin/vp3_bfin.c
@ -61,6 +61,5 @@ av_cold void ff_vp3dsp_init_bfin(VP3DSPContext *c, int flags)
    if (!(flags & CODEC_FLAG_BITEXACT)) {
        c->idct_add = bfin_vp3_idct_add;
        c->idct_put = bfin_vp3_idct_put;
-        c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
    }
 }
--- a/libavcodec/ppc/vp3dsp_altivec.c
+++ b/libavcodec/ppc/vp3dsp_altivec.c
@ -184,7 +184,6 @@ av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags)
    if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) {
        c->idct_put  = vp3_idct_put_altivec;
        c->idct_add  = vp3_idct_add_altivec;
-        c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
    }
 #endif
 }
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@ -136,6 +136,7 @@ typedef struct Vp3DecodeContext {
    ThreadFrame current_frame;
    int keyframe;
    uint8_t idct_permutation[64];
+    uint8_t idct_scantable[64];
    DSPContext dsp;
    VideoDSPContext vdsp;
    VP3DSPContext vp3dsp;
@ -173,8 +174,6 @@ typedef struct Vp3DecodeContext {

    int8_t (*motion_val[2])[2];

-    ScanTable scantable;
-
    /* tables */
    uint16_t coded_dc_scale_factor[64];
    uint32_t coded_ac_scale_factor[64];
@ -1351,7 +1350,7 @@ static inline int vp3_dequant(Vp3DecodeContext *s, Vp3Fragment *frag,
                              int plane, int inter, int16_t block[64])
 {
    int16_t *dequantizer = s->qmat[frag->qpi][inter][plane];
-    uint8_t *perm = s->scantable.permutated;
+    uint8_t *perm = s->idct_scantable;
    int i = 0;

    do {
@ -1700,8 +1699,12 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
    ff_videodsp_init(&s->vdsp, 8);
    ff_vp3dsp_init(&s->vp3dsp, avctx->flags);

-    ff_init_scantable_permutation(s->idct_permutation, s->vp3dsp.idct_perm);
-    ff_init_scantable(s->idct_permutation, &s->scantable, ff_zigzag_direct);
+    for (i = 0; i < 64; i++) {
+#define T(x) (x >> 3) | ((x & 7) << 3)
+        s->idct_permutation[i] = T(i);
+        s->idct_scantable[i] = T(ff_zigzag_direct[i]);
+#undef T
+    }

    /* initialize to an impossible value which will force a recalculation
     * in the first frame decode */
--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@ -54,7 +54,58 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
    /* Inverse DCT on the rows now */
    for (i = 0; i < 8; i++) {
        /* Check for non-zero values */
-        if ( ip[0] | ip[1] | ip[2] | ip[3] | ip[4] | ip[5] | ip[6] | ip[7] ) {
+        if ( ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] |
+             ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8] ) {
+            A = M(xC1S7, ip[1 * 8]) + M(xC7S1, ip[7 * 8]);
+            B = M(xC7S1, ip[1 * 8]) - M(xC1S7, ip[7 * 8]);
+            C = M(xC3S5, ip[3 * 8]) + M(xC5S3, ip[5 * 8]);
+            D = M(xC3S5, ip[5 * 8]) - M(xC5S3, ip[3 * 8]);
+
+            Ad = M(xC4S4, (A - C));
+            Bd = M(xC4S4, (B - D));
+
+            Cd = A + C;
+            Dd = B + D;
+
+            E = M(xC4S4, (ip[0 * 8] + ip[4 * 8]));
+            F = M(xC4S4, (ip[0 * 8] - ip[4 * 8]));
+
+            G = M(xC2S6, ip[2 * 8]) + M(xC6S2, ip[6 * 8]);
+            H = M(xC6S2, ip[2 * 8]) - M(xC2S6, ip[6 * 8]);
+
+            Ed = E - G;
+            Gd = E + G;
+
+            Add = F + Ad;
+            Bdd = Bd - H;
+
+            Fd = F - Ad;
+            Hd = Bd + H;
+
+            /*  Final sequence of operations over-write original inputs. */
+            ip[0 * 8] = Gd + Cd ;
+            ip[7 * 8] = Gd - Cd ;
+
+            ip[1 * 8] = Add + Hd;
+            ip[2 * 8] = Add - Hd;
+
+            ip[3 * 8] = Ed + Dd ;
+            ip[4 * 8] = Ed - Dd ;
+
+            ip[5 * 8] = Fd + Bdd;
+            ip[6 * 8] = Fd - Bdd;
+        }
+
+        ip += 1;            /* next row */
+    }
+
+    ip = input;
+
+    for ( i = 0; i < 8; i++) {
+        /* Check for non-zero values (bitwise or faster than ||) */
+        if ( ip[1] | ip[2] | ip[3] |
+             ip[4] | ip[5] | ip[6] | ip[7] ) {
+
            A = M(xC1S7, ip[1]) + M(xC7S1, ip[7]);
            B = M(xC7S1, ip[1]) - M(xC1S7, ip[7]);
            C = M(xC3S5, ip[3]) + M(xC5S3, ip[5]);
@ -66,8 +117,13 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
            Cd = A + C;
            Dd = B + D;

-            E = M(xC4S4, (ip[0] + ip[4]));
-            F = M(xC4S4, (ip[0] - ip[4]));
+            E = M(xC4S4, (ip[0] + ip[4])) + 8;
+            F = M(xC4S4, (ip[0] - ip[4])) + 8;
+
+            if(type==1){  //HACK
+                E += 16*128;
+                F += 16*128;
+            }

            G = M(xC2S6, ip[2]) + M(xC6S2, ip[6]);
            H = M(xC6S2, ip[2]) - M(xC2S6, ip[6]);
@ -82,74 +138,7 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
            Hd = Bd + H;

            /* Final sequence of operations over-write original inputs. */
-            ip[0] = Gd + Cd ;
-            ip[7] = Gd - Cd ;
-
-            ip[1] = Add + Hd;
-            ip[2] = Add - Hd;
-
-            ip[3] = Ed + Dd ;
-            ip[4] = Ed - Dd ;
-
-            ip[5] = Fd + Bdd;
-            ip[6] = Fd - Bdd;
-        }
-
-        ip += 8;            /* next row */
-    }
-
-    ip = input;
-
-    for ( i = 0; i < 8; i++) {
-        /* Check for non-zero values (bitwise or faster than ||) */
-        if ( ip[1 * 8] | ip[2 * 8] | ip[3 * 8] |
-             ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8] ) {
-
-            A = M(xC1S7, ip[1*8]) + M(xC7S1, ip[7*8]);
-            B = M(xC7S1, ip[1*8]) - M(xC1S7, ip[7*8]);
-            C = M(xC3S5, ip[3*8]) + M(xC5S3, ip[5*8]);
-            D = M(xC3S5, ip[5*8]) - M(xC5S3, ip[3*8]);
-
-            Ad = M(xC4S4, (A - C));
-            Bd = M(xC4S4, (B - D));
-
-            Cd = A + C;
-            Dd = B + D;
-
-            E = M(xC4S4, (ip[0*8] + ip[4*8])) + 8;
-            F = M(xC4S4, (ip[0*8] - ip[4*8])) + 8;
-
-            if(type==1){  //HACK
-                E += 16*128;
-                F += 16*128;
-            }
-
-            G = M(xC2S6, ip[2*8]) + M(xC6S2, ip[6*8]);
-            H = M(xC6S2, ip[2*8]) - M(xC2S6, ip[6*8]);
-
-            Ed = E - G;
-            Gd = E + G;
-
-            Add = F + Ad;
-            Bdd = Bd - H;
-
-            Fd = F - Ad;
-            Hd = Bd + H;
-
-            /* Final sequence of operations over-write original inputs. */
-            if(type==0){
-                ip[0*8] = (Gd + Cd )  >> 4;
-                ip[7*8] = (Gd - Cd )  >> 4;
-
-                ip[1*8] = (Add + Hd ) >> 4;
-                ip[2*8] = (Add - Hd ) >> 4;
-
-                ip[3*8] = (Ed + Dd )  >> 4;
-                ip[4*8] = (Ed - Dd )  >> 4;
-
-                ip[5*8] = (Fd + Bdd ) >> 4;
-                ip[6*8] = (Fd - Bdd ) >> 4;
-            }else if(type==1){
+            if (type == 1) {
                dst[0*stride] = av_clip_uint8((Gd + Cd )  >> 4);
                dst[7*stride] = av_clip_uint8((Gd - Cd )  >> 4);

@ -176,16 +165,7 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
            }

        } else {
-            if(type==0){
-                ip[0*8] =
-                ip[1*8] =
-                ip[2*8] =
-                ip[3*8] =
-                ip[4*8] =
-                ip[5*8] =
-                ip[6*8] =
-                ip[7*8] = ((xC4S4 * ip[0*8] + (IdctAdjustBeforeShift<<16))>>20);
-            }else if(type==1){
+            if (type == 1) {
                dst[0*stride]=
                dst[1*stride]=
                dst[2*stride]=
@ -193,10 +173,10 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
                dst[4*stride]=
                dst[5*stride]=
                dst[6*stride]=
-                dst[7*stride]= av_clip_uint8(128 + ((xC4S4 * ip[0*8] + (IdctAdjustBeforeShift<<16))>>20));
+                dst[7*stride]= av_clip_uint8(128 + ((xC4S4 * ip[0] + (IdctAdjustBeforeShift<<16))>>20));
            }else{
-                if(ip[0*8]){
-                    int v= ((xC4S4 * ip[0*8] + (IdctAdjustBeforeShift<<16))>>20);
+                if(ip[0]){
+                    int v= ((xC4S4 * ip[0] + (IdctAdjustBeforeShift<<16))>>20);
                    dst[0*stride] = av_clip_uint8(dst[0*stride] + v);
                    dst[1*stride] = av_clip_uint8(dst[1*stride] + v);
                    dst[2*stride] = av_clip_uint8(dst[2*stride] + v);
@ -209,7 +189,7 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
            }
        }

-        ip++;            /* next column */
+        ip += 8;            /* next column */
        dst++;
    }
 }
@ -307,8 +287,6 @@ av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags)
    c->v_loop_filter = vp3_v_loop_filter_c;
    c->h_loop_filter = vp3_h_loop_filter_c;

-    c->idct_perm = FF_NO_IDCT_PERM;
-
    if (ARCH_ARM)
        ff_vp3dsp_init_arm(c, flags);
    if (ARCH_BFIN)
--- a/libavcodec/vp3dsp.h
+++ b/libavcodec/vp3dsp.h
@ -43,8 +43,6 @@ typedef struct VP3DSPContext {
    void (*idct_dc_add)(uint8_t *dest, int line_size, int16_t *block);
    void (*v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
    void (*h_loop_filter)(uint8_t *src, int stride, int *bounding_values);
-
-    int idct_perm;
 } VP3DSPContext;

 void ff_vp3dsp_init(VP3DSPContext *c, int flags);
--- a/libavcodec/vp5.c
+++ b/libavcodec/vp5.c
@ -173,7 +173,7 @@ static void vp5_parse_coeff(VP56Context *s)
 {
    VP56RangeCoder *c = &s->c;
    VP56Model *model = s->modelp;
-    uint8_t *permute = s->scantable.permutated;
+    uint8_t *permute = s->idct_scantable;
    uint8_t *model1, *model2;
    int coeff, sign, coeff_idx;
    int b, i, cg, idx, ctx, ctx_last;
--- a/libavcodec/vp56.c
+++ b/libavcodec/vp56.c
@ -263,7 +263,7 @@ static VP56mb vp56_decode_mv(VP56Context *s, int row, int col)

 static void vp56_add_predictors_dc(VP56Context *s, VP56Frame ref_frame)
 {
-    int idx = s->scantable.permutated[0];
+    int idx = s->idct_scantable[0];
    int b;

    for (b=0; b<6; b++) {
@ -661,8 +661,11 @@ av_cold int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
    ff_videodsp_init(&s->vdsp, 8);
    ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
    ff_vp56dsp_init(&s->vp56dsp, avctx->codec->id);
-    ff_init_scantable_permutation(s->dsp.idct_permutation, s->vp3dsp.idct_perm);
-    ff_init_scantable(s->dsp.idct_permutation, &s->scantable,ff_zigzag_direct);
+    for (i = 0; i < 64; i++) {
+#define T(x) (x >> 3) | ((x & 7) << 3)
+        s->idct_scantable[i] = T(ff_zigzag_direct[i]);
+#undef T
+    }

    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
        s->frames[i] = av_frame_alloc();
--- a/libavcodec/vp56.h
+++ b/libavcodec/vp56.h
@ -100,7 +100,7 @@ struct vp56_context {
    VideoDSPContext vdsp;
    VP3DSPContext vp3dsp;
    VP56DSPContext vp56dsp;
-    ScanTable scantable;
+    uint8_t idct_scantable[64];
    AVFrame *frames[4];
    uint8_t *edge_emu_buffer_alloc;
    uint8_t *edge_emu_buffer;
--- a/libavcodec/vp6.c
+++ b/libavcodec/vp6.c
@ -368,7 +368,7 @@ static unsigned vp6_get_nb_null(VP56Context *s)
 static void vp6_parse_coeff_huffman(VP56Context *s)
 {
    VP56Model *model = s->modelp;
-    uint8_t *permute = s->scantable.permutated;
+    uint8_t *permute = s->idct_scantable;
    VLC *vlc_coeff;
    int coeff, sign, coeff_idx;
    int b, cg, idx;
@ -428,7 +428,7 @@ static void vp6_parse_coeff(VP56Context *s)
 {
    VP56RangeCoder *c = s->ccp;
    VP56Model *model = s->modelp;
-    uint8_t *permute = s->scantable.permutated;
+    uint8_t *permute = s->idct_scantable;
    uint8_t *model1, *model2, *model3;
    int coeff, sign, coeff_idx;
    int b, i, cg, idx, ctx;
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@ -501,22 +501,22 @@ cglobal vp3_h_loop_filter, 3, 4

    ; at this point, function has completed dequantization + dezigzag +
    ; partial transposition; now do the idct itself
-%define I(x) [%1+16* x     ]
-%define J(x) [%1+16*(x-4)+8]
-    RowIDCT
-    Transpose
-
-%define I(x) [%1+16* x   +64]
-%define J(x) [%1+16*(x-4)+72]
-    RowIDCT
-    Transpose
-
 %define I(x) [%1+16*x]
 %define J(x) [%1+16*x]
-    ColumnIDCT
+    RowIDCT
+    Transpose

 %define I(x) [%1+16*x+8]
 %define J(x) [%1+16*x+8]
+    RowIDCT
+    Transpose
+
+%define I(x) [%1+16* x]
+%define J(x) [%1+16*(x-4)+8]
+    ColumnIDCT
+
+%define I(x) [%1+16* x   +64]
+%define J(x) [%1+16*(x-4)+72]
    ColumnIDCT
 %endif ; mmsize == 16/8
 %endmacro
@ -534,10 +534,17 @@ cglobal vp3_idct_put, 3, 4, 9
    mova          m1, [r2+mmsize*2+%%i]
    mova          m2, [r2+mmsize*4+%%i]
    mova          m3, [r2+mmsize*6+%%i]
+%if mmsize == 8
+    packsswb      m0, [r2+mmsize*8+%%i]
+    packsswb      m1, [r2+mmsize*10+%%i]
+    packsswb      m2, [r2+mmsize*12+%%i]
+    packsswb      m3, [r2+mmsize*14+%%i]
+%else
    packsswb      m0, [r2+mmsize*1+%%i]
    packsswb      m1, [r2+mmsize*3+%%i]
    packsswb      m2, [r2+mmsize*5+%%i]
    packsswb      m3, [r2+mmsize*7+%%i]
+%endif
    paddb         m0, m4
    paddb         m1, m4
    paddb         m2, m4
@ -561,7 +568,7 @@ cglobal vp3_idct_put, 3, 4, 9
    movq   [r0+r1*2], m3
    movhps [r0+r3  ], m3
 %endif
-%assign %%i %%i+64
+%assign %%i %%i+8
 %endrep

    pxor          m0, m0
@ -575,47 +582,81 @@ cglobal vp3_idct_put, 3, 4, 9
 cglobal vp3_idct_add, 3, 4, 9
    VP3_IDCT      r2

-    mov           r3, 4
-    pxor          m4, m4
    movsxdifnidn  r1, r1d
-.loop:
+    lea           r3, [r1*3]
+    pxor          m4, m4
+%if mmsize == 16
+%assign %%i 0
+%rep 2
    movq          m0, [r0]
    movq          m1, [r0+r1]
-%if mmsize == 8
-    mova          m2, m0
-    mova          m3, m1
-%endif
+    movq          m2, [r0+r1*2]
+    movq          m3, [r0+r3]
    punpcklbw     m0, m4
    punpcklbw     m1, m4
-%if mmsize == 8
-    punpckhbw     m2, m4
-    punpckhbw     m3, m4
-%endif
-    paddsw        m0, [r2+ 0]
-    paddsw        m1, [r2+16]
-%if mmsize == 8
-    paddsw        m2, [r2+ 8]
-    paddsw        m3, [r2+24]
-    packuswb      m0, m2
-    packuswb      m1, m3
-%else ; mmsize == 16
+    punpcklbw     m2, m4
+    punpcklbw     m3, m4
+    paddsw        m0, [r2+ 0+%%i]
+    paddsw        m1, [r2+16+%%i]
+    paddsw        m2, [r2+32+%%i]
+    paddsw        m3, [r2+48+%%i]
    packuswb      m0, m1
-%endif
+    packuswb      m2, m3
    movq   [r0     ], m0
-%if mmsize == 8
-    movq     [r0+r1], m1
-%else ; mmsize == 16
-    movhps   [r0+r1], m0
+    movhps [r0+r1  ], m0
+    movq   [r0+r1*2], m2
+    movhps [r0+r3  ], m2
+%if %%i == 0
+    lea           r0, [r0+r1*4]
 %endif
-    lea           r0, [r0+r1*2]
-%assign %%offset 0
-%rep 32/mmsize
-    mova [r2+%%offset], m4
-%assign %%offset %%offset+mmsize
+%assign %%i %%i+64
+%endrep
+%else
+%assign %%i 0
+%rep 2
+    movq          m0, [r0]
+    movq          m1, [r0+r1]
+    movq          m2, [r0+r1*2]
+    movq          m3, [r0+r3]
+    movq          m5, m0
+    movq          m6, m1
+    movq          m7, m2
+    punpcklbw     m0, m4
+    punpcklbw     m1, m4
+    punpcklbw     m2, m4
+    punpckhbw     m5, m4
+    punpckhbw     m6, m4
+    punpckhbw     m7, m4
+    paddsw        m0, [r2+ 0+%%i]
+    paddsw        m1, [r2+16+%%i]
+    paddsw        m2, [r2+32+%%i]
+    paddsw        m5, [r2+64+%%i]
+    paddsw        m6, [r2+80+%%i]
+    paddsw        m7, [r2+96+%%i]
+    packuswb      m0, m5
+    movq          m5, m3
+    punpcklbw     m3, m4
+    punpckhbw     m5, m4
+    packuswb      m1, m6
+    paddsw        m3, [r2+48+%%i]
+    paddsw        m5, [r2+112+%%i]
+    packuswb      m2, m7
+    packuswb      m3, m5
+    movq   [r0     ], m0
+    movq   [r0+r1  ], m1
+    movq   [r0+r1*2], m2
+    movq   [r0+r3  ], m3
+%if %%i == 0
+    lea           r0, [r0+r1*4]
+%endif
+%assign %%i %%i+8
+%endrep
+%endif
+%assign %%i 0
+%rep 128/mmsize
+    mova    [r2+%%i], m4
+%assign %%i %%i+mmsize
 %endrep
-    add           r2, 32
-    dec           r3
-    jg .loop
    RET
 %endmacro

--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@ -48,7 +48,6 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
    if (EXTERNAL_MMX(cpuflags)) {
        c->idct_put  = ff_vp3_idct_put_mmx;
        c->idct_add  = ff_vp3_idct_add_mmx;
-        c->idct_perm = FF_PARTTRANS_IDCT_PERM;
    }
 #endif

@ -64,6 +63,5 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
    if (EXTERNAL_SSE2(cpuflags)) {
        c->idct_put  = ff_vp3_idct_put_sse2;
        c->idct_add  = ff_vp3_idct_add_sse2;
-        c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
    }
 }