avcodec/hevc: new idct + asm

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
2025-08-10 06:10:52 +02:00 · 2014-06-13 13:29:17 +02:00
parent fa0d0fb42e
commit 92cccb7bcd
8 changed files with 398 additions and 158 deletions
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@ -1388,8 +1388,21 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
            s->hevcdsp.transform_skip(dst, coeffs, stride);
        else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2)
            s->hevcdsp.transform_4x4_luma_add(dst, coeffs, stride);
-        else
+        else {
-            s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+            int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
            if (max_xy == 0)
                s->hevcdsp.transform_dc_add[log2_trafo_size-2](dst, coeffs, stride);
            else {
                int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
                if (max_xy < 4)
                    col_limit = FFMIN(4, col_limit);
                else if (max_xy < 8)
                    col_limit = FFMIN(8, col_limit);
                else if (max_xy < 12)
                    col_limit = FFMIN(24, col_limit);
                s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride, col_limit);
            }
        }
    }
 }
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -202,6 +202,11 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
    hevcdsp->transform_add[2]       = FUNC(transform_16x16_add, depth);     \
    hevcdsp->transform_add[3]       = FUNC(transform_32x32_add, depth);     \
                                                                            \
    hevcdsp->transform_dc_add[0]    = FUNC(transform_4x4_dc_add, depth);    \
    hevcdsp->transform_dc_add[1]    = FUNC(transform_8x8_dc_add, depth);    \
    hevcdsp->transform_dc_add[2]    = FUNC(transform_16x16_dc_add, depth);  \
    hevcdsp->transform_dc_add[3]    = FUNC(transform_32x32_dc_add, depth);  \
                                                                            \
    hevcdsp->sao_band_filter[0] = FUNC(sao_band_filter_0, depth);           \
    hevcdsp->sao_band_filter[1] = FUNC(sao_band_filter_1, depth);           \
    hevcdsp->sao_band_filter[2] = FUNC(sao_band_filter_2, depth);           \
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -50,7 +50,9 @@ typedef struct HEVCDSPContext {
    void (*transform_skip)(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
    void (*transform_4x4_luma_add)(uint8_t *dst, int16_t *coeffs,
                                   ptrdiff_t stride);
-    void (*transform_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+    void (*transform_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t _stride, int col_limit);
    void (*transform_dc_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
    void (*sao_band_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                               struct SAOParams *sao, int *borders,
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -178,16 +178,12 @@ static void FUNC(transform_4x4_luma_add)(uint8_t *_dst, int16_t *coeffs,
 #undef TR_4x4_LUMA
-#define TR_4(dst, src, dstep, sstep, assign)                            \
+#define TR_4(dst, src, dstep, sstep, assign, end)                              \
    do {                                                                       \
-        const int e0 = transform[8 * 0][0] * src[0 * sstep] +           \
+        const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep];              \
-                       transform[8 * 2][0] * src[2 * sstep];            \
+        const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep];              \
-        const int e1 = transform[8 * 0][1] * src[0 * sstep] +           \
+        const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep];              \
-                       transform[8 * 2][1] * src[2 * sstep];            \
+        const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep];              \
        const int o0 = transform[8 * 1][0] * src[1 * sstep] +           \
                       transform[8 * 3][0] * src[3 * sstep];            \
        const int o1 = transform[8 * 1][1] * src[1 * sstep] +           \
                       transform[8 * 3][1] * src[3 * sstep];            \
                                                                               \
        assign(dst[0 * dstep], e0 + o0);                                       \
        assign(dst[1 * dstep], e1 + o1);                                       \
@@ -195,40 +191,15 @@ static void FUNC(transform_4x4_luma_add)(uint8_t *_dst, int16_t *coeffs,
        assign(dst[3 * dstep], e0 - o0);                                       \
    } while (0)
-static void FUNC(transform_4x4_add)(uint8_t *_dst, int16_t *coeffs,
+#define TR_8(dst, src, dstep, sstep, assign, end)                              \
                                    ptrdiff_t stride)
 {
    int i;
    pixel *dst   = (pixel *)_dst;
    int shift    = 7;
    int add      = 1 << (shift - 1);
    int16_t *src = coeffs;
    stride /= sizeof(pixel);
    for (i = 0; i < 4; i++) {
        TR_4(src, src, 4, 4, SCALE);
        src++;
    }
    shift = 20 - BIT_DEPTH;
    add   = 1 << (shift - 1);
    for (i = 0; i < 4; i++) {
        TR_4(dst, coeffs, 1, 1, ADD_AND_SCALE);
        coeffs += 4;
        dst    += stride;
    }
 }
 #define TR_8(dst, src, dstep, sstep, assign)                      \
    do {                                                                       \
        int i, j;                                                              \
        int e_8[4];                                                            \
        int o_8[4] = { 0 };                                                    \
        for (i = 0; i < 4; i++)                                                \
-            for (j = 1; j < 8; j += 2)                            \
+            for (j = 1; j < end; j += 2)                                       \
                o_8[i] += transform[4 * j][i] * src[j * sstep];                \
-        TR_4(e_8, src, 1, 2 * sstep, SET);                        \
+        TR_4(e_8, src, 1, 2 * sstep, SET, 4);                                  \
                                                                               \
        for (i = 0; i < 4; i++) {                                              \
            assign(dst[i * dstep], e_8[i] + o_8[i]);                           \
@@ -236,15 +207,15 @@ static void FUNC(transform_4x4_add)(uint8_t *_dst, int16_t *coeffs,
        }                                                                      \
    } while (0)
-#define TR_16(dst, src, dstep, sstep, assign)                     \
+#define TR_16(dst, src, dstep, sstep, assign, end)                             \
    do {                                                                       \
        int i, j;                                                              \
        int e_16[8];                                                           \
        int o_16[8] = { 0 };                                                   \
        for (i = 0; i < 8; i++)                                                \
-            for (j = 1; j < 16; j += 2)                           \
+            for (j = 1; j < end; j += 2)                                       \
                o_16[i] += transform[2 * j][i] * src[j * sstep];               \
-        TR_8(e_16, src, 1, 2 * sstep, SET);                       \
+        TR_8(e_16, src, 1, 2 * sstep, SET, 8);                                 \
                                                                               \
        for (i = 0; i < 8; i++) {                                              \
            assign(dst[i * dstep], e_16[i] + o_16[i]);                         \
@@ -252,15 +223,15 @@ static void FUNC(transform_4x4_add)(uint8_t *_dst, int16_t *coeffs,
        }                                                                      \
    } while (0)
-#define TR_32(dst, src, dstep, sstep, assign)                     \
+#define TR_32(dst, src, dstep, sstep, assign, end)                             \
    do {                                                                       \
        int i, j;                                                              \
        int e_32[16];                                                          \
        int o_32[16] = { 0 };                                                  \
        for (i = 0; i < 16; i++)                                               \
-            for (j = 1; j < 32; j += 2)                           \
+            for (j = 1; j < end; j += 2)                                       \
                o_32[i] += transform[j][i] * src[j * sstep];                   \
-        TR_16(e_32, src, 1, 2 * sstep, SET);                      \
+        TR_16(e_32, src, 1, 2 * sstep, SET, end/2);                            \
                                                                               \
        for (i = 0; i < 16; i++) {                                             \
            assign(dst[i * dstep], e_32[i] + o_32[i]);                         \
@@ -268,82 +239,61 @@ static void FUNC(transform_4x4_add)(uint8_t *_dst, int16_t *coeffs,
        }                                                                      \
    } while (0)
-
+#define TRANSFORM_ADD(H)                                                       \
-
+static void FUNC(transform_##H ##x ##H ##_add)(                                \
-static void FUNC(transform_8x8_add)(uint8_t *_dst, int16_t *coeffs,
+    uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride, int col_limit) {        \
-                                    ptrdiff_t stride)
+    int i;                                                                     \
-{
+    pixel    *dst    = (pixel *)_dst;                                          \
-    int i;
+    int      stride  = _stride/sizeof(pixel);                                  \
-    pixel *dst   = (pixel *)_dst;
+    int      shift   = 7;                                                      \
-    int shift    = 7;
+    int      add     = 1 << (shift - 1);                                       \
-    int add      = 1 << (shift - 1);
+    int16_t *src     = coeffs;                                                 \
-    int16_t *src = coeffs;
+    int      limit   = FFMIN(col_limit + 4, H);                                \
-
+                                                                               \
-    stride /= sizeof(pixel);
+    for (i = 0; i < H; i++) {                                                  \
-
+        TR_ ## H(src, src, H, H, SCALE, limit);                                \
-    for (i = 0; i < 8; i++) {
+        if (limit < H && i%4 == 0 && !!i)                                      \
-        TR_8(src, src, 8, 8, SCALE);
+            limit -= 4;                                                        \
-        src++;
+        src++;                                                                 \
    }                                                                          \
    limit   = FFMIN(col_limit, H);                                             \
                                                                               \
    shift   = 20 - BIT_DEPTH;                                                  \
    add     = 1 << (shift - 1);                                                \
    for (i = 0; i < H; i++) {                                                  \
        TR_ ## H(dst, coeffs, 1, 1, ADD_AND_SCALE, limit);                     \
        coeffs += H;                                                           \
        dst    += stride;                                                      \
    }                                                                          \
 }
-    shift = 20 - BIT_DEPTH;
+#define TRANSFORM_DC_ADD(H)                                                    \
-    add   = 1 << (shift - 1);
+static void FUNC(transform_##H ##x ##H ##_dc_add)(                             \
-    for (i = 0; i < 8; i++) {
+    uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride) {                       \
-        TR_8(dst, coeffs, 1, 1, ADD_AND_SCALE);
+    int i, j;                                                                  \
-        coeffs += 8;
+    pixel    *dst    = (pixel *)_dst;                                          \
-        dst    += stride;
+    int      stride  = _stride/sizeof(pixel);                                  \
-    }
+    int      shift   = 14 - BIT_DEPTH;                                         \
    int      add     = 1 << (shift - 1);                                       \
    int      coeff   = (((coeffs[0] + 1) >> 1) + add) >> shift;                \
                                                                               \
    for (j = 0; j < H; j++) {                                                  \
        for (i = 0; i < H; i++) {                                              \
            dst[i+j*stride] = av_clip_pixel(dst[i+j*stride] + coeff);          \
        }                                                                      \
    }                                                                          \
 }
-static void FUNC(transform_16x16_add)(uint8_t *_dst, int16_t *coeffs,
+TRANSFORM_ADD( 4)
-                                      ptrdiff_t stride)
+TRANSFORM_ADD( 8)
-{
+TRANSFORM_ADD(16)
-    int i;
+TRANSFORM_ADD(32)
    pixel *dst   = (pixel *)_dst;
    int shift    = 7;
    int add      = 1 << (shift - 1);
    int16_t *src = coeffs;
-    stride /= sizeof(pixel);
+TRANSFORM_DC_ADD( 4)
 TRANSFORM_DC_ADD( 8)
 TRANSFORM_DC_ADD(16)
 TRANSFORM_DC_ADD(32)
    for (i = 0; i < 16; i++) {
        TR_16(src, src, 16, 16, SCALE);
        src++;
    }
    shift = 20 - BIT_DEPTH;
    add   = 1 << (shift - 1);
    for (i = 0; i < 16; i++) {
        TR_16(dst, coeffs, 1, 1, ADD_AND_SCALE);
        coeffs += 16;
        dst    += stride;
    }
 }
 static void FUNC(transform_32x32_add)(uint8_t *_dst, int16_t *coeffs,
                                      ptrdiff_t stride)
 {
    int i;
    pixel *dst   = (pixel *)_dst;
    int shift    = 7;
    int add      = 1 << (shift - 1);
    int16_t *src = coeffs;
    stride /= sizeof(pixel);
    for (i = 0; i < 32; i++) {
        TR_32(src, src, 32, 32, SCALE);
        src++;
    }
    src   = coeffs;
    shift = 20 - BIT_DEPTH;
    add   = 1 << (shift - 1);
    for (i = 0; i < 32; i++) {
        TR_32(dst, coeffs, 1, 1, ADD_AND_SCALE);
        coeffs += 32;
        dst    += stride;
    }
 }
 static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
                                  ptrdiff_t stride, SAOParams *sao,
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -92,7 +92,8 @@ YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_8bit.o          \
                                          x86/fpel.o                    \
                                          x86/qpel.o
 YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_mc.o                 \
-                                          x86/hevc_deblock.o
+                                          x86/hevc_deblock.o            \
                                          x86/hevc_idct.o
 YASM-OBJS-$(CONFIG_HPELDSP)            += x86/fpel.o                    \
                                          x86/hpeldsp.o
 YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o
--- a/libavcodec/x86/hevc_idct.asm
+++ b/libavcodec/x86/hevc_idct.asm
@@ -0,0 +1,180 @@
 ; /*
 ; * Provide SSE & MMX idct functions for HEVC decoding
 ; * Copyright (c) 2014 Pierre-Edouard LEPERE
 ; *
 ; * This file is part of FFmpeg.
 ; *
 ; * FFmpeg is free software; you can redistribute it and/or
 ; * modify it under the terms of the GNU Lesser General Public
 ; * License as published by the Free Software Foundation; either
 ; * version 2.1 of the License, or (at your option) any later version.
 ; *
 ; * FFmpeg is distributed in the hope that it will be useful,
 ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
 ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ; * Lesser General Public License for more details.
 ; *
 ; * You should have received a copy of the GNU Lesser General Public
 ; * License along with FFmpeg; if not, write to the Free Software
 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ; */
 %include "libavutil/x86/x86util.asm"
 SECTION_RODATA
 ; Largest 10-bit sample value ((1 << 10) - 1) replicated over 8 words.
 ; The 10-bit DC-add functions load it into m6 and hand it to CLIPW as the
 ; upper clipping bound.
 max_pixels_10:          times 8  dw ((1 << 10)-1)
 ; NOTE(review): no code visible in this file references dc_add_10; the 10-bit
 ; functions add their rounding constant as an immediate instead. Confirm
 ; whether this table is still needed.
 dc_add_10:              times 4 dd ((1 << 14-10) + 1)
 SECTION .text
 ;the idct_dc_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
 ; DC_ADD_INIT dc, stride
 ;   %1: general-purpose register holding the DC coefficient on entry;
 ;       it is overwritten with 3*stride before the macro ends.
 ;   %2: register holding the line stride.
 ; Rounds/scales the DC coefficient for 8-bit output and prepares two byte
 ; vectors so that a signed DC value can be applied to unsigned pixels with
 ; saturating byte arithmetic:
 ;   m0 = max( dc, 0) in every byte (amount to add)
 ;   m1 = max(-dc, 0) in every byte (amount to subtract)
 %macro DC_ADD_INIT 2
    add              %1w, ((1 << 14-8) + 1) ; rounding term; NASM evaluates 14-8 before the shift
    sar              %1w, (15-8)            ; arithmetic shift keeps the sign of the DC value
    movd              m0, %1
    lea               %1, [%2*3]            ; %1 now holds 3*stride for the fourth row of each group
    SPLATW            m0, m0, 0             ; x86util macro; presumably broadcasts word 0 to all words
    pxor              m1, m1
    psubw             m1, m0                ; m1 = -dc in every word
    packuswb          m0, m0                ; unsigned saturation: negative words become 0
    packuswb          m1, m1                ; likewise, so only one of m0/m1 is non-zero
 %endmacro
 ; DC_ADD_OP mov_op, dst, stride, stride3
 ;   %1: load/store mnemonic used for every row (callers pass movh or mova)
 ;   %2: pointer to the first of four rows
 ;   %3: line stride
 ;   %4: 3*stride (address of the fourth row)
 ; Applies the DC value prepared by DC_ADD_INIT to four consecutive rows:
 ; each row is loaded, m0 is added and m1 subtracted with unsigned byte
 ; saturation, and the result is stored back in place.
 %macro DC_ADD_OP 4
    %1                m2, [%2     ]
    %1                m3, [%2+%3  ]
    %1                m4, [%2+%3*2]
    %1                m5, [%2+%4  ]
    paddusb           m2, m0
    paddusb           m3, m0
    paddusb           m4, m0
    paddusb           m5, m0
    psubusb           m2, m1
    psubusb           m3, m1
    psubusb           m4, m1
    psubusb           m5, m1
    %1         [%2     ], m2
    %1         [%2+%3  ], m3
    %1         [%2+%3*2], m4
    %1         [%2+%4  ], m5
 %endmacro
 INIT_MMX mmxext
 ; void ff_hevc_idct4_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 ; Two register layouts of the 8-bit 4x4 and 8x8 DC-add entry points follow.
 ; Each reads the DC coefficient from coeffs[0] with sign extension, runs
 ; DC_ADD_INIT once and then DC_ADD_OP once per group of four rows.
 %if ARCH_X86_64
 ; 64-bit layout: r0 = dst, r1 = coeffs, r2 = stride; r3 holds the DC value and,
 ; after DC_ADD_INIT, 3*stride.
 cglobal hevc_idct4_dc_add_8, 3, 4, 0
    movsx             r3, word [r1]
    DC_ADD_INIT       r3, r2
    DC_ADD_OP       movh, r0, r2, r3
    RET
 ; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 cglobal hevc_idct8_dc_add_8, 3, 4, 0
    movsx             r3, word [r1]
    DC_ADD_INIT       r3, r2
    DC_ADD_OP       mova, r0, r2, r3
    lea               r0, [r0+r2*4]         ; advance dst by four rows
    DC_ADD_OP       mova, r0, r2, r3
    RET
 %else
 ; Other layout: only three registers are used. r2 receives the DC value, then
 ; r1 (no longer needed as the coeffs pointer) is reloaded with the stride from
 ; r2m (x86inc name, presumably the memory slot of argument 2).
 ; void ff_hevc_idct4_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 cglobal hevc_idct4_dc_add_8, 2, 3, 0
    movsx             r2, word [r1]
    mov               r1, r2m
    DC_ADD_INIT       r2, r1
    DC_ADD_OP       movh, r0, r1, r2
    RET
 ; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 cglobal hevc_idct8_dc_add_8, 2, 3, 0
    movsx             r2, word [r1]
    mov               r1, r2m
    DC_ADD_INIT       r2, r1
    DC_ADD_OP       mova, r0, r1, r2
    lea               r0, [r0+r1*4]         ; advance dst by four rows
    DC_ADD_OP       mova, r0, r1, r2
    RET
 %endif
 INIT_XMM sse2
 ; void ff_hevc_idct16_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 ; 8-bit 16x16 DC add, assembled under INIT_XMM sse2. The DC coefficient is
 ; read from coeffs[0]; DC_ADD_OP is then applied to four groups of four rows,
 ; advancing dst by 4*stride between groups.
 cglobal hevc_idct16_dc_add_8, 3, 4, 0
    movsx             r3, word [r1]
    DC_ADD_INIT       r3, r2
    DC_ADD_OP       mova, r0, r2, r3
    lea               r0, [r0+r2*4]
    DC_ADD_OP       mova, r0, r2, r3
    lea               r0, [r0+r2*4]
    DC_ADD_OP       mova, r0, r2, r3
    lea               r0, [r0+r2*4]
    DC_ADD_OP       mova, r0, r2, r3
    RET
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_idct4_dc_add_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 ;-----------------------------------------------------------------------------
 ; IDCT_DC_ADD_OP_10 dst, stride, stride3
 ;   %1: pointer to the first of four rows
 ;   %2: line stride
 ;   %3: 3*stride (address of the fourth row)
 ; Adds the word vector m0 to four rows of 16-bit samples and stores the
 ; result back in place after CLIPW with bounds m5 (zeroed here) and m6
 ; (set by the caller; the callers in this file load max_pixels_10).
 ; Clobbers m1-m5.
 %macro IDCT_DC_ADD_OP_10 3
    pxor              m5, m5
 %if avx_enabled
    ; three-operand form: load and add in one instruction
    paddw             m1, m0, [%1+0   ]
    paddw             m2, m0, [%1+%2  ]
    paddw             m3, m0, [%1+%2*2]
    paddw             m4, m0, [%1+%3  ]
 %else
    mova              m1, [%1+0   ]
    mova              m2, [%1+%2  ]
    mova              m3, [%1+%2*2]
    mova              m4, [%1+%3  ]
    paddw             m1, m0
    paddw             m2, m0
    paddw             m3, m0
    paddw             m4, m0
 %endif
    CLIPW             m1, m5, m6
    CLIPW             m2, m5, m6
    CLIPW             m3, m5, m6
    CLIPW             m4, m5, m6
    mova       [%1+0   ], m1
    mova       [%1+%2  ], m2
    mova       [%1+%2*2], m3
    mova       [%1+%3  ], m4
 %endmacro
 INIT_MMX mmxext
 ; 10-bit 4x4 DC add (assembled under INIT_MMX mmxext).
 ; r0 = dst, r1 = coeffs, r2 = stride. r1 is reused: first for the DC value,
 ; then for 3*stride.
 cglobal hevc_idct4_dc_add_10,3,3
    mov              r1w, [r1]              ; DC coefficient = coeffs[0]
    add              r1w, ((1 << 4) + 1)    ; rounding term
    sar              r1w, 5
    movd              m0, r1d               ; only the low word is used by SPLATW below
    lea               r1, [r2*3]
    SPLATW            m0, m0, 0
    mova              m6, [max_pixels_10]   ; upper clip bound for IDCT_DC_ADD_OP_10
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_idct8_dc_add_10_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride), opt = sse2 / avx
 ;-----------------------------------------------------------------------------
 ; IDCT8_DC_ADD
 ; Emits hevc_idct8_dc_add_10 for the currently selected instruction set; it
 ; is instantiated below for sse2 and, when HAVE_AVX_EXTERNAL is set, for avx.
 ; r0 = dst, r1 = coeffs, r2 = stride. r1 is reused: first for the DC value,
 ; then for 3*stride. Eight rows are processed as two groups of four.
 %macro IDCT8_DC_ADD 0
 cglobal hevc_idct8_dc_add_10,3,4,7
    mov              r1w, [r1]              ; DC coefficient = coeffs[0]
    add              r1w, ((1 << 4) + 1)    ; rounding term
    sar              r1w, 5
    movd              m0, r1d               ; only the low word is used by SPLATW below
    lea               r1, [r2*3]
    SPLATW            m0, m0, 0
    mova              m6, [max_pixels_10]   ; upper clip bound for IDCT_DC_ADD_OP_10
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea               r0, [r0+r2*4]         ; advance dst by four rows
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
 %endmacro
 INIT_XMM sse2
 IDCT8_DC_ADD
 %if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT8_DC_ADD
 %endif
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -28,6 +28,10 @@
 #include <stddef.h>
 #include <stdint.h>
 /*
  * Declare the prototype of one DC-only IDCT+add function.
  * The generated name is ff_hevc_idct<size>_dc_add_<bitd>_<opt>, e.g.
  * idct_dc_proto(8, 10, sse2) declares ff_hevc_idct8_dc_add_10_sse2().
  * The expansion carries no trailing semicolon; the user supplies it.
  */
 #define idct_dc_proto(size, bitd, opt) \
                void ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 #define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
 dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
 dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
@@ -119,5 +123,26 @@ QPEL_PROTOTYPES(qpel_hv, 10, sse4);
 WEIGHTING_PROTOTYPES(8, sse4);
 WEIGHTING_PROTOTYPES(10, sse4);
 ///////////////////////////////////////////////////////////////////////////////
 // IDCT
 ///////////////////////////////////////////////////////////////////////////////
 // One prototype per (block size, bit depth, instruction set) combination of
 // the DC-only inverse transform + add. In this change the 4x4/8x8/16x16 8-bit
 // and the 4x4/8x8 10-bit variants come from x86/hevc_idct.asm; the remaining
 // ones are C wrappers in x86/hevcdsp_init.c built from four smaller calls.
 idct_dc_proto(4, 8,mmxext);
 idct_dc_proto(8, 8,mmxext);
 idct_dc_proto(16,8,  sse2);
 idct_dc_proto(32,8,  sse2);
 idct_dc_proto(4, 10,mmxext);
 idct_dc_proto(8, 10,  sse2);
 idct_dc_proto(16,10,  sse2);
 idct_dc_proto(32,10,  sse2);
 idct_dc_proto(8, 10,   avx);
 idct_dc_proto(16,10,   avx);
 idct_dc_proto(32,10,   avx);
 #endif // AVCODEC_X86_HEVCDSP_H
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -49,6 +49,48 @@ LFC_FUNCS(uint8_t,  10)
 LFL_FUNCS(uint8_t,   8)
 LFL_FUNCS(uint8_t,  10)
 #if HAVE_SSE2_EXTERNAL
 /**
  * 8-bit 32x32 DC add, composed from four 16x16 SSE2 calls.
  *
  * The quadrants are processed in the order top-left, top-right,
  * bottom-left, bottom-right; every call receives the same coeffs pointer.
  *
  * @param dst    top-left byte of the 32x32 destination area
  * @param coeffs coefficient buffer, forwarded unchanged
  * @param stride line stride, forwarded unchanged
  */
 void ff_hevc_idct32_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 {
    uint8_t *const upper = dst;
    uint8_t *const lower = dst + 16 * stride;

    ff_hevc_idct16_dc_add_8_sse2(upper,      coeffs, stride);
    ff_hevc_idct16_dc_add_8_sse2(upper + 16, coeffs, stride);
    ff_hevc_idct16_dc_add_8_sse2(lower,      coeffs, stride);
    ff_hevc_idct16_dc_add_8_sse2(lower + 16, coeffs, stride);
 }
 /**
  * 10-bit 16x16 DC add, composed from four 8x8 SSE2 calls.
  *
  * dst is a byte pointer, so the right-hand quadrants start 16 bytes further
  * along the line; the lower quadrants start 8 lines down. Quadrant order is
  * top-left, top-right, bottom-left, bottom-right.
  *
  * @param dst    top-left byte of the 16x16 destination area
  * @param coeffs coefficient buffer, forwarded unchanged
  * @param stride line stride, forwarded unchanged
  */
 void ff_hevc_idct16_dc_add_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 {
    uint8_t *const upper = dst;
    uint8_t *const lower = dst + 8 * stride;

    ff_hevc_idct8_dc_add_10_sse2(upper,      coeffs, stride);
    ff_hevc_idct8_dc_add_10_sse2(upper + 16, coeffs, stride);
    ff_hevc_idct8_dc_add_10_sse2(lower,      coeffs, stride);
    ff_hevc_idct8_dc_add_10_sse2(lower + 16, coeffs, stride);
 }
 /**
  * 10-bit 32x32 DC add, composed from four 16x16 SSE2 calls.
  *
  * dst is a byte pointer, so the right-hand quadrants start 32 bytes further
  * along the line; the lower quadrants start 16 lines down. Quadrant order is
  * top-left, top-right, bottom-left, bottom-right.
  *
  * @param dst    top-left byte of the 32x32 destination area
  * @param coeffs coefficient buffer, forwarded unchanged
  * @param stride line stride, forwarded unchanged
  */
 void ff_hevc_idct32_dc_add_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 {
    uint8_t *const upper = dst;
    uint8_t *const lower = dst + 16 * stride;

    ff_hevc_idct16_dc_add_10_sse2(upper,      coeffs, stride);
    ff_hevc_idct16_dc_add_10_sse2(upper + 32, coeffs, stride);
    ff_hevc_idct16_dc_add_10_sse2(lower,      coeffs, stride);
    ff_hevc_idct16_dc_add_10_sse2(lower + 32, coeffs, stride);
 }
 #endif //HAVE_SSE2_EXTERNAL
 #if HAVE_AVX_EXTERNAL
 /**
  * 10-bit 16x16 DC add, composed from four 8x8 AVX calls.
  *
  * dst is a byte pointer, so the right-hand quadrants start 16 bytes further
  * along the line; the lower quadrants start 8 lines down. Quadrant order is
  * top-left, top-right, bottom-left, bottom-right.
  *
  * @param dst    top-left byte of the 16x16 destination area
  * @param coeffs coefficient buffer, forwarded unchanged
  * @param stride line stride, forwarded unchanged
  */
 void ff_hevc_idct16_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 {
    uint8_t *const upper = dst;
    uint8_t *const lower = dst + 8 * stride;

    ff_hevc_idct8_dc_add_10_avx(upper,      coeffs, stride);
    ff_hevc_idct8_dc_add_10_avx(upper + 16, coeffs, stride);
    ff_hevc_idct8_dc_add_10_avx(lower,      coeffs, stride);
    ff_hevc_idct8_dc_add_10_avx(lower + 16, coeffs, stride);
 }
 /**
  * 10-bit 32x32 DC add, composed from four 16x16 AVX calls.
  *
  * dst is a byte pointer, so the right-hand quadrants start 32 bytes further
  * along the line; the lower quadrants start 16 lines down. Quadrant order is
  * top-left, top-right, bottom-left, bottom-right.
  *
  * @param dst    top-left byte of the 32x32 destination area
  * @param coeffs coefficient buffer, forwarded unchanged
  * @param stride line stride, forwarded unchanged
  */
 void ff_hevc_idct32_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 {
    uint8_t *const upper = dst;
    uint8_t *const lower = dst + 16 * stride;

    ff_hevc_idct16_dc_add_10_avx(upper,      coeffs, stride);
    ff_hevc_idct16_dc_add_10_avx(upper + 32, coeffs, stride);
    ff_hevc_idct16_dc_add_10_avx(lower,      coeffs, stride);
    ff_hevc_idct16_dc_add_10_avx(lower + 32, coeffs, stride);
 }
 #endif //HAVE_AVX_EXTERNAL
 #define mc_rep_func(name, bitd, step, W, opt) \
 void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, ptrdiff_t dststride,                            \
@@ -368,9 +410,17 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
    int mm_flags = av_get_cpu_flags();
    if (bit_depth == 8) {
        if (EXTERNAL_MMXEXT(mm_flags)) {
                c->transform_dc_add[0]    =  ff_hevc_idct4_dc_add_8_mmxext;
                c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_8_mmxext;
        }
        if (EXTERNAL_SSE2(mm_flags)) {
                    c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
                    c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
                    c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_8_sse2;
                    c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_8_sse2;
        }
        if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
                    c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
@@ -387,12 +437,20 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
        }
    } else if (bit_depth == 10) {
        if (EXTERNAL_MMXEXT(mm_flags)) {
                c->transform_dc_add[0]    =  ff_hevc_idct4_dc_add_10_mmxext;
        }
        if (EXTERNAL_SSE2(mm_flags)) {
                    c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
                    c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
                    c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_10_sse2;
                    c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_sse2;
                    c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_sse2;
                }
        if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
                    c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
@@ -410,5 +468,11 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
        }
        if (EXTERNAL_AVX(mm_flags)) {
            c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_10_avx;
            c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_avx;
            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_avx;
        }
    }
 }