From 92cccb7bcd79845020ed8abebf35170c182443b2 Mon Sep 17 00:00:00 2001
From: plepere <pierre-edouard.lepere@insa-rennes.fr>
Date: Fri, 13 Jun 2014 13:29:17 +0200
Subject: [PATCH] avcodec/hevc: new idct + asm

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
---
 libavcodec/hevc_cabac.c       |  17 ++-
 libavcodec/hevcdsp.c          |   5 +
 libavcodec/hevcdsp.h          |   4 +-
 libavcodec/hevcdsp_template.c | 254 ++++++++++++++--------------------
 libavcodec/x86/Makefile       |   3 +-
 libavcodec/x86/hevc_idct.asm  | 180 ++++++++++++++++++++++++
 libavcodec/x86/hevcdsp.h      |  25 ++++
 libavcodec/x86/hevcdsp_init.c |  68 ++++++++-
 8 files changed, 398 insertions(+), 158 deletions(-)
 create mode 100644 libavcodec/x86/hevc_idct.asm

diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
index 288f88576c..b23b89c393 100644
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@ -1388,8 +1388,21 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
             s->hevcdsp.transform_skip(dst, coeffs, stride);
         else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2)
             s->hevcdsp.transform_4x4_luma_add(dst, coeffs, stride);
-        else
-            s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+        else {
+            int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+            if (max_xy == 0)
+                s->hevcdsp.transform_dc_add[log2_trafo_size-2](dst, coeffs, stride);
+            else {
+                int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+                if (max_xy < 4)
+                    col_limit = FFMIN(4, col_limit);
+                else if (max_xy < 8)
+                    col_limit = FFMIN(8, col_limit);
+                else if (max_xy < 12)
+                    col_limit = FFMIN(24, col_limit);
+                s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride, col_limit);
+            }
+        }
     }
 }
 
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index e6f56386e7..061651c3aa 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -202,6 +202,11 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
     hevcdsp->transform_add[2]       = FUNC(transform_16x16_add, depth);     \
     hevcdsp->transform_add[3]       = FUNC(transform_32x32_add, depth);     \
                                                                             \
+    hevcdsp->transform_dc_add[0]    = FUNC(transform_4x4_dc_add, depth);    \
+    hevcdsp->transform_dc_add[1]    = FUNC(transform_8x8_dc_add, depth);    \
+    hevcdsp->transform_dc_add[2]    = FUNC(transform_16x16_dc_add, depth);  \
+    hevcdsp->transform_dc_add[3]    = FUNC(transform_32x32_dc_add, depth);  \
+                                                                            \
     hevcdsp->sao_band_filter[0] = FUNC(sao_band_filter_0, depth);           \
     hevcdsp->sao_band_filter[1] = FUNC(sao_band_filter_1, depth);           \
     hevcdsp->sao_band_filter[2] = FUNC(sao_band_filter_2, depth);           \
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 6fd5469ba1..b16251604d 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -50,7 +50,9 @@ typedef struct HEVCDSPContext {
     void (*transform_skip)(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
     void (*transform_4x4_luma_add)(uint8_t *dst, int16_t *coeffs,
                                    ptrdiff_t stride);
-    void (*transform_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+    void (*transform_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t _stride, int col_limit);
+
+    void (*transform_dc_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 
     void (*sao_band_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                struct SAOParams *sao, int *borders,
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index be90c7002b..9ca1236974 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -178,172 +178,122 @@ static void FUNC(transform_4x4_luma_add)(uint8_t *_dst, int16_t *coeffs,
 
 #undef TR_4x4_LUMA
 
-#define TR_4(dst, src, dstep, sstep, assign)                            \
-    do {                                                                \
-        const int e0 = transform[8 * 0][0] * src[0 * sstep] +           \
-                       transform[8 * 2][0] * src[2 * sstep];            \
-        const int e1 = transform[8 * 0][1] * src[0 * sstep] +           \
-                       transform[8 * 2][1] * src[2 * sstep];            \
-        const int o0 = transform[8 * 1][0] * src[1 * sstep] +           \
-                       transform[8 * 3][0] * src[3 * sstep];            \
-        const int o1 = transform[8 * 1][1] * src[1 * sstep] +           \
-                       transform[8 * 3][1] * src[3 * sstep];            \
-                                                                        \
-        assign(dst[0 * dstep], e0 + o0);                                \
-        assign(dst[1 * dstep], e1 + o1);                                \
-        assign(dst[2 * dstep], e1 - o1);                                \
-        assign(dst[3 * dstep], e0 - o0);                                \
+#define TR_4(dst, src, dstep, sstep, assign, end)                              \
+    do {                                                                       \
+        const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep];              \
+        const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep];              \
+        const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep];              \
+        const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep];              \
+                                                                               \
+        assign(dst[0 * dstep], e0 + o0);                                       \
+        assign(dst[1 * dstep], e1 + o1);                                       \
+        assign(dst[2 * dstep], e1 - o1);                                       \
+        assign(dst[3 * dstep], e0 - o0);                                       \
     } while (0)
 
-static void FUNC(transform_4x4_add)(uint8_t *_dst, int16_t *coeffs,
-                                    ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
-
-    stride /= sizeof(pixel);
-
-    for (i = 0; i < 4; i++) {
-        TR_4(src, src, 4, 4, SCALE);
-        src++;
-    }
-
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 4; i++) {
-        TR_4(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 4;
-        dst    += stride;
-    }
-}
-
-#define TR_8(dst, src, dstep, sstep, assign)                      \
-    do {                                                          \
-        int i, j;                                                 \
-        int e_8[4];                                               \
-        int o_8[4] = { 0 };                                       \
-        for (i = 0; i < 4; i++)                                   \
-            for (j = 1; j < 8; j += 2)                            \
-                o_8[i] += transform[4 * j][i] * src[j * sstep];   \
-        TR_4(e_8, src, 1, 2 * sstep, SET);                        \
-                                                                  \
-        for (i = 0; i < 4; i++) {                                 \
-            assign(dst[i * dstep], e_8[i] + o_8[i]);              \
-            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);        \
-        }                                                         \
+#define TR_8(dst, src, dstep, sstep, assign, end)                              \
+    do {                                                                       \
+        int i, j;                                                              \
+        int e_8[4];                                                            \
+        int o_8[4] = { 0 };                                                    \
+        for (i = 0; i < 4; i++)                                                \
+            for (j = 1; j < end; j += 2)                                       \
+                o_8[i] += transform[4 * j][i] * src[j * sstep];                \
+        TR_4(e_8, src, 1, 2 * sstep, SET, 4);                                  \
+                                                                               \
+        for (i = 0; i < 4; i++) {                                              \
+            assign(dst[i * dstep], e_8[i] + o_8[i]);                           \
+            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);                     \
+        }                                                                      \
     } while (0)
 
-#define TR_16(dst, src, dstep, sstep, assign)                     \
-    do {                                                          \
-        int i, j;                                                 \
-        int e_16[8];                                              \
-        int o_16[8] = { 0 };                                      \
-        for (i = 0; i < 8; i++)                                   \
-            for (j = 1; j < 16; j += 2)                           \
-                o_16[i] += transform[2 * j][i] * src[j * sstep];  \
-        TR_8(e_16, src, 1, 2 * sstep, SET);                       \
-                                                                  \
-        for (i = 0; i < 8; i++) {                                 \
-            assign(dst[i * dstep], e_16[i] + o_16[i]);            \
-            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);     \
-        }                                                         \
+#define TR_16(dst, src, dstep, sstep, assign, end)                             \
+    do {                                                                       \
+        int i, j;                                                              \
+        int e_16[8];                                                           \
+        int o_16[8] = { 0 };                                                   \
+        for (i = 0; i < 8; i++)                                                \
+            for (j = 1; j < end; j += 2)                                       \
+                o_16[i] += transform[2 * j][i] * src[j * sstep];               \
+        TR_8(e_16, src, 1, 2 * sstep, SET, 8);                                 \
+                                                                               \
+        for (i = 0; i < 8; i++) {                                              \
+            assign(dst[i * dstep], e_16[i] + o_16[i]);                         \
+            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);                  \
+        }                                                                      \
     } while (0)
 
-#define TR_32(dst, src, dstep, sstep, assign)                     \
-    do {                                                          \
-        int i, j;                                                 \
-        int e_32[16];                                             \
-        int o_32[16] = { 0 };                                     \
-        for (i = 0; i < 16; i++)                                  \
-            for (j = 1; j < 32; j += 2)                           \
-                o_32[i] += transform[j][i] * src[j * sstep];      \
-        TR_16(e_32, src, 1, 2 * sstep, SET);                      \
-                                                                  \
-        for (i = 0; i < 16; i++) {                                \
-            assign(dst[i * dstep], e_32[i] + o_32[i]);            \
-            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);     \
-        }                                                         \
+#define TR_32(dst, src, dstep, sstep, assign, end)                             \
+    do {                                                                       \
+        int i, j;                                                              \
+        int e_32[16];                                                          \
+        int o_32[16] = { 0 };                                                  \
+        for (i = 0; i < 16; i++)                                               \
+            for (j = 1; j < end; j += 2)                                       \
+                o_32[i] += transform[j][i] * src[j * sstep];                   \
+        TR_16(e_32, src, 1, 2 * sstep, SET, end/2);                            \
+                                                                               \
+        for (i = 0; i < 16; i++) {                                             \
+            assign(dst[i * dstep], e_32[i] + o_32[i]);                         \
+            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);                  \
+        }                                                                      \
     } while (0)
 
-
-
-static void FUNC(transform_8x8_add)(uint8_t *_dst, int16_t *coeffs,
-                                    ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
-
-    stride /= sizeof(pixel);
-
-    for (i = 0; i < 8; i++) {
-        TR_8(src, src, 8, 8, SCALE);
-        src++;
-    }
-
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 8; i++) {
-        TR_8(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 8;
-        dst    += stride;
-    }
+#define TRANSFORM_ADD(H)                                                       \
+static void FUNC(transform_##H ##x ##H ##_add)(                                \
+    uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride, int col_limit) {        \
+    int i;                                                                     \
+    pixel    *dst    = (pixel *)_dst;                                          \
+    int      stride  = _stride/sizeof(pixel);                                  \
+    int      shift   = 7;                                                      \
+    int      add     = 1 << (shift - 1);                                       \
+    int16_t *src     = coeffs;                                                 \
+    int      limit   = FFMIN(col_limit + 4, H);                                \
+                                                                               \
+    for (i = 0; i < H; i++) {                                                  \
+        TR_ ## H(src, src, H, H, SCALE, limit);                                \
+        if (limit < H && i%4 == 0 && !!i)                                      \
+            limit -= 4;                                                        \
+        src++;                                                                 \
+    }                                                                          \
+    limit   = FFMIN(col_limit, H);                                             \
+                                                                               \
+    shift   = 20 - BIT_DEPTH;                                                  \
+    add     = 1 << (shift - 1);                                                \
+    for (i = 0; i < H; i++) {                                                  \
+        TR_ ## H(dst, coeffs, 1, 1, ADD_AND_SCALE, limit);                     \
+        coeffs += H;                                                           \
+        dst    += stride;                                                      \
+    }                                                                          \
 }
 
-static void FUNC(transform_16x16_add)(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
-
-    stride /= sizeof(pixel);
-
-    for (i = 0; i < 16; i++) {
-        TR_16(src, src, 16, 16, SCALE);
-        src++;
-    }
-
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 16; i++) {
-        TR_16(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 16;
-        dst    += stride;
-    }
+#define TRANSFORM_DC_ADD(H)                                                    \
+static void FUNC(transform_##H ##x ##H ##_dc_add)(                             \
+    uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride) {                       \
+    int i, j;                                                                  \
+    pixel    *dst    = (pixel *)_dst;                                          \
+    int      stride  = _stride/sizeof(pixel);                                  \
+    int      shift   = 14 - BIT_DEPTH;                                         \
+    int      add     = 1 << (shift - 1);                                       \
+    int      coeff   = (((coeffs[0] + 1) >> 1) + add) >> shift;                \
+                                                                               \
+    for (j = 0; j < H; j++) {                                                  \
+        for (i = 0; i < H; i++) {                                              \
+            dst[i+j*stride] = av_clip_pixel(dst[i+j*stride] + coeff);          \
+        }                                                                      \
+    }                                                                          \
 }
 
-static void FUNC(transform_32x32_add)(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
+TRANSFORM_ADD( 4)
+TRANSFORM_ADD( 8)
+TRANSFORM_ADD(16)
+TRANSFORM_ADD(32)
 
-    stride /= sizeof(pixel);
+TRANSFORM_DC_ADD( 4)
+TRANSFORM_DC_ADD( 8)
+TRANSFORM_DC_ADD(16)
+TRANSFORM_DC_ADD(32)
 
-    for (i = 0; i < 32; i++) {
-        TR_32(src, src, 32, 32, SCALE);
-        src++;
-    }
-    src   = coeffs;
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 32; i++) {
-        TR_32(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 32;
-        dst    += stride;
-    }
-}
 
 static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
                                   ptrdiff_t stride, SAOParams *sao,
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index de9dc287e1..865348d55e 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -92,7 +92,8 @@ YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_8bit.o          \
                                           x86/fpel.o                    \
                                           x86/qpel.o
 YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_mc.o                 \
-                                          x86/hevc_deblock.o
+                                          x86/hevc_deblock.o            \
+                                          x86/hevc_idct.o
 YASM-OBJS-$(CONFIG_HPELDSP)            += x86/fpel.o                    \
                                           x86/hpeldsp.o
 YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o
diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
new file mode 100644
index 0000000000..5d2f5b563c
--- /dev/null
+++ b/libavcodec/x86/hevc_idct.asm
@@ -0,0 +1,180 @@
+; /*
+; * Provide SSE & MMX idct functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+max_pixels_10:          times 8  dw ((1 << 10)-1)
+dc_add_10:              times 4 dd ((1 << 14-10) + 1)
+
+
+SECTION .text
+
+; The idct_dc_add macros and functions were largely inspired by the x264 project's code in h264_idct.asm.
+
+%macro DC_ADD_INIT 2
+    add              %1w, ((1 << 14-8) + 1)
+    sar              %1w, (15-8)
+    movd              m0, %1
+    lea               %1, [%2*3]
+    SPLATW            m0, m0, 0
+    pxor              m1, m1
+    psubw             m1, m0
+    packuswb          m0, m0
+    packuswb          m1, m1
+%endmacro
+
+%macro DC_ADD_OP 4
+    %1                m2, [%2     ]
+    %1                m3, [%2+%3  ]
+    %1                m4, [%2+%3*2]
+    %1                m5, [%2+%4  ]
+    paddusb           m2, m0
+    paddusb           m3, m0
+    paddusb           m4, m0
+    paddusb           m5, m0
+    psubusb           m2, m1
+    psubusb           m3, m1
+    psubusb           m4, m1
+    psubusb           m5, m1
+    %1         [%2     ], m2
+    %1         [%2+%3  ], m3
+    %1         [%2+%3*2], m4
+    %1         [%2+%4  ], m5
+%endmacro
+
+INIT_MMX mmxext
+; void ff_hevc_idct4_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+%if ARCH_X86_64
+cglobal hevc_idct4_dc_add_8, 3, 4, 0
+    movsx             r3, word [r1]
+    DC_ADD_INIT       r3, r2
+    DC_ADD_OP       movh, r0, r2, r3
+    RET
+
+; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_idct8_dc_add_8, 3, 4, 0
+    movsx             r3, word [r1]
+    DC_ADD_INIT       r3, r2
+    DC_ADD_OP       mova, r0, r2, r3
+    lea               r0, [r0+r2*4]
+    DC_ADD_OP       mova, r0, r2, r3
+    RET
+%else
+; void ff_hevc_idct4_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_idct4_dc_add_8, 2, 3, 0
+    movsx             r2, word [r1]
+    mov               r1, r2m
+    DC_ADD_INIT       r2, r1
+    DC_ADD_OP       movh, r0, r1, r2
+    RET
+
+; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_idct8_dc_add_8, 2, 3, 0
+    movsx             r2, word [r1]
+    mov               r1, r2m
+    DC_ADD_INIT       r2, r1
+    DC_ADD_OP       mova, r0, r1, r2
+    lea               r0, [r0+r1*4]
+    DC_ADD_OP       mova, r0, r1, r2
+    RET
+%endif
+
+
+INIT_XMM sse2
+; void ff_hevc_idct16_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_idct16_dc_add_8, 3, 4, 0
+    movsx             r3, word [r1]
+    DC_ADD_INIT       r3, r2
+    DC_ADD_OP       mova, r0, r2, r3
+    lea               r0, [r0+r2*4]
+    DC_ADD_OP       mova, r0, r2, r3
+    lea               r0, [r0+r2*4]
+    DC_ADD_OP       mova, r0, r2, r3
+    lea               r0, [r0+r2*4]
+    DC_ADD_OP       mova, r0, r2, r3
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_idct4_dc_add_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro IDCT_DC_ADD_OP_10 3
+    pxor              m5, m5
+%if avx_enabled
+    paddw             m1, m0, [%1+0   ]
+    paddw             m2, m0, [%1+%2  ]
+    paddw             m3, m0, [%1+%2*2]
+    paddw             m4, m0, [%1+%3  ]
+%else
+    mova              m1, [%1+0   ]
+    mova              m2, [%1+%2  ]
+    mova              m3, [%1+%2*2]
+    mova              m4, [%1+%3  ]
+    paddw             m1, m0
+    paddw             m2, m0
+    paddw             m3, m0
+    paddw             m4, m0
+%endif
+    CLIPW             m1, m5, m6
+    CLIPW             m2, m5, m6
+    CLIPW             m3, m5, m6
+    CLIPW             m4, m5, m6
+    mova       [%1+0   ], m1
+    mova       [%1+%2  ], m2
+    mova       [%1+%2*2], m3
+    mova       [%1+%3  ], m4
+%endmacro
+
+INIT_MMX mmxext
+cglobal hevc_idct4_dc_add_10,3,3
+    mov              r1w, [r1]
+    add              r1w, ((1 << 4) + 1)
+    sar              r1w, 5
+    movd              m0, r1d
+    lea               r1, [r2*3]
+    SPLATW            m0, m0, 0
+    mova              m6, [max_pixels_10]
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_idct8_dc_add_10_{sse2,avx}(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro IDCT8_DC_ADD 0
+cglobal hevc_idct8_dc_add_10,3,4,7
+    mov              r1w, [r1]
+    add              r1w, ((1 << 4) + 1)
+    sar              r1w, 5
+    movd              m0, r1d
+    lea               r1, [r2*3]
+    SPLATW            m0, m0, 0
+    mova              m6, [max_pixels_10]
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    lea               r0, [r0+r2*4]
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    RET
+%endmacro
+
+INIT_XMM sse2
+IDCT8_DC_ADD
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT8_DC_ADD
+%endif
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index c5a64c708a..029492eca3 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -28,6 +28,10 @@
 #include <stddef.h>
 #include <stdint.h>
 
+
+#define idct_dc_proto(size, bitd, opt) \
+                void ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+
 #define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
 dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
 dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
@@ -119,5 +123,26 @@ QPEL_PROTOTYPES(qpel_hv, 10, sse4);
 WEIGHTING_PROTOTYPES(8, sse4);
 WEIGHTING_PROTOTYPES(10, sse4);
 
+///////////////////////////////////////////////////////////////////////////////
+// IDCT
+///////////////////////////////////////////////////////////////////////////////
+
+
+idct_dc_proto(4, 8,mmxext);
+idct_dc_proto(8, 8,mmxext);
+idct_dc_proto(16,8,  sse2);
+idct_dc_proto(32,8,  sse2);
+
+
+idct_dc_proto(4, 10,mmxext);
+idct_dc_proto(8, 10,  sse2);
+idct_dc_proto(16,10,  sse2);
+idct_dc_proto(32,10,  sse2);
+idct_dc_proto(8, 10,   avx);
+idct_dc_proto(16,10,   avx);
+idct_dc_proto(32,10,   avx);
+
+
+
 
 #endif // AVCODEC_X86_HEVCDSP_H
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 30902be3bf..58a0891e5b 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -49,6 +49,48 @@ LFC_FUNCS(uint8_t,  10)
 LFL_FUNCS(uint8_t,   8)
 LFL_FUNCS(uint8_t,  10)
 
+#if HAVE_SSE2_EXTERNAL
+void ff_hevc_idct32_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    ff_hevc_idct16_dc_add_8_sse2(dst, coeffs, stride);
+    ff_hevc_idct16_dc_add_8_sse2(dst+16, coeffs, stride);
+    ff_hevc_idct16_dc_add_8_sse2(dst+16*stride, coeffs, stride);
+    ff_hevc_idct16_dc_add_8_sse2(dst+16*stride+16, coeffs, stride);
+}
+
+void ff_hevc_idct16_dc_add_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    ff_hevc_idct8_dc_add_10_sse2(dst, coeffs, stride);
+    ff_hevc_idct8_dc_add_10_sse2(dst+16, coeffs, stride);
+    ff_hevc_idct8_dc_add_10_sse2(dst+8*stride, coeffs, stride);
+    ff_hevc_idct8_dc_add_10_sse2(dst+8*stride+16, coeffs, stride);
+}
+
+void ff_hevc_idct32_dc_add_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    ff_hevc_idct16_dc_add_10_sse2(dst, coeffs, stride);
+    ff_hevc_idct16_dc_add_10_sse2(dst+32, coeffs, stride);
+    ff_hevc_idct16_dc_add_10_sse2(dst+16*stride, coeffs, stride);
+    ff_hevc_idct16_dc_add_10_sse2(dst+16*stride+32, coeffs, stride);
+}
+#endif //HAVE_SSE2_EXTERNAL
+#if HAVE_AVX_EXTERNAL
+void ff_hevc_idct16_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    ff_hevc_idct8_dc_add_10_avx(dst, coeffs, stride);
+    ff_hevc_idct8_dc_add_10_avx(dst+16, coeffs, stride);
+    ff_hevc_idct8_dc_add_10_avx(dst+8*stride, coeffs, stride);
+    ff_hevc_idct8_dc_add_10_avx(dst+8*stride+16, coeffs, stride);
+}
+
+void ff_hevc_idct32_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    ff_hevc_idct16_dc_add_10_avx(dst, coeffs, stride);
+    ff_hevc_idct16_dc_add_10_avx(dst+32, coeffs, stride);
+    ff_hevc_idct16_dc_add_10_avx(dst+16*stride, coeffs, stride);
+    ff_hevc_idct16_dc_add_10_avx(dst+16*stride+32, coeffs, stride);
+}
+#endif //HAVE_AVX_EXTERNAL
 
 #define mc_rep_func(name, bitd, step, W, opt) \
 void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, ptrdiff_t dststride,                            \
@@ -368,9 +410,17 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
     int mm_flags = av_get_cpu_flags();
 
     if (bit_depth == 8) {
+        if (EXTERNAL_MMXEXT(mm_flags)) {
+                c->transform_dc_add[0]    =  ff_hevc_idct4_dc_add_8_mmxext;
+                c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_8_mmxext;
+
+        }
         if (EXTERNAL_SSE2(mm_flags)) {
                     c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
                     c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
+
+                    c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_8_sse2;
+                    c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_8_sse2;
         }
         if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
                     c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
@@ -387,13 +437,21 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
             QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
             QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
-
         }
     } else if (bit_depth == 10) {
+        if (EXTERNAL_MMXEXT(mm_flags)) {
+                c->transform_dc_add[0]    =  ff_hevc_idct4_dc_add_10_mmxext;
+
+        }
         if (EXTERNAL_SSE2(mm_flags)) {
                     c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
                     c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
-        }
+
+
+                    c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_10_sse2;
+                    c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_sse2;
+                    c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_sse2;
+                }
         if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
                     c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
                     c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
@@ -410,5 +468,11 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
             QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
         }
+        if (EXTERNAL_AVX(mm_flags)) {
+            c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_10_avx;
+            c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_avx;
+            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_avx;
+        }
+
     }
 }