From 942e22c651166e8aa67bfffa7a431970200d3203 Mon Sep 17 00:00:00 2001 From: plepere Date: Mon, 16 Jun 2014 14:47:21 +0200 Subject: [PATCH] avcodec/x86/hevc: add avx2 dc idct Signed-off-by: Michael Niedermayer --- libavcodec/x86/hevc_idct.asm | 51 ++++++++++++++++++++++++++++++++--- libavcodec/x86/hevcdsp.h | 6 +++++ libavcodec/x86/hevcdsp_init.c | 18 +++++++++++++ 3 files changed, 72 insertions(+), 3 deletions(-) diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm index 6963dc78c5..31532ae907 100644 --- a/libavcodec/x86/hevc_idct.asm +++ b/libavcodec/x86/hevc_idct.asm @@ -20,12 +20,12 @@ ; */ %include "libavutil/x86/x86util.asm" -SECTION_RODATA -max_pixels_10: times 8 dw ((1 << 10)-1) +SECTION_RODATA 32 +max_pixels_10: times 16 dw ((1 << 10)-1) dc_add_10: times 4 dd ((1 << 14-10) + 1) -SECTION .text +SECTION_TEXT 32 ;the idct_dc_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file @@ -41,6 +41,18 @@ SECTION .text packuswb m1, m1 %endmacro +%macro DC_ADD_INIT_AVX2 2 + add %1w, ((1 << 14-8) + 1) + sar %1w, (15-8) + movd xm0, %1d + vpbroadcastw m0, xm0 ;SPLATW + lea %1, [%2*3] + pxor m1, m1 + psubw m1, m0 + packuswb m0, m0 + packuswb m1, m1 +%endmacro + %macro DC_ADD_OP 4 %1 m2, [%2 ] %1 m3, [%2+%3 ] @@ -112,6 +124,19 @@ cglobal hevc_idct16_dc_add_8, 3, 4, 0 DC_ADD_OP mova, r0, r2, r3 RET +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +; void ff_hevc_idct32_dc_add_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) +cglobal hevc_idct32_dc_add_8, 3, 4, 6 + movsx r3, word [r1] + DC_ADD_INIT_AVX2 r3, r2 + DC_ADD_OP mova, r0, r2, r3 + %rep 7 + lea r0, [r0+r2*4] + DC_ADD_OP mova, r0, r2, r3 +%endrep + RET +%endif ;HAVE_AVX2_EXTERNAL ;----------------------------------------------------------------------------- ; void ff_hevc_idct_dc_add_10(pixel *dst, int16_t *block, int stride) ;----------------------------------------------------------------------------- @@ -178,3 +203,23 @@ IDCT8_DC_ADD INIT_XMM avx 
IDCT8_DC_ADD %endif + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal hevc_idct16_dc_add_10,3,4,7 + mov r1w, [r1] + add r1w, ((1 << 4) + 1) + sar r1w, 5 + movd xm0, r1d + lea r1, [r2*3] + vpbroadcastw m0, xm0 ;SPLATW + mova m6, [max_pixels_10] + IDCT_DC_ADD_OP_10 r0, r2, r1 + lea r0, [r0+r2*4] + IDCT_DC_ADD_OP_10 r0, r2, r1 + lea r0, [r0+r2*4] + IDCT_DC_ADD_OP_10 r0, r2, r1 + lea r0, [r0+r2*4] + IDCT_DC_ADD_OP_10 r0, r2, r1 + RET +%endif ;HAVE_AVX2_EXTERNAL diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h index 029492eca3..661a860bd8 100644 --- a/libavcodec/x86/hevcdsp.h +++ b/libavcodec/x86/hevcdsp.h @@ -133,6 +133,8 @@ idct_dc_proto(8, 8,mmxext); idct_dc_proto(16,8, sse2); idct_dc_proto(32,8, sse2); +idct_dc_proto(32,8, avx2); + idct_dc_proto(4, 10,mmxext); idct_dc_proto(8, 10, sse2); @@ -142,6 +144,10 @@ idct_dc_proto(8, 10, avx); idct_dc_proto(16,10, avx); idct_dc_proto(32,10, avx); +idct_dc_proto(16,10, avx2); +idct_dc_proto(32,10, avx2); + + diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c index 58a0891e5b..cad236ddad 100644 --- a/libavcodec/x86/hevcdsp_init.c +++ b/libavcodec/x86/hevcdsp_init.c @@ -92,6 +92,17 @@ void ff_hevc_idct32_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t strid } #endif //HAVE_AVX_EXTERNAL +#if HAVE_AVX2_EXTERNAL + +void ff_hevc_idct32_dc_add_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) +{ + ff_hevc_idct16_dc_add_10_avx2(dst, coeffs, stride); + ff_hevc_idct16_dc_add_10_avx2(dst+32, coeffs, stride); + ff_hevc_idct16_dc_add_10_avx2(dst+16*stride, coeffs, stride); + ff_hevc_idct16_dc_add_10_avx2(dst+16*stride+32, coeffs, stride); +} +#endif //HAVE_AVX2_EXTERNAL + #define mc_rep_func(name, bitd, step, W, opt) \ void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, ptrdiff_t dststride, \ uint8_t *_src, ptrdiff_t _srcstride, int height, \ @@ -438,6 +449,9 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth) QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, 
sse4); QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4); } + if (EXTERNAL_AVX2(mm_flags)) { + c->transform_dc_add[3] = ff_hevc_idct32_dc_add_8_avx2; + } } else if (bit_depth == 10) { if (EXTERNAL_MMXEXT(mm_flags)) { c->transform_dc_add[0] = ff_hevc_idct4_dc_add_10_mmxext; @@ -473,6 +487,10 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth) c->transform_dc_add[2] = ff_hevc_idct16_dc_add_10_avx; c->transform_dc_add[3] = ff_hevc_idct32_dc_add_10_avx; } + if (EXTERNAL_AVX2(mm_flags)) { + c->transform_dc_add[2] = ff_hevc_idct16_dc_add_10_avx2; + c->transform_dc_add[3] = ff_hevc_idct32_dc_add_10_avx2; + } } }