From 5c56758843eb5ea8fc39177585a57606a34125bc Mon Sep 17 00:00:00 2001
From: James Darnley
Date: Wed, 15 Feb 2017 14:36:20 +0100
Subject: [PATCH] avcodec/h264: add avx 8-bit chroma v deblock/loop filter

~1.24x faster (101 vs. 81 cycles) compared with mmxext function
---
 libavcodec/x86/h264_deblock.asm | 38 +++++++++++++++++++++++++++++++++
 libavcodec/x86/h264dsp_init.c   |  2 ++
 2 files changed, 40 insertions(+)

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 93caa67c85..2e84ca3097 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -1059,6 +1059,44 @@ ff_chroma_intra_body_mmxext:
     paddb  m2, m6
     ret
 
+%macro CHROMA_INTER_BODY_XMM 1 ; %1 = number of punpcklbw passes used to widen the tc0 bytes
+    LOAD_MASK alpha_d, beta_d ; helper defined outside this hunk; presumably leaves the filter mask in m7 - confirm
+    movd m6, [tc0_q] ; load 4 bytes from the tc0 pointer into the low dword of m6
+    %rep %1
+        punpcklbw m6, m6 ; interleave m6 with itself: every low byte is duplicated into 2 adjacent bytes
+    %endrep
+    pand m7, m6 ; m7 &= widened tc0 bytes
+    DEBLOCK_P0_Q0 ; helper defined outside this hunk; the caller stores m1/m2 afterwards, so presumably it updates those - confirm
+%endmacro
+
+%macro CHROMA_V_START_XMM 1 ; %1 = register that receives pix - 2*stride
+    movsxdifnidn stride_q, stride_d ; x86inc helper: movsxd only when the two operand names differ
+    dec alpha_d ; alpha - 1; presumably the form LOAD_MASK expects - confirm
+    dec beta_d ; beta - 1; same assumption as for alpha
+    mov %1, pix_q
+    sub %1, stride_q
+    sub %1, stride_q ; %1 = pix - 2*stride
+%endmacro
+
+%macro DEBLOCK_CHROMA_XMM 1 ; %1 = instruction-set name handed to INIT_XMM (instantiated below with avx)
+
+INIT_XMM %1
+
+cglobal deblock_v_chroma_8, 5, 6, 8, pix_, stride_, alpha_, beta_, tc0_ ; x86inc: 5 args, 6 GPRs, 8 vector regs, then the argument names
+    CHROMA_V_START_XMM r5 ; r5 = pix - 2*stride
+    movq m0, [r5] ; 8 bytes from row pix - 2*stride
+    movq m1, [r5 + stride_q] ; 8 bytes from row pix - stride
+    movq m2, [pix_q] ; 8 bytes from row pix
+    movq m3, [pix_q + stride_q] ; 8 bytes from row pix + stride
+    CHROMA_INTER_BODY_XMM 1 ; one punpcklbw pass: each of the 4 tc0 bytes is paired with 2 adjacent pixel bytes
+    movq [r5 + stride_q], m1 ; write 8 bytes back to row pix - stride
+    movq [pix_q], m2 ; write 8 bytes back to row pix; the two outer rows are only read
+RET
+
+%endmacro ; DEBLOCK_CHROMA_XMM
+
+DEBLOCK_CHROMA_XMM avx
+
 ;-----------------------------------------------------------------------------
 ; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
 ;                                   int8_t ref[2][40], int16_t mv[2][40][2],
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 10f19401ef..6794aa5957 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -317,6 +317,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 #if ARCH_X86_64
             c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_avx;
 #endif
+
+            c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_avx; /* AVX 8-bit chroma v deblock from h264_deblock.asm */
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {