diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm index 2edaf9aef1..33b437c257 100644 --- a/libavcodec/x86/hevc_idct.asm +++ b/libavcodec/x86/hevc_idct.asm @@ -1,37 +1,38 @@ -; /* -; * SIMD optimized idct functions for HEVC decoding -; * Copyright (c) 2014 Pierre-Edouard LEPERE -; * Copyright (c) 2014 James Almer -; * -; * This file is part of FFmpeg. -; * -; * FFmpeg is free software; you can redistribute it and/or -; * modify it under the terms of the GNU Lesser General Public -; * License as published by the Free Software Foundation; either -; * version 2.1 of the License, or (at your option) any later version. -; * -; * FFmpeg is distributed in the hope that it will be useful, -; * but WITHOUT ANY WARRANTY; without even the implied warranty of -; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -; * Lesser General Public License for more details. -; * -; * You should have received a copy of the GNU Lesser General Public -; * License along with FFmpeg; if not, write to the Free Software -; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -; */ +;******************************************************************************* +;* SIMD-optimized IDCT functions for HEVC decoding +;* Copyright (c) 2014 Pierre-Edouard LEPERE +;* Copyright (c) 2014 James Almer +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + %include "libavutil/x86/x86util.asm" SECTION .text -; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs) +; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs) ; %1 = HxW ; %2 = number of loops ; %3 = bitdepth %macro IDCT_DC 3 -cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp - movsx tmpq, word [coeffq] - add tmpw, ((1 << 14-%3) + 1) - sar tmpw, (15-%3) +cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp + movsx tmpd, word [coeffq] + add tmpd, (1 << (14 - %3)) + 1 + sar tmpd, (15 - %3) movd xm0, tmpd SPLATW m0, xm0 DEFINE_ARGS coeff, cnt @@ -41,11 +42,11 @@ cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp mova [coeffq+mmsize*1], m0 mova [coeffq+mmsize*2], m0 mova [coeffq+mmsize*3], m0 - mova [coeffq+mmsize*4], m0 - mova [coeffq+mmsize*5], m0 - mova [coeffq+mmsize*6], m0 - mova [coeffq+mmsize*7], m0 add coeffq, mmsize*8 + mova [coeffq+mmsize*-4], m0 + mova [coeffq+mmsize*-3], m0 + mova [coeffq+mmsize*-2], m0 + mova [coeffq+mmsize*-1], m0 dec cntd jg .loop RET @@ -54,10 +55,10 @@ cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp ; %1 = HxW ; %2 = bitdepth %macro IDCT_DC_NL 2 ; No loop -cglobal hevc_idct%1x%1_dc_%2, 1, 2, 1, coeff, tmp - movsx tmpq, word [coeffq] - add tmpw, ((1 << 14-%2) + 1) - sar tmpw, (15-%2) +cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp + movsx tmpd, word [coeffq] + add tmpd, (1 << (14 - %2)) + 1 + sar tmpd, (15 - %2) movd m0, tmpd SPLATW m0, xm0 mova [coeffq+mmsize*0], m0 diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h index 3cfdc272cf..63a148e69a 100644 --- a/libavcodec/x86/hevcdsp.h +++ b/libavcodec/x86/hevcdsp.h @@ -29,9 +29,6 @@ #include -#define idct_dc_proto(size, bitd, opt) \ - void 
ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) - #define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \ dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \ dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \ diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c index da73d76638..d16e59d9e7 100644 --- a/libavcodec/x86/hevcdsp_init.c +++ b/libavcodec/x86/hevcdsp_init.c @@ -59,9 +59,9 @@ LFL_FUNCS(uint8_t, 10, avx) LFL_FUNCS(uint8_t, 12, avx) #define IDCT_FUNCS(W, opt) \ -void ff_hevc_idct##W##_dc_8_##opt(int16_t *coeffs); \ -void ff_hevc_idct##W##_dc_10_##opt(int16_t *coeffs); \ -void ff_hevc_idct##W##_dc_12_##opt(int16_t *coeffs) +void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \ +void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \ +void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs) IDCT_FUNCS(4x4, mmxext); IDCT_FUNCS(8x8, mmxext); @@ -698,8 +698,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) if (bit_depth == 8) { if (EXTERNAL_MMXEXT(cpu_flags)) { - c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext; - c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext; + c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext; + c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_mmxext; c->add_residual[0] = ff_hevc_add_residual4_8_mmxext; } if (EXTERNAL_SSE2(cpu_flags)) { @@ -712,9 +712,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) } SAO_BAND_INIT(8, sse2); - c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2; - c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2; - c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2; + c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2; + c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2; + c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2; c->add_residual[1] = ff_hevc_add_residual8_8_sse2; c->add_residual[2] = ff_hevc_add_residual16_8_sse2; @@ -757,8 +757,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) 
c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2; } if (EXTERNAL_AVX2_FAST(cpu_flags)) { - c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2; - c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2; + c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2; + c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2; if (ARCH_X86_64) { c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2; c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2; @@ -855,8 +855,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) } else if (bit_depth == 10) { if (EXTERNAL_MMXEXT(cpu_flags)) { c->add_residual[0] = ff_hevc_add_residual4_10_mmxext; - c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext; - c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext; + c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext; + c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext; } if (EXTERNAL_SSE2(cpu_flags)) { c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2; @@ -868,9 +868,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) SAO_BAND_INIT(10, sse2); SAO_EDGE_INIT(10, sse2); - c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2; - c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2; - c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2; + c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2; + c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2; + c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2; c->add_residual[1] = ff_hevc_add_residual8_10_sse2; c->add_residual[2] = ff_hevc_add_residual16_10_sse2; @@ -904,8 +904,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2; } if (EXTERNAL_AVX2_FAST(cpu_flags)) { - c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2; - c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2; + c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2; + c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2; if (ARCH_X86_64) { c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2; c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2; 
@@ -1059,8 +1059,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) } } else if (bit_depth == 12) { if (EXTERNAL_MMXEXT(cpu_flags)) { - c->idct_dc[0] = ff_hevc_idct4x4_dc_12_mmxext; - c->idct_dc[1] = ff_hevc_idct8x8_dc_12_mmxext; + c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext; + c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_mmxext; } if (EXTERNAL_SSE2(cpu_flags)) { c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2; @@ -1072,9 +1072,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) SAO_BAND_INIT(12, sse2); SAO_EDGE_INIT(12, sse2); - c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2; - c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2; - c->idct_dc[3] = ff_hevc_idct32x32_dc_12_sse2; + c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2; + c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2; + c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2; } if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) { c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3; @@ -1104,8 +1104,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2; } if (EXTERNAL_AVX2_FAST(cpu_flags)) { - c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2; - c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2; + c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2; + c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2; SAO_BAND_INIT(12, avx2); SAO_EDGE_INIT(12, avx2);