mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-13 21:28:01 +02:00
Merge commit 'fca3c3b61952aacc45e9ca54d86a762946c21942'
* commit 'fca3c3b61952aacc45e9ca54d86a762946c21942':
  hevc: Add AVX2 DC IDCT

Mostly a no-op, as we already have that code.

In the ASM, the code is merged, with the exception of SECTION, which is kept uppercase for consistency with the rest of the codebase. Also in the ASM, the prototype comment is fixed to honor the '_' added by the original commit.

idct_dc_proto() is dropped, as it is no longer used here.

Merged-by: Clément Bœsch <cboesch@gopro.com>
This commit is contained in:
commit
78d16eb452
@ -1,37 +1,38 @@
|
|||||||
; /*
|
;*******************************************************************************
|
||||||
; * SIMD optimized idct functions for HEVC decoding
|
;* SIMD-optimized IDCT functions for HEVC decoding
|
||||||
; * Copyright (c) 2014 Pierre-Edouard LEPERE
|
;* Copyright (c) 2014 Pierre-Edouard LEPERE
|
||||||
; * Copyright (c) 2014 James Almer
|
;* Copyright (c) 2014 James Almer
|
||||||
; *
|
;*
|
||||||
; * This file is part of FFmpeg.
|
;* This file is part of FFmpeg.
|
||||||
; *
|
;*
|
||||||
; * FFmpeg is free software; you can redistribute it and/or
|
;* FFmpeg is free software; you can redistribute it and/or
|
||||||
; * modify it under the terms of the GNU Lesser General Public
|
;* modify it under the terms of the GNU Lesser General Public
|
||||||
; * License as published by the Free Software Foundation; either
|
;* License as published by the Free Software Foundation; either
|
||||||
; * version 2.1 of the License, or (at your option) any later version.
|
;* version 2.1 of the License, or (at your option) any later version.
|
||||||
; *
|
;*
|
||||||
; * FFmpeg is distributed in the hope that it will be useful,
|
;* FFmpeg is distributed in the hope that it will be useful,
|
||||||
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
; * Lesser General Public License for more details.
|
;* Lesser General Public License for more details.
|
||||||
; *
|
;*
|
||||||
; * You should have received a copy of the GNU Lesser General Public
|
;* You should have received a copy of the GNU Lesser General Public
|
||||||
; * License along with FFmpeg; if not, write to the Free Software
|
;* License along with FFmpeg; if not, write to the Free Software
|
||||||
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
; */
|
;******************************************************************************
|
||||||
|
|
||||||
%include "libavutil/x86/x86util.asm"
|
%include "libavutil/x86/x86util.asm"
|
||||||
|
|
||||||
SECTION .text
|
SECTION .text
|
||||||
|
|
||||||
; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs)
|
; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
|
||||||
; %1 = HxW
|
; %1 = HxW
|
||||||
; %2 = number of loops
|
; %2 = number of loops
|
||||||
; %3 = bitdepth
|
; %3 = bitdepth
|
||||||
%macro IDCT_DC 3
|
%macro IDCT_DC 3
|
||||||
cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
|
cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp
|
||||||
movsx tmpq, word [coeffq]
|
movsx tmpd, word [coeffq]
|
||||||
add tmpw, ((1 << 14-%3) + 1)
|
add tmpd, (1 << (14 - %3)) + 1
|
||||||
sar tmpw, (15-%3)
|
sar tmpd, (15 - %3)
|
||||||
movd xm0, tmpd
|
movd xm0, tmpd
|
||||||
SPLATW m0, xm0
|
SPLATW m0, xm0
|
||||||
DEFINE_ARGS coeff, cnt
|
DEFINE_ARGS coeff, cnt
|
||||||
@ -41,11 +42,11 @@ cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
|
|||||||
mova [coeffq+mmsize*1], m0
|
mova [coeffq+mmsize*1], m0
|
||||||
mova [coeffq+mmsize*2], m0
|
mova [coeffq+mmsize*2], m0
|
||||||
mova [coeffq+mmsize*3], m0
|
mova [coeffq+mmsize*3], m0
|
||||||
mova [coeffq+mmsize*4], m0
|
|
||||||
mova [coeffq+mmsize*5], m0
|
|
||||||
mova [coeffq+mmsize*6], m0
|
|
||||||
mova [coeffq+mmsize*7], m0
|
|
||||||
add coeffq, mmsize*8
|
add coeffq, mmsize*8
|
||||||
|
mova [coeffq+mmsize*-4], m0
|
||||||
|
mova [coeffq+mmsize*-3], m0
|
||||||
|
mova [coeffq+mmsize*-2], m0
|
||||||
|
mova [coeffq+mmsize*-1], m0
|
||||||
dec cntd
|
dec cntd
|
||||||
jg .loop
|
jg .loop
|
||||||
RET
|
RET
|
||||||
@ -54,10 +55,10 @@ cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
|
|||||||
; %1 = HxW
|
; %1 = HxW
|
||||||
; %2 = bitdepth
|
; %2 = bitdepth
|
||||||
%macro IDCT_DC_NL 2 ; No loop
|
%macro IDCT_DC_NL 2 ; No loop
|
||||||
cglobal hevc_idct%1x%1_dc_%2, 1, 2, 1, coeff, tmp
|
cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
|
||||||
movsx tmpq, word [coeffq]
|
movsx tmpd, word [coeffq]
|
||||||
add tmpw, ((1 << 14-%2) + 1)
|
add tmpd, (1 << (14 - %2)) + 1
|
||||||
sar tmpw, (15-%2)
|
sar tmpd, (15 - %2)
|
||||||
movd m0, tmpd
|
movd m0, tmpd
|
||||||
SPLATW m0, xm0
|
SPLATW m0, xm0
|
||||||
mova [coeffq+mmsize*0], m0
|
mova [coeffq+mmsize*0], m0
|
||||||
|
@ -29,9 +29,6 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
|
|
||||||
#define idct_dc_proto(size, bitd, opt) \
|
|
||||||
void ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
|
|
||||||
|
|
||||||
#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
|
#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
|
||||||
dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
|
dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
|
||||||
dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
|
dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
|
||||||
|
@ -59,9 +59,9 @@ LFL_FUNCS(uint8_t, 10, avx)
|
|||||||
LFL_FUNCS(uint8_t, 12, avx)
|
LFL_FUNCS(uint8_t, 12, avx)
|
||||||
|
|
||||||
#define IDCT_FUNCS(W, opt) \
|
#define IDCT_FUNCS(W, opt) \
|
||||||
void ff_hevc_idct##W##_dc_8_##opt(int16_t *coeffs); \
|
void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
|
||||||
void ff_hevc_idct##W##_dc_10_##opt(int16_t *coeffs); \
|
void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
|
||||||
void ff_hevc_idct##W##_dc_12_##opt(int16_t *coeffs)
|
void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)
|
||||||
|
|
||||||
IDCT_FUNCS(4x4, mmxext);
|
IDCT_FUNCS(4x4, mmxext);
|
||||||
IDCT_FUNCS(8x8, mmxext);
|
IDCT_FUNCS(8x8, mmxext);
|
||||||
@ -698,8 +698,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
|||||||
|
|
||||||
if (bit_depth == 8) {
|
if (bit_depth == 8) {
|
||||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||||
c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
|
c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
|
||||||
c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
|
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_mmxext;
|
||||||
c->add_residual[0] = ff_hevc_add_residual4_8_mmxext;
|
c->add_residual[0] = ff_hevc_add_residual4_8_mmxext;
|
||||||
}
|
}
|
||||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||||
@ -712,9 +712,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
|||||||
}
|
}
|
||||||
SAO_BAND_INIT(8, sse2);
|
SAO_BAND_INIT(8, sse2);
|
||||||
|
|
||||||
c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
|
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
|
||||||
c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
|
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
|
||||||
c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
|
c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
|
||||||
|
|
||||||
c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
|
c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
|
||||||
c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
|
c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
|
||||||
@ -757,8 +757,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
|||||||
c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
|
c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
|
||||||
}
|
}
|
||||||
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
||||||
c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
|
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
|
||||||
c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
|
c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
|
||||||
if (ARCH_X86_64) {
|
if (ARCH_X86_64) {
|
||||||
c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
|
c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
|
||||||
c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
|
c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
|
||||||
@ -855,8 +855,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
|||||||
} else if (bit_depth == 10) {
|
} else if (bit_depth == 10) {
|
||||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||||
c->add_residual[0] = ff_hevc_add_residual4_10_mmxext;
|
c->add_residual[0] = ff_hevc_add_residual4_10_mmxext;
|
||||||
c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
|
c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
|
||||||
c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
|
c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
|
||||||
}
|
}
|
||||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||||
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
|
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
|
||||||
@ -868,9 +868,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
|||||||
SAO_BAND_INIT(10, sse2);
|
SAO_BAND_INIT(10, sse2);
|
||||||
SAO_EDGE_INIT(10, sse2);
|
SAO_EDGE_INIT(10, sse2);
|
||||||
|
|
||||||
c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
|
c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
|
||||||
c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
|
c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
|
||||||
c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2;
|
c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;
|
||||||
|
|
||||||
c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
|
c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
|
||||||
c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
|
c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
|
||||||
@ -904,8 +904,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
|||||||
c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
|
c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
|
||||||
}
|
}
|
||||||
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
||||||
c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
|
c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
|
||||||
c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
|
c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
|
||||||
if (ARCH_X86_64) {
|
if (ARCH_X86_64) {
|
||||||
c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
|
c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
|
||||||
c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
|
c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
|
||||||
@ -1059,8 +1059,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
|||||||
}
|
}
|
||||||
} else if (bit_depth == 12) {
|
} else if (bit_depth == 12) {
|
||||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||||
c->idct_dc[0] = ff_hevc_idct4x4_dc_12_mmxext;
|
c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
|
||||||
c->idct_dc[1] = ff_hevc_idct8x8_dc_12_mmxext;
|
c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_mmxext;
|
||||||
}
|
}
|
||||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||||
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
|
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
|
||||||
@ -1072,9 +1072,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
|||||||
SAO_BAND_INIT(12, sse2);
|
SAO_BAND_INIT(12, sse2);
|
||||||
SAO_EDGE_INIT(12, sse2);
|
SAO_EDGE_INIT(12, sse2);
|
||||||
|
|
||||||
c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2;
|
c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
|
||||||
c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2;
|
c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
|
||||||
c->idct_dc[3] = ff_hevc_idct32x32_dc_12_sse2;
|
c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
|
||||||
}
|
}
|
||||||
if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
|
if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
|
||||||
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
|
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
|
||||||
@ -1104,8 +1104,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
|||||||
c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
|
c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
|
||||||
}
|
}
|
||||||
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
||||||
c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2;
|
c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
|
||||||
c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2;
|
c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;
|
||||||
|
|
||||||
SAO_BAND_INIT(12, avx2);
|
SAO_BAND_INIT(12, avx2);
|
||||||
SAO_EDGE_INIT(12, avx2);
|
SAO_EDGE_INIT(12, avx2);
|
||||||
|
Loading…
Reference in New Issue
Block a user