; FFmpeg/libavcodec/x86/hevc_idct.asm

;*******************************************************************************
;* SIMD-optimized IDCT functions for HEVC decoding
;* Copyright (c) 2014 Pierre-Edouard LEPERE
;* Copyright (c) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; void ff_hevc_idct_HxW_dc_{8,10,12}_<opt>(int16_t *coeffs)
; Looping DC-only IDCT: overwrite every coefficient of a square block with the
; rounded, down-scaled DC coefficient read from coeffs[0].
; %1 = block side length (the function handles a %1 x %1 block)
; %2 = loop iterations; each iteration stores 8*mmsize bytes
; %3 = bitdepth
%macro IDCT_DC 3
cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp
    ; dc = (coeffs[0] + (1 << (14 - bitdepth)) + 1) >> (15 - bitdepth)
    movsx             tmpd, word [coeffq]
    add               tmpd, (1 << (14 - %3)) + 1
    sar               tmpd, (15 - %3)
    ; broadcast the 16-bit result to every word lane of m0
    movd               xm0, tmpd
    SPLATW              m0, xm0
    ; the scalar temporary is dead from here on: rename its register to cnt
    DEFINE_ARGS coeff, cnt
    mov               cntd, %2
.loop:
    ; %%i is the index (0..7) of the mmsize-wide store within one iteration
%assign %%i 0
%rep 4
    mova [coeffq+mmsize*%%i], m0
    %assign %%i %%i+1
%endrep
    ; advance the pointer halfway through, so the remaining four stores use
    ; negative offsets (presumably to keep all displacements small)
    add  coeffq, mmsize*8
%rep 4
    mova [coeffq+mmsize*(%%i-8)], m0
    %assign %%i %%i+1
%endrep
    dec  cntd
    jg  .loop
    RET
%endmacro

; Straight-line (no loop) DC-only IDCT: overwrite every coefficient of a square
; block with the rounded, down-scaled DC coefficient read from coeffs[0].
; %1 = block side length (the function handles a %1 x %1 block)
; %2 = bitdepth
; The number of stores is derived from the block size (%1*%1 int16_t
; coefficients) and the current register width, so the block is filled exactly
; for any instruction set where mmsize divides the block size in bytes.
%macro IDCT_DC_NL 2 ; No loop
cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
    ; dc = (coeffs[0] + (1 << (14 - bitdepth)) + 1) >> (15 - bitdepth)
    movsx             tmpd, word [coeffq]
    add               tmpd, (1 << (14 - %2)) + 1
    sar               tmpd, (15 - %2)
    ; movd goes through xm0 (same as IDCT_DC) so the macro also assembles when
    ; m0 is a ymm register; SPLATW then broadcasts to every word lane of m0
    movd               xm0, tmpd
    SPLATW              m0, xm0
    ; (%1*%1*2) bytes per block / mmsize bytes per store:
    ; 4 stores for 4x4 with MMX, 8 stores for 8x8 with SSE2
%assign %%i 0
%rep (%1*%1*2)/mmsize
    mova [coeffq+mmsize*%%i], m0
    %assign %%i %%i+1
%endrep
    RET
%endmacro

; Emit every DC-only IDCT variant for one bitdepth:
;   mmxext: 4x4 (no loop), 8x8
;   sse2:   8x8 (no loop), 16x16, 32x32
;   avx2:   16x16, 32x32 (only when HAVE_AVX2_EXTERNAL is set)
; The IDCT_DC loop counts are (block size in bytes) / (8*mmsize).
; %1 = bitdepth
%macro IDCT_DC_FUNCS 1
INIT_MMX mmxext
IDCT_DC_NL  4,      %1
IDCT_DC     8,  2,  %1

INIT_XMM sse2
IDCT_DC_NL  8,      %1
IDCT_DC    16,  4,  %1
IDCT_DC    32, 16,  %1

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
IDCT_DC    16,  2,  %1
IDCT_DC    32,  8,  %1
%endif ;HAVE_AVX2_EXTERNAL
%endmacro

IDCT_DC_FUNCS  8 ; 8-bit
IDCT_DC_FUNCS 10 ; 10-bit
IDCT_DC_FUNCS 12 ; 12-bit
hevc: Add AVX2 DC IDCT Originally written by Pierre Edouard Lepere <pierre-edouard.lepere@insa-rennes.fr>. Integrated to Libav by Josh de Kock <josh@itanimul.li>. Signed-off-by: Alexandra Hájková <alexandra@khirnov.net> 2016-06-29 11:56:42 +02:00			`;*******************************************************************************`
			`;* SIMD-optimized IDCT functions for HEVC decoding`
			`;* Copyright (c) 2014 Pierre-Edouard LEPERE`
			`;* Copyright (c) 2014 James Almer`
			`;*`
Merge commit 'fca3c3b61952aacc45e9ca54d86a762946c21942' * commit 'fca3c3b61952aacc45e9ca54d86a762946c21942': hevc: Add AVX2 DC IDCT Mostly noop as we already have that code. In the ASM, code is merged with the exception of SECTION which is kept uppercase for consistency with the rest of the codebase. Still in the ASM, the prototype comment is fixed to honor the '_' added from the original commit. idct_dc_proto() is dropped as it's not used anymore here. Merged-by: Clément Bœsch <cboesch@gopro.com> 2017-01-31 17:50:21 +02:00			`;* This file is part of FFmpeg.`
hevc: Add AVX2 DC IDCT Originally written by Pierre Edouard Lepere <pierre-edouard.lepere@insa-rennes.fr>. Integrated to Libav by Josh de Kock <josh@itanimul.li>. Signed-off-by: Alexandra Hájková <alexandra@khirnov.net> 2016-06-29 11:56:42 +02:00			`;*`
Merge commit 'fca3c3b61952aacc45e9ca54d86a762946c21942' * commit 'fca3c3b61952aacc45e9ca54d86a762946c21942': hevc: Add AVX2 DC IDCT Mostly noop as we already have that code. In the ASM, code is merged with the exception of SECTION which is kept uppercase for consistency with the rest of the codebase. Still in the ASM, the prototype comment is fixed to honor the '_' added from the original commit. idct_dc_proto() is dropped as it's not used anymore here. Merged-by: Clément Bœsch <cboesch@gopro.com> 2017-01-31 17:50:21 +02:00			`;* FFmpeg is free software; you can redistribute it and/or`
hevc: Add AVX2 DC IDCT Originally written by Pierre Edouard Lepere <pierre-edouard.lepere@insa-rennes.fr>. Integrated to Libav by Josh de Kock <josh@itanimul.li>. Signed-off-by: Alexandra Hájková <alexandra@khirnov.net> 2016-06-29 11:56:42 +02:00			`;* modify it under the terms of the GNU Lesser General Public`
			`;* License as published by the Free Software Foundation; either`
			`;* version 2.1 of the License, or (at your option) any later version.`
			`;*`
Merge commit 'fca3c3b61952aacc45e9ca54d86a762946c21942' * commit 'fca3c3b61952aacc45e9ca54d86a762946c21942': hevc: Add AVX2 DC IDCT Mostly noop as we already have that code. In the ASM, code is merged with the exception of SECTION which is kept uppercase for consistency with the rest of the codebase. Still in the ASM, the prototype comment is fixed to honor the '_' added from the original commit. idct_dc_proto() is dropped as it's not used anymore here. Merged-by: Clément Bœsch <cboesch@gopro.com> 2017-01-31 17:50:21 +02:00			`;* FFmpeg is distributed in the hope that it will be useful,`
hevc: Add AVX2 DC IDCT Originally written by Pierre Edouard Lepere <pierre-edouard.lepere@insa-rennes.fr>. Integrated to Libav by Josh de Kock <josh@itanimul.li>. Signed-off-by: Alexandra Hájková <alexandra@khirnov.net> 2016-06-29 11:56:42 +02:00			`;* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`;* Lesser General Public License for more details.`
			`;*`
			`;* You should have received a copy of the GNU Lesser General Public`
Merge commit 'fca3c3b61952aacc45e9ca54d86a762946c21942' * commit 'fca3c3b61952aacc45e9ca54d86a762946c21942': hevc: Add AVX2 DC IDCT Mostly noop as we already have that code. In the ASM, code is merged with the exception of SECTION which is kept uppercase for consistency with the rest of the codebase. Still in the ASM, the prototype comment is fixed to honor the '_' added from the original commit. idct_dc_proto() is dropped as it's not used anymore here. Merged-by: Clément Bœsch <cboesch@gopro.com> 2017-01-31 17:50:21 +02:00			`;* License along with FFmpeg; if not, write to the Free Software`
hevc: Add AVX2 DC IDCT Originally written by Pierre Edouard Lepere <pierre-edouard.lepere@insa-rennes.fr>. Integrated to Libav by Josh de Kock <josh@itanimul.li>. Signed-off-by: Alexandra Hájková <alexandra@khirnov.net> 2016-06-29 11:56:42 +02:00			`;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`;******************************************************************************`

avcodec/hevc: new idct + asm Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-06-13 14:29:17 +03:00			`%include "libavutil/x86/x86util.asm"`

x86inc: Drop SECTION_TEXT macro The .text section is already 16-byte aligned by default on all supported platforms so `SECTION_TEXT` isn't any different from `SECTION .text`. 2015-05-27 21:38:14 +02:00			`SECTION .text`
avcodec/hevc: new idct + asm Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-06-13 14:29:17 +03:00
Merge commit 'fca3c3b61952aacc45e9ca54d86a762946c21942' * commit 'fca3c3b61952aacc45e9ca54d86a762946c21942': hevc: Add AVX2 DC IDCT Mostly noop as we already have that code. In the ASM, code is merged with the exception of SECTION which is kept uppercase for consistency with the rest of the codebase. Still in the ASM, the prototype comment is fixed to honor the '_' added from the original commit. idct_dc_proto() is dropped as it's not used anymore here. Merged-by: Clément Bœsch <cboesch@gopro.com> 2017-01-31 17:50:21 +02:00			`; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)`
x86/hevc_idct: replace old and unused idct functions Only 8-bit and 10-bit idct_dc() functions are included (adding others should be trivial). Benchmarks on an Intel Core i5-4200U: idct8x8_dc SSE2 MMXEXT C cycles 22 26 57 idct16x16_dc AVX2 SSE2 C cycles 27 32 249 idct32x32_dc AVX2 SSE2 C cycles 62 126 1375 Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Mickaël Raulet <mraulet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-26 10:47:14 +03:00			`; %1 = HxW`
			`; %2 = number of loops`
			`; %3 = bitdepth`
			`%macro IDCT_DC 3`
hevc: Add AVX2 DC IDCT Originally written by Pierre Edouard Lepere <pierre-edouard.lepere@insa-rennes.fr>. Integrated to Libav by Josh de Kock <josh@itanimul.li>. Signed-off-by: Alexandra Hájková <alexandra@khirnov.net> 2016-06-29 11:56:42 +02:00			`cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp`
			`movsx tmpd, word [coeffq]`
			`add tmpd, (1 << (14 - %3)) + 1`
			`sar tmpd, (15 - %3)`
x86/hevc_idct: replace old and unused idct functions Only 8-bit and 10-bit idct_dc() functions are included (adding others should be trivial). Benchmarks on an Intel Core i5-4200U: idct8x8_dc SSE2 MMXEXT C cycles 22 26 57 idct16x16_dc AVX2 SSE2 C cycles 27 32 249 idct32x32_dc AVX2 SSE2 C cycles 62 126 1375 Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Mickaël Raulet <mraulet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-26 10:47:14 +03:00			`movd xm0, tmpd`
			`SPLATW m0, xm0`
			`DEFINE_ARGS coeff, cnt`
			`mov cntd, %2`
x86/hevc_idct: add a colon to labels This fixes a warning spam when using NASM Signed-off-by: James Almer <jamrial@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-28 22:17:34 +03:00			`.loop:`
x86/hevc_idct: replace old and unused idct functions Only 8-bit and 10-bit idct_dc() functions are included (adding others should be trivial). Benchmarks on an Intel Core i5-4200U: idct8x8_dc SSE2 MMXEXT C cycles 22 26 57 idct16x16_dc AVX2 SSE2 C cycles 27 32 249 idct32x32_dc AVX2 SSE2 C cycles 62 126 1375 Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Mickaël Raulet <mraulet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-26 10:47:14 +03:00			`mova [coeffq+mmsize*0], m0`
			`mova [coeffq+mmsize*1], m0`
			`mova [coeffq+mmsize*2], m0`
			`mova [coeffq+mmsize*3], m0`
			`add coeffq, mmsize*8`
hevc: Add AVX2 DC IDCT Originally written by Pierre Edouard Lepere <pierre-edouard.lepere@insa-rennes.fr>. Integrated to Libav by Josh de Kock <josh@itanimul.li>. Signed-off-by: Alexandra Hájková <alexandra@khirnov.net> 2016-06-29 11:56:42 +02:00			`mova [coeffq+mmsize*-4], m0`
			`mova [coeffq+mmsize*-3], m0`
			`mova [coeffq+mmsize*-2], m0`
			`mova [coeffq+mmsize*-1], m0`
x86/hevc_idct: replace old and unused idct functions Only 8-bit and 10-bit idct_dc() functions are included (adding others should be trivial). Benchmarks on an Intel Core i5-4200U: idct8x8_dc SSE2 MMXEXT C cycles 22 26 57 idct16x16_dc AVX2 SSE2 C cycles 27 32 249 idct32x32_dc AVX2 SSE2 C cycles 62 126 1375 Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Mickaël Raulet <mraulet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-26 10:47:14 +03:00			`dec cntd`
			`jg .loop`
			`RET`
avcodec/x86/hevc: add avx2 dc idct Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-06-16 15:47:21 +03:00			`%endmacro`

x86/hevc_idct: replace old and unused idct functions Only 8-bit and 10-bit idct_dc() functions are included (adding others should be trivial). Benchmarks on an Intel Core i5-4200U: idct8x8_dc SSE2 MMXEXT C cycles 22 26 57 idct16x16_dc AVX2 SSE2 C cycles 27 32 249 idct32x32_dc AVX2 SSE2 C cycles 62 126 1375 Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Mickaël Raulet <mraulet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-26 10:47:14 +03:00			`; %1 = HxW`
			`; %2 = bitdepth`
			`%macro IDCT_DC_NL 2 ; No loop`
hevc: Add AVX2 DC IDCT Originally written by Pierre Edouard Lepere <pierre-edouard.lepere@insa-rennes.fr>. Integrated to Libav by Josh de Kock <josh@itanimul.li>. Signed-off-by: Alexandra Hájková <alexandra@khirnov.net> 2016-06-29 11:56:42 +02:00			`cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp`
			`movsx tmpd, word [coeffq]`
			`add tmpd, (1 << (14 - %2)) + 1`
			`sar tmpd, (15 - %2)`
x86/hevc_idct: replace old and unused idct functions Only 8-bit and 10-bit idct_dc() functions are included (adding others should be trivial). Benchmarks on an Intel Core i5-4200U: idct8x8_dc SSE2 MMXEXT C cycles 22 26 57 idct16x16_dc AVX2 SSE2 C cycles 27 32 249 idct32x32_dc AVX2 SSE2 C cycles 62 126 1375 Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Mickaël Raulet <mraulet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-26 10:47:14 +03:00			`movd m0, tmpd`
			`SPLATW m0, xm0`
			`mova [coeffq+mmsize*0], m0`
			`mova [coeffq+mmsize*1], m0`
			`mova [coeffq+mmsize*2], m0`
			`mova [coeffq+mmsize*3], m0`
			`%if mmsize == 16`
			`mova [coeffq+mmsize*4], m0`
			`mova [coeffq+mmsize*5], m0`
			`mova [coeffq+mmsize*6], m0`
			`mova [coeffq+mmsize*7], m0`
			`%endif`
			`RET`
avcodec/hevc: new idct + asm Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-06-13 14:29:17 +03:00			`%endmacro`

x86/hevc_idct: replace old and unused idct functions Only 8-bit and 10-bit idct_dc() functions are included (adding others should be trivial). Benchmarks on an Intel Core i5-4200U: idct8x8_dc SSE2 MMXEXT C cycles 22 26 57 idct16x16_dc AVX2 SSE2 C cycles 27 32 249 idct32x32_dc AVX2 SSE2 C cycles 62 126 1375 Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Mickaël Raulet <mraulet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-26 10:47:14 +03:00			`; 8-bit`
avcodec/hevc: new idct + asm Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-06-13 14:29:17 +03:00			`INIT_MMX mmxext`
x86/hevc_idct: replace old and unused idct functions Only 8-bit and 10-bit idct_dc() functions are included (adding others should be trivial). Benchmarks on an Intel Core i5-4200U: idct8x8_dc SSE2 MMXEXT C cycles 22 26 57 idct16x16_dc AVX2 SSE2 C cycles 27 32 249 idct32x32_dc AVX2 SSE2 C cycles 62 126 1375 Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Mickaël Raulet <mraulet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-26 10:47:14 +03:00			`IDCT_DC_NL 4, 8`
			`IDCT_DC 8, 2, 8`
avcodec/hevc: new idct + asm Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-06-13 14:29:17 +03:00
			`INIT_XMM sse2`
x86/hevc_idct: replace old and unused idct functions Only 8-bit and 10-bit idct_dc() functions are included (adding others should be trivial). Benchmarks on an Intel Core i5-4200U: idct8x8_dc SSE2 MMXEXT C cycles 22 26 57 idct16x16_dc AVX2 SSE2 C cycles 27 32 249 idct32x32_dc AVX2 SSE2 C cycles 62 126 1375 Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Mickaël Raulet <mraulet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-26 10:47:14 +03:00			`IDCT_DC_NL 8, 8`
			`IDCT_DC 16, 4, 8`
			`IDCT_DC 32, 16, 8`
avcodec/hevc: new idct + asm Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-06-13 14:29:17 +03:00
avcodec/x86/hevc: add avx2 dc idct Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-06-16 15:47:21 +03:00			`%if HAVE_AVX2_EXTERNAL`
			`INIT_YMM avx2`
x86/hevc_idct: replace old and unused idct functions Only 8-bit and 10-bit idct_dc() functions are included (adding others should be trivial). Benchmarks on an Intel Core i5-4200U: idct8x8_dc SSE2 MMXEXT C cycles 22 26 57 idct16x16_dc AVX2 SSE2 C cycles 27 32 249 idct32x32_dc AVX2 SSE2 C cycles 62 126 1375 Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Mickaël Raulet <mraulet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-26 10:47:14 +03:00			`IDCT_DC 16, 2, 8`
			`IDCT_DC 32, 8, 8`
avcodec/x86/hevc: add avx2 dc idct Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-06-16 15:47:21 +03:00			`%endif ;HAVE_AVX2_EXTERNAL`
avcodec/hevc: new idct + asm Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-06-13 14:29:17 +03:00
x86/hevc_idct: replace old and unused idct functions Only 8-bit and 10-bit idct_dc() functions are included (adding others should be trivial). Benchmarks on an Intel Core i5-4200U: idct8x8_dc SSE2 MMXEXT C cycles 22 26 57 idct16x16_dc AVX2 SSE2 C cycles 27 32 249 idct32x32_dc AVX2 SSE2 C cycles 62 126 1375 Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Mickaël Raulet <mraulet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-26 10:47:14 +03:00			`; 10-bit`
avcodec/hevc: new idct + asm Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-06-13 14:29:17 +03:00			`INIT_MMX mmxext`
x86/hevc_idct: replace old and unused idct functions Only 8-bit and 10-bit idct_dc() functions are included (adding others should be trivial). Benchmarks on an Intel Core i5-4200U: idct8x8_dc SSE2 MMXEXT C cycles 22 26 57 idct16x16_dc AVX2 SSE2 C cycles 27 32 249 idct32x32_dc AVX2 SSE2 C cycles 62 126 1375 Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Mickaël Raulet <mraulet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-26 10:47:14 +03:00			`IDCT_DC_NL 4, 10`
			`IDCT_DC 8, 2, 10`
avcodec/hevc: new idct + asm Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-06-13 14:29:17 +03:00
			`INIT_XMM sse2`
x86/hevc_idct: replace old and unused idct functions Only 8-bit and 10-bit idct_dc() functions are included (adding others should be trivial). Benchmarks on an Intel Core i5-4200U: idct8x8_dc SSE2 MMXEXT C cycles 22 26 57 idct16x16_dc AVX2 SSE2 C cycles 27 32 249 idct32x32_dc AVX2 SSE2 C cycles 62 126 1375 Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Mickaël Raulet <mraulet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-26 10:47:14 +03:00			`IDCT_DC_NL 8, 10`
			`IDCT_DC 16, 4, 10`
			`IDCT_DC 32, 16, 10`
avcodec/x86/hevc: add avx2 dc idct Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-06-16 15:47:21 +03:00
			`%if HAVE_AVX2_EXTERNAL`
			`INIT_YMM avx2`
x86/hevc_idct: replace old and unused idct functions Only 8-bit and 10-bit idct_dc() functions are included (adding others should be trivial). Benchmarks on an Intel Core i5-4200U: idct8x8_dc SSE2 MMXEXT C cycles 22 26 57 idct16x16_dc AVX2 SSE2 C cycles 27 32 249 idct32x32_dc AVX2 SSE2 C cycles 62 126 1375 Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Mickaël Raulet <mraulet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-26 10:47:14 +03:00			`IDCT_DC 16, 2, 10`
			`IDCT_DC 32, 8, 10`
			`%endif ;HAVE_AVX2_EXTERNAL`
x86/hevc_idct: add 12bit idct_dc Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Mickaël Raulet <mraulet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-26 21:08:51 +03:00
			`; 12-bit`
			`INIT_MMX mmxext`
			`IDCT_DC_NL 4, 12`
			`IDCT_DC 8, 2, 12`

			`INIT_XMM sse2`
			`IDCT_DC_NL 8, 12`
			`IDCT_DC 16, 4, 12`
			`IDCT_DC 32, 16, 12`

			`%if HAVE_AVX2_EXTERNAL`
			`INIT_YMM avx2`
			`IDCT_DC 16, 2, 12`
			`IDCT_DC 32, 8, 12`
			`%endif ;HAVE_AVX2_EXTERNAL`