FFmpeg/libavcodec/x86/idctdsp_init.c

/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"
#include "idctdsp.h"
#include "simple_idct.h"

/*
 * Input permutation for the simple_idct_mmx.
 *
 * ff_init_scantable_permutation_x86() copies these 64 entries verbatim into
 * the caller's permutation table when the requested permutation type is
 * FF_IDCT_PERM_SIMPLE.
 */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/*
 * Lookup used for FF_IDCT_PERM_SSE2: the low three bits of an index select
 * an entry here, while bits 3-5 of the index (mask 0x38) are kept unchanged
 * by ff_init_scantable_permutation_x86().
 */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

/**
 * Fill a 64-entry IDCT permutation table for the x86-specific layouts.
 *
 * @param idct_permutation destination table; 64 bytes are written when the
 *                         permutation type is handled here
 * @param perm_type        requested permutation type
 * @return 1 if perm_type was FF_IDCT_PERM_SIMPLE or FF_IDCT_PERM_SSE2 and the
 *         table was filled, 0 otherwise (the table is left untouched)
 */
av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
                                              enum idct_permutation_type perm_type)
{
    int idx = 0;

    if (perm_type == FF_IDCT_PERM_SIMPLE) {
        /* Straight copy of the precomputed MMX table. */
        while (idx < 64) {
            idct_permutation[idx] = simple_mmx_permutation[idx];
            idx++;
        }
        return 1;
    }

    if (perm_type == FF_IDCT_PERM_SSE2) {
        /* Keep bits 3-5 of the index, remap bits 0-2 through the lookup. */
        for (; idx < 64; idx++) {
            const int kept_bits = idx & 0x38;
            const int low_bits  = idx & 7;

            idct_permutation[idx] = kept_bits | idct_sse2_row_perm[low_bits];
        }
        return 1;
    }

    /* Any other permutation type is not handled by this function. */
    return 0;
}

/**
 * Install x86-specific IDCT and pixel-clamping routines into @p c.
 *
 * The blocks run in sequence and each one overwrites the pointers set by the
 * blocks before it: SSE2 assignments follow the MMX ones and AVX assignments
 * follow the SSE2 ones, so when several CPU flags match, the last matching
 * block wins.
 *
 * @param c              DSP context whose function pointers and perm_type
 *                       are filled in
 * @param avctx          codec context; lowres, idct_algo and
 *                       bits_per_raw_sample are read from it
 * @param high_bit_depth when non-zero, the MMX simple IDCT is not selected
 */
av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                                 unsigned high_bit_depth)
{
    const int cpu_flags = av_get_cpu_flags();

    /* Inline-asm MMX simple IDCT: only without high bit depth and lowres. */
    if (INLINE_MMX(cpu_flags)) {
        if (!high_bit_depth && !avctx->lowres) {
            switch (avctx->idct_algo) {
            case FF_IDCT_AUTO:
            case FF_IDCT_SIMPLEAUTO:
            case FF_IDCT_SIMPLEMMX:
                c->perm_type = FF_IDCT_PERM_SIMPLE;
                c->idct      = ff_simple_idct_mmx;
                c->idct_add  = ff_simple_idct_add_mmx;
                c->idct_put  = ff_simple_idct_put_mmx;
                break;
            default:
                break;
            }
        }
    }

    /* Pixel clamping helpers: MMX first, replaced by SSE2 when available. */
    if (EXTERNAL_MMX(cpu_flags)) {
        c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->add_pixels_clamped        = ff_add_pixels_clamped_sse2;
        c->put_pixels_clamped        = ff_put_pixels_clamped_sse2;
        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
    }

    /* 10- and 12-bit simple IDCTs: x86-64 only, and only without lowres. */
    if (ARCH_X86_64 && !avctx->lowres) {
        switch (avctx->bits_per_raw_sample) {
        case 10:
            if (avctx->idct_algo == FF_IDCT_SIMPLE     ||
                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
                avctx->idct_algo == FF_IDCT_AUTO) {
                if (EXTERNAL_SSE2(cpu_flags)) {
                    c->perm_type = FF_IDCT_PERM_TRANSPOSE;
                    c->idct      = ff_simple_idct10_sse2;
                    c->idct_put  = ff_simple_idct10_put_sse2;
                    /* No "add" variant is installed for this IDCT. */
                    c->idct_add  = NULL;
                }
                if (EXTERNAL_AVX(cpu_flags)) {
                    c->perm_type = FF_IDCT_PERM_TRANSPOSE;
                    c->idct      = ff_simple_idct10_avx;
                    c->idct_put  = ff_simple_idct10_put_avx;
                    c->idct_add  = NULL;
                }
            }
            break;
        case 12:
            /* Unlike the 10-bit case, only AUTO and SIMPLEMMX select this. */
            if (avctx->idct_algo == FF_IDCT_SIMPLEMMX ||
                avctx->idct_algo == FF_IDCT_AUTO) {
                if (EXTERNAL_SSE2(cpu_flags)) {
                    c->perm_type = FF_IDCT_PERM_TRANSPOSE;
                    c->idct      = ff_simple_idct12_sse2;
                    c->idct_put  = ff_simple_idct12_put_sse2;
                    c->idct_add  = NULL;
                }
                if (EXTERNAL_AVX(cpu_flags)) {
                    c->perm_type = FF_IDCT_PERM_TRANSPOSE;
                    c->idct      = ff_simple_idct12_avx;
                    c->idct_put  = ff_simple_idct12_put_avx;
                    c->idct_add  = NULL;
                }
            }
            break;
        default:
            break;
        }
    }
}
dsputil: Split off IDCT bits into their own context 2014-01-24 13:55:16 +03:00			`/*`
Merge commit 'e3fcb14347466095839c2a3c47ebecff02da891e' * commit 'e3fcb14347466095839c2a3c47ebecff02da891e': dsputil: Split off IDCT bits into their own context Conflicts: configure libavcodec/aic.c libavcodec/arm/Makefile libavcodec/arm/dsputil_init_arm.c libavcodec/arm/dsputil_init_armv6.c libavcodec/asvdec.c libavcodec/dnxhdenc.c libavcodec/dsputil.c libavcodec/dvdec.c libavcodec/dxva2_mpeg2.c libavcodec/intrax8.c libavcodec/mdec.c libavcodec/mjpegdec.c libavcodec/mjpegenc_common.h libavcodec/mpegvideo.c libavcodec/ppc/dsputil_altivec.h libavcodec/ppc/dsputil_ppc.c libavcodec/ppc/idctdsp.c libavcodec/x86/Makefile libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c libavcodec/x86/dsputil_x86.h Merged-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-01 15:38:57 +03:00			`* This file is part of FFmpeg.`
dsputil: Split off IDCT bits into their own context 2014-01-24 13:55:16 +03:00			`*`
Merge commit 'e3fcb14347466095839c2a3c47ebecff02da891e' * commit 'e3fcb14347466095839c2a3c47ebecff02da891e': dsputil: Split off IDCT bits into their own context Conflicts: configure libavcodec/aic.c libavcodec/arm/Makefile libavcodec/arm/dsputil_init_arm.c libavcodec/arm/dsputil_init_armv6.c libavcodec/asvdec.c libavcodec/dnxhdenc.c libavcodec/dsputil.c libavcodec/dvdec.c libavcodec/dxva2_mpeg2.c libavcodec/intrax8.c libavcodec/mdec.c libavcodec/mjpegdec.c libavcodec/mjpegenc_common.h libavcodec/mpegvideo.c libavcodec/ppc/dsputil_altivec.h libavcodec/ppc/dsputil_ppc.c libavcodec/ppc/idctdsp.c libavcodec/x86/Makefile libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c libavcodec/x86/dsputil_x86.h Merged-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-01 15:38:57 +03:00			`* FFmpeg is free software; you can redistribute it and/or`
dsputil: Split off IDCT bits into their own context 2014-01-24 13:55:16 +03:00			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
Merge commit 'e3fcb14347466095839c2a3c47ebecff02da891e' * commit 'e3fcb14347466095839c2a3c47ebecff02da891e': dsputil: Split off IDCT bits into their own context Conflicts: configure libavcodec/aic.c libavcodec/arm/Makefile libavcodec/arm/dsputil_init_arm.c libavcodec/arm/dsputil_init_armv6.c libavcodec/asvdec.c libavcodec/dnxhdenc.c libavcodec/dsputil.c libavcodec/dvdec.c libavcodec/dxva2_mpeg2.c libavcodec/intrax8.c libavcodec/mdec.c libavcodec/mjpegdec.c libavcodec/mjpegenc_common.h libavcodec/mpegvideo.c libavcodec/ppc/dsputil_altivec.h libavcodec/ppc/dsputil_ppc.c libavcodec/ppc/idctdsp.c libavcodec/x86/Makefile libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c libavcodec/x86/dsputil_x86.h Merged-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-01 15:38:57 +03:00			`* FFmpeg is distributed in the hope that it will be useful,`
dsputil: Split off IDCT bits into their own context 2014-01-24 13:55:16 +03:00			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
Merge commit 'e3fcb14347466095839c2a3c47ebecff02da891e' * commit 'e3fcb14347466095839c2a3c47ebecff02da891e': dsputil: Split off IDCT bits into their own context Conflicts: configure libavcodec/aic.c libavcodec/arm/Makefile libavcodec/arm/dsputil_init_arm.c libavcodec/arm/dsputil_init_armv6.c libavcodec/asvdec.c libavcodec/dnxhdenc.c libavcodec/dsputil.c libavcodec/dvdec.c libavcodec/dxva2_mpeg2.c libavcodec/intrax8.c libavcodec/mdec.c libavcodec/mjpegdec.c libavcodec/mjpegenc_common.h libavcodec/mpegvideo.c libavcodec/ppc/dsputil_altivec.h libavcodec/ppc/dsputil_ppc.c libavcodec/ppc/idctdsp.c libavcodec/x86/Makefile libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c libavcodec/x86/dsputil_x86.h Merged-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-01 15:38:57 +03:00			`* License along with FFmpeg; if not, write to the Free Software`
dsputil: Split off IDCT bits into their own context 2014-01-24 13:55:16 +03:00			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`#include "config.h"`
			`#include "libavutil/attributes.h"`
			`#include "libavutil/cpu.h"`
			`#include "libavutil/x86/cpu.h"`
			`#include "libavcodec/avcodec.h"`
			`#include "libavcodec/idctdsp.h"`
			`#include "idctdsp.h"`
simple_idct: Move x86-specific declarations to a header in the x86 directory 2014-02-18 15:20:01 +03:00			`#include "simple_idct.h"`
dsputil: Split off IDCT bits into their own context 2014-01-24 13:55:16 +03:00
			`/* Input permutation for the simple_idct_mmx */`
			`static const uint8_t simple_mmx_permutation[64] = {`
			`0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,`
			`0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,`
			`0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,`
			`0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,`
			`0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,`
			`0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,`
			`0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,`
			`0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,`
			`};`

xvididct: Ensure that the scantable permutation is always set correctly This fixes cases where the scantable permutation would get overwritten by the general idctdsp initialization. 2014-08-08 13:10:05 +03:00			`static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };`

dsputil: Split off IDCT bits into their own context 2014-01-24 13:55:16 +03:00			`av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,`
idct: Convert IDCT permutation #defines to an enum Also rename the enum values to be consistent with other DCT permutations. 2014-02-18 13:30:55 +03:00			`enum idct_permutation_type perm_type)`
dsputil: Split off IDCT bits into their own context 2014-01-24 13:55:16 +03:00			`{`
			`int i;`

idct: Convert IDCT permutation #defines to an enum Also rename the enum values to be consistent with other DCT permutations. 2014-02-18 13:30:55 +03:00			`switch (perm_type) {`
			`case FF_IDCT_PERM_SIMPLE:`
dsputil: Split off IDCT bits into their own context 2014-01-24 13:55:16 +03:00			`for (i = 0; i < 64; i++)`
			`idct_permutation[i] = simple_mmx_permutation[i];`
			`return 1;`
xvididct: Ensure that the scantable permutation is always set correctly This fixes cases where the scantable permutation would get overwritten by the general idctdsp initialization. 2014-08-08 13:10:05 +03:00			`case FF_IDCT_PERM_SSE2:`
			`for (i = 0; i < 64; i++)`
			`idct_permutation[i] = (i & 0x38) \| idct_sse2_row_perm[i & 7];`
			`return 1;`
dsputil: Split off IDCT bits into their own context 2014-01-24 13:55:16 +03:00			`}`

			`return 0;`
			`}`

			`av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,`
			`unsigned high_bit_depth)`
			`{`
			`int cpu_flags = av_get_cpu_flags();`

			`if (INLINE_MMX(cpu_flags)) {`
idct: Split off Xvid IDCT The Xvid IDCT is only required to decode some Xvid-encoded MPEG-4 files, so there is no point in having it as an unconditional part of idctdsp. 2014-07-21 23:13:21 +03:00			`if (!high_bit_depth &&`
Merge commit 'a786c8259dafeca9744252230b5d78f67810770c' * commit 'a786c8259dafeca9744252230b5d78f67810770c': idct: Split off Xvid IDCT Conflicts: libavcodec/Makefile libavcodec/mpeg4videodec.c libavcodec/x86/Makefile libavcodec/x86/idctdsp_init.c This split is somewhat restructured leaving the xvid IDCT available outside mpeg4 if manually selected. The code also could not be merged unchanged as it conflicted with a bugfix in FFmpeg Merged-by: Michael Niedermayer <michaelni@gmx.at> 2014-08-01 16:38:24 +03:00			`avctx->lowres == 0 &&`
idct: Split off Xvid IDCT The Xvid IDCT is only required to decode some Xvid-encoded MPEG-4 files, so there is no point in having it as an unconditional part of idctdsp. 2014-07-21 23:13:21 +03:00			`(avctx->idct_algo == FF_IDCT_AUTO \|\|`
Merge commit 'a786c8259dafeca9744252230b5d78f67810770c' * commit 'a786c8259dafeca9744252230b5d78f67810770c': idct: Split off Xvid IDCT Conflicts: libavcodec/Makefile libavcodec/mpeg4videodec.c libavcodec/x86/Makefile libavcodec/x86/idctdsp_init.c This split is somewhat restructured leaving the xvid IDCT available outside mpeg4 if manually selected. The code also could not be merged unchanged as it conflicted with a bugfix in FFmpeg Merged-by: Michael Niedermayer <michaelni@gmx.at> 2014-08-01 16:38:24 +03:00			`avctx->idct_algo == FF_IDCT_SIMPLEAUTO \|\|`
idct: Split off Xvid IDCT The Xvid IDCT is only required to decode some Xvid-encoded MPEG-4 files, so there is no point in having it as an unconditional part of idctdsp. 2014-07-21 23:13:21 +03:00			`avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {`
idctdsp: prettyprinting cosmetics 2014-07-18 17:37:35 +03:00			`c->idct_put = ff_simple_idct_put_mmx;`
			`c->idct_add = ff_simple_idct_add_mmx;`
			`c->idct = ff_simple_idct_mmx;`
			`c->perm_type = FF_IDCT_PERM_SIMPLE;`
dsputil: Split off IDCT bits into their own context 2014-01-24 13:55:16 +03:00			`}`
			`}`
Merge commit 'e3fcb14347466095839c2a3c47ebecff02da891e' * commit 'e3fcb14347466095839c2a3c47ebecff02da891e': dsputil: Split off IDCT bits into their own context Conflicts: configure libavcodec/aic.c libavcodec/arm/Makefile libavcodec/arm/dsputil_init_arm.c libavcodec/arm/dsputil_init_armv6.c libavcodec/asvdec.c libavcodec/dnxhdenc.c libavcodec/dsputil.c libavcodec/dvdec.c libavcodec/dxva2_mpeg2.c libavcodec/intrax8.c libavcodec/mdec.c libavcodec/mjpegdec.c libavcodec/mjpegenc_common.h libavcodec/mpegvideo.c libavcodec/ppc/dsputil_altivec.h libavcodec/ppc/dsputil_ppc.c libavcodec/ppc/idctdsp.c libavcodec/x86/Makefile libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c libavcodec/x86/dsputil_x86.h Merged-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-01 15:38:57 +03:00			`if (EXTERNAL_MMX(cpu_flags)) {`
			`c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;`
x86/idctdsp: port {put,add}_pixels_clamped to yasm Also add sse2 versions for both. put_pixels_clamped port and sse2 version originally written by Timothy Gu. Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com> 2014-09-25 01:53:07 +03:00			`c->put_pixels_clamped = ff_put_pixels_clamped_mmx;`
			`c->add_pixels_clamped = ff_add_pixels_clamped_mmx;`
Merge commit 'e3fcb14347466095839c2a3c47ebecff02da891e' * commit 'e3fcb14347466095839c2a3c47ebecff02da891e': dsputil: Split off IDCT bits into their own context Conflicts: configure libavcodec/aic.c libavcodec/arm/Makefile libavcodec/arm/dsputil_init_arm.c libavcodec/arm/dsputil_init_armv6.c libavcodec/asvdec.c libavcodec/dnxhdenc.c libavcodec/dsputil.c libavcodec/dvdec.c libavcodec/dxva2_mpeg2.c libavcodec/intrax8.c libavcodec/mdec.c libavcodec/mjpegdec.c libavcodec/mjpegenc_common.h libavcodec/mpegvideo.c libavcodec/ppc/dsputil_altivec.h libavcodec/ppc/dsputil_ppc.c libavcodec/ppc/idctdsp.c libavcodec/x86/Makefile libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c libavcodec/x86/dsputil_x86.h Merged-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-01 15:38:57 +03:00			`}`
			`if (EXTERNAL_SSE2(cpu_flags)) {`
			`c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;`
x86/idctdsp: port {put,add}_pixels_clamped to yasm Also add sse2 versions for both. put_pixels_clamped port and sse2 version originally written by Timothy Gu. Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com> 2014-09-25 01:53:07 +03:00			`c->put_pixels_clamped = ff_put_pixels_clamped_sse2;`
			`c->add_pixels_clamped = ff_add_pixels_clamped_sse2;`
Merge commit 'e3fcb14347466095839c2a3c47ebecff02da891e' * commit 'e3fcb14347466095839c2a3c47ebecff02da891e': dsputil: Split off IDCT bits into their own context Conflicts: configure libavcodec/aic.c libavcodec/arm/Makefile libavcodec/arm/dsputil_init_arm.c libavcodec/arm/dsputil_init_armv6.c libavcodec/asvdec.c libavcodec/dnxhdenc.c libavcodec/dsputil.c libavcodec/dvdec.c libavcodec/dxva2_mpeg2.c libavcodec/intrax8.c libavcodec/mdec.c libavcodec/mjpegdec.c libavcodec/mjpegenc_common.h libavcodec/mpegvideo.c libavcodec/ppc/dsputil_altivec.h libavcodec/ppc/dsputil_ppc.c libavcodec/ppc/idctdsp.c libavcodec/x86/Makefile libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c libavcodec/x86/dsputil_x86.h Merged-by: Michael Niedermayer <michaelni@gmx.at> 2014-07-01 15:38:57 +03:00			`}`
x86: simple_idct(_put): 10bits versions Modeled from the prores version. Clips to [0;1023] and is bitexact. Bitexactness requires to add offsets in different places compared to prores or C, and makes the function approximately 2% slower. For 16 frames of a DNxHD 4:2:2 10bits test sequence: C: 60861 decicycles in idct, 1048205 runs, 371 skips sse2: 27567 decicycles in idct, 1048216 runs, 360 skips avx: 26272 decicycles in idct, 1048171 runs, 405 skips The add version is not implemented, so the corresponding dsp function is set to NULL to make it clear in a code executing it. Signed-off-by: Michael Niedermayer <michael@niedermayer.cc> 2015-10-12 19:37:47 +02:00
x86: simple_idct: 12bits versions On 12 frames of a 444p 12 bits DNxHR sequence, _put function: C: 78902 decicycles in idct, 262071 runs, 73 skips avx: 32478 decicycles in idct, 262045 runs, 99 skips Difference between the 2: stddev: 0.39 PSNR:104.47 MAXDIFF: 2 This is unavoidable and due to the scale factors used in the x86 version, which cannot match the C ones. In addition, the trick of adding an initial bias to the input of a pass can overflow, as the input coefficients are already 15bits, which is the maximum this function can handle. Overall, however, the omse on 12 bits samples goes from 0.16916 to 0.16883. Reducing rowshift by 1 improves to 0.0908, but causes overflows. Signed-off-by: Michael Niedermayer <michael@niedermayer.cc> 2015-10-12 19:37:49 +02:00			`if (ARCH_X86_64 && avctx->lowres == 0) {`
			`if (avctx->bits_per_raw_sample == 10 &&`
x86: simple_idct(_put): 10bits versions Modeled from the prores version. Clips to [0;1023] and is bitexact. Bitexactness requires to add offsets in different places compared to prores or C, and makes the function approximately 2% slower. For 16 frames of a DNxHD 4:2:2 10bits test sequence: C: 60861 decicycles in idct, 1048205 runs, 371 skips sse2: 27567 decicycles in idct, 1048216 runs, 360 skips avx: 26272 decicycles in idct, 1048171 runs, 405 skips The add version is not implemented, so the corresponding dsp function is set to NULL to make it clear in a code executing it. Signed-off-by: Michael Niedermayer <michael@niedermayer.cc> 2015-10-12 19:37:47 +02:00			`(avctx->idct_algo == FF_IDCT_AUTO \|\|`
			`avctx->idct_algo == FF_IDCT_SIMPLEAUTO \|\|`
			`avctx->idct_algo == FF_IDCT_SIMPLE)) {`
			`if (EXTERNAL_SSE2(cpu_flags)) {`
			`c->idct_put = ff_simple_idct10_put_sse2;`
			`c->idct_add = NULL;`
			`c->idct = ff_simple_idct10_sse2;`
			`c->perm_type = FF_IDCT_PERM_TRANSPOSE;`

			`}`
			`if (EXTERNAL_AVX(cpu_flags)) {`
			`c->idct_put = ff_simple_idct10_put_avx;`
			`c->idct_add = NULL;`
			`c->idct = ff_simple_idct10_avx;`
			`c->perm_type = FF_IDCT_PERM_TRANSPOSE;`
			`}`
x86: simple_idct: 12bits versions On 12 frames of a 444p 12 bits DNxHR sequence, _put function: C: 78902 decicycles in idct, 262071 runs, 73 skips avx: 32478 decicycles in idct, 262045 runs, 99 skips Difference between the 2: stddev: 0.39 PSNR:104.47 MAXDIFF: 2 This is unavoidable and due to the scale factors used in the x86 version, which cannot match the C ones. In addition, the trick of adding an initial bias to the input of a pass can overflow, as the input coefficients are already 15bits, which is the maximum this function can handle. Overall, however, the omse on 12 bits samples goes from 0.16916 to 0.16883. Reducing rowshift by 1 improves to 0.0908, but causes overflows. Signed-off-by: Michael Niedermayer <michael@niedermayer.cc> 2015-10-12 19:37:49 +02:00			`}`

			`if (avctx->bits_per_raw_sample == 12 &&`
			`(avctx->idct_algo == FF_IDCT_AUTO \|\|`
			`avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {`
			`if (EXTERNAL_SSE2(cpu_flags)) {`
			`c->idct_put = ff_simple_idct12_put_sse2;`
			`c->idct_add = NULL;`
			`c->idct = ff_simple_idct12_sse2;`
			`c->perm_type = FF_IDCT_PERM_TRANSPOSE;`
			`}`
			`if (EXTERNAL_AVX(cpu_flags)) {`
			`c->idct_put = ff_simple_idct12_put_avx;`
			`c->idct_add = NULL;`
			`c->idct = ff_simple_idct12_avx;`
			`c->perm_type = FF_IDCT_PERM_TRANSPOSE;`
			`}`
			`}`
x86: simple_idct(_put): 10bits versions Modeled from the prores version. Clips to [0;1023] and is bitexact. Bitexactness requires to add offsets in different places compared to prores or C, and makes the function approximately 2% slower. For 16 frames of a DNxHD 4:2:2 10bits test sequence: C: 60861 decicycles in idct, 1048205 runs, 371 skips sse2: 27567 decicycles in idct, 1048216 runs, 360 skips avx: 26272 decicycles in idct, 1048171 runs, 405 skips The add version is not implemented, so the corresponding dsp function is set to NULL to make it clear in a code executing it. Signed-off-by: Michael Niedermayer <michael@niedermayer.cc> 2015-10-12 19:37:47 +02:00			`}`
dsputil: Split off IDCT bits into their own context 2014-01-24 13:55:16 +03:00			`}`