1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-11-26 19:01:44 +02:00
FFmpeg/libavcodec/x86
Ronald S. Bultje 726501a34e vp9: add 32x32 idct AVX2 implementation.
About 1.8x speedup compared to AVX version for full IDCT. Other
sub-IDCT scenarios also see speedups. Full --bench output for
idct_32x32_add_{bpp}_${subidct}_${opt} (50k cycles):

nop: 16.5
vp9_inv_dct_dct_32x32_add_8_1_c: 2284.4
vp9_inv_dct_dct_32x32_add_8_1_sse2: 145.0
vp9_inv_dct_dct_32x32_add_8_1_ssse3: 137.4
vp9_inv_dct_dct_32x32_add_8_1_avx: 137.1
vp9_inv_dct_dct_32x32_add_8_1_avx2: 73.2
vp9_inv_dct_dct_32x32_add_8_2_c: 14680.8
vp9_inv_dct_dct_32x32_add_8_2_sse2: 2617.2
vp9_inv_dct_dct_32x32_add_8_2_ssse3: 982.9
vp9_inv_dct_dct_32x32_add_8_2_avx: 958.5
vp9_inv_dct_dct_32x32_add_8_2_avx2: 704.2
vp9_inv_dct_dct_32x32_add_8_4_c: 14443.1
vp9_inv_dct_dct_32x32_add_8_4_sse2: 2717.1
vp9_inv_dct_dct_32x32_add_8_4_ssse3: 965.7
vp9_inv_dct_dct_32x32_add_8_4_avx: 1000.7
vp9_inv_dct_dct_32x32_add_8_4_avx2: 717.1
vp9_inv_dct_dct_32x32_add_8_8_c: 14436.4
vp9_inv_dct_dct_32x32_add_8_8_sse2: 2671.8
vp9_inv_dct_dct_32x32_add_8_8_ssse3: 1038.5
vp9_inv_dct_dct_32x32_add_8_8_avx: 983.0
vp9_inv_dct_dct_32x32_add_8_8_avx2: 729.4
vp9_inv_dct_dct_32x32_add_8_16_c: 14614.7
vp9_inv_dct_dct_32x32_add_8_16_sse2: 2701.7
vp9_inv_dct_dct_32x32_add_8_16_ssse3: 1334.4
vp9_inv_dct_dct_32x32_add_8_16_avx: 1276.7
vp9_inv_dct_dct_32x32_add_8_16_avx2: 719.5
vp9_inv_dct_dct_32x32_add_8_32_c: 14363.6
vp9_inv_dct_dct_32x32_add_8_32_sse2: 2575.6
vp9_inv_dct_dct_32x32_add_8_32_ssse3: 2633.9
vp9_inv_dct_dct_32x32_add_8_32_avx: 2539.6
vp9_inv_dct_dct_32x32_add_8_32_avx2: 1395.0
2016-07-26 15:59:07 -04:00
..
aacpsdsp_init.c x86/aacpsdsp: add SSE and SSE3 optimized functions 2015-07-30 19:01:15 -03:00
aacpsdsp.asm x86/aacpsdsp: optimize add_squares loop 2016-06-14 12:41:23 -03:00
ac3dsp_init.c Merge commit '4f22b138886e29f7fffa8c715673951e51be9f32' 2016-01-27 18:23:31 +00:00
ac3dsp.asm
alacdsp_init.c x86/alacdsp: add simd optimized functions 2015-10-06 20:22:00 -03:00
alacdsp.asm x86/alacdsp: add simd optimized functions 2015-10-06 20:22:00 -03:00
audiodsp_init.c Merge commit 'dc40a70c5755bccfb1a1349639943e1f408bea50' 2016-06-26 15:53:00 +02:00
audiodsp.asm x86inc: Drop SECTION_TEXT macro 2015-08-11 11:12:01 +02:00
blockdsp_init.c blockdsp: reindent after parameter removal 2015-10-03 23:34:56 +02:00
blockdsp.asm x86inc: Drop SECTION_TEXT macro 2015-08-04 20:13:09 +02:00
bswapdsp_init.c
bswapdsp.asm x86inc: Drop SECTION_TEXT macro 2015-08-11 11:12:01 +02:00
cabac.h asm: FF_-prefix internal macros used in inline assembly 2016-06-27 17:21:18 +02:00
cavsdsp.c avcodec/x86/cavsdsp: silence -Wunused-variable on --disable-mmx 2015-09-24 04:27:50 +02:00
constants.c avcodec/v210: add avx2 version of the 10-bit line encoder 2016-01-17 16:03:43 +01:00
constants.h avcodec/v210: add avx2 version of the 10-bit line encoder 2016-01-17 16:03:43 +01:00
dcadsp_init.c x86/dcadec: add ff_lfe_fir1_float_{sse3,avx} 2016-02-22 21:21:34 -03:00
dcadsp.asm x86/dcadsp: optimize lfe_fir0_float_fma3 on x86_32 2016-07-05 17:48:20 -03:00
dct32.asm x86inc: Drop SECTION_TEXT macro 2015-08-11 11:12:01 +02:00
dct_init.c Merge commit 'ebaf571aca2dd6ce3caeeeec4210a3fccd47e7db' 2015-08-02 12:31:39 +02:00
dirac_dwt_init.c dirac_dwt: Make x86 files/functions names consistent 2016-02-05 19:30:23 -08:00
dirac_dwt.asm dirac_dwt: Make x86 files/functions names consistent 2016-02-05 19:30:23 -08:00
diracdsp_init.c x86/diracdsp: make ff_put_signed_rect_clamped_10_sse4 work on x86_32 2016-07-20 13:43:38 -03:00
diracdsp.asm x86/diracdsp: make ff_put_signed_rect_clamped_10_sse4 work on x86_32 2016-07-20 13:43:38 -03:00
dnxhdenc_init.c
dnxhdenc.asm
fdct.c
fdct.h
fdctdsp_init.c
fft_init.c Merge commit '73ff983e8dd22ccee166403d0bbbc9c1cd543622' 2016-04-12 15:42:21 +01:00
fft.asm avcodec: Extend fft to size 2^17 2016-03-04 13:51:42 +01:00
fft.h fft: Split MDCT bits off from FFT 2016-03-01 10:18:28 +01:00
flac_dsp_gpl.asm x86inc: Drop SECTION_TEXT macro 2015-08-04 20:13:09 +02:00
flacdsp_init.c
flacdsp.asm x86: move XOP emulation code back to x86inc 2015-08-03 17:11:13 -03:00
fmtconvert_init.c Merge commit 'dc40a70c5755bccfb1a1349639943e1f408bea50' 2016-06-26 15:53:00 +02:00
fmtconvert.asm avcodec/x86/fmtconvert: Add emms to int32_to_float_fmul_array8_sse() 2016-01-15 17:08:37 +01:00
fpel.asm
fpel.h x86: fpel: Remove erroneous ff_put_pixels8_mmxext prototype 2015-10-19 16:52:37 -07:00
g722dsp_init.c
g722dsp.asm x86inc: Drop SECTION_TEXT macro 2015-08-04 20:13:09 +02:00
h263_loopfilter.asm x86inc: Drop SECTION_TEXT macro 2015-08-11 11:12:01 +02:00
h263dsp_init.c
h264_chromamc_10bit.asm
h264_chromamc.asm Merge commit '41ed7ab45fc693f7d7fc35664c0233f4c32d69bb' 2016-06-21 21:55:34 +02:00
h264_deblock_10bit.asm
h264_deblock.asm avcodec/h264: Fix segfault in 4:2:2 chroma deblock with 32-bit msvc 2016-02-05 22:01:38 +01:00
h264_i386.h asm: FF_-prefix internal macros used in inline assembly 2016-06-27 17:21:18 +02:00
h264_idct_10bit.asm Merge commit 'f1a9eee41c4b5ea35db9ff0088ce4e6f1e187f2c' 2016-07-09 14:52:23 +02:00
h264_idct.asm Merge commit 'f1a9eee41c4b5ea35db9ff0088ce4e6f1e187f2c' 2016-07-09 14:52:23 +02:00
h264_intrapred_10bit.asm vp9: 16bpp tm/dc/h/v intra pred simd (mostly sse2) functions. 2015-10-03 14:42:39 -04:00
h264_intrapred_init.c
h264_intrapred.asm
h264_qpel_8bit.asm
h264_qpel_10bit.asm vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction. 2015-10-03 14:42:39 -04:00
h264_qpel.c x86: fpel: Move prototypes for 4-px block functions 2015-10-19 16:52:33 -07:00
h264_weight_10bit.asm
h264_weight.asm avcodec/x86: add missing colon to labels 2015-07-26 02:50:14 -03:00
h264chroma_init.c
h264dsp_init.c avcodec/h264: mmxext 4:2:2 chroma deblock/loop filter 2016-02-05 17:26:04 +01:00
hevc_deblock.asm
hevc_idct.asm x86inc: Drop SECTION_TEXT macro 2015-08-04 20:13:09 +02:00
hevc_mc.asm hevcdsp: use a macro for .rodata section 2015-12-11 16:19:30 +01:00
hevc_res_add.asm
hevc_sao_10bit.asm x86/hevc_sao: add ff_hevc_sao_edge_filter_{8,16}_{10,12} 2015-12-20 17:01:15 -03:00
hevc_sao.asm x86/hevc_sao: move 10/12bit functions into a separate file 2015-09-30 02:59:55 -03:00
hevcdsp_init.c x86: hevc: Fix linking with both yasm and optimizations disabled 2016-02-23 11:47:54 +01:00
hevcdsp.h
hpeldsp_init.c Merge commit 'dc40a70c5755bccfb1a1349639943e1f408bea50' 2016-06-26 15:53:00 +02:00
hpeldsp_rnd_template.c asm: FF_-prefix internal macros used in inline assembly 2016-06-27 17:21:18 +02:00
hpeldsp.asm x86inc: Drop SECTION_TEXT macro 2015-08-11 11:12:01 +02:00
hpeldsp.h
huffyuvdsp_init.c
huffyuvdsp.asm x86inc: Drop SECTION_TEXT macro 2015-08-11 11:12:01 +02:00
huffyuvencdsp_mmx.c x86: use the new helper macros where useful 2016-02-14 20:00:21 -03:00
huffyuvencdsp.asm huffyuvencdsp: Undefine "i" macro after each use 2016-02-07 09:19:17 -08:00
idctdsp_init.c x86: simple_idct: 12bits versions 2015-10-13 15:34:32 +02:00
idctdsp.asm x86inc: Drop SECTION_TEXT macro 2015-08-04 20:13:09 +02:00
idctdsp.h
imdct36.asm x86/imdct36: use extractps inside the STORE macro 2016-01-28 13:35:15 -03:00
inline_asm.h Merge commit '41ed7ab45fc693f7d7fc35664c0233f4c32d69bb' 2016-06-21 21:55:34 +02:00
jpeg2000dsp_init.c x86: use the new helper macros where useful 2016-02-14 20:00:21 -03:00
jpeg2000dsp.asm avcodec/x86: add missing colon to labels 2015-07-26 02:50:14 -03:00
lossless_audiodsp_init.c x86: lossless audio: SSE4 madd 32bits 2016-05-07 23:28:48 +02:00
lossless_audiodsp.asm x86: lossless audio: SSE4 madd 32bits 2016-05-07 23:28:48 +02:00
lossless_videodsp_init.c Replace all remaining occurances of step/depth_minus1 and offset_plus1 2015-09-08 17:10:48 +02:00
lossless_videodsp.asm x86inc: Drop SECTION_TEXT macro 2015-08-04 20:13:09 +02:00
lpc.c
Makefile build: miscellaneous cosmetics 2016-04-07 15:26:08 +02:00
mathops.h
me_cmp_init.c asm: FF_-prefix internal macros used in inline assembly 2016-06-27 17:21:18 +02:00
me_cmp.asm avcodec/x86: add missing colon to labels 2015-07-26 02:50:14 -03:00
mlpdsp_init.c x86: use the new helper macros where useful 2016-02-14 20:00:21 -03:00
mlpdsp.asm x86inc: Drop SECTION_TEXT macro 2015-08-04 20:13:09 +02:00
mpegaudiodsp.c avcodec/x86/mpegaudiodsp: silence -Wunused-variable on --disable-mmx 2015-09-22 23:45:03 +02:00
mpegvideo.c asm: FF_-prefix internal macros used in inline assembly 2016-06-27 17:21:18 +02:00
mpegvideodsp.c Merge commit 'dc40a70c5755bccfb1a1349639943e1f408bea50' 2016-06-26 15:53:00 +02:00
mpegvideoenc_qns_template.c
mpegvideoenc_template.c asm: FF_-prefix internal macros used in inline assembly 2016-06-27 17:21:18 +02:00
mpegvideoenc.c avcodec/x86/mpegvideoenc: silence -Wunused-function on --disable-mmx 2015-09-19 23:26:57 +02:00
mpegvideoencdsp_init.c Merge commit '7c6eb0a1b7bf1aac7f033a7ec6d8cacc3b5c2615' 2015-07-27 22:10:35 +02:00
mpegvideoencdsp.asm
pixblockdsp_init.c
pixblockdsp.asm pixblockdsp: x86: Condense diff_pixels_* to a shared macro 2015-11-07 14:31:34 -08:00
pngdsp_init.c
pngdsp.asm x86inc: Drop SECTION_TEXT macro 2015-08-11 11:12:01 +02:00
proresdsp_init.c
proresdsp.asm x86inc: Add debug symbols indicating sizes of compiled functions 2016-01-23 20:46:28 +01:00
qpel.asm
qpeldsp_init.c
qpeldsp.asm x86inc: Drop SECTION_TEXT macro 2015-08-11 11:12:01 +02:00
rnd_template.c asm: FF_-prefix internal macros used in inline assembly 2016-06-27 17:21:18 +02:00
rv34dsp_init.c Merge commit 'dc40a70c5755bccfb1a1349639943e1f408bea50' 2016-06-26 15:53:00 +02:00
rv34dsp.asm
rv40dsp_init.c all: fix -Wextra-semi reported on clang 2015-10-24 17:58:17 -04:00
rv40dsp.asm Merge commit '41ed7ab45fc693f7d7fc35664c0233f4c32d69bb' 2016-06-21 21:55:34 +02:00
sbrdsp_init.c
sbrdsp.asm x86/aacdec: use HADDPS macro 2016-06-08 14:18:18 -03:00
simple_idct10_template.asm x86: simple_idct10_template: use const 2015-10-13 22:52:33 +02:00
simple_idct10.asm x86inc: Add debug symbols indicating sizes of compiled functions 2016-01-21 23:19:46 +01:00
simple_idct.c
simple_idct.h x86: simple_idct: 12bits versions 2015-10-13 15:34:32 +02:00
snowdsp.c asm: FF_-prefix internal macros used in inline assembly 2016-06-27 17:21:18 +02:00
svq1enc_init.c
svq1enc.asm x86inc: Drop SECTION_TEXT macro 2015-08-04 20:13:09 +02:00
synth_filter_init.c x86: use the new helper macros where useful 2016-02-14 20:00:21 -03:00
synth_filter.asm avcodec/synth_filter: split off remaining code from dcadec files 2016-01-25 14:57:38 -03:00
takdsp_init.c avcodec/takdec: add x86 SIMD for rest of decorrelation modes 2015-10-09 21:38:15 +02:00
takdsp.asm x86/takdsp: use arithmetic shift instructions 2015-10-09 23:52:39 -03:00
ttadsp_init.c
ttadsp.asm
v210-init.c avcodec/x86/v210-init: fix unused variable warning 2015-08-21 17:06:27 +02:00
v210.asm avcodec/x86: add missing colon to labels 2015-07-26 02:50:14 -03:00
v210enc_init.c Merge commit 'e280fe13291e9c712a5f4aa13b5263f3e8afed45' 2016-02-16 17:23:32 +00:00
v210enc.asm Merge commit 'eafb05fcf37cd19a910ca3b17824384f9006bc0a' 2016-02-16 17:02:56 +00:00
vc1dsp_init.c x86: vc1dsp: Convert vc1_inv_trans_*_dc to NASM format 2016-02-01 17:01:11 -08:00
vc1dsp_loopfilter.asm x86/vc1dsp: Split the file into MC and loopfilter 2016-02-29 08:46:53 -08:00
vc1dsp_mc.asm x86/vc1dsp: Split the file into MC and loopfilter 2016-02-29 08:46:53 -08:00
vc1dsp_mmx.c asm: FF_-prefix internal macros used in inline assembly 2016-06-27 17:21:18 +02:00
vc1dsp.h
videodsp_init.c videodsp: assert that linesize is larger than width 2015-07-08 01:32:04 +02:00
videodsp.asm videodsp: fix 1-byte overread in top/bottom READ_NUM_BYTES iterations. 2016-01-18 11:12:47 -05:00
vorbisdsp_init.c
vorbisdsp.asm
vp3dsp_init.c Merge commit '7c6eb0a1b7bf1aac7f033a7ec6d8cacc3b5c2615' 2015-07-27 22:10:35 +02:00
vp3dsp.asm avcodec/x86: add missing colon to labels 2015-07-26 02:50:14 -03:00
vp6dsp_init.c Merge commit 'dc40a70c5755bccfb1a1349639943e1f408bea50' 2016-06-26 15:53:00 +02:00
vp6dsp.asm
vp8dsp_init.c Merge commit 'dc40a70c5755bccfb1a1349639943e1f408bea50' 2016-06-26 15:53:00 +02:00
vp8dsp_loopfilter.asm
vp8dsp.asm
vp9dsp_init_10bpp.c vp9: add subpel MC SIMD for 10/12bpp. 2015-09-16 21:11:34 -04:00
vp9dsp_init_12bpp.c vp9: add subpel MC SIMD for 10/12bpp. 2015-09-16 21:11:34 -04:00
vp9dsp_init_16bpp_template.c x86: use the new helper macros where useful 2016-02-14 20:00:21 -03:00
vp9dsp_init_16bpp.c x86: use the new helper macros where useful 2016-02-14 20:00:21 -03:00
vp9dsp_init.c vp9: add 32x32 idct AVX2 implementation. 2016-07-26 15:59:07 -04:00
vp9dsp_init.h all: fix -Wextra-semi reported on clang 2015-10-24 17:58:17 -04:00
vp9intrapred_16bpp.asm vp9: don't keep a stack pointer if we don't need it. 2015-10-07 08:55:19 -04:00
vp9intrapred.asm
vp9itxfm_16bpp.asm x86/vp9itxfm: fix register clobbering in ff_vp9_idct_idct_4x4_add_12_sse2 2015-10-13 20:21:33 -03:00
vp9itxfm_template.asm vp9: add x86 simd (sse2/ssse3) for iadst4 10bpp functions. 2015-10-13 11:05:58 -04:00
vp9itxfm.asm vp9: add 32x32 idct AVX2 implementation. 2016-07-26 15:59:07 -04:00
vp9lpf_16bpp.asm vp9: sse2/ssse3/avx 16bpp loopfilter x86 simd. 2015-10-03 14:42:39 -04:00
vp9lpf.asm
vp9mc_16bpp.asm vp9: sse2/ssse3/avx 16bpp loopfilter x86 simd. 2015-10-03 14:42:39 -04:00
vp9mc.asm x86/vp9mc: fix string concatenation of fullpel function names 2015-09-20 12:32:27 -03:00
vp56_arith.h
w64xmmtest.c avcodec: add missing xmm/neon clobber test wrappers for the new decode API 2016-07-03 18:04:30 -03:00
xvididct_init.c
xvididct.asm
xvididct.h