mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-12 19:18:44 +02:00
3c9546dfaf
This work is sponsored by, and copyright, Google. These are ported from the ARM version; thanks to the larger amount of registers available, we can do the 16x16 and 32x32 transforms in slices 8 pixels wide instead of 4. This gives a speedup of around 1.4x compared to the 32 bit version. The fact that aarch64 doesn't have the same d/q register aliasing makes some of the macros quite a bit simpler as well. Examples of runtimes vs the 32 bit version, on a Cortex A53: ARM AArch64 vp9_inv_adst_adst_4x4_add_neon: 90.0 87.7 vp9_inv_adst_adst_8x8_add_neon: 400.0 354.7 vp9_inv_adst_adst_16x16_add_neon: 2526.5 1827.2 vp9_inv_dct_dct_4x4_add_neon: 74.0 72.7 vp9_inv_dct_dct_8x8_add_neon: 271.0 256.7 vp9_inv_dct_dct_16x16_add_neon: 1960.7 1372.7 vp9_inv_dct_dct_32x32_add_neon: 11988.9 8088.3 vp9_inv_wht_wht_4x4_add_neon: 63.0 57.7 The speedup vs C code (2-4x) is smaller than in the 32 bit case, mostly because the C code ends up significantly faster (around 1.6x faster, with GCC 5.4) when built for aarch64. Examples of runtimes vs C on a Cortex A57 (for a slightly older version of the patch): A57 gcc-5.3 neon vp9_inv_adst_adst_4x4_add_neon: 152.2 60.0 vp9_inv_adst_adst_8x8_add_neon: 948.2 288.0 vp9_inv_adst_adst_16x16_add_neon: 4830.4 1380.5 vp9_inv_dct_dct_4x4_add_neon: 153.0 58.6 vp9_inv_dct_dct_8x8_add_neon: 789.2 180.2 vp9_inv_dct_dct_16x16_add_neon: 3639.6 917.1 vp9_inv_dct_dct_32x32_add_neon: 20462.1 4985.0 vp9_inv_wht_wht_4x4_add_neon: 91.0 49.8 The asm is around factor 3-4 faster than C on the cortex-a57 and the asm is around 30-50% faster on the a57 compared to the a53. Signed-off-by: Martin Storsjö <martin@martin.st>
49 lines
2.5 KiB
Makefile
49 lines
2.5 KiB
Makefile
# subsystems
|
|
OBJS-$(CONFIG_FFT) += aarch64/fft_init_aarch64.o
|
|
OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_init.o
|
|
OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o
|
|
OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o
|
|
OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
|
|
OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
|
|
OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
|
|
OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_init.o
|
|
OBJS-$(CONFIG_MDCT) += aarch64/mdct_init.o
|
|
OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
|
|
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
|
|
OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o
|
|
|
|
# decoders/encoders
|
|
OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_init.o
|
|
OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o
|
|
OBJS-$(CONFIG_VC1_DECODER) += aarch64/vc1dsp_init_aarch64.o
|
|
OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o
|
|
OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9dsp_init_aarch64.o
|
|
|
|
# ARMv8 optimizations
|
|
|
|
# subsystems
|
|
ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o
|
|
|
|
# NEON optimizations
|
|
|
|
# subsystems
|
|
NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o
|
|
NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o
|
|
NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o
|
|
NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \
|
|
aarch64/h264idct_neon.o
|
|
NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o
|
|
NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
|
|
aarch64/hpeldsp_neon.o
|
|
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
|
|
NEON-OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_neon.o
|
|
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
|
|
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
|
|
|
|
# decoders/encoders
|
|
NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o \
|
|
aarch64/synth_filter_neon.o
|
|
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o
|
|
NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_neon.o \
|
|
aarch64/vp9mc_neon.o
|