1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-01-08 13:22:53 +02:00
FFmpeg/libavcodec/arm/Makefile

149 lines
8.0 KiB
Makefile
Raw Normal View History

ARCH_HEADERS = mathops.h
# subsystems
OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \
arm/ac3dsp_arm.o
OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_arm.o
OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_arm.o
OBJS-$(CONFIG_FFT) += arm/fft_init_arm.o \
arm/fft_fixed_init_arm.o
OBJS-$(CONFIG_FLACDSP) += arm/flacdsp_init_arm.o \
arm/flacdsp_arm.o
OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_init_arm.o
OBJS-$(CONFIG_G722DSP) += arm/g722dsp_init_arm.o
OBJS-$(CONFIG_H264CHROMA) += arm/h264chroma_init_arm.o
OBJS-$(CONFIG_H264DSP) += arm/h264dsp_init_arm.o
OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o
OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_arm.o \
arm/hpeldsp_arm.o
OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_arm.o \
arm/idctdsp_arm.o \
arm/jrevdct_arm.o \
arm/simple_idct_arm.o
OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_init_arm.o
OBJS-$(CONFIG_ME_CMP) += arm/me_cmp_init_arm.o
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_init_arm.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_init_arm.o
OBJS-$(CONFIG_RDFT) += arm/rdft_init_arm.o
2015-07-16 17:11:00 +02:00
OBJS-$(CONFIG_RV34DSP) += arm/rv34dsp_init_arm.o
OBJS-$(CONFIG_VC1DSP) += arm/vc1dsp_init_arm.o
OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o
OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o
2015-07-16 17:11:02 +02:00
OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_arm.o
# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
arm/sbrdsp_init_arm.o
OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o
OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o
OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
2015-07-16 17:11:00 +02:00
OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o
OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_init_arm.o
arm: Add NEON optimizations for 10 and 12 bit vp9 MC This work is sponsored by, and copyright, Google. The plain pixel put/copy functions are used from the 8 bit version, for the double size (e.g. put16 uses ff_vp9_copy32_neon), and a new copy128 is added. Compared with the 8 bit version, the filters can no longer use the trick to accumulate in 16 bit with only saturation at the end, but now the accumulators need to be 32 bit. This avoids the need to keep track of which filter index is the largest though, reducing the size of the executable code for these filters. For the horizontal filters, we only do 4 or 8 pixels wide in parallel (while doing two rows at a time), since we don't have enough register space to filter 16 pixels wide. For the vertical filters, we still do 4 and 8 pixels in parallel just as in the 8 bit case, but we need to store the output after every 2 rows instead of after every 4 rows. Examples of relative speedup compared to the C version, from checkasm: Cortex A7 A8 A9 A53 vp9_avg4_10bpp_neon: 2.25 2.44 3.05 2.16 vp9_avg8_10bpp_neon: 3.66 8.48 3.86 3.50 vp9_avg16_10bpp_neon: 3.39 8.26 3.37 2.72 vp9_avg32_10bpp_neon: 4.03 10.20 4.07 3.42 vp9_avg64_10bpp_neon: 4.15 10.01 4.13 3.70 vp9_avg_8tap_smooth_4h_10bpp_neon: 3.38 6.22 3.41 4.75 vp9_avg_8tap_smooth_4hv_10bpp_neon: 3.89 6.39 4.30 5.32 vp9_avg_8tap_smooth_4v_10bpp_neon: 5.32 9.73 6.34 7.31 vp9_avg_8tap_smooth_8h_10bpp_neon: 4.45 9.40 4.68 6.87 vp9_avg_8tap_smooth_8hv_10bpp_neon: 4.64 8.91 5.44 6.47 vp9_avg_8tap_smooth_8v_10bpp_neon: 6.44 13.42 8.68 8.79 vp9_avg_8tap_smooth_64h_10bpp_neon: 4.66 9.02 4.84 7.71 vp9_avg_8tap_smooth_64hv_10bpp_neon: 4.61 9.14 4.92 7.10 vp9_avg_8tap_smooth_64v_10bpp_neon: 6.90 14.13 9.57 10.41 vp9_put4_10bpp_neon: 1.33 1.46 2.09 1.33 vp9_put8_10bpp_neon: 1.57 3.42 1.83 1.84 vp9_put16_10bpp_neon: 1.55 4.78 2.17 1.89 vp9_put32_10bpp_neon: 2.06 5.35 2.14 2.30 vp9_put64_10bpp_neon: 3.00 2.41 1.95 1.66 vp9_put_8tap_smooth_4h_10bpp_neon: 3.19 5.81 3.31 4.63 vp9_put_8tap_smooth_4hv_10bpp_neon: 3.86 6.22 4.32 5.21 vp9_put_8tap_smooth_4v_10bpp_neon: 5.40 9.77 6.08 7.21 vp9_put_8tap_smooth_8h_10bpp_neon: 4.22 8.41 4.46 6.63 vp9_put_8tap_smooth_8hv_10bpp_neon: 4.56 8.51 5.39 6.25 vp9_put_8tap_smooth_8v_10bpp_neon: 6.60 12.43 8.17 8.89 vp9_put_8tap_smooth_64h_10bpp_neon: 4.41 8.59 4.54 7.49 vp9_put_8tap_smooth_64hv_10bpp_neon: 4.43 8.58 5.34 6.63 vp9_put_8tap_smooth_64v_10bpp_neon: 7.26 13.92 9.27 10.92 For the larger 8tap filters, the speedup vs C code is around 4-14x. Signed-off-by: Martin Storsjö <martin@martin.st>
2016-12-08 23:35:31 +02:00
OBJS-$(CONFIG_VP9_DECODER) += arm/vp9dsp_init_10bpp_arm.o \
arm/vp9dsp_init_12bpp_arm.o \
arm/vp9dsp_init_arm.o
# ARMv5 optimizations
# subsystems
ARMV5TE-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_armv5te.o \
arm/simple_idct_armv5te.o
ARMV5TE-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_armv5te.o \
arm/mpegvideo_armv5te_s.o
ARMV5TE-OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_armv5te.o \
arm/videodsp_armv5te.o
# decoders/encoders
ARMV5TE-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv5te.o
# ARMv6 optimizations
# subsystems
ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o
ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \
arm/hpeldsp_armv6.o
ARMV6-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_armv6.o \
arm/idctdsp_armv6.o \
arm/simple_idct_armv6.o
ARMV6-OBJS-$(CONFIG_ME_CMP) += arm/me_cmp_armv6.o
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_armv6.o
ARMV6-OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_armv6.o
ARMV6-OBJS-$(CONFIG_STARTCODE) += arm/startcode_armv6.o
2015-07-16 17:11:02 +02:00
ARMV6-OBJS-$(CONFIG_VP8DSP) += arm/vp8_armv6.o \
arm/vp8dsp_init_armv6.o \
arm/vp8dsp_armv6.o
# decoders/encoders
ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o
# VFP optimizations
# subsystems
VFP-OBJS-$(CONFIG_FFT) += arm/fft_vfp.o
VFP-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_vfp.o
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
# decoders/encoders
VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o
# NEON optimizations
# subsystems
NEON-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_neon.o
NEON-OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_neon.o \
arm/audiodsp_neon.o \
arm/int_neon.o
NEON-OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_neon.o \
arm/blockdsp_neon.o
NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \
arm/fft_fixed_neon.o
NEON-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_neon.o
2015-07-16 17:11:04 +02:00
NEON-OBJS-$(CONFIG_G722DSP) += arm/g722dsp_neon.o
2013-01-19 05:34:47 +03:00
NEON-OBJS-$(CONFIG_H264CHROMA) += arm/h264cmc_neon.o
NEON-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_neon.o \
arm/h264idct_neon.o
NEON-OBJS-$(CONFIG_H264PRED) += arm/h264pred_neon.o
NEON-OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_neon.o \
arm/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_neon.o \
arm/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_neon.o \
arm/idctdsp_neon.o \
arm/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += arm/mdct_neon.o \
arm/mdct_fixed_neon.o
NEON-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_neon.o
NEON-OBJS-$(CONFIG_RDFT) += arm/rdft_neon.o
NEON-OBJS-$(CONFIG_VC1DSP) += arm/vc1dsp_init_neon.o \
arm/vc1dsp_neon.o
NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o
2015-07-16 17:11:02 +02:00
NEON-OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_neon.o \
arm/vp8dsp_neon.o
# decoders/encoders
NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
arm/sbrdsp_neon.o
NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
arm/hevcdsp_deblock_neon.o \
arm/hevcdsp_idct_neon.o \
arm/hevcdsp_qpel_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
arm/rv40dsp_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_neon.o
arm: vp9: Add NEON itxfm routines This work is sponsored by, and copyright, Google. For the transforms up to 8x8, we can fit all the data (including temporaries) in registers and just do a straightforward transform of all the data. For 16x16, we do a transform of 4x16 pixels in 4 slices, using a temporary buffer. For 32x32, we transform 4x32 pixels at a time, in two steps of 4x16 pixels each. Examples of relative speedup compared to the C version, from checkasm: Cortex A7 A8 A9 A53 vp9_inv_adst_adst_4x4_add_neon: 3.39 5.83 4.17 4.01 vp9_inv_adst_adst_8x8_add_neon: 3.79 4.86 4.23 3.98 vp9_inv_adst_adst_16x16_add_neon: 3.33 4.36 4.11 4.16 vp9_inv_dct_dct_4x4_add_neon: 4.06 6.16 4.59 4.46 vp9_inv_dct_dct_8x8_add_neon: 4.61 6.01 4.98 4.86 vp9_inv_dct_dct_16x16_add_neon: 3.35 3.44 3.36 3.79 vp9_inv_dct_dct_32x32_add_neon: 3.89 3.50 3.79 4.42 vp9_inv_wht_wht_4x4_add_neon: 3.22 5.13 3.53 3.77 Thus, the speedup vs C code is around 3-6x. This is mostly marginally faster than the corresponding routines in libvpx on most cores, tested with their 32x32 idct (compared to vpx_idct32x32_1024_add_neon). These numbers are slightly in libvpx's favour since their version doesn't clear the input buffer like ours do (although the effect of that on the total runtime probably is negligible.) Cortex A7 A8 A9 A53 vp9_inv_dct_dct_32x32_add_neon: 18436.8 16874.1 14235.1 11988.9 libvpx vpx_idct32x32_1024_add_neon 20789.0 13344.3 15049.9 13030.5 Only on the Cortex A8, the libvpx function is faster. On the other cores, ours is slightly faster even though ours has got source block clearing integrated. This is an adapted cherry-pick from libav commits a67ae67083151f2f9595a1f2d17b601da19b939e and 52d196fb30fb6628921b5f1b31e7bd11eb7e1d9a. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
2016-11-14 12:32:22 +02:00
NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9itxfm_neon.o \
arm: vp9: Add NEON loop filters This work is sponsored by, and copyright, Google. The implementation tries to have smart handling of cases where no pixels need the full filtering for the 8/16 width filters, skipping both calculation and writeback of the unmodified pixels in those cases. The actual effect of this is hard to test with checkasm though, since it tests the full filtering, and the benefit depends on how many filtered blocks use the shortcut. Examples of relative speedup compared to the C version, from checkasm: Cortex A7 A8 A9 A53 vp9_loop_filter_h_4_8_neon: 2.72 2.68 1.78 3.15 vp9_loop_filter_h_8_8_neon: 2.36 2.38 1.70 2.91 vp9_loop_filter_h_16_8_neon: 1.80 1.89 1.45 2.01 vp9_loop_filter_h_16_16_neon: 2.81 2.78 2.18 3.16 vp9_loop_filter_mix2_h_44_16_neon: 2.65 2.67 1.93 3.05 vp9_loop_filter_mix2_h_48_16_neon: 2.46 2.38 1.81 2.85 vp9_loop_filter_mix2_h_84_16_neon: 2.50 2.41 1.73 2.85 vp9_loop_filter_mix2_h_88_16_neon: 2.77 2.66 1.96 3.23 vp9_loop_filter_mix2_v_44_16_neon: 4.28 4.46 3.22 5.70 vp9_loop_filter_mix2_v_48_16_neon: 3.92 4.00 3.03 5.19 vp9_loop_filter_mix2_v_84_16_neon: 3.97 4.31 2.98 5.33 vp9_loop_filter_mix2_v_88_16_neon: 3.91 4.19 3.06 5.18 vp9_loop_filter_v_4_8_neon: 4.53 4.47 3.31 6.05 vp9_loop_filter_v_8_8_neon: 3.58 3.99 2.92 5.17 vp9_loop_filter_v_16_8_neon: 3.40 3.50 2.81 4.68 vp9_loop_filter_v_16_16_neon: 4.66 4.41 3.74 6.02 The speedup vs C code is around 2-6x. The numbers are quite inconclusive though, since the checkasm test runs multiple filterings on top of each other, so later rounds might end up with different codepaths (different decisions on which filter to apply, based on input pixel differences). Disabling the early-exit in the asm doesn't give a fair comparison either though, since the C code only does the necessary calcuations for each row. Based on START_TIMER/STOP_TIMER wrapping around a few individual functions, the speedup vs C code is around 4-9x. This is pretty similar in runtime to the corresponding routines in libvpx. (This is comparing vpx_lpf_vertical_16_neon, vpx_lpf_horizontal_edge_8_neon and vpx_lpf_horizontal_edge_16_neon to vp9_loop_filter_h_16_8_neon, vp9_loop_filter_v_16_8_neon and vp9_loop_filter_v_16_16_neon - note that the naming of horizonal and vertical is flipped between the libraries.) In order to have stable, comparable numbers, the early exits in both asm versions were disabled, forcing the full filtering codepath. Cortex A7 A8 A9 A53 vp9_loop_filter_h_16_8_neon: 597.2 472.0 482.4 415.0 libvpx vpx_lpf_vertical_16_neon: 626.0 464.5 470.7 445.0 vp9_loop_filter_v_16_8_neon: 500.2 422.5 429.7 295.0 libvpx vpx_lpf_horizontal_edge_8_neon: 586.5 414.5 415.6 383.2 vp9_loop_filter_v_16_16_neon: 905.0 784.7 791.5 546.0 libvpx vpx_lpf_horizontal_edge_16_neon: 1060.2 751.7 743.5 685.2 Our version is consistently faster on on A7 and A53, marginally slower on A8, and sometimes faster, sometimes slower on A9 (marginally slower in all three tests in this particular test run). This is an adapted cherry-pick from libav commit dd299a2d6d4d1af9528ed35a8131c35946be5973. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
2016-11-14 12:32:23 +02:00
arm/vp9lpf_neon.o \
arm: Add NEON optimizations for 10 and 12 bit vp9 MC This work is sponsored by, and copyright, Google. The plain pixel put/copy functions are used from the 8 bit version, for the double size (e.g. put16 uses ff_vp9_copy32_neon), and a new copy128 is added. Compared with the 8 bit version, the filters can no longer use the trick to accumulate in 16 bit with only saturation at the end, but now the accumulators need to be 32 bit. This avoids the need to keep track of which filter index is the largest though, reducing the size of the executable code for these filters. For the horizontal filters, we only do 4 or 8 pixels wide in parallel (while doing two rows at a time), since we don't have enough register space to filter 16 pixels wide. For the vertical filters, we still do 4 and 8 pixels in parallel just as in the 8 bit case, but we need to store the output after every 2 rows instead of after every 4 rows. Examples of relative speedup compared to the C version, from checkasm: Cortex A7 A8 A9 A53 vp9_avg4_10bpp_neon: 2.25 2.44 3.05 2.16 vp9_avg8_10bpp_neon: 3.66 8.48 3.86 3.50 vp9_avg16_10bpp_neon: 3.39 8.26 3.37 2.72 vp9_avg32_10bpp_neon: 4.03 10.20 4.07 3.42 vp9_avg64_10bpp_neon: 4.15 10.01 4.13 3.70 vp9_avg_8tap_smooth_4h_10bpp_neon: 3.38 6.22 3.41 4.75 vp9_avg_8tap_smooth_4hv_10bpp_neon: 3.89 6.39 4.30 5.32 vp9_avg_8tap_smooth_4v_10bpp_neon: 5.32 9.73 6.34 7.31 vp9_avg_8tap_smooth_8h_10bpp_neon: 4.45 9.40 4.68 6.87 vp9_avg_8tap_smooth_8hv_10bpp_neon: 4.64 8.91 5.44 6.47 vp9_avg_8tap_smooth_8v_10bpp_neon: 6.44 13.42 8.68 8.79 vp9_avg_8tap_smooth_64h_10bpp_neon: 4.66 9.02 4.84 7.71 vp9_avg_8tap_smooth_64hv_10bpp_neon: 4.61 9.14 4.92 7.10 vp9_avg_8tap_smooth_64v_10bpp_neon: 6.90 14.13 9.57 10.41 vp9_put4_10bpp_neon: 1.33 1.46 2.09 1.33 vp9_put8_10bpp_neon: 1.57 3.42 1.83 1.84 vp9_put16_10bpp_neon: 1.55 4.78 2.17 1.89 vp9_put32_10bpp_neon: 2.06 5.35 2.14 2.30 vp9_put64_10bpp_neon: 3.00 2.41 1.95 1.66 vp9_put_8tap_smooth_4h_10bpp_neon: 3.19 5.81 3.31 4.63 vp9_put_8tap_smooth_4hv_10bpp_neon: 3.86 6.22 4.32 5.21 vp9_put_8tap_smooth_4v_10bpp_neon: 5.40 9.77 6.08 7.21 vp9_put_8tap_smooth_8h_10bpp_neon: 4.22 8.41 4.46 6.63 vp9_put_8tap_smooth_8hv_10bpp_neon: 4.56 8.51 5.39 6.25 vp9_put_8tap_smooth_8v_10bpp_neon: 6.60 12.43 8.17 8.89 vp9_put_8tap_smooth_64h_10bpp_neon: 4.41 8.59 4.54 7.49 vp9_put_8tap_smooth_64hv_10bpp_neon: 4.43 8.58 5.34 6.63 vp9_put_8tap_smooth_64v_10bpp_neon: 7.26 13.92 9.27 10.92 For the larger 8tap filters, the speedup vs C code is around 4-14x. Signed-off-by: Martin Storsjö <martin@martin.st>
2016-12-08 23:35:31 +02:00
arm/vp9mc_16bpp_neon.o \
arm: vp9: Add NEON itxfm routines This work is sponsored by, and copyright, Google. For the transforms up to 8x8, we can fit all the data (including temporaries) in registers and just do a straightforward transform of all the data. For 16x16, we do a transform of 4x16 pixels in 4 slices, using a temporary buffer. For 32x32, we transform 4x32 pixels at a time, in two steps of 4x16 pixels each. Examples of relative speedup compared to the C version, from checkasm: Cortex A7 A8 A9 A53 vp9_inv_adst_adst_4x4_add_neon: 3.39 5.83 4.17 4.01 vp9_inv_adst_adst_8x8_add_neon: 3.79 4.86 4.23 3.98 vp9_inv_adst_adst_16x16_add_neon: 3.33 4.36 4.11 4.16 vp9_inv_dct_dct_4x4_add_neon: 4.06 6.16 4.59 4.46 vp9_inv_dct_dct_8x8_add_neon: 4.61 6.01 4.98 4.86 vp9_inv_dct_dct_16x16_add_neon: 3.35 3.44 3.36 3.79 vp9_inv_dct_dct_32x32_add_neon: 3.89 3.50 3.79 4.42 vp9_inv_wht_wht_4x4_add_neon: 3.22 5.13 3.53 3.77 Thus, the speedup vs C code is around 3-6x. This is mostly marginally faster than the corresponding routines in libvpx on most cores, tested with their 32x32 idct (compared to vpx_idct32x32_1024_add_neon). These numbers are slightly in libvpx's favour since their version doesn't clear the input buffer like ours do (although the effect of that on the total runtime probably is negligible.) Cortex A7 A8 A9 A53 vp9_inv_dct_dct_32x32_add_neon: 18436.8 16874.1 14235.1 11988.9 libvpx vpx_idct32x32_1024_add_neon 20789.0 13344.3 15049.9 13030.5 Only on the Cortex A8, the libvpx function is faster. On the other cores, ours is slightly faster even though ours has got source block clearing integrated. This is an adapted cherry-pick from libav commits a67ae67083151f2f9595a1f2d17b601da19b939e and 52d196fb30fb6628921b5f1b31e7bd11eb7e1d9a. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
2016-11-14 12:32:22 +02:00
arm/vp9mc_neon.o