mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
638eceed47
This work is sponsored by, and copyright, Google.

This has mostly got the same differences to the 8 bit version as in the arm version. For the horizontal filters, we do 16 pixels in parallel as well. For the 8 pixel wide vertical filters, we can accumulate 4 rows before storing, just as in the 8 bit version.

Examples of runtimes vs the 32 bit version, on a Cortex A53:

                                             ARM  AArch64
vp9_avg4_10bpp_neon:                        35.7     30.7
vp9_avg8_10bpp_neon:                        93.5     84.7
vp9_avg16_10bpp_neon:                      324.4    296.6
vp9_avg32_10bpp_neon:                     1236.5   1148.2
vp9_avg64_10bpp_neon:                     4639.6   4571.1
vp9_avg_8tap_smooth_4h_10bpp_neon:         130.0    128.0
vp9_avg_8tap_smooth_4hv_10bpp_neon:        440.0    440.5
vp9_avg_8tap_smooth_4v_10bpp_neon:         114.0    105.5
vp9_avg_8tap_smooth_8h_10bpp_neon:         327.0    314.0
vp9_avg_8tap_smooth_8hv_10bpp_neon:        918.7    865.4
vp9_avg_8tap_smooth_8v_10bpp_neon:         330.0    300.2
vp9_avg_8tap_smooth_16h_10bpp_neon:       1187.5   1155.5
vp9_avg_8tap_smooth_16hv_10bpp_neon:      2663.1   2591.0
vp9_avg_8tap_smooth_16v_10bpp_neon:       1107.4   1078.3
vp9_avg_8tap_smooth_64h_10bpp_neon:      17754.6  17454.7
vp9_avg_8tap_smooth_64hv_10bpp_neon:     33285.2  33001.5
vp9_avg_8tap_smooth_64v_10bpp_neon:      16066.9  16048.6
vp9_put4_10bpp_neon:                        25.5     21.7
vp9_put8_10bpp_neon:                        56.0     52.0
vp9_put16_10bpp_neon/armv8:                183.0    163.1
vp9_put32_10bpp_neon/armv8:                678.6    563.1
vp9_put64_10bpp_neon/armv8:               2679.9   2195.8
vp9_put_8tap_smooth_4h_10bpp_neon:         120.0    118.0
vp9_put_8tap_smooth_4hv_10bpp_neon:        435.2    435.0
vp9_put_8tap_smooth_4v_10bpp_neon:         107.0     98.2
vp9_put_8tap_smooth_8h_10bpp_neon:         303.0    290.0
vp9_put_8tap_smooth_8hv_10bpp_neon:        893.7    828.7
vp9_put_8tap_smooth_8v_10bpp_neon:         305.5    263.5
vp9_put_8tap_smooth_16h_10bpp_neon:       1089.1   1059.2
vp9_put_8tap_smooth_16hv_10bpp_neon:      2578.8   2452.4
vp9_put_8tap_smooth_16v_10bpp_neon:       1009.5    933.5
vp9_put_8tap_smooth_64h_10bpp_neon:      16223.4  15918.6
vp9_put_8tap_smooth_64hv_10bpp_neon:     32153.0  31016.2
vp9_put_8tap_smooth_64v_10bpp_neon:      14516.5  13748.1

These are generally about as fast as the corresponding ARM routines on the same CPU (at least on the A53), in most cases marginally faster.

The speedup vs C code is around 4-9x.

Signed-off-by: Martin Storsjö <martin@martin.st> |
||
---|---|---|
.. | ||
asm-offsets.h | ||
cabac.h | ||
fft_init_aarch64.c | ||
fft_neon.S | ||
fmtconvert_init.c | ||
fmtconvert_neon.S | ||
h264chroma_init_aarch64.c | ||
h264cmc_neon.S | ||
h264dsp_init_aarch64.c | ||
h264dsp_neon.S | ||
h264idct_neon.S | ||
h264pred_init.c | ||
h264pred_neon.S | ||
h264qpel_init_aarch64.c | ||
h264qpel_neon.S | ||
hpeldsp_init_aarch64.c | ||
hpeldsp_neon.S | ||
Makefile | ||
mdct_neon.S | ||
mpegaudiodsp_init.c | ||
mpegaudiodsp_neon.S | ||
neon.S | ||
neontest.c | ||
rv40dsp_init_aarch64.c | ||
synth_filter_init.c | ||
synth_filter_neon.S | ||
vc1dsp_init_aarch64.c | ||
videodsp_init.c | ||
videodsp.S | ||
vorbisdsp_init.c | ||
vorbisdsp_neon.S | ||
vp9dsp_init_10bpp_aarch64.c | ||
vp9dsp_init_12bpp_aarch64.c | ||
vp9dsp_init_16bpp_aarch64_template.c | ||
vp9dsp_init_aarch64.c | ||
vp9dsp_init.h | ||
vp9itxfm_neon.S | ||
vp9lpf_neon.S | ||
vp9mc_16bpp_neon.S | ||
vp9mc_neon.S |