mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
aarch64: vp9: Add NEON itxfm routines
This work is sponsored by, and copyright, Google.
These are ported from the ARM version; thanks to the larger
number of registers available, we can do the 16x16 and 32x32
transforms in slices 8 pixels wide instead of 4. This gives
a speedup of around 1.4x compared to the 32 bit version.
The fact that aarch64 doesn't have the same d/q register
aliasing makes some of the macros quite a bit simpler as well.
Examples of runtimes vs the 32 bit version, on a Cortex A53:
ARM AArch64
vp9_inv_adst_adst_4x4_add_neon: 90.0 87.7
vp9_inv_adst_adst_8x8_add_neon: 400.0 354.7
vp9_inv_adst_adst_16x16_add_neon: 2526.5 1827.2
vp9_inv_dct_dct_4x4_add_neon: 74.0 72.7
vp9_inv_dct_dct_8x8_add_neon: 271.0 256.7
vp9_inv_dct_dct_16x16_add_neon: 1960.7 1372.7
vp9_inv_dct_dct_32x32_add_neon: 11988.9 8088.3
vp9_inv_wht_wht_4x4_add_neon: 63.0 57.7
The speedup vs C code (2-4x) is smaller than in the 32 bit case,
mostly because the C code ends up significantly faster (around
1.6x faster, with GCC 5.4) when built for aarch64.
Examples of runtimes vs C on a Cortex A57 (for a slightly older version
of the patch):
A57 gcc-5.3 neon
vp9_inv_adst_adst_4x4_add_neon: 152.2 60.0
vp9_inv_adst_adst_8x8_add_neon: 948.2 288.0
vp9_inv_adst_adst_16x16_add_neon: 4830.4 1380.5
vp9_inv_dct_dct_4x4_add_neon: 153.0 58.6
vp9_inv_dct_dct_8x8_add_neon: 789.2 180.2
vp9_inv_dct_dct_16x16_add_neon: 3639.6 917.1
vp9_inv_dct_dct_32x32_add_neon: 20462.1 4985.0
vp9_inv_wht_wht_4x4_add_neon: 91.0 49.8
The asm is around a factor of 3-4 faster than C on the cortex-a57 and the asm
is around 30-50% faster on the a57 compared to the a53.
This is an adapted cherry-pick from libav commit
3c9546dfaf
.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
parent
1f7801c2bc
commit
f43079e11c
@ -42,4 +42,5 @@ NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
|
||||
# decoders/encoders
|
||||
NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_neon.o
|
||||
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9mc_neon.o
|
||||
NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_neon.o \
|
||||
aarch64/vp9mc_neon.o
|
||||
|
@ -96,7 +96,7 @@ define_8tap_2d_funcs(16)
|
||||
define_8tap_2d_funcs(8)
|
||||
define_8tap_2d_funcs(4)
|
||||
|
||||
av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
|
||||
static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp, int bpp)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
@ -154,3 +154,55 @@ av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
|
||||
init_mc_funcs_dirs(4, 4);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * define_itxfm(type_a, type_b, sz) expands to a prototype for
 * ff_vp9_<type_a>_<type_b>_<sz>x<sz>_add_neon(), taking a destination
 * pointer, a stride, a coefficient block pointer and an eob value.
 * Only declarations are produced here; the definitions are presumably
 * provided by the NEON assembly file added in this commit
 * (aarch64/vp9itxfm_neon.S) -- confirm against that file.
 */
#define define_itxfm(type_a, type_b, sz) \
|
||||
void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst, \
|
||||
ptrdiff_t stride, \
|
||||
int16_t *_block, int eob)
|
||||
|
||||
/*
 * define_itxfm_funcs(sz) declares the four idct/iadst combinations
 * (idct_idct, iadst_idct, idct_iadst, iadst_iadst) for one square size.
 */
#define define_itxfm_funcs(sz) \
|
||||
define_itxfm(idct, idct, sz); \
|
||||
define_itxfm(iadst, idct, sz); \
|
||||
define_itxfm(idct, iadst, sz); \
|
||||
define_itxfm(iadst, iadst, sz)
|
||||
|
||||
/*
 * Prototypes actually declared: all four combinations for 4x4, 8x8 and
 * 16x16, plus a single idct_idct variant for 32x32 and a single
 * iwht_iwht variant for 4x4.
 */
define_itxfm_funcs(4);
|
||||
define_itxfm_funcs(8);
|
||||
define_itxfm_funcs(16);
|
||||
define_itxfm(idct, idct, 32);
|
||||
define_itxfm(iwht, iwht, 4);
|
||||
|
||||
|
||||
/*
 * Install the NEON inverse-transform-and-add routines into
 * dsp->itxfm_add.
 *
 * dsp: context whose itxfm_add[tx][type] table is filled in.
 * bpp: the function returns without touching dsp unless bpp == 8.
 *
 * Nothing is assigned when have_neon(cpu_flags) is false.
 */
static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp, int bpp)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
/* Only the bpp == 8 case is handled by the routines wired up below. */
if (bpp != 8)
|
||||
return;
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
/*
 * init_itxfm(tx, sz): fill all four transform-type slots of
 * itxfm_add[tx] with the matching <a>_<b>_<sz> NEON function.
 * Note the slot/function pairing: DCT_ADST uses iadst_idct and
 * ADST_DCT uses idct_iadst.
 */
#define init_itxfm(tx, sz) \
|
||||
dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_neon; \
|
||||
dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_neon; \
|
||||
dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_neon; \
|
||||
dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon
|
||||
|
||||
/*
 * init_idct(tx, nm): point all four transform-type slots of
 * itxfm_add[tx] at the single function ff_vp9_<nm>_add_neon.
 */
#define init_idct(tx, nm) \
|
||||
dsp->itxfm_add[tx][DCT_DCT] = \
|
||||
dsp->itxfm_add[tx][ADST_DCT] = \
|
||||
dsp->itxfm_add[tx][DCT_ADST] = \
|
||||
dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon
|
||||
|
||||
init_itxfm(TX_4X4, 4x4);
|
||||
init_itxfm(TX_8X8, 8x8);
|
||||
init_itxfm(TX_16X16, 16x16);
|
||||
init_idct(TX_32X32, idct_idct_32x32);
|
||||
/* NOTE(review): index 4 is presumably the lossless (WHT) slot that
 * follows TX_32X32 -- confirm against the VP9DSPContext definition. */
init_idct(4, iwht_iwht_4x4);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * AArch64 entry point for VP9 DSP initialisation: forwards dsp and bpp
 * first to the mc init helper and then to the itxfm init helper.
 */
av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
|
||||
{
|
||||
vp9dsp_mc_init_aarch64(dsp, bpp);
|
||||
vp9dsp_itxfm_init_aarch64(dsp, bpp);
|
||||
}
|
||||
|
1116
libavcodec/aarch64/vp9itxfm_neon.S
Normal file
1116
libavcodec/aarch64/vp9itxfm_neon.S
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user