mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-13 21:28:01 +02:00
aarch64: Add NEON optimizations for 10 and 12 bit vp9 itxfm
This work is sponsored by, and copyright, Google.

Compared to the arm version, on aarch64 we can keep the full 8x8
transform in registers, and for 16x16 and 32x32, we can process
it in slices of 4 pixels instead of 2.

Examples of runtimes vs the 32 bit version, on a Cortex A53:

                                                  ARM   AArch64
vp9_inv_adst_adst_4x4_sub4_add_10_neon:         111.0     109.7
vp9_inv_adst_adst_8x8_sub8_add_10_neon:         914.0     733.5
vp9_inv_adst_adst_16x16_sub16_add_10_neon:     5184.0    3745.7
vp9_inv_dct_dct_4x4_sub1_add_10_neon:            65.0      65.7
vp9_inv_dct_dct_4x4_sub4_add_10_neon:           100.0      96.7
vp9_inv_dct_dct_8x8_sub1_add_10_neon:           111.0     119.7
vp9_inv_dct_dct_8x8_sub8_add_10_neon:           618.0     494.7
vp9_inv_dct_dct_16x16_sub1_add_10_neon:         295.1     284.6
vp9_inv_dct_dct_16x16_sub2_add_10_neon:        2303.2    1883.9
vp9_inv_dct_dct_16x16_sub8_add_10_neon:        2984.8    2189.3
vp9_inv_dct_dct_16x16_sub16_add_10_neon:       3890.0    2799.4
vp9_inv_dct_dct_32x32_sub1_add_10_neon:        1044.4    1012.7
vp9_inv_dct_dct_32x32_sub2_add_10_neon:       13333.7    9695.1
vp9_inv_dct_dct_32x32_sub16_add_10_neon:      18531.3   12459.8
vp9_inv_dct_dct_32x32_sub32_add_10_neon:      24470.7   16160.2
vp9_inv_wht_wht_4x4_sub4_add_10_neon:            83.0      79.7

The larger transforms are significantly faster than the corresponding
ARM versions.

The speedup vs C code is smaller than in 32 bit mode, probably
because the 64 bit intermediates in the C code can be expressed
more efficiently in aarch64.

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
638eceed47
commit
ceb36b8178
@ -42,7 +42,8 @@ NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
|
||||
# decoders/encoders
|
||||
NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_neon.o
|
||||
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o
|
||||
# VP9 NEON objects. This is the post-change form of the hunk: the old
# first line (starting the list with vp9itxfm_neon.o) is replaced by a
# list that starts with vp9itxfm_16bpp_neon.o and keeps vp9itxfm_neon.o
# as its second entry, so each object appears exactly once.
NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
                                           aarch64/vp9itxfm_neon.o             \
                                           aarch64/vp9lpf_neon.o               \
                                           aarch64/vp9mc_16bpp_neon.o          \
                                           aarch64/vp9mc_neon.o
|
||||
|
@ -157,7 +157,54 @@ static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Prototype generators for the NEON inverse-transform entry points.
 *
 * define_itxfm2() token-pastes its arguments into a declaration named
 *     ff_vp9_<type_a>_<type_b>_<sz>x<sz>_add_<bpp>_neon
 * taking (uint8_t *_dst, ptrdiff_t stride, int16_t *_block, int eob)
 * and returning void.
 *
 * define_itxfm() forwards to define_itxfm2(); the extra level lets an
 * argument that is itself a macro (e.g. BPP) be expanded before it is
 * pasted with ##.
 *
 * None of these macros ends in a semicolon; the caller supplies it.
 */
#define define_itxfm2(type_a, type_b, sz, bpp)                                     \
void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst,    \
                                                                 ptrdiff_t stride, \
                                                                 int16_t *_block, int eob)
#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp)

/*
 * Declares the four idct/iadst combinations for one square block size:
 * idct_idct, iadst_idct, idct_iadst and iadst_iadst.
 */
#define define_itxfm_funcs(sz, bpp)      \
    define_itxfm(idct,  idct,  sz, bpp); \
    define_itxfm(iadst, idct,  sz, bpp); \
    define_itxfm(idct,  iadst, sz, bpp); \
    define_itxfm(iadst, iadst, sz, bpp)
|
||||
|
||||
define_itxfm_funcs(4, BPP);
|
||||
define_itxfm_funcs(8, BPP);
|
||||
define_itxfm_funcs(16, BPP);
|
||||
define_itxfm(idct, idct, 32, BPP);
|
||||
define_itxfm(iwht, iwht, 4, BPP);
|
||||
|
||||
|
||||
/*
 * Install the NEON inverse-transform functions into dsp->itxfm_add[][].
 *
 * The table is only written when have_neon() accepts the flags returned
 * by av_get_cpu_flags(); otherwise dsp is left untouched.
 */
static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
/*
 * init_itxfm(tx, sz, bpp): fill the four transform-type slots of row tx
 * with the matching ff_vp9_*_<sz>_add_<bpp>_neon functions. Note the
 * pairing: DCT_ADST takes iadst_idct and ADST_DCT takes idct_iadst.
 * The init_itxfm -> init_itxfm2 indirection expands macro arguments
 * (e.g. BPP) before ## pasting.
 */
#define init_itxfm2(tx, sz, bpp)                                               \
    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_##bpp##_neon;  \
    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \
    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \
    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon
#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp)

/*
 * init_idct(tx, nm, bpp): point all four transform-type slots of row tx
 * at the single function ff_vp9_<nm>_add_<bpp>_neon.
 */
#define init_idct2(tx, nm, bpp)     \
    dsp->itxfm_add[tx][DCT_DCT]   = \
    dsp->itxfm_add[tx][ADST_DCT]  = \
    dsp->itxfm_add[tx][DCT_ADST]  = \
    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon
#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp)

        init_itxfm(TX_4X4,   4x4,   BPP);
        init_itxfm(TX_8X8,   8x8,   BPP);
        init_itxfm(TX_16X16, 16x16, BPP);
        init_idct(TX_32X32, idct_idct_32x32, BPP);
        /* NOTE(review): row index 4 is a bare literal here; presumably the
         * slot used for the 4x4 WHT (lossless) — confirm against the
         * VP9DSPContext definition. */
        init_idct(4,        iwht_iwht_4x4,   BPP);
    }
}
|
||||
|
||||
/*
 * Public init entry point: runs the motion-compensation init followed by
 * the inverse-transform init on the same context.
 *
 * NOTE(review): INIT_FUNC is not defined in the visible part of this file;
 * presumably a macro that supplies the exported function name — confirm.
 */
av_cold void INIT_FUNC(VP9DSPContext *dsp)
{
    vp9dsp_mc_init_aarch64(dsp);
    vp9dsp_itxfm_init_aarch64(dsp);
}
|
||||
|
1517
libavcodec/aarch64/vp9itxfm_16bpp_neon.S
Normal file
1517
libavcodec/aarch64/vp9itxfm_16bpp_neon.S
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user