You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	aarch64/opusdsp: implement NEON accelerated postfilter and deemphasis
153372 UNITS in postfilter_c,   65536 runs,      0 skips
73164 UNITS in postfilter_neon,   65536 runs,      0 skips -> 2.1x speedup
80591 UNITS in deemphasis_c,  131072 runs,      0 skips
43969 UNITS in deemphasis_neon,  131072 runs,      0 skips -> 1.83x speedup
Total decoder speedup: ~15% on a Raspberry Pi 3 (from 28.1x to 33.5x realtime)
Deemphasis SIMD based on the following unrolling:
const float c1 = CELT_EMPH_COEFF, c2 = c1*c1, c3 = c2*c1, c4 = c3*c1;
float state = coeff;
for (int i = 0; i < len; i += 4) {
    y[0] = x[0] + c1*state;
    y[1] = x[1] + c2*state + c1*x[0];
    y[2] = x[2] + c3*state + c1*x[1] + c2*x[0];
    y[3] = x[3] + c4*state + c1*x[2] + c2*x[1] + c3*x[0];
    state = y[3];
    y += 4;
    x += 4;
}
Unlike the x86 version, duplication is used instead of pslldq so
the structure and tables are different.
			
			
This commit is contained in:
		| @@ -15,6 +15,7 @@ OBJS-$(CONFIG_VP8DSP)                   += aarch64/vp8dsp_init_aarch64.o | ||||
| OBJS-$(CONFIG_AAC_DECODER)              += aarch64/aacpsdsp_init_aarch64.o \ | ||||
|                                            aarch64/sbrdsp_init_aarch64.o | ||||
| OBJS-$(CONFIG_DCA_DECODER)              += aarch64/synth_filter_init.o | ||||
| OBJS-$(CONFIG_OPUS_DECODER)             += aarch64/opusdsp_init.o | ||||
| OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o | ||||
| OBJS-$(CONFIG_VC1DSP)                   += aarch64/vc1dsp_init_aarch64.o | ||||
| OBJS-$(CONFIG_VORBIS_DECODER)           += aarch64/vorbisdsp_init.o | ||||
| @@ -49,6 +50,7 @@ NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o | ||||
| # decoders/encoders | ||||
| NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/aacpsdsp_neon.o | ||||
| NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o | ||||
| NEON-OBJS-$(CONFIG_OPUS_DECODER)        += aarch64/opusdsp_neon.o | ||||
| NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o | ||||
| NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \ | ||||
|                                            aarch64/vp9itxfm_neon.o             \ | ||||
|   | ||||
							
								
								
									
										35
									
								
								libavcodec/aarch64/opusdsp_init.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								libavcodec/aarch64/opusdsp_init.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,35 @@ | ||||
| /* | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| #include "config.h" | ||||
|  | ||||
| #include "libavutil/aarch64/cpu.h" | ||||
| #include "libavcodec/opusdsp.h" | ||||
|  | ||||
| void ff_opus_postfilter_neon(float *data, int period, float *gains, int len); | ||||
| float ff_opus_deemphasis_neon(float *out, float *in, float coeff, int len); | ||||
|  | ||||
| av_cold void ff_opus_dsp_init_aarch64(OpusDSP *ctx) | ||||
| { | ||||
|     int cpu_flags = av_get_cpu_flags(); | ||||
|  | ||||
|     if (have_neon(cpu_flags)) { | ||||
|         ctx->postfilter = ff_opus_postfilter_neon; | ||||
|         ctx->deemphasis = ff_opus_deemphasis_neon; | ||||
|     } | ||||
| } | ||||
							
								
								
									
										113
									
								
								libavcodec/aarch64/opusdsp_neon.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										113
									
								
								libavcodec/aarch64/opusdsp_neon.S
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,113 @@ | ||||
| /* | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| #include "libavutil/aarch64/asm.S" | ||||
|  | ||||
// Weight tables for ff_opus_deemphasis_neon.
//
// Every entry is the raw IEEE-754 single-precision bit pattern of a power of
// the emphasis coefficient c (0x3f599a00, roughly 0.85), or zero. Each table
// is one 4-lane vector: lane i holds the weight with which one particular
// input contributes to output sample y[i] of a 4-sample group.

// Weights applied to the carried-in filter state: c^1, c^2, c^3, c^4.
const tab_st, align=4
        .word 0x3f599a00                // c^1
        .word 0x3f38f671                // c^2
        .word 0x3f1d382a                // c^3
        .word 0x3f05a32f                // c^4
endconst

// Weights applied to x[0]: it does not feed y[0] through this table.
const tab_x0, align=4
        .word 0x0
        .word 0x3f599a00                // c^1
        .word 0x3f38f671                // c^2
        .word 0x3f1d382a                // c^3
endconst

// Weights applied to x[1]: only y[2] and y[3] receive a contribution.
const tab_x1, align=4
        .word 0x0
        .word 0x0
        .word 0x3f599a00                // c^1
        .word 0x3f38f671                // c^2
endconst

// Weights applied to x[2]: only y[3] receives a contribution.
const tab_x2, align=4
        .word 0x0
        .word 0x0
        .word 0x0
        .word 0x3f599a00                // c^1
endconst
|  | ||||
// float ff_opus_deemphasis_neon(float *out, float *in, float coeff, int len)
//
// Register use on entry: x0 = out, x1 = in, s0 = coeff (filter state),
// w2 = len.
//
// Eight samples are handled per loop iteration as two 4-sample groups. For
// each group the output is
//     y[i] = x[i] + c^(i+1) * state + sum of c^k * earlier x in the group
// with the weights taken from tab_st / tab_x0 / tab_x1 / tab_x2.
//
// The loop is bottom-tested and len is decremented by 8, so one 8-sample
// chunk is always processed; presumably callers pass a positive multiple
// of 8 -- confirm against the call sites.
//
// Returns (in s0) the last output sample written, i.e. the new filter state.
function ff_opus_deemphasis_neon, export=1
        // Load the four weight vectors into v4..v7.
        movrel          x4, tab_st
        ld1             {v4.4s}, [x4]                   // v4 = state weights
        movrel          x4, tab_x0
        ld1             {v5.4s}, [x4]                   // v5 = x[0] weights
        movrel          x4, tab_x1
        ld1             {v6.4s}, [x4]                   // v6 = x[1] weights
        movrel          x4, tab_x2
        ld1             {v7.4s}, [x4]                   // v7 = x[2] weights

        // State contribution to the first group of the first iteration.
        fmul            v0.4s, v4.4s, v0.s[0]

1:      ld1             {v1.4s, v2.4s}, [x1], #32       // v1 = x[0..3], v2 = x[4..7]

        // The two accumulators are interleaved: v0 gathers the first group,
        // v3 the intra-group terms of the second group.
        fmla            v0.4s, v5.4s, v1.s[0]           // first group:  += x[0]
        fmul            v3.4s, v7.4s, v2.s[2]           // second group:  = x[6]

        fmla            v0.4s, v6.4s, v1.s[1]           // first group:  += x[1]
        fmla            v3.4s, v6.4s, v2.s[1]           // second group: += x[5]

        fmla            v0.4s, v7.4s, v1.s[2]           // first group:  += x[2]
        fmla            v3.4s, v5.4s, v2.s[0]           // second group: += x[4]

        fadd            v1.4s, v1.4s, v0.4s             // v1 = y[0..3]
        fadd            v2.4s, v2.4s, v3.4s

        // y[3] is the state carried into the second group.
        fmla            v2.4s, v4.4s, v1.s[3]           // v2 = y[4..7]

        st1             {v1.4s, v2.4s}, [x0], #32
        // y[7] becomes the state for the next iteration's first group.
        fmul            v0.4s, v4.4s, v2.s[3]

        subs            w2, w2, #8
        b.gt            1b

        mov             s0, v2.s[3]                     // return the last output sample

        ret
endfunc
|  | ||||
// void ff_opus_postfilter_neon(float *data, int period, float *gains, int len)
//
// Register use on entry: x0 = data, w1 = period, x2 = gains, w3 = len.
//
// In place, four samples per iteration, computes
//     data[i] += g0 *  data[i - period]
//              + g1 * (data[i - period - 1] + data[i - period + 1])
//              + g2 * (data[i - period - 2] + data[i - period + 2])
// where g0..g2 are gains[0..2].
//
// The loop is bottom-tested and len is decremented by 4, so one 4-sample
// chunk is always processed; presumably callers pass a positive multiple
// of 4 -- confirm against the call sites.
//
// Only v0-v7 and v16 are used as scratch. The accumulator deliberately lives
// in v16 rather than v8: AAPCS64 requires the callee to preserve the low
// 64 bits of v8-v15, and this function does not save or restore anything.
function ff_opus_postfilter_neon, export=1
        // Loads four floats from gains; only lanes 0..2 are used.
        ld1             {v0.4s}, [x2]
        dup             v1.4s, v0.s[1]                  // v1 = g1 in every lane
        dup             v2.4s, v0.s[2]                  // v2 = g2 in every lane
        dup             v0.4s, v0.s[0]                  // v0 = g0 in every lane

        // x1 = &data[-(period + 2)]. Writing w1 zero-extends into x1, so the
        // 64-bit subtract below sees period + 2 as an unsigned value.
        add             w1, w1, #2
        sub             x1, x0, x1, lsl #2

        // Prime the g2 term with the tap at offset -period - 2.
        ld1             {v3.4s}, [x1]
        fmul            v3.4s, v3.4s, v2.4s

        // Each load below starts one float further on than the previous one.
1:      add             x1, x1, #4
        ld1             {v4.4s}, [x1]                   // tap at -period - 1
        add             x1, x1, #4
        ld1             {v5.4s}, [x1]                   // tap at -period
        add             x1, x1, #4
        ld1             {v6.4s}, [x1]                   // tap at -period + 1
        add             x1, x1, #4
        ld1             {v7.4s}, [x1]                   // tap at -period + 2

        fmla            v3.4s, v7.4s, v2.4s             // v3 = g2 * (outer taps)
        fadd            v6.4s, v6.4s, v4.4s             // v6 = sum of inner taps

        ld1             {v16.4s}, [x0]                  // current data[i..i+3]
        fmla            v16.4s, v5.4s, v0.4s            // += g0 * centre tap

        fmul            v6.4s, v6.4s, v1.4s             // v6 = g1 * (inner taps)
        fadd            v6.4s, v6.4s, v3.4s             // v6 = g1 term + g2 term

        fadd            v16.4s, v16.4s, v6.4s
        // x1 has advanced by 16 bytes in total, so this iteration's
        // -period + 2 tap is the next iteration's -period - 2 tap:
        // reuse it to prime the next g2 term.
        fmul            v3.4s, v7.4s, v2.4s

        st1             {v16.4s}, [x0], #16

        subs            w3, w3, #4
        b.gt            1b

        ret
endfunc
| @@ -61,4 +61,7 @@ av_cold void ff_opus_dsp_init(OpusDSP *ctx) | ||||
|  | ||||
|     if (ARCH_X86) | ||||
|         ff_opus_dsp_init_x86(ctx); | ||||
|  | ||||
|     if (ARCH_AARCH64) | ||||
|         ff_opus_dsp_init_aarch64(ctx); | ||||
| } | ||||
|   | ||||
| @@ -31,5 +31,6 @@ typedef struct OpusDSP { | ||||
| void ff_opus_dsp_init(OpusDSP *ctx); | ||||
|  | ||||
| void ff_opus_dsp_init_x86(OpusDSP *ctx); | ||||
| void ff_opus_dsp_init_aarch64(OpusDSP *ctx); | ||||
|  | ||||
| #endif /* AVCODEC_OPUSDSP_H */ | ||||
|   | ||||
		Reference in New Issue
	
	Block a user