ARM: NEON optimised synth_filter_float

2.7x faster DCA decoding on Cortex-A8 Originally committed as revision 22828 to svn://svn.ffmpeg.org/ffmpeg/trunk
2025-03-23 04:24:35 +02:00 · 2010-04-10 16:27:56 +00:00 · 2010-04-10 16:27:56 +00:00 · e73d1a5efc
commit e73d1a5efc
parent f462ed1f82
4 changed files with 133 additions and 0 deletions
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@ -33,6 +33,8 @@ NEON-OBJS-$(CONFIG_H264DSP)            += arm/h264dsp_neon.o            \
                                          arm/h264idct_neon.o           \
                                          arm/h264pred_neon.o           \

+NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/synth_filter_neon.o       \
+
 NEON-OBJS-$(CONFIG_VP3_DECODER)        += arm/vp3dsp_neon.o

 OBJS-$(HAVE_NEON)                      += arm/dsputil_init_neon.o       \
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@ -19,6 +19,7 @@
 */

 #include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"

 void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
@ -29,6 +30,12 @@ void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input)

 void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);

+void ff_synth_filter_float_neon(FFTContext *imdct,
+                                float *synth_buf_ptr, int *synth_buf_offset,
+                                float synth_buf2[32], const float window[512],
+                                float out[32], const float in[32],
+                                float scale, float bias);
+
 av_cold void ff_fft_init_arm(FFTContext *s)
 {
    if (HAVE_NEON) {
@ -48,3 +55,11 @@ av_cold void ff_rdft_init_arm(RDFTContext *s)
        s->rdft_calc    = ff_rdft_calc_neon;
 }
 #endif
+
+#if CONFIG_DCA_DECODER
+av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
+{
+    if (HAVE_NEON)
+        s->synth_filter_float = ff_synth_filter_float_neon;
+}
+#endif
--- a/libavcodec/arm/synth_filter_neon.S
+++ b/libavcodec/arm/synth_filter_neon.S
@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+        preserve8
+
+function ff_synth_filter_float_neon, export=1
+        push            {r3-r11,lr}
+
+        ldr             r4,  [r2]               @ synth_buf_offset
+        add             r1,  r1,  r4,  lsl #2   @ synth_buf
+        sub             r12, r4,  #32
+        bfc             r12, #9,  #23
+        bic             r4,  r4,  #63
+        str             r12, [r2]
+
+        ldr             r2,  [sp, #12*4]        @ in
+        mov             r9,  r1                 @ synth_buf
+
+        bl              ff_imdct_half_neon
+        pop             {r3}
+
+        ldr             r5,  [sp, #9*4]         @ window
+        ldr             r2,  [sp, #10*4]        @ out
+        vldr            d0,  [sp, #12*4]        @ scale, bias
+        add             r8,  r9,  #12*4
+
+        mov             lr,  #64*4
+        mov             r1,  #4
+1:
+        add             r10, r9,  #16*4         @ synth_buf
+        add             r11, r8,  #16*4
+        add             r0,  r5,  #16*4         @ window
+        add             r6,  r5,  #32*4
+        add             r7,  r5,  #48*4
+
+        vld1.32         {q10},    [r3,:128]     @ a
+        add             r3,  r3,  #16*4
+        vld1.32         {q1},     [r3,:128]     @ b
+        vmov.f32        q2,  #0.0               @ c
+        vmov.f32        q3,  #0.0               @ d
+
+        mov             r12, #512
+2:
+        vld1.32         {q9},     [r8, :128], lr
+        vrev64.32       q9,  q9
+        vld1.32         {q8},     [r5, :128], lr
+        vmls.f32        d20, d16, d19
+        vld1.32         {q11},    [r0, :128], lr
+        vmls.f32        d21, d17, d18
+        vld1.32         {q12},    [r9, :128], lr
+        vmla.f32        d2,  d22, d24
+        vld1.32         {q8},     [r6, :128], lr
+        vmla.f32        d3,  d23, d25
+        vld1.32         {q9},     [r10,:128], lr
+        vmla.f32        d4,  d16, d18
+        vld1.32         {q12},    [r11,:128], lr
+        vmla.f32        d5,  d17, d19
+        vrev64.32       q12, q12
+        vld1.32         {q11},    [r7, :128], lr
+        vmla.f32        d6,  d22, d25
+        vmla.f32        d7,  d23, d24
+        subs            r12, r12, #64
+        beq             3f
+        cmp             r12, r4
+        bne             2b
+        sub             r8,  r8,  #512*4
+        sub             r9,  r9,  #512*4
+        sub             r10, r10, #512*4
+        sub             r11, r11, #512*4
+        b               2b
+3:
+        vdup.32         q8,  d0[1]
+        vdup.32         q9,  d0[1]
+        vmla.f32        q8,  q10, d0[0]
+        vmla.f32        q9,  q1,  d0[0]
+        vst1.32         {q3},     [r3,:128]
+        sub             r3,  r3,  #16*4
+        vst1.32         {q2},     [r3,:128]
+        vst1.32         {q8},     [r2,:128]
+        add             r2,  r2,  #16*4
+        vst1.32         {q9},     [r2,:128]
+
+        subs            r1,  r1,  #1
+        popeq           {r4-r11,pc}
+
+        cmp             r4,  #0
+        subeq           r8,  r8,  #512*4
+        subeq           r9,  r9,  #512*4
+        sub             r5,  r5,  #512*4
+        sub             r2,  r2,  #12*4         @ out
+        add             r3,  r3,  #4*4          @ synth_buf2
+        add             r5,  r5,  #4*4          @ window
+        add             r9,  r9,  #4*4          @ synth_buf
+        sub             r8,  r8,  #4*4          @ synth_buf
+        b               1b
+endfunc
--- a/libavcodec/synth_filter.h
+++ b/libavcodec/synth_filter.h
@ -32,5 +32,6 @@ typedef struct SynthFilterContext {
 } SynthFilterContext;

 void ff_synth_filter_init(SynthFilterContext *c);
+void ff_synth_filter_init_arm(SynthFilterContext *c);

 #endif /* AVCODEC_SYNTH_FILTER_H */