dcadsp: split lfe_fir cases

The x86 code runs short on registers because numerous values are not static (i.e. not compile-time constants). In addition, splitting the two cases into separate functions allows more optimized code, at least on x86.

ARM asm changes by Janne Grunau.

Signed-off-by: Janne Grunau <janne-libav@jannau.net>
2024-12-23 12:43:46 +02:00 · 2014-02-05 23:40:52 +00:00 · 2014-02-05 23:40:52 +00:00 · 5fdbfcb5b7
commit 5fdbfcb5b7
parent 5b59a9fc61
6 changed files with 64 additions and 43 deletions
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -24,16 +24,22 @@
 #include "libavutil/attributes.h"
 #include "libavcodec/dcadsp.h"

-void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
-                        int decifactor, float scale);
+void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs,
+                          float scale);
+void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs,
+                          float scale);
+
+void ff_dca_lfe_fir32_vfp(float *out, const float *in, const float *coefs,
+                          float scale);
+void ff_dca_lfe_fir64_vfp(float *out, const float *in, const float *coefs,
+                          float scale);
+
 void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
                                SynthFilterContext *synth, FFTContext *imdct,
                                float synth_buf_ptr[512],
                                int *synth_buf_offset, float synth_buf2[32],
                                const float window[512], float *samples_out,
                                float raXin[32], float scale);
-void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
-                         int decifactor, float scale);

 void ff_synth_filter_float_vfp(FFTContext *imdct,
                               float *synth_buf_ptr, int *synth_buf_offset,
@@ -52,11 +58,14 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
    int cpu_flags = av_get_cpu_flags();

    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
-        s->lfe_fir = ff_dca_lfe_fir_vfp;
+        s->lfe_fir[0]      = ff_dca_lfe_fir32_vfp;
+        s->lfe_fir[1]      = ff_dca_lfe_fir64_vfp;
        s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
    }
-    if (have_neon(cpu_flags))
-        s->lfe_fir = ff_dca_lfe_fir_neon;
+    if (have_neon(cpu_flags)) {
+        s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
+        s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
+    }
 }

 av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
--- a/libavcodec/arm/dcadsp_neon.S
+++ b/libavcodec/arm/dcadsp_neon.S
@@ -20,17 +20,23 @@

 #include "libavutil/arm/asm.S"

-function ff_dca_lfe_fir_neon, export=1
+function ff_dca_lfe_fir0_neon, export=1
        push            {r4-r6,lr}
+NOVFP   vmov            s0,  r3                 @ scale
+        mov             r3,  #32                @ decifactor
+        mov             r6,  #256/32
+        b               dca_lfe_fir
+endfunc

+function ff_dca_lfe_fir1_neon, export=1
+        push            {r4-r6,lr}
+NOVFP   vmov            s0,  r3                 @ scale
+        mov             r3,  #64                @ decifactor
+        mov             r6,  #256/64
+dca_lfe_fir:
        add             r4,  r0,  r3,  lsl #2   @ out2
        add             r5,  r2,  #256*4-16     @ cf1
        sub             r1,  r1,  #12
-        cmp             r3,  #32
-        ite             eq
-        moveq           r6,  #256/32
-        movne           r6,  #256/64
-NOVFP   vldr            s0,  [sp, #16]          @ scale
        mov             lr,  #-16
 1:
        vmov.f32        q2,  #0.0               @ v0
--- a/libavcodec/arm/dcadsp_vfp.S
+++ b/libavcodec/arm/dcadsp_vfp.S
@@ -24,7 +24,6 @@
 POUT          .req    a1
 PIN           .req    a2
 PCOEF         .req    a3
-DECIFACTOR    .req    a4
 OLDFPSCR      .req    a4
 COUNTER       .req    ip

@@ -129,6 +128,15 @@ POST3         .req    s27
 .endm

 .macro dca_lfe_fir  decifactor
+function ff_dca_lfe_fir\decifactor\()_vfp, export=1
+NOVFP   vmov    s0, r3
+        fmrx    OLDFPSCR, FPSCR
+        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
+        fmxr    FPSCR, ip
+        vldr    IN0, [PIN, #-0*4]
+        vldr    IN1, [PIN, #-1*4]
+        vldr    IN2, [PIN, #-2*4]
+        vldr    IN3, [PIN, #-3*4]
 .if \decifactor == 32
  .set JMAX, 8
        vpush   {s16-s31}
@@ -165,32 +173,16 @@ POST3         .req    s27
 .endif
        fmxr    FPSCR, OLDFPSCR
        bx      lr
+endfunc
 .endm

-
-/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
- *                         int decifactor, float scale)
- */
-function ff_dca_lfe_fir_vfp, export=1
-        teq     DECIFACTOR, #32
-        fmrx    OLDFPSCR, FPSCR
-        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
-        fmxr    FPSCR, ip
-NOVFP   vldr    s0, [sp]
-        vldr    IN0, [PIN, #-0*4]
-        vldr    IN1, [PIN, #-1*4]
-        vldr    IN2, [PIN, #-2*4]
-        vldr    IN3, [PIN, #-3*4]
-        beq     32f
-64:     dca_lfe_fir  64
+        dca_lfe_fir  64
 .ltorg
-32:     dca_lfe_fir  32
-endfunc
+        dca_lfe_fir  32

        .unreq  POUT
        .unreq  PIN
        .unreq  PCOEF
-        .unreq  DECIFACTOR
        .unreq  OLDFPSCR
        .unreq  COUNTER

--- a/libavcodec/dcadec.c
+++ b/libavcodec/dcadec.c
@@ -957,23 +957,23 @@ static void lfe_interpolation_fir(DCAContext *s, int decimation_select,
     * samples_out: An array holding interpolated samples
     */

-    int decifactor;
+    int idx;
    const float *prCoeff;
    int deciindex;

    /* Select decimation filter */
    if (decimation_select == 1) {
-        decifactor = 64;
+        idx = 1;
        prCoeff = lfe_fir_128;
    } else {
-        decifactor = 32;
+        idx = 0;
        prCoeff = lfe_fir_64;
    }
    /* Interpolation */
    for (deciindex = 0; deciindex < num_deci_sample; deciindex++) {
-        s->dcadsp.lfe_fir(samples_out, samples_in, prCoeff, decifactor, scale);
+        s->dcadsp.lfe_fir[idx](samples_out, samples_in, prCoeff, scale);
        samples_in++;
-        samples_out += 2 * decifactor;
+        samples_out += 2 * 32 * (1 + idx);
    }
 }

--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -32,8 +32,9 @@ static void int8x8_fmul_int32_c(float *dst, const int8_t *src, int scale)
        dst[i] = src[i] * fscale;
 }

-static void dca_lfe_fir_c(float *out, const float *in, const float *coefs,
-                          int decifactor, float scale)
+static inline void
+dca_lfe_fir(float *out, const float *in, const float *coefs,
+            int decifactor, float scale)
 {
    float *out2 = out + decifactor;
    const float *cf0 = coefs;
@@ -82,9 +83,22 @@ static void dca_qmf_32_subbands(float samples_in[32][8], int sb_act,
    }
 }

+static void dca_lfe_fir0_c(float *out, const float *in, const float *coefs,
+                           float scale)
+{
+    dca_lfe_fir(out, in, coefs, 32, scale);
+}
+
+static void dca_lfe_fir1_c(float *out, const float *in, const float *coefs,
+                           float scale)
+{
+    dca_lfe_fir(out, in, coefs, 64, scale);
+}
+
 av_cold void ff_dcadsp_init(DCADSPContext *s)
 {
-    s->lfe_fir = dca_lfe_fir_c;
+    s->lfe_fir[0] = dca_lfe_fir0_c;
+    s->lfe_fir[1] = dca_lfe_fir1_c;
    s->qmf_32_subbands = dca_qmf_32_subbands;
    s->int8x8_fmul_int32 = int8x8_fmul_int32_c;
    if (ARCH_ARM) ff_dcadsp_init_arm(s);
--- a/libavcodec/dcadsp.h
+++ b/libavcodec/dcadsp.h
@@ -23,8 +23,8 @@
 #include "synth_filter.h"

 typedef struct DCADSPContext {
-    void (*lfe_fir)(float *out, const float *in, const float *coefs,
-                    int decifactor, float scale);
+    void (*lfe_fir[2])(float *out, const float *in, const float *coefs,
+                       float scale);
    void (*qmf_32_subbands)(float samples_in[32][8], int sb_act,
                            SynthFilterContext *synth, FFTContext *imdct,
                            float synth_buf_ptr[512],