mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-21 10:55:51 +02:00
arm: Add VFP-accelerated version of dca_lfe_fir
Benchmark results:

                  Before            After
                  Mean     StdDev   Mean     StdDev  Change
This function       868.2    33.5     436.0    27.0  +99.1%
Overall           15973.0   223.2   15577.5    83.2   +2.5%

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
b63bb251ea
commit
ba6836c966
@ -52,7 +52,8 @@ ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \
|
||||
arm/vp8dsp_init_armv6.o \
|
||||
arm/vp8dsp_armv6.o
|
||||
|
||||
VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o
|
||||
VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \
|
||||
arm/synth_filter_vfp.o
|
||||
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
|
||||
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
|
||||
|
||||
|
@ -24,6 +24,8 @@
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavcodec/dcadsp.h"
|
||||
|
||||
void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
|
||||
int decifactor, float scale);
|
||||
void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
|
||||
int decifactor, float scale);
|
||||
|
||||
@ -31,6 +33,8 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
|
||||
{
|
||||
/* Runtime CPU feature flags, as returned by av_get_cpu_flags(). */
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
/*
 * Select the VFP routine only on CPUs that report VFP but not VFPv3.
 * NOTE(review): presumably because the routine depends on the VFP
 * short-vector mode it programs into the FPSCR -- confirm.
 */
if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
|
||||
s->lfe_fir = ff_dca_lfe_fir_vfp;
|
||||
/* Tested last, so the NEON routine replaces the VFP one when both tests pass. */
if (have_neon(cpu_flags))
|
||||
s->lfe_fir = ff_dca_lfe_fir_neon;
|
||||
}
|
||||
|
220
libavcodec/arm/dcadsp_vfp.S
Normal file
220
libavcodec/arm/dcadsp_vfp.S
Normal file
@ -0,0 +1,220 @@
|
||||
/*
|
||||
* Copyright (c) 2013 RISC OS Open Ltd
|
||||
* Author: Ben Avison <bavison@riscosopen.org>
|
||||
*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
* Libav is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Libav is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with Libav; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
/*
 * Register aliases used by ff_dca_lfe_fir_vfp and its macros.
 *
 * Core registers: POUT, PIN and PCOEF are the out, in and coefs pointers.
 * DECIFACTOR and OLDFPSCR deliberately share a4: the function tests
 * DECIFACTOR first and only afterwards overwrites a4 with the saved FPSCR.
 * COUNTER (ip) is the loop counter; ip is also used briefly as a scratch
 * register for the new FPSCR value at function entry.
 */
POUT .req a1
|
||||
PIN .req a2
|
||||
PCOEF .req a3
|
||||
DECIFACTOR .req a4
|
||||
OLDFPSCR .req a4
|
||||
COUNTER .req ip
|
||||
|
||||
/*
 * VFP registers. s0-s7 hold scalars (the input samples, plus the scale
 * factor when decifactor=64). s8-s15, s16-s23 and s24-s31 are each used as
 * two vectors of four floats.
 * SCALE64 and IN4 both name s0: the decifactor=32 path copies the scale
 * into SCALE32 before it loads IN4-IN7, and the decifactor=64 path never
 * touches IN4-IN7.
 */
SCALE32 .req s28 @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
|
||||
SCALE64 .req s0 @ spare register in scalar bank when decifactor=64 / JMAX=4
|
||||
/* Input samples in[0] .. in[-7]; IN0-IN3 live in s4-s7, IN4-IN7 in s0-s3. */
IN0 .req s4
|
||||
IN1 .req s5
|
||||
IN2 .req s6
|
||||
IN3 .req s7
|
||||
IN4 .req s0
|
||||
IN5 .req s1
|
||||
IN6 .req s2
|
||||
IN7 .req s3
|
||||
/* Two coefficient vectors, COEF0-3 and COEF4-7, reloaded alternately. */
COEF0 .req s8 @ coefficient elements
|
||||
COEF1 .req s9
|
||||
COEF2 .req s10
|
||||
COEF3 .req s11
|
||||
COEF4 .req s12
|
||||
COEF5 .req s13
|
||||
COEF6 .req s14
|
||||
COEF7 .req s15
|
||||
/* ACCUM0 names the vector s16-s19, ACCUM4 the vector s20-s23. */
ACCUM0 .req s16 @ double-buffered multiply-accumulate results
|
||||
ACCUM4 .req s20
|
||||
/* POST0-POST3 receive the summed and scaled results that are stored to POUT. */
POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
|
||||
POST1 .req s25
|
||||
POST2 .req s26
|
||||
POST3 .req s27
|
||||
|
||||
|
||||
/*
 * inner_loop decifactor, dir, tail, head
 *
 * One step of the FIR, working on a group of four output samples.
 *   decifactor  32 or 64; selects SCALE32 or SCALE64 and whether the
 *               products for IN4-IN7 are included
 *   dir         "up" reads coefficients forwards starting at PCOEF; any
 *               other value reads them backwards, starting from the last
 *               float of the 4*JMAX-float group at PCOEF
 *   tail        if non-empty, finish the previous group: add the two
 *               accumulator vectors, multiply by the scale and store four
 *               floats at POUT, which is post-incremented
 *   head        if non-empty, load the coefficients of the current group
 *               and accumulate the products into the ACCUM0 vector (even
 *               input indices) and the ACCUM4 vector (odd input indices)
 * Head and tail instructions are interleaved, so the tail of one group
 * overlaps the head of the next. JMAX must have been set by the caller, and
 * the FPSCR must select short vectors of length 4. PCOEF is not modified.
 */
.macro inner_loop decifactor, dir, tail, head
|
||||
/* X = byte offset of coefficient index 0, Y = byte step per coefficient index. */
.ifc "\dir","up"
|
||||
.set X, 0
|
||||
.set Y, 4
|
||||
.else
|
||||
.set X, 4*JMAX*4 - 4
|
||||
.set Y, -4
|
||||
.endif
|
||||
/*
 * head: coefficients for input 0. Coefficient index is k*JMAX + j, where k
 * (0-3) is the output within the group and j is the input sample index.
 */
.ifnc "\head",""
|
||||
vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
|
||||
vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
|
||||
vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
|
||||
vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
|
||||
.endif
|
||||
/* tail: sum the previous group's accumulators before the head below overwrites ACCUM0. */
.ifnc "\tail",""
|
||||
vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
|
||||
.endif
|
||||
/* head: start the even accumulator with input 0, then fetch coefficients for input 1. */
.ifnc "\head",""
|
||||
vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
|
||||
vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
|
||||
vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
|
||||
vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
|
||||
.endif
|
||||
/* tail: apply the scale factor (vector SCALE32 or scalar SCALE64). */
.ifnc "\tail",""
|
||||
vmul.f POST0, POST0, SCALE\decifactor @ vector operation (SCALE may be scalar)
|
||||
.endif
|
||||
/*
 * head: start the odd accumulator with input 1 and fetch coefficients for
 * input 2. Without a tail the multiply is issued right after the COEF7
 * load; with a tail it is issued two loads later.
 * NOTE(review): presumably this spacing avoids stalling behind the POST0
 * multiply above -- confirm.
 */
.ifnc "\head",""
|
||||
vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
|
||||
.ifc "\tail",""
|
||||
vmul.f ACCUM4, COEF4, IN1 @ vector operation
|
||||
.endif
|
||||
vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
|
||||
vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
|
||||
.ifnc "\tail",""
|
||||
vmul.f ACCUM4, COEF4, IN1 @ vector operation
|
||||
.endif
|
||||
vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
|
||||
vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
|
||||
.endif
|
||||
/* tail: write the four finished samples and advance POUT by 16 bytes. */
.ifnc "\tail",""
|
||||
vstmia POUT!, {POST0-POST3}
|
||||
.endif
|
||||
/* head: accumulate inputs 2 and 3, alternating between the two coefficient vectors. */
.ifnc "\head",""
|
||||
vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
|
||||
vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
|
||||
vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
|
||||
vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
|
||||
vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
|
||||
vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
|
||||
/* decifactor=32 only (JMAX=8): four more inputs, IN4-IN7, per output. */
.if \decifactor == 32
|
||||
vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
|
||||
vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
|
||||
vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
|
||||
vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
|
||||
vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
|
||||
vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
|
||||
vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
|
||||
vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
|
||||
vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
|
||||
vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
|
||||
vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
|
||||
vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
|
||||
vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
|
||||
vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
|
||||
vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
|
||||
vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
|
||||
vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
|
||||
vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
|
||||
vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
|
||||
vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
|
||||
.endif
|
||||
.endif
|
||||
.endm
|
||||
|
||||
/*
 * dca_lfe_fir decifactor
 *
 * Complete function body for one decifactor value (32 or 64). On entry
 * IN0-IN3 and s0 (the scale) must be loaded, OLDFPSCR must hold the FPSCR
 * value to restore, and the FPSCR must select short vectors of length 4.
 * Stores decifactor floats at POUT using coefficients read forwards, then
 * decifactor more using coefficients read backwards, restores the saved
 * VFP registers and the FPSCR, and returns to the caller.
 */
.macro dca_lfe_fir decifactor
|
||||
.if \decifactor == 32
|
||||
/* JMAX = number of input samples (and coefficients) per output sample. */
.set JMAX, 8
|
||||
/* Save every register from s16 upwards that this variant writes (s28-s31 hold SCALE32). */
vpush {s16-s31}
|
||||
/* Must precede the IN4 load below, because IN4 is also s0. */
vmov SCALE32, s0 @ duplicate scalar across vector
|
||||
vldr IN4, [PIN, #-4*4]
|
||||
vldr IN5, [PIN, #-5*4]
|
||||
vldr IN6, [PIN, #-6*4]
|
||||
vldr IN7, [PIN, #-7*4]
|
||||
.else
|
||||
.set JMAX, 4
|
||||
/* s28-s31 are not written in this variant; the scale stays in s0. */
vpush {s16-s27}
|
||||
.endif
|
||||
|
||||
/*
 * Forward pass: decifactor/4 groups of four outputs. The first head and the
 * last tail sit outside the loop, so the loop body runs decifactor/4 - 1
 * times. The flags set by subs are still valid at bne because inner_loop
 * expands only to VFP loads, stores and arithmetic.
 */
mov COUNTER, #\decifactor/4 - 1
|
||||
inner_loop \decifactor, up,, head
|
||||
1: add PCOEF, PCOEF, #4*JMAX*4
|
||||
subs COUNTER, COUNTER, #1
|
||||
inner_loop \decifactor, up, tail, head
|
||||
bne 1b
|
||||
inner_loop \decifactor, up, tail
|
||||
|
||||
/*
 * Backward pass: same structure, starting from the group PCOEF was left on
 * by the forward pass and stepping PCOEF down by one group per iteration.
 */
mov COUNTER, #\decifactor/4 - 1
|
||||
inner_loop \decifactor, down,, head
|
||||
1: sub PCOEF, PCOEF, #4*JMAX*4
|
||||
subs COUNTER, COUNTER, #1
|
||||
inner_loop \decifactor, down, tail, head
|
||||
bne 1b
|
||||
inner_loop \decifactor, down, tail
|
||||
|
||||
/* Restore exactly the registers pushed on entry. */
.if \decifactor == 32
|
||||
vpop {s16-s31}
|
||||
.else
|
||||
vpop {s16-s27}
|
||||
.endif
|
||||
/* Put back the FPSCR value the function saved before enabling short vectors. */
fmxr FPSCR, OLDFPSCR
|
||||
bx lr
|
||||
.endm
|
||||
|
||||
|
||||
/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
|
||||
* int decifactor, float scale)
|
||||
*/
|
||||
/*
 * Entry point. Saves the FPSCR, switches the VFP to short vectors of
 * length 4, loads the scale and the first four input samples, then runs the
 * dca_lfe_fir body for decifactor 32 or 64. Any decifactor other than 32
 * takes the 64 path. Each body restores the FPSCR and returns itself.
 */
function ff_dca_lfe_fir_vfp, export=1
|
||||
/* Set Z now: the next instruction overwrites a4 (DECIFACTOR is also OLDFPSCR). */
teq DECIFACTOR, #32
|
||||
fmrx OLDFPSCR, FPSCR
|
||||
ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
|
||||
fmxr FPSCR, ip
|
||||
/*
 * NOTE(review): NOVFP is defined outside this file; presumably it emits the
 * load only when floats are not passed in VFP registers, in which case the
 * fifth argument (scale) is at [sp] -- confirm.
 */
NOVFP vldr s0, [sp]
|
||||
/* Input samples in[0], in[-1], in[-2], in[-3]. */
vldr IN0, [PIN, #-0*4]
|
||||
vldr IN1, [PIN, #-1*4]
|
||||
vldr IN2, [PIN, #-2*4]
|
||||
vldr IN3, [PIN, #-3*4]
|
||||
/* Uses the flags from the teq above; none of the instructions in between change them. */
beq 32f
|
||||
64: dca_lfe_fir 64
|
||||
/* Literal pool for the ldr above; not executed, since the 64 body ends in bx lr. */
.ltorg
|
||||
32: dca_lfe_fir 32
|
||||
endfunc
|
||||
|
||||
/* Remove every register alias defined at the top of the file. */
.unreq POUT
|
||||
.unreq PIN
|
||||
.unreq PCOEF
|
||||
.unreq DECIFACTOR
|
||||
.unreq OLDFPSCR
|
||||
.unreq COUNTER
|
||||
|
||||
.unreq SCALE32
|
||||
.unreq SCALE64
|
||||
.unreq IN0
|
||||
.unreq IN1
|
||||
.unreq IN2
|
||||
.unreq IN3
|
||||
.unreq IN4
|
||||
.unreq IN5
|
||||
.unreq IN6
|
||||
.unreq IN7
|
||||
.unreq COEF0
|
||||
.unreq COEF1
|
||||
.unreq COEF2
|
||||
.unreq COEF3
|
||||
.unreq COEF4
|
||||
.unreq COEF5
|
||||
.unreq COEF6
|
||||
.unreq COEF7
|
||||
.unreq ACCUM0
|
||||
.unreq ACCUM4
|
||||
.unreq POST0
|
||||
.unreq POST1
|
||||
.unreq POST2
|
||||
.unreq POST3
|
Loading…
Reference in New Issue
Block a user