mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
arm: Add VFP-accelerated version of imdct_half
                  Before            After
                  Mean     StdDev   Mean     StdDev   Change
This function      2653.0    28.5    1108.8    51.4   +139.3%
Overall           17049.5   408.2   15973.0   223.2     +6.7%

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
d6e4f5fef0
commit
b63bb251ea
@ -53,6 +53,7 @@ ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \
|
||||
arm/vp8dsp_armv6.o
|
||||
|
||||
VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o
|
||||
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
|
||||
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
|
||||
|
||||
NEON-OBJS += arm/fmtconvert_neon.o
|
||||
|
@ -26,6 +26,8 @@
|
||||
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
|
||||
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
|
||||
|
||||
void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
|
||||
void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
@ -48,6 +50,13 @@ av_cold void ff_fft_init_arm(FFTContext *s)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_vfp(cpu_flags)) {
|
||||
#if CONFIG_MDCT
|
||||
if (!have_vfpv3(cpu_flags))
|
||||
s->imdct_half = ff_imdct_half_vfp;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
s->fft_permute = ff_fft_permute_neon;
|
||||
s->fft_calc = ff_fft_calc_neon;
|
||||
|
206
libavcodec/arm/mdct_vfp.S
Normal file
206
libavcodec/arm/mdct_vfp.S
Normal file
@ -0,0 +1,206 @@
|
||||
/*
|
||||
* Copyright (c) 2013 RISC OS Open Ltd
|
||||
* Author: Ben Avison <bavison@riscosopen.org>
|
||||
*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
* Libav is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Libav is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with Libav; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
/*
 * Register aliases shared by the two macros and by ff_imdct_half_vfp.
 *
 * CONTEXT, ORIGOUT and IN name a1-a3, the registers in which the three
 * arguments of ff_imdct_half_vfp (s, output, input) arrive.
 * OUT, REVTAB, TCOS, TSIN and OLDFPSCR live in v1-v5, which the function
 * pushes on entry and pops on exit.
 * J0-J3 are scratch address registers used by prerotation_innerloop.
 * J0 is the same register as ORIGOUT (a2), which is why the function
 * copies ORIGOUT into OUT before the pre-rotation and copies it back
 * before calling fft_calc. J3 is lr, which the function also saves.
 */
CONTEXT .req a1
|
||||
ORIGOUT .req a2
|
||||
IN .req a3
|
||||
OUT .req v1
|
||||
REVTAB .req v2
|
||||
TCOS .req v3
|
||||
TSIN .req v4
|
||||
OLDFPSCR .req v5
|
||||
J0 .req a2
|
||||
J1 .req a4
|
||||
J2 .req ip
|
||||
J3 .req lr
|
||||
|
||||
/*
 * prerotation_innerloop
 *
 * One unrolled pre-rotation step. Each expansion writes four 8-byte
 * entries of OUT (two single-precision words per entry), at the slots
 * selected through REVTAB.
 *
 * Assembly-time inputs:  k  (current index, advanced by 2 at the end)
 *                        n4 (set by the code that expands the macro)
 * Run-time inputs:       IN, OUT, REVTAB, TCOS, TSIN
 * Registers written:     s0-s15, d8-d11 (s16-s23), J0-J3
 *
 * The arithmetic instructions marked "vector" rely on the FPSCR having
 * been set up for short vectors of length 4, so each one processes four
 * consecutive single-precision registers starting at the one named.
 * The instruction order is hand-interleaved (loads, integer address
 * arithmetic and VFP arithmetic alternate); do not reorder.
 */
.macro  prerotation_innerloop
 .set   tw_lo, k                            @ twiddle index counted from the start
 .set   tw_hi, n4 - k - 2                   @ mirrored twiddle index counted from the end
 .set   smp_lo, tw_lo * 2                   @ matching offsets into IN, in floats
 .set   smp_hi, tw_hi * 2
        vldr    d8, [TCOS, #tw_lo*4]        @ s16,s17
        vldr    d9, [TCOS, #tw_hi*4]        @ s18,s19
        vldr    s0, [IN, #smp_hi*4 + 12]    @ s0-s3: odd-indexed inputs, paired with s16-s19
        vldr    s1, [IN, #smp_hi*4 + 4]
        vldr    s2, [IN, #smp_lo*4 + 12]
        vldr    s3, [IN, #smp_lo*4 + 4]
        vmul.f  s8, s0, s16                 @ vector: s8-s11 = s0-s3 * s16-s19
        vldr    d10, [TSIN, #tw_lo*4]       @ s20,s21
        vldr    d11, [TSIN, #tw_hi*4]       @ s22,s23
        vldr    s4, [IN, #smp_lo*4]         @ s4-s7: even-indexed inputs
        vldr    s5, [IN, #smp_lo*4 + 8]
        vldr    s6, [IN, #smp_hi*4]
        vldr    s7, [IN, #smp_hi*4 + 8]
        ldr     J0, [REVTAB, #tw_lo*2]      @ one word = two halfword REVTAB entries
        vmul.f  s12, s0, s20                @ vector: s12-s15 = s0-s3 * s20-s23
        ldr     J2, [REVTAB, #tw_hi*2]
        mov     J1, J0, lsr #16             @ upper halfword -> J1
        and     J0, J0, #255                @ lower halfword; it is < n4, so 8 bits suffice
        vmls.f  s8, s4, s20                 @ vector: s8-s11 -= s4-s7 * s20-s23
        mov     J3, J2, lsr #16
        and     J2, J2, #255                @ lower halfword; it is < n4, so 8 bits suffice
        add     J0, OUT, J0, lsl #3         @ table entry -> address of an 8-byte OUT slot
        vmla.f  s12, s4, s16                @ vector: s12-s15 += s4-s7 * s16-s19
        add     J1, OUT, J1, lsl #3
        add     J2, OUT, J2, lsl #3
        add     J3, OUT, J3, lsl #3
        vstr    s8, [J0]                    @ first word of each of the four slots
        vstr    s9, [J1]
        vstr    s10, [J2]
        vstr    s11, [J3]
        vstr    s12, [J0, #4]               @ second word of each of the four slots
        vstr    s13, [J1, #4]
        vstr    s14, [J2, #4]
        vstr    s15, [J3, #4]
 .set   k, k + 2                            @ next expansion handles the next pair
.endm
|
||||
|
||||
/*
 * postrotation_innerloop tail, head
 *
 * One stage of the software-pipelined post-rotation, performed in place
 * on OUT. Either argument may be left empty:
 *
 *   head - non-empty: load the twiddles and OUT values for index k and
 *          issue the two initial multiplies, then advance k by 2.
 *   tail - non-empty: finish the iteration that the previous expansion
 *          started (index k - 2): apply the multiply-subtract and
 *          multiply-accumulate with the cosine values and store the
 *          eight results back to OUT.
 *
 * Tail and head instructions are interleaved on purpose. The tail
 * arithmetic reads s0-s7 before the head loads overwrite them, so the
 * instruction order must not be changed.
 *
 * The cosine registers alternate between d10/d11 and d12/d13 on
 * successive expansions, so that the head can load new values while the
 * tail still uses the ones loaded by the previous expansion.
 *
 * Assembly-time inputs:  k, n8.   Run-time inputs: OUT, TCOS, TSIN.
 * Registers written:     s0-s15, d8/d9 (s16-s19), and d10/d11 or d12/d13.
 * Requires the FPSCR short-vector length of 4, as for the pre-rotation.
 */
.macro  postrotation_innerloop tail, head
 .set   tw_lo_h, n8 - k - 2                 @ indices for the iteration being started
 .set   tw_hi_h, n8 + k
 .set   dst_lo_h, tw_lo_h * 2
 .set   dst_hi_h, tw_hi_h * 2
 .set   tw_lo_t, n8 - (k - 2) - 2           @ indices for the iteration being finished
 .set   tw_hi_t, n8 + (k - 2)
 .set   dst_lo_t, tw_lo_t * 2
 .set   dst_hi_t, tw_hi_t * 2
 .if (k & 2) != 0
  HEAD_COS_LO   .req d12 @ s24,s25
  HEAD_COS_HI   .req d13 @ s26,s27
  TAIL_COS      .req s20 @ start of the vector loaded by the previous expansion
 .else
  HEAD_COS_LO   .req d10 @ s20,s21
  HEAD_COS_HI   .req d11 @ s22,s23
  TAIL_COS      .req s24 @ start of the vector loaded by the previous expansion
 .endif
 .ifnc "\tail",""
        vmls.f  s8, s0, TAIL_COS            @ vector: s8-s11 -= s0-s3 * cos
 .endif
 .ifnc "\head",""
        vldr    d8, [TSIN, #tw_lo_h*4]      @ s16,s17
        vldr    d9, [TSIN, #tw_hi_h*4]      @ s18,s19
        vldr    HEAD_COS_LO, [TCOS, #tw_lo_h*4]
 .endif
 .ifnc "\tail",""
        vmla.f  s12, s4, TAIL_COS           @ vector: s12-s15 += s4-s7 * cos
 .endif
 .ifnc "\head",""
        vldr    s0, [OUT, #dst_lo_h*4]      @ s0-s3: first words of four OUT entries
        vldr    s1, [OUT, #dst_lo_h*4 + 8]
        vldr    s2, [OUT, #dst_hi_h*4]
        vldr    s3, [OUT, #dst_hi_h*4 + 8]
        vldr    s4, [OUT, #dst_lo_h*4 + 4]  @ s4-s7: second words of the same entries
        vldr    s5, [OUT, #dst_lo_h*4 + 12]
        vldr    s6, [OUT, #dst_hi_h*4 + 4]
        vldr    s7, [OUT, #dst_hi_h*4 + 12]
 .endif
 .ifnc "\tail",""
        vstr    s8, [OUT, #dst_lo_t*4]      @ s8-s11 go to the first words, same order as loaded
        vstr    s9, [OUT, #dst_lo_t*4 + 8]
        vstr    s10, [OUT, #dst_hi_t*4]
        vstr    s11, [OUT, #dst_hi_t*4 + 8]
 .endif
 .ifnc "\head",""
        vmul.f  s8, s4, s16                 @ vector: s8-s11 = s4-s7 * sin
 .endif
 .ifnc "\tail",""
        vstr    s12, [OUT, #dst_hi_t*4 + 12] @ s12-s15 go to the second words, mirrored order
        vstr    s13, [OUT, #dst_hi_t*4 + 4]
        vstr    s14, [OUT, #dst_lo_t*4 + 12]
        vstr    s15, [OUT, #dst_lo_t*4 + 4]
 .endif
 .ifnc "\head",""
        vmul.f  s12, s0, s16                @ vector: s12-s15 = s0-s3 * sin
        vldr    HEAD_COS_HI, [TCOS, #tw_hi_h*4]
 .endif
 .unreq HEAD_COS_LO
 .unreq HEAD_COS_HI
 .unreq TAIL_COS
 .ifnc "\head",""
 .set   k, k + 2                            @ only a head starts a new iteration
 .endif
.endm
|
||||
|
||||
|
||||
/*
 * void ff_imdct_half_vfp(FFTContext *s,
 *                        FFTSample *output,
 *                        const FFTSample *input)
 *
 * VFP short-vector implementation of imdct_half for mdct_bits == 6 only.
 * For any other size the call is forwarded unchanged to ff_imdct_half_c.
 *
 * In:   a1 = s (CONTEXT), a2 = output (ORIGOUT), a3 = input (IN)
 * Out:  result written to output
 * Saves and restores v1-v5, s16-s27 and the FPSCR.
 *
 * Steps:
 *   1. pre-rotation of the input into output (n8/2 unrolled expansions),
 *   2. s->fft_calc(s, output), called with the original FPSCR restored,
 *   3. post-rotation of output in place (software-pipelined expansions).
 *
 * The context fields are read at fixed word offsets (2, 5, 6, 7 and 9),
 * so this code must be kept in step with the FFTContext layout.
 */
function ff_imdct_half_vfp, export=1
        ldr     ip, [CONTEXT, #5*4]         @ mdct_bits
        teq     ip, #6
        it      ne
        @ Only the size used by DCA is accelerated. Nothing has been pushed
        @ and a1-a3 are untouched here, so the C version can be tail-called.
        @ X() adds the platform prefix for external symbols, matching the
        @ way "function ..., export=1" names this function.
        bne     X(ff_imdct_half_c)

        .set    n, 1<<6
        .set    n2, n/2
        .set    n4, n/4
        .set    n8, n/8

        push    {v1-v5,lr}
        vpush   {s16-s27}
        fmrx    OLDFPSCR, FPSCR             @ keep the FPSCR of the caller
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr
        mov     OUT, ORIGOUT                @ a2 doubles as J0 in the pre-rotation
        ldr     REVTAB, [CONTEXT, #2*4]
        ldr     TCOS, [CONTEXT, #6*4]
        ldr     TSIN, [CONTEXT, #7*4]

        .set    k, 0
        .rept   n8/2
        prerotation_innerloop
        .endr

        fmxr    FPSCR, OLDFPSCR             @ call out with the original FPSCR
        mov     ORIGOUT, OUT                @ a2 = output again; a1 still holds s
        ldr     ip, [CONTEXT, #9*4]
        blx     ip                          @ s->fft_calc(s, output)
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr

        .set    k, 0
        postrotation_innerloop , head       @ prime the pipeline
        .rept   n8/2 - 1
        postrotation_innerloop tail, head
        .endr
        postrotation_innerloop tail         @ drain the pipeline

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s27}
        pop     {v1-v5,pc}
endfunc
|
||||
|
||||
/*
 * Release every register alias that was declared with .req earlier in
 * this file, one .unreq per alias.
 */
.unreq CONTEXT
|
||||
.unreq ORIGOUT
|
||||
.unreq IN
|
||||
.unreq OUT
|
||||
.unreq REVTAB
|
||||
.unreq TCOS
|
||||
.unreq TSIN
|
||||
.unreq OLDFPSCR
|
||||
.unreq J0
|
||||
.unreq J1
|
||||
.unreq J2
|
||||
.unreq J3
|
@ -132,7 +132,7 @@ function ff_synth_filter_float_vfp, export=1
|
||||
str lr, [P_SB_OFF] @ rotate offset, modulo buffer size, ready for next call
|
||||
ldr a3, [sp, #(16+6+2)*4] @ fetch in from stack, to pass to imdct_half
|
||||
VFP vmov s16, SCALE @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
|
||||
bl ff_imdct_half_c
|
||||
bl ff_imdct_half_vfp
|
||||
VFP vmov SCALE, s16
|
||||
|
||||
fmrx OLDFPSCR, FPSCR
|
||||
|
Loading…
x
Reference in New Issue
Block a user