mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
arm: Add VFP-accelerated version of fft16
Benchmark (cycles, mean ± stddev): this function: before 1389.3 ± 4.2, after 967.8 ± 35.1 (+43.6%); overall: before 15577.5 ± 83.2, after 15400.0 ± 336.4 (+1.2%). Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
ba6836c966
commit
8b9eba664e
@ -54,6 +54,7 @@ ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \
|
||||
|
||||
VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \
|
||||
arm/synth_filter_vfp.o
|
||||
VFP-OBJS-$(CONFIG_FFT) += arm/fft_vfp.o
|
||||
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
|
||||
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
|
||||
|
||||
|
298
libavcodec/arm/fft_vfp.S
Normal file
298
libavcodec/arm/fft_vfp.S
Normal file
@ -0,0 +1,298 @@
|
||||
/*
|
||||
* Copyright (c) 2013 RISC OS Open Ltd
|
||||
* Author: Ben Avison <bavison@riscosopen.org>
|
||||
*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
* Libav is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Libav is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with Libav; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ TODO: * FFTs wider than 16
|
||||
@ * dispatch code
|
||||
|
||||
@ fft4_vfp: in-place 4-point complex FFT.
@ In:      a1 (r0) = pointer to 4 interleaved single-precision complex
@                    values; offsets below are index * 2 floats * 4 bytes.
@ Out:     results written back to [a1]; a1 preserved.
@ Clobbers: s0-s15 (caller-saved VFP bank), flags untouched.
@ Register aliasing: d0 = {s0,s1}, d1 = {s2,s3}, d2 = {s4,s5}, d3 = {s6,s7};
@ doubleword vldr/vstr are used purely to move float pairs.
@ NOTE(review): s8-s15 lie in a vectorizable VFP bank, so this code appears
@ to assume FPSCR is in scalar mode (LEN=1) on entry — confirm at call sites.
@ "@ stall" comments mark deliberate scheduling gaps for VFP result latency.
function fft4_vfp
        vldr    d0, [a1, #0*2*4]        @ s0,s1   = z[0]
        vldr    d4, [a1, #1*2*4]        @ s8,s9   = z[1]
        vldr    d1, [a1, #2*2*4]        @ s2,s3   = z[2]
        vldr    d5, [a1, #3*2*4]        @ s10,s11 = z[3]
        @ stall
        @ First butterfly stage: sums and differences of the four inputs.
        vadd.f  s12, s0, s8             @ i0
        vadd.f  s13, s1, s9             @ i1
        vadd.f  s14, s2, s10            @ i2
        vadd.f  s15, s3, s11            @ i3
        vsub.f  s8, s0, s8              @ i4
        vsub.f  s9, s1, s9              @ i5
        vsub.f  s10, s2, s10            @ i6
        vsub.f  s11, s3, s11            @ i7
        @ stall
        @ stall
        @ Second butterfly stage: combine intermediates into outputs.
        vadd.f  s0, s12, s14            @ z[0].re
        vsub.f  s4, s12, s14            @ z[2].re
        vadd.f  s1, s13, s15            @ z[0].im
        vsub.f  s5, s13, s15            @ z[2].im
        vadd.f  s7, s9, s10             @ z[3].im
        vsub.f  s3, s9, s10             @ z[1].im
        vadd.f  s2, s8, s11             @ z[1].re
        vsub.f  s6, s8, s11             @ z[3].re
        @ stall
        @ stall
        @ Store results: d0={z[0]}, d1={z[1]}, d2={z[2]}, d3={z[3]}.
        vstr    d0, [a1, #0*2*4]
        vstr    d2, [a1, #2*2*4]
        @ stall
        @ stall
        vstr    d1, [a1, #1*2*4]
        vstr    d3, [a1, #3*2*4]

        bx      lr
endfunc
|
||||
|
||||
@ macro_fft8_head: first part of an in-place 8-point complex FFT on [a1].
@ Requires FPSCR short-vector mode: LEN=4, STRIDE=1 (callers load 0x03030000),
@ so ops marked "@ vector op" process 4 consecutive registers per instruction,
@ while operations entirely within s0-s7 (VFP bank 0) always remain scalar.
@ Clobbers s0-s31; leaves two results pending in d8/d9 for macro_fft8_tail,
@ allowing callers to overlap independent loads before the final stores.
@ Register moves between banks are done through [a1] scratch stores/loads,
@ since VFP has no cheap cross-bank register-to-register vector move.
.macro macro_fft8_head
        @ FFT4
        vldr    d4, [a1, #0 * 2*4]
        vldr    d6, [a1, #1 * 2*4]
        vldr    d5, [a1, #2 * 2*4]
        vldr    d7, [a1, #3 * 2*4]
        @ BF
        vldr    d12, [a1, #4 * 2*4]
        vadd.f  s16, s8, s12            @ vector op
        vldr    d14, [a1, #5 * 2*4]
        vldr    d13, [a1, #6 * 2*4]
        vldr    d15, [a1, #7 * 2*4]
        vsub.f  s20, s8, s12            @ vector op
        vadd.f  s0, s16, s18
        vsub.f  s2, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s3, s17, s19
        vadd.f  s7, s21, s22
        vsub.f  s5, s21, s22
        vadd.f  s4, s20, s23
        vsub.f  s6, s20, s23
        vsub.f  s20, s24, s28           @ vector op
        vstr    d0, [a1, #0 * 2*4]      @ transfer s0-s7 to s24-s31 via memory
        vstr    d1, [a1, #1 * 2*4]
        vldr    s0, cos1pi4             @ scalar twiddle factor (bank 0)
        vadd.f  s16, s24, s28           @ vector op
        vstr    d2, [a1, #2 * 2*4]
        vstr    d3, [a1, #3 * 2*4]
        vldr    d12, [a1, #0 * 2*4]
        @ TRANSFORM
        vmul.f  s20, s20, s0            @ vector x scalar op
        vldr    d13, [a1, #1 * 2*4]
        vldr    d14, [a1, #2 * 2*4]
        vldr    d15, [a1, #3 * 2*4]
        @ BUTTERFLIES
        vadd.f  s0, s18, s16
        vadd.f  s1, s17, s19
        vsub.f  s2, s17, s19
        vsub.f  s3, s18, s16
        vadd.f  s4, s21, s20
        vsub.f  s5, s21, s20
        vadd.f  s6, s22, s23
        vsub.f  s7, s22, s23
        vadd.f  s8, s0, s24             @ vector op
        vstr    d0, [a1, #0 * 2*4]      @ transfer s0-s3 to s12-s15 via memory
        vstr    d1, [a1, #1 * 2*4]
        vldr    d6, [a1, #0 * 2*4]
        vldr    d7, [a1, #1 * 2*4]
        vadd.f  s1, s5, s6
        vadd.f  s0, s7, s4
        vsub.f  s2, s5, s6
        vsub.f  s3, s7, s4
        vsub.f  s12, s24, s12           @ vector op
        vsub.f  s5, s29, s1
        vsub.f  s4, s28, s0
        vsub.f  s6, s30, s2
        vsub.f  s7, s31, s3
        vadd.f  s16, s0, s28            @ vector op
        vstr    d6, [a1, #4 * 2*4]
        vstr    d7, [a1, #6 * 2*4]
        vstr    d4, [a1, #0 * 2*4]
        vstr    d5, [a1, #2 * 2*4]
        vstr    d2, [a1, #5 * 2*4]
        vstr    d3, [a1, #7 * 2*4]
.endm
|
||||
|
||||
@ macro_fft8_tail: completes macro_fft8_head by storing the deferred results
@ held in d8 (s16,s17) and d9 (s18,s19) to z[1] and z[3].  Split out so that
@ callers (e.g. ff_fft16_vfp) can schedule independent loads in between.
.macro macro_fft8_tail
        vstr    d8, [a1, #1 * 2*4]
        vstr    d9, [a1, #3 * 2*4]
.endm
|
||||
|
||||
@ fft8_vfp: in-place 8-point complex FFT.
@ In:      a1 (r0) = pointer to 8 interleaved single-precision complex values.
@ Clobbers: a2, a3, s0-s15; s16-s31 are saved/restored on the stack.
@ Switches FPSCR into RunFast short-vector mode (LEN=4, STRIDE=1) for the
@ body and restores the caller's FPSCR before returning.
function fft8_vfp
        ldr     a3, =0x03030000         @ RunFast mode, vector length 4, stride 1
        fmrx    a2, FPSCR               @ save caller's FPSCR
        fmxr    FPSCR, a3
        vpush   {s16-s31}               @ callee-saved VFP registers

        macro_fft8_head
        macro_fft8_tail

        vpop    {s16-s31}
        fmxr    FPSCR, a2               @ restore caller's FPSCR
        bx      lr
endfunc
|
||||
|
||||
@ Twiddle-factor constants.  8-byte aligned so that a doubleword vldr can
@ fetch the adjacent pair {cos1pi4, cos1pi8} in one load (see ff_fft16_vfp).
        .align 3
cos1pi4:                                @ cos(1*pi/4) = sqrt(2)/2
        .float 0.707106769084930419921875
cos1pi8:                                @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
        .float 0.92387950420379638671875
cos3pi8:                                @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
        .float 0.3826834261417388916015625
|
||||
|
||||
@ void ff_fft16_vfp(FFTComplex *z)
@ In-place 16-point complex FFT, VFP short-vector implementation.
@ In:      a1 (r0) = z, 16 interleaved single-precision complex values
@          (offsets below are index * 2 floats * 4 bytes; assumes the usual
@          interleaved re/im layout of FFTComplex — TODO confirm vs. header).
@ Clobbers: a2, a3, s0-s15; s16-s31 saved/restored.  FPSCR is switched to
@ RunFast mode with LEN=4, STRIDE=1 ("@ vector op" lines act on 4 registers;
@ s0-s7 in bank 0 always stay scalar) and restored before returning.
@ Structure: FFT8 on z[0..7] (via macros), FFT4 on z[8..11] and z[12..15],
@ then the four TRANSFORM passes combine the sub-transforms with the
@ twiddle factors cos1pi4/cos1pi8/cos3pi8.
function ff_fft16_vfp, export=1
        ldr     a3, =0x03030000         @ RunFast mode, vector length 4, stride 1
        fmrx    a2, FPSCR               @ save caller's FPSCR
        fmxr    FPSCR, a3
        vpush   {s16-s31}               @ callee-saved VFP registers

        macro_fft8_head
        @ FFT4(z+8) — loads interleaved with the FFT8 tail for scheduling.
        vldr    d10, [a1, #8 * 2*4]
        vldr    d12, [a1, #9 * 2*4]
        vldr    d11, [a1, #10 * 2*4]
        vldr    d13, [a1, #11 * 2*4]
        macro_fft8_tail
        vadd.f  s16, s20, s24           @ vector op
        @ FFT4(z+12)
        vldr    d4, [a1, #12 * 2*4]
        vldr    d6, [a1, #13 * 2*4]
        vldr    d5, [a1, #14 * 2*4]
        vsub.f  s20, s20, s24           @ vector op
        vldr    d7, [a1, #15 * 2*4]
        vadd.f  s0, s16, s18
        vsub.f  s4, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s5, s17, s19
        vadd.f  s7, s21, s22
        vsub.f  s3, s21, s22
        vadd.f  s2, s20, s23
        vsub.f  s6, s20, s23
        vadd.f  s16, s8, s12            @ vector op
        vstr    d0, [a1, #8 * 2*4]
        vstr    d2, [a1, #10 * 2*4]
        vstr    d1, [a1, #9 * 2*4]
        vsub.f  s20, s8, s12
        vstr    d3, [a1, #11 * 2*4]
        @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
        vldr    d12, [a1, #10 * 2*4]
        vadd.f  s0, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s6, s16, s18
        vsub.f  s7, s17, s19
        vsub.f  s3, s21, s22
        vadd.f  s2, s20, s23
        vadd.f  s5, s21, s22
        vsub.f  s4, s20, s23
        vstr    d0, [a1, #12 * 2*4]
        vmov    s0, s6                  @ keep s6 live across the next reuse of s6
        @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
        vldr    d6, [a1, #9 * 2*4]
        vstr    d1, [a1, #13 * 2*4]
        vldr    d1, cos1pi4             @ s2 = cos1pi4, s3 = cos1pi8 (adjacent pair)
        vstr    d2, [a1, #15 * 2*4]
        vldr    d7, [a1, #13 * 2*4]
        vadd.f  s4, s25, s24
        vsub.f  s5, s25, s24
        vsub.f  s6, s0, s7
        vadd.f  s7, s0, s7
        vmul.f  s20, s12, s3            @ vector op
        @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
        vldr    d4, [a1, #11 * 2*4]
        vldr    d5, [a1, #15 * 2*4]
        vldr    s1, cos3pi8
        vmul.f  s24, s4, s2             @ vector * scalar op
        vmul.f  s28, s12, s1            @ vector * scalar op
        vmul.f  s12, s8, s1             @ vector * scalar op
        vadd.f  s4, s20, s29
        vsub.f  s5, s21, s28
        vsub.f  s6, s22, s31
        vadd.f  s7, s23, s30
        vmul.f  s8, s8, s3              @ vector * scalar op
        vldr    d8, [a1, #1 * 2*4]
        vldr    d9, [a1, #5 * 2*4]
        vldr    d10, [a1, #3 * 2*4]
        vldr    d11, [a1, #7 * 2*4]
        vldr    d14, [a1, #2 * 2*4]
        vadd.f  s0, s6, s4
        vadd.f  s1, s5, s7
        vsub.f  s2, s5, s7
        vsub.f  s3, s6, s4
        vadd.f  s4, s12, s9
        vsub.f  s5, s13, s8
        vsub.f  s6, s14, s11
        vadd.f  s7, s15, s10
        vadd.f  s12, s0, s16            @ vector op
        vstr    d0, [a1, #1 * 2*4]      @ transfer s0-s3 to another bank via memory
        vstr    d1, [a1, #5 * 2*4]
        vldr    d4, [a1, #1 * 2*4]
        vldr    d5, [a1, #5 * 2*4]
        vadd.f  s0, s6, s4
        vadd.f  s1, s5, s7
        vsub.f  s2, s5, s7
        vsub.f  s3, s6, s4
        vsub.f  s8, s16, s8             @ vector op
        vstr    d6, [a1, #1 * 2*4]
        vstr    d7, [a1, #5 * 2*4]
        vldr    d15, [a1, #6 * 2*4]
        vsub.f  s4, s20, s0
        vsub.f  s5, s21, s1
        vsub.f  s6, s22, s2
        vsub.f  s7, s23, s3
        vadd.f  s20, s0, s20            @ vector op
        vstr    d4, [a1, #9 * 2*4]
        @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
        vldr    d6, [a1, #8 * 2*4]
        vstr    d5, [a1, #13 * 2*4]
        vldr    d7, [a1, #12 * 2*4]
        vstr    d2, [a1, #11 * 2*4]
        vldr    d8, [a1, #0 * 2*4]
        vstr    d3, [a1, #15 * 2*4]
        vldr    d9, [a1, #4 * 2*4]
        vadd.f  s0, s26, s24
        vadd.f  s1, s25, s27
        vsub.f  s2, s25, s27
        vsub.f  s3, s26, s24
        vadd.f  s4, s14, s12
        vadd.f  s5, s13, s15
        vsub.f  s6, s13, s15
        vsub.f  s7, s14, s12
        vadd.f  s8, s0, s28             @ vector op
        vstr    d0, [a1, #3 * 2*4]      @ transfer s0-s3 to another bank via memory
        vstr    d1, [a1, #7 * 2*4]
        vldr    d6, [a1, #3 * 2*4]
        vldr    d7, [a1, #7 * 2*4]
        vsub.f  s0, s16, s4
        vsub.f  s1, s17, s5
        vsub.f  s2, s18, s6
        vsub.f  s3, s19, s7
        vsub.f  s12, s28, s12           @ vector op
        vadd.f  s16, s4, s16            @ vector op
        @ Final stores of all 16 results.
        vstr    d10, [a1, #3 * 2*4]
        vstr    d11, [a1, #7 * 2*4]
        vstr    d4, [a1, #2 * 2*4]
        vstr    d5, [a1, #6 * 2*4]
        vstr    d0, [a1, #8 * 2*4]
        vstr    d1, [a1, #12 * 2*4]
        vstr    d6, [a1, #10 * 2*4]
        vstr    d7, [a1, #14 * 2*4]
        vstr    d8, [a1, #0 * 2*4]
        vstr    d9, [a1, #4 * 2*4]

        vpop    {s16-s31}
        fmxr    FPSCR, a2               @ restore caller's FPSCR
        bx      lr
endfunc
|
@ -174,9 +174,8 @@ function ff_imdct_half_vfp, export=1
|
||||
.endr
|
||||
|
||||
fmxr FPSCR, OLDFPSCR
|
||||
mov ORIGOUT, OUT
|
||||
ldr ip, [CONTEXT, #9*4]
|
||||
blx ip @ s->fft_calc(s, output)
|
||||
mov a1, OUT
|
||||
bl ff_fft16_vfp
|
||||
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
|
||||
fmxr FPSCR, lr
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user