/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ TODO: * FFTs wider than 16
@       * dispatch code

function fft4_vfp
        vldr    d0, [a1, #0*2*4]    @ s0,s1   = z[0]
        vldr    d4, [a1, #1*2*4]    @ s8,s9   = z[1]
        vldr    d1, [a1, #2*2*4]    @ s2,s3   = z[2]
        vldr    d5, [a1, #3*2*4]    @ s10,s11 = z[3]
        @ stall
        vadd.f  s12, s0, s8         @ i0
        vadd.f  s13, s1, s9         @ i1
        vadd.f  s14, s2, s10        @ i2
        vadd.f  s15, s3, s11        @ i3
        vsub.f  s8, s0, s8          @ i4
        vsub.f  s9, s1, s9          @ i5
        vsub.f  s10, s2, s10        @ i6
        vsub.f  s11, s3, s11        @ i7
        @ stall
        @ stall
        vadd.f  s0, s12, s14        @ z[0].re
        vsub.f  s4, s12, s14        @ z[2].re
        vadd.f  s1, s13, s15        @ z[0].im
        vsub.f  s5, s13, s15        @ z[2].im
        vadd.f  s7, s9, s10         @ z[3].im
        vsub.f  s3, s9, s10         @ z[1].im
        vadd.f  s2, s8, s11         @ z[1].re
        vsub.f  s6, s8, s11         @ z[3].re
        @ stall
        @ stall
        vstr    d0, [a1, #0*2*4]
        vstr    d2, [a1, #2*2*4]
        @ stall
        @ stall
        vstr    d1, [a1, #1*2*4]
        vstr    d3, [a1, #3*2*4]
        bx      lr
endfunc

.macro macro_fft8_head
        @ FFT4
        vldr    d4, [a1, #0 * 2*4]
        vldr    d6, [a1, #1 * 2*4]
        vldr    d5, [a1, #2 * 2*4]
        vldr    d7, [a1, #3 * 2*4]
        @ BF
        vldr    d12, [a1, #4 * 2*4]
        vadd.f  s16, s8, s12        @ vector op
        vldr    d14, [a1, #5 * 2*4]
        vldr    d13, [a1, #6 * 2*4]
        vldr    d15, [a1, #7 * 2*4]
        vsub.f  s20, s8, s12        @ vector op
        vadd.f  s0, s16, s18
        vsub.f  s2, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s3, s17, s19
        vadd.f  s7, s21, s22
        vsub.f  s5, s21, s22
        vadd.f  s4, s20, s23
        vsub.f  s6, s20, s23
        vsub.f  s20, s24, s28       @ vector op
        vstr    d0, [a1, #0 * 2*4]  @ transfer s0-s7 to s24-s31 via memory
        vstr    d1, [a1, #1 * 2*4]
        vldr    s0, cos1pi4
        vadd.f  s16, s24, s28       @ vector op
        vstr    d2, [a1, #2 * 2*4]
        vstr    d3, [a1, #3 * 2*4]
        vldr    d12, [a1, #0 * 2*4]
        @ TRANSFORM
        vmul.f  s20, s20, s0        @ vector x scalar op
        vldr    d13, [a1, #1 * 2*4]
        vldr    d14, [a1, #2 * 2*4]
        vldr    d15, [a1, #3 * 2*4]
        @ BUTTERFLIES
        vadd.f  s0, s18, s16
        vadd.f  s1, s17, s19
        vsub.f  s2, s17, s19
        vsub.f  s3, s18, s16
        vadd.f  s4, s21, s20
        vsub.f  s5, s21, s20
        vadd.f  s6, s22, s23
        vsub.f  s7, s22, s23
        vadd.f  s8, s0, s24         @ vector op
        vstr    d0, [a1, #0 * 2*4]  @ transfer s0-s3 to s12-s15 via memory
        vstr    d1, [a1, #1 * 2*4]
        vldr    d6, [a1, #0 * 2*4]
        vldr    d7, [a1, #1 * 2*4]
        vadd.f  s1, s5, s6
        vadd.f  s0, s7, s4
        vsub.f  s2, s5, s6
        vsub.f  s3, s7, s4
        vsub.f  s12, s24, s12       @ vector op
        vsub.f  s5, s29, s1
        vsub.f  s4, s28, s0
        vsub.f  s6, s30, s2
        vsub.f  s7, s31, s3
        vadd.f  s16, s0, s28        @ vector op
        vstr    d6, [a1, #4 * 2*4]
        vstr    d7, [a1, #6 * 2*4]
        vstr    d4, [a1, #0 * 2*4]
        vstr    d5, [a1, #2 * 2*4]
        vstr    d2, [a1, #5 * 2*4]
        vstr    d3, [a1, #7 * 2*4]
.endm

.macro macro_fft8_tail
        vstr    d8, [a1, #1 * 2*4]
        vstr    d9, [a1, #3 * 2*4]
.endm

function fft8_vfp
        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
        fmrx    a2, FPSCR
        fmxr    FPSCR, a3
        vpush   {s16-s31}
        macro_fft8_head
        macro_fft8_tail
        vpop    {s16-s31}
        fmxr    FPSCR, a2
        bx      lr
endfunc

.align 3
cos1pi4:    @ cos(1*pi/4) = sqrt(2)/2
        .float  0.707106769084930419921875
cos1pi8:    @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
        .float  0.92387950420379638671875
cos3pi8:    @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
        .float  0.3826834261417388916015625

function ff_fft16_vfp, export=1
        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
        fmrx    a2, FPSCR
        fmxr    FPSCR, a3
        vpush   {s16-s31}
        macro_fft8_head
        @ FFT4(z+8)
        vldr    d10, [a1, #8 * 2*4]
        vldr    d12, [a1, #9 * 2*4]
        vldr    d11, [a1, #10 * 2*4]
        vldr    d13, [a1, #11 * 2*4]
        macro_fft8_tail
        vadd.f  s16, s20, s24       @ vector op
        @ FFT4(z+12)
        vldr    d4, [a1, #12 * 2*4]
        vldr    d6, [a1, #13 * 2*4]
        vldr    d5, [a1, #14 * 2*4]
        vsub.f  s20, s20, s24       @ vector op
        vldr    d7, [a1, #15 * 2*4]
        vadd.f  s0, s16, s18
        vsub.f  s4, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s5, s17, s19
        vadd.f  s7, s21, s22
        vsub.f  s3, s21, s22
        vadd.f  s2, s20, s23
        vsub.f  s6, s20, s23
        vadd.f  s16, s8, s12        @ vector op
        vstr    d0, [a1, #8 * 2*4]
        vstr    d2, [a1, #10 * 2*4]
        vstr    d1, [a1, #9 * 2*4]
        vsub.f  s20, s8, s12
        vstr    d3, [a1, #11 * 2*4]
        @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
        vldr    d12, [a1, #10 * 2*4]
        vadd.f  s0, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s6, s16, s18
        vsub.f  s7, s17, s19
        vsub.f  s3, s21, s22
        vadd.f  s2, s20, s23
        vadd.f  s5, s21, s22
        vsub.f  s4, s20, s23
        vstr    d0, [a1, #12 * 2*4]
        vmov    s0, s6
        @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
        vldr    d6, [a1, #9 * 2*4]
        vstr    d1, [a1, #13 * 2*4]
        vldr    d1, cos1pi4         @ s2 = cos1pi4, s3 = cos1pi8
        vstr    d2, [a1, #15 * 2*4]
        vldr    d7, [a1, #13 * 2*4]
        vadd.f  s4, s25, s24
        vsub.f  s5, s25, s24
        vsub.f  s6, s0, s7
        vadd.f  s7, s0, s7
        vmul.f  s20, s12, s3        @ vector op
        @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
        vldr    d4, [a1, #11 * 2*4]
        vldr    d5, [a1, #15 * 2*4]
        vldr    s1, cos3pi8
        vmul.f  s24, s4, s2         @ vector * scalar op
        vmul.f  s28, s12, s1        @ vector * scalar op
        vmul.f  s12, s8, s1         @ vector * scalar op
        vadd.f  s4, s20, s29
        vsub.f  s5, s21, s28
        vsub.f  s6, s22, s31
        vadd.f  s7, s23, s30
        vmul.f  s8, s8, s3          @ vector * scalar op
        vldr    d8, [a1, #1 * 2*4]
        vldr    d9, [a1, #5 * 2*4]
        vldr    d10, [a1, #3 * 2*4]
        vldr    d11, [a1, #7 * 2*4]
        vldr    d14, [a1, #2 * 2*4]
        vadd.f  s0, s6, s4
        vadd.f  s1, s5, s7
        vsub.f  s2, s5, s7
        vsub.f  s3, s6, s4
        vadd.f  s4, s12, s9
        vsub.f  s5, s13, s8
        vsub.f  s6, s14, s11
        vadd.f  s7, s15, s10
        vadd.f  s12, s0, s16        @ vector op
        vstr    d0, [a1, #1 * 2*4]
        vstr    d1, [a1, #5 * 2*4]
        vldr    d4, [a1, #1 * 2*4]
        vldr    d5, [a1, #5 * 2*4]
        vadd.f  s0, s6, s4
        vadd.f  s1, s5, s7
        vsub.f  s2, s5, s7
        vsub.f  s3, s6, s4
        vsub.f  s8, s16, s8         @ vector op
        vstr    d6, [a1, #1 * 2*4]
        vstr    d7, [a1, #5 * 2*4]
        vldr    d15, [a1, #6 * 2*4]
        vsub.f  s4, s20, s0
        vsub.f  s5, s21, s1
        vsub.f  s6, s22, s2
        vsub.f  s7, s23, s3
        vadd.f  s20, s0, s20        @ vector op
        vstr    d4, [a1, #9 * 2*4]
        @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
        vldr    d6, [a1, #8 * 2*4]
        vstr    d5, [a1, #13 * 2*4]
        vldr    d7, [a1, #12 * 2*4]
        vstr    d2, [a1, #11 * 2*4]
        vldr    d8, [a1, #0 * 2*4]
        vstr    d3, [a1, #15 * 2*4]
        vldr    d9, [a1, #4 * 2*4]
        vadd.f  s0, s26, s24
        vadd.f  s1, s25, s27
        vsub.f  s2, s25, s27
        vsub.f  s3, s26, s24
        vadd.f  s4, s14, s12
        vadd.f  s5, s13, s15
        vsub.f  s6, s13, s15
        vsub.f  s7, s14, s12
        vadd.f  s8, s0, s28         @ vector op
        vstr    d0, [a1, #3 * 2*4]
        vstr    d1, [a1, #7 * 2*4]
        vldr    d6, [a1, #3 * 2*4]
        vldr    d7, [a1, #7 * 2*4]
        vsub.f  s0, s16, s4
        vsub.f  s1, s17, s5
        vsub.f  s2, s18, s6
        vsub.f  s3, s19, s7
        vsub.f  s12, s28, s12       @ vector op
        vadd.f  s16, s4, s16        @ vector op
        vstr    d10, [a1, #3 * 2*4]
        vstr    d11, [a1, #7 * 2*4]
        vstr    d4, [a1, #2 * 2*4]
        vstr    d5, [a1, #6 * 2*4]
        vstr    d0, [a1, #8 * 2*4]
        vstr    d1, [a1, #12 * 2*4]
        vstr    d6, [a1, #10 * 2*4]
        vstr    d7, [a1, #14 * 2*4]
        vstr    d8, [a1, #0 * 2*4]
        vstr    d9, [a1, #4 * 2*4]
        vpop    {s16-s31}
        fmxr    FPSCR, a2
        bx      lr
endfunc