FFmpeg/libavcodec/arm/dcadsp_neon.S

/*
 * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ ff_decode_hf_neon
@
@ For every index l with start <= l < end: fetch a 32-bit row number from
@ the index table, load 8 signed bytes from that row of the byte table,
@ convert them to float, multiply by (first word of scale entry l) / 16
@ and store the 8 floats to row l of dst.
@
@ Arguments as consumed below:
@   r0          dst, 32 bytes (8 floats) per row; rows are stored with a
@               :128 hint, so dst must be 16-byte aligned
@   r1          table of 32-bit row numbers, one per index l
@   r2          byte table, 32 bytes per row
@   r3          byte offset added to r2; r2 + r3 must be 8-byte aligned
@               because the row is loaded with a :64 hint
@   stack + 0   scale table, 8 bytes per entry, only the first word is used
@   stack + 4   start
@   stack + 8   end
@ (stack offsets are relative to sp at function entry)
@
@ start and end are compared as signed values. An empty range
@ (start >= end) returns immediately; without this check the do-while
@ loop below would run at least once and then continue until the counter
@ wrapped around to end, reading and writing far outside the tables.
@
@ Uses r4, r5 and lr (saved and restored) plus NEON d0-d7.
function ff_decode_hf_neon, export=1
        push            {r4-r5,lr}              @ 12 bytes pushed: stack args start at sp+12
        add             r2,  r2,  r3            @ r2 = byte table + offset
        ldr             r3,       [sp, #12]     @ r3 = scale table
        ldrd            r4,  r5,  [sp, #16]     @ r4 = start, r5 = end
        cmp             r4,  r5
        bge             2f                      @ empty range: nothing to do
        add             r3,  r3,  r4, lsl #3    @ r3 -> scale entry for start (8 bytes each)
        add             r1,  r1,  r4, lsl #2    @ r1 -> row number for start (4 bytes each)
        add             r0,  r0,  r4, lsl #5    @ r0 -> dst row for start (32 bytes each)

1:      ldr_post        lr,  r1,  #4            @ lr = row number; ldr_post is a macro from asm.S, presumably load + post-increment
        add             r4,  r4,  #1            @ advance index
        add             lr,  r2,  lr, lsl #5    @ lr = address of the 8 source bytes
        cmp             r4,  r5                 @ flags are consumed by the bne at the loop end
        vld1.32         {d7},     [r3]!         @ d7 = scale entry, r3 += 8
        vld1.8          {d0},     [lr,:64]      @ d0 = 8 signed bytes
        vcvt.f32.s32    d7,  d7,  #4            @ fixed point with 4 fraction bits: scale / 16
        vmovl.s8        q1,  d0                 @ widen s8 -> s16
        vmovl.s16       q0,  d2                 @ widen s16 -> s32, samples 0-3
        vmovl.s16       q1,  d3                 @ widen s16 -> s32, samples 4-7
        vcvt.f32.s32    q0,  q0
        vcvt.f32.s32    q1,  q1
        vmul.f32        q0,  q0,  d7[0]         @ scale by lane 0 only
        vmul.f32        q1,  q1,  d7[0]
        vst1.32         {q0-q1},  [r0,:128]!    @ store 8 floats, r0 += 32
        bne             1b                      @ loop until index == end

2:      pop             {r4-r5,pc}
endfunc

@ ff_dca_lfe_fir0_neon: entry point for the decifactor-32 case.
@
@ Saves r4-r6 and lr, sets up the two parameters consumed by the shared
@ dca_lfe_fir body (r3 = outer iteration count, r6 = 256/32 coefficients
@ per output sample) and branches to it. r0-r2 are passed through
@ untouched. The registers pushed here are restored by the pop at the
@ end of the shared body, so the push list must stay in sync with it.
function ff_dca_lfe_fir0_neon, export=1
        push            {r4-r6,lr}
        mov             r6,  #256/32            @ coefficients per output sample
        mov             r3,  #32                @ decifactor
        b               dca_lfe_fir             @ continue in the shared FIR body
endfunc

@ ff_dca_lfe_fir1_neon: entry point for the decifactor-64 case. It sets
@ r3 = 64 and r6 = 256/64 and falls through into the shared dca_lfe_fir
@ body, which is also reached by a branch from ff_dca_lfe_fir0_neon
@ (r3 = 32, r6 = 256/32).
@
@ Register use in the shared body:
@   r0   out, one float stored per outer iteration, post-incremented
@   r1   in; four floats ending at in[0] are loaded at a time, walking
@        backwards through memory
@   r2   cf0, coefficients read forwards, 16-byte aligned (:128 hint)
@   r3   decifactor, used as the outer loop counter
@   r4   out2 = out + decifactor floats, post-incremented like out
@   r5   cf1, starts 16 bytes before offset 256*4 from r2 and is read
@        backwards, 16-byte aligned (:128 hint)
@   r6   coefficients per output sample; the inner loop counts it down
@        in steps of 4, so it must be a positive multiple of 4
@   r12  inner loop counter
@   lr   constant -16, the post-index stride for the backward loads
@   q2   v0 partial sums (d4, d5), q3 = v1 partial sums (d6, d7)
function ff_dca_lfe_fir1_neon, export=1
        push            {r4-r6,lr}
        mov             r3,  #64                @ decifactor
        mov             r6,  #256/64
dca_lfe_fir:
        add             r4,  r0,  r3,  lsl #2   @ out2
        add             r5,  r2,  #256*4-16     @ cf1
        @ Point r1 at in[-3] so that one q load covers in[-3] .. in[0].
        sub             r1,  r1,  #12
        mov             lr,  #-16
1:
        @ Outer loop: one pair of output samples (out, out2) per pass.
        vmov.f32        q2,  #0.0               @ v0
        vmov.f32        q3,  #0.0               @ v1
        mov             r12, r6
2:
        @ Inner loop: 4 coefficients from each end per pass.
        vld1.32         {q8},     [r2,:128]!    @ cf0
        vld1.32         {q9},     [r5,:128], lr @ cf1
        vld1.32         {q1},     [r1], lr      @ in
        @ Flags set here are consumed by the bne at the end of the inner loop.
        subs            r12, r12, #4
        @ Swap the two floats inside each 64-bit half of cf0; together with
        @ the crossed d21/d20 operands below this pairs in[0] with the
        @ first coefficient of the group and in[-3] with the last.
        vrev64.32       q10, q8
        @ v1 += in * cf1, lane by lane (cf1 is already in memory order).
        vmla.f32        q3,  q1,  q9
        vmla.f32        d4,  d2,  d21
        vmla.f32        d5,  d3,  d20
        bne             2b

        @ Undo the backward walk of r1 (r6/4 loads of 16 bytes = r6*4 bytes)
        @ so that every outer pass starts from the same input position.
        add             r1,  r1,  r6,  lsl #2
        @ Flags set here are consumed by the bne at the end of the outer loop.
        subs            r3,  r3,  #1
        @ Horizontal reduction: d5[0] = sum of v0 lanes, d5[1] = sum of v1 lanes.
        vadd.f32        d4,  d4,  d5
        vadd.f32        d6,  d6,  d7
        vpadd.f32       d5,  d4,  d6
        vst1.32         {d5[0]},  [r0,:32]!
        vst1.32         {d5[1]},  [r4,:32]!
        bne             1b

        @ Restores the registers pushed by either entry point and returns.
        pop             {r4-r6,pc}
endfunc
DCA: ARM/NEON optimised lfe_fir Originally committed as revision 22863 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-04-12 23:45:33 +03:00			`/*`
			`* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>`
			`*`
			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* FFmpeg is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with FFmpeg; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

ARM: Move asm.S from libavcodec to libavutil This will allow for easier implementation of ARM-optimized functions in libraries other than libavcodec. 2012-05-21 22:46:23 +03:00			`#include "libavutil/arm/asm.S"`
DCA: ARM/NEON optimised lfe_fir Originally committed as revision 22863 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-04-12 23:45:33 +03:00
arm: dcadsp: implement decode_hf as external NEON asm 2014-02-22 20:27:10 +03:00			`function ff_decode_hf_neon, export=1`
			`push {r4-r5,lr}`
			`add r2, r2, r3`
			`ldr r3, [sp, #12]`
			`ldrd r4, r5, [sp, #16]`
			`add r3, r3, r4, lsl #3`
			`add r1, r1, r4, lsl #2`
			`add r0, r0, r4, lsl #5`

			`1: ldr_post lr, r1, #4`
			`add r4, r4, #1`
			`add lr, r2, lr, lsl #5`
			`cmp r4, r5`
			`vld1.32 {d7}, [r3]!`
			`vld1.8 {d0}, [lr,:64]`
			`vcvt.f32.s32 d7, d7, #4`
			`vmovl.s8 q1, d0`
			`vmovl.s16 q0, d2`
			`vmovl.s16 q1, d3`
			`vcvt.f32.s32 q0, q0`
			`vcvt.f32.s32 q1, q1`
			`vmul.f32 q0, q0, d7[0]`
			`vmul.f32 q1, q1, d7[0]`
			`vst1.32 {q0-q1}, [r0,:128]!`
			`bne 1b`

			`pop {r4-r5,pc}`
			`endfunc`

dcadsp: split lfe_dir cases The x86 runs short on registers because numerous elements are not static. In addition, splitting them allows more optimized code, at least for x86. Arm asm changes by Janne Grunau. Signed-off-by: Janne Grunau <janne-libav@jannau.net> 2014-02-06 02:40:52 +03:00			`function ff_dca_lfe_fir0_neon, export=1`
DCA: ARM/NEON optimised lfe_fir Originally committed as revision 22863 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-04-12 23:45:33 +03:00			`push {r4-r6,lr}`
dcadsp: split lfe_dir cases The x86 runs short on registers because numerous elements are not static. In addition, splitting them allows more optimized code, at least for x86. Arm asm changes by Janne Grunau. Signed-off-by: Janne Grunau <janne-libav@jannau.net> 2014-02-06 02:40:52 +03:00			`mov r3, #32 @ decifactor`
			`mov r6, #256/32`
			`b dca_lfe_fir`
			`endfunc`
DCA: ARM/NEON optimised lfe_fir Originally committed as revision 22863 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-04-12 23:45:33 +03:00
dcadsp: split lfe_dir cases The x86 runs short on registers because numerous elements are not static. In addition, splitting them allows more optimized code, at least for x86. Arm asm changes by Janne Grunau. Signed-off-by: Janne Grunau <janne-libav@jannau.net> 2014-02-06 02:40:52 +03:00			`function ff_dca_lfe_fir1_neon, export=1`
			`push {r4-r6,lr}`
			`mov r3, #64 @ decifactor`
			`mov r6, #256/64`
			`dca_lfe_fir:`
DCA: ARM/NEON optimised lfe_fir Originally committed as revision 22863 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-04-12 23:45:33 +03:00			`add r4, r0, r3, lsl #2 @ out2`
			`add r5, r2, #256*4-16 @ cf1`
			`sub r1, r1, #12`
			`mov lr, #-16`
			`1:`
			`vmov.f32 q2, #0.0 @ v0`
			`vmov.f32 q3, #0.0 @ v1`
			`mov r12, r6`
			`2:`
			`vld1.32 {q8}, [r2,:128]! @ cf0`
			`vld1.32 {q9}, [r5,:128], lr @ cf1`
			`vld1.32 {q1}, [r1], lr @ in`
			`subs r12, r12, #4`
			`vrev64.32 q10, q8`
			`vmla.f32 q3, q1, q9`
			`vmla.f32 d4, d2, d21`
			`vmla.f32 d5, d3, d20`
			`bne 2b`

			`add r1, r1, r6, lsl #2`
			`subs r3, r3, #1`
			`vadd.f32 d4, d4, d5`
			`vadd.f32 d6, d6, d7`
dcadec: remove scaling in lfe_interpolation_fir The scaling factor is constant so it is faster to scale the FIR coefficients in the tables during compilation. Signed-off-by: Janne Grunau <janne-libav@jannau.net> 2014-02-14 18:03:06 +03:00			`vpadd.f32 d5, d4, d6`
DCA: ARM/NEON optimised lfe_fir Originally committed as revision 22863 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-04-12 23:45:33 +03:00			`vst1.32 {d5[0]}, [r0,:32]!`
			`vst1.32 {d5[1]}, [r4,:32]!`
			`bne 1b`

			`pop {r4-r6,pc}`
			`endfunc`