mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-26 19:01:44 +02:00
arm: Add VFP-accelerated version of qmf_32_subbands
              Before           After
              Mean     StdDev  Mean     StdDev  Change
This function  1323.0   98.0     746.2   60.6   +77.3%
Overall       15400.0  336.4   14147.5  288.4    +8.9%

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
800ffab48a
commit
ff30d12159
@ -26,6 +26,12 @@
|
||||
|
||||
void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
|
||||
int decifactor, float scale);
|
||||
void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
|
||||
SynthFilterContext *synth, FFTContext *imdct,
|
||||
float synth_buf_ptr[512],
|
||||
int *synth_buf_offset, float synth_buf2[32],
|
||||
const float window[512], float *samples_out,
|
||||
float raXin[32], float scale);
|
||||
void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
|
||||
int decifactor, float scale);
|
||||
|
||||
@ -33,8 +39,10 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
|
||||
if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
|
||||
s->lfe_fir = ff_dca_lfe_fir_vfp;
|
||||
s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
|
||||
}
|
||||
if (have_neon(cpu_flags))
|
||||
s->lfe_fir = ff_dca_lfe_fir_neon;
|
||||
}
|
||||
|
@ -218,3 +218,276 @@ endfunc
|
||||
.unreq POST1
|
||||
.unreq POST2
|
||||
.unreq POST3
|
||||
|
||||
|
||||
@ Register aliases for ff_dca_qmf_32_subbands_vfp (released again with .unreq
@ after the function).
IN .req a1 @ samples_in: base pointer for the input loads, advanced 4 rows per main-loop pass
|
||||
SBACT .req a2 @ sb_act: number of input rows to copy before zero-filling
|
||||
OLDFPSCR .req a3 @ caller's FPSCR, saved while short-vector mode is active
|
||||
IMDCT .req a4 @ imdct on entry; the function reloads it from the saved-register area, not via this alias
|
||||
WINDOW .req v1 @ window argument, loaded from the caller's stacked arguments
|
||||
OUT .req v2 @ samples_out argument, advanced by 32 floats per filter call
|
||||
BUF .req v3 @ cursor into the 8*32-float scratch buffer allocated on the stack
|
||||
SCALEINT .req v4 @ only used in softfp case
|
||||
COUNT .req v5 @ packed counter while filling the buffer, then the 8-iteration call counter
|
||||
|
||||
|
||||
SCALE .req s0 @ scale argument as received in s0 (hardfp case)
|
||||
|
||||
/* Stack layout differs in softfp and hardfp cases:
|
||||
*
|
||||
* hardfp
|
||||
* fp -> 6 arg words saved by caller
|
||||
* a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
|
||||
* s16-s23 on entry
|
||||
* align 16
|
||||
* buf -> 8*32*4 bytes buffer
|
||||
* s0 on entry
|
||||
* sp -> 3 arg words for callee
|
||||
*
|
||||
* softfp
|
||||
* fp -> 7 arg words saved by caller
|
||||
* a4,v1-v5,fp,lr on entry
|
||||
* s16-s23 on entry
|
||||
* align 16
|
||||
* buf -> 8*32*4 bytes buffer
|
||||
* sp -> 4 arg words for callee
|
||||
*/
|
||||
|
||||
/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
|
||||
* SynthFilterContext *synth, FFTContext *imdct,
|
||||
* float synth_buf_ptr[512],
|
||||
* int *synth_buf_offset, float synth_buf2[32],
|
||||
* const float window[512], float *samples_out,
|
||||
* float raXin[32], float scale);
|
||||
*/
|
||||
@ Overview:
@  1. Allocate a 16-byte-aligned 8x32-float scratch buffer on the stack.
@  2. Transpose the first sb_act rows of samples_in[32][8] into it
@     (buf[col][row] = in[row][col]), negating rows 0 and 3 of every group of
@     four input rows, and zero-fill the rest of each 32-float output row.
@  3. Call ff_synth_filter_float_vfp once per 32-float buffer row (8 calls),
@     advancing samples_out by 32 floats after each call.
@ The synth argument (a3 on entry) is not used; raXin is not read either.
function ff_dca_qmf_32_subbands_vfp, export=1
|
||||
VFP push {a3-a4,v1-v3,v5,fp,lr} @ hardfp: 8 words (a3 is only padding, see layout comment)
|
||||
NOVFP push {a4,v1-v5,fp,lr} @ softfp: 8 words (v4 is needed for SCALEINT)
|
||||
add fp, sp, #8*4 @ fp -> first stacked argument (skip the 8 words just pushed)
|
||||
vpush {s16-s23} @ s16-s23 are used as scratch below; restored by the vpop before return
|
||||
@ The buffer pointed at by raXin isn't big enough for us to do a
|
||||
@ complete matrix transposition as we want to, so allocate an
|
||||
@ alternative buffer from the stack. Align to 4 words for speed.
|
||||
sub BUF, sp, #8*32*4
|
||||
bic BUF, BUF, #15
|
||||
mov sp, BUF @ sp = BUF = start of the scratch buffer
|
||||
ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
|
||||
fmrx OLDFPSCR, FPSCR @ remember the caller's FPSCR; restored at label 6
|
||||
fmxr FPSCR, lr
|
||||
@ COUNT is used to count down 2 things at once:
|
||||
@ bits 0-4 are the number of word pairs remaining in the output row
|
||||
@ bits 5-31 are the number of words to copy (with possible negation)
|
||||
@ from the source matrix before we start zeroing the remainder
|
||||
mov COUNT, #(-4 << 5) + 16
|
||||
adds COUNT, COUNT, SBACT, lsl #5 @ COUNT = ((sb_act - 4) << 5) + 16
|
||||
bmi 2f @ sb_act < 4: skip the 4-rows-at-a-time loop
|
||||
1: @ main loop: transpose 4 input rows (4 x 8 floats) per pass
|
||||
vldr s8, [IN, #(0*8+0)*4]
|
||||
vldr s10, [IN, #(0*8+1)*4]
|
||||
vldr s12, [IN, #(0*8+2)*4]
|
||||
vldr s14, [IN, #(0*8+3)*4]
|
||||
vldr s16, [IN, #(0*8+4)*4]
|
||||
vldr s18, [IN, #(0*8+5)*4]
|
||||
vldr s20, [IN, #(0*8+6)*4]
|
||||
vldr s22, [IN, #(0*8+7)*4]
|
||||
vneg.f s8, s8 @ short-vector op: negates s8,s10,s12,s14 (row 0, columns 0-3)
|
||||
vldr s9, [IN, #(1*8+0)*4]
|
||||
vldr s11, [IN, #(1*8+1)*4]
|
||||
vldr s13, [IN, #(1*8+2)*4]
|
||||
vldr s15, [IN, #(1*8+3)*4]
|
||||
vneg.f s16, s16 @ negates s16,s18,s20,s22 (row 0, columns 4-7)
|
||||
vldr s17, [IN, #(1*8+4)*4]
|
||||
vldr s19, [IN, #(1*8+5)*4]
|
||||
vldr s21, [IN, #(1*8+6)*4]
|
||||
vldr s23, [IN, #(1*8+7)*4]
|
||||
vstr d4, [BUF, #(0*32+0)*4] @ d4 = {s8,s9}: column 0 of rows 0,1 -> words 0-1 of output row 0
|
||||
vstr d5, [BUF, #(1*32+0)*4]
|
||||
vstr d6, [BUF, #(2*32+0)*4]
|
||||
vstr d7, [BUF, #(3*32+0)*4]
|
||||
vstr d8, [BUF, #(4*32+0)*4]
|
||||
vstr d9, [BUF, #(5*32+0)*4]
|
||||
vstr d10, [BUF, #(6*32+0)*4]
|
||||
vstr d11, [BUF, #(7*32+0)*4]
|
||||
vldr s9, [IN, #(3*8+0)*4]
|
||||
vldr s11, [IN, #(3*8+1)*4]
|
||||
vldr s13, [IN, #(3*8+2)*4]
|
||||
vldr s15, [IN, #(3*8+3)*4]
|
||||
vldr s17, [IN, #(3*8+4)*4]
|
||||
vldr s19, [IN, #(3*8+5)*4]
|
||||
vldr s21, [IN, #(3*8+6)*4]
|
||||
vldr s23, [IN, #(3*8+7)*4]
|
||||
vneg.f s9, s9 @ negates s9,s11,s13,s15 (row 3, columns 0-3)
|
||||
vldr s8, [IN, #(2*8+0)*4]
|
||||
vldr s10, [IN, #(2*8+1)*4]
|
||||
vldr s12, [IN, #(2*8+2)*4]
|
||||
vldr s14, [IN, #(2*8+3)*4]
|
||||
vneg.f s17, s17 @ negates s17,s19,s21,s23 (row 3, columns 4-7)
|
||||
vldr s16, [IN, #(2*8+4)*4]
|
||||
vldr s18, [IN, #(2*8+5)*4]
|
||||
vldr s20, [IN, #(2*8+6)*4]
|
||||
vldr s22, [IN, #(2*8+7)*4]
|
||||
vstr d4, [BUF, #(0*32+2)*4] @ d4 = {s8,s9}: column 0 of rows 2,3 -> words 2-3 of output row 0
|
||||
vstr d5, [BUF, #(1*32+2)*4]
|
||||
vstr d6, [BUF, #(2*32+2)*4]
|
||||
vstr d7, [BUF, #(3*32+2)*4]
|
||||
vstr d8, [BUF, #(4*32+2)*4]
|
||||
vstr d9, [BUF, #(5*32+2)*4]
|
||||
vstr d10, [BUF, #(6*32+2)*4]
|
||||
vstr d11, [BUF, #(7*32+2)*4]
|
||||
add IN, IN, #4*8*4 @ next 4 input rows
|
||||
add BUF, BUF, #4*4 @ 4 words further along every output row
|
||||
subs COUNT, COUNT, #(4 << 5) + 2 @ 4 fewer rows to copy, 2 fewer word pairs left per output row
|
||||
bpl 1b @ loop while at least 4 more rows remain
|
||||
2: @ Now deal with trailing < 4 samples
|
||||
adds COUNT, COUNT, #3 << 5 @ bits 5-31 become (rows remaining - 1)
|
||||
bmi 4f @ sb_act was a multiple of 4
|
||||
bics lr, COUNT, #0x1F @ lr = COUNT with bits 0-4 cleared; zero iff exactly 1 row remains
|
||||
bne 3f
|
||||
@ sb_act was n*4+1
|
||||
vldr s8, [IN, #(0*8+0)*4]
|
||||
vldr s10, [IN, #(0*8+1)*4]
|
||||
vldr s12, [IN, #(0*8+2)*4]
|
||||
vldr s14, [IN, #(0*8+3)*4]
|
||||
vldr s16, [IN, #(0*8+4)*4]
|
||||
vldr s18, [IN, #(0*8+5)*4]
|
||||
vldr s20, [IN, #(0*8+6)*4]
|
||||
vldr s22, [IN, #(0*8+7)*4]
|
||||
vneg.f s8, s8
|
||||
vldr s9, zero @ odd lanes = 0: there is no second row to pair with
|
||||
vldr s11, zero
|
||||
vldr s13, zero
|
||||
vldr s15, zero
|
||||
vneg.f s16, s16
|
||||
vldr s17, zero
|
||||
vldr s19, zero
|
||||
vldr s21, zero
|
||||
vldr s23, zero
|
||||
vstr d4, [BUF, #(0*32+0)*4]
|
||||
vstr d5, [BUF, #(1*32+0)*4]
|
||||
vstr d6, [BUF, #(2*32+0)*4]
|
||||
vstr d7, [BUF, #(3*32+0)*4]
|
||||
vstr d8, [BUF, #(4*32+0)*4]
|
||||
vstr d9, [BUF, #(5*32+0)*4]
|
||||
vstr d10, [BUF, #(6*32+0)*4]
|
||||
vstr d11, [BUF, #(7*32+0)*4]
|
||||
add BUF, BUF, #2*4
|
||||
sub COUNT, COUNT, #1 @ one word pair written to each output row
|
||||
b 4f
|
||||
3: @ sb_act was n*4+2 or n*4+3, so do the first 2
|
||||
vldr s8, [IN, #(0*8+0)*4]
|
||||
vldr s10, [IN, #(0*8+1)*4]
|
||||
vldr s12, [IN, #(0*8+2)*4]
|
||||
vldr s14, [IN, #(0*8+3)*4]
|
||||
vldr s16, [IN, #(0*8+4)*4]
|
||||
vldr s18, [IN, #(0*8+5)*4]
|
||||
vldr s20, [IN, #(0*8+6)*4]
|
||||
vldr s22, [IN, #(0*8+7)*4]
|
||||
vneg.f s8, s8
|
||||
vldr s9, [IN, #(1*8+0)*4]
|
||||
vldr s11, [IN, #(1*8+1)*4]
|
||||
vldr s13, [IN, #(1*8+2)*4]
|
||||
vldr s15, [IN, #(1*8+3)*4]
|
||||
vneg.f s16, s16
|
||||
vldr s17, [IN, #(1*8+4)*4]
|
||||
vldr s19, [IN, #(1*8+5)*4]
|
||||
vldr s21, [IN, #(1*8+6)*4]
|
||||
vldr s23, [IN, #(1*8+7)*4]
|
||||
vstr d4, [BUF, #(0*32+0)*4]
|
||||
vstr d5, [BUF, #(1*32+0)*4]
|
||||
vstr d6, [BUF, #(2*32+0)*4]
|
||||
vstr d7, [BUF, #(3*32+0)*4]
|
||||
vstr d8, [BUF, #(4*32+0)*4]
|
||||
vstr d9, [BUF, #(5*32+0)*4]
|
||||
vstr d10, [BUF, #(6*32+0)*4]
|
||||
vstr d11, [BUF, #(7*32+0)*4]
|
||||
add BUF, BUF, #2*4
|
||||
sub COUNT, COUNT, #(2 << 5) + 1 @ 2 rows consumed, 1 word pair written per output row
|
||||
bics lr, COUNT, #0x1F @ zero iff a third trailing row remains
|
||||
bne 4f
|
||||
@ sb_act was n*4+3
|
||||
vldr s8, [IN, #(2*8+0)*4] @ third trailing row: copied without negation
|
||||
vldr s10, [IN, #(2*8+1)*4]
|
||||
vldr s12, [IN, #(2*8+2)*4]
|
||||
vldr s14, [IN, #(2*8+3)*4]
|
||||
vldr s16, [IN, #(2*8+4)*4]
|
||||
vldr s18, [IN, #(2*8+5)*4]
|
||||
vldr s20, [IN, #(2*8+6)*4]
|
||||
vldr s22, [IN, #(2*8+7)*4]
|
||||
vldr s9, zero
|
||||
vldr s11, zero
|
||||
vldr s13, zero
|
||||
vldr s15, zero
|
||||
vldr s17, zero
|
||||
vldr s19, zero
|
||||
vldr s21, zero
|
||||
vldr s23, zero
|
||||
vstr d4, [BUF, #(0*32+0)*4]
|
||||
vstr d5, [BUF, #(1*32+0)*4]
|
||||
vstr d6, [BUF, #(2*32+0)*4]
|
||||
vstr d7, [BUF, #(3*32+0)*4]
|
||||
vstr d8, [BUF, #(4*32+0)*4]
|
||||
vstr d9, [BUF, #(5*32+0)*4]
|
||||
vstr d10, [BUF, #(6*32+0)*4]
|
||||
vstr d11, [BUF, #(7*32+0)*4]
|
||||
add BUF, BUF, #2*4
|
||||
sub COUNT, COUNT, #1
|
||||
4: @ Now fill the remainder with 0
|
||||
vldr s8, zero
|
||||
vldr s9, zero
|
||||
ands COUNT, COUNT, #0x1F @ keep only the word-pair counter (bits 0-4)
|
||||
beq 6f @ every output row is already full
|
||||
5: vstr d4, [BUF, #(0*32+0)*4]
|
||||
vstr d4, [BUF, #(1*32+0)*4]
|
||||
vstr d4, [BUF, #(2*32+0)*4]
|
||||
vstr d4, [BUF, #(3*32+0)*4]
|
||||
vstr d4, [BUF, #(4*32+0)*4]
|
||||
vstr d4, [BUF, #(5*32+0)*4]
|
||||
vstr d4, [BUF, #(6*32+0)*4]
|
||||
vstr d4, [BUF, #(7*32+0)*4]
|
||||
add BUF, BUF, #2*4
|
||||
subs COUNT, COUNT, #1
|
||||
bne 5b
|
||||
6:
|
||||
fmxr FPSCR, OLDFPSCR @ restore the caller's FPSCR before calling out
|
||||
ldr WINDOW, [fp, #3*4] @ 4th stacked argument: window
|
||||
ldr OUT, [fp, #4*4] @ 5th stacked argument: samples_out
|
||||
sub BUF, BUF, #32*4 @ BUF advanced 32 words in total while filling; rewind to row 0
|
||||
NOVFP ldr SCALEINT, [fp, #6*4] @ softfp: scale is the 7th stacked argument
|
||||
mov COUNT, #8 @ COUNT is now a plain counter: 8 buffer rows to filter
|
||||
VFP vpush {SCALE} @ hardfp: park scale on the stack so it can be reloaded for each call
|
||||
VFP sub sp, sp, #3*4
|
||||
NOVFP sub sp, sp, #4*4
|
||||
7:
|
||||
@ NOTE(review): the argument order set up below is presumably
|
||||
@ (imdct, synth_buf_ptr, synth_buf_offset, synth_buf2, window, out, in, scale);
|
||||
@ confirm against the definition of ff_synth_filter_float_vfp.
|
||||
VFP ldr a1, [fp, #-7*4] @ imdct
|
||||
NOVFP ldr a1, [fp, #-8*4]
|
||||
ldmia fp, {a2-a4} @ first three stacked arguments: synth_buf_ptr, synth_buf_offset, synth_buf2
|
||||
VFP stmia sp, {WINDOW, OUT, BUF}
|
||||
NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
|
||||
VFP vldr SCALE, [sp, #3*4] @ reload scale from the slot written by the vpush above
|
||||
bl ff_synth_filter_float_vfp
|
||||
add OUT, OUT, #32*4
|
||||
add BUF, BUF, #32*4
|
||||
subs COUNT, COUNT, #1
|
||||
bne 7b
|
||||
|
||||
|
||||
A sub sp, fp, #(8+8)*4 @ sp -> saved s16-s23 (8 core + 8 VFP words below fp)
|
||||
T sub fp, fp, #(8+8)*4
|
||||
T mov sp, fp
|
||||
vpop {s16-s23}
|
||||
VFP pop {a3-a4,v1-v3,v5,fp,pc}
|
||||
NOVFP pop {a4,v1-v5,fp,pc}
|
||||
endfunc
|
||||
|
||||
@ Release the register aliases defined for ff_dca_qmf_32_subbands_vfp.
.unreq IN
|
||||
.unreq SBACT
|
||||
.unreq OLDFPSCR
|
||||
.unreq IMDCT
|
||||
.unreq WINDOW
|
||||
.unreq OUT
|
||||
.unreq BUF
|
||||
.unreq SCALEINT
|
||||
.unreq COUNT
|
||||
|
||||
|
||||
.unreq SCALE
|
||||
|
||||
|
||||
.align 2 @ word-align the literal below (on ARM, .align 2 means 2^2 bytes)
|
||||
zero: .word 0 @ all-zero word (bit pattern of 0.0f); read with "vldr sN, zero" to zero-fill
|
||||
|
Loading…
Reference in New Issue
Block a user