mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
arm: add ff_int32_to_float_fmul_array8_neon
Quite a bit faster than int32_to_float_fmul_array8_c calling ff_int32_to_float_fmul_scalar_neon through FmtConvertContext.
Number of cycles per int32_to_float_fmul_array8 call while decoding padded.dts on exynos5422:
             before  after  change
cortex-a7:     1270    951    -25%
cortex-a15:     434    285    -34%
checkasm --bench cycle counts:
                                     cortex-a15  cortex-a7
int32_to_float_fmul_array8_c:            1730.4     4384.5
int32_to_float_fmul_array8_neon_c:        571.5     1694.3
int32_to_float_fmul_array8_neon:          374.0     1448.8
The interesting differences are those between int32_to_float_fmul_array8_neon_c and int32_to_float_fmul_array8_neon. The former is the current behaviour of calling ff_int32_to_float_fmul_scalar_neon repeatedly from the C function. The raw numbers differ between the two measurements since checkasm uses different lengths than the dca decoder.
This commit is contained in:
parent
a0fc780a20
commit
90b1b9350c
@ -25,6 +25,9 @@
|
|||||||
#include "libavcodec/avcodec.h"
|
#include "libavcodec/avcodec.h"
|
||||||
#include "libavcodec/fmtconvert.h"
|
#include "libavcodec/fmtconvert.h"
|
||||||
|
|
||||||
|
void ff_int32_to_float_fmul_array8_neon(FmtConvertContext *c, float *dst,
|
||||||
|
const int32_t *src, const float *mul,
|
||||||
|
int len);
|
||||||
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src,
|
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src,
|
||||||
float mul, int len);
|
float mul, int len);
|
||||||
|
|
||||||
@ -46,6 +49,7 @@ av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (have_neon(cpu_flags)) {
|
if (have_neon(cpu_flags)) {
|
||||||
|
c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_neon;
|
||||||
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
|
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* ARM NEON optimised Format Conversion Utils
|
* ARM NEON optimised Format Conversion Utils
|
||||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||||
|
* Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
|
||||||
*
|
*
|
||||||
* This file is part of Libav.
|
* This file is part of Libav.
|
||||||
*
|
*
|
||||||
@ -49,3 +50,39 @@ NOVFP len .req r3
|
|||||||
bx lr
|
bx lr
|
||||||
.unreq len
|
.unreq len
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
|
/*
 * Convert int32 values to float and scale them, using one multiplier from
 * the mul array per group of 8 values.
 *
 * Going by the C prototype (c, dst, src, mul, len) and the register usage
 * below: r1 = dst, r2 = src, r3 = mul, and len is read from the stack.
 * The value passed in r0 is never read; r0 is reused as the group counter.
 *
 * The :128 qualifiers on the src/dst accesses state 16-byte alignment.
 * NOTE(review): len is presumably a multiple of 8 and at least 8 -- the
 * remainder of len / 8 is dropped by the shift, and a group count of 0
 * would enter the main loop and process 16 values. Confirm with callers.
 */
function ff_int32_to_float_fmul_array8_neon, export=1
|
||||||
|
/* r0 = len, taken from the stack */
ldr r0, [sp]
|
||||||
|
/* r0 = len / 8 = number of 8-value groups */
lsr r0, r0, #3
|
||||||
|
/* r0 = groups - 1; exactly one group goes straight to the tail at 1: */
subs r0, r0, #1
|
||||||
|
beq 1f
|
||||||
|
/* main loop: two groups (16 values, two multipliers) per iteration */
2:
|
||||||
|
vld1.32 {q0-q1}, [r2,:128]!
|
||||||
|
vld1.32 {q2-q3}, [r2,:128]!
|
||||||
|
/* d20 = two consecutive multipliers, mul pointer advanced past them */
vld1.32 {d20}, [r3]!
|
||||||
|
/* two groups consumed; the resulting flags are tested after the stores */
subs r0, r0, #2
|
||||||
|
vcvt.f32.s32 q0, q0
|
||||||
|
vcvt.f32.s32 q1, q1
|
||||||
|
/* q8 = first multiplier in all lanes (scales q0/q1) */
vdup.32 q8, d20[0]
|
||||||
|
vcvt.f32.s32 q2, q2
|
||||||
|
vcvt.f32.s32 q3, q3
|
||||||
|
vmul.f32 q0, q0, q8
|
||||||
|
/* q9 = second multiplier in all lanes (scales q2/q3) */
vdup.32 q9, d20[1]
|
||||||
|
vmul.f32 q1, q1, q8
|
||||||
|
vmul.f32 q2, q2, q9
|
||||||
|
vmul.f32 q3, q3, q9
|
||||||
|
vst1.32 {q0-q1}, [r1,:128]!
|
||||||
|
vst1.32 {q2-q3}, [r1,:128]!
|
||||||
|
/* r0 > 0: at least two groups left, run the main loop again */
bgt 2b
|
||||||
|
/* r0 < 0: nothing left, return; r0 == 0 falls through to the last group */
it lt
|
||||||
|
bxlt lr
|
||||||
|
/* tail: a single remaining group of 8 values */
1:
|
||||||
|
vld1.32 {q0-q1}, [r2,:128]
|
||||||
|
/* load one multiplier and replicate it to all lanes of q8 */
vld1.32 {d16[],d17[]}, [r3]
|
||||||
|
vcvt.f32.s32 q0, q0
|
||||||
|
vcvt.f32.s32 q1, q1
|
||||||
|
vmul.f32 q0, q0, q8
|
||||||
|
vmul.f32 q1, q1, q8
|
||||||
|
vst1.32 {q0-q1}, [r1,:128]
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
Loading…
Reference in New Issue
Block a user