1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-08-10 06:10:52 +02:00

avcodec/aarch64/ac3dsp_neon.S: Optimize ac3_sum_square_butterfly_int32_neon

Instead of calculating a^2, b^2, (a+b)^2 and (a-b)^2 in each iteration,
accumulate only a^2, b^2 and 2*a*b in the loop, and derive the sums of
(a+b)^2 and (a-b)^2 from these three accumulators at the end.

Before and after:

A78
ac3_sum_square_bufferfly_int32_neon:                   484.8 ( 2.00x)
ac3_sum_square_bufferfly_int32_neon:                   468.2 ( 2.08x)

A72
ac3_sum_square_bufferfly_int32_neon:                   793.6 ( 1.26x)
ac3_sum_square_bufferfly_int32_neon:                   527.3 ( 1.92x)

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Krzysztof Pyrkosz
2025-02-28 22:21:50 +01:00
committed by Martin Storsjö
parent 38929b824b
commit e8d4c55987

View File

@@ -69,21 +69,20 @@ function ff_ac3_sum_square_butterfly_int32_neon, export=1
movi v0.2d, #0 movi v0.2d, #0
movi v1.2d, #0 movi v1.2d, #0
movi v2.2d, #0 movi v2.2d, #0
movi v3.2d, #0
1: ld1 {v4.2s}, [x1], #8 1: ld1 {v4.2s}, [x1], #8
ld1 {v5.2s}, [x2], #8 ld1 {v5.2s}, [x2], #8
add v6.2s, v4.2s, v5.2s
sub v7.2s, v4.2s, v5.2s
smlal v0.2d, v4.2s, v4.2s
smlal v1.2d, v5.2s, v5.2s
smlal v2.2d, v6.2s, v6.2s
smlal v3.2d, v7.2s, v7.2s
subs w3, w3, #2 subs w3, w3, #2
smlal v0.2d, v4.2s, v4.2s // sum of a^2
smlal v1.2d, v5.2s, v5.2s // sum of b^2
sqdmlal v2.2d, v4.2s, v5.2s // sum of 2ab
b.gt 1b b.gt 1b
addp d0, v0.2d addp d0, v0.2d
addp d1, v1.2d addp d1, v1.2d
addp d2, v2.2d addp d2, v2.2d
addp d3, v3.2d sub d3, d0, d2 // a^2 - 2ab
add d2, d0, d2 // a^2 + 2ab
add d3, d3, d1 // a^2 + b^2 - 2ab = sum of (a-b)^2
add d2, d2, d1 // a^2 + b^2 + 2ab = sum of (a+b)^2
st1 {v0.1d-v3.1d}, [x0] st1 {v0.1d-v3.1d}, [x0]
ret ret
endfunc endfunc