1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-08-10 06:10:52 +02:00

avcodec/aarch64/opusdsp_neon: Simplify opus_postfilter_neon

This change removes one extra floating point operation and simplifies
the load operations at the beginning of the loop by using a dedicated
register for each of the 5 pointers and interleaving the loads with the
calculations. The first case (postfilter_15) seems to be a bit slower on
A78 and x13s, but the performance increase is substantial in the other
two cases.

A78 before:
postfilter_15_neon:                                   1684.8 ( 4.23x)
postfilter_512_neon:                                  1395.5 ( 5.10x)
postfilter_1022_neon:                                 1357.0 ( 5.25x)

After:
postfilter_15_neon:                                   1742.2 ( 4.09x)
postfilter_512_neon:                                  1169.8 ( 6.09x)
postfilter_1022_neon:                                 1160.0 ( 6.12x)

A72 before:
postfilter_15_neon:                                   3144.8 ( 2.39x)
postfilter_512_neon:                                  3141.2 ( 2.39x)
postfilter_1022_neon:                                 3230.0 ( 2.33x)

After:
postfilter_15_neon:                                   2847.8 ( 2.64x)
postfilter_512_neon:                                  2877.8 ( 2.61x)
postfilter_1022_neon:                                 2837.2 ( 2.65x)

x13s before:
postfilter_15_neon:                                   1615.4 ( 2.61x)
postfilter_512_neon:                                   963.1 ( 4.39x)
postfilter_1022_neon:                                  963.6 ( 4.39x)

After:
postfilter_15_neon:                                   1749.6 ( 2.41x)
postfilter_512_neon:                                   707.1 ( 5.97x)
postfilter_1022_neon:                                  706.1 ( 5.99x)

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Krzysztof Pyrkosz
2025-02-07 20:42:11 +01:00
committed by Martin Storsjö
parent e75a0f3c75
commit 9fb97215df

View File

@@ -55,35 +55,28 @@ endfunc
function ff_opus_postfilter_neon, export=1 function ff_opus_postfilter_neon, export=1
ld1 {v0.4s}, [x2] ld1 {v0.4s}, [x2]
sub x5, x0, w1, sxtw #2
sub x1, x5, #8
dup v1.4s, v0.s[1] dup v1.4s, v0.s[1]
dup v2.4s, v0.s[2] dup v2.4s, v0.s[2]
dup v0.4s, v0.s[0] dup v0.4s, v0.s[0]
add w1, w1, #2 ld1 {v3.4s}, [x1], #16
sub x1, x0, x1, lsl #2 sub x4, x5, #4
add x6, x5, #4
ld1 {v3.4s}, [x1]
fmul v3.4s, v3.4s, v2.4s fmul v3.4s, v3.4s, v2.4s
1: add x1, x1, #4 1: ld1 {v7.4s}, [x1], #16
ld1 {v4.4s}, [x1] ld1 {v4.4s}, [x4], #16
add x1, x1, #4
ld1 {v5.4s}, [x1]
add x1, x1, #4
ld1 {v6.4s}, [x1]
add x1, x1, #4
ld1 {v7.4s}, [x1]
fmla v3.4s, v7.4s, v2.4s fmla v3.4s, v7.4s, v2.4s
ld1 {v6.4s}, [x6], #16
ld1 {v5.4s}, [x5], #16
fadd v6.4s, v6.4s, v4.4s fadd v6.4s, v6.4s, v4.4s
fmla v3.4s, v5.4s, v0.4s
ld1 {v4.4s}, [x0] ld1 {v4.4s}, [x0]
fmla v4.4s, v5.4s, v0.4s fmla v3.4s, v6.4s, v1.4s
fadd v4.4s, v4.4s, v3.4s
fmul v6.4s, v6.4s, v1.4s
fadd v6.4s, v6.4s, v3.4s
fadd v4.4s, v4.4s, v6.4s
fmul v3.4s, v7.4s, v2.4s fmul v3.4s, v7.4s, v2.4s
st1 {v4.4s}, [x0], #16 st1 {v4.4s}, [x0], #16