1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-23 12:43:46 +02:00

aarch64: vp8: Optimize put_epel16_h6v6 with vp8_epel8_v6_y2

This makes it similar to put_epel16_v6, and gives a large speedup
on Cortex A53, a minor speedup on A72 and a very minor slowdown on
A73.

Before:                 Cortex A53     A72     A73
vp8_put_epel16_h6v6_neon:   2211.4  1586.5  1431.7
After:
vp8_put_epel16_h6v6_neon:   1736.9  1522.0  1448.1

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Martin Storsjö 2019-02-01 09:47:30 +02:00
parent cef914e083
commit 37394ef01b

View File

@ -769,23 +769,6 @@ endfunc
sqrshrun2 \d0\().16b, v22.8h, #7 sqrshrun2 \d0\().16b, v22.8h, #7
.endm .endm
// vp8_epel8_v6: weighted sum of six 8-lane inputs, narrowed to 8 bytes.
//   d0      - destination; only its .8b half is written, and only by the
//             final instruction, so it may name the same register as a source
//   s0..s5  - six source vectors, read as .8b. Each one is widened to .8h in
//             place, and s2/s3 are then reused as accumulators, so no source
//             register holds its input value on exit.
// The six multipliers are read from v0.h[0]..v0.h[5]. The terms for s1 and s4
// are subtracted (mls); the other four are added.
// NOTE(review): presumably v0 holds the 6-tap filter coefficients as
// magnitudes, loaded by the caller - confirm against the call sites.
.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
// Zero-extend every source from 8-bit to 16-bit lanes.
uxtl \s2\().8h, \s2\().8b
uxtl \s3\().8h, \s3\().8b
uxtl \s1\().8h, \s1\().8b
uxtl \s4\().8h, \s4\().8b
uxtl \s0\().8h, \s0\().8b
uxtl \s5\().8h, \s5\().8b
// Build two independent partial sums, interleaved instruction by instruction:
//   s2 = s2*v0.h[2] - s1*v0.h[1] + s0*v0.h[0]
//   s3 = s3*v0.h[3] - s4*v0.h[4] + s5*v0.h[5]
mul \s2\().8h, \s2\().8h, v0.h[2]
mul \s3\().8h, \s3\().8h, v0.h[3]
mls \s2\().8h, \s1\().8h, v0.h[1]
mls \s3\().8h, \s4\().8h, v0.h[4]
mla \s2\().8h, \s0\().8h, v0.h[0]
mla \s3\().8h, \s5\().8h, v0.h[5]
// Add the two partial sums with signed saturation.
sqadd \s3\().8h, \s2\().8h, \s3\().8h
// Rounding shift right by 7, then narrow to 8-bit with unsigned saturation.
sqrshrun \d0\().8b, \s3\().8h, #7
.endm
.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6 .macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
uxtl \s0\().8h, \s0\().8b uxtl \s0\().8h, \s0\().8b
uxtl \s3\().8h, \s3\().8b uxtl \s3\().8h, \s3\().8b
@ -942,15 +925,18 @@ function ff_put_vp8_epel16_h6v6_neon, export=1
2: 2:
ld1 {v1.8b - v4.8b}, [x7], #32 ld1 {v1.8b - v4.8b}, [x7], #32
ld1 {v16.8b - v19.8b}, [x7], #32 ld1 {v16.8b - v19.8b}, [x7], #32
ld1 {v20.8b - v23.8b}, [x7] ld1 {v20.8b - v23.8b}, [x7], #32
sub x7, x7, #48 ld1 {v24.8b - v25.8b}, [x7]
sub x7, x7, #64
vp8_epel8_v6 v5, v1, v3, v16, v18, v20, v22 vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
vp8_epel8_v6 v2, v2, v4, v17, v19, v21, v23 vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
trn1 v2.2d, v5.2d, v2.2d trn1 v1.2d, v1.2d, v2.2d
trn1 v3.2d, v3.2d, v4.2d
st1 {v2.16b}, [x0], x1 st1 {v1.16b}, [x0], x1
subs x4, x4, #1 st1 {v3.16b}, [x0], x1
subs x4, x4, #2
b.ne 2b b.ne 2b
add sp, sp, #336+16 add sp, sp, #336+16