You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-08-04 22:03:09 +02:00
avcodec/aarch64/vvc: Optimize NEON version of vvc_dmvr
This patch replaces blocks of instructions performing rounding and widening shifts with one-liners achieving the same result.

Before and after on A78:

Before:
dmvr_8_12x20_neon: 86.2 ( 6.90x)
dmvr_8_20x12_neon: 94.8 ( 5.93x)
dmvr_8_20x20_neon: 141.5 ( 6.50x)
dmvr_12_12x20_neon: 158.0 ( 3.76x)
dmvr_12_20x12_neon: 151.2 ( 3.73x)
dmvr_12_20x20_neon: 247.2 ( 3.71x)
dmvr_hv_8_12x20_neon: 423.2 ( 3.75x)
dmvr_hv_8_20x12_neon: 434.0 ( 3.69x)
dmvr_hv_8_20x20_neon: 706.0 ( 3.69x)

After:
dmvr_8_12x20_neon: 77.2 ( 7.70x)
dmvr_8_20x12_neon: 66.5 ( 8.49x)
dmvr_8_20x20_neon: 92.2 ( 9.90x)
dmvr_12_12x20_neon: 80.2 ( 7.38x)
dmvr_12_20x12_neon: 58.2 ( 9.59x)
dmvr_12_20x20_neon: 90.0 (10.15x)
dmvr_hv_8_12x20_neon: 369.0 ( 4.34x)
dmvr_hv_8_20x12_neon: 355.8 ( 4.49x)
dmvr_hv_8_20x20_neon: 574.2 ( 4.51x)

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
committed by
Martin Storsjö
parent
d765e5f043
commit
71a91485fa
@ -251,22 +251,18 @@ function ff_vvc_dmvr_8_neon, export=1
|
||||
1:
|
||||
cbz w15, 2f
|
||||
ldr q0, [src], #16
|
||||
uxtl v1.8h, v0.8b
|
||||
uxtl2 v2.8h, v0.16b
|
||||
ushl v1.8h, v1.8h, v16.8h
|
||||
ushl v2.8h, v2.8h, v16.8h
|
||||
ushll v1.8h, v0.8b, #2
|
||||
ushll2 v2.8h, v0.16b, #2
|
||||
stp q1, q2, [dst], #32
|
||||
b 3f
|
||||
2:
|
||||
ldr d0, [src], #8
|
||||
uxtl v1.8h, v0.8b
|
||||
ushl v1.8h, v1.8h, v16.8h
|
||||
ushll v1.8h, v0.8b, #2
|
||||
str q1, [dst], #16
|
||||
3:
|
||||
subs height, height, #1
|
||||
ldr s3, [src], #4
|
||||
uxtl v4.8h, v3.8b
|
||||
ushl v4.4h, v4.4h, v16.4h
|
||||
ushll v4.8h, v3.8b, #2
|
||||
st1 {v4.4h}, [dst], x7
|
||||
|
||||
add src, src, src_stride
|
||||
@ -281,42 +277,24 @@ function ff_vvc_dmvr_12_neon, export=1
|
||||
cmp width, #16
|
||||
sub src_stride, src_stride, x6, lsl #1
|
||||
cset w15, gt // width > 16
|
||||
movi v16.8h, #2 // offset4
|
||||
sub x7, x7, x6, lsl #1
|
||||
1:
|
||||
cbz w15, 2f
|
||||
ldp q0, q1, [src], #32
|
||||
uaddl v2.4s, v0.4h, v16.4h
|
||||
uaddl2 v3.4s, v0.8h, v16.8h
|
||||
uaddl v4.4s, v1.4h, v16.4h
|
||||
uaddl2 v5.4s, v1.8h, v16.8h
|
||||
ushr v2.4s, v2.4s, #2
|
||||
ushr v3.4s, v3.4s, #2
|
||||
ushr v4.4s, v4.4s, #2
|
||||
ushr v5.4s, v5.4s, #2
|
||||
uqxtn v2.4h, v2.4s
|
||||
uqxtn2 v2.8h, v3.4s
|
||||
uqxtn v4.4h, v4.4s
|
||||
uqxtn2 v4.8h, v5.4s
|
||||
urshr v0.8h, v0.8h, #2
|
||||
urshr v1.8h, v1.8h, #2
|
||||
|
||||
stp q2, q4, [dst], #32
|
||||
stp q0, q1, [dst], #32
|
||||
b 3f
|
||||
2:
|
||||
ldr q0, [src], #16
|
||||
uaddl v2.4s, v0.4h, v16.4h
|
||||
uaddl2 v3.4s, v0.8h, v16.8h
|
||||
ushr v2.4s, v2.4s, #2
|
||||
ushr v3.4s, v3.4s, #2
|
||||
uqxtn v2.4h, v2.4s
|
||||
uqxtn2 v2.8h, v3.4s
|
||||
str q2, [dst], #16
|
||||
urshr v0.8h, v0.8h, #2
|
||||
str q0, [dst], #16
|
||||
3:
|
||||
subs height, height, #1
|
||||
ldr d0, [src], #8
|
||||
uaddl v3.4s, v0.4h, v16.4h
|
||||
ushr v3.4s, v3.4s, #2
|
||||
uqxtn v3.4h, v3.4s
|
||||
st1 {v3.4h}, [dst], x7
|
||||
urshr v0.4h, v0.4h, #2
|
||||
st1 {v0.4h}, [dst], x7
|
||||
|
||||
add src, src, src_stride
|
||||
b.ne 1b
|
||||
@ -344,8 +322,6 @@ function ff_vvc_dmvr_hv_8_neon, export=1
|
||||
ldrb w10, [x12]
|
||||
ldrb w11, [x12, #1]
|
||||
sxtw x6, w6
|
||||
movi v30.8h, #(1 << (8 - 7)) // offset1
|
||||
movi v31.8h, #8 // offset2
|
||||
dup v2.8h, w10 // filter_y[0]
|
||||
dup v3.8h, w11 // filter_y[1]
|
||||
|
||||
@ -373,10 +349,8 @@ function ff_vvc_dmvr_hv_8_neon, export=1
|
||||
mul v16.8h, v16.8h, v0.8h
|
||||
mla v6.8h, v7.8h, v1.8h
|
||||
mla v16.8h, v17.8h, v1.8h
|
||||
add v6.8h, v6.8h, v30.8h
|
||||
add v16.8h, v16.8h, v30.8h
|
||||
ushr v6.8h, v6.8h, #(8 - 6)
|
||||
ushr v7.8h, v16.8h, #(8 - 6)
|
||||
urshr v6.8h, v6.8h, #(8 - 6)
|
||||
urshr v7.8h, v16.8h, #(8 - 6)
|
||||
stp q6, q7, [x13], #32
|
||||
|
||||
cbz w10, 3f
|
||||
@ -386,10 +360,8 @@ function ff_vvc_dmvr_hv_8_neon, export=1
|
||||
mul v17.8h, v17.8h, v2.8h
|
||||
mla v16.8h, v6.8h, v3.8h
|
||||
mla v17.8h, v7.8h, v3.8h
|
||||
add v16.8h, v16.8h, v31.8h
|
||||
add v17.8h, v17.8h, v31.8h
|
||||
ushr v16.8h, v16.8h, #4
|
||||
ushr v17.8h, v17.8h, #4
|
||||
urshr v16.8h, v16.8h, #4
|
||||
urshr v17.8h, v17.8h, #4
|
||||
stp q16, q17, [x14], #32
|
||||
b 3f
|
||||
2:
|
||||
@ -400,8 +372,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1
|
||||
uxtl v6.8h, v4.8b
|
||||
mul v6.8h, v6.8h, v0.8h
|
||||
mla v6.8h, v7.8h, v1.8h
|
||||
add v6.8h, v6.8h, v30.8h
|
||||
ushr v6.8h, v6.8h, #(8 - 6)
|
||||
urshr v6.8h, v6.8h, #(8 - 6)
|
||||
str q6, [x13], #16
|
||||
|
||||
cbz w10, 3f
|
||||
@ -409,8 +380,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1
|
||||
ldr q16, [x12], #16
|
||||
mul v16.8h, v16.8h, v2.8h
|
||||
mla v16.8h, v6.8h, v3.8h
|
||||
add v16.8h, v16.8h, v31.8h
|
||||
ushr v16.8h, v16.8h, #4
|
||||
urshr v16.8h, v16.8h, #4
|
||||
str q16, [x14], #16
|
||||
3:
|
||||
ldur s5, [src, #1]
|
||||
@ -419,8 +389,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1
|
||||
uxtl v6.8h, v4.8b
|
||||
mul v6.4h, v6.4h, v0.4h
|
||||
mla v6.4h, v7.4h, v1.4h
|
||||
add v6.4h, v6.4h, v30.4h
|
||||
ushr v6.4h, v6.4h, #(8 - 6)
|
||||
urshr v6.4h, v6.4h, #(8 - 6)
|
||||
str d6, [x13], #8
|
||||
|
||||
cbz w10, 4f
|
||||
@ -428,8 +397,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1
|
||||
ldr d16, [x12], #8
|
||||
mul v16.4h, v16.4h, v2.4h
|
||||
mla v16.4h, v6.4h, v3.4h
|
||||
add v16.4h, v16.4h, v31.4h
|
||||
ushr v16.4h, v16.4h, #4
|
||||
urshr v16.4h, v16.4h, #4
|
||||
str d16, [x14], #8
|
||||
4:
|
||||
subs height, height, #1
|
||||
|
Reference in New Issue
Block a user