1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-08-10 06:10:52 +02:00

avcodec/aarch64/vvc: Optimize NEON version of vvc_dmvr

This patch replaces multi-instruction sequences that perform rounding
shifts and widening shifts with single instructions (urshr and
ushll/ushll2) that produce the same result.

Benchmark results on A78, before (first table) and after (second table):
dmvr_8_12x20_neon:                                      86.2 ( 6.90x)
dmvr_8_20x12_neon:                                      94.8 ( 5.93x)
dmvr_8_20x20_neon:                                     141.5 ( 6.50x)
dmvr_12_12x20_neon:                                    158.0 ( 3.76x)
dmvr_12_20x12_neon:                                    151.2 ( 3.73x)
dmvr_12_20x20_neon:                                    247.2 ( 3.71x)
dmvr_hv_8_12x20_neon:                                  423.2 ( 3.75x)
dmvr_hv_8_20x12_neon:                                  434.0 ( 3.69x)
dmvr_hv_8_20x20_neon:                                  706.0 ( 3.69x)

dmvr_8_12x20_neon:                                      77.2 ( 7.70x)
dmvr_8_20x12_neon:                                      66.5 ( 8.49x)
dmvr_8_20x20_neon:                                      92.2 ( 9.90x)
dmvr_12_12x20_neon:                                     80.2 ( 7.38x)
dmvr_12_20x12_neon:                                     58.2 ( 9.59x)
dmvr_12_20x20_neon:                                     90.0 (10.15x)
dmvr_hv_8_12x20_neon:                                  369.0 ( 4.34x)
dmvr_hv_8_20x12_neon:                                  355.8 ( 4.49x)
dmvr_hv_8_20x20_neon:                                  574.2 ( 4.51x)

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Krzysztof Pyrkosz
2025-03-03 22:32:55 +01:00
committed by Martin Storsjö
parent d765e5f043
commit 71a91485fa

View File

@@ -251,22 +251,18 @@ function ff_vvc_dmvr_8_neon, export=1
1: 1:
cbz w15, 2f cbz w15, 2f
ldr q0, [src], #16 ldr q0, [src], #16
uxtl v1.8h, v0.8b ushll v1.8h, v0.8b, #2
uxtl2 v2.8h, v0.16b ushll2 v2.8h, v0.16b, #2
ushl v1.8h, v1.8h, v16.8h
ushl v2.8h, v2.8h, v16.8h
stp q1, q2, [dst], #32 stp q1, q2, [dst], #32
b 3f b 3f
2: 2:
ldr d0, [src], #8 ldr d0, [src], #8
uxtl v1.8h, v0.8b ushll v1.8h, v0.8b, #2
ushl v1.8h, v1.8h, v16.8h
str q1, [dst], #16 str q1, [dst], #16
3: 3:
subs height, height, #1 subs height, height, #1
ldr s3, [src], #4 ldr s3, [src], #4
uxtl v4.8h, v3.8b ushll v4.8h, v3.8b, #2
ushl v4.4h, v4.4h, v16.4h
st1 {v4.4h}, [dst], x7 st1 {v4.4h}, [dst], x7
add src, src, src_stride add src, src, src_stride
@@ -281,42 +277,24 @@ function ff_vvc_dmvr_12_neon, export=1
cmp width, #16 cmp width, #16
sub src_stride, src_stride, x6, lsl #1 sub src_stride, src_stride, x6, lsl #1
cset w15, gt // width > 16 cset w15, gt // width > 16
movi v16.8h, #2 // offset4
sub x7, x7, x6, lsl #1 sub x7, x7, x6, lsl #1
1: 1:
cbz w15, 2f cbz w15, 2f
ldp q0, q1, [src], #32 ldp q0, q1, [src], #32
uaddl v2.4s, v0.4h, v16.4h urshr v0.8h, v0.8h, #2
uaddl2 v3.4s, v0.8h, v16.8h urshr v1.8h, v1.8h, #2
uaddl v4.4s, v1.4h, v16.4h
uaddl2 v5.4s, v1.8h, v16.8h
ushr v2.4s, v2.4s, #2
ushr v3.4s, v3.4s, #2
ushr v4.4s, v4.4s, #2
ushr v5.4s, v5.4s, #2
uqxtn v2.4h, v2.4s
uqxtn2 v2.8h, v3.4s
uqxtn v4.4h, v4.4s
uqxtn2 v4.8h, v5.4s
stp q2, q4, [dst], #32 stp q0, q1, [dst], #32
b 3f b 3f
2: 2:
ldr q0, [src], #16 ldr q0, [src], #16
uaddl v2.4s, v0.4h, v16.4h urshr v0.8h, v0.8h, #2
uaddl2 v3.4s, v0.8h, v16.8h str q0, [dst], #16
ushr v2.4s, v2.4s, #2
ushr v3.4s, v3.4s, #2
uqxtn v2.4h, v2.4s
uqxtn2 v2.8h, v3.4s
str q2, [dst], #16
3: 3:
subs height, height, #1 subs height, height, #1
ldr d0, [src], #8 ldr d0, [src], #8
uaddl v3.4s, v0.4h, v16.4h urshr v0.4h, v0.4h, #2
ushr v3.4s, v3.4s, #2 st1 {v0.4h}, [dst], x7
uqxtn v3.4h, v3.4s
st1 {v3.4h}, [dst], x7
add src, src, src_stride add src, src, src_stride
b.ne 1b b.ne 1b
@@ -344,8 +322,6 @@ function ff_vvc_dmvr_hv_8_neon, export=1
ldrb w10, [x12] ldrb w10, [x12]
ldrb w11, [x12, #1] ldrb w11, [x12, #1]
sxtw x6, w6 sxtw x6, w6
movi v30.8h, #(1 << (8 - 7)) // offset1
movi v31.8h, #8 // offset2
dup v2.8h, w10 // filter_y[0] dup v2.8h, w10 // filter_y[0]
dup v3.8h, w11 // filter_y[1] dup v3.8h, w11 // filter_y[1]
@@ -373,10 +349,8 @@ function ff_vvc_dmvr_hv_8_neon, export=1
mul v16.8h, v16.8h, v0.8h mul v16.8h, v16.8h, v0.8h
mla v6.8h, v7.8h, v1.8h mla v6.8h, v7.8h, v1.8h
mla v16.8h, v17.8h, v1.8h mla v16.8h, v17.8h, v1.8h
add v6.8h, v6.8h, v30.8h urshr v6.8h, v6.8h, #(8 - 6)
add v16.8h, v16.8h, v30.8h urshr v7.8h, v16.8h, #(8 - 6)
ushr v6.8h, v6.8h, #(8 - 6)
ushr v7.8h, v16.8h, #(8 - 6)
stp q6, q7, [x13], #32 stp q6, q7, [x13], #32
cbz w10, 3f cbz w10, 3f
@@ -386,10 +360,8 @@ function ff_vvc_dmvr_hv_8_neon, export=1
mul v17.8h, v17.8h, v2.8h mul v17.8h, v17.8h, v2.8h
mla v16.8h, v6.8h, v3.8h mla v16.8h, v6.8h, v3.8h
mla v17.8h, v7.8h, v3.8h mla v17.8h, v7.8h, v3.8h
add v16.8h, v16.8h, v31.8h urshr v16.8h, v16.8h, #4
add v17.8h, v17.8h, v31.8h urshr v17.8h, v17.8h, #4
ushr v16.8h, v16.8h, #4
ushr v17.8h, v17.8h, #4
stp q16, q17, [x14], #32 stp q16, q17, [x14], #32
b 3f b 3f
2: 2:
@@ -400,8 +372,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1
uxtl v6.8h, v4.8b uxtl v6.8h, v4.8b
mul v6.8h, v6.8h, v0.8h mul v6.8h, v6.8h, v0.8h
mla v6.8h, v7.8h, v1.8h mla v6.8h, v7.8h, v1.8h
add v6.8h, v6.8h, v30.8h urshr v6.8h, v6.8h, #(8 - 6)
ushr v6.8h, v6.8h, #(8 - 6)
str q6, [x13], #16 str q6, [x13], #16
cbz w10, 3f cbz w10, 3f
@@ -409,8 +380,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1
ldr q16, [x12], #16 ldr q16, [x12], #16
mul v16.8h, v16.8h, v2.8h mul v16.8h, v16.8h, v2.8h
mla v16.8h, v6.8h, v3.8h mla v16.8h, v6.8h, v3.8h
add v16.8h, v16.8h, v31.8h urshr v16.8h, v16.8h, #4
ushr v16.8h, v16.8h, #4
str q16, [x14], #16 str q16, [x14], #16
3: 3:
ldur s5, [src, #1] ldur s5, [src, #1]
@@ -419,8 +389,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1
uxtl v6.8h, v4.8b uxtl v6.8h, v4.8b
mul v6.4h, v6.4h, v0.4h mul v6.4h, v6.4h, v0.4h
mla v6.4h, v7.4h, v1.4h mla v6.4h, v7.4h, v1.4h
add v6.4h, v6.4h, v30.4h urshr v6.4h, v6.4h, #(8 - 6)
ushr v6.4h, v6.4h, #(8 - 6)
str d6, [x13], #8 str d6, [x13], #8
cbz w10, 4f cbz w10, 4f
@@ -428,8 +397,7 @@ function ff_vvc_dmvr_hv_8_neon, export=1
ldr d16, [x12], #8 ldr d16, [x12], #8
mul v16.4h, v16.4h, v2.4h mul v16.4h, v16.4h, v2.4h
mla v16.4h, v6.4h, v3.4h mla v16.4h, v6.4h, v3.4h
add v16.4h, v16.4h, v31.4h urshr v16.4h, v16.4h, #4
ushr v16.4h, v16.4h, #4
str d16, [x14], #8 str d16, [x14], #8
4: 4:
subs height, height, #1 subs height, height, #1