avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12}

This patch replaces the widening integer additions with halving
additions, and the multi-step "emulated" rounding shift with a single
asm instruction that does exactly that.
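
As an illustration, here is a scalar C model of that change (my own
sketch, not code from this patch; avg_old()/avg_new() are hypothetical
names, and arithmetic right shift of signed values is assumed, as on
the compilers FFmpeg targets). The halving add (shadd) keeps the sum in
16 bits, and the following rounding shift (srshr) produces exactly the
same values as the old widen/add/narrow (saddl/add/sqshrun) sequence;
the clip to [0, (1 << bd) - 1] is applied afterwards in both variants,
and for 8-bit output the shift, clip and narrowing fuse into a single
sqrshrun.

#include <assert.h>
#include <stdint.h>

/* Old sequence, per 16-bit lane: widen to 32 bits (saddl), add the
 * rounding bias (add), then shift-narrow (sqshrun). */
static int avg_old(int16_t a, int16_t b, int bd)
{
    return ((int32_t)a + b + (1 << (14 - bd))) >> (15 - bd);
}

/* New sequence: shadd computes (a + b) >> 1 in 16 bits without
 * overflowing, then srshr does one rounding shift by (15 - 1 - bd).
 * The bit dropped by the halving add cannot change the result: it
 * moves the value before rounding by half a unit, which never crosses
 * an integer multiple of the remaining shift's divisor. */
static int avg_new(int16_t a, int16_t b, int bd)
{
    int16_t h = (int16_t)(((int32_t)a + b) >> 1); /* shadd */
    int     n = 15 - 1 - bd;                      /* srshr amount */
    return (h + (1 << (n - 1))) >> n;
}

int main(void)
{
    for (int bd = 8; bd <= 12; bd += 2)
        for (int32_t a = INT16_MIN; a <= INT16_MAX; a += 7)
            for (int32_t b = INT16_MIN; b <= INT16_MAX; b += 13)
                assert(avg_old((int16_t)a, (int16_t)b, bd) ==
                       avg_new((int16_t)a, (int16_t)b, bd));
    return 0;
}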

Benchmarks before and after (speedup over the C reference in parentheses):
A78
before:
avg_8_64x64_neon:                                     2686.2 ( 6.12x)
avg_8_128x128_neon:                                  10734.2 ( 5.88x)
avg_10_64x64_neon:                                    2536.8 ( 5.40x)
avg_10_128x128_neon:                                 10079.0 ( 5.22x)
avg_12_64x64_neon:                                    2548.2 ( 5.38x)
avg_12_128x128_neon:                                 10133.8 ( 5.19x)

after:
avg_8_64x64_neon:                                      897.8 (18.26x)
avg_8_128x128_neon:                                   3608.5 (17.37x)
avg_10_32x32_neon:                                     444.2 ( 8.51x)
avg_10_64x64_neon:                                    1711.8 ( 8.00x)
avg_12_64x64_neon:                                    1706.2 ( 8.02x)
avg_12_128x128_neon:                                  7010.0 ( 7.46x)

A72
before:
avg_8_64x64_neon:                                     5823.4 ( 3.88x)
avg_8_128x128_neon:                                  17430.5 ( 4.73x)
avg_10_64x64_neon:                                    5228.1 ( 3.71x)
avg_10_128x128_neon:                                 16722.2 ( 4.17x)
avg_12_64x64_neon:                                    5379.1 ( 3.51x)
avg_12_128x128_neon:                                 16715.7 ( 4.17x)

after:
avg_8_64x64_neon:                                     2006.5 (10.61x)
avg_8_128x128_neon:                                   9158.7 ( 8.96x)
avg_10_64x64_neon:                                    3357.7 ( 5.60x)
avg_10_128x128_neon:                                 12411.7 ( 5.56x)
avg_12_64x64_neon:                                    3317.5 ( 5.67x)
avg_12_128x128_neon:                                 12358.5 ( 5.58x)

A53
before:
avg_8_64x64_neon:                                     8327.8 ( 5.18x)
avg_8_128x128_neon:                                  31631.3 ( 5.34x)
avg_10_64x64_neon:                                    8783.5 ( 4.98x)
avg_10_128x128_neon:                                 32617.0 ( 5.25x)
avg_12_64x64_neon:                                    8686.0 ( 5.06x)
avg_12_128x128_neon:                                 32487.5 ( 5.25x)

after:
avg_8_64x64_neon:                                     6032.3 ( 7.17x)
avg_8_128x128_neon:                                  22008.5 ( 7.69x)
avg_10_64x64_neon:                                    7738.0 ( 5.68x)
avg_10_128x128_neon:                                 27813.8 ( 6.14x)
avg_12_64x64_neon:                                    7844.5 ( 5.60x)
avg_12_128x128_neon:                                 26999.5 ( 6.34x)

Signed-off-by: Martin Storsjö <martin@martin.st>

Author:    Krzysztof Pyrkosz
Date:      2025-03-03 22:18:23 +01:00
Committer: Martin Storsjö
Commit:    f9b8f30680 (parent 7225e307be)

@@ -24,9 +24,9 @@
 #define BDOF_BLOCK_SIZE 16
 #define BDOF_MIN_BLOCK_SIZE 4
 
-.macro vvc_avg type, bit_depth
+.macro vvc_w_avg bit_depth
 
-.macro vvc_\type\()_\bit_depth\()_2_4 tap
+.macro vvc_w_avg_\bit_depth\()_2_4 tap
 .if \tap == 2
         ldr             s0, [src0]
         ldr             s2, [src1]
@@ -34,18 +34,11 @@
         ldr             d0, [src0]
         ldr             d2, [src1]
 .endif
-
-.ifc \type, avg
-        saddl           v4.4s, v0.4h, v2.4h
-        add             v4.4s, v4.4s, v16.4s
-        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
-.else
         mov             v4.16b, v16.16b
         smlal           v4.4s, v0.4h, v19.4h
         smlal           v4.4s, v2.4h, v20.4h
         sqshl           v4.4s, v4.4s, v22.4s
         sqxtun          v4.4h, v4.4s
-.endif
 
 .if \bit_depth == 8
         sqxtun          v4.8b, v4.8h
@@ -68,7 +61,7 @@
         add             dst, dst, dst_stride
 .endm
 
-function ff_vvc_\type\()_\bit_depth\()_neon, export=1
+function ff_vvc_w_avg_\bit_depth\()_neon, export=1
         dst         .req x0
         dst_stride  .req x1
         src0        .req x2
@@ -78,9 +71,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         mov             x10, #(VVC_MAX_PB_SIZE * 2)
         cmp             width, #8
 
-.ifc \type, avg
-        movi            v16.4s, #(1 << (14 - \bit_depth))
-.else
         lsr             x11, x6, #32    // weight0
         mov             w12, w6         // weight1
         lsr             x13, x7, #32    // offset
@@ -91,9 +81,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         dup             v20.8h, w12
         dup             v16.4s, w13
         dup             v22.4s, w14
-.endif // avg
 
 .if \bit_depth >= 10
         // clip pixel
         mov             w6, #((1 << \bit_depth) - 1)
         dup             v17.8h, w6
@@ -105,25 +94,17 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         b.eq            4f
 2:  // width == 2
         subs            height, height, #1
-        vvc_\type\()_\bit_depth\()_2_4 2
+        vvc_w_avg_\bit_depth\()_2_4 2
         b.ne            2b
         b               32f
 4:  // width == 4
         subs            height, height, #1
-        vvc_\type\()_\bit_depth\()_2_4 4
+        vvc_w_avg_\bit_depth\()_2_4 4
         b.ne            4b
         b               32f
 8:  // width == 8
         ld1             {v0.8h}, [src0], x10
         ld1             {v2.8h}, [src1], x10
-.ifc \type, avg
-        saddl           v4.4s, v0.4h, v2.4h
-        saddl2          v5.4s, v0.8h, v2.8h
-        add             v4.4s, v4.4s, v16.4s
-        add             v5.4s, v5.4s, v16.4s
-        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
-        sqshrun2        v4.8h, v5.4s, #(15 - \bit_depth)
-.else
         mov             v4.16b, v16.16b
         mov             v5.16b, v16.16b
         smlal           v4.4s, v0.4h, v19.4h
@@ -134,7 +115,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         sqshl           v5.4s, v5.4s, v22.4s
         sqxtun          v4.4h, v4.4s
         sqxtun2         v4.8h, v5.4s
-.endif
         subs            height, height, #1
 .if \bit_depth == 8
         sqxtun          v4.8b, v4.8h
@@ -153,20 +133,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
 17:
         ldp             q0, q1, [x7], #32
         ldp             q2, q3, [x8], #32
-.ifc \type, avg
-        saddl           v4.4s, v0.4h, v2.4h
-        saddl2          v5.4s, v0.8h, v2.8h
-        saddl           v6.4s, v1.4h, v3.4h
-        saddl2          v7.4s, v1.8h, v3.8h
-        add             v4.4s, v4.4s, v16.4s
-        add             v5.4s, v5.4s, v16.4s
-        add             v6.4s, v6.4s, v16.4s
-        add             v7.4s, v7.4s, v16.4s
-        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
-        sqshrun2        v4.8h, v5.4s, #(15 - \bit_depth)
-        sqshrun         v6.4h, v6.4s, #(15 - \bit_depth)
-        sqshrun2        v6.8h, v7.4s, #(15 - \bit_depth)
-.else // avg
         mov             v4.16b, v16.16b
         mov             v5.16b, v16.16b
         mov             v6.16b, v16.16b
@@ -187,7 +153,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         sqxtun          v6.4h, v6.4s
         sqxtun2         v4.8h, v5.4s
         sqxtun2         v6.8h, v7.4s
-.endif // w_avg
         subs            w6, w6, #16
 .if \bit_depth == 8
         sqxtun          v4.8b, v4.8h
@@ -217,12 +182,130 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
 endfunc
 .endm
 
-vvc_avg avg, 8
-vvc_avg avg, 10
-vvc_avg avg, 12
-vvc_avg w_avg, 8
-vvc_avg w_avg, 10
-vvc_avg w_avg, 12
+vvc_w_avg 8
+vvc_w_avg 10
+vvc_w_avg 12
+
+.macro vvc_avg bit_depth
+function ff_vvc_avg_\bit_depth\()_neon, export=1
+        mov             x10, #(VVC_MAX_PB_SIZE * 2)
+
+        movi            v16.8h, #0
+        movi            v17.16b, #255
+        ushr            v17.8h, v17.8h, #(16 - \bit_depth)
+
+        cmp             w4, #8
+        b.gt            16f
+        b.eq            8f
+        cmp             w4, #4
+        b.eq            4f
+
+2:  // width == 2
+        ldr             s0, [x2]
+        subs            w5, w5, #1
+        ldr             s1, [x3]
+.if \bit_depth == 8
+        shadd           v0.4h, v0.4h, v1.4h
+        sqrshrun        v0.8b, v0.8h, #(15 - 1 - \bit_depth)
+        str             h0, [x0]
+.else
+        shadd           v0.4h, v0.4h, v1.4h
+        srshr           v0.4h, v0.4h, #(15 - 1 - \bit_depth)
+        smax            v0.4h, v0.4h, v16.4h
+        smin            v0.4h, v0.4h, v17.4h
+        str             s0, [x0]
+.endif
+        add             x2, x2, #(VVC_MAX_PB_SIZE * 2)
+        add             x3, x3, #(VVC_MAX_PB_SIZE * 2)
+        add             x0, x0, x1
+        b.ne            2b
+        ret
+
+4:  // width == 4
+        ldr             d0, [x2]
+        subs            w5, w5, #1
+        ldr             d1, [x3]
+.if \bit_depth == 8
+        shadd           v0.4h, v0.4h, v1.4h
+        sqrshrun        v0.8b, v0.8h, #(15 - 1 - \bit_depth)
+        str             s0, [x0]
+.else
+        shadd           v0.4h, v0.4h, v1.4h
+        srshr           v0.4h, v0.4h, #(15 - 1 - \bit_depth)
+        smax            v0.4h, v0.4h, v16.4h
+        smin            v0.4h, v0.4h, v17.4h
+        str             d0, [x0]
+.endif
+        add             x2, x2, #(VVC_MAX_PB_SIZE * 2)
+        add             x3, x3, #(VVC_MAX_PB_SIZE * 2)
+        add             x0, x0, x1
+        b.ne            4b
+        ret
+
+8:  // width == 8
+        ldr             q0, [x2]
+        subs            w5, w5, #1
+        ldr             q1, [x3]
+.if \bit_depth == 8
+        shadd           v0.8h, v0.8h, v1.8h
+        sqrshrun        v0.8b, v0.8h, #(15 - 1 - \bit_depth)
+        str             d0, [x0]
+.else
+        shadd           v0.8h, v0.8h, v1.8h
+        srshr           v0.8h, v0.8h, #(15 - 1 - \bit_depth)
+        smax            v0.8h, v0.8h, v16.8h
+        smin            v0.8h, v0.8h, v17.8h
+        str             q0, [x0]
+.endif
+        add             x2, x2, #(VVC_MAX_PB_SIZE * 2)
+        add             x3, x3, #(VVC_MAX_PB_SIZE * 2)
+        add             x0, x0, x1
+        b.ne            8b
+        ret
+
+16: // width >= 16
+.if \bit_depth == 8
+        sub             x1, x1, w4, sxtw
+.else
+        sub             x1, x1, w4, sxtw #1
+.endif
+        sub             x10, x10, w4, sxtw #1
+
+3:
+        mov             w6, w4  // width
+1:
+        ldp             q0, q1, [x2], #32
+        subs            w6, w6, #16
+        ldp             q2, q3, [x3], #32
+.if \bit_depth == 8
+        shadd           v4.8h, v0.8h, v2.8h
+        shadd           v5.8h, v1.8h, v3.8h
+        sqrshrun        v0.8b, v4.8h, #6
+        sqrshrun2       v0.16b, v5.8h, #6
+        st1             {v0.16b}, [x0], #16
+.else
+        shadd           v4.8h, v0.8h, v2.8h
+        shadd           v5.8h, v1.8h, v3.8h
+        srshr           v0.8h, v4.8h, #(15 - 1 - \bit_depth)
+        srshr           v1.8h, v5.8h, #(15 - 1 - \bit_depth)
+        smax            v0.8h, v0.8h, v16.8h
+        smax            v1.8h, v1.8h, v16.8h
+        smin            v0.8h, v0.8h, v17.8h
+        smin            v1.8h, v1.8h, v17.8h
+        stp             q0, q1, [x0], #32
+.endif
+        b.ne            1b
+        subs            w5, w5, #1
+        add             x2, x2, x10
+        add             x3, x3, x10
+        add             x0, x0, x1
+        b.ne            3b
+        ret
+endfunc
+.endm
+
+vvc_avg 8
+vvc_avg 10
+vvc_avg 12
 
 /* x0: int16_t *dst
  * x1: const uint8_t *_src