avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12}

This patch replaces the integer widening with a halving addition, and the
multi-step "emulated" rounding shift with a single asm instruction that does
exactly that.

Benchmarks before and after:
A78
before:
avg_8_64x64_neon:                                     2686.2 ( 6.12x)
avg_8_128x128_neon:                                  10734.2 ( 5.88x)
avg_10_64x64_neon:                                    2536.8 ( 5.40x)
avg_10_128x128_neon:                                 10079.0 ( 5.22x)
avg_12_64x64_neon:                                    2548.2 ( 5.38x)
avg_12_128x128_neon:                                 10133.8 ( 5.19x)

after:
avg_8_64x64_neon:                                      897.8 (18.26x)
avg_8_128x128_neon:                                   3608.5 (17.37x)
avg_10_32x32_neon:                                     444.2 ( 8.51x)
avg_10_64x64_neon:                                    1711.8 ( 8.00x)
avg_12_64x64_neon:                                    1706.2 ( 8.02x)
avg_12_128x128_neon:                                  7010.0 ( 7.46x)

A72
before:
avg_8_64x64_neon:                                     5823.4 ( 3.88x)
avg_8_128x128_neon:                                  17430.5 ( 4.73x)
avg_10_64x64_neon:                                    5228.1 ( 3.71x)
avg_10_128x128_neon:                                 16722.2 ( 4.17x)
avg_12_64x64_neon:                                    5379.1 ( 3.51x)
avg_12_128x128_neon:                                 16715.7 ( 4.17x)

after:
avg_8_64x64_neon:                                     2006.5 (10.61x)
avg_8_128x128_neon:                                   9158.7 ( 8.96x)
avg_10_64x64_neon:                                    3357.7 ( 5.60x)
avg_10_128x128_neon:                                 12411.7 ( 5.56x)
avg_12_64x64_neon:                                    3317.5 ( 5.67x)
avg_12_128x128_neon:                                 12358.5 ( 5.58x)

A53
before:
avg_8_64x64_neon:                                     8327.8 ( 5.18x)
avg_8_128x128_neon:                                  31631.3 ( 5.34x)
avg_10_64x64_neon:                                    8783.5 ( 4.98x)
avg_10_128x128_neon:                                 32617.0 ( 5.25x)
avg_12_64x64_neon:                                    8686.0 ( 5.06x)
avg_12_128x128_neon:                                 32487.5 ( 5.25x)

after:
avg_8_64x64_neon:                                     6032.3 ( 7.17x)
avg_8_128x128_neon:                                  22008.5 ( 7.69x)
avg_10_64x64_neon:                                    7738.0 ( 5.68x)
avg_10_128x128_neon:                                 27813.8 ( 6.14x)
avg_12_64x64_neon:                                    7844.5 ( 5.60x)
avg_12_128x128_neon:                                 26999.5 ( 6.34x)

Signed-off-by: Martin Storsjö <martin@martin.st>
Author: Krzysztof Pyrkosz
Date: 2025-03-03 22:18:23 +01:00
Committed-by: Martin Storsjö
Parent: 7225e307be
Commit: f9b8f30680


@@ -24,9 +24,9 @@
#define BDOF_BLOCK_SIZE 16
#define BDOF_MIN_BLOCK_SIZE 4
.macro vvc_avg type, bit_depth
.macro vvc_w_avg bit_depth
.macro vvc_\type\()_\bit_depth\()_2_4 tap
.macro vvc_w_avg_\bit_depth\()_2_4 tap
.if \tap == 2
ldr s0, [src0]
ldr s2, [src1]
@@ -34,18 +34,11 @@
ldr d0, [src0]
ldr d2, [src1]
.endif
.ifc \type, avg
saddl v4.4s, v0.4h, v2.4h
add v4.4s, v4.4s, v16.4s
sqshrun v4.4h, v4.4s, #(15 - \bit_depth)
.else
mov v4.16b, v16.16b
smlal v4.4s, v0.4h, v19.4h
smlal v4.4s, v2.4h, v20.4h
sqshl v4.4s, v4.4s, v22.4s
sqxtun v4.4h, v4.4s
.endif
.if \bit_depth == 8
sqxtun v4.8b, v4.8h
@@ -68,7 +61,7 @@
add dst, dst, dst_stride
.endm
function ff_vvc_\type\()_\bit_depth\()_neon, export=1
function ff_vvc_w_avg_\bit_depth\()_neon, export=1
dst .req x0
dst_stride .req x1
src0 .req x2
@@ -78,9 +71,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
mov x10, #(VVC_MAX_PB_SIZE * 2)
cmp width, #8
.ifc \type, avg
movi v16.4s, #(1 << (14 - \bit_depth))
.else
lsr x11, x6, #32 // weight0
mov w12, w6 // weight1
lsr x13, x7, #32 // offset
@@ -91,7 +81,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
dup v20.8h, w12
dup v16.4s, w13
dup v22.4s, w14
.endif // avg
.if \bit_depth >= 10
// clip pixel
@@ -105,25 +94,17 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
b.eq 4f
2: // width == 2
subs height, height, #1
vvc_\type\()_\bit_depth\()_2_4 2
vvc_w_avg_\bit_depth\()_2_4 2
b.ne 2b
b 32f
4: // width == 4
subs height, height, #1
vvc_\type\()_\bit_depth\()_2_4 4
vvc_w_avg_\bit_depth\()_2_4 4
b.ne 4b
b 32f
8: // width == 8
ld1 {v0.8h}, [src0], x10
ld1 {v2.8h}, [src1], x10
.ifc \type, avg
saddl v4.4s, v0.4h, v2.4h
saddl2 v5.4s, v0.8h, v2.8h
add v4.4s, v4.4s, v16.4s
add v5.4s, v5.4s, v16.4s
sqshrun v4.4h, v4.4s, #(15 - \bit_depth)
sqshrun2 v4.8h, v5.4s, #(15 - \bit_depth)
.else
mov v4.16b, v16.16b
mov v5.16b, v16.16b
smlal v4.4s, v0.4h, v19.4h
@@ -134,7 +115,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
sqshl v5.4s, v5.4s, v22.4s
sqxtun v4.4h, v4.4s
sqxtun2 v4.8h, v5.4s
.endif
subs height, height, #1
.if \bit_depth == 8
sqxtun v4.8b, v4.8h
@@ -153,20 +133,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
17:
ldp q0, q1, [x7], #32
ldp q2, q3, [x8], #32
.ifc \type, avg
saddl v4.4s, v0.4h, v2.4h
saddl2 v5.4s, v0.8h, v2.8h
saddl v6.4s, v1.4h, v3.4h
saddl2 v7.4s, v1.8h, v3.8h
add v4.4s, v4.4s, v16.4s
add v5.4s, v5.4s, v16.4s
add v6.4s, v6.4s, v16.4s
add v7.4s, v7.4s, v16.4s
sqshrun v4.4h, v4.4s, #(15 - \bit_depth)
sqshrun2 v4.8h, v5.4s, #(15 - \bit_depth)
sqshrun v6.4h, v6.4s, #(15 - \bit_depth)
sqshrun2 v6.8h, v7.4s, #(15 - \bit_depth)
.else // avg
mov v4.16b, v16.16b
mov v5.16b, v16.16b
mov v6.16b, v16.16b
@@ -187,7 +153,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
sqxtun v6.4h, v6.4s
sqxtun2 v4.8h, v5.4s
sqxtun2 v6.8h, v7.4s
.endif // w_avg
subs w6, w6, #16
.if \bit_depth == 8
sqxtun v4.8b, v4.8h
@@ -217,12 +182,130 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
endfunc
.endm
vvc_avg avg, 8
vvc_avg avg, 10
vvc_avg avg, 12
vvc_avg w_avg, 8
vvc_avg w_avg, 10
vvc_avg w_avg, 12
vvc_w_avg 8
vvc_w_avg 10
vvc_w_avg 12
.macro vvc_avg bit_depth
function ff_vvc_avg_\bit_depth\()_neon, export=1
mov x10, #(VVC_MAX_PB_SIZE * 2)
movi v16.8h, #0
movi v17.16b, #255
ushr v17.8h, v17.8h, #(16 - \bit_depth)
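// v16 = 0 and v17 = (1 << bit_depth) - 1 are the clip bounds for the
// bit_depth > 8 paths below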
cmp w4, #8
b.gt 16f
b.eq 8f
cmp w4, #4
b.eq 4f
2: // width == 2
ldr s0, [x2]
subs w5, w5, #1
ldr s1, [x3]
.if \bit_depth == 8
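// shadd halves src0 + src1 without overflowing 16 bits; sqrshrun then
// applies the remaining rounding shift and narrows with unsigned saturation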
shadd v0.4h, v0.4h, v1.4h
sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth)
str h0, [x0]
.else
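// 10/12 bit: rounding shift back to the pixel range, then clamp to
// [0, (1 << bit_depth) - 1] held in v16/v17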
shadd v0.4h, v0.4h, v1.4h
srshr v0.4h, v0.4h, #(15 - 1 - \bit_depth)
smax v0.4h, v0.4h, v16.4h
smin v0.4h, v0.4h, v17.4h
str s0, [x0]
.endif
add x2, x2, #(VVC_MAX_PB_SIZE * 2)
add x3, x3, #(VVC_MAX_PB_SIZE * 2)
add x0, x0, x1
b.ne 2b
ret
4: // width == 4
ldr d0, [x2]
subs w5, w5, #1
ldr d1, [x3]
.if \bit_depth == 8
shadd v0.4h, v0.4h, v1.4h
sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth)
str s0, [x0]
.else
shadd v0.4h, v0.4h, v1.4h
srshr v0.4h, v0.4h, #(15 - 1 - \bit_depth)
smax v0.4h, v0.4h, v16.4h
smin v0.4h, v0.4h, v17.4h
str d0, [x0]
.endif
add x2, x2, #(VVC_MAX_PB_SIZE * 2)
add x3, x3, #(VVC_MAX_PB_SIZE * 2)
add x0, x0, x1
b.ne 4b
ret
8: // width == 8
ldr q0, [x2]
subs w5, w5, #1
ldr q1, [x3]
.if \bit_depth == 8
shadd v0.8h, v0.8h, v1.8h
sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth)
str d0, [x0]
.else
shadd v0.8h, v0.8h, v1.8h
srshr v0.8h, v0.8h, #(15 - 1 - \bit_depth)
smax v0.8h, v0.8h, v16.8h
smin v0.8h, v0.8h, v17.8h
str q0, [x0]
.endif
add x2, x2, #(VVC_MAX_PB_SIZE * 2)
add x3, x3, #(VVC_MAX_PB_SIZE * 2)
add x0, x0, x1
b.ne 8b
ret
16: // width >= 16
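// the inner loop advances the pointers by a whole row, so pre-adjust the
// dst stride (x1) and the source stride (x10) by the row size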
.if \bit_depth == 8
sub x1, x1, w4, sxtw
.else
sub x1, x1, w4, sxtw #1
.endif
sub x10, x10, w4, sxtw #1
3:
mov w6, w4 // width
1:
ldp q0, q1, [x2], #32
subs w6, w6, #16
ldp q2, q3, [x3], #32
.if \bit_depth == 8
shadd v4.8h, v0.8h, v2.8h
shadd v5.8h, v1.8h, v3.8h
sqrshrun v0.8b, v4.8h, #6
sqrshrun2 v0.16b, v5.8h, #6
st1 {v0.16b}, [x0], #16
.else
shadd v4.8h, v0.8h, v2.8h
shadd v5.8h, v1.8h, v3.8h
srshr v0.8h, v4.8h, #(15 - 1 - \bit_depth)
srshr v1.8h, v5.8h, #(15 - 1 - \bit_depth)
smax v0.8h, v0.8h, v16.8h
smax v1.8h, v1.8h, v16.8h
smin v0.8h, v0.8h, v17.8h
smin v1.8h, v1.8h, v17.8h
stp q0, q1, [x0], #32
.endif
b.ne 1b
subs w5, w5, #1
add x2, x2, x10
add x3, x3, x10
add x0, x0, x1
b.ne 3b
ret
endfunc
.endm
vvc_avg 8
vvc_avg 10
vvc_avg 12
/* x0: int16_t *dst
* x1: const uint8_t *_src