You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-08-04 22:03:09 +02:00
avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12}
This patch replaces integer widening with halving addition, and the multi-step "emulated" rounding shift with a single asm instruction that does exactly that.

Benchmarks before and after:

A78, before:
avg_8_64x64_neon: 2686.2 ( 6.12x)
avg_8_128x128_neon: 10734.2 ( 5.88x)
avg_10_64x64_neon: 2536.8 ( 5.40x)
avg_10_128x128_neon: 10079.0 ( 5.22x)
avg_12_64x64_neon: 2548.2 ( 5.38x)
avg_12_128x128_neon: 10133.8 ( 5.19x)

A78, after:
avg_8_64x64_neon: 897.8 (18.26x)
avg_8_128x128_neon: 3608.5 (17.37x)
avg_10_32x32_neon: 444.2 ( 8.51x)
avg_10_64x64_neon: 1711.8 ( 8.00x)
avg_12_64x64_neon: 1706.2 ( 8.02x)
avg_12_128x128_neon: 7010.0 ( 7.46x)

A72, before:
avg_8_64x64_neon: 5823.4 ( 3.88x)
avg_8_128x128_neon: 17430.5 ( 4.73x)
avg_10_64x64_neon: 5228.1 ( 3.71x)
avg_10_128x128_neon: 16722.2 ( 4.17x)
avg_12_64x64_neon: 5379.1 ( 3.51x)
avg_12_128x128_neon: 16715.7 ( 4.17x)

A72, after:
avg_8_64x64_neon: 2006.5 (10.61x)
avg_8_128x128_neon: 9158.7 ( 8.96x)
avg_10_64x64_neon: 3357.7 ( 5.60x)
avg_10_128x128_neon: 12411.7 ( 5.56x)
avg_12_64x64_neon: 3317.5 ( 5.67x)
avg_12_128x128_neon: 12358.5 ( 5.58x)

A53, before:
avg_8_64x64_neon: 8327.8 ( 5.18x)
avg_8_128x128_neon: 31631.3 ( 5.34x)
avg_10_64x64_neon: 8783.5 ( 4.98x)
avg_10_128x128_neon: 32617.0 ( 5.25x)
avg_12_64x64_neon: 8686.0 ( 5.06x)
avg_12_128x128_neon: 32487.5 ( 5.25x)

A53, after:
avg_8_64x64_neon: 6032.3 ( 7.17x)
avg_8_128x128_neon: 22008.5 ( 7.69x)
avg_10_64x64_neon: 7738.0 ( 5.68x)
avg_10_128x128_neon: 27813.8 ( 6.14x)
avg_12_64x64_neon: 7844.5 ( 5.60x)
avg_12_128x128_neon: 26999.5 ( 6.34x)

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
committed by
Martin Storsjö
parent
7225e307be
commit
f9b8f30680
@ -24,9 +24,9 @@
|
|||||||
#define BDOF_BLOCK_SIZE 16
|
#define BDOF_BLOCK_SIZE 16
|
||||||
#define BDOF_MIN_BLOCK_SIZE 4
|
#define BDOF_MIN_BLOCK_SIZE 4
|
||||||
|
|
||||||
.macro vvc_avg type, bit_depth
|
.macro vvc_w_avg bit_depth
|
||||||
|
|
||||||
.macro vvc_\type\()_\bit_depth\()_2_4 tap
|
.macro vvc_w_avg_\bit_depth\()_2_4 tap
|
||||||
.if \tap == 2
|
.if \tap == 2
|
||||||
ldr s0, [src0]
|
ldr s0, [src0]
|
||||||
ldr s2, [src1]
|
ldr s2, [src1]
|
||||||
@ -34,18 +34,11 @@
|
|||||||
ldr d0, [src0]
|
ldr d0, [src0]
|
||||||
ldr d2, [src1]
|
ldr d2, [src1]
|
||||||
.endif
|
.endif
|
||||||
|
|
||||||
.ifc \type, avg
|
|
||||||
saddl v4.4s, v0.4h, v2.4h
|
|
||||||
add v4.4s, v4.4s, v16.4s
|
|
||||||
sqshrun v4.4h, v4.4s, #(15 - \bit_depth)
|
|
||||||
.else
|
|
||||||
mov v4.16b, v16.16b
|
mov v4.16b, v16.16b
|
||||||
smlal v4.4s, v0.4h, v19.4h
|
smlal v4.4s, v0.4h, v19.4h
|
||||||
smlal v4.4s, v2.4h, v20.4h
|
smlal v4.4s, v2.4h, v20.4h
|
||||||
sqshl v4.4s, v4.4s, v22.4s
|
sqshl v4.4s, v4.4s, v22.4s
|
||||||
sqxtun v4.4h, v4.4s
|
sqxtun v4.4h, v4.4s
|
||||||
.endif
|
|
||||||
|
|
||||||
.if \bit_depth == 8
|
.if \bit_depth == 8
|
||||||
sqxtun v4.8b, v4.8h
|
sqxtun v4.8b, v4.8h
|
||||||
@ -68,7 +61,7 @@
|
|||||||
add dst, dst, dst_stride
|
add dst, dst, dst_stride
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
function ff_vvc_\type\()_\bit_depth\()_neon, export=1
|
function ff_vvc_w_avg_\bit_depth\()_neon, export=1
|
||||||
dst .req x0
|
dst .req x0
|
||||||
dst_stride .req x1
|
dst_stride .req x1
|
||||||
src0 .req x2
|
src0 .req x2
|
||||||
@ -78,9 +71,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
|
|||||||
|
|
||||||
mov x10, #(VVC_MAX_PB_SIZE * 2)
|
mov x10, #(VVC_MAX_PB_SIZE * 2)
|
||||||
cmp width, #8
|
cmp width, #8
|
||||||
.ifc \type, avg
|
|
||||||
movi v16.4s, #(1 << (14 - \bit_depth))
|
|
||||||
.else
|
|
||||||
lsr x11, x6, #32 // weight0
|
lsr x11, x6, #32 // weight0
|
||||||
mov w12, w6 // weight1
|
mov w12, w6 // weight1
|
||||||
lsr x13, x7, #32 // offset
|
lsr x13, x7, #32 // offset
|
||||||
@ -91,9 +81,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
|
|||||||
dup v20.8h, w12
|
dup v20.8h, w12
|
||||||
dup v16.4s, w13
|
dup v16.4s, w13
|
||||||
dup v22.4s, w14
|
dup v22.4s, w14
|
||||||
.endif // avg
|
|
||||||
|
|
||||||
.if \bit_depth >= 10
|
.if \bit_depth >= 10
|
||||||
// clip pixel
|
// clip pixel
|
||||||
mov w6, #((1 << \bit_depth) - 1)
|
mov w6, #((1 << \bit_depth) - 1)
|
||||||
dup v17.8h, w6
|
dup v17.8h, w6
|
||||||
@ -105,25 +94,17 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
|
|||||||
b.eq 4f
|
b.eq 4f
|
||||||
2: // width == 2
|
2: // width == 2
|
||||||
subs height, height, #1
|
subs height, height, #1
|
||||||
vvc_\type\()_\bit_depth\()_2_4 2
|
vvc_w_avg_\bit_depth\()_2_4 2
|
||||||
b.ne 2b
|
b.ne 2b
|
||||||
b 32f
|
b 32f
|
||||||
4: // width == 4
|
4: // width == 4
|
||||||
subs height, height, #1
|
subs height, height, #1
|
||||||
vvc_\type\()_\bit_depth\()_2_4 4
|
vvc_w_avg_\bit_depth\()_2_4 4
|
||||||
b.ne 4b
|
b.ne 4b
|
||||||
b 32f
|
b 32f
|
||||||
8: // width == 8
|
8: // width == 8
|
||||||
ld1 {v0.8h}, [src0], x10
|
ld1 {v0.8h}, [src0], x10
|
||||||
ld1 {v2.8h}, [src1], x10
|
ld1 {v2.8h}, [src1], x10
|
||||||
.ifc \type, avg
|
|
||||||
saddl v4.4s, v0.4h, v2.4h
|
|
||||||
saddl2 v5.4s, v0.8h, v2.8h
|
|
||||||
add v4.4s, v4.4s, v16.4s
|
|
||||||
add v5.4s, v5.4s, v16.4s
|
|
||||||
sqshrun v4.4h, v4.4s, #(15 - \bit_depth)
|
|
||||||
sqshrun2 v4.8h, v5.4s, #(15 - \bit_depth)
|
|
||||||
.else
|
|
||||||
mov v4.16b, v16.16b
|
mov v4.16b, v16.16b
|
||||||
mov v5.16b, v16.16b
|
mov v5.16b, v16.16b
|
||||||
smlal v4.4s, v0.4h, v19.4h
|
smlal v4.4s, v0.4h, v19.4h
|
||||||
@ -134,7 +115,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
|
|||||||
sqshl v5.4s, v5.4s, v22.4s
|
sqshl v5.4s, v5.4s, v22.4s
|
||||||
sqxtun v4.4h, v4.4s
|
sqxtun v4.4h, v4.4s
|
||||||
sqxtun2 v4.8h, v5.4s
|
sqxtun2 v4.8h, v5.4s
|
||||||
.endif
|
|
||||||
subs height, height, #1
|
subs height, height, #1
|
||||||
.if \bit_depth == 8
|
.if \bit_depth == 8
|
||||||
sqxtun v4.8b, v4.8h
|
sqxtun v4.8b, v4.8h
|
||||||
@ -153,20 +133,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
|
|||||||
17:
|
17:
|
||||||
ldp q0, q1, [x7], #32
|
ldp q0, q1, [x7], #32
|
||||||
ldp q2, q3, [x8], #32
|
ldp q2, q3, [x8], #32
|
||||||
.ifc \type, avg
|
|
||||||
saddl v4.4s, v0.4h, v2.4h
|
|
||||||
saddl2 v5.4s, v0.8h, v2.8h
|
|
||||||
saddl v6.4s, v1.4h, v3.4h
|
|
||||||
saddl2 v7.4s, v1.8h, v3.8h
|
|
||||||
add v4.4s, v4.4s, v16.4s
|
|
||||||
add v5.4s, v5.4s, v16.4s
|
|
||||||
add v6.4s, v6.4s, v16.4s
|
|
||||||
add v7.4s, v7.4s, v16.4s
|
|
||||||
sqshrun v4.4h, v4.4s, #(15 - \bit_depth)
|
|
||||||
sqshrun2 v4.8h, v5.4s, #(15 - \bit_depth)
|
|
||||||
sqshrun v6.4h, v6.4s, #(15 - \bit_depth)
|
|
||||||
sqshrun2 v6.8h, v7.4s, #(15 - \bit_depth)
|
|
||||||
.else // avg
|
|
||||||
mov v4.16b, v16.16b
|
mov v4.16b, v16.16b
|
||||||
mov v5.16b, v16.16b
|
mov v5.16b, v16.16b
|
||||||
mov v6.16b, v16.16b
|
mov v6.16b, v16.16b
|
||||||
@ -187,7 +153,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
|
|||||||
sqxtun v6.4h, v6.4s
|
sqxtun v6.4h, v6.4s
|
||||||
sqxtun2 v4.8h, v5.4s
|
sqxtun2 v4.8h, v5.4s
|
||||||
sqxtun2 v6.8h, v7.4s
|
sqxtun2 v6.8h, v7.4s
|
||||||
.endif // w_avg
|
|
||||||
subs w6, w6, #16
|
subs w6, w6, #16
|
||||||
.if \bit_depth == 8
|
.if \bit_depth == 8
|
||||||
sqxtun v4.8b, v4.8h
|
sqxtun v4.8b, v4.8h
|
||||||
@ -217,12 +182,130 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
|
|||||||
endfunc
|
endfunc
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
vvc_avg avg, 8
|
vvc_w_avg 8
|
||||||
vvc_avg avg, 10
|
vvc_w_avg 10
|
||||||
vvc_avg avg, 12
|
vvc_w_avg 12
|
||||||
vvc_avg w_avg, 8
|
|
||||||
vvc_avg w_avg, 10
|
.macro vvc_avg bit_depth
|
||||||
vvc_avg w_avg, 12
|
function ff_vvc_avg_\bit_depth\()_neon, export=1
        // Unweighted bi-prediction average of two 16-bit intermediate
        // blocks. Per sample:
        //   shadd           t = (src0 + src1) >> 1   (no widening needed)
        //   rounding shift  t by (15 - 1 - bit_depth)
        //   clip to [0, (1 << bit_depth) - 1]
        // which equals (src0 + src1 + (1 << (14 - bit_depth))) >> (15 - bit_depth).
        //
        // Registers, as used below:
        //   x0  dst pointer (8-bit samples when bit_depth == 8, else 16-bit)
        //   x1  dst stride in bytes
        //   x2  src0, 16-bit samples, rows VVC_MAX_PB_SIZE * 2 bytes apart
        //   x3  src1, same layout as src0
        //   w4  width: > 8 takes the 16-samples-per-iteration path (so it
        //       must be a multiple of 16 there), 8 and 4 have their own
        //       paths, anything else is handled as width 2
        //   w5  height; every row loop runs at least once, so it must be > 0
        // Scratch: w6, x10, v0-v5, v16, v17 (x0-x3, w5 are advanced/consumed).
        //
        // In every loop the flags set by "subs" are consumed by the closing
        // "b.ne"; none of the instructions in between modify the flags.
.if \bit_depth != 8
        // Clip bounds for the 16-bit output paths. The 8-bit paths saturate
        // in sqrshrun and never read v16/v17.
        movi            v16.8h, #0
        movi            v17.16b, #255
        ushr            v17.8h, v17.8h, #(16 - \bit_depth)   // (1 << bit_depth) - 1
.endif

        cmp             w4, #8
        b.gt            16f
        b.eq            8f
        cmp             w4, #4
        b.eq            4f

2:      // width == 2
        ldr             s0, [x2]
        subs            w5, w5, #1
        ldr             s1, [x3]
        shadd           v0.4h, v0.4h, v1.4h
.if \bit_depth == 8
        sqrshrun        v0.8b, v0.8h, #(15 - 1 - \bit_depth)
        str             h0, [x0]
.else
        srshr           v0.4h, v0.4h, #(15 - 1 - \bit_depth)
        smax            v0.4h, v0.4h, v16.4h
        smin            v0.4h, v0.4h, v17.4h
        str             s0, [x0]
.endif
        add             x2, x2, #(VVC_MAX_PB_SIZE * 2)
        add             x3, x3, #(VVC_MAX_PB_SIZE * 2)
        add             x0, x0, x1
        b.ne            2b
        ret

4:      // width == 4
        ldr             d0, [x2]
        subs            w5, w5, #1
        ldr             d1, [x3]
        shadd           v0.4h, v0.4h, v1.4h
.if \bit_depth == 8
        sqrshrun        v0.8b, v0.8h, #(15 - 1 - \bit_depth)
        str             s0, [x0]
.else
        srshr           v0.4h, v0.4h, #(15 - 1 - \bit_depth)
        smax            v0.4h, v0.4h, v16.4h
        smin            v0.4h, v0.4h, v17.4h
        str             d0, [x0]
.endif
        add             x2, x2, #(VVC_MAX_PB_SIZE * 2)
        add             x3, x3, #(VVC_MAX_PB_SIZE * 2)
        add             x0, x0, x1
        b.ne            4b
        ret

8:      // width == 8
        ldr             q0, [x2]
        subs            w5, w5, #1
        ldr             q1, [x3]
        shadd           v0.8h, v0.8h, v1.8h
.if \bit_depth == 8
        sqrshrun        v0.8b, v0.8h, #(15 - 1 - \bit_depth)
        str             d0, [x0]
.else
        srshr           v0.8h, v0.8h, #(15 - 1 - \bit_depth)
        smax            v0.8h, v0.8h, v16.8h
        smin            v0.8h, v0.8h, v17.8h
        str             q0, [x0]
.endif
        add             x2, x2, #(VVC_MAX_PB_SIZE * 2)
        add             x3, x3, #(VVC_MAX_PB_SIZE * 2)
        add             x0, x0, x1
        b.ne            8b
        ret

16:     // width >= 16
        // The inner loop post-increments all three pointers across the row,
        // so turn both strides into "end of this row -> start of next row".
        mov             x10, #(VVC_MAX_PB_SIZE * 2)
.if \bit_depth == 8
        sub             x1, x1, w4, sxtw                // dst row is width bytes
.else
        sub             x1, x1, w4, sxtw #1             // dst row is width * 2 bytes
.endif
        sub             x10, x10, w4, sxtw #1           // src rows are width * 2 bytes
3:      // per row
        mov             w6, w4                          // samples left in this row
1:      // per 16 samples
        ldp             q0, q1, [x2], #32
        subs            w6, w6, #16
        ldp             q2, q3, [x3], #32
        shadd           v4.8h, v0.8h, v2.8h
        shadd           v5.8h, v1.8h, v3.8h
.if \bit_depth == 8
        sqrshrun        v0.8b, v4.8h, #(15 - 1 - \bit_depth)
        sqrshrun2       v0.16b, v5.8h, #(15 - 1 - \bit_depth)
        st1             {v0.16b}, [x0], #16
.else
        srshr           v0.8h, v4.8h, #(15 - 1 - \bit_depth)
        srshr           v1.8h, v5.8h, #(15 - 1 - \bit_depth)
        smax            v0.8h, v0.8h, v16.8h
        smax            v1.8h, v1.8h, v16.8h
        smin            v0.8h, v0.8h, v17.8h
        smin            v1.8h, v1.8h, v17.8h
        stp             q0, q1, [x0], #32
.endif
        b.ne            1b

        subs            w5, w5, #1
        add             x2, x2, x10
        add             x3, x3, x10
        add             x0, x0, x1
        b.ne            3b
        ret
endfunc
|
||||||
|
.endm
|
||||||
|
|
||||||
|
vvc_avg 8
|
||||||
|
vvc_avg 10
|
||||||
|
vvc_avg 12
|
||||||
|
|
||||||
/* x0: int16_t *dst
|
/* x0: int16_t *dst
|
||||||
* x1: const uint8_t *_src
|
* x1: const uint8_t *_src
|
||||||
|
Reference in New Issue
Block a user