FFmpeg/libavcodec/aarch64/vvc/inter.S
Commit f9b8f30680 by Krzysztof Pyrkosz: avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12}
This patch replaces integer widening with halving addition, and the
multi-step "emulated" rounding shift with a single asm instruction that
does exactly that.
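
The core of the new sequence, as it appears in vvc_avg below; the halving
add performs the first bit of the shift, which is why the remaining
rounding shift is by 15 - 1 - bit_depth:

    shadd   v0.8h, v0.8h, v1.8h                   // (src0 + src1) >> 1, no widening to 32 bits
    srshr   v0.8h, v0.8h, #(15 - 1 - \bit_depth)  // rounding shift in a single instruction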

Benchmarks before and after:
A78
avg_8_64x64_neon:                                     2686.2 ( 6.12x)
avg_8_128x128_neon:                                  10734.2 ( 5.88x)
avg_10_64x64_neon:                                    2536.8 ( 5.40x)
avg_10_128x128_neon:                                 10079.0 ( 5.22x)
avg_12_64x64_neon:                                    2548.2 ( 5.38x)
avg_12_128x128_neon:                                 10133.8 ( 5.19x)

avg_8_64x64_neon:                                      897.8 (18.26x)
avg_8_128x128_neon:                                   3608.5 (17.37x)
avg_10_32x32_neon:                                     444.2 ( 8.51x)
avg_10_64x64_neon:                                    1711.8 ( 8.00x)
avg_12_64x64_neon:                                    1706.2 ( 8.02x)
avg_12_128x128_neon:                                  7010.0 ( 7.46x)

A72
avg_8_64x64_neon:                                     5823.4 ( 3.88x)
avg_8_128x128_neon:                                  17430.5 ( 4.73x)
avg_10_64x64_neon:                                    5228.1 ( 3.71x)
avg_10_128x128_neon:                                 16722.2 ( 4.17x)
avg_12_64x64_neon:                                    5379.1 ( 3.51x)
avg_12_128x128_neon:                                 16715.7 ( 4.17x)

avg_8_64x64_neon:                                     2006.5 (10.61x)
avg_8_128x128_neon:                                   9158.7 ( 8.96x)
avg_10_64x64_neon:                                    3357.7 ( 5.60x)
avg_10_128x128_neon:                                 12411.7 ( 5.56x)
avg_12_64x64_neon:                                    3317.5 ( 5.67x)
avg_12_128x128_neon:                                 12358.5 ( 5.58x)

A53
avg_8_64x64_neon:                                     8327.8 ( 5.18x)
avg_8_128x128_neon:                                  31631.3 ( 5.34x)
avg_10_64x64_neon:                                    8783.5 ( 4.98x)
avg_10_128x128_neon:                                 32617.0 ( 5.25x)
avg_12_64x64_neon:                                    8686.0 ( 5.06x)
avg_12_128x128_neon:                                 32487.5 ( 5.25x)

avg_8_64x64_neon:                                     6032.3 ( 7.17x)
avg_8_128x128_neon:                                  22008.5 ( 7.69x)
avg_10_64x64_neon:                                    7738.0 ( 5.68x)
avg_10_128x128_neon:                                 27813.8 ( 6.14x)
avg_12_64x64_neon:                                    7844.5 ( 5.60x)
avg_12_128x128_neon:                                 26999.5 ( 6.34x)

Signed-off-by: Martin Storsjö <martin@martin.st>
Date: 2025-03-07 15:51:20 +02:00


/*
* Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#define VVC_MAX_PB_SIZE 128
#define BDOF_BLOCK_SIZE 16
#define BDOF_MIN_BLOCK_SIZE 4
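// The intermediate prediction buffers hold rows of VVC_MAX_PB_SIZE int16_t
// samples, so a source row stride in bytes is VVC_MAX_PB_SIZE * 2 throughout
// this file.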
.macro vvc_w_avg bit_depth
.macro vvc_w_avg_\bit_depth\()_2_4 tap
.if \tap == 2
ldr s0, [src0]
ldr s2, [src1]
.else
ldr d0, [src0]
ldr d2, [src1]
.endif
mov v4.16b, v16.16b
smlal v4.4s, v0.4h, v19.4h
smlal v4.4s, v2.4h, v20.4h
sqshl v4.4s, v4.4s, v22.4s
sqxtun v4.4h, v4.4s
.if \bit_depth == 8
sqxtun v4.8b, v4.8h
.if \tap == 2
str h4, [dst]
.else // tap == 4
str s4, [dst]
.endif
.else // bit_depth > 8
umin v4.4h, v4.4h, v17.4h
.if \tap == 2
str s4, [dst]
.else
str d4, [dst]
.endif
.endif
add src0, src0, x10
add src1, src1, x10
add dst, dst, dst_stride
.endm
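// ff_vvc_w_avg_{8,10,12}_neon: weighted bi-prediction average.
// Calling convention, inferred from the register use below:
// x0: dst, x1: dst_stride (bytes), x2/x3: const int16_t *src0/src1
// (row stride VVC_MAX_PB_SIZE * 2 bytes), w4: width, w5: height,
// x6: weight0 in the high 32 bits, weight1 in the low 32 bits,
// x7: offset in the high 32 bits, shift in the low 32 bits.
// Per sample: dst = clip((src0 * w0 + src1 * w1 + offset) >> shift),
// clipped to [0, (1 << bit_depth) - 1].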
function ff_vvc_w_avg_\bit_depth\()_neon, export=1
dst .req x0
dst_stride .req x1
src0 .req x2
src1 .req x3
width .req w4
height .req w5
mov x10, #(VVC_MAX_PB_SIZE * 2)
cmp width, #8
lsr x11, x6, #32 // weight0
mov w12, w6 // weight1
lsr x13, x7, #32 // offset
mov w14, w7 // shift
dup v19.8h, w11
neg w14, w14 // so we can use sqshl
dup v20.8h, w12
dup v16.4s, w13
dup v22.4s, w14
.if \bit_depth >= 10
// clip pixel
mov w6, #((1 << \bit_depth) - 1)
dup v17.8h, w6
.endif
b.eq 8f
b.hi 16f
cmp width, #4
b.eq 4f
2: // width == 2
subs height, height, #1
vvc_w_avg_\bit_depth\()_2_4 2
b.ne 2b
b 32f
4: // width == 4
subs height, height, #1
vvc_w_avg_\bit_depth\()_2_4 4
b.ne 4b
b 32f
8: // width == 8
ld1 {v0.8h}, [src0], x10
ld1 {v2.8h}, [src1], x10
mov v4.16b, v16.16b
mov v5.16b, v16.16b
smlal v4.4s, v0.4h, v19.4h
smlal v4.4s, v2.4h, v20.4h
smlal2 v5.4s, v0.8h, v19.8h
smlal2 v5.4s, v2.8h, v20.8h
sqshl v4.4s, v4.4s, v22.4s
sqshl v5.4s, v5.4s, v22.4s
sqxtun v4.4h, v4.4s
sqxtun2 v4.8h, v5.4s
subs height, height, #1
.if \bit_depth == 8
sqxtun v4.8b, v4.8h
st1 {v4.8b}, [dst], dst_stride
.else
umin v4.8h, v4.8h, v17.8h
st1 {v4.8h}, [dst], dst_stride
.endif
b.ne 8b
b 32f
16: // width >= 16
mov w6, width
mov x7, src0
mov x8, src1
mov x9, dst
17:
ldp q0, q1, [x7], #32
ldp q2, q3, [x8], #32
mov v4.16b, v16.16b
mov v5.16b, v16.16b
mov v6.16b, v16.16b
mov v7.16b, v16.16b
smlal v4.4s, v0.4h, v19.4h
smlal v4.4s, v2.4h, v20.4h
smlal2 v5.4s, v0.8h, v19.8h
smlal2 v5.4s, v2.8h, v20.8h
smlal v6.4s, v1.4h, v19.4h
smlal v6.4s, v3.4h, v20.4h
smlal2 v7.4s, v1.8h, v19.8h
smlal2 v7.4s, v3.8h, v20.8h
sqshl v4.4s, v4.4s, v22.4s
sqshl v5.4s, v5.4s, v22.4s
sqshl v6.4s, v6.4s, v22.4s
sqshl v7.4s, v7.4s, v22.4s
sqxtun v4.4h, v4.4s
sqxtun v6.4h, v6.4s
sqxtun2 v4.8h, v5.4s
sqxtun2 v6.8h, v7.4s
subs w6, w6, #16
.if \bit_depth == 8
sqxtun v4.8b, v4.8h
sqxtun2 v4.16b, v6.8h
str q4, [x9], #16
.else
umin v4.8h, v4.8h, v17.8h
umin v6.8h, v6.8h, v17.8h
stp q4, q6, [x9], #32
.endif
b.ne 17b
subs height, height, #1
add src0, src0, x10
add src1, src1, x10
add dst, dst, dst_stride
b.ne 16b
32:
ret
.unreq dst
.unreq dst_stride
.unreq src0
.unreq src1
.unreq width
.unreq height
endfunc
.endm
vvc_w_avg 8
vvc_w_avg 10
vvc_w_avg 12
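// Plain bi-prediction average. This is the code path reworked by this
// commit: shadd computes (src0 + src1) >> 1 without widening to 32 bits,
// and a single srshr (or sqrshrun for 8-bit output) applies the remaining
// rounding shift by (15 - 1 - bit_depth).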
.macro vvc_avg bit_depth
function ff_vvc_avg_\bit_depth\()_neon, export=1
mov x10, #(VVC_MAX_PB_SIZE * 2)
movi v16.8h, #0
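// The next two instructions build the upper clip bound
// v17 = 0xffff >> (16 - bit_depth) = (1 << bit_depth) - 1; v16 = 0 is the
// lower bound.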
movi v17.16b, #255
ushr v17.8h, v17.8h, #(16 - \bit_depth)
cmp w4, #8
b.gt 16f
b.eq 8f
cmp w4, #4
b.eq 4f
2: // width == 2
ldr s0, [x2]
subs w5, w5, #1
ldr s1, [x3]
.if \bit_depth == 8
shadd v0.4h, v0.4h, v1.4h
sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth)
str h0, [x0]
.else
shadd v0.4h, v0.4h, v1.4h
srshr v0.4h, v0.4h, #(15 - 1 - \bit_depth)
smax v0.4h, v0.4h, v16.4h
smin v0.4h, v0.4h, v17.4h
str s0, [x0]
.endif
add x2, x2, #(VVC_MAX_PB_SIZE * 2)
add x3, x3, #(VVC_MAX_PB_SIZE * 2)
add x0, x0, x1
b.ne 2b
ret
4: // width == 4
ldr d0, [x2]
subs w5, w5, #1
ldr d1, [x3]
.if \bit_depth == 8
shadd v0.4h, v0.4h, v1.4h
sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth)
str s0, [x0]
.else
shadd v0.4h, v0.4h, v1.4h
srshr v0.4h, v0.4h, #(15 - 1 - \bit_depth)
smax v0.4h, v0.4h, v16.4h
smin v0.4h, v0.4h, v17.4h
str d0, [x0]
.endif
add x2, x2, #(VVC_MAX_PB_SIZE * 2)
add x3, x3, #(VVC_MAX_PB_SIZE * 2)
add x0, x0, x1
b.ne 4b
ret
8: // width == 8
ldr q0, [x2]
subs w5, w5, #1
ldr q1, [x3]
.if \bit_depth == 8
shadd v0.8h, v0.8h, v1.8h
sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth)
str d0, [x0]
.else
shadd v0.8h, v0.8h, v1.8h
srshr v0.8h, v0.8h, #(15 - 1 - \bit_depth)
smax v0.8h, v0.8h, v16.8h
smin v0.8h, v0.8h, v17.8h
str q0, [x0]
.endif
add x2, x2, #(VVC_MAX_PB_SIZE * 2)
add x3, x3, #(VVC_MAX_PB_SIZE * 2)
add x0, x0, x1
b.ne 8b
ret
16: // width >= 16
.if \bit_depth == 8
sub x1, x1, w4, sxtw
.else
sub x1, x1, w4, sxtw #1
.endif
sub x10, x10, w4, sxtw #1
3:
mov w6, w4 // width
1:
ldp q0, q1, [x2], #32
subs w6, w6, #16
ldp q2, q3, [x3], #32
.if \bit_depth == 8
shadd v4.8h, v0.8h, v2.8h
shadd v5.8h, v1.8h, v3.8h
sqrshrun v0.8b, v4.8h, #6
sqrshrun2 v0.16b, v5.8h, #6
st1 {v0.16b}, [x0], #16
.else
shadd v4.8h, v0.8h, v2.8h
shadd v5.8h, v1.8h, v3.8h
srshr v0.8h, v4.8h, #(15 - 1 - \bit_depth)
srshr v1.8h, v5.8h, #(15 - 1 - \bit_depth)
smax v0.8h, v0.8h, v16.8h
smax v1.8h, v1.8h, v16.8h
smin v0.8h, v0.8h, v17.8h
smin v1.8h, v1.8h, v17.8h
stp q0, q1, [x0], #32
.endif
b.ne 1b
subs w5, w5, #1
add x2, x2, x10
add x3, x3, x10
add x0, x0, x1
b.ne 3b
ret
endfunc
.endm
vvc_avg 8
vvc_avg 10
vvc_avg 12
/* x0: int16_t *dst
* x1: const uint8_t *_src
* x2: ptrdiff_t _src_stride
* w3: int height
* x4: intptr_t mx
* x5: intptr_t my
* w6: int width
*/
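// ff_vvc_dmvr_{8,12}_neon copy a block into the 16-bit intermediate buffer
// (row stride VVC_MAX_PB_SIZE * 2 bytes), normalizing the samples to a
// common intermediate precision: 8-bit input is shifted up by 2
// (ushll #2), 12-bit input is rounded down by 2 (urshr #2). The loop
// handles the width as a multiple of 8 or 16 plus a 4-sample tail.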
function ff_vvc_dmvr_8_neon, export=1
dst .req x0
src .req x1
src_stride .req x2
height .req w3
mx .req x4
my .req x5
width .req w6
sxtw x6, w6
mov x7, #(VVC_MAX_PB_SIZE * 2 + 8)
cmp width, #16
sub src_stride, src_stride, x6
cset w15, gt // width > 16
movi v16.8h, #2 // DMVR_SHIFT
sub x7, x7, x6, lsl #1
1:
cbz w15, 2f
ldr q0, [src], #16
ushll v1.8h, v0.8b, #2
ushll2 v2.8h, v0.16b, #2
stp q1, q2, [dst], #32
b 3f
2:
ldr d0, [src], #8
ushll v1.8h, v0.8b, #2
str q1, [dst], #16
3:
subs height, height, #1
ldr s3, [src], #4
ushll v4.8h, v3.8b, #2
st1 {v4.4h}, [dst], x7
add src, src, src_stride
b.ne 1b
ret
endfunc
function ff_vvc_dmvr_12_neon, export=1
sxtw x6, w6
mov x7, #(VVC_MAX_PB_SIZE * 2 + 8)
cmp width, #16
sub src_stride, src_stride, x6, lsl #1
cset w15, gt // width > 16
sub x7, x7, x6, lsl #1
1:
cbz w15, 2f
ldp q0, q1, [src], #32
urshr v0.8h, v0.8h, #2
urshr v1.8h, v1.8h, #2
stp q0, q1, [dst], #32
b 3f
2:
ldr q0, [src], #16
urshr v0.8h, v0.8h, #2
str q0, [dst], #16
3:
subs height, height, #1
ldr d0, [src], #8
urshr v0.4h, v0.4h, #2
st1 {v0.4h}, [dst], x7
add src, src, src_stride
b.ne 1b
ret
endfunc
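// ff_vvc_dmvr_hv_{8,10,12}_neon: 2-tap bilinear interpolation for DMVR,
// horizontal pass followed by a vertical pass. Each iteration filters one
// source row horizontally into one of two ping-pong row buffers on the
// stack (tmp0/tmp1); once a previous filtered row exists (w10 != 0), it is
// combined vertically with the current one into dst. Filter taps come from
// ff_vvc_inter_luma_dmvr_filters, indexed by the fractional mx and my.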
function ff_vvc_dmvr_hv_8_neon, export=1
tmp0 .req x7
tmp1 .req x8
sub sp, sp, #(VVC_MAX_PB_SIZE * 4)
movrel x9, X(ff_vvc_inter_luma_dmvr_filters)
add x12, x9, mx, lsl #1
ldrb w10, [x12]
ldrb w11, [x12, #1]
mov tmp0, sp
add tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
// We know the values are positive
dup v0.8h, w10 // filter_x[0]
dup v1.8h, w11 // filter_x[1]
add x12, x9, my, lsl #1
ldrb w10, [x12]
ldrb w11, [x12, #1]
sxtw x6, w6
dup v2.8h, w10 // filter_y[0]
dup v3.8h, w11 // filter_y[1]
// Valid values for width can only be 8 + 4 or 16 + 4
cmp width, #16
mov w10, #0 // start filter_y or not
add height, height, #1
sub dst, dst, #(VVC_MAX_PB_SIZE * 2)
sub src_stride, src_stride, x6
cset w15, gt // width > 16
1:
mov x12, tmp0
mov x13, tmp1
mov x14, dst
cbz w15, 2f
// width > 16
ldur q5, [src, #1]
ldr q4, [src], #16
uxtl v7.8h, v5.8b
uxtl2 v17.8h, v5.16b
uxtl v6.8h, v4.8b
uxtl2 v16.8h, v4.16b
mul v6.8h, v6.8h, v0.8h
mul v16.8h, v16.8h, v0.8h
mla v6.8h, v7.8h, v1.8h
mla v16.8h, v17.8h, v1.8h
urshr v6.8h, v6.8h, #(8 - 6)
urshr v7.8h, v16.8h, #(8 - 6)
stp q6, q7, [x13], #32
cbz w10, 3f
ldp q16, q17, [x12], #32
mul v16.8h, v16.8h, v2.8h
mul v17.8h, v17.8h, v2.8h
mla v16.8h, v6.8h, v3.8h
mla v17.8h, v7.8h, v3.8h
urshr v16.8h, v16.8h, #4
urshr v17.8h, v17.8h, #4
stp q16, q17, [x14], #32
b 3f
2:
// width > 8
ldur d5, [src, #1]
ldr d4, [src], #8
uxtl v7.8h, v5.8b
uxtl v6.8h, v4.8b
mul v6.8h, v6.8h, v0.8h
mla v6.8h, v7.8h, v1.8h
urshr v6.8h, v6.8h, #(8 - 6)
str q6, [x13], #16
cbz w10, 3f
ldr q16, [x12], #16
mul v16.8h, v16.8h, v2.8h
mla v16.8h, v6.8h, v3.8h
urshr v16.8h, v16.8h, #4
str q16, [x14], #16
3:
ldur s5, [src, #1]
ldr s4, [src], #4
uxtl v7.8h, v5.8b
uxtl v6.8h, v4.8b
mul v6.4h, v6.4h, v0.4h
mla v6.4h, v7.4h, v1.4h
urshr v6.4h, v6.4h, #(8 - 6)
str d6, [x13], #8
cbz w10, 4f
ldr d16, [x12], #8
mul v16.4h, v16.4h, v2.4h
mla v16.4h, v6.4h, v3.4h
urshr v16.4h, v16.4h, #4
str d16, [x14], #8
4:
subs height, height, #1
mov w10, #1
add src, src, src_stride
add dst, dst, #(VVC_MAX_PB_SIZE * 2)
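// swap the tmp0/tmp1 row buffers (xor swap, no scratch register)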
eor tmp0, tmp0, tmp1
eor tmp1, tmp0, tmp1
eor tmp0, tmp0, tmp1
b.ne 1b
add sp, sp, #(VVC_MAX_PB_SIZE * 4)
ret
endfunc
function ff_vvc_dmvr_hv_12_neon, export=1
movi v29.4s, #(12 - 6)
movi v30.4s, #(1 << (12 - 7)) // offset1
b 0f
endfunc
function ff_vvc_dmvr_hv_10_neon, export=1
movi v29.4s, #(10 - 6)
movi v30.4s, #(1 << (10 - 7)) // offset1
0:
movi v31.4s, #8 // offset2
neg v29.4s, v29.4s
sub sp, sp, #(VVC_MAX_PB_SIZE * 4)
movrel x9, X(ff_vvc_inter_luma_dmvr_filters)
add x12, x9, mx, lsl #1
ldrb w10, [x12]
ldrb w11, [x12, #1]
mov tmp0, sp
add tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
// We know the values are positive
dup v0.8h, w10 // filter_x[0]
dup v1.8h, w11 // filter_x[1]
add x12, x9, my, lsl #1
ldrb w10, [x12]
ldrb w11, [x12, #1]
sxtw x6, w6
dup v2.8h, w10 // filter_y[0]
dup v3.8h, w11 // filter_y[1]
// Valid values for width can only be 8 + 4 or 16 + 4
cmp width, #16
mov w10, #0 // start filter_y or not
add height, height, #1
sub dst, dst, #(VVC_MAX_PB_SIZE * 2)
sub src_stride, src_stride, x6, lsl #1
cset w15, gt // width > 16
1:
mov x12, tmp0
mov x13, tmp1
mov x14, dst
cbz w15, 2f
// width > 16
add x16, src, #2
ldp q6, q16, [src], #32
ldp q7, q17, [x16]
umull v4.4s, v6.4h, v0.4h
umull2 v5.4s, v6.8h, v0.8h
umull v18.4s, v16.4h, v0.4h
umull2 v19.4s, v16.8h, v0.8h
umlal v4.4s, v7.4h, v1.4h
umlal2 v5.4s, v7.8h, v1.8h
umlal v18.4s, v17.4h, v1.4h
umlal2 v19.4s, v17.8h, v1.8h
add v4.4s, v4.4s, v30.4s
add v5.4s, v5.4s, v30.4s
add v18.4s, v18.4s, v30.4s
add v19.4s, v19.4s, v30.4s
ushl v4.4s, v4.4s, v29.4s
ushl v5.4s, v5.4s, v29.4s
ushl v18.4s, v18.4s, v29.4s
ushl v19.4s, v19.4s, v29.4s
uqxtn v6.4h, v4.4s
uqxtn2 v6.8h, v5.4s
uqxtn v7.4h, v18.4s
uqxtn2 v7.8h, v19.4s
stp q6, q7, [x13], #32
cbz w10, 3f
ldp q4, q5, [x12], #32
umull v17.4s, v4.4h, v2.4h
umull2 v18.4s, v4.8h, v2.8h
umull v19.4s, v5.4h, v2.4h
umull2 v20.4s, v5.8h, v2.8h
umlal v17.4s, v6.4h, v3.4h
umlal2 v18.4s, v6.8h, v3.8h
umlal v19.4s, v7.4h, v3.4h
umlal2 v20.4s, v7.8h, v3.8h
add v17.4s, v17.4s, v31.4s
add v18.4s, v18.4s, v31.4s
add v19.4s, v19.4s, v31.4s
add v20.4s, v20.4s, v31.4s
ushr v17.4s, v17.4s, #4
ushr v18.4s, v18.4s, #4
ushr v19.4s, v19.4s, #4
ushr v20.4s, v20.4s, #4
uqxtn v6.4h, v17.4s
uqxtn2 v6.8h, v18.4s
uqxtn v7.4h, v19.4s
uqxtn2 v7.8h, v20.4s
stp q6, q7, [x14], #32
b 3f
2:
// width > 8
ldur q7, [src, #2]
ldr q6, [src], #16
umull v4.4s, v6.4h, v0.4h
umull2 v5.4s, v6.8h, v0.8h
umlal v4.4s, v7.4h, v1.4h
umlal2 v5.4s, v7.8h, v1.8h
add v4.4s, v4.4s, v30.4s
add v5.4s, v5.4s, v30.4s
ushl v4.4s, v4.4s, v29.4s
ushl v5.4s, v5.4s, v29.4s
uqxtn v6.4h, v4.4s
uqxtn2 v6.8h, v5.4s
str q6, [x13], #16
cbz w10, 3f
ldr q16, [x12], #16
umull v17.4s, v16.4h, v2.4h
umull2 v18.4s, v16.8h, v2.8h
umlal v17.4s, v6.4h, v3.4h
umlal2 v18.4s, v6.8h, v3.8h
add v17.4s, v17.4s, v31.4s
add v18.4s, v18.4s, v31.4s
ushr v17.4s, v17.4s, #4
ushr v18.4s, v18.4s, #4
uqxtn v16.4h, v17.4s
uqxtn2 v16.8h, v18.4s
str q16, [x14], #16
3:
ldur d7, [src, #2]
ldr d6, [src], #8
umull v4.4s, v7.4h, v1.4h
umlal v4.4s, v6.4h, v0.4h
add v4.4s, v4.4s, v30.4s
ushl v4.4s, v4.4s, v29.4s
uqxtn v6.4h, v4.4s
str d6, [x13], #8
cbz w10, 4f
ldr d16, [x12], #8
umull v17.4s, v16.4h, v2.4h
umlal v17.4s, v6.4h, v3.4h
add v17.4s, v17.4s, v31.4s
ushr v17.4s, v17.4s, #4
uqxtn v16.4h, v17.4s
str d16, [x14], #8
4:
subs height, height, #1
mov w10, #1
add src, src, src_stride
add dst, dst, #(VVC_MAX_PB_SIZE * 2)
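// swap the tmp0/tmp1 row buffers (xor swap)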
eor tmp0, tmp0, tmp1
eor tmp1, tmp0, tmp1
eor tmp0, tmp0, tmp1
b.ne 1b
add sp, sp, #(VVC_MAX_PB_SIZE * 4)
ret
.unreq dst
.unreq src
.unreq src_stride
.unreq height
.unreq mx
.unreq my
.unreq width
.unreq tmp0
.unreq tmp1
endfunc
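// ff_vvc_prof_grad_filter_8x_neon: horizontal and vertical gradients for
// PROF/BDOF, 8 samples at a time (inferred from the loads below):
//   gh[x] = (src[x + 1] >> 6) - (src[x - 1] >> 6)
//   gv[x] = (src[x + stride] >> 6) - (src[x - stride] >> 6)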
function ff_vvc_prof_grad_filter_8x_neon, export=1
gh .req x0
gv .req x1
gstride .req x2
src .req x3
src_stride .req x4
width .req w5
height .req w6
lsl src_stride, src_stride, #1
neg x7, src_stride
1:
mov x10, src
mov w11, width
mov x12, gh
mov x13, gv
2:
ldur q0, [x10, #2]
ldur q1, [x10, #-2]
subs w11, w11, #8
ldr q2, [x10, src_stride]
ldr q3, [x10, x7]
sshr v0.8h, v0.8h, #6
sshr v1.8h, v1.8h, #6
sshr v2.8h, v2.8h, #6
sshr v3.8h, v3.8h, #6
sub v0.8h, v0.8h, v1.8h
sub v2.8h, v2.8h, v3.8h
st1 {v0.8h}, [x12], #16
st1 {v2.8h}, [x13], #16
add x10, x10, #16
b.ne 2b
subs height, height, #1
add gh, gh, gstride, lsl #1
add gv, gv, gstride, lsl #1
add src, src, src_stride
b.ne 1b
ret
.unreq gh
.unreq gv
.unreq gstride
.unreq src
.unreq src_stride
.unreq width
.unreq height
endfunc
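// Applies the BDOF correction to an 8x4 region (two 4x4 blocks side by
// side). Per pixel:
//   dst = clip((src0 + src1 + vx * (gh0 - gh1) + vy * (gv0 - gv1)
//               + (1 << (14 - bit_depth))) >> (15 - bit_depth))
// vx/vy hold one refinement per 4x4 block; the ld1r/ins pairs broadcast
// them so the low and high vector halves process the two blocks at once.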
.macro vvc_apply_bdof_block bit_depth
dst .req x0
dst_stride .req x1
src0 .req x2
src1 .req x3
gh .req x4
gv .req x5
vx .req x6
vy .req x7
ld1r {v0.8h}, [vx], #2
ld1r {v1.8h}, [vy], #2
ld1r {v2.8h}, [vx]
ld1r {v3.8h}, [vy]
ins v0.d[1], v2.d[1]
ins v1.d[1], v3.d[1]
movi v7.4s, #(1 << (14 - \bit_depth))
ldp x8, x9, [gh]
ldp x10, x11, [gv]
mov x12, #(BDOF_BLOCK_SIZE * 2)
mov w13, #(BDOF_MIN_BLOCK_SIZE)
mov x14, #(VVC_MAX_PB_SIZE * 2)
.if \bit_depth >= 10
// clip pixel
mov w15, #((1 << \bit_depth) - 1)
movi v18.8h, #0
lsl dst_stride, dst_stride, #1
dup v19.8h, w15
.endif
1:
ld1 {v2.8h}, [x8], x12
ld1 {v3.8h}, [x9], x12
ld1 {v4.8h}, [x10], x12
ld1 {v5.8h}, [x11], x12
sub v2.8h, v2.8h, v3.8h
sub v4.8h, v4.8h, v5.8h
smull v3.4s, v0.4h, v2.4h
smull2 v16.4s, v0.8h, v2.8h
smlal v3.4s, v1.4h, v4.4h
smlal2 v16.4s, v1.8h, v4.8h
ld1 {v5.8h}, [src0], x14
ld1 {v6.8h}, [src1], x14
saddl v2.4s, v5.4h, v6.4h
add v2.4s, v2.4s, v7.4s
add v2.4s, v2.4s, v3.4s
saddl2 v4.4s, v5.8h, v6.8h
add v4.4s, v4.4s, v7.4s
add v4.4s, v4.4s, v16.4s
sqshrn v5.4h, v2.4s, #(15 - \bit_depth)
sqshrn2 v5.8h, v4.4s, #(15 - \bit_depth)
subs w13, w13, #1
.if \bit_depth == 8
sqxtun v5.8b, v5.8h
str d5, [dst]
add dst, dst, dst_stride
.else
smin v5.8h, v5.8h, v19.8h
smax v5.8h, v5.8h, v18.8h
st1 {v5.8h}, [dst], dst_stride
.endif
b.ne 1b
ret
.unreq dst
.unreq dst_stride
.unreq src0
.unreq src1
.unreq gh
.unreq gv
.unreq vx
.unreq vy
.endm
function ff_vvc_apply_bdof_block_8_neon, export=1
vvc_apply_bdof_block 8
endfunc
function ff_vvc_apply_bdof_block_10_neon, export=1
vvc_apply_bdof_block 10
endfunc
function ff_vvc_apply_bdof_block_12_neon, export=1
vvc_apply_bdof_block 12
endfunc
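// Derives the BDOF refinement (vx, vy) for two 4x4 blocks from sums over
// each block's 6x6 window (a sketch of the math, reconstructed from the
// accumulation and the clz/asr/csel sequences below):
//   sgx2  = sum(|temph|),              sgy2  = sum(|tempv|)
//   sgxgy = sum(sign(tempv) * temph)
//   sgxdi = -sum(sign(temph) * diff),  sgydi = -sum(sign(tempv) * diff)
//   vx = clip(-15, 15, (sgxdi << 2) >> floor(log2(sgx2)))
//   vy = clip(-15, 15, ((sgydi << 2) - ((vx * sgxgy) >> 1))
//                      >> floor(log2(sgy2)))
// where diff = (src0 >> 4) - (src1 >> 4), temph = (gh0 + gh1) >> 1 and
// tempv = (gv0 + gv1) >> 1. pad_mask bits 0-3 mark left/top/right/bottom
// edge padding.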
function ff_vvc_derive_bdof_vx_vy_neon, export=1
src0 .req x0
src1 .req x1
pad_mask .req w2
gh .req x3
gv .req x4
vx .req x5
vy .req x6
gh0 .req x7
gh1 .req x8
gv0 .req x9
gv1 .req x10
y .req x12
sgx2 .req w7
sgy2 .req w8
sgxgy .req w9
sgxdi .req w10
sgydi .req w11
sgx2_v .req v22
sgy2_v .req v23
sgxgy_v .req v24
sgxdi_v .req v25
sgydi_v .req v26
sgx2_v2 .req v27
sgy2_v2 .req v28
sgxgy_v2 .req v29
sgxdi_v2 .req v30
sgydi_v2 .req v31
ldp gh0, gh1, [gh]
ldp gv0, gv1, [gv]
movi sgx2_v.4s, #0
movi sgy2_v.4s, #0
movi sgxgy_v.4s, #0
movi sgxdi_v.4s, #0
movi sgydi_v.4s, #0
movi sgx2_v2.4s, #0
movi sgy2_v2.4s, #0
movi sgxgy_v2.4s, #0
movi sgxdi_v2.4s, #0
movi sgydi_v2.4s, #0
mov x13, #-1 // dy
movi v6.4s, #0
mov y, #-1
tbz pad_mask, #1, 1f // check pad top
mov x13, #0 // dy: pad top
1:
mov x16, #-2 // dx
add x14, src0, x13, lsl #8 // local src0
add x15, src1, x13, lsl #8 // local src1
add x17, x16, x13, lsl #5
ldr q0, [x14, x16]
ldr q1, [x15, x16]
ldr q2, [gh0, x17]
ldr q3, [gh1, x17]
ldr q4, [gv0, x17]
ldr q5, [gv1, x17]
add x16, x16, #8
add x17, x17, #8
ins v0.s[3], v6.s[3]
ins v1.s[3], v6.s[3]
ins v2.s[3], v6.s[3]
ins v3.s[3], v6.s[3]
ins v4.s[3], v6.s[3]
ins v5.s[3], v6.s[3]
ldr q16, [x14, x16]
ldr q17, [x15, x16]
ldr q18, [gh0, x17]
ldr q19, [gh1, x17]
ldr q20, [gv0, x17]
ldr q21, [gv1, x17]
ins v16.s[3], v6.s[3]
ins v17.s[3], v6.s[3]
ins v18.s[3], v6.s[3]
ins v19.s[3], v6.s[3]
ins v20.s[3], v6.s[3]
ins v21.s[3], v6.s[3]
tbz pad_mask, #0, 20f
// pad left
ins v0.h[0], v0.h[1]
ins v1.h[0], v1.h[1]
ins v2.h[0], v2.h[1]
ins v3.h[0], v3.h[1]
ins v4.h[0], v4.h[1]
ins v5.h[0], v5.h[1]
20:
tbz pad_mask, #2, 21f
// pad right
ins v16.h[5], v16.h[4]
ins v17.h[5], v17.h[4]
ins v18.h[5], v18.h[4]
ins v19.h[5], v19.h[4]
ins v20.h[5], v20.h[4]
ins v21.h[5], v21.h[4]
21:
sshr v0.8h, v0.8h, #4
sshr v1.8h, v1.8h, #4
add v2.8h, v2.8h, v3.8h
add v4.8h, v4.8h, v5.8h
sub v0.8h, v0.8h, v1.8h // diff
sshr v2.8h, v2.8h, #1 // temph
sshr v4.8h, v4.8h, #1 // tempv
sshr v16.8h, v16.8h, #4
sshr v17.8h, v17.8h, #4
add v18.8h, v18.8h, v19.8h
add v20.8h, v20.8h, v21.8h
sub v16.8h, v16.8h, v17.8h // diff
sshr v18.8h, v18.8h, #1 // temph
sshr v20.8h, v20.8h, #1 // tempv
abs v3.8h, v2.8h
abs v5.8h, v4.8h
uxtl v19.4s, v3.4h
uxtl v21.4s, v5.4h
uxtl2 v3.4s, v3.8h
uxtl2 v5.4s, v5.8h
add v3.4s, v3.4s, v19.4s
add v5.4s, v5.4s, v21.4s
add sgx2_v.4s, sgx2_v.4s, v3.4s
add sgy2_v.4s, sgy2_v.4s, v5.4s
abs v3.8h, v18.8h
abs v5.8h, v20.8h
uxtl v19.4s, v3.4h
uxtl v21.4s, v5.4h
uxtl2 v3.4s, v3.8h
uxtl2 v5.4s, v5.8h
add v3.4s, v3.4s, v19.4s
add v5.4s, v5.4s, v21.4s
add sgx2_v2.4s, sgx2_v2.4s, v3.4s
add sgy2_v2.4s, sgy2_v2.4s, v5.4s
cmgt v17.8h, v4.8h, #0
cmlt v7.8h, v4.8h, #0
cmgt v19.8h, v20.8h, #0
cmlt v21.8h, v20.8h, #0
sub v17.8h, v7.8h, v17.8h // VVC_SIGN(tempv)
sub v19.8h, v21.8h, v19.8h // VVC_SIGN(tempv)
smlal sgxgy_v.4s, v17.4h, v2.4h
smlal2 sgxgy_v.4s, v17.8h, v2.8h
smlsl sgydi_v.4s, v17.4h, v0.4h
smlsl2 sgydi_v.4s, v17.8h, v0.8h
cmgt v3.8h, v2.8h, #0
cmlt v5.8h, v2.8h, #0
cmgt v17.8h, v18.8h, #0
cmlt v21.8h, v18.8h, #0
sub v3.8h, v5.8h, v3.8h // VVC_SIGN(temph)
sub v17.8h, v21.8h, v17.8h // VVC_SIGN(temph)
smlal sgxgy_v2.4s, v19.4h, v18.4h
smlal2 sgxgy_v2.4s, v19.8h, v18.8h
smlsl sgydi_v2.4s, v19.4h, v16.4h
smlsl2 sgydi_v2.4s, v19.8h, v16.8h
smlsl sgxdi_v.4s, v3.4h, v0.4h
smlsl2 sgxdi_v.4s, v3.8h, v0.8h
smlsl sgxdi_v2.4s, v17.4h, v16.4h
smlsl2 sgxdi_v2.4s, v17.8h, v16.8h
3:
add y, y, #1
cmp y, #(BDOF_MIN_BLOCK_SIZE)
mov x13, y
b.gt 4f
b.lt 1b
tbz pad_mask, #3, 1b
sub x13, x13, #1 // pad bottom
b 1b
4:
addv s22, sgx2_v.4s
addv s23, sgy2_v.4s
addv s24, sgxgy_v.4s
addv s25, sgxdi_v.4s
addv s26, sgydi_v.4s
mov w3, #31
mov w16, #-15
mov w17, #15
40:
mov w14, #0
mov sgx2, v22.s[0]
mov sgy2, v23.s[0]
mov sgxgy, v24.s[0]
mov sgxdi, v25.s[0]
mov sgydi, v26.s[0]
cbz sgx2, 5f
clz w12, sgx2
lsl sgxdi, sgxdi, #2
sub w13, w3, w12 // log2(sgx2)
asr sgxdi, sgxdi, w13
cmp sgxdi, w16
csel w14, w16, sgxdi, lt // clip to -15
b.le 5f
cmp sgxdi, w17
csel w14, w17, sgxdi, gt // clip to 15
5:
strh w14, [vx], #2
mov w15, #0
cbz sgy2, 6f
lsl sgydi, sgydi, #2
smull x14, w14, sgxgy
asr w14, w14, #1
sub sgydi, sgydi, w14
clz w12, sgy2
sub w13, w3, w12 // log2(sgy2)
asr sgydi, sgydi, w13
cmp sgydi, w16
csel w15, w16, sgydi, lt // clip to -15
b.le 6f
cmp sgydi, w17
csel w15, w17, sgydi, gt // clip to 15
6:
strh w15, [vy], #2
cbz x0, 7f
addv s22, sgx2_v2.4s
addv s23, sgy2_v2.4s
addv s24, sgxgy_v2.4s
addv s25, sgxdi_v2.4s
addv s26, sgydi_v2.4s
mov x0, #0
b 40b
7:
ret
.unreq src0
.unreq src1
.unreq pad_mask
.unreq gh
.unreq gv
.unreq vx
.unreq vy
.unreq sgx2
.unreq sgy2
.unreq sgxgy
.unreq sgxdi
.unreq sgydi
.unreq sgx2_v
.unreq sgy2_v
.unreq sgxgy_v
.unreq sgxdi_v
.unreq sgydi_v
.unreq sgx2_v2
.unreq sgy2_v2
.unreq sgxgy_v2
.unreq sgxdi_v2
.unreq sgydi_v2
.unreq y
endfunc