This patch replaces integer widening with halving addition, and the multi-step
"emulated" rounding shift with a single asm instruction that does exactly that.

Benchmarks before and after:

A78
avg_8_64x64_neon:     2686.2 ( 6.12x)
avg_8_128x128_neon:  10734.2 ( 5.88x)
avg_10_64x64_neon:    2536.8 ( 5.40x)
avg_10_128x128_neon: 10079.0 ( 5.22x)
avg_12_64x64_neon:    2548.2 ( 5.38x)
avg_12_128x128_neon: 10133.8 ( 5.19x)
avg_8_64x64_neon:      897.8 (18.26x)
avg_8_128x128_neon:   3608.5 (17.37x)
avg_10_32x32_neon:     444.2 ( 8.51x)
avg_10_64x64_neon:    1711.8 ( 8.00x)
avg_12_64x64_neon:    1706.2 ( 8.02x)
avg_12_128x128_neon:  7010.0 ( 7.46x)

A72
avg_8_64x64_neon:     5823.4 ( 3.88x)
avg_8_128x128_neon:  17430.5 ( 4.73x)
avg_10_64x64_neon:    5228.1 ( 3.71x)
avg_10_128x128_neon: 16722.2 ( 4.17x)
avg_12_64x64_neon:    5379.1 ( 3.51x)
avg_12_128x128_neon: 16715.7 ( 4.17x)
avg_8_64x64_neon:     2006.5 (10.61x)
avg_8_128x128_neon:   9158.7 ( 8.96x)
avg_10_64x64_neon:    3357.7 ( 5.60x)
avg_10_128x128_neon: 12411.7 ( 5.56x)
avg_12_64x64_neon:    3317.5 ( 5.67x)
avg_12_128x128_neon: 12358.5 ( 5.58x)

A53
avg_8_64x64_neon:     8327.8 ( 5.18x)
avg_8_128x128_neon:  31631.3 ( 5.34x)
avg_10_64x64_neon:    8783.5 ( 4.98x)
avg_10_128x128_neon: 32617.0 ( 5.25x)
avg_12_64x64_neon:    8686.0 ( 5.06x)
avg_12_128x128_neon: 32487.5 ( 5.25x)
avg_8_64x64_neon:     6032.3 ( 7.17x)
avg_8_128x128_neon:  22008.5 ( 7.69x)
avg_10_64x64_neon:    7738.0 ( 5.68x)
avg_10_128x128_neon: 27813.8 ( 6.14x)
avg_12_64x64_neon:    7844.5 ( 5.60x)
avg_12_128x128_neon: 26999.5 ( 6.34x)

Signed-off-by: Martin Storsjö <martin@martin.st>
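Roughly, for the plain (unweighted) average path, the change looks like this
(an illustrative sketch only, with made-up register numbers, not the patch's
literal before/after code):

    // old approach: widen to 32 bits, add a rounding constant, shift, narrow
    //      saddl           v4.4s, v0.4h, v1.4h     // src0 + src1, widened
    //      add             v4.4s, v4.4s, v30.4s    // + (1 << (shift - 1))
    //      sshr            v4.4s, v4.4s, #7        // >> shift (8-bit case)
    // new approach: stay in 16 bits, one instruction rounds, shifts, narrows
    //      shadd           v0.8h, v0.8h, v1.8h     // (src0 + src1) >> 1
    //      sqrshrun        v0.8b, v0.8h, #6        // round, >> 6, narrow, clamp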
/*
 * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define VVC_MAX_PB_SIZE 128
#define BDOF_BLOCK_SIZE 16
#define BDOF_MIN_BLOCK_SIZE 4
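// The intermediate prediction buffers used throughout this file are int16_t
// with a fixed stride of VVC_MAX_PB_SIZE samples, hence the recurring
// (VVC_MAX_PB_SIZE * 2) byte offsets below. BDOF is applied per
// BDOF_MIN_BLOCK_SIZE x BDOF_MIN_BLOCK_SIZE sub-block (see further down).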

.macro vvc_w_avg bit_depth

.macro vvc_w_avg_\bit_depth\()_2_4 tap
.if \tap == 2
        ldr             s0, [src0]
        ldr             s2, [src1]
.else
        ldr             d0, [src0]
        ldr             d2, [src1]
.endif
        mov             v4.16b, v16.16b
        smlal           v4.4s, v0.4h, v19.4h
        smlal           v4.4s, v2.4h, v20.4h
        sqshl           v4.4s, v4.4s, v22.4s
        sqxtun          v4.4h, v4.4s

.if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
.if \tap == 2
        str             h4, [dst]
.else   // tap == 4
        str             s4, [dst]
.endif

.else   // bit_depth > 8
        umin            v4.4h, v4.4h, v17.4h
.if \tap == 2
        str             s4, [dst]
.else
        str             d4, [dst]
.endif
.endif
        add             src0, src0, x10
        add             src1, src1, x10
        add             dst, dst, dst_stride
.endm

function ff_vvc_w_avg_\bit_depth\()_neon, export=1
        dst             .req x0
        dst_stride      .req x1
        src0            .req x2
        src1            .req x3
        width           .req w4
        height          .req w5

        mov             x10, #(VVC_MAX_PB_SIZE * 2)
        cmp             width, #8
        lsr             x11, x6, #32            // weight0
        mov             w12, w6                 // weight1
        lsr             x13, x7, #32            // offset
        mov             w14, w7                 // shift

        dup             v19.8h, w11
        neg             w14, w14                // so we can use sqshl
        dup             v20.8h, w12
        dup             v16.4s, w13
        dup             v22.4s, w14
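        // weight0/weight1 arrive packed in x6 (high/low 32 bits) and
        // offset/shift in x7, as unpacked above. The shift count is negated
        // because sqshl with a negative per-lane shift shifts right, so a
        // single sqshl performs the final >> shift.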

.if \bit_depth >= 10
        // clip pixel
        mov             w6, #((1 << \bit_depth) - 1)
        dup             v17.8h, w6
.endif

        b.eq            8f
        b.hi            16f
        cmp             width, #4
        b.eq            4f
2:      // width == 2
        subs            height, height, #1
        vvc_w_avg_\bit_depth\()_2_4 2
        b.ne            2b
        b               32f
4:      // width == 4
        subs            height, height, #1
        vvc_w_avg_\bit_depth\()_2_4 4
        b.ne            4b
        b               32f
8:      // width == 8
        ld1             {v0.8h}, [src0], x10
        ld1             {v2.8h}, [src1], x10
        mov             v4.16b, v16.16b
        mov             v5.16b, v16.16b
        smlal           v4.4s, v0.4h, v19.4h
        smlal           v4.4s, v2.4h, v20.4h
        smlal2          v5.4s, v0.8h, v19.8h
        smlal2          v5.4s, v2.8h, v20.8h
        sqshl           v4.4s, v4.4s, v22.4s
        sqshl           v5.4s, v5.4s, v22.4s
        sqxtun          v4.4h, v4.4s
        sqxtun2         v4.8h, v5.4s
        subs            height, height, #1
.if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
        st1             {v4.8b}, [dst], dst_stride
.else
        umin            v4.8h, v4.8h, v17.8h
        st1             {v4.8h}, [dst], dst_stride
.endif
        b.ne            8b
        b               32f
16:     // width >= 16
        mov             w6, width
        mov             x7, src0
        mov             x8, src1
        mov             x9, dst
17:
        ldp             q0, q1, [x7], #32
        ldp             q2, q3, [x8], #32
        mov             v4.16b, v16.16b
        mov             v5.16b, v16.16b
        mov             v6.16b, v16.16b
        mov             v7.16b, v16.16b
        smlal           v4.4s, v0.4h, v19.4h
        smlal           v4.4s, v2.4h, v20.4h
        smlal2          v5.4s, v0.8h, v19.8h
        smlal2          v5.4s, v2.8h, v20.8h
        smlal           v6.4s, v1.4h, v19.4h
        smlal           v6.4s, v3.4h, v20.4h
        smlal2          v7.4s, v1.8h, v19.8h
        smlal2          v7.4s, v3.8h, v20.8h
        sqshl           v4.4s, v4.4s, v22.4s
        sqshl           v5.4s, v5.4s, v22.4s
        sqshl           v6.4s, v6.4s, v22.4s
        sqshl           v7.4s, v7.4s, v22.4s
        sqxtun          v4.4h, v4.4s
        sqxtun          v6.4h, v6.4s
        sqxtun2         v4.8h, v5.4s
        sqxtun2         v6.8h, v7.4s
        subs            w6, w6, #16
.if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
        sqxtun2         v4.16b, v6.8h
        str             q4, [x9], #16
.else
        umin            v4.8h, v4.8h, v17.8h
        umin            v6.8h, v6.8h, v17.8h
        stp             q4, q6, [x9], #32
.endif
        b.ne            17b

        subs            height, height, #1
        add             src0, src0, x10
        add             src1, src1, x10
        add             dst, dst, dst_stride
        b.ne            16b
32:
        ret

        .unreq          dst
        .unreq          dst_stride
        .unreq          src0
        .unreq          src1
        .unreq          width
        .unreq          height
endfunc
.endm

vvc_w_avg 8
vvc_w_avg 10
vvc_w_avg 12

.macro vvc_avg bit_depth
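// Bi-prediction average: the result is effectively
//     clip((src0 + src1 + rounding) >> (15 - bit_depth))
// Instead of widening to 32 bits, shadd computes (src0 + src1) >> 1 without
// overflow, and the remaining rounding shift by (15 - 1 - bit_depth) is a
// single srshr (or sqrshrun, which also narrows and clamps for 8 bit).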
function ff_vvc_avg_\bit_depth\()_neon, export=1
        mov             x10, #(VVC_MAX_PB_SIZE * 2)
        movi            v16.8h, #0
        movi            v17.16b, #255
        ushr            v17.8h, v17.8h, #(16 - \bit_depth)

        cmp             w4, #8
        b.gt            16f
        b.eq            8f
        cmp             w4, #4
        b.eq            4f

2:      // width == 2
        ldr             s0, [x2]
        subs            w5, w5, #1
        ldr             s1, [x3]
.if \bit_depth == 8
        shadd           v0.4h, v0.4h, v1.4h
        sqrshrun        v0.8b, v0.8h, #(15 - 1 - \bit_depth)
        str             h0, [x0]
.else
        shadd           v0.4h, v0.4h, v1.4h
        srshr           v0.4h, v0.4h, #(15 - 1 - \bit_depth)
        smax            v0.4h, v0.4h, v16.4h
        smin            v0.4h, v0.4h, v17.4h
        str             s0, [x0]
.endif
        add             x2, x2, #(VVC_MAX_PB_SIZE * 2)
        add             x3, x3, #(VVC_MAX_PB_SIZE * 2)
        add             x0, x0, x1
        b.ne            2b
        ret

4:      // width == 4
        ldr             d0, [x2]
        subs            w5, w5, #1
        ldr             d1, [x3]
.if \bit_depth == 8
        shadd           v0.4h, v0.4h, v1.4h
        sqrshrun        v0.8b, v0.8h, #(15 - 1 - \bit_depth)
        str             s0, [x0]
.else
        shadd           v0.4h, v0.4h, v1.4h
        srshr           v0.4h, v0.4h, #(15 - 1 - \bit_depth)
        smax            v0.4h, v0.4h, v16.4h
        smin            v0.4h, v0.4h, v17.4h
        str             d0, [x0]
.endif
        add             x2, x2, #(VVC_MAX_PB_SIZE * 2)
        add             x3, x3, #(VVC_MAX_PB_SIZE * 2)
        add             x0, x0, x1
        b.ne            4b
        ret

8:      // width == 8
        ldr             q0, [x2]
        subs            w5, w5, #1
        ldr             q1, [x3]
.if \bit_depth == 8
        shadd           v0.8h, v0.8h, v1.8h
        sqrshrun        v0.8b, v0.8h, #(15 - 1 - \bit_depth)
        str             d0, [x0]
.else
        shadd           v0.8h, v0.8h, v1.8h
        srshr           v0.8h, v0.8h, #(15 - 1 - \bit_depth)
        smax            v0.8h, v0.8h, v16.8h
        smin            v0.8h, v0.8h, v17.8h
        str             q0, [x0]
.endif
        add             x2, x2, #(VVC_MAX_PB_SIZE * 2)
        add             x3, x3, #(VVC_MAX_PB_SIZE * 2)
        add             x0, x0, x1
        b.ne            8b
        ret

16:     // width >= 16
.if \bit_depth == 8
        sub             x1, x1, w4, sxtw
.else
        sub             x1, x1, w4, sxtw #1
.endif
        sub             x10, x10, w4, sxtw #1
3:
        mov             w6, w4                  // width
1:
        ldp             q0, q1, [x2], #32
        subs            w6, w6, #16
        ldp             q2, q3, [x3], #32
.if \bit_depth == 8
        shadd           v4.8h, v0.8h, v2.8h
        shadd           v5.8h, v1.8h, v3.8h
        sqrshrun        v0.8b, v4.8h, #6
        sqrshrun2       v0.16b, v5.8h, #6
        st1             {v0.16b}, [x0], #16
.else
        shadd           v4.8h, v0.8h, v2.8h
        shadd           v5.8h, v1.8h, v3.8h
        srshr           v0.8h, v4.8h, #(15 - 1 - \bit_depth)
        srshr           v1.8h, v5.8h, #(15 - 1 - \bit_depth)
        smax            v0.8h, v0.8h, v16.8h
        smax            v1.8h, v1.8h, v16.8h
        smin            v0.8h, v0.8h, v17.8h
        smin            v1.8h, v1.8h, v17.8h
        stp             q0, q1, [x0], #32
.endif
        b.ne            1b

        subs            w5, w5, #1
        add             x2, x2, x10
        add             x3, x3, x10
        add             x0, x0, x1
        b.ne            3b
        ret
endfunc
.endm

vvc_avg 8
vvc_avg 10
vvc_avg 12

/* x0: int16_t *dst
 * x1: const uint8_t *_src
 * x2: ptrdiff_t _src_stride
 * w3: int height
 * x4: intptr_t mx
 * x5: intptr_t my
 * w6: int width
 */
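// DMVR prefetch for 8-bit input: each row is widened to 16 bits and shifted
// left by 2 (DMVR_SHIFT) into a buffer with a VVC_MAX_PB_SIZE-sample stride.
// A row is processed as one 16- or 8-pixel chunk plus a 4-pixel tail; x7
// holds the destination stride minus the bytes already stored.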
function ff_vvc_dmvr_8_neon, export=1
        dst             .req x0
        src             .req x1
        src_stride      .req x2
        height          .req w3
        mx              .req x4
        my              .req x5
        width           .req w6

        sxtw            x6, w6
        mov             x7, #(VVC_MAX_PB_SIZE * 2 + 8)
        cmp             width, #16
        sub             src_stride, src_stride, x6
        cset            w15, gt                 // width > 16
        movi            v16.8h, #2              // DMVR_SHIFT
        sub             x7, x7, x6, lsl #1
1:
        cbz             w15, 2f
        ldr             q0, [src], #16
        ushll           v1.8h, v0.8b, #2
        ushll2          v2.8h, v0.16b, #2
        stp             q1, q2, [dst], #32
        b               3f
2:
        ldr             d0, [src], #8
        ushll           v1.8h, v0.8b, #2
        str             q1, [dst], #16
3:
        subs            height, height, #1
        ldr             s3, [src], #4
        ushll           v4.8h, v3.8b, #2
        st1             {v4.4h}, [dst], x7

        add             src, src, src_stride
        b.ne            1b

        ret
endfunc

function ff_vvc_dmvr_12_neon, export=1
        sxtw            x6, w6
        mov             x7, #(VVC_MAX_PB_SIZE * 2 + 8)
        cmp             width, #16
        sub             src_stride, src_stride, x6, lsl #1
        cset            w15, gt                 // width > 16
        sub             x7, x7, x6, lsl #1
1:
        cbz             w15, 2f
        ldp             q0, q1, [src], #32
        urshr           v0.8h, v0.8h, #2
        urshr           v1.8h, v1.8h, #2

        stp             q0, q1, [dst], #32
        b               3f
2:
        ldr             q0, [src], #16
        urshr           v0.8h, v0.8h, #2
        str             q0, [dst], #16
3:
        subs            height, height, #1
        ldr             d0, [src], #8
        urshr           v0.4h, v0.4h, #2
        st1             {v0.4h}, [dst], x7

        add             src, src, src_stride
        b.ne            1b

        ret
endfunc

function ff_vvc_dmvr_hv_8_neon, export=1
        tmp0            .req x7
        tmp1            .req x8

        sub             sp, sp, #(VVC_MAX_PB_SIZE * 4)
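        // Two scratch rows live on the stack: one holds the horizontally
        // filtered previous row, the other receives the current row; tmp0 and
        // tmp1 are swapped at the bottom of the loop so the vertical 2-tap
        // filter always combines the right pair.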

        movrel          x9, X(ff_vvc_inter_luma_dmvr_filters)
        add             x12, x9, mx, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        mov             tmp0, sp
        add             tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
        // We know the values are positive
        dup             v0.8h, w10              // filter_x[0]
        dup             v1.8h, w11              // filter_x[1]

        add             x12, x9, my, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        sxtw            x6, w6
        dup             v2.8h, w10              // filter_y[0]
        dup             v3.8h, w11              // filter_y[1]

        // Valid widths are only 8 + 4 and 16 + 4
        cmp             width, #16
        mov             w10, #0                 // start filter_y or not
        add             height, height, #1
        sub             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        sub             src_stride, src_stride, x6
        cset            w15, gt                 // width > 16
1:
        mov             x12, tmp0
        mov             x13, tmp1
        mov             x14, dst
        cbz             w15, 2f

        // width > 16
        ldur            q5, [src, #1]
        ldr             q4, [src], #16
        uxtl            v7.8h, v5.8b
        uxtl2           v17.8h, v5.16b
        uxtl            v6.8h, v4.8b
        uxtl2           v16.8h, v4.16b
        mul             v6.8h, v6.8h, v0.8h
        mul             v16.8h, v16.8h, v0.8h
        mla             v6.8h, v7.8h, v1.8h
        mla             v16.8h, v17.8h, v1.8h
        urshr           v6.8h, v6.8h, #(8 - 6)
        urshr           v7.8h, v16.8h, #(8 - 6)
        stp             q6, q7, [x13], #32

        cbz             w10, 3f

        ldp             q16, q17, [x12], #32
        mul             v16.8h, v16.8h, v2.8h
        mul             v17.8h, v17.8h, v2.8h
        mla             v16.8h, v6.8h, v3.8h
        mla             v17.8h, v7.8h, v3.8h
        urshr           v16.8h, v16.8h, #4
        urshr           v17.8h, v17.8h, #4
        stp             q16, q17, [x14], #32
        b               3f
2:
        // width > 8
        ldur            d5, [src, #1]
        ldr             d4, [src], #8
        uxtl            v7.8h, v5.8b
        uxtl            v6.8h, v4.8b
        mul             v6.8h, v6.8h, v0.8h
        mla             v6.8h, v7.8h, v1.8h
        urshr           v6.8h, v6.8h, #(8 - 6)
        str             q6, [x13], #16

        cbz             w10, 3f

        ldr             q16, [x12], #16
        mul             v16.8h, v16.8h, v2.8h
        mla             v16.8h, v6.8h, v3.8h
        urshr           v16.8h, v16.8h, #4
        str             q16, [x14], #16
3:
        ldur            s5, [src, #1]
        ldr             s4, [src], #4
        uxtl            v7.8h, v5.8b
        uxtl            v6.8h, v4.8b
        mul             v6.4h, v6.4h, v0.4h
        mla             v6.4h, v7.4h, v1.4h
        urshr           v6.4h, v6.4h, #(8 - 6)
        str             d6, [x13], #8

        cbz             w10, 4f

        ldr             d16, [x12], #8
        mul             v16.4h, v16.4h, v2.4h
        mla             v16.4h, v6.4h, v3.4h
        urshr           v16.4h, v16.4h, #4
        str             d16, [x14], #8
4:
        subs            height, height, #1
        mov             w10, #1
        add             src, src, src_stride
        add             dst, dst, #(VVC_MAX_PB_SIZE * 2)
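        // xor-swap tmp0 and tmp1: the row filtered in this iteration becomes
        // the reference row for the next one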
        eor             tmp0, tmp0, tmp1
        eor             tmp1, tmp0, tmp1
        eor             tmp0, tmp0, tmp1
        b.ne            1b

        add             sp, sp, #(VVC_MAX_PB_SIZE * 4)
        ret
endfunc

function ff_vvc_dmvr_hv_12_neon, export=1
        movi            v29.4s, #(12 - 6)
        movi            v30.4s, #(1 << (12 - 7))        // offset1
        b               0f
endfunc

function ff_vvc_dmvr_hv_10_neon, export=1
        movi            v29.4s, #(10 - 6)
        movi            v30.4s, #(1 << (10 - 7))        // offset1
0:
        movi            v31.4s, #8              // offset2
        neg             v29.4s, v29.4s
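        // Shared tail for the 10/12-bit versions: v29 holds the negated
        // first-pass shift (bit_depth - 6) so ushl shifts right, v30 the
        // matching rounding offset, and v31 the rounding offset for the
        // second-pass >> 4.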

        sub             sp, sp, #(VVC_MAX_PB_SIZE * 4)

        movrel          x9, X(ff_vvc_inter_luma_dmvr_filters)
        add             x12, x9, mx, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        mov             tmp0, sp
        add             tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
        // We know the values are positive
        dup             v0.8h, w10              // filter_x[0]
        dup             v1.8h, w11              // filter_x[1]

        add             x12, x9, my, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        sxtw            x6, w6
        dup             v2.8h, w10              // filter_y[0]
        dup             v3.8h, w11              // filter_y[1]

        // Valid widths are only 8 + 4 and 16 + 4
        cmp             width, #16
        mov             w10, #0                 // start filter_y or not
        add             height, height, #1
        sub             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        sub             src_stride, src_stride, x6, lsl #1
        cset            w15, gt                 // width > 16
1:
        mov             x12, tmp0
        mov             x13, tmp1
        mov             x14, dst
        cbz             w15, 2f

        // width > 16
        add             x16, src, #2
        ldp             q6, q16, [src], #32
        ldp             q7, q17, [x16]
        umull           v4.4s, v6.4h, v0.4h
        umull2          v5.4s, v6.8h, v0.8h
        umull           v18.4s, v16.4h, v0.4h
        umull2          v19.4s, v16.8h, v0.8h
        umlal           v4.4s, v7.4h, v1.4h
        umlal2          v5.4s, v7.8h, v1.8h
        umlal           v18.4s, v17.4h, v1.4h
        umlal2          v19.4s, v17.8h, v1.8h

        add             v4.4s, v4.4s, v30.4s
        add             v5.4s, v5.4s, v30.4s
        add             v18.4s, v18.4s, v30.4s
        add             v19.4s, v19.4s, v30.4s
        ushl            v4.4s, v4.4s, v29.4s
        ushl            v5.4s, v5.4s, v29.4s
        ushl            v18.4s, v18.4s, v29.4s
        ushl            v19.4s, v19.4s, v29.4s
        uqxtn           v6.4h, v4.4s
        uqxtn2          v6.8h, v5.4s
        uqxtn           v7.4h, v18.4s
        uqxtn2          v7.8h, v19.4s
        stp             q6, q7, [x13], #32

        cbz             w10, 3f

        ldp             q4, q5, [x12], #32
        umull           v17.4s, v4.4h, v2.4h
        umull2          v18.4s, v4.8h, v2.8h
        umull           v19.4s, v5.4h, v2.4h
        umull2          v20.4s, v5.8h, v2.8h
        umlal           v17.4s, v6.4h, v3.4h
        umlal2          v18.4s, v6.8h, v3.8h
        umlal           v19.4s, v7.4h, v3.4h
        umlal2          v20.4s, v7.8h, v3.8h
        add             v17.4s, v17.4s, v31.4s
        add             v18.4s, v18.4s, v31.4s
        add             v19.4s, v19.4s, v31.4s
        add             v20.4s, v20.4s, v31.4s
        ushr            v17.4s, v17.4s, #4
        ushr            v18.4s, v18.4s, #4
        ushr            v19.4s, v19.4s, #4
        ushr            v20.4s, v20.4s, #4
        uqxtn           v6.4h, v17.4s
        uqxtn2          v6.8h, v18.4s
        uqxtn           v7.4h, v19.4s
        uqxtn2          v7.8h, v20.4s
        stp             q6, q7, [x14], #32
        b               3f
2:
        // width > 8
        ldur            q7, [src, #2]
        ldr             q6, [src], #16
        umull           v4.4s, v6.4h, v0.4h
        umull2          v5.4s, v6.8h, v0.8h
        umlal           v4.4s, v7.4h, v1.4h
        umlal2          v5.4s, v7.8h, v1.8h

        add             v4.4s, v4.4s, v30.4s
        add             v5.4s, v5.4s, v30.4s
        ushl            v4.4s, v4.4s, v29.4s
        ushl            v5.4s, v5.4s, v29.4s
        uqxtn           v6.4h, v4.4s
        uqxtn2          v6.8h, v5.4s
        str             q6, [x13], #16

        cbz             w10, 3f

        ldr             q16, [x12], #16
        umull           v17.4s, v16.4h, v2.4h
        umull2          v18.4s, v16.8h, v2.8h
        umlal           v17.4s, v6.4h, v3.4h
        umlal2          v18.4s, v6.8h, v3.8h
        add             v17.4s, v17.4s, v31.4s
        add             v18.4s, v18.4s, v31.4s
        ushr            v17.4s, v17.4s, #4
        ushr            v18.4s, v18.4s, #4
        uqxtn           v16.4h, v17.4s
        uqxtn2          v16.8h, v18.4s
        str             q16, [x14], #16
3:
        ldur            d7, [src, #2]
        ldr             d6, [src], #8
        umull           v4.4s, v7.4h, v1.4h
        umlal           v4.4s, v6.4h, v0.4h
        add             v4.4s, v4.4s, v30.4s
        ushl            v4.4s, v4.4s, v29.4s
        uqxtn           v6.4h, v4.4s
        str             d6, [x13], #8

        cbz             w10, 4f

        ldr             d16, [x12], #8
        umull           v17.4s, v16.4h, v2.4h
        umlal           v17.4s, v6.4h, v3.4h
        add             v17.4s, v17.4s, v31.4s
        ushr            v17.4s, v17.4s, #4
        uqxtn           v16.4h, v17.4s
        str             d16, [x14], #8
4:
        subs            height, height, #1
        mov             w10, #1
        add             src, src, src_stride
        add             dst, dst, #(VVC_MAX_PB_SIZE * 2)
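        // xor-swap tmp0 and tmp1, as in the 8-bit version above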
        eor             tmp0, tmp0, tmp1
        eor             tmp1, tmp0, tmp1
        eor             tmp0, tmp0, tmp1
        b.ne            1b

        add             sp, sp, #(VVC_MAX_PB_SIZE * 4)
        ret

        .unreq          dst
        .unreq          src
        .unreq          src_stride
        .unreq          height
        .unreq          mx
        .unreq          my
        .unreq          width
        .unreq          tmp0
        .unreq          tmp1
endfunc

function ff_vvc_prof_grad_filter_8x_neon, export=1
        gh              .req x0
        gv              .req x1
        gstride         .req x2
        src             .req x3
        src_stride      .req x4
        width           .req w5
        height          .req w6
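        // Gradient filter: for every sample the loop below computes
        //     gh[x] = (src[x + 1] >> 6) - (src[x - 1] >> 6)
        //     gv[x] = (src[x + stride] >> 6) - (src[x - stride] >> 6)
        // eight samples per iteration.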

        lsl             src_stride, src_stride, #1
        neg             x7, src_stride
1:
        mov             x10, src
        mov             w11, width
        mov             x12, gh
        mov             x13, gv
2:
        ldur            q0, [x10, #2]
        ldur            q1, [x10, #-2]
        subs            w11, w11, #8
        ldr             q2, [x10, src_stride]
        ldr             q3, [x10, x7]
        sshr            v0.8h, v0.8h, #6
        sshr            v1.8h, v1.8h, #6
        sshr            v2.8h, v2.8h, #6
        sshr            v3.8h, v3.8h, #6
        sub             v0.8h, v0.8h, v1.8h
        sub             v2.8h, v2.8h, v3.8h
        st1             {v0.8h}, [x12], #16
        st1             {v2.8h}, [x13], #16
        add             x10, x10, #16
        b.ne            2b

        subs            height, height, #1
        add             gh, gh, gstride, lsl #1
        add             gv, gv, gstride, lsl #1
        add             src, src, src_stride
        b.ne            1b
        ret

        .unreq          gh
        .unreq          gv
        .unreq          gstride
        .unreq          src
        .unreq          src_stride
        .unreq          width
        .unreq          height
endfunc

.macro vvc_apply_bdof_block bit_depth
        dst             .req x0
        dst_stride      .req x1
        src0            .req x2
        src1            .req x3
        gh              .req x4
        gv              .req x5
        vx              .req x6
        vy              .req x7
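        // One 8x4 area is processed here: v0/v1 hold vx/vy for two
        // horizontally adjacent 4x4 sub-blocks (one value broadcast per
        // 64-bit half). Each output sample is roughly
        //     clip((src0 + src1 + vx * (gh0 - gh1) + vy * (gv0 - gv1)
        //           + rounding) >> (15 - bit_depth))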

        ld1r            {v0.8h}, [vx], #2
        ld1r            {v1.8h}, [vy], #2
        ld1r            {v2.8h}, [vx]
        ld1r            {v3.8h}, [vy]
        ins             v0.d[1], v2.d[1]
        ins             v1.d[1], v3.d[1]

        movi            v7.4s, #(1 << (14 - \bit_depth))
        ldp             x8, x9, [gh]
        ldp             x10, x11, [gv]
        mov             x12, #(BDOF_BLOCK_SIZE * 2)
        mov             w13, #(BDOF_MIN_BLOCK_SIZE)
        mov             x14, #(VVC_MAX_PB_SIZE * 2)
.if \bit_depth >= 10
        // clip pixel
        mov             w15, #((1 << \bit_depth) - 1)
        movi            v18.8h, #0
        lsl             dst_stride, dst_stride, #1
        dup             v19.8h, w15
.endif
1:
        ld1             {v2.8h}, [x8], x12
        ld1             {v3.8h}, [x9], x12
        ld1             {v4.8h}, [x10], x12
        ld1             {v5.8h}, [x11], x12
        sub             v2.8h, v2.8h, v3.8h
        sub             v4.8h, v4.8h, v5.8h
        smull           v3.4s, v0.4h, v2.4h
        smull2          v16.4s, v0.8h, v2.8h
        smlal           v3.4s, v1.4h, v4.4h
        smlal2          v16.4s, v1.8h, v4.8h

        ld1             {v5.8h}, [src0], x14
        ld1             {v6.8h}, [src1], x14
        saddl           v2.4s, v5.4h, v6.4h
        add             v2.4s, v2.4s, v7.4s
        add             v2.4s, v2.4s, v3.4s
        saddl2          v4.4s, v5.8h, v6.8h
        add             v4.4s, v4.4s, v7.4s
        add             v4.4s, v4.4s, v16.4s

        sqshrn          v5.4h, v2.4s, #(15 - \bit_depth)
        sqshrn2         v5.8h, v4.4s, #(15 - \bit_depth)
        subs            w13, w13, #1
.if \bit_depth == 8
        sqxtun          v5.8b, v5.8h
        str             d5, [dst]
        add             dst, dst, dst_stride
.else
        smin            v5.8h, v5.8h, v19.8h
        smax            v5.8h, v5.8h, v18.8h
        st1             {v5.8h}, [dst], dst_stride
.endif
        b.ne            1b
        ret

        .unreq          dst
        .unreq          dst_stride
        .unreq          src0
        .unreq          src1
        .unreq          gh
        .unreq          gv
        .unreq          vx
        .unreq          vy
.endm

function ff_vvc_apply_bdof_block_8_neon, export=1
        vvc_apply_bdof_block 8
endfunc

function ff_vvc_apply_bdof_block_10_neon, export=1
        vvc_apply_bdof_block 10
endfunc

function ff_vvc_apply_bdof_block_12_neon, export=1
        vvc_apply_bdof_block 12
endfunc

function ff_vvc_derive_bdof_vx_vy_neon, export=1
        src0            .req x0
        src1            .req x1
        pad_mask        .req w2
        gh              .req x3
        gv              .req x4
        vx              .req x5
        vy              .req x6

        gh0             .req x7
        gh1             .req x8
        gv0             .req x9
        gv1             .req x10
        y               .req x12

        sgx2            .req w7
        sgy2            .req w8
        sgxgy           .req w9
        sgxdi           .req w10
        sgydi           .req w11

        sgx2_v          .req v22
        sgy2_v          .req v23
        sgxgy_v         .req v24
        sgxdi_v         .req v25
        sgydi_v         .req v26

        sgx2_v2         .req v27
        sgy2_v2         .req v28
        sgxgy_v2        .req v29
        sgxdi_v2        .req v30
        sgydi_v2        .req v31
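        // Accumulates the BDOF correlation sums (sgx2, sgy2, sgxgy, sgxdi,
        // sgydi) over the padded window of two neighbouring 4x4 sub-blocks at
        // once (the *_v and *_v2 accumulator sets), honouring the pad_mask
        // border bits, then derives the clipped refinements vx and vy below.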

        ldp             gh0, gh1, [gh]
        ldp             gv0, gv1, [gv]
        movi            sgx2_v.4s, #0
        movi            sgy2_v.4s, #0
        movi            sgxgy_v.4s, #0
        movi            sgxdi_v.4s, #0
        movi            sgydi_v.4s, #0
        movi            sgx2_v2.4s, #0
        movi            sgy2_v2.4s, #0
        movi            sgxgy_v2.4s, #0
        movi            sgxdi_v2.4s, #0
        movi            sgydi_v2.4s, #0
        mov             x13, #-1                // dy
        movi            v6.4s, #0
        mov             y, #-1
        tbz             pad_mask, #1, 1f        // check pad top
        mov             x13, #0                 // dy: pad top
1:
        mov             x16, #-2                // dx
        add             x14, src0, x13, lsl #8  // local src0
        add             x15, src1, x13, lsl #8  // local src1
        add             x17, x16, x13, lsl #5
        ldr             q0, [x14, x16]
        ldr             q1, [x15, x16]
        ldr             q2, [gh0, x17]
        ldr             q3, [gh1, x17]
        ldr             q4, [gv0, x17]
        ldr             q5, [gv1, x17]
        add             x16, x16, #8
        add             x17, x17, #8
        ins             v0.s[3], v6.s[3]
        ins             v1.s[3], v6.s[3]
        ins             v2.s[3], v6.s[3]
        ins             v3.s[3], v6.s[3]
        ins             v4.s[3], v6.s[3]
        ins             v5.s[3], v6.s[3]

        ldr             q16, [x14, x16]
        ldr             q17, [x15, x16]
        ldr             q18, [gh0, x17]
        ldr             q19, [gh1, x17]
        ldr             q20, [gv0, x17]
        ldr             q21, [gv1, x17]
        ins             v16.s[3], v6.s[3]
        ins             v17.s[3], v6.s[3]
        ins             v18.s[3], v6.s[3]
        ins             v19.s[3], v6.s[3]
        ins             v20.s[3], v6.s[3]
        ins             v21.s[3], v6.s[3]

        tbz             pad_mask, #0, 20f
        // pad left
        ins             v0.h[0], v0.h[1]
        ins             v1.h[0], v1.h[1]
        ins             v2.h[0], v2.h[1]
        ins             v3.h[0], v3.h[1]
        ins             v4.h[0], v4.h[1]
        ins             v5.h[0], v5.h[1]
20:
        tbz             pad_mask, #2, 21f
        // pad right
        ins             v16.h[5], v16.h[4]
        ins             v17.h[5], v17.h[4]
        ins             v18.h[5], v18.h[4]
        ins             v19.h[5], v19.h[4]
        ins             v20.h[5], v20.h[4]
        ins             v21.h[5], v21.h[4]
21:
        sshr            v0.8h, v0.8h, #4
        sshr            v1.8h, v1.8h, #4
        add             v2.8h, v2.8h, v3.8h
        add             v4.8h, v4.8h, v5.8h
        sub             v0.8h, v0.8h, v1.8h     // diff
        sshr            v2.8h, v2.8h, #1        // temph
        sshr            v4.8h, v4.8h, #1        // tempv

        sshr            v16.8h, v16.8h, #4
        sshr            v17.8h, v17.8h, #4
        add             v18.8h, v18.8h, v19.8h
        add             v20.8h, v20.8h, v21.8h
        sub             v16.8h, v16.8h, v17.8h  // diff
        sshr            v18.8h, v18.8h, #1      // temph
        sshr            v20.8h, v20.8h, #1      // tempv

        abs             v3.8h, v2.8h
        abs             v5.8h, v4.8h
        uxtl            v19.4s, v3.4h
        uxtl            v21.4s, v5.4h
        uxtl2           v3.4s, v3.8h
        uxtl2           v5.4s, v5.8h
        add             v3.4s, v3.4s, v19.4s
        add             v5.4s, v5.4s, v21.4s
        add             sgx2_v.4s, sgx2_v.4s, v3.4s
        add             sgy2_v.4s, sgy2_v.4s, v5.4s

        abs             v3.8h, v18.8h
        abs             v5.8h, v20.8h
        uxtl            v19.4s, v3.4h
        uxtl            v21.4s, v5.4h
        uxtl2           v3.4s, v3.8h
        uxtl2           v5.4s, v5.8h
        add             v3.4s, v3.4s, v19.4s
        add             v5.4s, v5.4s, v21.4s
        add             sgx2_v2.4s, sgx2_v2.4s, v3.4s
        add             sgy2_v2.4s, sgy2_v2.4s, v5.4s

        cmgt            v17.8h, v4.8h, #0
        cmlt            v7.8h, v4.8h, #0
        cmgt            v19.8h, v20.8h, #0
        cmlt            v21.8h, v20.8h, #0
        sub             v17.8h, v7.8h, v17.8h   // VVC_SIGN(tempv)
        sub             v19.8h, v21.8h, v19.8h  // VVC_SIGN(tempv)

        smlal           sgxgy_v.4s, v17.4h, v2.4h
        smlal2          sgxgy_v.4s, v17.8h, v2.8h
        smlsl           sgydi_v.4s, v17.4h, v0.4h
        smlsl2          sgydi_v.4s, v17.8h, v0.8h

        cmgt            v3.8h, v2.8h, #0
        cmlt            v5.8h, v2.8h, #0
        cmgt            v17.8h, v18.8h, #0
        cmlt            v21.8h, v18.8h, #0
        sub             v3.8h, v5.8h, v3.8h     // VVC_SIGN(temph)
        sub             v17.8h, v21.8h, v17.8h  // VVC_SIGN(temph)

        smlal           sgxgy_v2.4s, v19.4h, v18.4h
        smlal2          sgxgy_v2.4s, v19.8h, v18.8h
        smlsl           sgydi_v2.4s, v19.4h, v16.4h
        smlsl2          sgydi_v2.4s, v19.8h, v16.8h

        smlsl           sgxdi_v.4s, v3.4h, v0.4h
        smlsl2          sgxdi_v.4s, v3.8h, v0.8h
        smlsl           sgxdi_v2.4s, v17.4h, v16.4h
        smlsl2          sgxdi_v2.4s, v17.8h, v16.8h
3:
        add             y, y, #1
        cmp             y, #(BDOF_MIN_BLOCK_SIZE)
        mov             x13, y
        b.gt            4f
        b.lt            1b
        tbz             pad_mask, #3, 1b
        sub             x13, x13, #1            // pad bottom
        b               1b
4:
        addv            s22, sgx2_v.4s
        addv            s23, sgy2_v.4s
        addv            s24, sgxgy_v.4s
        addv            s25, sgxdi_v.4s
        addv            s26, sgydi_v.4s

        mov             w3, #31
        mov             w16, #-15
        mov             w17, #15
40:
        mov             w14, #0

        mov             sgx2, v22.s[0]
        mov             sgy2, v23.s[0]
        mov             sgxgy, v24.s[0]
        mov             sgxdi, v25.s[0]
        mov             sgydi, v26.s[0]

        cbz             sgx2, 5f
        clz             w12, sgx2
        lsl             sgxdi, sgxdi, #2
        sub             w13, w3, w12            // log2(sgx2)
        asr             sgxdi, sgxdi, w13
        cmp             sgxdi, w16
        csel            w14, w16, sgxdi, lt     // clip to -15
        b.le            5f
        cmp             sgxdi, w17
        csel            w14, w17, sgxdi, gt     // clip to 15
5:
        strh            w14, [vx], #2

        mov             w15, #0
        cbz             sgy2, 6f
        lsl             sgydi, sgydi, #2
        smull           x14, w14, sgxgy
        asr             w14, w14, #1
        sub             sgydi, sgydi, w14
        clz             w12, sgy2
        sub             w13, w3, w12            // log2(sgy2)
        asr             sgydi, sgydi, w13
        cmp             sgydi, w16
        csel            w15, w16, sgydi, lt     // clip to -15
        b.le            6f
        cmp             sgydi, w17
        csel            w15, w17, sgydi, gt     // clip to 15
6:
        strh            w15, [vy], #2
        cbz             x0, 7f
        addv            s22, sgx2_v2.4s
        addv            s23, sgy2_v2.4s
        addv            s24, sgxgy_v2.4s
        addv            s25, sgxdi_v2.4s
        addv            s26, sgydi_v2.4s
        mov             x0, #0
        b               40b
7:
        ret

        .unreq          src0
        .unreq          src1
        .unreq          pad_mask
        .unreq          gh
        .unreq          gv
        .unreq          vx
        .unreq          vy
        .unreq          sgx2
        .unreq          sgy2
        .unreq          sgxgy
        .unreq          sgxdi
        .unreq          sgydi
        .unreq          sgx2_v
        .unreq          sgy2_v
        .unreq          sgxgy_v
        .unreq          sgxdi_v
        .unreq          sgydi_v
        .unreq          sgx2_v2
        .unreq          sgy2_v2
        .unreq          sgxgy_v2
        .unreq          sgxdi_v2
        .unreq          sgydi_v2
        .unreq          y
endfunc