1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-11-23 21:54:53 +02:00

avcodec/aarch64/vvc: Optimize apply_bdof

Before this patch, prof_grad_filter calculated
gh[0], gh[1], gv[0], gv[1] and saved them to the stack.

derive_bdof_vx_vy then loaded them from the stack and calculated
gh[0] + gh[1], gv[0] + gv[1].

apply_bdof_min_block also loaded them from the stack and calculated
gh[0] - gh[1], gv[0] - gv[1].

This patch adds bdof_grad_filter, which calculates gh[0] + gh[1],
gh[0] - gh[1], gv[0] + gv[1], gv[0] - gv[1], and saves them to the
stack, so derive_bdof_vx_vy and apply_bdof_min_block can use the
results directly.

prof_grad_filter is kept for reuse by other functions in the future.

Benchmark on rpi5 with gcc 12
                               Before               After
--------------------------------------------------------------------
apply_bdof_8_8x16_c:       |   7431.4 ( 1.00x)   |   7371.7 ( 1.00x)
apply_bdof_8_8x16_neon:    |   1175.4 ( 6.32x)   |   1036.3 ( 7.11x)
apply_bdof_8_16x8_c:       |   7182.2 ( 1.00x)   |   7201.1 ( 1.00x)
apply_bdof_8_16x8_neon:    |   1021.7 ( 7.03x)   |    879.9 ( 8.18x)
apply_bdof_8_16x16_c:      |  14577.1 ( 1.00x)   |  14589.3 ( 1.00x)
apply_bdof_8_16x16_neon:   |   2012.8 ( 7.24x)   |   1743.3 ( 8.37x)
apply_bdof_10_8x16_c:      |   7292.4 ( 1.00x)   |   7308.5 ( 1.00x)
apply_bdof_10_8x16_neon:   |   1156.3 ( 6.31x)   |   1045.3 ( 6.99x)
apply_bdof_10_16x8_c:      |   7112.4 ( 1.00x)   |   7214.4 ( 1.00x)
apply_bdof_10_16x8_neon:   |   1007.6 ( 7.06x)   |    904.8 ( 7.97x)
apply_bdof_10_16x16_c:     |  14363.3 ( 1.00x)   |  14476.4 ( 1.00x)
apply_bdof_10_16x16_neon:  |   1986.9 ( 7.23x)   |   1783.1 ( 8.12x)
apply_bdof_12_8x16_c:      |   7433.3 ( 1.00x)   |   7374.7 ( 1.00x)
apply_bdof_12_8x16_neon:   |   1155.9 ( 6.43x)   |   1040.8 ( 7.09x)
apply_bdof_12_16x8_c:      |   7171.1 ( 1.00x)   |   7376.3 ( 1.00x)
apply_bdof_12_16x8_neon:   |   1010.8 ( 7.09x)   |    899.4 ( 8.20x)
apply_bdof_12_16x16_c:     |  14515.5 ( 1.00x)   |  14731.5 ( 1.00x)
apply_bdof_12_16x16_neon:  |   1988.4 ( 7.30x)   |   1785.2 ( 8.25x)
This commit is contained in:
Zhao Zhili
2025-08-14 12:42:38 +08:00
committed by Zhao Zhili
parent 2e92417603
commit 6ce02bcc3a
3 changed files with 370 additions and 138 deletions

View File

@@ -30,38 +30,16 @@
#define BDOF_BLOCK_SIZE 16
#define BDOF_MIN_BLOCK_SIZE 4
/*
 * Gradient filter implemented in NEON assembly. It writes the horizontal and
 * vertical gradients of _src into gradient_h and gradient_v.
 * NOTE(review): the unit of gradient_stride/src_stride (elements vs. bytes)
 * is not visible from this declaration - confirm against the assembly.
 */
void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
int16_t *gradient_v,
ptrdiff_t gradient_stride,
const int16_t *_src,
ptrdiff_t src_stride,
int width, int height);
/*
 * NEON assembly routine, 8-sample-wide variant. Takes both prediction
 * sources plus the two-entry gradient pointer arrays; vx and vy are the
 * non-const 16-entry arrays passed alongside them.
 */
void ff_vvc_derive_bdof_vx_vy_8x_neon(const int16_t *_src0,
const int16_t *_src1,
int16_t *const gradient_h[2],
int16_t *const gradient_v[2],
int16_t vx[16], int16_t vy[16],
int block_h);
/* 16-sample-wide counterpart of ff_vvc_derive_bdof_vx_vy_8x_neon. */
void ff_vvc_derive_bdof_vx_vy_16x_neon(const int16_t *_src0,
const int16_t *_src1,
int16_t *const gradient_h[2],
int16_t *const gradient_v[2],
int16_t vx[16], int16_t vy[16],
int block_h);
#define BIT_DEPTH 8
#include "alf_template.c"
#include "of_template.c"
#undef BIT_DEPTH
#define BIT_DEPTH 10
#include "alf_template.c"
#include "of_template.c"
#undef BIT_DEPTH
#define BIT_DEPTH 12
#include "alf_template.c"
#include "of_template.c"
#undef BIT_DEPTH
int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy,
@@ -121,6 +99,15 @@ DMVR_FUN(hv_, 8)
DMVR_FUN(hv_, 10)
DMVR_FUN(hv_, 12)
/*
 * Declare the exported assembly entry point ff_vvc_apply_bdof_<bd>_neon for
 * one bit depth. The three instantiations below cover the 8-, 10- and 12-bit
 * entry points that are assigned to c->inter.apply_bdof.
 */
#define APPLY_BDOF_FUNC(bd) \
void ff_vvc_apply_bdof_ ## bd ## _neon(uint8_t *_dst, ptrdiff_t _dst_stride, \
const int16_t *_src0, const int16_t *_src1, \
int block_w, int block_h);
APPLY_BDOF_FUNC(8)
APPLY_BDOF_FUNC(10)
APPLY_BDOF_FUNC(12)
void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
{
int cpu_flags = av_get_cpu_flags();
@@ -202,7 +189,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.w_avg = vvc_w_avg_8;
c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon;
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
c->inter.apply_bdof = apply_bdof_8;
c->inter.apply_bdof = ff_vvc_apply_bdof_8_neon;
c->sao.band_filter[0] = ff_h26x_sao_band_filter_8x8_8_neon;
for (int i = 1; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
@@ -246,7 +233,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.avg = ff_vvc_avg_10_neon;
c->inter.w_avg = vvc_w_avg_10;
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
c->inter.apply_bdof = apply_bdof_10;
c->inter.apply_bdof = ff_vvc_apply_bdof_10_neon;
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
@@ -255,7 +242,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.w_avg = vvc_w_avg_12;
c->inter.dmvr[0][0] = ff_vvc_dmvr_12_neon;
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
c->inter.apply_bdof = apply_bdof_12;
c->inter.apply_bdof = ff_vvc_apply_bdof_12_neon;
c->alf.filter[LUMA] = alf_filter_luma_12_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;

View File

@@ -716,7 +716,93 @@ function ff_vvc_prof_grad_filter_8x_neon, export=1
.unreq height
endfunc
.macro vvc_apply_bdof_block bit_depth
/*
 * Compute the gradients of two prediction blocks and store them already
 * combined, so that later stages do not have to add/subtract them again.
 *
 * For every 16-bit sample the horizontal gradient is
 *     (src[x + 1] >> 6) - (src[x - 1] >> 6)
 * and the vertical gradient is
 *     (src[y + 1] >> 6) - (src[y - 1] >> 6)
 * with source rows (VVC_MAX_PB_SIZE << 1) bytes apart.
 *
 * x0: gh0, receives (gradient_h0 + gradient_h1) >> 1
 * x1: gh1, receives  gradient_h0 - gradient_h1
 * x2: gv0, receives (gradient_v0 + gradient_v1) >> 1
 * x3: gv1, receives  gradient_v0 - gradient_v1
 * x4: src0
 * x5: src1
 * w6: width,  the inner loop handles 8 columns per iteration and only stops
 *             when the counter reaches exactly zero, so it must be a
 *             non-zero multiple of 8
 * w7: height, must be non-zero (the row loop stops on exactly zero)
 *
 * Output rows are (BDOF_BLOCK_SIZE << 1) bytes apart.
 * Uses x10-x16 and v0-v7 as scratch and advances x0-x5 / w7.
 */
function vvc_bdof_grad_filter_8x_neon, export=0
gh0 .req x0
gh1 .req x1
gv0 .req x2
gv1 .req x3
src0 .req x4
src1 .req x5
width .req w6
height .req w7
1:
// Per-row setup: working copies of the row pointers and the column counter.
mov x10, src0
mov w11, width
mov x12, gh0
mov x13, gv0
mov x14, src1
mov x15, gh1
mov x16, gv1
2:
// src0 neighbours: right (+2 bytes), left (-2 bytes), below and above.
ldur q0, [x10, #2]
ldur q1, [x10, #-2]
ldr q2, [x10, #(VVC_MAX_PB_SIZE << 1)]
ldr q3, [x10, #-(VVC_MAX_PB_SIZE << 1)]
sshr v0.8h, v0.8h, #6
sshr v1.8h, v1.8h, #6
// src1 neighbours, same pattern as for src0.
ldur q4, [x14, #2]
ldur q5, [x14, #-2]
sshr v2.8h, v2.8h, #6
sshr v3.8h, v3.8h, #6
ldr q6, [x14, #(VVC_MAX_PB_SIZE << 1)]
ldr q7, [x14, #-(VVC_MAX_PB_SIZE << 1)]
// results of gradient_h0
sub v0.8h, v0.8h, v1.8h
// results of gradient_v0
sub v2.8h, v2.8h, v3.8h
sshr v4.8h, v4.8h, #6
sshr v5.8h, v5.8h, #6
sshr v6.8h, v6.8h, #6
sshr v7.8h, v7.8h, #6
// results of gradient_h1
sub v4.8h, v4.8h, v5.8h
// results of gradient_v1
sub v6.8h, v6.8h, v7.8h
add x10, x10, #16
add x14, x14, #16
// (gradient_h0 + gradient_h1) >> 1
shadd v1.8h, v0.8h, v4.8h
// gradient_h0 - gradient_h1
sub v5.8h, v0.8h, v4.8h
// Column counter; the flags set here are consumed by the b.ne 2b below
// (the instructions in between do not modify the flags).
subs w11, w11, #8
// (gradient_v0 + gradient_v1) >> 1
shadd v3.8h, v2.8h, v6.8h
// gradient_v0 - gradient_v1
sub v7.8h, v2.8h, v6.8h
// Sums go to gh0/gv0 (x12/x13), differences to gh1/gv1 (x15/x16).
st1 {v1.8h}, [x12], #16
st1 {v5.8h}, [x15], #16
st1 {v3.8h}, [x13], #16
st1 {v7.8h}, [x16], #16
b.ne 2b
// Row counter; the following adds leave the flags intact for b.ne 1b.
subs height, height, #1
add gh0, gh0, #(BDOF_BLOCK_SIZE << 1)
add gv0, gv0, #(BDOF_BLOCK_SIZE << 1)
add src0, src0, #(VVC_MAX_PB_SIZE << 1)
add gh1, gh1, #(BDOF_BLOCK_SIZE << 1)
add gv1, gv1, #(BDOF_BLOCK_SIZE << 1)
add src1, src1, #(VVC_MAX_PB_SIZE << 1)
b.ne 1b
ret
.unreq gh0
.unreq gh1
.unreq gv0
.unreq gv1
.unreq src0
.unreq src1
.unreq width
.unreq height
endfunc
.macro vvc_apply_bdof_block_8x bit_depth
dst .req x0
dst_stride .req x1
src0 .req x2
@@ -726,33 +812,28 @@ endfunc
vx .req x6
vy .req x7
ld1r {v0.8h}, [vx], #2
ld1r {v1.8h}, [vy], #2
ld1r {v2.8h}, [vx]
ld1r {v3.8h}, [vy]
ins v0.d[1], v2.d[1]
ins v1.d[1], v3.d[1]
ldr w8, [sp]
movi v7.4s, #(1 << (14 - \bit_depth))
ldp x8, x9, [gh]
ldp x10, x11, [gv]
mov x12, #(BDOF_BLOCK_SIZE * 2)
mov w13, #(BDOF_MIN_BLOCK_SIZE)
mov x14, #(VVC_MAX_PB_SIZE * 2)
.if \bit_depth >= 10
// clip pixel
mov w15, #((1 << \bit_depth) - 1)
movi v18.8h, #0
lsl dst_stride, dst_stride, #1
dup v19.8h, w15
.endif
0:
ld1r {v0.8h}, [vx], #2
ld1r {v1.8h}, [vy], #2
ld1r {v2.8h}, [vx]
ld1r {v3.8h}, [vy]
mov w13, #(BDOF_MIN_BLOCK_SIZE)
ins v0.d[1], v2.d[1]
ins v1.d[1], v3.d[1]
1:
ld1 {v2.8h}, [x8], x12
ld1 {v3.8h}, [x9], x12
ld1 {v4.8h}, [x10], x12
ld1 {v5.8h}, [x11], x12
sub v2.8h, v2.8h, v3.8h
sub v4.8h, v4.8h, v5.8h
ld1 {v2.8h}, [gh], x12
ld1 {v4.8h}, [gv], x12
smull v3.4s, v0.4h, v2.4h
smull2 v16.4s, v0.8h, v2.8h
smlal v3.4s, v1.4h, v4.4h
@@ -780,6 +861,11 @@ endfunc
st1 {v5.8h}, [dst], dst_stride
.endif
b.ne 1b
subs w8, w8, #(BDOF_MIN_BLOCK_SIZE)
add vx, vx, #(2 * BDOF_MIN_BLOCK_SIZE - 2)
add vy, vy, #(2 * BDOF_MIN_BLOCK_SIZE - 2)
b.ne 0b
ret
.unreq dst
@@ -792,16 +878,128 @@ endfunc
.unreq vy
.endm
function ff_vvc_apply_bdof_block_8_neon, export=1
vvc_apply_bdof_block 8
function vvc_apply_bdof_block_8x_8_neon, export=0
vvc_apply_bdof_block_8x 8
endfunc
function ff_vvc_apply_bdof_block_10_neon, export=1
vvc_apply_bdof_block 10
function vvc_apply_bdof_block_8x_10_neon, export=0
vvc_apply_bdof_block_8x 10
endfunc
function ff_vvc_apply_bdof_block_12_neon, export=1
vvc_apply_bdof_block 12
function vvc_apply_bdof_block_8x_12_neon, export=0
vvc_apply_bdof_block_8x 12
endfunc
/*
 * Body of the 16-column BDOF output stage for one bit depth.
 *
 * For each output sample it computes
 *     (src0 + src1 + (1 << (14 - bit_depth)) + vx * gh + vy * gv)
 *         >> (15 - bit_depth)
 * with a saturating narrowing shift, then clips to the pixel range and
 * stores 16 pixels per row.
 *
 * x0:   dst
 * x1:   dst_stride, added to dst unchanged after every row
 * x2:   src0, rows (VVC_MAX_PB_SIZE * 2) bytes apart
 * x3:   src1, rows (VVC_MAX_PB_SIZE * 2) bytes apart
 * x4:   gh, rows (BDOF_BLOCK_SIZE * 2) bytes apart
 * x5:   gv, rows (BDOF_BLOCK_SIZE * 2) bytes apart
 * x6:   vx, four 16-bit values consumed per group of BDOF_MIN_BLOCK_SIZE rows
 * x7:   vy, same layout as vx
 * [sp]: 32-bit row count; the outer loop only stops when it reaches exactly
 *       zero, so it must be a non-zero multiple of BDOF_MIN_BLOCK_SIZE
 *
 * Uses w8, w13, w15 and v0-v7, v16, v18-v27 as scratch.
 */
.macro vvc_apply_bdof_block_16x bit_depth
dst .req x0
dst_stride .req x1
src0 .req x2
src1 .req x3
gh .req x4
gv .req x5
vx .req x6
vy .req x7
// Row count is passed on the stack.
ldr w8, [sp]
// Rounding offset added before the final shift.
movi v7.4s, #(1 << (14 - \bit_depth))
.if \bit_depth >= 10
// clip pixel
mov w15, #((1 << \bit_depth) - 1)
movi v18.8h, #0
dup v19.8h, w15
.endif
0:
// Load four vx/vy values and spread each over four adjacent lanes:
// v0/v1 hold vx/vy for columns 0-7, v20/v21 for columns 8-15.
ld1r {v0.8h}, [vx], #2
ld1r {v1.8h}, [vy], #2
ld1r {v2.8h}, [vx], #2
ld1r {v3.8h}, [vy], #2
mov w13, #(BDOF_MIN_BLOCK_SIZE)
ld1r {v20.8h}, [vx], #2
ld1r {v21.8h}, [vy], #2
ld1r {v22.8h}, [vx], #2
ld1r {v23.8h}, [vy], #2
ins v0.d[1], v2.d[1]
ins v1.d[1], v3.d[1]
ins v20.d[1], v22.d[1]
ins v21.d[1], v23.d[1]
1:
// One row: 16 gh and 16 gv values.
ldp q2, q22, [gh], #(BDOF_BLOCK_SIZE * 2)
ldp q4, q24, [gv], #(BDOF_BLOCK_SIZE * 2)
// vx * gh + vy * gv, widened to 32 bits (columns 0-7).
smull v3.4s, v0.4h, v2.4h
smull2 v16.4s, v0.8h, v2.8h
smlal v3.4s, v1.4h, v4.4h
smlal2 v16.4s, v1.8h, v4.8h
ldp q5, q25, [src0], #(VVC_MAX_PB_SIZE * 2)
ldp q6, q26, [src1], #(VVC_MAX_PB_SIZE * 2)
// vx * gh + vy * gv, widened to 32 bits (columns 8-15).
smull v23.4s, v20.4h, v22.4h
smull2 v27.4s, v20.8h, v22.8h
smlal v23.4s, v21.4h, v24.4h
smlal2 v27.4s, v21.8h, v24.8h
// src0 + src1 + rounding offset + gradient term, in four groups of 4 lanes.
saddl v2.4s, v5.4h, v6.4h
add v2.4s, v2.4s, v7.4s
add v2.4s, v2.4s, v3.4s
saddl2 v4.4s, v5.8h, v6.8h
add v4.4s, v4.4s, v7.4s
add v4.4s, v4.4s, v16.4s
saddl v22.4s, v25.4h, v26.4h
add v22.4s, v22.4s, v7.4s
add v22.4s, v22.4s, v23.4s
saddl2 v24.4s, v25.8h, v26.8h
add v24.4s, v24.4s, v7.4s
add v24.4s, v24.4s, v27.4s
// Saturating shift back down to 16 bits.
sqshrn v5.4h, v2.4s, #(15 - \bit_depth)
sqshrn2 v5.8h, v4.4s, #(15 - \bit_depth)
sqshrn v25.4h, v22.4s, #(15 - \bit_depth)
sqshrn2 v25.8h, v24.4s, #(15 - \bit_depth)
// Row counter within the group; flags are consumed by b.ne 1b below.
subs w13, w13, #1
.if \bit_depth == 8
// Saturate to unsigned 8 bit and store 16 one-byte pixels.
sqxtun v5.8b, v5.8h
sqxtun2 v5.16b, v25.8h
str q5, [dst]
.else
// Clamp to [0, (1 << bit_depth) - 1] and store 16 two-byte pixels.
smin v5.8h, v5.8h, v19.8h
smax v5.8h, v5.8h, v18.8h
smin v25.8h, v25.8h, v19.8h
smax v25.8h, v25.8h, v18.8h
stp q5, q25, [dst]
.endif
add dst, dst, dst_stride
b.ne 1b
// Next group of BDOF_MIN_BLOCK_SIZE rows.
subs w8, w8, #(BDOF_MIN_BLOCK_SIZE)
b.ne 0b
ret
.unreq dst
.unreq dst_stride
.unreq src0
.unreq src1
.unreq gh
.unreq gv
.unreq vx
.unreq vy
.endm
// File-local (export=0) instantiations of the vvc_apply_bdof_block_16x macro,
// one per supported bit depth. The macro body supplies the final ret.
function vvc_apply_bdof_block_16x_8_neon, export=0
vvc_apply_bdof_block_16x 8
endfunc
function vvc_apply_bdof_block_16x_10_neon, export=0
vvc_apply_bdof_block_16x 10
endfunc
function vvc_apply_bdof_block_16x_12_neon, export=0
vvc_apply_bdof_block_16x 12
endfunc
const bdof_vx_vy_8x_tbl
@@ -919,18 +1117,16 @@ endconst
* ----------------------------------------------------------------------
* x0: const int16_t *_src0,
* x1: const int16_t *_src1,
* x2: int16_t *const gradient_h[2],
* x3: int16_t *const gradient_v[2],
* x2: const int16_t *gradient_h,
* x3: const int16_t *gradient_v,
* x4: int16_t vx[16],
* x5: int16_t vy[16],
* w6: int block_h
*/
function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
function vvc_derive_bdof_vx_vy_8x_neon, export=0
stp d11, d10, [sp, #-0x20]!
stp d9, d8, [sp, #0x10]
ldp x14, x13, [x2] // gh0, gh1
ldp x10, x9, [x3] // gv0, gv1
movrel x11, bdof_vx_vy_8x_tbl
ldr q0, [x11] // table
mvni v2.4s, #30 // -31, for log2
@@ -998,17 +1194,13 @@ function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
9:
ldr q28, [x0] // src0
ldr q29, [x1] // src1
ldr q30, [x14], #(BDOF_BLOCK_SIZE * 2) // gh0
ldr q31, [x13], #(BDOF_BLOCK_SIZE * 2) // gh1
ldr q8, [x10], #(BDOF_BLOCK_SIZE * 2) // gv0
ldr q9, [x9], #(BDOF_BLOCK_SIZE * 2) // gv1
ldr q30, [x2], #(BDOF_BLOCK_SIZE * 2) // (gh0 + gh1) >> 1
ldr q31, [x3], #(BDOF_BLOCK_SIZE * 2) // (gv0 + gv1) >> 1
add x0, x0, #(VVC_MAX_PB_SIZE * 2)
add x1, x1, #(VVC_MAX_PB_SIZE * 2)
sshr v28.8h, v28.8h, #0x4
sshr v29.8h, v29.8h, #0x4
shadd v30.8h, v30.8h, v31.8h // tmph
shadd v31.8h, v8.8h, v9.8h // tmpv
sub v8.8h, v28.8h, v29.8h // diff
abs v28.8h, v30.8h
@@ -1067,20 +1259,18 @@ endfunc
/*
* x0: const int16_t *_src0,
* x1: const int16_t *_src1,
* x2: int16_t *const gradient_h[2],
* x3: int16_t *const gradient_v[2],
* x2: const int16_t *gradient_h,
* x3: const int16_t *gradient_v,
* x4: int16_t vx[16],
* x5: int16_t vy[16],
* w6: int block_h
*/
function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
function vvc_derive_bdof_vx_vy_16x_neon, export=0
stp d15, d14, [sp, #-0x40]!
stp d13, d12, [sp, #0x10]
stp d11, d10, [sp, #0x20]
stp d9, d8, [sp, #0x30]
ldp x8, x9, [x2] // gh0, gh1
ldp x10, x11, [x3] // gv0, gv1
movrel x12, bdof_vx_vy_16x_tbl
ldp q0, q1, [x12] // table
mov w13, w6 // y = block_h
@@ -1142,17 +1332,11 @@ function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
sshr v31.8h, v29.8h, #0x4
ld1 {v8.8h, v9.8h}, [x1] // src1
sshr v10.8h, v8.8h, #0x4
ld1 {v11.8h, v12.8h}, [x8], #32 // gh0
ldp q13, q8, [x2], #32 // (gh0 + gh1) >> 1
sshr v29.8h, v30.8h, #0x4
sshr v30.8h, v9.8h, #0x4
ld1 {v8.8h, v9.8h}, [x9], #32 // gh1
shadd v13.8h, v11.8h, v8.8h // (gh0 + gh1) >> 1, left half
ld1 {v14.8h, v15.8h}, [x10], #32 // gv0
ld1 {v3.8h, v4.8h}, [x11], #32 // gv1
shadd v5.8h, v14.8h, v3.8h // (gv0 + gv1) >> 1, left half
ldp q5, q3, [x3], #32 // (gv0 + gv1) >> 1
sub v31.8h, v31.8h, v10.8h // diff, left half
shadd v8.8h, v12.8h, v9.8h // (gh0 + gh1) >> 1, right half
shadd v3.8h, v15.8h, v4.8h // (gv0 + gv1) >> 1, right half
sub v4.8h, v29.8h, v30.8h // diff, right half
abs v29.8h, v13.8h
@@ -1219,3 +1403,129 @@ function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
ldp d15, d14, [sp], #0x40
ret
endfunc
/*
 * void ff_vvc_apply_bdof_{8,10,12}_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
 *                                       const int16_t *_src0,
 *                                       const int16_t *_src1,
 *                                       int block_w, int block_h);
 *
 * x0: dst, x1: dst_stride, x2: src0, x3: src1, w4: block_w, w5: block_h
 *
 * The three entry points differ only in the bit depth they put into w6.
 * The 10- and 12-bit entries branch forward to the shared body at local
 * label 0 inside ff_vvc_apply_bdof_8_neon.
 */
function ff_vvc_apply_bdof_10_neon, export=1
mov w6, #10
b 0f
endfunc
function ff_vvc_apply_bdof_12_neon, export=1
mov w6, #12
b 0f
endfunc
// Stack frame layout (byte offsets from sp after the frame is allocated):
// int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2]
// int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2]
// int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
// The "+ 2" in each GRADIENT_*_OFFSET skips the first 16-bit element of the
// corresponding buffer.
// The H0/V0 buffers receive the halved gradient sums and the H1/V1 buffers
// the gradient differences written by vvc_bdof_grad_filter_8x_neon.
#define APPLY_BDOF_STACK_SIZE ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 4)
#define GRADIENT_H0_OFFSET 2
#define GRADIENT_H1_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 2 + 2)
#define GRADIENT_V0_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 4 + 2)
#define GRADIENT_V1_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 6 + 2)
#define VX_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8)
#define VY_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 2)
function ff_vvc_apply_bdof_8_neon, export=1
mov w6, #8
0:
// Shared body. Save x19-x25, which keep the arguments alive across the
// calls below, and x30, which every bl overwrites.
stp x19, x20, [sp, #-0x40]!
stp x21, x22, [sp, #0x10]
stp x23, x24, [sp, #0x20]
stp x25, x30, [sp, #0x30]
sub sp, sp, #APPLY_BDOF_STACK_SIZE
mov w19, w6 // bit_depth
mov x20, x0 // dst
mov x21, x1 // dst_stride
mov x22, x2 // src0
mov x23, x3 // src1
mov w24, w4 // block_w
mov w25, w5 // block_h
// Step 1: fill the four gradient buffers.
// int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
add x0, sp, #GRADIENT_H0_OFFSET
add x1, sp, #GRADIENT_H1_OFFSET
add x2, sp, #GRADIENT_V0_OFFSET
add x3, sp, #GRADIENT_V1_OFFSET
mov x4, x22
mov x5, x23
mov w6, w24
mov w7, w25
bl vvc_bdof_grad_filter_8x_neon
// Step 2: derive vx/vy from the sources and the H0/V0 buffers.
// The flags from this compare are consumed by b.gt 16f below; the mov/add
// instructions in between do not modify them.
cmp w24, #8
mov x0, x22 // src0
mov x1, x23 // src1
add x2, sp, #GRADIENT_H0_OFFSET // gh0
add x3, sp, #GRADIENT_V0_OFFSET // gv0
add x4, sp, #VX_OFFSET // vx
add x5, sp, #VY_OFFSET // vy
mov w6, w25 // block_h
b.gt 16f
// block_w <= 8
bl vvc_derive_bdof_vx_vy_8x_neon
// Step 3: produce the output pixels from the sources, the H1/V1 buffers and
// vx/vy. The compare result selects the bit-depth variant after the
// arguments are set up.
cmp w19, #10 // check bitdepth
mov x0, x20 // dst
mov x1, x21 // dst_stride
mov x2, x22 // src0
mov x3, x23 // src1
add x4, sp, #GRADIENT_H1_OFFSET // gh1
add x5, sp, #GRADIENT_V1_OFFSET // gv1
add x6, sp, #VX_OFFSET
add x7, sp, #VY_OFFSET
// block_h is handed to the block routine through [sp].
// NOTE(review): this 4-byte store overlaps the first element of the H0
// gradient buffer (offset 2). H0 is only passed to the derive routine, which
// has already returned at this point.
str w25, [sp]
b.eq 1f
b.gt 2f
// 8bit
0:
bl vvc_apply_bdof_block_8x_8_neon
b 32f
1:
// 10bit
bl vvc_apply_bdof_block_8x_10_neon
b 32f
2:
// 12bit
bl vvc_apply_bdof_block_8x_12_neon
b 32f
16:
// block_w > 8: same sequence as above using the 16-column routines.
bl vvc_derive_bdof_vx_vy_16x_neon
cmp w19, #10 // check bitdepth
mov x0, x20 // dst
mov x1, x21 // dst_stride
mov x2, x22 // src0
mov x3, x23 // src1
add x4, sp, #GRADIENT_H1_OFFSET // gh1
add x5, sp, #GRADIENT_V1_OFFSET // gv1
add x6, sp, #VX_OFFSET
add x7, sp, #VY_OFFSET
// Row count for the block routine, see the note on the 8-column path.
str w25, [sp]
b.eq 17f
b.gt 18f
// 8bit
bl vvc_apply_bdof_block_16x_8_neon
b 32f
17:
// 10bit
bl vvc_apply_bdof_block_16x_10_neon
b 32f
18:
// 12bit
bl vvc_apply_bdof_block_16x_12_neon
32:
// Common epilogue: release the frame and restore the saved registers.
add sp, sp, #APPLY_BDOF_STACK_SIZE
ldp x25, x30, [sp, #0x30]
ldp x23, x24, [sp, #0x20]
ldp x21, x22, [sp, #0x10]
ldp x19, x20, [sp], #0x40
ret
endfunc
// The layout macros are only meaningful for the function above.
#undef APPLY_BDOF_STACK_SIZE
#undef GRADIENT_H0_OFFSET
#undef GRADIENT_H1_OFFSET
#undef GRADIENT_V0_OFFSET
#undef GRADIENT_V1_OFFSET
#undef VX_OFFSET
#undef VY_OFFSET

View File

@@ -1,65 +0,0 @@
/*
* Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/bit_depth_template.c"
void FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(pixel* dst,
ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1,
const int16_t **gh, const int16_t **gv, int16_t *vx, int16_t *vy);
static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
const int16_t *_src0, const int16_t *_src1,
int block_w, int block_h) {
// +2 for pad left and right
int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
int16_t *gradient_v[2] = {&gradient_buf_v[0][1], &gradient_buf_v[1][1]};
ptrdiff_t dst_stride = _dst_stride / sizeof(pixel);
pixel *dst = (pixel *) _dst;
ff_vvc_prof_grad_filter_8x_neon(gradient_h[0], gradient_v[0],
BDOF_BLOCK_SIZE,
_src0, MAX_PB_SIZE, block_w, block_h);
ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1],
BDOF_BLOCK_SIZE,
_src1, MAX_PB_SIZE, block_w, block_h);
int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
if (block_w == 8)
ff_vvc_derive_bdof_vx_vy_8x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
else
ff_vvc_derive_bdof_vx_vy_16x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE * 2) {
const int16_t *src0 = _src0 + y * MAX_PB_SIZE + x;
const int16_t *src1 = _src1 + y * MAX_PB_SIZE + x;
pixel *d = dst + x;
int idx = BDOF_BLOCK_SIZE * y + x;
const int16_t *gh[] = {gradient_h[0] + idx, gradient_h[1] + idx};
const int16_t *gv[] = {gradient_v[0] + idx, gradient_v[1] + idx};
int idx1 = y + x / BDOF_MIN_BLOCK_SIZE;
FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(d, dst_stride,
src0, src1, gh, gv,
vx + idx1, vy + idx1);
}
dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
}
}