mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-02-04 06:08:26 +02:00
aarch64/vvc: Add apply_bdof
Tested on a Raspberry Pi 5 with gcc 12:
apply_bdof_8_8x16_c: 7315.2 ( 1.00x)
apply_bdof_8_8x16_neon: 1876.8 ( 3.90x)
apply_bdof_8_16x8_c: 7170.5 ( 1.00x)
apply_bdof_8_16x8_neon: 1752.8 ( 4.09x)
apply_bdof_8_16x16_c: 14695.2 ( 1.00x)
apply_bdof_8_16x16_neon: 3490.5 ( 4.21x)
apply_bdof_10_8x16_c: 7371.5 ( 1.00x)
apply_bdof_10_8x16_neon: 1863.8 ( 3.96x)
apply_bdof_10_16x8_c: 7172.0 ( 1.00x)
apply_bdof_10_16x8_neon: 1766.0 ( 4.06x)
apply_bdof_10_16x16_c: 14551.5 ( 1.00x)
apply_bdof_10_16x16_neon: 3576.0 ( 4.07x)
apply_bdof_12_8x16_c: 7236.5 ( 1.00x)
apply_bdof_12_8x16_neon: 1863.8 ( 3.88x)
apply_bdof_12_16x8_c: 7316.5 ( 1.00x)
apply_bdof_12_16x8_neon: 1758.8 ( 4.16x)
apply_bdof_12_16x16_c: 14691.2 ( 1.00x)
apply_bdof_12_16x16_neon: 3480.5 ( 4.22x)
This commit is contained in:
parent
7aeae8d1ae
commit
952508ae05
@ -27,16 +27,34 @@
|
||||
#include "libavcodec/vvc/dec.h"
|
||||
#include "libavcodec/vvc/ctu.h"
|
||||
|
||||
#define BDOF_BLOCK_SIZE 16
|
||||
#define BDOF_MIN_BLOCK_SIZE 4
|
||||
|
||||
/**
 * NEON gradient filter (implemented in assembly).
 *
 * For every sample of a width x height block it stores
 *   gradient_h[x] = (src[x + 1] >> 6)          - (src[x - 1] >> 6)
 *   gradient_v[x] = (src[x + src_stride] >> 6) - (src[x - src_stride] >> 6)
 *
 * @param gradient_h      output, horizontal gradients
 * @param gradient_v      output, vertical gradients
 * @param gradient_stride row pitch of both outputs, in int16_t elements
 * @param _src            top-left sample of the block; the samples one
 *                        column/row outside the block on every side are read
 * @param src_stride      row pitch of _src, in int16_t elements
 * @param width           block width; consumed 8 samples at a time, so it
 *                        has to be a positive multiple of 8
 * @param height          block height, at least 1
 */
void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
                                     int16_t *gradient_v,
                                     ptrdiff_t gradient_stride,
                                     const int16_t *_src,
                                     ptrdiff_t src_stride,
                                     int width,
                                     int height);
|
||||
|
||||
/**
 * NEON derivation of the BDOF motion refinement (implemented in assembly).
 *
 * Handles two horizontally adjacent 4x4 sub-blocks in one call and writes
 * one refinement per sub-block: vx[0]/vy[0] for the left one, vx[1]/vy[1]
 * for the right one. Every value is clipped to [-15, 15].
 *
 * @param _src0      first prediction, top-left sample of the left sub-block
 *                   (the assembly steps rows by 256 bytes)
 * @param _src1      second prediction, same layout as _src0
 * @param pad_mask   bit 0: pad left, bit 1: pad top, bit 2: pad right,
 *                   bit 3: pad bottom; a set bit makes the routine replicate
 *                   the edge sample/row instead of reading the neighbour
 * @param gradient_h two pointers (one per prediction) to the horizontal
 *                   gradients (the assembly steps rows by 32 bytes)
 * @param gradient_v two pointers (one per prediction) to the vertical
 *                   gradients, same layout as gradient_h
 * @param vx         output, two horizontal refinements
 * @param vy         output, two vertical refinements
 */
void ff_vvc_derive_bdof_vx_vy_neon(const int16_t *_src0,
                                   const int16_t *_src1,
                                   int pad_mask,
                                   const int16_t **gradient_h,
                                   const int16_t **gradient_v,
                                   int16_t *vx,
                                   int16_t *vy);
|
||||
#define BIT_DEPTH 8
|
||||
#include "alf_template.c"
|
||||
#include "of_template.c"
|
||||
#undef BIT_DEPTH
|
||||
|
||||
#define BIT_DEPTH 10
|
||||
#include "alf_template.c"
|
||||
#include "of_template.c"
|
||||
#undef BIT_DEPTH
|
||||
|
||||
#define BIT_DEPTH 12
|
||||
#include "alf_template.c"
|
||||
#include "of_template.c"
|
||||
#undef BIT_DEPTH
|
||||
|
||||
int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy,
|
||||
@ -177,6 +195,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
||||
c->inter.w_avg = vvc_w_avg_8;
|
||||
c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon;
|
||||
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
|
||||
c->inter.apply_bdof = apply_bdof_8;
|
||||
|
||||
for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
|
||||
c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
|
||||
@ -219,6 +238,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
||||
c->inter.avg = ff_vvc_avg_10_neon;
|
||||
c->inter.w_avg = vvc_w_avg_10;
|
||||
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
|
||||
c->inter.apply_bdof = apply_bdof_10;
|
||||
|
||||
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
|
||||
c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
|
||||
@ -227,6 +247,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
||||
c->inter.w_avg = vvc_w_avg_12;
|
||||
c->inter.dmvr[0][0] = ff_vvc_dmvr_12_neon;
|
||||
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
|
||||
c->inter.apply_bdof = apply_bdof_12;
|
||||
|
||||
c->alf.filter[LUMA] = alf_filter_luma_12_neon;
|
||||
c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
|
||||
|
@ -21,6 +21,8 @@
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
#define VVC_MAX_PB_SIZE 128
|
||||
#define BDOF_BLOCK_SIZE 16
|
||||
#define BDOF_MIN_BLOCK_SIZE 4
|
||||
|
||||
.macro vvc_avg type, bit_depth
|
||||
|
||||
@ -613,3 +615,400 @@ function ff_vvc_dmvr_hv_10_neon, export=1
|
||||
.unreq tmp0
|
||||
.unreq tmp1
|
||||
endfunc
|
||||
|
||||
// void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h, int16_t *gradient_v,
//                                      ptrdiff_t gradient_stride,
//                                      const int16_t *src, ptrdiff_t src_stride,
//                                      int width, int height)
//
// gradient_h[x] = (src[x + 1] >> 6)      - (src[x - 1] >> 6)
// gradient_v[x] = (src[x + stride] >> 6) - (src[x - stride] >> 6)
//
// Both strides arrive in int16_t elements. Each pass of the inner loop
// handles 8 samples and the loop only ends when the column counter reaches
// exactly zero, so width has to be a positive multiple of 8.
function ff_vvc_prof_grad_filter_8x_neon, export=1
        gh              .req x0
        gv              .req x1
        gstride         .req x2
        src             .req x3
        src_stride      .req x4
        width           .req w5
        height          .req w6
        neg_stride      .req x7
        src_col         .req x10
        cols_left       .req w11
        gh_col          .req x12
        gv_col          .req x13

        lsl             src_stride, src_stride, #1      // elements -> bytes
        neg             neg_stride, src_stride          // byte offset of the row above
1:      // per row
        mov             src_col, src
        mov             cols_left, width
        mov             gh_col, gh
        mov             gv_col, gv
2:      // per group of 8 columns
        ldur            q0, [src_col, #2]               // right neighbours
        ldur            q1, [src_col, #-2]              // left neighbours
        subs            cols_left, cols_left, #8        // flags consumed by b.ne below
        ldr             q2, [src_col, src_stride]       // row below
        ldr             q3, [src_col, neg_stride]       // row above
        sshr            v0.8h, v0.8h, #6
        sshr            v1.8h, v1.8h, #6
        sshr            v2.8h, v2.8h, #6
        sshr            v3.8h, v3.8h, #6
        sub             v0.8h, v0.8h, v1.8h             // horizontal gradient
        sub             v2.8h, v2.8h, v3.8h             // vertical gradient
        st1             {v0.8h}, [gh_col], #16
        st1             {v2.8h}, [gv_col], #16
        add             src_col, src_col, #16
        b.ne            2b

        subs            height, height, #1
        add             gh, gh, gstride, lsl #1         // gstride is in elements
        add             gv, gv, gstride, lsl #1
        add             src, src, src_stride
        b.ne            1b
        ret

.unreq gh
.unreq gv
.unreq gstride
.unreq src
.unreq src_stride
.unreq width
.unreq height
.unreq neg_stride
.unreq src_col
.unreq cols_left
.unreq gh_col
.unreq gv_col
endfunc
|
||||
|
||||
// Body of ff_vvc_apply_bdof_block_<bit_depth>_neon.
//
//   x0 dst          output pixels (8 per row)
//   x1 dst_stride   row pitch of dst; used as a byte count for 8 bit,
//                   doubled first for bit_depth >= 10
//   x2 src0, x3 src1   16-bit predictions, VVC_MAX_PB_SIZE elements per row
//   x4 gh, x5 gv    arrays of two gradient pointers each,
//                   BDOF_BLOCK_SIZE elements per row
//   x6 vx, x7 vy    two refinements each: [0] is applied to columns 0-3,
//                   [1] to columns 4-7
//
// Each of the BDOF_MIN_BLOCK_SIZE rows is computed as
//   (src0 + src1 + vx * (gh0 - gh1) + vy * (gv0 - gv1) + round) >> shift
// with round = 1 << (14 - bit_depth) and shift = 15 - bit_depth, then
// saturated to the pixel range.
.macro vvc_apply_bdof_block bit_depth
        dst             .req x0
        dst_stride      .req x1
        src0            .req x2
        src1            .req x3
        gh              .req x4
        gv              .req x5
        vx              .req x6
        vy              .req x7
        gh0_row         .req x8
        gh1_row         .req x9
        gv0_row         .req x10
        gv1_row         .req x11
        grad_step       .req x12
        rows_left       .req w13
        src_step        .req x14

        // v0 = {vx[0] x4, vx[1] x4}, v1 = {vy[0] x4, vy[1] x4}
        ld1r            {v0.8h}, [vx], #2
        ld1r            {v1.8h}, [vy], #2
        ld1r            {v2.8h}, [vx]
        ld1r            {v3.8h}, [vy]
        ins             v0.d[1], v2.d[1]
        ins             v1.d[1], v3.d[1]

        movi            v7.4s, #(1 << (14 - \bit_depth))        // rounding offset
        ldp             gh0_row, gh1_row, [gh]
        ldp             gv0_row, gv1_row, [gv]
        mov             grad_step, #(BDOF_BLOCK_SIZE * 2)       // gradient row pitch, bytes
        mov             rows_left, #(BDOF_MIN_BLOCK_SIZE)
        mov             src_step, #(VVC_MAX_PB_SIZE * 2)        // prediction row pitch, bytes
.if \bit_depth >= 10
        // clip pixel: v18 = lower bound (0), v19 = upper bound
        mov             w15, #((1 << \bit_depth) - 1)
        movi            v18.8h, #0
        lsl             dst_stride, dst_stride, #1              // 16-bit pixels -> bytes
        dup             v19.8h, w15
.endif
1:      // per row
        ld1             {v2.8h}, [gh0_row], grad_step
        ld1             {v3.8h}, [gh1_row], grad_step
        ld1             {v4.8h}, [gv0_row], grad_step
        ld1             {v5.8h}, [gv1_row], grad_step
        sub             v2.8h, v2.8h, v3.8h                     // gh0 - gh1
        sub             v4.8h, v4.8h, v5.8h                     // gv0 - gv1
        smull           v3.4s, v0.4h, v2.4h                     // vx * (gh0 - gh1)
        smull2          v16.4s, v0.8h, v2.8h
        smlal           v3.4s, v1.4h, v4.4h                     // + vy * (gv0 - gv1)
        smlal2          v16.4s, v1.8h, v4.8h

        ld1             {v5.8h}, [src0], src_step
        ld1             {v6.8h}, [src1], src_step
        saddl           v2.4s, v5.4h, v6.4h                     // src0 + src1
        add             v2.4s, v2.4s, v7.4s
        add             v2.4s, v2.4s, v3.4s
        saddl2          v4.4s, v5.8h, v6.8h
        add             v4.4s, v4.4s, v7.4s
        add             v4.4s, v4.4s, v16.4s

        sqshrn          v5.4h, v2.4s, #(15 - \bit_depth)
        sqshrn2         v5.8h, v4.4s, #(15 - \bit_depth)
        subs            rows_left, rows_left, #1                // flags consumed by b.ne below
.if \bit_depth == 8
        sqxtun          v5.8b, v5.8h                            // saturate to unsigned 8 bit
        str             d5, [dst]
        add             dst, dst, dst_stride
.else
        smin            v5.8h, v5.8h, v19.8h
        smax            v5.8h, v5.8h, v18.8h
        st1             {v5.8h}, [dst], dst_stride
.endif
        b.ne            1b
        ret

.unreq dst
.unreq dst_stride
.unreq src0
.unreq src1
.unreq gh
.unreq gv
.unreq vx
.unreq vy
.unreq gh0_row
.unreq gh1_row
.unreq gv0_row
.unreq gv1_row
.unreq grad_step
.unreq rows_left
.unreq src_step
.endm
|
||||
|
||||
// Exported entry points, one per supported bit depth. Each one is a plain
// expansion of vvc_apply_bdof_block, which also supplies the final ret.

// 8 bit
function ff_vvc_apply_bdof_block_8_neon, export=1
        vvc_apply_bdof_block 8
endfunc

// 10 bit
function ff_vvc_apply_bdof_block_10_neon, export=1
        vvc_apply_bdof_block 10
endfunc

// 12 bit
function ff_vvc_apply_bdof_block_12_neon, export=1
        vvc_apply_bdof_block 12
endfunc
|
||||
|
||||
// void ff_vvc_derive_bdof_vx_vy_neon(const int16_t *src0, const int16_t *src1,
//                                    int pad_mask,
//                                    const int16_t **gradient_h,
//                                    const int16_t **gradient_v,
//                                    int16_t *vx, int16_t *vy)
//
// Derives the BDOF refinement for two horizontally adjacent 4x4 sub-blocks.
// Six rows (-1 .. 4) are visited. In every row one 8-lane load starting at
// column -1 feeds the left sub-block (columns -1 .. 4) and one starting at
// column 3 feeds the right sub-block (columns 3 .. 8); the two top lanes of
// every load are cleared so they do not contribute.
//
// pad_mask: bit 0 = pad left, bit 1 = pad top, bit 2 = pad right,
//           bit 3 = pad bottom (replicate the edge instead of reading the
//           neighbouring column/row).
//
// Per sample:
//   diff  = (src0 >> 4) - (src1 >> 4)
//   temph = (gh0 + gh1) >> 1
//   tempv = (gv0 + gv1) >> 1
// Accumulated per sub-block:
//   sgx2  = sum |temph|                sgy2  = sum |tempv|
//   sgxgy = sum sign(tempv) * temph
//   sgxdi = -sum sign(temph) * diff    sgydi = -sum sign(tempv) * diff
// Result per sub-block (0 when the corresponding sgx2/sgy2 is 0):
//   vx = clip((sgxdi * 4) >> log2(sgx2), -15, 15)
//   vy = clip((sgydi * 4 - ((vx * sgxgy) >> 1)) >> log2(sgy2), -15, 15)
//
// vx[0]/vy[0] receive the left sub-block, vx[1]/vy[1] the right one.
function ff_vvc_derive_bdof_vx_vy_neon, export=1
        src0            .req x0
        src1            .req x1
        pad_mask        .req w2
        gh              .req x3
        gv              .req x4
        vx              .req x5
        vy              .req x6

        gh0             .req x7
        gh1             .req x8
        gv0             .req x9
        gv1             .req x10
        y               .req x12

        // Scalar accumulators; they reuse x7-x10, which is safe because the
        // gradient pointers are dead once the row loop has finished.
        sgx2            .req w7
        sgy2            .req w8
        sgxgy           .req w9
        sgxdi           .req w10
        sgydi           .req w11

        // Vector accumulators of the left sub-block
        sgx2_v          .req v22
        sgy2_v          .req v23
        sgxgy_v         .req v24
        sgxdi_v         .req v25
        sgydi_v         .req v26

        // Vector accumulators of the right sub-block
        sgx2_v2         .req v27
        sgy2_v2         .req v28
        sgxgy_v2        .req v29
        sgxdi_v2        .req v30
        sgydi_v2        .req v31

        ldp             gh0, gh1, [gh]
        ldp             gv0, gv1, [gv]
        movi            sgx2_v.4s, #0
        movi            sgy2_v.4s, #0
        movi            sgxgy_v.4s, #0
        movi            sgxdi_v.4s, #0
        movi            sgydi_v.4s, #0
        movi            sgx2_v2.4s, #0
        movi            sgy2_v2.4s, #0
        movi            sgxgy_v2.4s, #0
        movi            sgxdi_v2.4s, #0
        movi            sgydi_v2.4s, #0
        mov             x13, #-1                        // dy
        movi            v6.4s, #0                       // zero source for lane clearing
        mov             y, #-1
        tbz             pad_mask, #1, 1f                // check pad top
        mov             x13, #0                         // dy: pad top
1:      // per row; x13 = row to read
        mov             x16, #-2                        // dx
        add             x14, src0, x13, lsl #8          // local src0
        add             x15, src1, x13, lsl #8          // local src1
        add             x17, x16, x13, lsl #5           // gradient byte offset
        // left sub-block: columns -1 .. 6 loaded, -1 .. 4 kept
        ldr             q0, [x14, x16]
        ldr             q1, [x15, x16]
        ldr             q2, [gh0, x17]
        ldr             q3, [gh1, x17]
        ldr             q4, [gv0, x17]
        ldr             q5, [gv1, x17]
        add             x16, x16, #8
        add             x17, x17, #8
        ins             v0.s[3], v6.s[3]
        ins             v1.s[3], v6.s[3]
        ins             v2.s[3], v6.s[3]
        ins             v3.s[3], v6.s[3]
        ins             v4.s[3], v6.s[3]
        ins             v5.s[3], v6.s[3]

        // right sub-block: columns 3 .. 10 loaded, 3 .. 8 kept
        ldr             q16, [x14, x16]
        ldr             q17, [x15, x16]
        ldr             q18, [gh0, x17]
        ldr             q19, [gh1, x17]
        ldr             q20, [gv0, x17]
        ldr             q21, [gv1, x17]
        ins             v16.s[3], v6.s[3]
        ins             v17.s[3], v6.s[3]
        ins             v18.s[3], v6.s[3]
        ins             v19.s[3], v6.s[3]
        ins             v20.s[3], v6.s[3]
        ins             v21.s[3], v6.s[3]

        tbz             pad_mask, #0, 20f
        // pad left
        ins             v0.h[0], v0.h[1]
        ins             v1.h[0], v1.h[1]
        ins             v2.h[0], v2.h[1]
        ins             v3.h[0], v3.h[1]
        ins             v4.h[0], v4.h[1]
        ins             v5.h[0], v5.h[1]
20:
        tbz             pad_mask, #2, 21f
        // pad right
        ins             v16.h[5], v16.h[4]
        ins             v17.h[5], v17.h[4]
        ins             v18.h[5], v18.h[4]
        ins             v19.h[5], v19.h[4]
        ins             v20.h[5], v20.h[4]
        ins             v21.h[5], v21.h[4]
21:
        sshr            v0.8h, v0.8h, #4
        sshr            v1.8h, v1.8h, #4
        add             v2.8h, v2.8h, v3.8h
        add             v4.8h, v4.8h, v5.8h
        sub             v0.8h, v0.8h, v1.8h             // diff
        sshr            v2.8h, v2.8h, #1                // temph
        sshr            v4.8h, v4.8h, #1                // tempv

        sshr            v16.8h, v16.8h, #4
        sshr            v17.8h, v17.8h, #4
        add             v18.8h, v18.8h, v19.8h
        add             v20.8h, v20.8h, v21.8h
        sub             v16.8h, v16.8h, v17.8h          // diff
        sshr            v18.8h, v18.8h, #1              // temph
        sshr            v20.8h, v20.8h, #1              // tempv

        // sgx2 += |temph|, sgy2 += |tempv| (left)
        abs             v3.8h, v2.8h
        abs             v5.8h, v4.8h
        uxtl            v19.4s, v3.4h
        uxtl            v21.4s, v5.4h
        uxtl2           v3.4s, v3.8h
        uxtl2           v5.4s, v5.8h
        add             v3.4s, v3.4s, v19.4s
        add             v5.4s, v5.4s, v21.4s
        add             sgx2_v.4s, sgx2_v.4s, v3.4s
        add             sgy2_v.4s, sgy2_v.4s, v5.4s

        // sgx2 += |temph|, sgy2 += |tempv| (right)
        abs             v3.8h, v18.8h
        abs             v5.8h, v20.8h
        uxtl            v19.4s, v3.4h
        uxtl            v21.4s, v5.4h
        uxtl2           v3.4s, v3.8h
        uxtl2           v5.4s, v5.8h
        add             v3.4s, v3.4s, v19.4s
        add             v5.4s, v5.4s, v21.4s
        add             sgx2_v2.4s, sgx2_v2.4s, v3.4s
        add             sgy2_v2.4s, sgy2_v2.4s, v5.4s

        // sign(x) = (x < 0 mask) - (x > 0 mask), i.e. -1, 0 or +1
        cmgt            v17.8h, v4.8h, #0
        cmlt            v7.8h, v4.8h, #0
        cmgt            v19.8h, v20.8h, #0
        cmlt            v21.8h, v20.8h, #0
        sub             v17.8h, v7.8h, v17.8h           // VVC_SIGN(tempv)
        sub             v19.8h, v21.8h, v19.8h          // VVC_SIGN(tempv)

        smlal           sgxgy_v.4s, v17.4h, v2.4h
        smlal2          sgxgy_v.4s, v17.8h, v2.8h
        smlsl           sgydi_v.4s, v17.4h, v0.4h
        smlsl2          sgydi_v.4s, v17.8h, v0.8h

        cmgt            v3.8h, v2.8h, #0
        cmlt            v5.8h, v2.8h, #0
        cmgt            v17.8h, v18.8h, #0
        cmlt            v21.8h, v18.8h, #0
        sub             v3.8h, v5.8h, v3.8h             // VVC_SIGN(temph)
        sub             v17.8h, v21.8h, v17.8h          // VVC_SIGN(temph)

        smlal           sgxgy_v2.4s, v19.4h, v18.4h
        smlal2          sgxgy_v2.4s, v19.8h, v18.8h
        smlsl           sgydi_v2.4s, v19.4h, v16.4h
        smlsl2          sgydi_v2.4s, v19.8h, v16.8h

        smlsl           sgxdi_v.4s, v3.4h, v0.4h
        smlsl2          sgxdi_v.4s, v3.8h, v0.8h
        smlsl           sgxdi_v2.4s, v17.4h, v16.4h
        smlsl2          sgxdi_v2.4s, v17.8h, v16.8h
3:
        add             y, y, #1
        cmp             y, #(BDOF_MIN_BLOCK_SIZE)
        mov             x13, y
        b.gt            4f                              // rows -1 .. 4 done
        b.lt            1b
        tbz             pad_mask, #3, 1b                // last row, no padding
        sub             x13, x13, #1                    // pad bottom
        b               1b
4:
        // horizontal sums of the left sub-block
        addv            s22, sgx2_v.4s
        addv            s23, sgy2_v.4s
        addv            s24, sgxgy_v.4s
        addv            s25, sgxdi_v.4s
        addv            s26, sgydi_v.4s

        mov             w3, #31                         // gh is dead; 31 - clz = floor(log2)
        mov             w16, #-15
        mov             w17, #15
40:     // runs twice: left sub-block, then right sub-block
        mov             w14, #0                         // vx when sgx2 == 0

        mov             sgx2, v22.s[0]
        mov             sgy2, v23.s[0]
        mov             sgxgy, v24.s[0]
        mov             sgxdi, v25.s[0]
        mov             sgydi, v26.s[0]

        cbz             sgx2, 5f
        clz             w12, sgx2
        lsl             sgxdi, sgxdi, #2
        sub             w13, w3, w12                    // log2(sgx2)
        asr             sgxdi, sgxdi, w13
        cmp             sgxdi, w16
        csel            w14, w16, sgxdi, lt             // clip to -15
        b.le            5f
        cmp             sgxdi, w17
        csel            w14, w17, sgxdi, gt             // clip to 15
5:
        strh            w14, [vx], #2

        mov             w15, #0                         // vy when sgy2 == 0
        cbz             sgy2, 6f
        lsl             sgydi, sgydi, #2
        smull           x14, w14, sgxgy                 // vx * sgxgy
        asr             w14, w14, #1
        sub             sgydi, sgydi, w14
        clz             w12, sgy2
        sub             w13, w3, w12                    // log2(sgy2)
        asr             sgydi, sgydi, w13
        cmp             sgydi, w16
        csel            w15, w16, sgydi, lt             // clip to -15
        b.le            6f
        cmp             sgydi, w17
        csel            w15, w17, sgydi, gt             // clip to 15
6:
        strh            w15, [vy], #2
        // x0 (src0) doubles as the "right sub-block still pending" flag:
        // non-zero on the first pass, cleared before the second one. This
        // relies on src0 being a non-NULL pointer.
        cbz             x0, 7f
        addv            s22, sgx2_v2.4s
        addv            s23, sgy2_v2.4s
        addv            s24, sgxgy_v2.4s
        addv            s25, sgxdi_v2.4s
        addv            s26, sgydi_v2.4s
        mov             x0, #0
        b               40b
7:
        ret

.unreq src0
.unreq src1
.unreq pad_mask
.unreq gh
.unreq gv
.unreq vx
.unreq vy
// Release the gradient pointer aliases as well, so that they do not stay
// defined for the rest of the file.
.unreq gh0
.unreq gh1
.unreq gv0
.unreq gv1
.unreq sgx2
.unreq sgy2
.unreq sgxgy
.unreq sgxdi
.unreq sgydi
.unreq sgx2_v
.unreq sgy2_v
.unreq sgxgy_v
.unreq sgxdi_v
.unreq sgydi_v
.unreq sgx2_v2
.unreq sgy2_v2
.unreq sgxgy_v2
.unreq sgxdi_v2
.unreq sgydi_v2
.unreq y
endfunc
|
||||
|
64
libavcodec/aarch64/vvc/of_template.c
Normal file
64
libavcodec/aarch64/vvc/of_template.c
Normal file
@ -0,0 +1,64 @@
|
||||
/*
|
||||
* Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavcodec/bit_depth_template.c"
|
||||
|
||||
/**
 * NEON kernel (implemented in assembly) that blends one tile of 8 columns by
 * BDOF_MIN_BLOCK_SIZE rows from the two predictions and their gradients.
 *
 * @param dst        output pixels of the tile
 * @param dst_stride row pitch of dst, in pixels
 * @param src0       first prediction, MAX_PB_SIZE elements per row
 * @param src1       second prediction, same layout as src0
 * @param gh         two pointers to the horizontal gradients of src0/src1
 * @param gv         two pointers to the vertical gradients of src0/src1
 * @param vx         two horizontal refinements: [0] for columns 0-3,
 *                   [1] for columns 4-7
 * @param vy         two vertical refinements, same split as vx
 */
void FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(pixel *dst,
                                                      ptrdiff_t dst_stride,
                                                      const int16_t *src0,
                                                      const int16_t *src1,
                                                      const int16_t **gh,
                                                      const int16_t **gv,
                                                      int16_t *vx,
                                                      int16_t *vy);
|
||||
|
||||
static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
|
||||
const int16_t *_src0, const int16_t *_src1,
|
||||
int block_w, int block_h) {
|
||||
// +2 for pad left and right
|
||||
int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
|
||||
int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
|
||||
int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
|
||||
int16_t *gradient_v[2] = {&gradient_buf_v[0][1], &gradient_buf_v[1][1]};
|
||||
ptrdiff_t dst_stride = _dst_stride / sizeof(pixel);
|
||||
pixel *dst = (pixel *) _dst;
|
||||
|
||||
ff_vvc_prof_grad_filter_8x_neon(gradient_h[0], gradient_v[0],
|
||||
BDOF_BLOCK_SIZE,
|
||||
_src0, MAX_PB_SIZE, block_w, block_h);
|
||||
ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1],
|
||||
BDOF_BLOCK_SIZE,
|
||||
_src1, MAX_PB_SIZE, block_w, block_h);
|
||||
|
||||
for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
|
||||
for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE * 2) {
|
||||
const int16_t *src0 = _src0 + y * MAX_PB_SIZE + x;
|
||||
const int16_t *src1 = _src1 + y * MAX_PB_SIZE + x;
|
||||
pixel *d = dst + x;
|
||||
int idx = BDOF_BLOCK_SIZE * y + x;
|
||||
const int16_t *gh[] = {gradient_h[0] + idx, gradient_h[1] + idx};
|
||||
const int16_t *gv[] = {gradient_v[0] + idx, gradient_v[1] + idx};
|
||||
int16_t vx[2], vy[2];
|
||||
int pad_mask = !x | ((!y) << 1) |
|
||||
((x + 2 * BDOF_MIN_BLOCK_SIZE == block_w) << 2) |
|
||||
((y + BDOF_MIN_BLOCK_SIZE == block_h) << 3);
|
||||
ff_vvc_derive_bdof_vx_vy_neon(src0, src1, pad_mask, gh, gv, vx, vy);
|
||||
FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(d, dst_stride,
|
||||
src0, src1, gh, gv,
|
||||
vx, vy);
|
||||
}
|
||||
dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user