mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-26 19:01:44 +02:00
aarch64/vvc: Add dmvr_hv
dmvr_hv_8_12x20_c: 8.0 ( 1.00x) dmvr_hv_8_12x20_neon: 1.2 ( 6.62x) dmvr_hv_8_20x12_c: 8.0 ( 1.00x) dmvr_hv_8_20x12_neon: 0.9 ( 8.37x) dmvr_hv_8_20x20_c: 12.9 ( 1.00x) dmvr_hv_8_20x20_neon: 1.7 ( 7.62x) dmvr_hv_10_12x20_c: 7.0 ( 1.00x) dmvr_hv_10_12x20_neon: 1.7 ( 4.09x) dmvr_hv_10_20x12_c: 7.0 ( 1.00x) dmvr_hv_10_20x12_neon: 1.7 ( 4.09x) dmvr_hv_10_20x20_c: 11.2 ( 1.00x) dmvr_hv_10_20x20_neon: 2.7 ( 4.15x) dmvr_hv_12_12x20_c: 6.5 ( 1.00x) dmvr_hv_12_12x20_neon: 1.7 ( 3.79x) dmvr_hv_12_20x12_c: 6.5 ( 1.00x) dmvr_hv_12_20x12_neon: 1.7 ( 3.79x) dmvr_hv_12_20x20_c: 10.2 ( 1.00x) dmvr_hv_12_20x20_neon: 2.2 ( 4.64x) Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
This commit is contained in:
parent
0ba9e8d0d4
commit
bcd65ebd8f
@ -85,6 +85,15 @@ W_AVG_FUN(8)
|
||||
W_AVG_FUN(10)
|
||||
W_AVG_FUN(12)
|
||||
|
||||
#define DMVR_FUN(fn, bd) \
|
||||
void ff_vvc_dmvr_ ## fn ## bd ## _neon(int16_t *dst, \
|
||||
const uint8_t *_src, ptrdiff_t _src_stride, int height, \
|
||||
intptr_t mx, intptr_t my, int width);
|
||||
|
||||
DMVR_FUN(hv_, 8)
|
||||
DMVR_FUN(hv_, 10)
|
||||
DMVR_FUN(hv_, 12)
|
||||
|
||||
void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
@ -157,6 +166,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
||||
|
||||
c->inter.avg = ff_vvc_avg_8_neon;
|
||||
c->inter.w_avg = vvc_w_avg_8;
|
||||
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
|
||||
|
||||
for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
|
||||
c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
|
||||
@ -198,12 +208,14 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
||||
} else if (bd == 10) {
|
||||
c->inter.avg = ff_vvc_avg_10_neon;
|
||||
c->inter.w_avg = vvc_w_avg_10;
|
||||
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
|
||||
|
||||
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
|
||||
c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
|
||||
} else if (bd == 12) {
|
||||
c->inter.avg = ff_vvc_avg_12_neon;
|
||||
c->inter.w_avg = vvc_w_avg_12;
|
||||
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
|
||||
|
||||
c->alf.filter[LUMA] = alf_filter_luma_12_neon;
|
||||
c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
|
||||
|
@ -226,3 +226,310 @@ vvc_avg avg, 12
|
||||
vvc_avg w_avg, 8
|
||||
vvc_avg w_avg, 10
|
||||
vvc_avg w_avg, 12
|
||||
|
||||
/* x0: int16_t *dst
|
||||
* x1: const uint8_t *_src
|
||||
* x2: ptrdiff_t _src_stride
|
||||
* w3: int height
|
||||
* x4: intptr_t mx
|
||||
* x5: intptr_t my
|
||||
* w6: int width
|
||||
*/
|
||||
function ff_vvc_dmvr_hv_8_neon, export=1
|
||||
dst .req x0
|
||||
src .req x1
|
||||
src_stride .req x2
|
||||
height .req w3
|
||||
mx .req x4
|
||||
my .req x5
|
||||
width .req w6
|
||||
tmp0 .req x7
|
||||
tmp1 .req x8
|
||||
|
||||
sub sp, sp, #(VVC_MAX_PB_SIZE * 4)
|
||||
|
||||
movrel x9, X(ff_vvc_inter_luma_dmvr_filters)
|
||||
add x12, x9, mx, lsl #1
|
||||
ldrb w10, [x12]
|
||||
ldrb w11, [x12, #1]
|
||||
mov tmp0, sp
|
||||
add tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
|
||||
// We know the value are positive
|
||||
dup v0.8h, w10 // filter_x[0]
|
||||
dup v1.8h, w11 // filter_x[1]
|
||||
|
||||
add x12, x9, my, lsl #1
|
||||
ldrb w10, [x12]
|
||||
ldrb w11, [x12, #1]
|
||||
sxtw x6, w6
|
||||
movi v30.8h, #(1 << (8 - 7)) // offset1
|
||||
movi v31.8h, #8 // offset2
|
||||
dup v2.8h, w10 // filter_y[0]
|
||||
dup v3.8h, w11 // filter_y[1]
|
||||
|
||||
// Valid value for width can only be 8 + 4, 16 + 4
|
||||
cmp width, #16
|
||||
mov w10, #0 // start filter_y or not
|
||||
add height, height, #1
|
||||
sub dst, dst, #(VVC_MAX_PB_SIZE * 2)
|
||||
sub src_stride, src_stride, x6
|
||||
cset w15, gt // width > 16
|
||||
1:
|
||||
mov x12, tmp0
|
||||
mov x13, tmp1
|
||||
mov x14, dst
|
||||
cbz w15, 2f
|
||||
|
||||
// width > 16
|
||||
ldur q5, [src, #1]
|
||||
ldr q4, [src], #16
|
||||
uxtl v7.8h, v5.8b
|
||||
uxtl2 v17.8h, v5.16b
|
||||
uxtl v6.8h, v4.8b
|
||||
uxtl2 v16.8h, v4.16b
|
||||
mul v6.8h, v6.8h, v0.8h
|
||||
mul v16.8h, v16.8h, v0.8h
|
||||
mla v6.8h, v7.8h, v1.8h
|
||||
mla v16.8h, v17.8h, v1.8h
|
||||
add v6.8h, v6.8h, v30.8h
|
||||
add v16.8h, v16.8h, v30.8h
|
||||
ushr v6.8h, v6.8h, #(8 - 6)
|
||||
ushr v7.8h, v16.8h, #(8 - 6)
|
||||
stp q6, q7, [x13], #32
|
||||
|
||||
cbz w10, 3f
|
||||
|
||||
ldp q16, q17, [x12], #32
|
||||
mul v16.8h, v16.8h, v2.8h
|
||||
mul v17.8h, v17.8h, v2.8h
|
||||
mla v16.8h, v6.8h, v3.8h
|
||||
mla v17.8h, v7.8h, v3.8h
|
||||
add v16.8h, v16.8h, v31.8h
|
||||
add v17.8h, v17.8h, v31.8h
|
||||
ushr v16.8h, v16.8h, #4
|
||||
ushr v17.8h, v17.8h, #4
|
||||
stp q16, q17, [x14], #32
|
||||
b 3f
|
||||
2:
|
||||
// width > 8
|
||||
ldur d5, [src, #1]
|
||||
ldr d4, [src], #8
|
||||
uxtl v7.8h, v5.8b
|
||||
uxtl v6.8h, v4.8b
|
||||
mul v6.8h, v6.8h, v0.8h
|
||||
mla v6.8h, v7.8h, v1.8h
|
||||
add v6.8h, v6.8h, v30.8h
|
||||
ushr v6.8h, v6.8h, #(8 - 6)
|
||||
str q6, [x13], #16
|
||||
|
||||
cbz w10, 3f
|
||||
|
||||
ldr q16, [x12], #16
|
||||
mul v16.8h, v16.8h, v2.8h
|
||||
mla v16.8h, v6.8h, v3.8h
|
||||
add v16.8h, v16.8h, v31.8h
|
||||
ushr v16.8h, v16.8h, #4
|
||||
str q16, [x14], #16
|
||||
3:
|
||||
ldr s5, [src, #1]
|
||||
ldr s4, [src], #4
|
||||
uxtl v7.8h, v5.8b
|
||||
uxtl v6.8h, v4.8b
|
||||
mul v6.4h, v6.4h, v0.4h
|
||||
mla v6.4h, v7.4h, v1.4h
|
||||
add v6.4h, v6.4h, v30.4h
|
||||
ushr v6.4h, v6.4h, #(8 - 6)
|
||||
str d6, [x13], #8
|
||||
|
||||
cbz w10, 4f
|
||||
|
||||
ldr d16, [x12], #8
|
||||
mul v16.4h, v16.4h, v2.4h
|
||||
mla v16.4h, v6.4h, v3.4h
|
||||
add v16.4h, v16.4h, v31.4h
|
||||
ushr v16.4h, v16.4h, #4
|
||||
str d16, [x14], #8
|
||||
4:
|
||||
subs height, height, #1
|
||||
mov w10, #1
|
||||
add src, src, src_stride
|
||||
add dst, dst, #(VVC_MAX_PB_SIZE * 2)
|
||||
eor tmp0, tmp0, tmp1
|
||||
eor tmp1, tmp0, tmp1
|
||||
eor tmp0, tmp0, tmp1
|
||||
b.ne 1b
|
||||
|
||||
add sp, sp, #(VVC_MAX_PB_SIZE * 4)
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_vvc_dmvr_hv_12_neon, export=1
|
||||
movi v29.4s, #(12 - 6)
|
||||
movi v30.4s, #(1 << (12 - 7)) // offset1
|
||||
b 0f
|
||||
endfunc
|
||||
|
||||
function ff_vvc_dmvr_hv_10_neon, export=1
|
||||
movi v29.4s, #(10 - 6)
|
||||
movi v30.4s, #(1 << (10 - 7)) // offset1
|
||||
0:
|
||||
movi v31.4s, #8 // offset2
|
||||
neg v29.4s, v29.4s
|
||||
|
||||
sub sp, sp, #(VVC_MAX_PB_SIZE * 4)
|
||||
|
||||
movrel x9, X(ff_vvc_inter_luma_dmvr_filters)
|
||||
add x12, x9, mx, lsl #1
|
||||
ldrb w10, [x12]
|
||||
ldrb w11, [x12, #1]
|
||||
mov tmp0, sp
|
||||
add tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
|
||||
// We know the value are positive
|
||||
dup v0.8h, w10 // filter_x[0]
|
||||
dup v1.8h, w11 // filter_x[1]
|
||||
|
||||
add x12, x9, my, lsl #1
|
||||
ldrb w10, [x12]
|
||||
ldrb w11, [x12, #1]
|
||||
sxtw x6, w6
|
||||
dup v2.8h, w10 // filter_y[0]
|
||||
dup v3.8h, w11 // filter_y[1]
|
||||
|
||||
// Valid value for width can only be 8 + 4, 16 + 4
|
||||
cmp width, #16
|
||||
mov w10, #0 // start filter_y or not
|
||||
add height, height, #1
|
||||
sub dst, dst, #(VVC_MAX_PB_SIZE * 2)
|
||||
sub src_stride, src_stride, x6, lsl #1
|
||||
cset w15, gt // width > 16
|
||||
1:
|
||||
mov x12, tmp0
|
||||
mov x13, tmp1
|
||||
mov x14, dst
|
||||
cbz w15, 2f
|
||||
|
||||
// width > 16
|
||||
add x16, src, #2
|
||||
ldp q6, q16, [src], #32
|
||||
ldp q7, q17, [x16]
|
||||
umull v4.4s, v6.4h, v0.4h
|
||||
umull2 v5.4s, v6.8h, v0.8h
|
||||
umull v18.4s, v16.4h, v0.4h
|
||||
umull2 v19.4s, v16.8h, v0.8h
|
||||
umlal v4.4s, v7.4h, v1.4h
|
||||
umlal2 v5.4s, v7.8h, v1.8h
|
||||
umlal v18.4s, v17.4h, v1.4h
|
||||
umlal2 v19.4s, v17.8h, v1.8h
|
||||
|
||||
add v4.4s, v4.4s, v30.4s
|
||||
add v5.4s, v5.4s, v30.4s
|
||||
add v18.4s, v18.4s, v30.4s
|
||||
add v19.4s, v19.4s, v30.4s
|
||||
ushl v4.4s, v4.4s, v29.4s
|
||||
ushl v5.4s, v5.4s, v29.4s
|
||||
ushl v18.4s, v18.4s, v29.4s
|
||||
ushl v19.4s, v19.4s, v29.4s
|
||||
uqxtn v6.4h, v4.4s
|
||||
uqxtn2 v6.8h, v5.4s
|
||||
uqxtn v7.4h, v18.4s
|
||||
uqxtn2 v7.8h, v19.4s
|
||||
stp q6, q7, [x13], #32
|
||||
|
||||
cbz w10, 3f
|
||||
|
||||
ldp q4, q5, [x12], #32
|
||||
umull v17.4s, v4.4h, v2.4h
|
||||
umull2 v18.4s, v4.8h, v2.8h
|
||||
umull v19.4s, v5.4h, v2.4h
|
||||
umull2 v20.4s, v5.8h, v2.8h
|
||||
umlal v17.4s, v6.4h, v3.4h
|
||||
umlal2 v18.4s, v6.8h, v3.8h
|
||||
umlal v19.4s, v7.4h, v3.4h
|
||||
umlal2 v20.4s, v7.8h, v3.8h
|
||||
add v17.4s, v17.4s, v31.4s
|
||||
add v18.4s, v18.4s, v31.4s
|
||||
add v19.4s, v19.4s, v31.4s
|
||||
add v20.4s, v20.4s, v31.4s
|
||||
ushr v17.4s, v17.4s, #4
|
||||
ushr v18.4s, v18.4s, #4
|
||||
ushr v19.4s, v19.4s, #4
|
||||
ushr v20.4s, v20.4s, #4
|
||||
uqxtn v6.4h, v17.4s
|
||||
uqxtn2 v6.8h, v18.4s
|
||||
uqxtn v7.4h, v19.4s
|
||||
uqxtn2 v7.8h, v20.4s
|
||||
stp q6, q7, [x14], #32
|
||||
b 3f
|
||||
2:
|
||||
// width > 8
|
||||
ldur q7, [src, #2]
|
||||
ldr q6, [src], #16
|
||||
umull v4.4s, v6.4h, v0.4h
|
||||
umull2 v5.4s, v6.8h, v0.8h
|
||||
umlal v4.4s, v7.4h, v1.4h
|
||||
umlal2 v5.4s, v7.8h, v1.8h
|
||||
|
||||
add v4.4s, v4.4s, v30.4s
|
||||
add v5.4s, v5.4s, v30.4s
|
||||
ushl v4.4s, v4.4s, v29.4s
|
||||
ushl v5.4s, v5.4s, v29.4s
|
||||
uqxtn v6.4h, v4.4s
|
||||
uqxtn2 v6.8h, v5.4s
|
||||
str q6, [x13], #16
|
||||
|
||||
cbz w10, 3f
|
||||
|
||||
ldr q16, [x12], #16
|
||||
umull v17.4s, v16.4h, v2.4h
|
||||
umull2 v18.4s, v16.8h, v2.8h
|
||||
umlal v17.4s, v6.4h, v3.4h
|
||||
umlal2 v18.4s, v6.8h, v3.8h
|
||||
add v17.4s, v17.4s, v31.4s
|
||||
add v18.4s, v18.4s, v31.4s
|
||||
ushr v17.4s, v17.4s, #4
|
||||
ushr v18.4s, v18.4s, #4
|
||||
uqxtn v16.4h, v17.4s
|
||||
uqxtn2 v16.8h, v18.4s
|
||||
str q16, [x14], #16
|
||||
3:
|
||||
ldr d7, [src, #2]
|
||||
ldr d6, [src], #8
|
||||
umull v4.4s, v7.4h, v1.4h
|
||||
umlal v4.4s, v6.4h, v0.4h
|
||||
add v4.4s, v4.4s, v30.4s
|
||||
ushl v4.4s, v4.4s, v29.4s
|
||||
uqxtn v6.4h, v4.4s
|
||||
str d6, [x13], #8
|
||||
|
||||
cbz w10, 4f
|
||||
|
||||
ldr d16, [x12], #8
|
||||
umull v17.4s, v16.4h, v2.4h
|
||||
umlal v17.4s, v6.4h, v3.4h
|
||||
add v17.4s, v17.4s, v31.4s
|
||||
ushr v17.4s, v17.4s, #4
|
||||
uqxtn v16.4h, v17.4s
|
||||
str d16, [x14], #8
|
||||
4:
|
||||
subs height, height, #1
|
||||
mov w10, #1
|
||||
add src, src, src_stride
|
||||
add dst, dst, #(VVC_MAX_PB_SIZE * 2)
|
||||
eor tmp0, tmp0, tmp1
|
||||
eor tmp1, tmp0, tmp1
|
||||
eor tmp0, tmp0, tmp1
|
||||
b.ne 1b
|
||||
|
||||
add sp, sp, #(VVC_MAX_PB_SIZE * 4)
|
||||
ret
|
||||
|
||||
.unreq dst
|
||||
.unreq src
|
||||
.unreq src_stride
|
||||
.unreq height
|
||||
.unreq mx
|
||||
.unreq my
|
||||
.unreq width
|
||||
.unreq tmp0
|
||||
.unreq tmp1
|
||||
endfunc
|
||||
|
Loading…
Reference in New Issue
Block a user