mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-02-04 06:08:26 +02:00
aarch64/vvc: Add put_qpel_hv
With Apple M1 (no i8mm):
put_luma_hv_8_4x4_c: 2.2 ( 1.00x)
put_luma_hv_8_4x4_neon: 0.8 ( 3.00x)
put_luma_hv_8_8x8_c: 7.0 ( 1.00x)
put_luma_hv_8_8x8_neon: 0.8 ( 9.33x)
put_luma_hv_8_16x16_c: 22.8 ( 1.00x)
put_luma_hv_8_16x16_neon: 2.5 ( 9.10x)
put_luma_hv_8_32x32_c: 84.8 ( 1.00x)
put_luma_hv_8_32x32_neon: 9.5 ( 8.92x)
put_luma_hv_8_64x64_c: 333.0 ( 1.00x)
put_luma_hv_8_64x64_neon: 35.5 ( 9.38x)
put_luma_hv_8_128x128_c: 1294.5 ( 1.00x)
put_luma_hv_8_128x128_neon: 137.8 ( 9.40x)

With Pixel 8 Pro:
put_luma_hv_8_4x4_c: 5.0 ( 1.00x)
put_luma_hv_8_4x4_neon: 0.8 ( 6.67x)
put_luma_hv_8_4x4_i8mm: 0.2 (20.00x)
put_luma_hv_8_8x8_c: 13.2 ( 1.00x)
put_luma_hv_8_8x8_neon: 1.2 (10.60x)
put_luma_hv_8_8x8_i8mm: 1.2 (10.60x)
put_luma_hv_8_16x16_c: 44.2 ( 1.00x)
put_luma_hv_8_16x16_neon: 4.5 ( 9.83x)
put_luma_hv_8_16x16_i8mm: 4.2 (10.41x)
put_luma_hv_8_32x32_c: 160.8 ( 1.00x)
put_luma_hv_8_32x32_neon: 17.5 ( 9.19x)
put_luma_hv_8_32x32_i8mm: 16.0 (10.05x)
put_luma_hv_8_64x64_c: 611.2 ( 1.00x)
put_luma_hv_8_64x64_neon: 68.0 ( 8.99x)
put_luma_hv_8_64x64_i8mm: 62.2 ( 9.82x)
put_luma_hv_8_128x128_c: 2384.8 ( 1.00x)
put_luma_hv_8_128x128_neon: 268.8 ( 8.87x)
put_luma_hv_8_128x128_i8mm: 245.8 ( 9.70x)
This commit is contained in:
parent
a0b52afd32
commit
5ac6925803
@ -282,4 +282,12 @@ void ff_vvc_put_qpel_v8_8_neon(int16_t *dst, const uint8_t *_src,
|
||||
ptrdiff_t _srcstride, int height,
|
||||
const int8_t *hf, const int8_t *vf, int width);
|
||||
|
||||
/* Prototypes for the plain NEON qpel_hv (horizontal + vertical) put functions.
 * The macro receives the base name, the shared parameter list and an empty
 * extra-suffix argument. NOTE(review): the macro body is not visible here;
 * presumably it expands to one ff_vvc_put_qpel_hv<W>_8_neon prototype per
 * supported width (the init code installs 4, 8, 16, 32, 64 and 128) - confirm. */
NEON8_FNPROTO_PARTIAL_6(qpel_hv, (int16_t *dst,
|
||||
const uint8_t *src, ptrdiff_t srcstride, int height,
|
||||
const int8_t *hf, const int8_t *vf, int width),);
|
||||
|
||||
/* Same parameter list as the plain NEON qpel_hv prototypes, but with the
 * extra suffix `_i8mm`. NOTE(review): presumably this declares the
 * ff_vvc_put_qpel_hv<W>_8_neon_i8mm variants that the init code installs
 * in its i8mm section - confirm against the macro definition. */
NEON8_FNPROTO_PARTIAL_6(qpel_hv, (int16_t *dst,
|
||||
const uint8_t *src, ptrdiff_t srcstride, int height,
|
||||
const int8_t *hf, const int8_t *vf, int width), _i8mm);
|
||||
|
||||
#endif
|
||||
|
@ -4140,9 +4140,15 @@ endfunc
|
||||
DISABLE_I8MM
|
||||
#endif
|
||||
|
||||
// VVC front end for the shared 4-wide "hv" tail. It does its own filter setup
// from x5 and its own row stride in x7, then joins the HEVC routine that
// follows at local label 1, skipping that routine's filter load.
// Reached by a tail branch from ff_vvc_put_qpel_hv4_8_*, with sp pointing at
// the temporary buffer written by the horizontal pass.
function vvc_put_qpel_hv4_8_end_neon
|
||||
// NOTE(review): macro body not visible here; presumably loads the filter
// taps addressed by x5 (the vf argument saved by the caller) - confirm.
vvc_load_qpel_filterh x5
|
||||
// x7 = byte distance between consecutive rows of the temporary buffer; the
// code at label 1 uses it as the offset/stride when reading from sp.
mov x7, #(VVC_MAX_PB_SIZE * 2)
|
||||
// Continue in the common code of the next function.
b 1f
|
||||
endfunc
|
||||
|
||||
function hevc_put_hevc_qpel_hv4_8_end_neon
|
||||
load_qpel_filterh x5, x4
|
||||
1:
|
||||
ldr d16, [sp]
|
||||
ldr d17, [sp, x7]
|
||||
add sp, sp, x7, lsl #1
|
||||
@ -4194,9 +4200,16 @@ function hevc_put_hevc_qpel_hv6_8_end_neon
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// VVC front end for the shared 8-wide "hv" tail. It does its own filter setup
// from x5 and its own row stride in x7, then joins the HEVC routine that
// follows at local label 1, bypassing that routine's `mov x7, #128` and its
// filter load.
function vvc_put_qpel_hv8_8_end_neon
|
||||
// NOTE(review): macro body not visible here; presumably loads the filter
// taps addressed by x5 (the vf argument saved by the caller) - confirm.
vvc_load_qpel_filterh x5
|
||||
// x7 = byte distance between consecutive rows of the temporary buffer
// (the HEVC path uses 128 here instead).
mov x7, #(VVC_MAX_PB_SIZE * 2)
|
||||
// Continue in the common code of the next function.
b 1f
|
||||
endfunc
|
||||
|
||||
function hevc_put_hevc_qpel_hv8_8_end_neon
|
||||
mov x7, #128
|
||||
load_qpel_filterh x5, x4
|
||||
1:
|
||||
ldr q16, [sp]
|
||||
ldr q17, [sp, x7]
|
||||
add sp, sp, x7, lsl #1
|
||||
@ -4247,9 +4260,16 @@ function hevc_put_hevc_qpel_hv12_8_end_neon
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// VVC front end for the shared 16-wide "hv" tail. It does its own filter setup
// from x5 and its own row stride in x7, then joins the HEVC routine that
// follows at local label 1, bypassing that routine's `mov x7, #128` and its
// filter load.
function vvc_put_qpel_hv16_8_end_neon
|
||||
// NOTE(review): macro body not visible here; presumably loads the filter
// taps addressed by x5 (the vf argument saved by the caller) - confirm.
vvc_load_qpel_filterh x5
|
||||
// x7 = post-increment applied to sp by each row load at label 1
// (the HEVC path uses 128 here instead).
mov x7, #(VVC_MAX_PB_SIZE * 2)
|
||||
// Continue in the common code of the next function.
b 1f
|
||||
endfunc
|
||||
|
||||
function hevc_put_hevc_qpel_hv16_8_end_neon
|
||||
mov x7, #128
|
||||
load_qpel_filterh x5, x4
|
||||
1:
|
||||
ld1 {v16.8h, v17.8h}, [sp], x7
|
||||
ld1 {v18.8h, v19.8h}, [sp], x7
|
||||
ld1 {v20.8h, v21.8h}, [sp], x7
|
||||
@ -4272,6 +4292,12 @@ function hevc_put_hevc_qpel_hv16_8_end_neon
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// VVC front end for the shared 32-wide "hv" tail. It does its own filter setup
// from x5 and its own row stride in x7, then branches forward to local
// label 0.
// NOTE(review): label 0 is not visible in this excerpt; presumably it sits in
// hevc_put_hevc_qpel_hv32_8_end_neon right after that routine's own
// `mov x7, #128` / filter load - confirm.
function vvc_put_qpel_hv32_8_end_neon
|
||||
// NOTE(review): macro body not visible here; presumably loads the filter
// taps addressed by x5 (the vf argument saved by the caller) - confirm.
vvc_load_qpel_filterh x5
|
||||
// x7 = row stride in bytes for the VVC temporary buffer
// (the HEVC path uses 128 here instead).
mov x7, #(VVC_MAX_PB_SIZE * 2)
|
||||
// Continue in the common code of the next function.
b 0f
|
||||
endfunc
|
||||
|
||||
function hevc_put_hevc_qpel_hv32_8_end_neon
|
||||
mov x7, #128
|
||||
load_qpel_filterh x5, x4
|
||||
@ -4325,6 +4351,25 @@ function ff_hevc_put_hevc_qpel_hv4_8_\suffix, export=1
|
||||
b hevc_put_hevc_qpel_hv4_8_end_neon
|
||||
endfunc
|
||||
|
||||
// Exported 4-wide VVC qpel "hv" put. Runs the horizontal pass
// (ff_vvc_put_qpel_h4_8_\suffix) into a temporary buffer on the stack, then
// tail-branches to vvc_put_qpel_hv4_8_end_neon.
// Arguments, assuming the usual AAPCS64 mapping of the C prototype:
//   x0 = dst, x1 = src, x2 = srcstride, w3 = height, x4 = hf, x5 = vf,
//   w6 = width.
function ff_vvc_put_qpel_hv4_8_\suffix, export=1
|
||||
add w10, w3, #8 // w10 = height + 8 rows
|
||||
lsl x10, x10, #8 // x10 = (height + 8) * 256 bytes
|
||||
mov x14, sp // x14 = stack pointer on entry
|
||||
sub sp, sp, x10 // tmp_array
|
||||
// 48-byte save area below tmp_array:
//   [sp+0] = x5, x30   [sp+16] = x0, x3   [sp+32] = x14 (entry sp)
stp x5, x30, [sp, #-48]!
|
||||
stp x0, x3, [sp, #16]
|
||||
str x14, [sp, #32]
|
||||
add x0, sp, #48 // dst for the horizontal pass = start of tmp_array
|
||||
// The two subtractions together give src -= 3 * srcstride.
sub x1, x1, x2, lsl #1
|
||||
add x3, x3, #7 // horizontal pass processes height + 7 rows
|
||||
sub x1, x1, x2
|
||||
bl X(ff_vvc_put_qpel_h4_8_\suffix)
|
||||
// Restore what the call may have clobbered; x14 is handed on to the tail.
// NOTE(review): the tail presumably restores sp from x14 - not visible here.
ldr x14, [sp, #32]
|
||||
ldp x0, x3, [sp, #16]
|
||||
ldp x5, x30, [sp], #48 // pop the save area: sp now points at tmp_array
|
||||
b vvc_put_qpel_hv4_8_end_neon
|
||||
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_qpel_hv6_8_\suffix, export=1
|
||||
add w10, w3, #8
|
||||
mov x7, #128
|
||||
@ -4364,6 +4409,25 @@ function ff_hevc_put_hevc_qpel_hv8_8_\suffix, export=1
|
||||
b hevc_put_hevc_qpel_hv8_8_end_neon
|
||||
endfunc
|
||||
|
||||
// Exported 8-wide VVC qpel "hv" put. Runs the horizontal pass
// (ff_vvc_put_qpel_h8_8_\suffix) into a temporary buffer on the stack, then
// tail-branches to vvc_put_qpel_hv8_8_end_neon.
// Arguments, assuming the usual AAPCS64 mapping of the C prototype:
//   x0 = dst, x1 = src, x2 = srcstride, w3 = height, x4 = hf, x5 = vf,
//   w6 = width.
function ff_vvc_put_qpel_hv8_8_\suffix, export=1
|
||||
add w10, w3, #8 // w10 = height + 8 rows
|
||||
lsl x10, x10, #8 // x10 = (height + 8) * 256 bytes
|
||||
// This and the later `sub x1, x1, x2` together give src -= 3 * srcstride.
sub x1, x1, x2, lsl #1
|
||||
mov x14, sp // x14 = stack pointer on entry
|
||||
sub sp, sp, x10 // tmp_array
|
||||
// 48-byte save area below tmp_array:
//   [sp+0] = x5, x30   [sp+16] = x0, x3   [sp+32] = x14 (entry sp)
stp x5, x30, [sp, #-48]!
|
||||
stp x0, x3, [sp, #16]
|
||||
str x14, [sp, #32]
|
||||
add x0, sp, #48 // dst for the horizontal pass = start of tmp_array
|
||||
add x3, x3, #7 // horizontal pass processes height + 7 rows
|
||||
sub x1, x1, x2
|
||||
bl X(ff_vvc_put_qpel_h8_8_\suffix)
|
||||
// Restore what the call may have clobbered; x14 is handed on to the tail.
// NOTE(review): the tail presumably restores sp from x14 - not visible here.
ldr x14, [sp, #32]
|
||||
ldp x0, x3, [sp, #16]
|
||||
ldp x5, x30, [sp], #48 // pop the save area: sp now points at tmp_array
|
||||
b vvc_put_qpel_hv8_8_end_neon
|
||||
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_qpel_hv12_8_\suffix, export=1
|
||||
add w10, w3, #8
|
||||
lsl x10, x10, #7
|
||||
@ -4403,6 +4467,25 @@ function ff_hevc_put_hevc_qpel_hv16_8_\suffix, export=1
|
||||
b hevc_put_hevc_qpel_hv16_8_end_neon
|
||||
endfunc
|
||||
|
||||
// Exported 16-wide VVC qpel "hv" put. Runs the horizontal pass
// (ff_vvc_put_qpel_h16_8_\suffix) into a temporary buffer on the stack, then
// tail-branches to vvc_put_qpel_hv16_8_end_neon.
// Arguments, assuming the usual AAPCS64 mapping of the C prototype:
//   x0 = dst, x1 = src, x2 = srcstride, w3 = height, x4 = hf, x5 = vf,
//   w6 = width.
function ff_vvc_put_qpel_hv16_8_\suffix, export=1
|
||||
add w10, w3, #8 // w10 = height + 8 rows
|
||||
lsl x10, x10, #8 // x10 = (height + 8) * 256 bytes
|
||||
// This and the later `sub x1, x1, x2` together give src -= 3 * srcstride.
sub x1, x1, x2, lsl #1
|
||||
mov x14, sp // x14 = stack pointer on entry
|
||||
sub sp, sp, x10 // tmp_array
|
||||
// 48-byte save area below tmp_array:
//   [sp+0] = x5, x30   [sp+16] = x0, x3   [sp+32] = x14 (entry sp)
stp x5, x30, [sp, #-48]!
|
||||
stp x0, x3, [sp, #16]
|
||||
str x14, [sp, #32]
|
||||
add x3, x3, #7 // horizontal pass processes height + 7 rows
|
||||
add x0, sp, #48 // dst for the horizontal pass = start of tmp_array
|
||||
sub x1, x1, x2
|
||||
bl X(ff_vvc_put_qpel_h16_8_\suffix)
|
||||
// Restore what the call may have clobbered; x14 is handed on to the tail.
// NOTE(review): the tail presumably restores sp from x14 - not visible here.
ldr x14, [sp, #32]
|
||||
ldp x0, x3, [sp, #16]
|
||||
ldp x5, x30, [sp], #48 // pop the save area: sp now points at tmp_array
|
||||
b vvc_put_qpel_hv16_8_end_neon
|
||||
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_qpel_hv24_8_\suffix, export=1
|
||||
stp x4, x5, [sp, #-64]!
|
||||
stp x2, x3, [sp, #16]
|
||||
@ -4439,6 +4522,26 @@ function ff_hevc_put_hevc_qpel_hv32_8_\suffix, export=1
|
||||
b hevc_put_hevc_qpel_hv32_8_end_neon
|
||||
endfunc
|
||||
|
||||
// Exported 32-wide VVC qpel "hv" put. Runs the horizontal pass
// (ff_vvc_put_qpel_h32_8_\suffix) into a temporary buffer on the stack, then
// tail-branches to vvc_put_qpel_hv32_8_end_neon. Also called with `bl` by the
// 64-wide wrapper, once per 32-column half.
// Arguments, assuming the usual AAPCS64 mapping of the C prototype:
//   x0 = dst, x1 = src, x2 = srcstride, w3 = height, x4 = hf, x5 = vf,
//   w6 = width (overwritten with 32 below).
function ff_vvc_put_qpel_hv32_8_\suffix, export=1
|
||||
add w10, w3, #8 // w10 = height + 8 rows
|
||||
// This and the later `sub x1, x1, x2` together give src -= 3 * srcstride.
sub x1, x1, x2, lsl #1
|
||||
lsl x10, x10, #8 // x10 = (height + 8) * 256 bytes
|
||||
sub x1, x1, x2
|
||||
mov x14, sp // x14 = stack pointer on entry
|
||||
sub sp, sp, x10 // tmp_array
|
||||
// 48-byte save area below tmp_array:
//   [sp+0] = x5, x30   [sp+16] = x0, x3   [sp+32] = x14 (entry sp)
stp x5, x30, [sp, #-48]!
|
||||
stp x0, x3, [sp, #16]
|
||||
str x14, [sp, #32]
|
||||
add x3, x3, #7 // horizontal pass processes height + 7 rows
|
||||
add x0, sp, #48 // dst for the horizontal pass = start of tmp_array
|
||||
mov w6, #32 // force the width argument of the horizontal pass to 32
|
||||
bl X(ff_vvc_put_qpel_h32_8_\suffix)
|
||||
// Restore what the call may have clobbered; x14 is handed on to the tail.
// NOTE(review): the tail presumably restores sp from x14 - not visible here.
ldr x14, [sp, #32]
|
||||
ldp x0, x3, [sp, #16]
|
||||
ldp x5, x30, [sp], #48 // pop the save area: sp now points at tmp_array
|
||||
b vvc_put_qpel_hv32_8_end_neon
|
||||
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_qpel_hv48_8_\suffix, export=1
|
||||
stp x4, x5, [sp, #-64]!
|
||||
stp x2, x3, [sp, #16]
|
||||
@ -4472,6 +4575,43 @@ function ff_hevc_put_hevc_qpel_hv64_8_\suffix, export=1
|
||||
ldr x30, [sp], #16
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// Exported 64-wide VVC qpel "hv" put, done as two calls to the 32-wide
// function: first on the original dst/src, then on the right half.
// Arguments, assuming the usual AAPCS64 mapping of the C prototype:
//   x0 = dst, x1 = src, x2 = srcstride, w3 = height, x4 = hf, x5 = vf,
//   w6 = width.
function ff_vvc_put_qpel_hv64_8_\suffix, export=1
|
||||
// 64-byte frame: [sp+0] = x4, x5  [sp+16] = x2, x3  [sp+32] = x0, x1
//                [sp+48] = x30
stp x4, x5, [sp, #-64]!
|
||||
stp x2, x3, [sp, #16]
|
||||
stp x0, x1, [sp, #32]
|
||||
str x30, [sp, #48]
|
||||
mov x6, #32 // width argument for the 32-wide call
|
||||
bl X(ff_vvc_put_qpel_hv32_8_\suffix)
|
||||
// Reload the original arguments for the second half.
ldp x0, x1, [sp, #32]
|
||||
ldp x2, x3, [sp, #16]
|
||||
ldp x4, x5, [sp], #48 // pop 48 bytes; the saved x30 is now at [sp]
|
||||
add x1, x1, #32 // src += 32 bytes (32 uint8_t pixels)
|
||||
add x0, x0, #64 // dst += 64 bytes (32 int16_t samples)
|
||||
mov x6, #32
|
||||
bl X(ff_vvc_put_qpel_hv32_8_\suffix)
|
||||
ldr x30, [sp], #16 // restore the return address and release the frame
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// Exported 128-wide VVC qpel "hv" put, done as two calls to the 64-wide
// function: first on the original dst/src, then on the right half.
// Arguments, assuming the usual AAPCS64 mapping of the C prototype:
//   x0 = dst, x1 = src, x2 = srcstride, w3 = height, x4 = hf, x5 = vf,
//   w6 = width.
function ff_vvc_put_qpel_hv128_8_\suffix, export=1
|
||||
// 64-byte frame: [sp+0] = x4, x5  [sp+16] = x2, x3  [sp+32] = x0, x1
//                [sp+48] = x30
stp x4, x5, [sp, #-64]!
|
||||
stp x2, x3, [sp, #16]
|
||||
stp x0, x1, [sp, #32]
|
||||
str x30, [sp, #48]
|
||||
mov x6, #64 // width argument for the 64-wide call
|
||||
bl X(ff_vvc_put_qpel_hv64_8_\suffix)
|
||||
// Reload the original arguments for the second half.
ldp x0, x1, [sp, #32]
|
||||
ldp x2, x3, [sp, #16]
|
||||
ldp x4, x5, [sp], #48 // pop 48 bytes; the saved x30 is now at [sp]
|
||||
add x1, x1, #64 // src += 64 bytes (64 uint8_t pixels)
|
||||
add x0, x0, #128 // dst += 128 bytes (64 int16_t samples)
|
||||
mov x6, #64
|
||||
bl X(ff_vvc_put_qpel_hv64_8_\suffix)
|
||||
ldr x30, [sp], #16 // restore the return address and release the frame
|
||||
ret
|
||||
endfunc
|
||||
|
||||
.endm
|
||||
|
||||
qpel_hv neon
|
||||
|
@ -67,6 +67,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
||||
c->inter.put[0][5][1][0] =
|
||||
c->inter.put[0][6][1][0] = ff_vvc_put_qpel_v8_8_neon;
|
||||
|
||||
c->inter.put[0][1][1][1] = ff_vvc_put_qpel_hv4_8_neon;
|
||||
c->inter.put[0][2][1][1] = ff_vvc_put_qpel_hv8_8_neon;
|
||||
c->inter.put[0][3][1][1] = ff_vvc_put_qpel_hv16_8_neon;
|
||||
c->inter.put[0][4][1][1] = ff_vvc_put_qpel_hv32_8_neon;
|
||||
c->inter.put[0][5][1][1] = ff_vvc_put_qpel_hv64_8_neon;
|
||||
c->inter.put[0][6][1][1] = ff_vvc_put_qpel_hv128_8_neon;
|
||||
|
||||
c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
|
||||
c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
|
||||
c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;
|
||||
@ -103,6 +110,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
||||
c->inter.put[0][4][0][1] = ff_vvc_put_qpel_h32_8_neon_i8mm;
|
||||
c->inter.put[0][5][0][1] = ff_vvc_put_qpel_h64_8_neon_i8mm;
|
||||
c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h128_8_neon_i8mm;
|
||||
|
||||
c->inter.put[0][1][1][1] = ff_vvc_put_qpel_hv4_8_neon_i8mm;
|
||||
c->inter.put[0][2][1][1] = ff_vvc_put_qpel_hv8_8_neon_i8mm;
|
||||
c->inter.put[0][3][1][1] = ff_vvc_put_qpel_hv16_8_neon_i8mm;
|
||||
c->inter.put[0][4][1][1] = ff_vvc_put_qpel_hv32_8_neon_i8mm;
|
||||
c->inter.put[0][5][1][1] = ff_vvc_put_qpel_hv64_8_neon_i8mm;
|
||||
c->inter.put[0][6][1][1] = ff_vvc_put_qpel_hv128_8_neon_i8mm;
|
||||
}
|
||||
} else if (bd == 10) {
|
||||
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
|
||||
|
Loading…
x
Reference in New Issue
Block a user