mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-02-09 14:14:39 +02:00
aarch64/vvc: Add put_epel_hv
On Apple M1:
put_chroma_hv_8_4x4_c:          1.7 ( 1.00x)
put_chroma_hv_8_4x4_neon:       0.2 ( 7.67x)
put_chroma_hv_8_8x8_c:          5.5 ( 1.00x)
put_chroma_hv_8_8x8_neon:       0.5 (11.53x)
put_chroma_hv_8_16x16_c:       18.5 ( 1.00x)
put_chroma_hv_8_16x16_neon:     1.5 (12.53x)
put_chroma_hv_8_32x32_c:       72.5 ( 1.00x)
put_chroma_hv_8_32x32_neon:     4.7 (15.34x)
put_chroma_hv_8_64x64_c:      274.0 ( 1.00x)
put_chroma_hv_8_64x64_neon:    18.5 (14.83x)
put_chroma_hv_8_128x128_c:   1058.7 ( 1.00x)
put_chroma_hv_8_128x128_neon:  75.2 (14.07x)

On Android Pixel 8 Pro:
put_chroma_hv_8_4x4_c:          1.2 ( 1.00x)
put_chroma_hv_8_4x4_neon:       0.0 ( 0.00x)
put_chroma_hv_8_4x4_i8mm:       0.2 ( 5.00x)
put_chroma_hv_8_8x8_c:          4.0 ( 1.00x)
put_chroma_hv_8_8x8_neon:       0.5 ( 8.00x)
put_chroma_hv_8_8x8_i8mm:       0.5 ( 8.00x)
put_chroma_hv_8_16x16_c:       15.2 ( 1.00x)
put_chroma_hv_8_16x16_neon:     2.5 ( 6.10x)
put_chroma_hv_8_16x16_i8mm:     2.2 ( 6.78x)
put_chroma_hv_8_32x32_c:       61.0 ( 1.00x)
put_chroma_hv_8_32x32_neon:     9.8 ( 6.26x)
put_chroma_hv_8_32x32_i8mm:     8.5 ( 7.18x)
put_chroma_hv_8_64x64_c:      229.5 ( 1.00x)
put_chroma_hv_8_64x64_neon:    38.5 ( 5.96x)
put_chroma_hv_8_64x64_i8mm:    34.0 ( 6.75x)
put_chroma_hv_8_128x128_c:    919.8 ( 1.00x)
put_chroma_hv_8_128x128_neon: 154.5 ( 5.95x)
put_chroma_hv_8_128x128_i8mm: 140.0 ( 6.57x)
This commit is contained in:
parent
0dcf204e5d
commit
1be5a2374f
@ -297,4 +297,12 @@ NEON8_FNPROTO_PARTIAL_6(qpel_hv, (int16_t *dst,
|
||||
const uint8_t *src, ptrdiff_t srcstride, int height,
|
||||
const int8_t *hf, const int8_t *vf, int width), _i8mm);
|
||||
|
||||
/* Prototypes for the epel_hv functions, produced by NEON8_FNPROTO_PARTIAL_6
 * with an empty third (suffix) argument. All of them share the argument list
 * (dst, src, srcstride, height, hf, vf, width).
 * NOTE(review): presumably this expands to the ff_vvc_put_epel_hv{4..128}_8_neon
 * names assigned in ff_vvc_dsp_init_aarch64 -- confirm against the macro
 * definition. */
NEON8_FNPROTO_PARTIAL_6(epel_hv, (int16_t *dst,
|
||||
const uint8_t *src, ptrdiff_t srcstride, int height,
|
||||
const int8_t *hf, const int8_t *vf, int width),);
|
||||
|
||||
/* Same epel_hv prototype list as the plain NEON variant, but passing _i8mm as
 * the third (suffix) argument of NEON8_FNPROTO_PARTIAL_6.
 * NOTE(review): presumably this expands to the
 * ff_vvc_put_epel_hv{4..128}_8_neon_i8mm names assigned in
 * ff_vvc_dsp_init_aarch64 -- confirm against the macro definition. */
NEON8_FNPROTO_PARTIAL_6(epel_hv, (int16_t *dst,
|
||||
const uint8_t *src, ptrdiff_t srcstride, int height,
|
||||
const int8_t *hf, const int8_t *vf, int width), _i8mm);
|
||||
|
||||
#endif
|
||||
|
@ -72,6 +72,11 @@ endconst
|
||||
sxtl v0.8h, v0.8b
|
||||
.endm
|
||||
|
||||
// vvc_load_epel_filterh freg
// Loads filter coefficients from the address held in register freg and widens
// them to 16 bit so they can be used as by-element operands (v0.h[n]).
//   in:       freg = register holding the address of the int8 coefficients
//   out:      v0.8h = the eight loaded bytes, each sign-extended to 16 bit
//   clobbers: v0 only; freg is not modified (no post-increment)
// NOTE(review): this reads 8 bytes from freg, while the calc_epelh lines
// visible in this file only reference v0.h[0] and v0.h[1] -- confirm that an
// 8-byte read is always in bounds for every filter pointer passed in.
.macro vvc_load_epel_filterh freg
|
||||
ld1 {v0.8b}, [\freg] // v0.8b = 8 consecutive int8 values
|
||||
sxtl v0.8h, v0.8b // sign-extend each byte to a 16-bit lane
|
||||
.endm
|
||||
|
||||
.macro calc_epelh dst, src0, src1, src2, src3
|
||||
smull \dst\().4s, \src0\().4h, v0.h[0]
|
||||
smlal \dst\().4s, \src1\().4h, v0.h[1]
|
||||
@ -2299,10 +2304,16 @@ endfunc
|
||||
DISABLE_I8MM
|
||||
#endif
|
||||
|
||||
// VVC entry into the shared 4-wide epel HV tail (declared without export=1).
//   in:  x5 = address of the filter handed to vvc_load_epel_filterh
//        sp = start of the intermediate rows that the code at 0: reads
//   out: v0  = filter loaded by vvc_load_epel_filterh
//        x10 = VVC_MAX_PB_SIZE * 2, used at 0: as the offset between the rows
//              read from sp (presumably the byte pitch of one int16_t row)
// Control then continues at the local label 0: inside
// hevc_put_hevc_epel_hv4_8_end_neon, which must directly follow this stub so
// that the forward reference 0f resolves to it.
function vvc_put_epel_hv4_8_end_neon
        mov             x10, #(VVC_MAX_PB_SIZE * 2)
        vvc_load_epel_filterh x5
        b               0f
endfunc
|
||||
|
||||
function hevc_put_hevc_epel_hv4_8_end_neon
|
||||
load_epel_filterh x5, x4
|
||||
mov x10, #(HEVC_MAX_PB_SIZE * 2)
|
||||
0:
|
||||
ldr d16, [sp]
|
||||
ldr d17, [sp, x10]
|
||||
add sp, sp, x10, lsl #1
|
||||
@ -2339,9 +2350,16 @@ function hevc_put_hevc_epel_hv6_8_end_neon
|
||||
2: ret
|
||||
endfunc
|
||||
|
||||
// VVC entry into the shared 8-wide epel HV tail (declared without export=1).
//   in:  x5 = address of the filter handed to vvc_load_epel_filterh
//        sp = start of the intermediate rows that the code at 0: reads
//   out: v0  = filter loaded by vvc_load_epel_filterh
//        x10 = VVC_MAX_PB_SIZE * 2, used at 0: as the offset between the rows
//              read from sp (presumably the byte pitch of one int16_t row)
// Control then continues at the local label 0: inside
// hevc_put_hevc_epel_hv8_8_end_neon, which must directly follow this stub so
// that the forward reference 0f resolves to it.
function vvc_put_epel_hv8_8_end_neon
        mov             x10, #(VVC_MAX_PB_SIZE * 2)
        vvc_load_epel_filterh x5
        b               0f
endfunc
|
||||
|
||||
function hevc_put_hevc_epel_hv8_8_end_neon
|
||||
load_epel_filterh x5, x4
|
||||
mov x10, #(HEVC_MAX_PB_SIZE * 2)
|
||||
0:
|
||||
ldr q16, [sp]
|
||||
ldr q17, [sp, x10]
|
||||
add sp, sp, x10, lsl #1
|
||||
@ -2379,9 +2397,16 @@ function hevc_put_hevc_epel_hv12_8_end_neon
|
||||
2: ret
|
||||
endfunc
|
||||
|
||||
// VVC entry into the shared 16-wide epel HV tail (declared without export=1).
//   in:  x5 = address of the filter handed to vvc_load_epel_filterh
//        sp = start of the intermediate rows that the code at 0: reads
//   out: v0  = filter loaded by vvc_load_epel_filterh
//        x10 = VVC_MAX_PB_SIZE * 2, used at 0: as the post-increment applied
//              to sp after each row load (presumably the byte pitch of one
//              int16_t row)
// Control then continues at the local label 0: inside
// hevc_put_hevc_epel_hv16_8_end_neon, which must directly follow this stub so
// that the forward reference 0f resolves to it.
function vvc_put_epel_hv16_8_end_neon
        mov             x10, #(VVC_MAX_PB_SIZE * 2)
        vvc_load_epel_filterh x5
        b               0f
endfunc
|
||||
|
||||
function hevc_put_hevc_epel_hv16_8_end_neon
|
||||
load_epel_filterh x5, x4
|
||||
mov x10, #(HEVC_MAX_PB_SIZE * 2)
|
||||
0:
|
||||
ld1 {v16.8h, v17.8h}, [sp], x10
|
||||
ld1 {v18.8h, v19.8h}, [sp], x10
|
||||
ld1 {v20.8h, v21.8h}, [sp], x10
|
||||
@ -2437,6 +2462,21 @@ function ff_hevc_put_hevc_epel_hv4_8_\suffix, export=1
|
||||
b hevc_put_hevc_epel_hv4_8_end_neon
|
||||
endfunc
|
||||
|
||||
// ff_vvc_put_epel_hv4_8_<suffix>: 4-wide epel HV filter, done in two passes.
// Arguments follow the epel_hv prototype (dst, src, srcstride, height, hf, vf,
// width), i.e. under AAPCS64: x0 = dst, x1 = src, x2 = srcstride, w3 = height,
// x4 = hf, x5 = vf, w6 = width.
//
// Pass 1 calls ff_vvc_put_epel_h4_8_<suffix> with the destination redirected
// to a stack scratch area of (height + 3) * 256 bytes, the source moved back
// by one stride and the row count raised to height + 3.
// Pass 2 is a tail branch to vvc_put_epel_hv4_8_end_neon, entered with the
// original x0, x3, x5 and x30 restored and sp pointing at the scratch area.
// This function never adds the scratch size back to sp itself; that is left
// to the tail.
function ff_vvc_put_epel_hv4_8_\suffix, export=1
        sub             x1, x1, x2                      // start reading one source row earlier
        add             w10, w3, #3
        lsl             x10, x10, #8                    // x10 = (height + 3) * 256
        sub             sp, sp, x10                     // tmp_array
        stp             x5, x30, [sp, #-32]!            // 32-byte save area below tmp_array: vf, lr
        stp             x0, x3, [sp, #16]               // ... and dst, height as passed in
        add             w3, w3, #3                      // row count for the horizontal pass
        add             x0, sp, #32                     // horizontal pass writes into tmp_array
        bl              X(ff_vvc_put_epel_h4_8_\suffix)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32              // drop the save area, sp = tmp_array
        b               vvc_put_epel_hv4_8_end_neon
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_epel_hv6_8_\suffix, export=1
|
||||
add w10, w3, #3
|
||||
lsl x10, x10, #7
|
||||
@ -2467,6 +2507,21 @@ function ff_hevc_put_hevc_epel_hv8_8_\suffix, export=1
|
||||
b hevc_put_hevc_epel_hv8_8_end_neon
|
||||
endfunc
|
||||
|
||||
// ff_vvc_put_epel_hv8_8_<suffix>: 8-wide epel HV filter, done in two passes.
// Arguments follow the epel_hv prototype (dst, src, srcstride, height, hf, vf,
// width), i.e. under AAPCS64: x0 = dst, x1 = src, x2 = srcstride, w3 = height,
// x4 = hf, x5 = vf, w6 = width.
//
// Pass 1 calls ff_vvc_put_epel_h8_8_<suffix> with the destination redirected
// to a stack scratch area of (height + 3) * 256 bytes, the source moved back
// by one stride and the row count raised to height + 3.
// Pass 2 is a tail branch to vvc_put_epel_hv8_8_end_neon, entered with the
// original x0, x3, x5 and x30 restored and sp pointing at the scratch area.
// This function never adds the scratch size back to sp itself; that is left
// to the tail.
function ff_vvc_put_epel_hv8_8_\suffix, export=1
        sub             x1, x1, x2                      // start reading one source row earlier
        add             w10, w3, #3
        lsl             x10, x10, #8                    // x10 = (height + 3) * 256
        sub             sp, sp, x10                     // tmp_array
        stp             x5, x30, [sp, #-32]!            // 32-byte save area below tmp_array: vf, lr
        stp             x0, x3, [sp, #16]               // ... and dst, height as passed in
        add             w3, w3, #3                      // row count for the horizontal pass
        add             x0, sp, #32                     // horizontal pass writes into tmp_array
        bl              X(ff_vvc_put_epel_h8_8_\suffix)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32              // drop the save area, sp = tmp_array
        b               vvc_put_epel_hv8_8_end_neon
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_epel_hv12_8_\suffix, export=1
|
||||
add w10, w3, #3
|
||||
lsl x10, x10, #7
|
||||
@ -2497,6 +2552,21 @@ function ff_hevc_put_hevc_epel_hv16_8_\suffix, export=1
|
||||
b hevc_put_hevc_epel_hv16_8_end_neon
|
||||
endfunc
|
||||
|
||||
// ff_vvc_put_epel_hv16_8_<suffix>: 16-wide epel HV filter, done in two passes.
// Arguments follow the epel_hv prototype (dst, src, srcstride, height, hf, vf,
// width), i.e. under AAPCS64: x0 = dst, x1 = src, x2 = srcstride, w3 = height,
// x4 = hf, x5 = vf, w6 = width.
//
// Pass 1 calls ff_vvc_put_epel_h16_8_<suffix> with the destination redirected
// to a stack scratch area of (height + 3) * 256 bytes, the source moved back
// by one stride and the row count raised to height + 3.
// Pass 2 is a tail branch to vvc_put_epel_hv16_8_end_neon, entered with the
// original x0, x3, x5 and x30 restored and sp pointing at the scratch area.
// This function never adds the scratch size back to sp itself; that is left
// to the tail.
function ff_vvc_put_epel_hv16_8_\suffix, export=1
        sub             x1, x1, x2                      // start reading one source row earlier
        add             w10, w3, #3
        lsl             x10, x10, #8                    // x10 = (height + 3) * 256
        sub             sp, sp, x10                     // tmp_array
        stp             x5, x30, [sp, #-32]!            // 32-byte save area below tmp_array: vf, lr
        stp             x0, x3, [sp, #16]               // ... and dst, height as passed in
        add             w3, w3, #3                      // row count for the horizontal pass
        add             x0, sp, #32                     // horizontal pass writes into tmp_array
        bl              X(ff_vvc_put_epel_h16_8_\suffix)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32              // drop the save area, sp = tmp_array
        b               vvc_put_epel_hv16_8_end_neon
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_epel_hv24_8_\suffix, export=1
|
||||
add w10, w3, #3
|
||||
lsl x10, x10, #7
|
||||
@ -2530,6 +2600,24 @@ function ff_hevc_put_hevc_epel_hv32_8_\suffix, export=1
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// ff_vvc_put_epel_hv32_8_<suffix>: 32-wide epel HV filter, built from two
// calls to the 16-wide function.
// Arguments follow the epel_hv prototype (dst, src, srcstride, height, hf, vf,
// width), i.e. under AAPCS64: x0 = dst, x1 = src, x2 = srcstride, w3 = height,
// x4 = hf, x5 = vf, w6 = width.
//
// Stack frame (64 bytes): [sp] = x4, x5   [sp + 16] = x2, x3
//                         [sp + 32] = x0, x1   [sp + 48] = x30
// Call 1 runs on the arguments as passed in, with x6 forced to 16.
// Call 2 runs on the reloaded arguments with x0 advanced by 32 bytes (16
// int16_t outputs) and x1 by 16 bytes (16 uint8_t inputs), again with x6 = 16.
// Only the x30 slot is still on the stack during call 2; it is popped last.
function ff_vvc_put_epel_hv32_8_\suffix, export=1
        mov             x6, #16
        stp             x4, x5, [sp, #-64]!             // allocate the frame
        str             x30, [sp, #48]
        stp             x0, x1, [sp, #32]
        stp             x2, x3, [sp, #16]
        bl              X(ff_vvc_put_epel_hv16_8_\suffix)
        ldp             x2, x3, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x4, x5, [sp], #48               // release all but the x30 slot
        mov             x6, #16
        add             x1, x1, #16                     // second half of the source columns
        add             x0, x0, #32                     // second half of the destination columns
        bl              X(ff_vvc_put_epel_hv16_8_\suffix)
        ldr             x30, [sp], #16
        ret
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_epel_hv48_8_\suffix, export=1
|
||||
stp x4, x5, [sp, #-64]!
|
||||
stp x2, x3, [sp, #16]
|
||||
@ -2579,6 +2667,43 @@ function ff_hevc_put_hevc_epel_hv64_8_\suffix, export=1
|
||||
ldr x30, [sp], #16
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// ff_vvc_put_epel_hv64_8_<suffix>: 64-wide epel HV filter, built from two
// calls to the 32-wide function.
// Arguments follow the epel_hv prototype (dst, src, srcstride, height, hf, vf,
// width), i.e. under AAPCS64: x0 = dst, x1 = src, x2 = srcstride, w3 = height,
// x4 = hf, x5 = vf, w6 = width.
//
// Stack frame (64 bytes): [sp] = x4, x5   [sp + 16] = x2, x3
//                         [sp + 32] = x0, x1   [sp + 48] = x30
// Call 1 runs on the arguments as passed in, with x6 forced to 32.
// Call 2 runs on the reloaded arguments with x0 advanced by 64 bytes (32
// int16_t outputs) and x1 by 32 bytes (32 uint8_t inputs), again with x6 = 32.
// Only the x30 slot is still on the stack during call 2; it is popped last.
function ff_vvc_put_epel_hv64_8_\suffix, export=1
        mov             x6, #32
        stp             x4, x5, [sp, #-64]!             // allocate the frame
        str             x30, [sp, #48]
        stp             x0, x1, [sp, #32]
        stp             x2, x3, [sp, #16]
        bl              X(ff_vvc_put_epel_hv32_8_\suffix)
        ldp             x2, x3, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x4, x5, [sp], #48               // release all but the x30 slot
        mov             x6, #32
        add             x1, x1, #32                     // second half of the source columns
        add             x0, x0, #64                     // second half of the destination columns
        bl              X(ff_vvc_put_epel_hv32_8_\suffix)
        ldr             x30, [sp], #16
        ret
endfunc
|
||||
|
||||
// ff_vvc_put_epel_hv128_8_<suffix>: 128-wide epel HV filter, built from two
// calls to the 64-wide function.
// Arguments follow the epel_hv prototype (dst, src, srcstride, height, hf, vf,
// width), i.e. under AAPCS64: x0 = dst, x1 = src, x2 = srcstride, w3 = height,
// x4 = hf, x5 = vf, w6 = width.
//
// Stack frame (64 bytes): [sp] = x4, x5   [sp + 16] = x2, x3
//                         [sp + 32] = x0, x1   [sp + 48] = x30
// Call 1 runs on the arguments as passed in, with x6 forced to 64.
// Call 2 runs on the reloaded arguments with x0 advanced by 128 bytes (64
// int16_t outputs) and x1 by 64 bytes (64 uint8_t inputs), again with x6 = 64.
// Only the x30 slot is still on the stack during call 2; it is popped last.
function ff_vvc_put_epel_hv128_8_\suffix, export=1
        mov             x6, #64
        stp             x4, x5, [sp, #-64]!             // allocate the frame
        str             x30, [sp, #48]
        stp             x0, x1, [sp, #32]
        stp             x2, x3, [sp, #16]
        bl              X(ff_vvc_put_epel_hv64_8_\suffix)
        ldp             x2, x3, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x4, x5, [sp], #48               // release all but the x30 slot
        mov             x6, #64
        add             x1, x1, #64                     // second half of the source columns
        add             x0, x0, #128                    // second half of the destination columns
        bl              X(ff_vvc_put_epel_hv64_8_\suffix)
        ldr             x30, [sp], #16
        ret
endfunc
|
||||
|
||||
.endm
|
||||
|
||||
epel_hv neon
|
||||
|
@ -84,6 +84,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
||||
c->inter.put[1][5][0][1] =
|
||||
c->inter.put[1][6][0][1] = ff_vvc_put_epel_h32_8_neon;
|
||||
|
||||
c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon;
|
||||
c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon;
|
||||
c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon;
|
||||
c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon;
|
||||
c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon;
|
||||
c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon;
|
||||
|
||||
c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
|
||||
c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
|
||||
c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;
|
||||
@ -134,6 +141,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
||||
c->inter.put[1][4][0][1] = ff_vvc_put_epel_h32_8_neon_i8mm;
|
||||
c->inter.put[1][5][0][1] = ff_vvc_put_epel_h64_8_neon_i8mm;
|
||||
c->inter.put[1][6][0][1] = ff_vvc_put_epel_h128_8_neon_i8mm;
|
||||
|
||||
c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon_i8mm;
|
||||
c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon_i8mm;
|
||||
c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon_i8mm;
|
||||
c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon_i8mm;
|
||||
c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon_i8mm;
|
||||
c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon_i8mm;
|
||||
}
|
||||
} else if (bd == 10) {
|
||||
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
|
||||
|
Loading…
x
Reference in New Issue
Block a user