1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-08-10 06:10:52 +02:00

aarch64/h26x: Add put_hevc_pel_bi_w_pixels

Benchmark on Raspberry Pi 5 (Cortex-A76), C vs. NEON:

put_hevc_pel_bi_w_pixels4_8_c:                          90.0 ( 1.00x)
put_hevc_pel_bi_w_pixels4_8_neon:                       34.1 ( 2.64x)
put_hevc_pel_bi_w_pixels6_8_c:                         188.3 ( 1.00x)
put_hevc_pel_bi_w_pixels6_8_neon:                       73.5 ( 2.56x)
put_hevc_pel_bi_w_pixels8_8_c:                         327.1 ( 1.00x)
put_hevc_pel_bi_w_pixels8_8_neon:                       75.8 ( 4.32x)
put_hevc_pel_bi_w_pixels12_8_c:                        728.8 ( 1.00x)
put_hevc_pel_bi_w_pixels12_8_neon:                     186.1 ( 3.92x)
put_hevc_pel_bi_w_pixels16_8_c:                       1288.1 ( 1.00x)
put_hevc_pel_bi_w_pixels16_8_neon:                     268.5 ( 4.80x)
put_hevc_pel_bi_w_pixels24_8_c:                       2855.5 ( 1.00x)
put_hevc_pel_bi_w_pixels24_8_neon:                     723.8 ( 3.95x)
put_hevc_pel_bi_w_pixels32_8_c:                       5095.3 ( 1.00x)
put_hevc_pel_bi_w_pixels32_8_neon:                    1165.0 ( 4.37x)
put_hevc_pel_bi_w_pixels48_8_c:                      11521.5 ( 1.00x)
put_hevc_pel_bi_w_pixels48_8_neon:                    2856.0 ( 4.03x)
put_hevc_pel_bi_w_pixels64_8_c:                      21020.5 ( 1.00x)
put_hevc_pel_bi_w_pixels64_8_neon:                    4699.1 ( 4.47x)

Reviewed-by: Martin Storsjö <martin@martin.st>
Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
This commit is contained in:
Zhao Zhili
2025-04-23 00:31:11 +08:00
parent 39786f8cd5
commit 26752368f0
3 changed files with 391 additions and 0 deletions

View File

@@ -92,6 +92,11 @@ NEON8_FNPROTO(pel_bi_pixels, (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
int height, intptr_t mx, intptr_t my, int width),);
/* Prototypes for the weighted bi-prediction "pixels" NEON functions.
 * Argument order matters to the assembly: under AAPCS64 the first eight
 * integer/pointer arguments (_dst .. wx0) arrive in x0-x7, while wx1, ox0,
 * ox1, mx, my and width are passed on the stack, where the assembly reads
 * wx1, ox0, ox1 and width from.
 * NOTE(review): NEON8_FNPROTO is defined outside this excerpt; presumably it
 * emits one prototype per supported block width - confirm. */
NEON8_FNPROTO(pel_bi_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
int height, int denom, int wx0, int wx1,
int ox0, int ox1, intptr_t mx, intptr_t my, int width),);
/* Prototypes for the epel_bi_h NEON functions (same macro as above). */
NEON8_FNPROTO(epel_bi_h, (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride, const int16_t *src2,
int height, intptr_t mx, intptr_t my, int width),);

View File

@@ -473,6 +473,379 @@ function ff_hevc_put_hevc_pel_bi_pixels64_8_neon, export=1
ret
endfunc
// Load the stack-passed arguments of the pel_bi_w_pixels functions
// (the first eight arguments, dst .. wx0, arrive in x0-x7).
// Out: x8 = wx1, x9 = ox0, x10 = ox1, x11 = width, each sign-extended
//      from 32 to 64 bits.
// The mx and my arguments that sit between ox1 and width are not loaded.
// Two layouts are needed: Apple's arm64 ABI packs stack arguments at their
// natural size (the three ints are 4 bytes apart), whereas standard AAPCS64
// gives every stack argument its own 8-byte slot.
.macro load_bi_w_pixels_param
ldrsw x8, [sp] // wx1
#if defined(__APPLE__)
ldpsw x9, x10, [sp, #4] // ox0, ox1
ldrsw x11, [sp, #32] // width (after 8-byte aligned mx, my)
#else
ldrsw x9, [sp, #8] // ox0
ldrsw x10, [sp, #16] // ox1
ldrsw x11, [sp, #40] // width (after mx at #24, my at #32)
#endif
.endm
// Weighted bi-prediction of a 4-pixel-wide block from unfiltered 8-bit
// pixels (src) and 16-bit intermediates (src2).
//   x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, x4 = src2,
//   w5 = height, w6 = denom, w7 = wx0; wx1/ox0/ox1/width come from the stack.
// Per pixel, with log2Wd = denom + 6:
//   dst = sat_u8(((src << 6) * wx1 + src2 * wx0
//                 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1))
// The weights are used as 16-bit lanes. The row count is tested after the
// loop body, so height is expected to be non-zero.
function ff_hevc_put_hevc_pel_bi_w_pixels4_8_neon, export=1
load_bi_w_pixels_param
add w6, w6, #6 // log2Wd
dup v0.8h, w7 // wx0
dup v1.8h, w8 // wx1
add w9, w9, w10
add w9, w9, #1 // ox0 + ox1 + 1
lsl w9, w9, w6
add w7, w6, #1 // (log2Wd + 1)
mov x8, #(2 * HEVC_MAX_PB_SIZE) // byte step between src2 rows (wx1 already lives in v1)
neg w7, w7 // negative count: sshl below shifts right
dup v2.4s, w9 // (ox0 + ox1 + 1) << log2Wd
dup v6.4s, w7 // -(log2Wd + 1)
1:
ld1 {v4.8b}, [x2], x3 // load src (8 bytes read, only the low 4 pixels are used)
ld1 {v5.8b}, [x4], x8 // load src2 (4 x int16)
subs w5, w5, #1 // one row done; flags consumed by b.ne below
mov v3.16b, v2.16b // accumulator starts at the offset term
ushll v4.8h, v4.8b, #6 // widen src to 16 bit and scale by 64
smlal v3.4s, v4.4h, v1.4h // += (src << 6) * wx1
smlal v3.4s, v5.4h, v0.4h // += src2 * wx0
sshl v3.4s, v3.4s, v6.4s // >> (log2Wd + 1)
sqxtn v3.4h, v3.4s // saturate to int16
sqxtun v3.8b, v3.8h // saturate to uint8
st1 {v3.s}[0], [x0], x1 // store 4 pixels, step dst to the next row
b.ne 1b
ret
endfunc
// Weighted bi-prediction of a 6-pixel-wide block from unfiltered 8-bit
// pixels (src) and 16-bit intermediates (src2).
//   x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, x4 = src2,
//   w5 = height, w6 = denom, w7 = wx0; wx1/ox0/ox1/width come from the stack.
// Per pixel, with log2Wd = denom + 6:
//   dst = sat_u8(((src << 6) * wx1 + src2 * wx0
//                 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1))
// Eight pixels are computed per row but only six are stored. The row count
// is tested after the loop body, so height is expected to be non-zero.
function ff_hevc_put_hevc_pel_bi_w_pixels6_8_neon, export=1
load_bi_w_pixels_param
add w6, w6, #6 // log2Wd
dup v0.8h, w7 // wx0
dup v1.8h, w8 // wx1
add w9, w9, w10
add w9, w9, #1 // ox0 + ox1 + 1
lsl w9, w9, w6
add w7, w6, #1 // (log2Wd + 1)
mov x8, #(2 * HEVC_MAX_PB_SIZE) // byte step between src2 rows (wx1 already lives in v1)
neg w7, w7 // negative count: sshl below shifts right
dup v2.4s, w9 // (ox0 + ox1 + 1) << log2Wd
dup v6.4s, w7 // -(log2Wd + 1)
sub x1, x1, #4 // the first store of each row already advances dst by 4
1:
ld1 {v4.8b}, [x2], x3 // load src
ld1 {v5.8h}, [x4], x8 // load src2
subs w5, w5, #1 // one row done; flags consumed by b.ne below
mov v3.16b, v2.16b // accumulators start at the offset term
mov v7.16b, v2.16b
ushll v4.8h, v4.8b, #6 // widen src to 16 bit and scale by 64
smlal v3.4s, v4.4h, v1.4h // pixels 0-3: += (src << 6) * wx1
smlal v3.4s, v5.4h, v0.4h // pixels 0-3: += src2 * wx0
smlal2 v7.4s, v4.8h, v1.8h // pixels 4-7: += (src << 6) * wx1
smlal2 v7.4s, v5.8h, v0.8h // pixels 4-7: += src2 * wx0
sshl v3.4s, v3.4s, v6.4s // >> (log2Wd + 1)
sshl v7.4s, v7.4s, v6.4s
sqxtn v3.4h, v3.4s // saturate to int16
sqxtn2 v3.8h, v7.4s
sqxtun v3.8b, v3.8h // saturate to uint8
str s3, [x0], #4 // store pixels 0-3
st1 {v3.h}[2], [x0], x1 // store pixels 4-5, step dst to the next row
b.ne 1b
ret
endfunc
// Weighted bi-prediction of an 8-pixel-wide block from unfiltered 8-bit
// pixels (src) and 16-bit intermediates (src2).
//   x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, x4 = src2,
//   w5 = height, w6 = denom, w7 = wx0; wx1/ox0/ox1/width come from the stack.
// Per pixel, with log2Wd = denom + 6:
//   dst = sat_u8(((src << 6) * wx1 + src2 * wx0
//                 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1))
// The row count is tested after the loop body, so height is expected to be
// non-zero.
function ff_hevc_put_hevc_pel_bi_w_pixels8_8_neon, export=1
load_bi_w_pixels_param
add w6, w6, #6 // log2Wd
dup v0.8h, w7 // wx0
dup v1.8h, w8 // wx1
add w9, w9, w10
add w9, w9, #1 // ox0 + ox1 + 1
lsl w9, w9, w6
add w7, w6, #1 // (log2Wd + 1)
mov x8, #(2 * HEVC_MAX_PB_SIZE) // byte step between src2 rows (wx1 already lives in v1)
neg w7, w7 // negative count: sshl below shifts right
dup v2.4s, w9 // (ox0 + ox1 + 1) << log2Wd
dup v6.4s, w7 // -(log2Wd + 1)
1:
ld1 {v4.8b}, [x2], x3 // load src
ld1 {v5.8h}, [x4], x8 // load src2
subs w5, w5, #1 // one row done; flags consumed by b.ne below
mov v3.16b, v2.16b // accumulators start at the offset term
mov v7.16b, v2.16b
ushll v4.8h, v4.8b, #6 // widen src to 16 bit and scale by 64
smlal v3.4s, v4.4h, v1.4h // pixels 0-3: += (src << 6) * wx1
smlal v3.4s, v5.4h, v0.4h // pixels 0-3: += src2 * wx0
smlal2 v7.4s, v4.8h, v1.8h // pixels 4-7: += (src << 6) * wx1
smlal2 v7.4s, v5.8h, v0.8h // pixels 4-7: += src2 * wx0
sshl v3.4s, v3.4s, v6.4s // >> (log2Wd + 1)
sshl v7.4s, v7.4s, v6.4s
sqxtn v3.4h, v3.4s // saturate to int16
sqxtn2 v3.8h, v7.4s
sqxtun v3.8b, v3.8h // saturate to uint8
st1 {v3.8b}, [x0], x1 // store 8 pixels, step dst to the next row
b.ne 1b
ret
endfunc
// Weighted bi-prediction of a 12-pixel-wide block from unfiltered 8-bit
// pixels (src) and 16-bit intermediates (src2).
//   x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, x4 = src2,
//   w5 = height, w6 = denom, w7 = wx0; wx1/ox0/ox1/width come from the stack.
// Per pixel, with log2Wd = denom + 6:
//   dst = sat_u8(((src << 6) * wx1 + src2 * wx0
//                 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1))
// Each row reads 16 src bytes and 32 src2 bytes but only 12 pixels are
// computed and stored. The row count is tested after the loop body, so
// height is expected to be non-zero.
function ff_hevc_put_hevc_pel_bi_w_pixels12_8_neon, export=1
load_bi_w_pixels_param
add w6, w6, #6 // log2Wd
dup v0.8h, w7 // wx0
dup v1.8h, w8 // wx1
add w9, w9, w10
add w9, w9, #1 // ox0 + ox1 + 1
lsl w9, w9, w6
add w7, w6, #1 // (log2Wd + 1)
mov x8, #(2 * HEVC_MAX_PB_SIZE) // byte step between src2 rows (wx1 already lives in v1)
neg w7, w7 // negative count: sshl below shifts right
dup v2.4s, w9 // (ox0 + ox1 + 1) << log2Wd
dup v6.4s, w7 // -(log2Wd + 1)
sub x1, x1, #8 // the first store of each row already advances dst by 8
1:
ld1 {v24.16b}, [x2], x3 // load src
ld1 {v20.16b, v21.16b}, [x4], x8 // load src2
subs w5, w5, #1 // one row done; flags consumed by b.ne below
mov v16.16b, v2.16b // accumulators start at the offset term
mov v17.16b, v2.16b
mov v18.16b, v2.16b
ushll v4.8h, v24.8b, #6 // src pixels 0-7, widened and scaled by 64
ushll2 v24.8h, v24.16b, #6 // src pixels 8-15, widened and scaled by 64
smlal v16.4s, v4.4h, v1.4h // pixels 0-3
smlal v16.4s, v20.4h, v0.4h
smlal2 v17.4s, v4.8h, v1.8h // pixels 4-7
smlal2 v17.4s, v20.8h, v0.8h
smlal v18.4s, v24.4h, v1.4h // pixels 8-11
smlal v18.4s, v21.4h, v0.4h
sshl v16.4s, v16.4s, v6.4s // >> (log2Wd + 1)
sshl v17.4s, v17.4s, v6.4s
sshl v18.4s, v18.4s, v6.4s
sqxtn v16.4h, v16.4s // saturate to int16
sqxtn2 v16.8h, v17.4s
sqxtn v18.4h, v18.4s
sqxtun v3.8b, v16.8h // saturate to uint8
sqxtun2 v3.16b, v18.8h
str d3, [x0], #8 // store pixels 0-7
st1 {v3.s}[2], [x0], x1 // store pixels 8-11, step dst to the next row
b.ne 1b
ret
endfunc
// Weighted bi-prediction of a 16-pixel-wide block from unfiltered 8-bit
// pixels (src) and 16-bit intermediates (src2).
//   x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, x4 = src2,
//   w5 = height, w6 = denom, w7 = wx0; wx1/ox0/ox1/width come from the stack.
// Per pixel, with log2Wd = denom + 6:
//   dst = sat_u8(((src << 6) * wx1 + src2 * wx0
//                 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1))
// The row count is tested after the loop body, so height is expected to be
// non-zero.
function ff_hevc_put_hevc_pel_bi_w_pixels16_8_neon, export=1
load_bi_w_pixels_param
add w6, w6, #6 // log2Wd
dup v0.8h, w7 // wx0
dup v1.8h, w8 // wx1
add w9, w9, w10
add w9, w9, #1 // ox0 + ox1 + 1
lsl w9, w9, w6
add w7, w6, #1 // (log2Wd + 1)
mov x8, #(2 * HEVC_MAX_PB_SIZE) // byte step between src2 rows (wx1 already lives in v1)
neg w7, w7 // negative count: sshl below shifts right
dup v2.4s, w9 // (ox0 + ox1 + 1) << log2Wd
dup v6.4s, w7 // -(log2Wd + 1)
1:
ld1 {v24.16b}, [x2], x3 // load src
ld1 {v20.16b, v21.16b}, [x4], x8 // load src2
subs w5, w5, #1 // one row done; flags consumed by b.ne below
mov v16.16b, v2.16b // accumulators start at the offset term
mov v17.16b, v2.16b
mov v18.16b, v2.16b
mov v19.16b, v2.16b
ushll v4.8h, v24.8b, #6 // src pixels 0-7, widened and scaled by 64
ushll2 v24.8h, v24.16b, #6 // src pixels 8-15, widened and scaled by 64
smlal v16.4s, v4.4h, v1.4h // pixels 0-3
smlal v16.4s, v20.4h, v0.4h
smlal2 v17.4s, v4.8h, v1.8h // pixels 4-7
smlal2 v17.4s, v20.8h, v0.8h
smlal v18.4s, v24.4h, v1.4h // pixels 8-11
smlal v18.4s, v21.4h, v0.4h
smlal2 v19.4s, v24.8h, v1.8h // pixels 12-15
smlal2 v19.4s, v21.8h, v0.8h
sshl v16.4s, v16.4s, v6.4s // >> (log2Wd + 1)
sshl v17.4s, v17.4s, v6.4s
sshl v18.4s, v18.4s, v6.4s
sshl v19.4s, v19.4s, v6.4s
sqxtn v16.4h, v16.4s // saturate to int16
sqxtn2 v16.8h, v17.4s
sqxtn v18.4h, v18.4s
sqxtn2 v18.8h, v19.4s
sqxtun v3.8b, v16.8h // saturate to uint8
sqxtun2 v3.16b, v18.8h
st1 {v3.16b}, [x0], x1 // store 16 pixels, step dst to the next row
b.ne 1b
ret
endfunc
// Weighted bi-prediction from unfiltered 8-bit pixels (src) and 16-bit
// intermediates (src2), processed in chunks of 24 pixels.
//   x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, x4 = src2,
//   w5 = height, w6 = denom, w7 = wx0; wx1/ox0/ox1/width come from the stack.
// Per pixel, with log2Wd = denom + 6:
//   dst = sat_u8(((src << 6) * wx1 + src2 * wx0
//                 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1))
// The inner loop consumes the width argument 24 pixels at a time and exits
// only when the remainder is exactly zero, so width is expected to be a
// non-zero multiple of 24. height is expected to be non-zero as well.
function ff_hevc_put_hevc_pel_bi_w_pixels24_8_neon, export=1
load_bi_w_pixels_param
add w6, w6, #6 // log2Wd
dup v0.8h, w7 // wx0
dup v1.8h, w8 // wx1
add w9, w9, w10
add w9, w9, #1 // ox0 + ox1 + 1
lsl w9, w9, w6
add w7, w6, #1 // (log2Wd + 1)
mov x8, #(2 * HEVC_MAX_PB_SIZE) // byte step between src2 rows (wx1 already lives in v1)
neg w7, w7 // negative count: sshl below shifts right
dup v2.4s, w9 // (ox0 + ox1 + 1) << log2Wd
dup v6.4s, w7 // -(log2Wd + 1)
mov x7, #24 // src advance per chunk (the load itself reads 32 bytes)
sub x3, x3, x11 // srcstride - width: step left over after a full row
sub x8, x8, x11, lsl #1 // src2 row step - 2 * width bytes
sub x1, x1, x11 // dststride - width
1:
mov w6, w11 // w6 = pixels left in this row (log2Wd is no longer needed)
2:
ld1 {v24.16b, v25.16b}, [x2], x7 // load src
ld1 {v20.8h, v21.8h, v22.8h}, [x4], #48 // load src2 (24 x int16)
subs w6, w6, #24 // flags consumed by "b.ne 2b"; nothing in between alters them
mov v16.16b, v2.16b // accumulators start at the offset term
mov v17.16b, v2.16b
mov v18.16b, v2.16b
mov v19.16b, v2.16b
mov v26.16b, v2.16b
mov v27.16b, v2.16b
ushll v4.8h, v24.8b, #6 // src pixels 0-7, widened and scaled by 64
ushll2 v24.8h, v24.16b, #6 // src pixels 8-15
ushll v5.8h, v25.8b, #6 // src pixels 16-23
smlal v16.4s, v4.4h, v1.4h // pixels 0-3
smlal v16.4s, v20.4h, v0.4h
smlal2 v17.4s, v4.8h, v1.8h // pixels 4-7
smlal2 v17.4s, v20.8h, v0.8h
smlal v18.4s, v24.4h, v1.4h // pixels 8-11
smlal v18.4s, v21.4h, v0.4h
smlal2 v19.4s, v24.8h, v1.8h // pixels 12-15
smlal2 v19.4s, v21.8h, v0.8h
smlal v26.4s, v5.4h, v1.4h // pixels 16-19
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v5.8h, v1.8h // pixels 20-23
smlal2 v27.4s, v22.8h, v0.8h
sshl v16.4s, v16.4s, v6.4s // >> (log2Wd + 1)
sshl v17.4s, v17.4s, v6.4s
sshl v18.4s, v18.4s, v6.4s
sshl v19.4s, v19.4s, v6.4s
sshl v26.4s, v26.4s, v6.4s
sshl v27.4s, v27.4s, v6.4s
sqxtn v16.4h, v16.4s // saturate to int16
sqxtn2 v16.8h, v17.4s
sqxtn v18.4h, v18.4s
sqxtn2 v18.8h, v19.4s
sqxtn v26.4h, v26.4s
sqxtn2 v26.8h, v27.4s
sqxtun v3.8b, v16.8h // saturate to uint8
sqxtun2 v3.16b, v18.8h
sqxtun v4.8b, v26.8h // v4 is free again: the widened src was consumed above
str q3, [x0], #16 // store pixels 0-15
str d4, [x0], #8 // store pixels 16-23
b.ne 2b
subs w5, w5, #1 // one row done
add x0, x0, x1 // step dst, src and src2 to the start of the next row
add x2, x2, x3
add x4, x4, x8
b.ne 1b
ret
endfunc
// Weighted bi-prediction from unfiltered 8-bit pixels (src) and 16-bit
// intermediates (src2), processed in chunks of 32 pixels.
//   x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, x4 = src2,
//   w5 = height, w6 = denom, w7 = wx0; wx1/ox0/ox1/width come from the stack.
// Per pixel, with log2Wd = denom + 6:
//   dst = sat_u8(((src << 6) * wx1 + src2 * wx0
//                 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1))
// The inner loop consumes the width argument 32 pixels at a time and exits
// only when the remainder is exactly zero, so width is expected to be a
// non-zero multiple of 32. height is expected to be non-zero as well.
function ff_hevc_put_hevc_pel_bi_w_pixels32_8_neon, export=1
load_bi_w_pixels_param
add w6, w6, #6 // log2Wd
dup v0.8h, w7 // wx0
dup v1.8h, w8 // wx1
add w9, w9, w10
add w9, w9, #1 // ox0 + ox1 + 1
lsl w9, w9, w6
add w7, w6, #1 // (log2Wd + 1)
mov x8, #(2 * HEVC_MAX_PB_SIZE) // byte step between src2 rows (wx1 already lives in v1)
neg w7, w7 // negative count: sshl below shifts right
dup v2.4s, w9 // (ox0 + ox1 + 1) << log2Wd
dup v6.4s, w7 // -(log2Wd + 1)
sub x3, x3, x11 // srcstride - width: step left over after a full row
sub x8, x8, x11, lsl #1 // src2 row step - 2 * width bytes
sub x1, x1, x11 // dststride - width
1:
mov w6, w11 // w6 = pixels left in this row (log2Wd is no longer needed)
2:
ld1 {v24.16b, v25.16b}, [x2], #32 // load src
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], #64 // load src2
subs w6, w6, #32 // flags consumed by "b.ne 2b"; nothing in between alters them
mov v16.16b, v2.16b // accumulators start at the offset term
mov v17.16b, v2.16b
mov v18.16b, v2.16b
mov v19.16b, v2.16b
mov v26.16b, v2.16b
mov v27.16b, v2.16b
mov v28.16b, v2.16b
mov v29.16b, v2.16b
ushll v4.8h, v24.8b, #6 // src pixels 0-7, widened and scaled by 64
ushll2 v24.8h, v24.16b, #6 // src pixels 8-15
ushll v5.8h, v25.8b, #6 // src pixels 16-23
ushll2 v25.8h, v25.16b, #6 // src pixels 24-31
smlal v16.4s, v4.4h, v1.4h // pixels 0-3
smlal v16.4s, v20.4h, v0.4h
smlal2 v17.4s, v4.8h, v1.8h // pixels 4-7
smlal2 v17.4s, v20.8h, v0.8h
smlal v18.4s, v24.4h, v1.4h // pixels 8-11
smlal v18.4s, v21.4h, v0.4h
smlal2 v19.4s, v24.8h, v1.8h // pixels 12-15
smlal2 v19.4s, v21.8h, v0.8h
smlal v26.4s, v5.4h, v1.4h // pixels 16-19
smlal v26.4s, v22.4h, v0.4h
smlal2 v27.4s, v5.8h, v1.8h // pixels 20-23
smlal2 v27.4s, v22.8h, v0.8h
smlal v28.4s, v25.4h, v1.4h // pixels 24-27
smlal v28.4s, v23.4h, v0.4h
smlal2 v29.4s, v25.8h, v1.8h // pixels 28-31
smlal2 v29.4s, v23.8h, v0.8h
sshl v16.4s, v16.4s, v6.4s // >> (log2Wd + 1)
sshl v17.4s, v17.4s, v6.4s
sshl v18.4s, v18.4s, v6.4s
sshl v19.4s, v19.4s, v6.4s
sshl v26.4s, v26.4s, v6.4s
sshl v27.4s, v27.4s, v6.4s
sshl v28.4s, v28.4s, v6.4s
sshl v29.4s, v29.4s, v6.4s
sqxtn v16.4h, v16.4s // saturate to int16
sqxtn2 v16.8h, v17.4s
sqxtn v18.4h, v18.4s
sqxtn2 v18.8h, v19.4s
sqxtn v26.4h, v26.4s
sqxtn2 v26.8h, v27.4s
sqxtn v28.4h, v28.4s
sqxtn2 v28.8h, v29.4s
sqxtun v3.8b, v16.8h // saturate to uint8
sqxtun2 v3.16b, v18.8h
sqxtun v4.8b, v26.8h // v4 is free again: the widened src was consumed above
sqxtun2 v4.16b, v28.8h
st1 {v3.16b, v4.16b}, [x0], #32 // store 32 pixels
b.ne 2b
subs w5, w5, #1 // one row done
add x0, x0, x1 // step dst, src and src2 to the start of the next row
add x2, x2, x3
add x4, x4, x8
b.ne 1b
ret
endfunc
function ff_hevc_put_hevc_epel_bi_h4_8_neon, export=1
load_epel_filterb x6, x7

View File

@@ -134,6 +134,17 @@ void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
/*
 * Fill entries 1..9 of a per-block-size function table from only seven
 * distinct NEON functions (4, 6, 8, 12, 16, 24 and 32 pixels wide).
 * Entries 8 and 9 deliberately reuse the 24- and 32-pixel functions: those
 * implementations must take the real width as an argument and loop over it
 * in 24- resp. 32-pixel chunks, as the pel_bi_w_pixels assembly does.
 * NOTE(review): entry 9 is the 64-wide slot in the macro preceding this one;
 * entry 8 is presumably the 48-wide slot - confirm against the table layout.
 */
#define NEON8_FNASSIGN_PARTIAL_6(member, v, h, fn, ext) \
member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \
member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext; \
member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext; \
member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon##ext; \
member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
member[8][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
member[9][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext;
av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
{
int cpu_flags = av_get_cpu_flags();
@@ -204,6 +215,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 0, epel_bi_v,);
NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 0, pel_bi_pixels,);
NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 0, qpel_bi_v,);
NEON8_FNASSIGN_PARTIAL_6(c->put_hevc_qpel_bi_w, 0, 0, pel_bi_w_pixels,);
NEON8_FNASSIGN_PARTIAL_6(c->put_hevc_epel_bi_w, 0, 0, pel_bi_w_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);