1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-28 20:53:54 +02:00

lavc/aarch64: new optimization for 8-bit hevc_qpel_uni_w_h

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Logan Lyu 2023-05-05 22:06:22 +08:00 committed by Martin Storsjö
parent 0b7356c1b4
commit 15972cce8c
2 changed files with 448 additions and 1 deletions

View File

@ -145,6 +145,7 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, int denom, int wx, int ox,
@ -155,6 +156,12 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
int height, int denom, int wx, int ox,
intptr_t mx, intptr_t my, int width),);
NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, int denom, int wx, int ox,
intptr_t mx, intptr_t my, int width), _i8mm);
#define NEON8_FNASSIGN(member, v, h, fn, ext) \
member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \
member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext; \
@ -174,9 +181,11 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
{
if (!have_neon(av_get_cpu_flags())) return;
int cpu_flags = av_get_cpu_flags();
if (!have_neon(cpu_flags)) return;
if (bit_depth == 8) {
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_neon;
@ -236,6 +245,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
if (have_i8mm(cpu_flags)) {
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
}
}
if (bit_depth == 10) {
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_neon;

View File

@ -1192,3 +1192,437 @@ function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
b.hi 3b
ret
endfunc
#if HAVE_I8MM
// Shared prologue of the ff_hevc_put_hevc_qpel_uni_w_h*_8_neon_i8mm kernels.
//
// Reads:    x2 (src), w5 (denom), w6 (wx), w7 (ox) and the 64-bit value at
//           [sp], which is the first stack-passed argument (mx in the C
//           prototype).
// Produces: x2  = src - 3, so a row load starts 3 bytes left of pixel 0
//           v28 = the 8 bytes at qpel_filters + mx * 8, in both 64-bit halves
//           v29 = ox, v30 = wx, one copy per 32-bit lane
//           v31 = -(denom + 6) per 32-bit lane; fed to sqrshl, where a
//                 negative count means a rounding right shift
// Clobbers: x9, x11, x12, w10
.macro QPEL_UNI_W_H_HEADER
        mov             w10, #-6
        dup             v30.4s, w6              // wx
        dup             v29.4s, w7              // ox
        sub             w10, w10, w5            // -6 - denom
        dup             v31.4s, w10             // shift
        ldr             x12, [sp]               // first stack argument
        movrel          x9, qpel_filters
        sub             x2, x2, #3
        add             x9, x9, x12, lsl #3     // table entries are indexed in 8-byte steps
        ldr             x11, [x9]
        dup             v28.2d, x11
.endm
// Horizontal 8-tap qpel filter with uni-directional weighting, 4 pixels per row.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height; the
// remaining arguments are consumed by QPEL_UNI_W_H_HEADER.
function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:
        ld1             {v0.16b}, [x2], x3      // one source row, then step to the next
        subs            w4, w4, #1              // row counter; nothing below writes NZCV
        movi            v16.2d, #0
        movi            v17.2d, #0
        ext             v1.16b, v0.16b, v0.16b, #1
        ext             v2.16b, v0.16b, v0.16b, #2
        ext             v3.16b, v0.16b, v0.16b, #3
        zip1            v0.2d, v0.2d, v1.2d     // 8-byte windows of pixels 0 and 1
        zip1            v2.2d, v2.2d, v3.2d     // 8-byte windows of pixels 2 and 3
        usdot           v16.4s, v0.16b, v28.16b
        usdot           v17.4s, v2.16b, v28.16b
        addp            v16.4s, v16.4s, v17.4s  // add the two 4-tap halves of each pixel
        mul             v16.4s, v16.4s, v30.4s  // * wx
        sqrshl          v16.4s, v16.4s, v31.4s  // rounding shift right by denom + 6
        sqadd           v16.4s, v16.4s, v29.4s  // + ox
        sqxtn           v16.4h, v16.4s
        sqxtun          v16.8b, v16.8h          // saturate to unsigned 8 bit
        st1             {v16.s}[0], [x0], x1    // write 4 pixels, dst += dststride
        b.hi            1b
        ret
endfunc
// Horizontal 8-tap qpel filter with uni-directional weighting, 6 pixels per row.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height; the
// remaining arguments are consumed by QPEL_UNI_W_H_HEADER.
function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
        sub             x1, x1, #4              // the 4-byte store already moves dst by 4
1:
        ld1             {v0.16b}, [x2], x3
        subs            w4, w4, #1              // row counter; nothing below writes NZCV
        movi            v16.2d, #0
        movi            v17.2d, #0
        movi            v18.2d, #0
        ext             v1.16b, v0.16b, v0.16b, #1
        ext             v2.16b, v0.16b, v0.16b, #2
        ext             v3.16b, v0.16b, v0.16b, #3
        ext             v4.16b, v0.16b, v0.16b, #4
        ext             v5.16b, v0.16b, v0.16b, #5
        zip1            v0.2d, v0.2d, v1.2d     // windows of pixels 0, 1
        zip1            v2.2d, v2.2d, v3.2d     // windows of pixels 2, 3
        zip1            v4.2d, v4.2d, v5.2d     // windows of pixels 4, 5
        usdot           v16.4s, v0.16b, v28.16b
        usdot           v17.4s, v2.16b, v28.16b
        usdot           v18.4s, v4.16b, v28.16b
        addp            v16.4s, v16.4s, v17.4s  // pixels 0-3
        addp            v18.4s, v18.4s, v18.4s  // pixels 4, 5 in the low two lanes
        mul             v16.4s, v16.4s, v30.4s  // * wx
        mul             v18.2s, v18.2s, v30.2s
        sqrshl          v16.4s, v16.4s, v31.4s  // rounding shift right by denom + 6
        sqrshl          v18.2s, v18.2s, v31.2s
        sqadd           v16.4s, v16.4s, v29.4s  // + ox
        sqadd           v18.2s, v18.2s, v29.2s
        sqxtn           v16.4h, v16.4s
        sqxtn2          v16.8h, v18.4s
        sqxtun          v16.8b, v16.8h          // saturate to unsigned 8 bit
        str             s16, [x0], #4           // pixels 0-3
        st1             {v16.h}[2], [x0], x1    // pixels 4-5, then on to the next row
        b.hi            1b
        ret
endfunc
// Filter and weight four source registers at once.
//
// s0..s3: source byte vectors, read only.
// d0..d3: destination vectors. All four are zeroed before any source is
//         read, so no destination may name the same register as a source.
//
// Each usdot accumulates, per 32-bit lane, the dot product of 4 source bytes
// with the matching 4 bytes of v28. addp then adds adjacent lanes, so
//   d0 = pairwise sums of d0 followed by pairwise sums of d1
//   d2 = pairwise sums of d2 followed by pairwise sums of d3
// The sums are multiplied by v30, shifted by v31 with rounding and saturation
// (sqrshl), and v29 is added with saturation.
// Results are left in d0 and d2; d1 and d3 only hold intermediate sums.
.macro QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
movi \d0\().2d, #0
movi \d1\().2d, #0
movi \d2\().2d, #0
movi \d3\().2d, #0
usdot \d0\().4s, \s0\().16b, v28.16b
usdot \d1\().4s, \s1\().16b, v28.16b
usdot \d2\().4s, \s2\().16b, v28.16b
usdot \d3\().4s, \s3\().16b, v28.16b
// fold the partial sums: results now live in d0 and d2 only
addp \d0\().4s, \d0\().4s, \d1\().4s
addp \d2\().4s, \d2\().4s, \d3\().4s
// multiply by v30, rounding shift by v31, saturating add of v29
mul \d0\().4s, \d0\().4s, v30.4s
mul \d2\().4s, \d2\().4s, v30.4s
sqrshl \d0\().4s, \d0\().4s, v31.4s
sqrshl \d2\().4s, \d2\().4s, v31.4s
sqadd \d0\().4s, \d0\().4s, v29.4s
sqadd \d2\().4s, \d2\().4s, v29.4s
.endm
// Two-source variant of QPEL_UNI_W_H_CALC.
//
// s0, s1: source byte vectors, read only.
// d0, d1: destination vectors, zeroed before the sources are read, so they
//         must not name the same registers as s0 or s1.
//
// The usdot partial sums of s0 and s1 against v28 are folded by addp into d0,
// which is then multiplied by v30, shifted by v31 (sqrshl) and offset by v29
// (sqadd). The result is left in d0; d1 only holds intermediate sums.
.macro QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
movi \d0\().2d, #0
movi \d1\().2d, #0
usdot \d0\().4s, \s0\().16b, v28.16b
usdot \d1\().4s, \s1\().16b, v28.16b
// fold the partial sums into d0
addp \d0\().4s, \d0\().4s, \d1\().4s
mul \d0\().4s, \d0\().4s, v30.4s
sqrshl \d0\().4s, \d0\().4s, v31.4s
sqadd \d0\().4s, \d0\().4s, v29.4s
.endm
// Horizontal 8-tap qpel filter with uni-directional weighting, 8 pixels per row.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height; the
// remaining arguments are consumed by QPEL_UNI_W_H_HEADER.
//
// Only the low 8 bytes of every shifted copy are used (zip1 on .2d lanes), and
// with shifts of at most 7 those come from bytes 0..14 of the row. A single
// 16-byte load is therefore sufficient; the second ext operand only fills the
// unused upper half.
function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:
        ld1             {v16.16b}, [x2], x3     // one source row, then step to the next
        ext             v1.16b, v16.16b, v16.16b, #1
        ext             v2.16b, v16.16b, v16.16b, #2
        ext             v3.16b, v16.16b, v16.16b, #3
        ext             v4.16b, v16.16b, v16.16b, #4
        ext             v5.16b, v16.16b, v16.16b, #5
        ext             v6.16b, v16.16b, v16.16b, #6
        ext             v7.16b, v16.16b, v16.16b, #7
        zip1            v0.2d, v16.2d, v1.2d    // 8-byte windows of pixels 0, 1
        zip1            v2.2d, v2.2d, v3.2d     // pixels 2, 3
        zip1            v4.2d, v4.2d, v5.2d     // pixels 4, 5
        zip1            v6.2d, v6.2d, v7.2d     // pixels 6, 7
        QPEL_UNI_W_H_CALC v0, v2, v4, v6, v18, v19, v20, v21 // v18: 0-3  v20: 4-7
        sqxtn           v18.4h, v18.4s
        sqxtn2          v18.8h, v20.4s
        sqxtun          v18.8b, v18.8h          // saturate to unsigned 8 bit
        str             d18, [x0]
        add             x0, x0, x1
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc
// Horizontal 8-tap qpel filter with uni-directional weighting, 12 pixels per row.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height; the
// remaining arguments are consumed by QPEL_UNI_W_H_HEADER.
//
// Pixels 0-7 are built from the low halves of the shifted rows (zip1) and
// pixels 8-11 from the high halves of the first four (zip2). Both parts of a
// row are stored relative to x0, so no second destination pointer is kept.
function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:
        ld1             {v16.16b, v17.16b}, [x2], x3
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        zip1            v18.2d, v16.2d, v1.2d   // windows of pixels 0, 1
        zip1            v19.2d, v2.2d, v3.2d    // pixels 2, 3
        zip1            v20.2d, v4.2d, v5.2d    // pixels 4, 5
        zip1            v21.2d, v6.2d, v7.2d    // pixels 6, 7
        zip2            v22.2d, v16.2d, v1.2d   // pixels 8, 9
        zip2            v23.2d, v2.2d, v3.2d    // pixels 10, 11
        QPEL_UNI_W_H_CALC v18, v19, v20, v21, v0, v2, v4, v6 // v0: 0-3  v4: 4-7
        QPEL_UNI_W_H_CALC_HALF v22, v23, v24, v25            // v24: 8-11
        sqxtn           v0.4h, v0.4s
        sqxtn2          v0.8h, v4.4s
        sqxtn           v1.4h, v24.4s
        sqxtun          v0.8b, v0.8h            // saturate to unsigned 8 bit
        sqxtun          v1.8b, v1.8h
        str             d0, [x0]                // pixels 0-7
        str             s1, [x0, #8]            // pixels 8-11
        add             x0, x0, x1
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc
// Horizontal 8-tap qpel filter with uni-directional weighting, 16 pixels per row.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height; the
// remaining arguments are consumed by QPEL_UNI_W_H_HEADER.
//
// The shifted rows are fed to usdot unzipped, so every 32-bit result pairs
// pixel n with pixel n + 8; the trn1/trn2 step puts them back in order.
function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:
        ld1             {v16.16b, v17.16b}, [x2], x3
        subs            w4, w4, #1              // row counter; nothing below writes NZCV
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v18, v19, v20, v21 // v18: 0, 8, 2, 10  v20: 1, 9, 3, 11
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25  // v22: 4, 12, 6, 14  v24: 5, 13, 7, 15
        sqxtn           v0.4h, v18.4s
        sqxtn           v1.4h, v20.4s
        sqxtn2          v0.8h, v22.4s           // v0: 0, 8, 2, 10, 4, 12, 6, 14
        sqxtn2          v1.8h, v24.4s           // v1: 1, 9, 3, 11, 5, 13, 7, 15
        trn1            v2.8h, v0.8h, v1.8h     // 0-7
        trn2            v3.8h, v0.8h, v1.8h     // 8-15
        sqxtun          v0.8b, v2.8h
        sqxtun2         v0.16b, v3.8h           // saturate to unsigned 8 bit
        st1             {v0.16b}, [x0], x1
        b.hi            1b
        ret
endfunc
// Horizontal 8-tap qpel filter with uni-directional weighting, 24 pixels per row.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height; the
// remaining arguments are consumed by QPEL_UNI_W_H_HEADER.
// Pixels 0-15 are computed from v16/v17, pixels 16-23 from v17 alone.
function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon_i8mm, export=1
QPEL_UNI_W_H_HEADER
// the 16-byte store of each row already advances dst by 16
sub x1, x1, #16
1:
ld1 {v16.16b, v17.16b}, [x2], x3
ext v1.16b, v16.16b, v17.16b, #1
ext v2.16b, v16.16b, v17.16b, #2
ext v3.16b, v16.16b, v17.16b, #3
ext v4.16b, v16.16b, v17.16b, #4
ext v5.16b, v16.16b, v17.16b, #5
ext v6.16b, v16.16b, v17.16b, #6
ext v7.16b, v16.16b, v17.16b, #7
// results come out with pixel n paired with pixel n + 8 (even shifts in the
// first destination, odd shifts in the third)
QPEL_UNI_W_H_CALC v16, v2, v1, v3, v18, v19, v20, v21
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
sqxtn v18.4h, v18.4s
sqxtn2 v18.8h, v22.4s
sqxtn v19.4h, v20.4s
sqxtn2 v19.8h, v24.4s
// interleave the even and odd streams back into pixel order
trn1 v20.8h, v18.8h, v19.8h
trn2 v21.8h, v18.8h, v19.8h
sqxtun v26.8b, v20.8h
sqxtun2 v26.16b, v21.8h // 0-15
// remaining 8 pixels: only the low 8 bytes of each shifted copy of v17 are
// kept by the zip1 below
ext v1.16b, v17.16b, v17.16b, #1
ext v2.16b, v17.16b, v17.16b, #2
ext v3.16b, v17.16b, v17.16b, #3
ext v4.16b, v17.16b, v17.16b, #4
ext v5.16b, v17.16b, v17.16b, #5
ext v6.16b, v17.16b, v17.16b, #6
ext v7.16b, v17.16b, v17.16b, #7
zip1 v0.2d, v17.2d, v1.2d
zip1 v2.2d, v2.2d, v3.2d
zip1 v4.2d, v4.2d, v5.2d
zip1 v6.2d, v6.2d, v7.2d
QPEL_UNI_W_H_CALC v0, v2, v4, v6, v18, v19, v20, v21
sqxtn v18.4h, v18.4s
sqxtn2 v18.8h, v20.4s
sqxtun v27.8b, v18.8h
// 16 + 8 bytes per row; x1 holds dststride - 16
st1 {v26.16b}, [x0], #16
st1 {v27.8b}, [x0], x1
subs w4, w4, #1
b.hi 1b
ret
endfunc
// Horizontal 8-tap qpel filter with uni-directional weighting, 32 pixels per row.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height; the
// remaining arguments are consumed by QPEL_UNI_W_H_HEADER.
// Each row is processed as two 16-pixel halves, collected in v26 and v27.
function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon_i8mm, export=1
QPEL_UNI_W_H_HEADER
1:
// 48 source bytes per row
ld1 {v16.16b, v17.16b, v18.16b}, [x2], x3
ext v1.16b, v16.16b, v17.16b, #1
ext v2.16b, v16.16b, v17.16b, #2
ext v3.16b, v16.16b, v17.16b, #3
ext v4.16b, v16.16b, v17.16b, #4
ext v5.16b, v16.16b, v17.16b, #5
ext v6.16b, v16.16b, v17.16b, #6
ext v7.16b, v16.16b, v17.16b, #7
// first half; v17 and v18 stay untouched for the second half
QPEL_UNI_W_H_CALC v16, v2, v1, v3, v0, v19, v20, v21
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
sqxtn v0.4h, v0.4s
sqxtn2 v0.8h, v22.4s
sqxtn v19.4h, v20.4s
sqxtn2 v19.8h, v24.4s
// interleave the even and odd streams back into pixel order
trn1 v20.8h, v0.8h, v19.8h
trn2 v21.8h, v0.8h, v19.8h
sqxtun v26.8b, v20.8h
sqxtun2 v26.16b, v21.8h // 0-15
// second half, same sequence shifted by one source register
ext v1.16b, v17.16b, v18.16b, #1
ext v2.16b, v17.16b, v18.16b, #2
ext v3.16b, v17.16b, v18.16b, #3
ext v4.16b, v17.16b, v18.16b, #4
ext v5.16b, v17.16b, v18.16b, #5
ext v6.16b, v17.16b, v18.16b, #6
ext v7.16b, v17.16b, v18.16b, #7
QPEL_UNI_W_H_CALC v17, v2, v1, v3, v0, v19, v20, v21
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
sqxtn v0.4h, v0.4s
sqxtn2 v0.8h, v22.4s
sqxtn v19.4h, v20.4s
sqxtn2 v19.8h, v24.4s
trn1 v20.8h, v0.8h, v19.8h
trn2 v21.8h, v0.8h, v19.8h
sqxtun v27.8b, v20.8h
sqxtun2 v27.16b, v21.8h // 16-31
st1 {v26.16b, v27.16b}, [x0], x1
subs w4, w4, #1
b.hi 1b
ret
endfunc
// Horizontal 8-tap qpel filter with uni-directional weighting, 48 pixels per row.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height; the
// remaining arguments are consumed by QPEL_UNI_W_H_HEADER.
// Each row is processed as three 16-pixel groups, collected in v25-v27.
// v24 and v0 are passed to the CALC macro as scratch destinations only.
function ff_hevc_put_hevc_qpel_uni_w_h48_8_neon_i8mm, export=1
QPEL_UNI_W_H_HEADER
1:
// 64 source bytes per row
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
ext v1.16b, v16.16b, v17.16b, #1
ext v2.16b, v16.16b, v17.16b, #2
ext v3.16b, v16.16b, v17.16b, #3
ext v4.16b, v16.16b, v17.16b, #4
ext v5.16b, v16.16b, v17.16b, #5
ext v6.16b, v16.16b, v17.16b, #6
ext v7.16b, v16.16b, v17.16b, #7
QPEL_UNI_W_H_CALC v16, v2, v1, v3, v20, v24, v21, v0
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
sqxtn v20.4h, v20.4s
sqxtn2 v20.8h, v22.4s
sqxtn v21.4h, v21.4s
sqxtn2 v21.8h, v23.4s
// interleave the even and odd streams back into pixel order
trn1 v22.8h, v20.8h, v21.8h
trn2 v23.8h, v20.8h, v21.8h
sqxtun v25.8b, v22.8h
sqxtun2 v25.16b, v23.8h // 0-15
// second group, windows taken from v17/v18
ext v1.16b, v17.16b, v18.16b, #1
ext v2.16b, v17.16b, v18.16b, #2
ext v3.16b, v17.16b, v18.16b, #3
ext v4.16b, v17.16b, v18.16b, #4
ext v5.16b, v17.16b, v18.16b, #5
ext v6.16b, v17.16b, v18.16b, #6
ext v7.16b, v17.16b, v18.16b, #7
QPEL_UNI_W_H_CALC v17, v2, v1, v3, v20, v24, v21, v0
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
sqxtn v20.4h, v20.4s
sqxtn2 v20.8h, v22.4s
sqxtn v21.4h, v21.4s
sqxtn2 v21.8h, v23.4s
trn1 v22.8h, v20.8h, v21.8h
trn2 v23.8h, v20.8h, v21.8h
sqxtun v26.8b, v22.8h
sqxtun2 v26.16b, v23.8h // 16-31
// third group, windows taken from v18/v19
ext v1.16b, v18.16b, v19.16b, #1
ext v2.16b, v18.16b, v19.16b, #2
ext v3.16b, v18.16b, v19.16b, #3
ext v4.16b, v18.16b, v19.16b, #4
ext v5.16b, v18.16b, v19.16b, #5
ext v6.16b, v18.16b, v19.16b, #6
ext v7.16b, v18.16b, v19.16b, #7
QPEL_UNI_W_H_CALC v18, v2, v1, v3, v20, v24, v21, v0
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
sqxtn v20.4h, v20.4s
sqxtn2 v20.8h, v22.4s
sqxtn v21.4h, v21.4s
sqxtn2 v21.8h, v23.4s
trn1 v22.8h, v20.8h, v21.8h
trn2 v23.8h, v20.8h, v21.8h
sqxtun v27.8b, v22.8h
sqxtun2 v27.16b, v23.8h // 32-47
st1 {v25.16b, v26.16b, v27.16b}, [x0], x1
subs w4, w4, #1
b.hi 1b
ret
endfunc
// Horizontal 8-tap qpel filter with uni-directional weighting, 64 pixels per row.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height; the
// remaining arguments are consumed by QPEL_UNI_W_H_HEADER.
// Each row is processed as four 16-pixel groups. The finished groups are
// written back into v16-v19 once the corresponding source register is no
// longer needed. v24 and v0 are passed to the CALC macro as scratch
// destinations; v0 additionally carries the fifth 16 source bytes of the row
// between its reload and the last group of ext instructions.
function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_i8mm, export=1
QPEL_UNI_W_H_HEADER
// the first load of each row post-increments src by 64
sub x3, x3, #64
1:
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
ext v1.16b, v16.16b, v17.16b, #1
ext v2.16b, v16.16b, v17.16b, #2
ext v3.16b, v16.16b, v17.16b, #3
ext v4.16b, v16.16b, v17.16b, #4
ext v5.16b, v16.16b, v17.16b, #5
ext v6.16b, v16.16b, v17.16b, #6
ext v7.16b, v16.16b, v17.16b, #7
QPEL_UNI_W_H_CALC v16, v2, v1, v3, v20, v24, v21, v0
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
sqxtn v20.4h, v20.4s
sqxtn2 v20.8h, v22.4s
sqxtn v21.4h, v21.4s
sqxtn2 v21.8h, v23.4s
// interleave the even and odd streams back into pixel order
trn1 v22.8h, v20.8h, v21.8h
trn2 v23.8h, v20.8h, v21.8h
// v16 is free now and receives the first output group
sqxtun v16.8b, v22.8h
sqxtun2 v16.16b, v23.8h // 0-15
// second group, windows taken from v17/v18
ext v1.16b, v17.16b, v18.16b, #1
ext v2.16b, v17.16b, v18.16b, #2
ext v3.16b, v17.16b, v18.16b, #3
ext v4.16b, v17.16b, v18.16b, #4
ext v5.16b, v17.16b, v18.16b, #5
ext v6.16b, v17.16b, v18.16b, #6
ext v7.16b, v17.16b, v18.16b, #7
QPEL_UNI_W_H_CALC v17, v2, v1, v3, v20, v24, v21, v0
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
sqxtn v20.4h, v20.4s
sqxtn2 v20.8h, v22.4s
sqxtn v21.4h, v21.4s
sqxtn2 v21.8h, v23.4s
trn1 v22.8h, v20.8h, v21.8h
trn2 v23.8h, v20.8h, v21.8h
sqxtun v17.8b, v22.8h
sqxtun2 v17.16b, v23.8h // 16-31
// third group, windows taken from v18/v19
ext v1.16b, v18.16b, v19.16b, #1
ext v2.16b, v18.16b, v19.16b, #2
ext v3.16b, v18.16b, v19.16b, #3
ext v4.16b, v18.16b, v19.16b, #4
ext v5.16b, v18.16b, v19.16b, #5
ext v6.16b, v18.16b, v19.16b, #6
ext v7.16b, v18.16b, v19.16b, #7
QPEL_UNI_W_H_CALC v18, v2, v1, v3, v20, v24, v21, v0
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
// fetch 16 more source bytes for the last group; x3 holds srcstride - 64, so
// src ends up exactly one row further. This must stay after the CALC pair
// above (which overwrites v0) and before the ext group below (which reads it).
ld1 {v0.16b}, [x2], x3
sqxtn v20.4h, v20.4s
sqxtn2 v20.8h, v22.4s
sqxtn v21.4h, v21.4s
sqxtn2 v21.8h, v23.4s
trn1 v22.8h, v20.8h, v21.8h
trn2 v23.8h, v20.8h, v21.8h
sqxtun v18.8b, v22.8h
sqxtun2 v18.16b, v23.8h // 32-47
// fourth group, windows taken from v19 and the extra bytes in v0
ext v1.16b, v19.16b, v0.16b, #1
ext v2.16b, v19.16b, v0.16b, #2
ext v3.16b, v19.16b, v0.16b, #3
ext v4.16b, v19.16b, v0.16b, #4
ext v5.16b, v19.16b, v0.16b, #5
ext v6.16b, v19.16b, v0.16b, #6
ext v7.16b, v19.16b, v0.16b, #7
QPEL_UNI_W_H_CALC v19, v2, v1, v3, v20, v24, v21, v0
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
sqxtn v20.4h, v20.4s
sqxtn2 v20.8h, v22.4s
sqxtn v21.4h, v21.4s
sqxtn2 v21.8h, v23.4s
trn1 v22.8h, v20.8h, v21.8h
trn2 v23.8h, v20.8h, v21.8h
sqxtun v19.8b, v22.8h
sqxtun2 v19.16b, v23.8h // 48-63
st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
subs w4, w4, #1
b.hi 1b
ret
endfunc
#endif // HAVE_I8MM