1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-28 20:53:54 +02:00

aarch64: hevc: Implement a neon version of hevc_epel_uni_w_h*_8

Benchmark results on AWS Graviton 3 (lower is better):
put_hevc_epel_uni_w_h4_8_c: 97.2
put_hevc_epel_uni_w_h4_8_neon: 41.2
put_hevc_epel_uni_w_h4_8_i8mm: 35.2
put_hevc_epel_uni_w_h6_8_c: 203.7
put_hevc_epel_uni_w_h6_8_neon: 84.7
put_hevc_epel_uni_w_h6_8_i8mm: 74.7
put_hevc_epel_uni_w_h8_8_c: 345.7
put_hevc_epel_uni_w_h8_8_neon: 94.0
put_hevc_epel_uni_w_h8_8_i8mm: 80.7
put_hevc_epel_uni_w_h12_8_c: 768.7
put_hevc_epel_uni_w_h12_8_neon: 196.7
put_hevc_epel_uni_w_h12_8_i8mm: 169.7
put_hevc_epel_uni_w_h16_8_c: 1313.0
put_hevc_epel_uni_w_h16_8_neon: 290.7
put_hevc_epel_uni_w_h16_8_i8mm: 238.0
put_hevc_epel_uni_w_h24_8_c: 2877.5
put_hevc_epel_uni_w_h24_8_neon: 650.0
put_hevc_epel_uni_w_h24_8_i8mm: 512.0
put_hevc_epel_uni_w_h32_8_c: 5113.5
put_hevc_epel_uni_w_h32_8_neon: 1129.5
put_hevc_epel_uni_w_h32_8_i8mm: 739.2
put_hevc_epel_uni_w_h48_8_c: 11757.0
put_hevc_epel_uni_w_h48_8_neon: 2518.7
put_hevc_epel_uni_w_h48_8_i8mm: 1688.5
put_hevc_epel_uni_w_h64_8_c: 20478.0
put_hevc_epel_uni_w_h64_8_neon: 4411.7
put_hevc_epel_uni_w_h64_8_i8mm: 2884.0

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Martin Storsjö 2024-03-13 15:41:11 +02:00
parent 6d384298ec
commit 54af555bfa
2 changed files with 319 additions and 13 deletions

View File

@ -1520,6 +1520,319 @@ function ff_hevc_put_hevc_epel_h32_8_neon, export=1
ret
endfunc
// Common setup for the epel_uni_w_h functions.
// Register arguments, following the C prototype (dst, dststride, src, srcstride,
// height, denom, wx, ox, mx, my, width) under AAPCS64:
//   x0 = dst, x1 = dststride, x2 = src, x3 = srcstride,
//   w4 = height, w5 = denom, w6 = wx, w7 = ox
// Stack arguments: [sp] = mx, with my and width after it.
// The elems parameter is the lane arrangement used to broadcast wx into v30
// (default 4s; callers that multiply with smull/smull2 pass 4h or 8h).
// On exit:
//   x2  = src - 1
//   v28 = the 32-bit word at epel_filters + 4 * mx, replicated into all four 32-bit lanes
//   v29 = ox in every 32-bit lane
//   v30 = wx in every lane of the chosen arrangement
//   v31 = -(denom + 6) in every 32-bit lane
// Clobbers x9, x10 and x12.
.macro EPEL_UNI_W_H_HEADER elems=4s
// x12 = mx, the first stack-passed argument
ldr x12, [sp]
// step src back one pixel so the 4-tap window starts one sample to the left
sub x2, x2, #1
movrel x9, epel_filters
// x9 = epel_filters + mx * 4 (each table entry is read as 4 bytes)
add x9, x9, x12, lsl #2
// load that 4-byte entry and replicate it across v28
ld1r {v28.4s}, [x9]
// w10 = -6 - denom; used as a negative (i.e. right) shift count by sqrshl
mov w10, #-6
sub w10, w10, w5
// broadcast wx, the shift count and ox
dup v30.\elems, w6
dup v31.4s, w10
dup v29.4s, w7
.endm
// 4-pixel-wide rows: 4-tap horizontal filter followed by weighting, 8-bit samples.
// Per output pixel:
//   f   = sum(src[x - 1 + k] * tap[k]), k = 0..3     (16-bit arithmetic)
//   out = clip_u8(sat32(rounding_shift_right(f * wx, denom + 6) + ox))
// Register arguments are those consumed by EPEL_UNI_W_H_HEADER
// (x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height).
function ff_hevc_put_hevc_epel_uni_w_h4_8_neon, export=1
        EPEL_UNI_W_H_HEADER 4h
        sxtl            v0.8h, v28.8b                   // filter taps as signed 16-bit lanes
1:
        ld1             {v1.8b}, [x2], x3               // 8 source bytes from src - 1; src += srcstride
        uxtl            v1.8h, v1.8b                    // widen pixels to 16 bits
        ext             v2.16b, v1.16b, v1.16b, #2      // window shifted by 1 pixel
        ext             v3.16b, v1.16b, v1.16b, #4      // window shifted by 2 pixels
        ext             v4.16b, v1.16b, v1.16b, #6      // window shifted by 3 pixels
        mul             v17.4h, v1.4h, v0.h[0]          // 4-tap filter, low four lanes only
        mla             v17.4h, v2.4h, v0.h[1]
        mla             v17.4h, v3.4h, v0.h[2]
        mla             v17.4h, v4.4h, v0.h[3]
        smull           v18.4s, v17.4h, v30.4h          // * wx, widened to 32 bits
        sqrshl          v18.4s, v18.4s, v31.4s          // rounding shift right by denom + 6
        sqadd           v18.4s, v18.4s, v29.4s          // + ox, saturating
        sqxtn           v19.4h, v18.4s                  // narrow to 16 bits (upper half of v19 is zeroed)
        sqxtun          v19.8b, v19.8h                  // clip to unsigned 8 bits
        st1             {v19.s}[0], [x0], x1            // store 4 pixels; dst += dststride
        subs            w4, w4, #1                      // one row done
        b.hi            1b                              // loop while rows remain
        ret
endfunc
// 6-pixel-wide rows: 4-tap horizontal filter followed by weighting, 8-bit samples.
// Eight results are computed per row, but only six are stored (4 bytes + 2 bytes).
// Per output pixel:
//   f   = sum(src[x - 1 + k] * tap[k]), k = 0..3     (16-bit arithmetic)
//   out = clip_u8(sat32(rounding_shift_right(f * wx, denom + 6) + ox))
// Register arguments are those consumed by EPEL_UNI_W_H_HEADER
// (x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height).
function ff_hevc_put_hevc_epel_uni_w_h6_8_neon, export=1
        EPEL_UNI_W_H_HEADER 8h
        sxtl            v0.8h, v28.8b                   // filter taps as signed 16-bit lanes
        sub             x1, x1, #4                      // the first store below already advances dst by 4
1:
        ld1             {v1.8b, v2.8b}, [x2], x3        // 16 source bytes from src - 1; src += srcstride
        uxtl            v1.8h, v1.8b                    // pixels 0..7 as 16-bit
        uxtl            v2.8h, v2.8b                    // pixels 8..15 as 16-bit
        ext             v5.16b, v1.16b, v2.16b, #2      // window shifted by 1 pixel
        ext             v6.16b, v1.16b, v2.16b, #4      // window shifted by 2 pixels
        ext             v7.16b, v1.16b, v2.16b, #6      // window shifted by 3 pixels
        mul             v20.8h, v1.8h, v0.h[0]          // 4-tap filter on eight lanes
        mla             v20.8h, v5.8h, v0.h[1]
        mla             v20.8h, v6.8h, v0.h[2]
        mla             v20.8h, v7.8h, v0.h[3]
        smull           v21.4s, v20.4h, v30.4h          // * wx, low four lanes
        smull2          v22.4s, v20.8h, v30.8h          // * wx, high four lanes
        sqrshl          v21.4s, v21.4s, v31.4s          // rounding shift right by denom + 6
        sqrshl          v22.4s, v22.4s, v31.4s
        sqadd           v21.4s, v21.4s, v29.4s          // + ox, saturating
        sqadd           v22.4s, v22.4s, v29.4s
        sqxtn           v23.4h, v21.4s                  // narrow both halves back to 16 bits
        sqxtn2          v23.8h, v22.4s
        sqxtun          v23.8b, v23.8h                  // clip to unsigned 8 bits
        st1             {v23.s}[0], [x0], #4            // pixels 0..3; dst += 4
        st1             {v23.h}[2], [x0], x1            // pixels 4..5; dst += dststride - 4
        subs            w4, w4, #1                      // one row done
        b.hi            1b                              // loop while rows remain
        ret
endfunc
// 8-pixel-wide rows: 4-tap horizontal filter followed by weighting, 8-bit samples.
// Per output pixel:
//   f   = sum(src[x - 1 + k] * tap[k]), k = 0..3     (16-bit arithmetic)
//   out = clip_u8(sat32(rounding_shift_right(f * wx, denom + 6) + ox))
// Register arguments are those consumed by EPEL_UNI_W_H_HEADER
// (x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height).
function ff_hevc_put_hevc_epel_uni_w_h8_8_neon, export=1
        EPEL_UNI_W_H_HEADER 8h
        sxtl            v0.8h, v28.8b                   // filter taps as signed 16-bit lanes
1:
        ld1             {v1.8b, v2.8b}, [x2], x3        // 16 source bytes from src - 1; src += srcstride
        uxtl            v1.8h, v1.8b                    // pixels 0..7 as 16-bit
        uxtl            v2.8h, v2.8b                    // pixels 8..15 as 16-bit
        ext             v5.16b, v1.16b, v2.16b, #2      // window shifted by 1 pixel
        ext             v6.16b, v1.16b, v2.16b, #4      // window shifted by 2 pixels
        ext             v7.16b, v1.16b, v2.16b, #6      // window shifted by 3 pixels
        mul             v20.8h, v1.8h, v0.h[0]          // 4-tap filter on eight lanes
        mla             v20.8h, v5.8h, v0.h[1]
        mla             v20.8h, v6.8h, v0.h[2]
        mla             v20.8h, v7.8h, v0.h[3]
        smull           v21.4s, v20.4h, v30.4h          // * wx, low four lanes
        smull2          v22.4s, v20.8h, v30.8h          // * wx, high four lanes
        sqrshl          v21.4s, v21.4s, v31.4s          // rounding shift right by denom + 6
        sqrshl          v22.4s, v22.4s, v31.4s
        sqadd           v21.4s, v21.4s, v29.4s          // + ox, saturating
        sqadd           v22.4s, v22.4s, v29.4s
        sqxtn           v23.4h, v21.4s                  // narrow both halves back to 16 bits
        sqxtn2          v23.8h, v22.4s
        sqxtun          v23.8b, v23.8h                  // clip to unsigned 8 bits
        st1             {v23.8b}, [x0], x1              // store 8 pixels; dst += dststride
        subs            w4, w4, #1                      // one row done
        b.hi            1b                              // loop while rows remain
        ret
endfunc
// 12-pixel-wide rows: 4-tap horizontal filter followed by weighting, 8-bit samples.
// Each loop iteration produces one row: pixels 0..7 through v16 and pixels 8..11 through v17.
// Per output pixel:
//   f   = sum(src[x - 1 + k] * tap[k]), k = 0..3     (16-bit arithmetic)
//   out = clip_u8(sat32(rounding_shift_right(f * wx, denom + 6) + ox))
// Register arguments are those consumed by EPEL_UNI_W_H_HEADER
// (x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height).
function ff_hevc_put_hevc_epel_uni_w_h12_8_neon, export=1
EPEL_UNI_W_H_HEADER 8h
// v0 = filter taps sign-extended from bytes to 16-bit lanes
sxtl v0.8h, v28.8b
1:
// load 16 source bytes starting at src - 1, then src += srcstride
ld1 {v3.8b, v4.8b}, [x2], x3
// row counter; the flags set here are consumed by b.hi at the bottom of the loop
subs w4, w4, #1
// widen pixels to 16 bits: v3 = pixels 0..7, v4 = pixels 8..15
uxtl v3.8h, v3.8b
uxtl v4.8h, v4.8b
// windows shifted by 1, 2 and 3 pixels for outputs 0..7
ext v5.16b, v3.16b, v4.16b, #2
ext v6.16b, v3.16b, v4.16b, #4
ext v7.16b, v3.16b, v4.16b, #6
// same shifts for outputs 8..11; v4 is rotated onto itself, which is fine
// because only the low four lanes of v20-v22 are used below
ext v20.16b, v4.16b, v4.16b, #2
ext v21.16b, v4.16b, v4.16b, #4
ext v22.16b, v4.16b, v4.16b, #6
// 4-tap filter, outputs 0..7
mul v16.8h, v3.8h, v0.h[0]
mla v16.8h, v5.8h, v0.h[1]
mla v16.8h, v6.8h, v0.h[2]
mla v16.8h, v7.8h, v0.h[3]
// 4-tap filter, outputs 8..11
mul v17.4h, v4.4h, v0.h[0]
mla v17.4h, v20.4h, v0.h[1]
mla v17.4h, v21.4h, v0.h[2]
mla v17.4h, v22.4h, v0.h[3]
// multiply by wx, widening to 32 bits (v20 is reused as a product register from here on)
smull v18.4s, v16.4h, v30.4h
smull2 v19.4s, v16.8h, v30.8h
smull v20.4s, v17.4h, v30.4h
// rounding shift right by denom + 6 (v31 holds the negative shift count)
sqrshl v18.4s, v18.4s, v31.4s
sqrshl v19.4s, v19.4s, v31.4s
sqrshl v20.4s, v20.4s, v31.4s
// add ox with saturation
sqadd v18.4s, v18.4s, v29.4s
sqadd v19.4s, v19.4s, v29.4s
sqadd v20.4s, v20.4s, v29.4s
// saturating narrow to 16 bits, then to unsigned 8 bits
sqxtn v16.4h, v18.4s
sqxtn2 v16.8h, v19.4s
sqxtn v17.4h, v20.4s
sqxtun v16.8b, v16.8h
sqxtun v17.8b, v17.8h
// store 8 + 4 pixels, then dst += dststride
str d16, [x0]
str s17, [x0, #8]
add x0, x0, x1
// loop while rows remain (flags from the subs above)
b.hi 1b
ret
endfunc
// 16-pixel-wide rows: 4-tap horizontal filter followed by weighting, 8-bit samples.
// Each loop iteration produces one row: pixels 0..7 through v16 and pixels 8..15 through v17.
// Per output pixel:
//   f   = sum(src[x - 1 + k] * tap[k]), k = 0..3     (16-bit arithmetic)
//   out = clip_u8(sat32(rounding_shift_right(f * wx, denom + 6) + ox))
// Register arguments are those consumed by EPEL_UNI_W_H_HEADER
// (x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height).
function ff_hevc_put_hevc_epel_uni_w_h16_8_neon, export=1
EPEL_UNI_W_H_HEADER 8h
// v0 = filter taps sign-extended from bytes to 16-bit lanes
sxtl v0.8h, v28.8b
1:
// load 24 source bytes starting at src - 1, then src += srcstride
ld1 {v1.8b, v2.8b, v3.8b}, [x2], x3
// row counter; the flags set here are consumed by b.hi at the bottom of the loop
subs w4, w4, #1
// widen pixels to 16 bits: v1 = pixels 0..7, v2 = 8..15, v3 = 16..23
uxtl v1.8h, v1.8b
uxtl v2.8h, v2.8b
uxtl v3.8h, v3.8b
// windows shifted by 1, 2 and 3 pixels for outputs 0..7
ext v5.16b, v1.16b, v2.16b, #2
ext v6.16b, v1.16b, v2.16b, #4
ext v7.16b, v1.16b, v2.16b, #6
// windows shifted by 1, 2 and 3 pixels for outputs 8..15
ext v20.16b, v2.16b, v3.16b, #2
ext v21.16b, v2.16b, v3.16b, #4
ext v22.16b, v2.16b, v3.16b, #6
// 4-tap filter, outputs 0..7
mul v16.8h, v1.8h, v0.h[0]
mla v16.8h, v5.8h, v0.h[1]
mla v16.8h, v6.8h, v0.h[2]
mla v16.8h, v7.8h, v0.h[3]
// 4-tap filter, outputs 8..15
mul v17.8h, v2.8h, v0.h[0]
mla v17.8h, v20.8h, v0.h[1]
mla v17.8h, v21.8h, v0.h[2]
mla v17.8h, v22.8h, v0.h[3]
// multiply by wx, widening to 32 bits (v20/v21 are reused as product registers from here on)
smull v18.4s, v16.4h, v30.4h
smull2 v19.4s, v16.8h, v30.8h
smull v20.4s, v17.4h, v30.4h
smull2 v21.4s, v17.8h, v30.8h
// rounding shift right by denom + 6 (v31 holds the negative shift count)
sqrshl v18.4s, v18.4s, v31.4s
sqrshl v19.4s, v19.4s, v31.4s
sqrshl v20.4s, v20.4s, v31.4s
sqrshl v21.4s, v21.4s, v31.4s
// add ox with saturation
sqadd v18.4s, v18.4s, v29.4s
sqadd v19.4s, v19.4s, v29.4s
sqadd v20.4s, v20.4s, v29.4s
sqadd v21.4s, v21.4s, v29.4s
// saturating narrow to 16 bits, then to unsigned 8 bits
sqxtn v16.4h, v18.4s
sqxtn2 v16.8h, v19.4s
sqxtn v17.4h, v20.4s
sqxtn2 v17.8h, v21.4s
sqxtun v16.8b, v16.8h
sqxtun v17.8b, v17.8h
// store 16 pixels, then dst += dststride
st1 {v16.8b, v17.8b}, [x0], x1
// loop while rows remain (flags from the subs above)
b.hi 1b
ret
endfunc
// 24-pixel-wide rows: 4-tap horizontal filter followed by weighting, 8-bit samples.
// Each loop iteration produces one row in three groups of eight pixels (v16, v17, v18).
// Per output pixel:
//   f   = sum(src[x - 1 + k] * tap[k]), k = 0..3     (16-bit arithmetic)
//   out = clip_u8(sat32(rounding_shift_right(f * wx, denom + 6) + ox))
// Register arguments are those consumed by EPEL_UNI_W_H_HEADER
// (x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height).
function ff_hevc_put_hevc_epel_uni_w_h24_8_neon, export=1
EPEL_UNI_W_H_HEADER 8h
// v0 = filter taps sign-extended from bytes to 16-bit lanes
sxtl v0.8h, v28.8b
1:
// load 32 source bytes starting at src - 1, then src += srcstride
ld1 {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], x3
// row counter; the flags set here are consumed by b.hi at the bottom of the loop
subs w4, w4, #1
// widen pixels to 16 bits: v1 = pixels 0..7, v2 = 8..15, v3 = 16..23, v4 = 24..31
uxtl v1.8h, v1.8b
uxtl v2.8h, v2.8b
uxtl v3.8h, v3.8b
uxtl v4.8h, v4.8b
// windows shifted by 1, 2 and 3 pixels for outputs 0..7
ext v5.16b, v1.16b, v2.16b, #2
ext v6.16b, v1.16b, v2.16b, #4
ext v7.16b, v1.16b, v2.16b, #6
// windows shifted by 1, 2 and 3 pixels for outputs 8..15
ext v20.16b, v2.16b, v3.16b, #2
ext v21.16b, v2.16b, v3.16b, #4
ext v22.16b, v2.16b, v3.16b, #6
// windows shifted by 1, 2 and 3 pixels for outputs 16..23
ext v23.16b, v3.16b, v4.16b, #2
ext v24.16b, v3.16b, v4.16b, #4
ext v25.16b, v3.16b, v4.16b, #6
// 4-tap filter, outputs 0..7
mul v16.8h, v1.8h, v0.h[0]
mla v16.8h, v5.8h, v0.h[1]
mla v16.8h, v6.8h, v0.h[2]
mla v16.8h, v7.8h, v0.h[3]
// 4-tap filter, outputs 8..15
mul v17.8h, v2.8h, v0.h[0]
mla v17.8h, v20.8h, v0.h[1]
mla v17.8h, v21.8h, v0.h[2]
mla v17.8h, v22.8h, v0.h[3]
// 4-tap filter, outputs 16..23
mul v18.8h, v3.8h, v0.h[0]
mla v18.8h, v23.8h, v0.h[1]
mla v18.8h, v24.8h, v0.h[2]
mla v18.8h, v25.8h, v0.h[3]
// multiply by wx, widening to 32 bits (v20-v25 are reused as product registers from here on)
smull v20.4s, v16.4h, v30.4h
smull2 v21.4s, v16.8h, v30.8h
smull v22.4s, v17.4h, v30.4h
smull2 v23.4s, v17.8h, v30.8h
smull v24.4s, v18.4h, v30.4h
smull2 v25.4s, v18.8h, v30.8h
// rounding shift right by denom + 6 (v31 holds the negative shift count)
sqrshl v20.4s, v20.4s, v31.4s
sqrshl v21.4s, v21.4s, v31.4s
sqrshl v22.4s, v22.4s, v31.4s
sqrshl v23.4s, v23.4s, v31.4s
sqrshl v24.4s, v24.4s, v31.4s
sqrshl v25.4s, v25.4s, v31.4s
// add ox with saturation
sqadd v20.4s, v20.4s, v29.4s
sqadd v21.4s, v21.4s, v29.4s
sqadd v22.4s, v22.4s, v29.4s
sqadd v23.4s, v23.4s, v29.4s
sqadd v24.4s, v24.4s, v29.4s
sqadd v25.4s, v25.4s, v29.4s
// saturating narrow to 16 bits, then to unsigned 8 bits
sqxtn v16.4h, v20.4s
sqxtn2 v16.8h, v21.4s
sqxtn v17.4h, v22.4s
sqxtn2 v17.8h, v23.4s
sqxtn v18.4h, v24.4s
sqxtn2 v18.8h, v25.4s
sqxtun v16.8b, v16.8h
sqxtun v17.8b, v17.8h
sqxtun v18.8b, v18.8h
// store 24 pixels, then dst += dststride
st1 {v16.8b, v17.8b, v18.8b}, [x0], x1
// loop while rows remain (flags from the subs above)
b.hi 1b
ret
endfunc
// Variable-width rows: 4-tap horizontal filter followed by weighting, 8-bit samples.
// The width is read from the stack and each row is processed in chunks of 16 pixels,
// so the width is expected to be a multiple of 16.
// NOTE(review): presumably this one function serves the 32, 48 and 64 wide cases - confirm
// against the function-pointer assignment in the C init code.
// Loop state:
//   w10 = pixels left in the current row, w11 = full row width, w4 = rows left
//   v1  = the 8 widened pixels just before the next 16-pixel chunk (carried between chunks)
// Per output pixel:
//   f   = sum(src[x - 1 + k] * tap[k]), k = 0..3     (16-bit arithmetic)
//   out = clip_u8(sat32(rounding_shift_right(f * wx, denom + 6) + ox))
function ff_hevc_put_hevc_epel_uni_w_h32_8_neon, export=1
EPEL_UNI_W_H_HEADER 8h
ldr w10, [sp, #16] // width
// prime v1 with the first 8 source bytes (starting at src - 1); src += 8
ld1 {v1.8b}, [x2], #8
sub x3, x3, w10, uxtw // decrement src stride
mov w11, w10 // original width
// each row reads 8 + width bytes in total, so srcstride is reduced by both amounts
sub x3, x3, #8 // decrement src stride
sub x1, x1, w10, uxtw // decrement dst stride
// v0 = filter taps sign-extended from bytes to 16-bit lanes
sxtl v0.8h, v28.8b
uxtl v1.8h, v1.8b
1:
// load the next 16 source bytes; src += 16
ld1 {v2.8b, v3.8b}, [x2], #16
// the flags set here are consumed by b.gt after the store below
subs w10, w10, #16 // width
uxtl v2.8h, v2.8b
uxtl v3.8h, v3.8b
// windows shifted by 1, 2 and 3 pixels for the first 8 outputs of this chunk
ext v5.16b, v1.16b, v2.16b, #2
ext v6.16b, v1.16b, v2.16b, #4
ext v7.16b, v1.16b, v2.16b, #6
// windows shifted by 1, 2 and 3 pixels for the second 8 outputs of this chunk
ext v20.16b, v2.16b, v3.16b, #2
ext v21.16b, v2.16b, v3.16b, #4
ext v22.16b, v2.16b, v3.16b, #6
// 4-tap filter, first 8 outputs
mul v16.8h, v1.8h, v0.h[0]
mla v16.8h, v5.8h, v0.h[1]
mla v16.8h, v6.8h, v0.h[2]
mla v16.8h, v7.8h, v0.h[3]
// 4-tap filter, second 8 outputs
mul v17.8h, v2.8h, v0.h[0]
mla v17.8h, v20.8h, v0.h[1]
mla v17.8h, v21.8h, v0.h[2]
mla v17.8h, v22.8h, v0.h[3]
// multiply by wx, widening to 32 bits (v20/v21 are reused as product registers from here on)
smull v18.4s, v16.4h, v30.4h
smull2 v19.4s, v16.8h, v30.8h
smull v20.4s, v17.4h, v30.4h
smull2 v21.4s, v17.8h, v30.8h
// rounding shift right by denom + 6 (v31 holds the negative shift count)
sqrshl v18.4s, v18.4s, v31.4s
sqrshl v19.4s, v19.4s, v31.4s
sqrshl v20.4s, v20.4s, v31.4s
sqrshl v21.4s, v21.4s, v31.4s
// add ox with saturation
sqadd v18.4s, v18.4s, v29.4s
sqadd v19.4s, v19.4s, v29.4s
sqadd v20.4s, v20.4s, v29.4s
sqadd v21.4s, v21.4s, v29.4s
// saturating narrow to 16 bits, then to unsigned 8 bits
sqxtn v16.4h, v18.4s
sqxtn2 v16.8h, v19.4s
sqxtn v17.4h, v20.4s
sqxtn2 v17.8h, v21.4s
sqxtun v16.8b, v16.8h
sqxtun v17.8b, v17.8h
// store 16 pixels; dst += 16
st1 {v16.8b, v17.8b}, [x0], #16
// carry the last 8 widened pixels over as the start of the next chunk
mov v1.16b, v3.16b
// more pixels left in this row? (flags from the subs on w10 above)
b.gt 1b
subs w4, w4, #1 // height
// move src to the start of the next row (x3 was reduced by width + 8 in the prologue)
add x2, x2, x3
// no rows left: return
b.le 9f
// prime v1 for the next row and restore the per-row pixel counter
ld1 {v1.8b}, [x2], #8
mov w10, w11
// move dst to the start of the next row (x1 was reduced by width in the prologue)
add x0, x0, x1
uxtl v1.8h, v1.8b
b 1b
9:
ret
endfunc
#if HAVE_I8MM
ENABLE_I8MM
function ff_hevc_put_hevc_epel_h4_8_neon_i8mm, export=1
@ -2410,19 +2723,6 @@ function ff_hevc_put_hevc_epel_uni_hv64_8_neon_i8mm, export=1
ret
endfunc
// Common setup for the epel_uni_w_h functions.
// Register arguments, following the C prototype (dst, dststride, src, srcstride,
// height, denom, wx, ox, mx, my, width) under AAPCS64:
//   x0 = dst, x1 = dststride, x2 = src, x3 = srcstride,
//   w4 = height, w5 = denom, w6 = wx, w7 = ox
// Stack arguments: [sp] = mx, with my and width after it.
// On exit:
//   x2  = src - 1
//   v28 = the 32-bit word at epel_filters + 4 * mx, replicated into all four 32-bit lanes
//   v29 = ox in every 32-bit lane
//   v30 = wx in every 32-bit lane
//   v31 = -(denom + 6) in every 32-bit lane
// Clobbers x9, x10 and x12.
.macro EPEL_UNI_W_H_HEADER
// x12 = mx, the first stack-passed argument
ldr x12, [sp]
// step src back one pixel so the 4-tap window starts one sample to the left
sub x2, x2, #1
movrel x9, epel_filters
// x9 = epel_filters + mx * 4 (each table entry is read as 4 bytes)
add x9, x9, x12, lsl #2
// load that 4-byte entry and replicate it across v28
ld1r {v28.4s}, [x9]
// w10 = -6 - denom; NOTE(review): presumably used as a negative (right) shift count
// by the functions that expand this macro - their bodies are outside this view
mov w10, #-6
sub w10, w10, w5
// broadcast wx, the shift count and ox
dup v30.4s, w6
dup v31.4s, w10
dup v29.4s, w7
.endm
function ff_hevc_put_hevc_epel_uni_w_h4_8_neon_i8mm, export=1
EPEL_UNI_W_H_HEADER

View File

@ -235,6 +235,11 @@ NEON8_FNPROTO(epel_hv, (int16_t *dst,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width), _i8mm);
/* Prototypes for the epel_uni_w_h functions with an empty suffix argument
 * (the neighbouring invocations pass _i8mm there instead).
 * NOTE(review): NEON8_FNPROTO is defined outside this view; presumably it
 * expands to one declaration per block width - confirm. */
NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, int denom, int wx, int ox,
intptr_t mx, intptr_t my, int width),);
NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, int denom, int wx, int ox,
@ -400,6 +405,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
NEON8_FNASSIGN_SHARED_32(c->put_hevc_epel, 0, 1, epel_h,);
NEON8_FNASSIGN_SHARED_32(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h,);
if (have_i8mm(cpu_flags)) {
NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);