You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-08-15 14:13:16 +02:00
lavc/aarch64: new optimization for 8-bit hevc_epel_h
put_hevc_epel_h4_8_c: 67.1
put_hevc_epel_h4_8_i8mm: 21.1
put_hevc_epel_h6_8_c: 147.1
put_hevc_epel_h6_8_i8mm: 45.1
put_hevc_epel_h8_8_c: 237.4
put_hevc_epel_h8_8_i8mm: 72.1
put_hevc_epel_h12_8_c: 527.4
put_hevc_epel_h12_8_i8mm: 115.4
put_hevc_epel_h16_8_c: 943.6
put_hevc_epel_h16_8_i8mm: 153.9
put_hevc_epel_h24_8_c: 2105.4
put_hevc_epel_h24_8_i8mm: 384.4
put_hevc_epel_h32_8_c: 3631.4
put_hevc_epel_h32_8_i8mm: 519.9
put_hevc_epel_h48_8_c: 8082.1
put_hevc_epel_h48_8_i8mm: 1110.4
put_hevc_epel_h64_8_c: 14400.6
put_hevc_epel_h64_8_i8mm: 2057.1

put_hevc_qpel_h4_8_c: 124.9
put_hevc_qpel_h4_8_neon: 43.1
put_hevc_qpel_h4_8_i8mm: 33.1
put_hevc_qpel_h6_8_c: 269.4
put_hevc_qpel_h6_8_neon: 90.6
put_hevc_qpel_h6_8_i8mm: 61.4
put_hevc_qpel_h8_8_c: 477.6
put_hevc_qpel_h8_8_neon: 82.1
put_hevc_qpel_h8_8_i8mm: 99.9
put_hevc_qpel_h12_8_c: 1062.4
put_hevc_qpel_h12_8_neon: 226.9
put_hevc_qpel_h12_8_i8mm: 170.9
put_hevc_qpel_h16_8_c: 1880.6
put_hevc_qpel_h16_8_neon: 302.9
put_hevc_qpel_h16_8_i8mm: 251.4
put_hevc_qpel_h24_8_c: 4221.9
put_hevc_qpel_h24_8_neon: 893.9
put_hevc_qpel_h24_8_i8mm: 626.1
put_hevc_qpel_h32_8_c: 7437.6
put_hevc_qpel_h32_8_neon: 1189.9
put_hevc_qpel_h32_8_i8mm: 959.1
put_hevc_qpel_h48_8_c: 16838.4
put_hevc_qpel_h48_8_neon: 2727.9
put_hevc_qpel_h48_8_i8mm: 2163.9
put_hevc_qpel_h64_8_c: 29982.1
put_hevc_qpel_h64_8_neon: 4777.6

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
committed by
Martin Storsjö
parent
668eb4c00e
commit
d48c89701c
@@ -33,6 +33,349 @@ const epel_filters, align=4
|
||||
endconst
|
||||
|
||||
#if HAVE_I8MM
|
||||
|
||||
// Shared prologue of the 8-bit horizontal EPEL kernels.
//   x1  : source pointer, moved back by one byte so each 4-byte window
//         starts one sample to the left of the output position
//   x4  : index into epel_filters; entries are addressed 4 bytes apart
//   v30 : the 4 bytes at epel_filters + 4 * x4, replicated into every
//         32-bit lane (second operand of the USDOT instructions)
//   x10 : MAX_PB_SIZE * 2, the per-row step the kernels apply to x0
// Clobbers x5.
.macro EPEL_H_HEADER
        sub             x1, x1, #1
        mov             x10, #(MAX_PB_SIZE * 2)
        movrel          x5, epel_filters
        add             x5, x5, x4, lsl #2
        ld1r            {v30.4s}, [x5]
.endm
|
||||
|
||||
// Horizontal 4-tap EPEL filter, 4 outputs per row, 8-bit in / 16-bit out.
//   x0 = dst, x1 = src, x2 = source stride, w3 = row count, x4 = filter index
// Per row: load 8 bytes from src - 1, gather the four overlapping 4-byte
// windows into one vector and reduce each window with USDOT against v30.
// The loop body runs before the row counter is tested.
function ff_hevc_put_hevc_epel_h4_8_neon_i8mm, export=1
        EPEL_H_HEADER
1:      ld1             {v0.8b}, [x1], x2
        movi            v20.16b, #0                     // accumulator
        ext             v1.8b, v0.8b, v0.8b, #1
        ext             v2.8b, v0.8b, v0.8b, #2
        ext             v3.8b, v0.8b, v0.8b, #3
        trn1            v0.2s, v0.2s, v1.2s             // windows 0, 1
        trn1            v2.2s, v2.2s, v3.2s             // windows 2, 3
        trn1            v0.2d, v0.2d, v2.2d             // windows 0..3, one per 32-bit lane
        usdot           v20.4s, v0.16b, v30.16b
        xtn             v20.4h, v20.4s
        subs            w3, w3, #1                      // rows left
        st1             {v20.4h}, [x0], x10
        b.ne            1b
        ret
endfunc
|
||||
|
||||
|
||||
// Horizontal 4-tap EPEL filter, 6 outputs per row, 8-bit in / 16-bit out.
//   x0 = dst, x1 = src, x2 = source stride, w3 = row count, x4 = filter index
// Per row: load 16 bytes from src - 1; outputs 0..3 come from a 4-lane USDOT,
// outputs 4..5 from a 2-lane USDOT. 12 bytes are written per row.
// The loop body runs before the row counter is tested.
function ff_hevc_put_hevc_epel_h6_8_neon_i8mm, export=1
        EPEL_H_HEADER
1:      ld1             {v0.16b}, [x1], x2
        movi            v16.16b, #0                     // accumulator, outputs 0..3
        movi            v17.16b, #0                     // accumulator, outputs 4..5
        ext             v1.16b, v0.16b, v0.16b, #1      // 16-byte form: window 5 needs byte 8
        ext             v2.8b, v0.8b, v0.8b, #2
        ext             v3.8b, v0.8b, v0.8b, #3
        trn2            v5.2s, v0.2s, v1.2s             // windows 4, 5
        trn1            v4.2s, v0.2s, v1.2s             // windows 0, 1
        trn1            v2.2s, v2.2s, v3.2s             // windows 2, 3
        trn1            v4.2d, v4.2d, v2.2d             // windows 0..3
        usdot           v16.4s, v4.16b, v30.16b
        usdot           v17.2s, v5.8b, v30.8b
        xtn             v16.4h, v16.4s
        xtn             v17.4h, v17.4s
        subs            w3, w3, #1                      // rows left
        str             d16, [x0]                       // outputs 0..3
        str             s17, [x0, #8]                   // outputs 4..5
        add             x0, x0, x10
        b.ne            1b
        ret
endfunc
|
||||
|
||||
// Horizontal 4-tap EPEL filter, 8 outputs per row, 8-bit in / 16-bit out.
//   x0 = dst, x1 = src, x2 = source stride, w3 = row count, x4 = filter index
// Per row: load 16 bytes from src - 1. Even-numbered windows are gathered in
// one vector and odd-numbered windows in another, so that the interleaving
// ST2 writes the results back in natural order.
// The loop body runs before the row counter is tested.
function ff_hevc_put_hevc_epel_h8_8_neon_i8mm, export=1
        EPEL_H_HEADER
1:      ld1             {v0.16b}, [x1], x2
        movi            v16.16b, #0                     // accumulator, even outputs
        movi            v17.16b, #0                     // accumulator, odd outputs
        ext             v2.16b, v0.16b, v0.16b, #2
        ext             v1.16b, v0.16b, v0.16b, #1
        ext             v3.16b, v0.16b, v0.16b, #3
        zip1            v4.4s, v0.4s, v2.4s             // windows 0, 2, 4, 6
        zip1            v5.4s, v1.4s, v3.4s             // windows 1, 3, 5, 7
        usdot           v16.4s, v4.16b, v30.16b
        usdot           v17.4s, v5.16b, v30.16b
        xtn             v16.4h, v16.4s
        xtn             v17.4h, v17.4s
        subs            w3, w3, #1                      // rows left
        st2             {v16.4h, v17.4h}, [x0], x10
        b.ne            1b
        ret
endfunc
|
||||
|
||||
// Horizontal 4-tap EPEL filter, 12 outputs per row, 8-bit in / 16-bit out.
//   x0 = dst, x1 = src, x2 = source stride, w3 = row count, x4 = filter index
// Per row: load 16 bytes from src - 1 and transpose the four byte-shifted
// copies so that three vectors hold windows 0..3, 4..7 and 8..11 in order.
// 24 bytes are written per row.
// The loop body runs before the row counter is tested.
function ff_hevc_put_hevc_epel_h12_8_neon_i8mm, export=1
        EPEL_H_HEADER
1:      ld1             {v0.16b}, [x1], x2
        movi            v20.16b, #0                     // accumulator, outputs 0..3
        movi            v21.16b, #0                     // accumulator, outputs 4..7
        movi            v22.16b, #0                     // accumulator, outputs 8..11
        ext             v1.16b, v0.16b, v0.16b, #1
        ext             v2.16b, v0.16b, v0.16b, #2
        ext             v3.16b, v0.16b, v0.16b, #3
        trn1            v4.2d, v0.2d, v2.2d             // windows 0, 4, 2, 6
        trn1            v5.2d, v1.2d, v3.2d             // windows 1, 5, 3, 7
        trn2            v6.2d, v0.2d, v2.2d             // windows 8, 12, 10, 14
        trn2            v7.2d, v1.2d, v3.2d             // windows 9, 13, 11, 15
        trn1            v0.4s, v4.4s, v5.4s             // windows 0..3
        trn2            v1.4s, v4.4s, v5.4s             // windows 4..7
        trn1            v2.4s, v6.4s, v7.4s             // windows 8..11
        usdot           v20.4s, v0.16b, v30.16b
        usdot           v21.4s, v1.16b, v30.16b
        usdot           v22.4s, v2.16b, v30.16b
        xtn             v20.4h, v20.4s
        xtn2            v20.8h, v21.4s
        xtn             v22.4h, v22.4s
        subs            w3, w3, #1                      // rows left
        str             q20, [x0]                       // outputs 0..7
        str             d22, [x0, #16]                  // outputs 8..11
        add             x0, x0, x10
        b.ne            1b
        ret
endfunc
|
||||
|
||||
// Horizontal 4-tap EPEL filter, 16 outputs per row, 8-bit in / 16-bit out.
//   x0 = dst, x1 = src, x2 = source stride, w3 = row count, x4 = filter index
// Per row: load 32 bytes from src - 1. Windows are split into even and odd
// sets before the dot products so that ST2 restores natural order on store.
// The loop body runs before the row counter is tested.
function ff_hevc_put_hevc_epel_h16_8_neon_i8mm, export=1
        EPEL_H_HEADER
1:      ld1             {v0.16b, v1.16b}, [x1], x2
        movi            v20.16b, #0                     // even outputs 0..6
        movi            v21.16b, #0                     // odd outputs 1..7
        movi            v22.16b, #0                     // even outputs 8..14
        movi            v23.16b, #0                     // odd outputs 9..15
        ext             v2.16b, v0.16b, v1.16b, #1
        ext             v3.16b, v0.16b, v1.16b, #2
        ext             v4.16b, v0.16b, v1.16b, #3
        zip1            v24.4s, v0.4s, v3.4s            // windows 0, 2, 4, 6
        zip1            v25.4s, v2.4s, v4.4s            // windows 1, 3, 5, 7
        zip2            v26.4s, v0.4s, v3.4s            // windows 8, 10, 12, 14
        zip2            v27.4s, v2.4s, v4.4s            // windows 9, 11, 13, 15
        usdot           v20.4s, v24.16b, v30.16b
        usdot           v21.4s, v25.16b, v30.16b
        usdot           v22.4s, v26.16b, v30.16b
        usdot           v23.4s, v27.16b, v30.16b
        xtn             v20.4h, v20.4s
        xtn2            v20.8h, v22.4s                  // all even outputs
        xtn             v21.4h, v21.4s
        xtn2            v21.8h, v23.4s                  // all odd outputs
        subs            w3, w3, #1                      // rows left
        st2             {v20.8h, v21.8h}, [x0], x10
        b.ne            1b
        ret
endfunc
|
||||
|
||||
// Horizontal 4-tap EPEL filter, 24 outputs per row, 8-bit in / 16-bit out.
//   x0 = dst, x1 = src, x2 = source stride, w3 = row count, x4 = filter index
// Per row: load 32 bytes from src - 1. Each USDOT yields outputs spaced four
// apart (k, k+4, k+8, ...); the ZIP steps regroup them so that outputs 0..15
// go out through an interleaving ST2 and outputs 16..23 through a plain ST1.
// The loop body runs before the row counter is tested.
function ff_hevc_put_hevc_epel_h24_8_neon_i8mm, export=1
        EPEL_H_HEADER
1:      ld1             {v0.16b, v1.16b}, [x1], x2
        // windows starting in the first 16 bytes
        ext             v2.16b, v0.16b, v1.16b, #1
        ext             v3.16b, v0.16b, v1.16b, #2
        ext             v4.16b, v0.16b, v1.16b, #3
        movi            v16.16b, #0
        movi            v17.16b, #0
        movi            v18.16b, #0
        movi            v19.16b, #0
        usdot           v16.4s, v0.16b, v30.16b         // outputs 0, 4, 8, 12
        usdot           v17.4s, v2.16b, v30.16b         // outputs 1, 5, 9, 13
        usdot           v18.4s, v3.16b, v30.16b         // outputs 2, 6, 10, 14
        usdot           v19.4s, v4.16b, v30.16b         // outputs 3, 7, 11, 15
        // windows starting in the second 16 bytes; only lanes 0 and 1
        // (outputs 16..23) are stored
        ext             v5.16b, v1.16b, v1.16b, #1
        ext             v6.16b, v1.16b, v1.16b, #2
        ext             v7.16b, v1.16b, v1.16b, #3
        movi            v20.16b, #0
        movi            v21.16b, #0
        movi            v22.16b, #0
        movi            v23.16b, #0
        usdot           v20.4s, v1.16b, v30.16b         // outputs 16, 20, ...
        usdot           v21.4s, v5.16b, v30.16b         // outputs 17, 21, ...
        usdot           v22.4s, v6.16b, v30.16b         // outputs 18, 22, ...
        usdot           v23.4s, v7.16b, v30.16b         // outputs 19, 23, ...
        xtn             v16.4h, v16.4s
        xtn2            v16.8h, v20.4s
        xtn             v17.4h, v17.4s
        xtn2            v17.8h, v21.4s
        xtn             v18.4h, v18.4s
        xtn2            v18.8h, v22.4s
        xtn             v19.4h, v19.4s
        xtn2            v19.8h, v23.4s
        zip1            v24.8h, v16.8h, v18.8h          // outputs 0, 2, ..., 14
        zip1            v25.8h, v17.8h, v19.8h          // outputs 1, 3, ..., 15
        zip2            v26.8h, v16.8h, v18.8h          // outputs 16, 18, ...
        zip2            v27.8h, v17.8h, v19.8h          // outputs 17, 19, ...
        zip1            v26.8h, v26.8h, v27.8h          // outputs 16..23
        add             x7, x0, #32                     // taken before x0 is advanced
        subs            w3, w3, #1                      // rows left
        st2             {v24.8h, v25.8h}, [x0], x10
        st1             {v26.8h}, [x7]
        b.ne            1b
        ret
endfunc
|
||||
|
||||
// Horizontal 4-tap EPEL filter, 32 outputs per row, 8-bit in / 16-bit out.
//   x0 = dst, x1 = src, x2 = source stride, w3 = row count, x4 = filter index
// Per row: load 48 bytes from src - 1. The four byte offsets 0..3 produce the
// outputs k, k+4, k+8, ... in four separate vectors, which the 4-way
// interleaving ST4 writes back in natural order.
// The loop body runs before the row counter is tested.
function ff_hevc_put_hevc_epel_h32_8_neon_i8mm, export=1
        EPEL_H_HEADER
1:      ld1             {v0.16b, v1.16b, v2.16b}, [x1], x2
        // windows starting in bytes 0..15
        ext             v4.16b, v0.16b, v1.16b, #1
        ext             v5.16b, v0.16b, v1.16b, #2
        ext             v6.16b, v0.16b, v1.16b, #3
        movi            v16.16b, #0
        movi            v17.16b, #0
        movi            v18.16b, #0
        movi            v19.16b, #0
        usdot           v16.4s, v0.16b, v30.16b         // outputs 0, 4, 8, 12
        usdot           v17.4s, v4.16b, v30.16b         // outputs 1, 5, 9, 13
        usdot           v18.4s, v5.16b, v30.16b         // outputs 2, 6, 10, 14
        usdot           v19.4s, v6.16b, v30.16b         // outputs 3, 7, 11, 15
        // windows starting in bytes 16..31
        ext             v24.16b, v1.16b, v2.16b, #1
        ext             v25.16b, v1.16b, v2.16b, #2
        ext             v26.16b, v1.16b, v2.16b, #3
        movi            v20.16b, #0
        movi            v21.16b, #0
        movi            v22.16b, #0
        movi            v23.16b, #0
        usdot           v20.4s, v1.16b, v30.16b         // outputs 16, 20, 24, 28
        usdot           v21.4s, v24.16b, v30.16b        // outputs 17, 21, 25, 29
        usdot           v22.4s, v25.16b, v30.16b        // outputs 18, 22, 26, 30
        usdot           v23.4s, v26.16b, v30.16b        // outputs 19, 23, 27, 31
        xtn             v16.4h, v16.4s
        xtn2            v16.8h, v20.4s
        xtn             v17.4h, v17.4s
        xtn2            v17.8h, v21.4s
        xtn             v18.4h, v18.4s
        xtn2            v18.8h, v22.4s
        xtn             v19.4h, v19.4s
        xtn2            v19.8h, v23.4s
        subs            w3, w3, #1                      // rows left
        st4             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x10
        b.ne            1b
        ret
endfunc
|
||||
|
||||
// Horizontal 4-tap EPEL filter, 48 outputs per row, 8-bit in / 16-bit out.
//   x0 = dst, x1 = src, x2 = source stride, w3 = row count, x4 = filter index
// Per row: load 64 bytes from src - 1. Outputs 0..31 are produced and stored
// with a 4-way interleaving ST4 of .8h vectors; outputs 32..47 are produced
// from the last two source vectors and stored with a 4-way interleaving ST4
// of .4h vectors at byte offset 64 of the same row.
// The loop body runs before the row counter is tested.
function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1
        EPEL_H_HEADER
1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
        subs            w3, w3, #1   // height
        // ---- outputs 0..31 ----
        ext             v4.16b, v0.16b, v1.16b, #1
        ext             v5.16b, v0.16b, v1.16b, #2
        ext             v6.16b, v0.16b, v1.16b, #3
        ext             v16.16b, v1.16b, v2.16b, #1
        ext             v17.16b, v1.16b, v2.16b, #2
        ext             v18.16b, v1.16b, v2.16b, #3
        movi            v20.2d, #0
        movi            v21.2d, #0
        movi            v22.2d, #0
        movi            v23.2d, #0
        usdot           v20.4s, v0.16b, v30.16b         // outputs 0, 4, 8, 12
        usdot           v21.4s, v4.16b, v30.16b         // outputs 1, 5, 9, 13
        usdot           v22.4s, v5.16b, v30.16b         // outputs 2, 6, 10, 14
        usdot           v23.4s, v6.16b, v30.16b         // outputs 3, 7, 11, 15
        movi            v24.2d, #0
        movi            v25.2d, #0
        movi            v26.2d, #0
        movi            v27.2d, #0
        usdot           v24.4s, v1.16b, v30.16b         // outputs 16, 20, 24, 28
        usdot           v25.4s, v16.16b, v30.16b        // outputs 17, 21, 25, 29
        usdot           v26.4s, v17.16b, v30.16b        // outputs 18, 22, 26, 30
        usdot           v27.4s, v18.16b, v30.16b        // outputs 19, 23, 27, 31
        xtn             v20.4h, v20.4s
        xtn2            v20.8h, v24.4s
        xtn             v21.4h, v21.4s
        xtn2            v21.8h, v25.4s
        xtn             v22.4h, v22.4s
        xtn2            v22.8h, v26.4s
        xtn             v23.4h, v23.4s
        xtn2            v23.8h, v27.4s
        // Address of output 32 in the CURRENT row. It must be taken before the
        // post-indexed store below moves x0 on to the next row.
        add             x7, x0, #64
        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], x10
        // ---- outputs 32..47 ----
        ext             v4.16b, v2.16b, v3.16b, #1
        ext             v5.16b, v2.16b, v3.16b, #2
        ext             v6.16b, v2.16b, v3.16b, #3
        movi            v20.2d, #0
        movi            v21.2d, #0
        movi            v22.2d, #0
        movi            v23.2d, #0
        usdot           v20.4s, v2.16b, v30.16b         // outputs 32, 36, 40, 44
        usdot           v21.4s, v4.16b, v30.16b         // outputs 33, 37, 41, 45
        usdot           v22.4s, v5.16b, v30.16b         // outputs 34, 38, 42, 46
        usdot           v23.4s, v6.16b, v30.16b         // outputs 35, 39, 43, 47
        // Narrow each accumulator on its own and let ST4 interleave the four
        // vectors element by element: 32, 33, 34, 35, 36, ... 47. Pairing them
        // with xtn2 and storing with ST2 would emit 32, 33, 36, 37, ... instead.
        xtn             v20.4h, v20.4s
        xtn             v21.4h, v21.4s
        xtn             v22.4h, v22.4s
        xtn             v23.4h, v23.4s
        st4             {v20.4h, v21.4h, v22.4h, v23.4h}, [x7]
        b.ne            1b
        ret
endfunc
|
||||
|
||||
// Horizontal 4-tap EPEL filter, 64 outputs per row, 8-bit in / 16-bit out.
//   x0 = dst, x1 = src, x2 = source stride, w3 = row count, x4 = filter index
// Per row: load 64 bytes from src - 1 (advancing x1 by 64), emit outputs
// 0..31, load 8 more bytes for the windows that cross the 64-byte boundary,
// then emit outputs 32..63. x2 is reduced by 64 up front so that the second
// load's post-increment lands x1 on the next row. Each ST4 advances x0 by 64
// bytes, so x10 from the header is not used here.
// The loop body runs before the row counter is tested.
function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
        EPEL_H_HEADER
        sub             x2, x2, #64
1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
        // ---- outputs 0..31 ----
        movi            v20.16b, #0
        movi            v21.16b, #0
        movi            v22.16b, #0
        movi            v23.16b, #0
        movi            v24.16b, #0
        movi            v25.16b, #0
        movi            v26.16b, #0
        movi            v27.16b, #0
        ext             v4.16b, v0.16b, v1.16b, #1
        ext             v5.16b, v0.16b, v1.16b, #2
        ext             v6.16b, v0.16b, v1.16b, #3
        usdot           v20.4s, v0.16b, v30.16b         // outputs 0, 4, 8, 12
        usdot           v21.4s, v4.16b, v30.16b         // outputs 1, 5, 9, 13
        usdot           v22.4s, v5.16b, v30.16b         // outputs 2, 6, 10, 14
        usdot           v23.4s, v6.16b, v30.16b         // outputs 3, 7, 11, 15
        ext             v16.16b, v1.16b, v2.16b, #1
        ext             v17.16b, v1.16b, v2.16b, #2
        ext             v18.16b, v1.16b, v2.16b, #3
        usdot           v24.4s, v1.16b, v30.16b         // outputs 16, 20, 24, 28
        usdot           v25.4s, v16.16b, v30.16b        // outputs 17, 21, 25, 29
        usdot           v26.4s, v17.16b, v30.16b        // outputs 18, 22, 26, 30
        usdot           v27.4s, v18.16b, v30.16b        // outputs 19, 23, 27, 31
        xtn             v20.4h, v20.4s
        xtn2            v20.8h, v24.4s
        xtn             v21.4h, v21.4s
        xtn2            v21.8h, v25.4s
        xtn             v22.4h, v22.4s
        xtn2            v22.8h, v26.4s
        xtn             v23.4h, v23.4s
        xtn2            v23.8h, v27.4s
        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
        // ---- outputs 32..63 ----
        ld1             {v7.8b}, [x1], x2               // bytes 64..71; x1 -> next row
        movi            v20.16b, #0
        movi            v21.16b, #0
        movi            v22.16b, #0
        movi            v23.16b, #0
        movi            v24.16b, #0
        movi            v25.16b, #0
        movi            v26.16b, #0
        movi            v27.16b, #0
        ext             v4.16b, v2.16b, v3.16b, #1
        ext             v5.16b, v2.16b, v3.16b, #2
        ext             v6.16b, v2.16b, v3.16b, #3
        usdot           v20.4s, v2.16b, v30.16b         // outputs 32, 36, 40, 44
        usdot           v21.4s, v4.16b, v30.16b         // outputs 33, 37, 41, 45
        usdot           v22.4s, v5.16b, v30.16b         // outputs 34, 38, 42, 46
        usdot           v23.4s, v6.16b, v30.16b         // outputs 35, 39, 43, 47
        ext             v16.16b, v3.16b, v7.16b, #1
        ext             v17.16b, v3.16b, v7.16b, #2
        ext             v18.16b, v3.16b, v7.16b, #3
        usdot           v24.4s, v3.16b, v30.16b         // outputs 48, 52, 56, 60
        usdot           v25.4s, v16.16b, v30.16b        // outputs 49, 53, 57, 61
        usdot           v26.4s, v17.16b, v30.16b        // outputs 50, 54, 58, 62
        usdot           v27.4s, v18.16b, v30.16b        // outputs 51, 55, 59, 63
        xtn             v20.4h, v20.4s
        xtn2            v20.8h, v24.4s
        xtn             v21.4h, v21.4s
        xtn2            v21.8h, v25.4s
        xtn             v22.4h, v22.4s
        xtn2            v22.8h, v26.4s
        xtn             v23.4h, v23.4s
        xtn2            v23.8h, v27.4s
        subs            w3, w3, #1                      // rows left
        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
        b.ne            1b
        ret
endfunc
|
||||
|
||||
.macro EPEL_UNI_W_H_HEADER
|
||||
ldr x12, [sp]
|
||||
sub x2, x2, #1
|
||||
|
@@ -171,6 +171,10 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
|
||||
int height, int denom, int wx, int ox,
|
||||
intptr_t mx, intptr_t my, int width),);
|
||||
|
||||
// Prototype(s) for the 8-bit horizontal EPEL kernels carrying the _i8mm suffix.
// NOTE(review): NEON8_FNPROTO is defined outside this view; presumably it emits
// one declaration per block width (ff_hevc_put_hevc_epel_h<W>_8_neon_i8mm) --
// confirm against the macro definition.
NEON8_FNPROTO(epel_h, (int16_t *dst,
|
||||
const uint8_t *_src, ptrdiff_t _srcstride,
|
||||
int height, intptr_t mx, intptr_t my, int width), _i8mm);
|
||||
|
||||
NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
|
||||
const uint8_t *_src, ptrdiff_t _srcstride,
|
||||
int height, int denom, int wx, int ox,
|
||||
@@ -283,6 +287,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
|
||||
NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
|
||||
|
||||
if (have_i8mm(cpu_flags)) {
|
||||
NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
|
||||
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
|
||||
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
|
||||
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
|
||||
|
Reference in New Issue
Block a user