1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-01-13 21:28:01 +02:00

lavc/aarch64: port HEVC add_residual NEON

Speedup is fairly small, around 1.5%, but these are fairly simple.

Signed-off-by: Josh Dekker <josh@itanimul.li>
This commit is contained in:
Reimar Döffinger 2021-01-10 10:27:00 +00:00 committed by Josh Dekker
parent 30f80d855b
commit 00c916ef61
2 changed files with 214 additions and 0 deletions

View File

@ -36,6 +36,196 @@ const trans, align=4
.short 31, 22, 13, 4 .short 31, 22, 13, 4
endconst endconst
.macro clip10 in1, in2, c1, c2
smax \in1, \in1, \c1
smax \in2, \in2, \c1
smin \in1, \in1, \c2
smin \in2, \in2, \c2
.endm
function ff_hevc_add_residual_4x4_8_neon, export=1
ld1 {v0.8h-v1.8h}, [x1]
ld1 {v2.s}[0], [x0], x2
ld1 {v2.s}[1], [x0], x2
ld1 {v2.s}[2], [x0], x2
ld1 {v2.s}[3], [x0], x2
sub x0, x0, x2, lsl #2
uxtl v6.8h, v2.8b
uxtl2 v7.8h, v2.16b
sqadd v0.8h, v0.8h, v6.8h
sqadd v1.8h, v1.8h, v7.8h
sqxtun v0.8b, v0.8h
sqxtun2 v0.16b, v1.8h
st1 {v0.s}[0], [x0], x2
st1 {v0.s}[1], [x0], x2
st1 {v0.s}[2], [x0], x2
st1 {v0.s}[3], [x0], x2
ret
endfunc
function ff_hevc_add_residual_4x4_10_neon, export=1
mov x12, x0
ld1 {v0.8h-v1.8h}, [x1]
ld1 {v2.d}[0], [x12], x2
ld1 {v2.d}[1], [x12], x2
ld1 {v3.d}[0], [x12], x2
sqadd v0.8h, v0.8h, v2.8h
ld1 {v3.d}[1], [x12], x2
movi v4.8h, #0
sqadd v1.8h, v1.8h, v3.8h
mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF
clip10 v0.8h, v1.8h, v4.8h, v5.8h
st1 {v0.d}[0], [x0], x2
st1 {v0.d}[1], [x0], x2
st1 {v1.d}[0], [x0], x2
st1 {v1.d}[1], [x0], x2
ret
endfunc
function ff_hevc_add_residual_8x8_8_neon, export=1
add x12, x0, x2
add x2, x2, x2
mov x3, #8
1: subs x3, x3, #2
ld1 {v2.d}[0], [x0]
ld1 {v2.d}[1], [x12]
uxtl v3.8h, v2.8b
ld1 {v0.8h-v1.8h}, [x1], #32
uxtl2 v2.8h, v2.16b
sqadd v0.8h, v0.8h, v3.8h
sqadd v1.8h, v1.8h, v2.8h
sqxtun v0.8b, v0.8h
sqxtun2 v0.16b, v1.8h
st1 {v0.d}[0], [x0], x2
st1 {v0.d}[1], [x12], x2
bne 1b
ret
endfunc
function ff_hevc_add_residual_8x8_10_neon, export=1
add x12, x0, x2
add x2, x2, x2
mov x3, #8
movi v4.8h, #0
mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF
1: subs x3, x3, #2
ld1 {v0.8h-v1.8h}, [x1], #32
ld1 {v2.8h}, [x0]
sqadd v0.8h, v0.8h, v2.8h
ld1 {v3.8h}, [x12]
sqadd v1.8h, v1.8h, v3.8h
clip10 v0.8h, v1.8h, v4.8h, v5.8h
st1 {v0.8h}, [x0], x2
st1 {v1.8h}, [x12], x2
bne 1b
ret
endfunc
function ff_hevc_add_residual_16x16_8_neon, export=1
mov x3, #16
add x12, x0, x2
add x2, x2, x2
1: subs x3, x3, #2
ld1 {v16.16b}, [x0]
ld1 {v0.8h-v3.8h}, [x1], #64
ld1 {v19.16b}, [x12]
uxtl v17.8h, v16.8b
uxtl2 v18.8h, v16.16b
uxtl v20.8h, v19.8b
uxtl2 v21.8h, v19.16b
sqadd v0.8h, v0.8h, v17.8h
sqadd v1.8h, v1.8h, v18.8h
sqadd v2.8h, v2.8h, v20.8h
sqadd v3.8h, v3.8h, v21.8h
sqxtun v0.8b, v0.8h
sqxtun2 v0.16b, v1.8h
sqxtun v1.8b, v2.8h
sqxtun2 v1.16b, v3.8h
st1 {v0.16b}, [x0], x2
st1 {v1.16b}, [x12], x2
bne 1b
ret
endfunc
function ff_hevc_add_residual_16x16_10_neon, export=1
mov x3, #16
movi v20.8h, #0
mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF
add x12, x0, x2
add x2, x2, x2
1: subs x3, x3, #2
ld1 {v16.8h-v17.8h}, [x0]
ld1 {v0.8h-v3.8h}, [x1], #64
sqadd v0.8h, v0.8h, v16.8h
ld1 {v18.8h-v19.8h}, [x12]
sqadd v1.8h, v1.8h, v17.8h
sqadd v2.8h, v2.8h, v18.8h
sqadd v3.8h, v3.8h, v19.8h
clip10 v0.8h, v1.8h, v20.8h, v21.8h
clip10 v2.8h, v3.8h, v20.8h, v21.8h
st1 {v0.8h-v1.8h}, [x0], x2
st1 {v2.8h-v3.8h}, [x12], x2
bne 1b
ret
endfunc
function ff_hevc_add_residual_32x32_8_neon, export=1
add x12, x0, x2
add x2, x2, x2
mov x3, #32
1: subs x3, x3, #2
ld1 {v20.16b, v21.16b}, [x0]
uxtl v16.8h, v20.8b
uxtl2 v17.8h, v20.16b
ld1 {v22.16b, v23.16b}, [x12]
uxtl v18.8h, v21.8b
uxtl2 v19.8h, v21.16b
uxtl v20.8h, v22.8b
ld1 {v0.8h-v3.8h}, [x1], #64
ld1 {v4.8h-v7.8h}, [x1], #64
uxtl2 v21.8h, v22.16b
uxtl v22.8h, v23.8b
uxtl2 v23.8h, v23.16b
sqadd v0.8h, v0.8h, v16.8h
sqadd v1.8h, v1.8h, v17.8h
sqadd v2.8h, v2.8h, v18.8h
sqadd v3.8h, v3.8h, v19.8h
sqadd v4.8h, v4.8h, v20.8h
sqadd v5.8h, v5.8h, v21.8h
sqadd v6.8h, v6.8h, v22.8h
sqadd v7.8h, v7.8h, v23.8h
sqxtun v0.8b, v0.8h
sqxtun2 v0.16b, v1.8h
sqxtun v1.8b, v2.8h
sqxtun2 v1.16b, v3.8h
sqxtun v2.8b, v4.8h
sqxtun2 v2.16b, v5.8h
st1 {v0.16b, v1.16b}, [x0], x2
sqxtun v3.8b, v6.8h
sqxtun2 v3.16b, v7.8h
st1 {v2.16b, v3.16b}, [x12], x2
bne 1b
ret
endfunc
function ff_hevc_add_residual_32x32_10_neon, export=1
mov x3, #32
movi v20.8h, #0
mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF
1: subs x3, x3, #1
ld1 {v0.8h-v3.8h}, [x1], #64
ld1 {v16.8h-v19.8h}, [x0]
sqadd v0.8h, v0.8h, v16.8h
sqadd v1.8h, v1.8h, v17.8h
sqadd v2.8h, v2.8h, v18.8h
sqadd v3.8h, v3.8h, v19.8h
clip10 v0.8h, v1.8h, v20.8h, v21.8h
clip10 v2.8h, v3.8h, v20.8h, v21.8h
st1 {v0.8h-v3.8h}, [x0], x2
bne 1b
ret
endfunc
.macro sum_sub out, in, c, op, p .macro sum_sub out, in, c, op, p
.ifc \op, + .ifc \op, +
smlal\p \out, \in, \c smlal\p \out, \in, \c

View File

@ -25,6 +25,22 @@
#include "libavutil/aarch64/cpu.h" #include "libavutil/aarch64/cpu.h"
#include "libavcodec/hevcdsp.h" #include "libavcodec/hevcdsp.h"
void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit); void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit); void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit); void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
@ -35,10 +51,18 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
if (!have_neon(av_get_cpu_flags())) return; if (!have_neon(av_get_cpu_flags())) return;
if (bit_depth == 8) { if (bit_depth == 8) {
c->add_residual[0] = ff_hevc_add_residual_4x4_8_neon;
c->add_residual[1] = ff_hevc_add_residual_8x8_8_neon;
c->add_residual[2] = ff_hevc_add_residual_16x16_8_neon;
c->add_residual[3] = ff_hevc_add_residual_32x32_8_neon;
c->idct[1] = ff_hevc_idct_8x8_8_neon; c->idct[1] = ff_hevc_idct_8x8_8_neon;
c->idct[2] = ff_hevc_idct_16x16_8_neon; c->idct[2] = ff_hevc_idct_16x16_8_neon;
} }
if (bit_depth == 10) { if (bit_depth == 10) {
c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
c->add_residual[1] = ff_hevc_add_residual_8x8_10_neon;
c->add_residual[2] = ff_hevc_add_residual_16x16_10_neon;
c->add_residual[3] = ff_hevc_add_residual_32x32_10_neon;
c->idct[1] = ff_hevc_idct_8x8_10_neon; c->idct[1] = ff_hevc_idct_8x8_10_neon;
c->idct[2] = ff_hevc_idct_16x16_10_neon; c->idct[2] = ff_hevc_idct_16x16_10_neon;
} }