1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-03-28 12:32:17 +02:00

lavc/aarch64: Provide neon implementation of nsse8

Add vectorized implementation of nsse8 function.

Performance comparison tests are shown below.
- nsse_1_c: 256.0
- nsse_1_neon: 82.7

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Grzegorz Bernacki 2022-10-03 16:10:17 +02:00 committed by Martin Storsjö
parent f401a2af21
commit faea56c9c7
2 changed files with 114 additions and 0 deletions

View File

@ -66,6 +66,11 @@ int ff_pix_abs8_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *p
int ff_pix_abs8_xy2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
/* Assembly routine (see "function nsse8_neon, export=1"): the first argument
 * is the integer weight that the routine multiplies its absolute-value term
 * by before adding the sse8_neon() result. */
int nsse8_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
ptrdiff_t stride, int h);
/* C adapter around nsse8_neon(): takes an MpegEncContext pointer as its first
 * argument and forwards c->avctx->nsse_weight as the weight, or 8 when c is
 * NULL. */
int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
@ -94,6 +99,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsse[4] = vsse_intra16_neon;
c->nsse[0] = nsse16_neon_wrapper;
c->nsse[1] = nsse8_neon_wrapper;
c->median_sad[0] = pix_median_abs16_neon;
c->median_sad[1] = pix_median_abs8_neon;
@ -108,3 +114,12 @@ int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
else
return nsse16_neon(8, s1, s2, stride, h);
}
/**
 * Adapter that lets nsse8_neon() be installed where the first argument is an
 * MpegEncContext pointer (it is assigned to c->nsse[1] in the init function).
 *
 * @param c      encoder context, may be NULL
 * @param s1     first pixel block
 * @param s2     second pixel block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows
 * @return the value computed by nsse8_neon() for the selected weight
 */
int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                       ptrdiff_t stride, int h)
{
    /* Without a context there is no configured weight: fall back to 8. */
    const int weight = c ? c->avctx->nsse_weight : 8;

    return nsse8_neon(weight, s1, s2, stride, h);
}

View File

@ -1158,6 +1158,105 @@ function nsse16_neon, export=1
ret
endfunc
// int nsse8_neon(int multiplier, const uint8_t *pix1, const uint8_t *pix2,
//                ptrdiff_t stride, int h)
//
// Returns score1 + |score2| * multiplier, where
//   score1 = value returned by sse8_neon, called with the incoming x0-x4
//   score2 = sum, over every pair of consecutive rows and x = 0..6, of
//              |p1[x] - p1[x + stride] - p1[x + 1] + p1[x + stride + 1]|
//            - |p2[x] - p2[x + stride] - p2[x + 1] + p2[x + stride + 1]|
//
// Register roles after the call to sse8_neon:
//   w5       multiplier
//   w9       score1
//   w4       number of row pairs still to process (h - 1 initially)
//   v0/v1    previous row of pix1 / the same row rotated left by one byte
//   v2/v3    previous row of pix2 / the same row rotated left by one byte
//   v16/v18  per-lane 16-bit accumulators for the pix1 / pix2 terms
function nsse8_neon, export=1
        // x0           multiplier
        // x1           uint8_t *pix1
        // x2           uint8_t *pix2
        // x3           ptrdiff_t stride
        // w4           int h

        // 0x40-byte frame (keeps sp 16-byte aligned):
        //   [sp, #0x00] multiplier   [sp, #0x10] pix1, pix2
        //   [sp, #0x20] stride, h    [sp, #0x30] x30 (return address)
        // x0-x4 are caller-saved, so they are spilled around the call.
        str             x0, [sp, #-0x40]!
        stp             x1, x2, [sp, #0x10]
        stp             x3, x4, [sp, #0x20]
        str             x30, [sp, #0x30]
        bl              X(sse8_neon)
        ldr             x30, [sp, #0x30]
        mov             w9, w0                          // here we store score1
        ldr             x5, [sp]                        // multiplier (only w5 is used)
        ldp             x1, x2, [sp, #0x10]
        ldp             x3, x4, [sp, #0x20]
        add             sp, sp, #0x40

        movi            v16.8h, #0
        movi            v18.8h, #0

        ld1             {v0.8b}, [x1], x3
        subs            w4, w4, #1                      // we need to make h-1 iterations
        // With h <= 1 there is no pair of rows: the accumulators stay zero and
        // the result is score1 alone. Without this check the single-row loop
        // below would read a row past the block and its counter would go
        // negative, so cbnz would not terminate the loop.
        b.le            3f
        ext             v1.8b, v0.8b, v0.8b, #1         // x1 + 1
        ld1             {v2.8b}, [x2], x3
        cmp             w4, #2
        ext             v3.8b, v2.8b, v2.8b, #1         // x2 + 1

        b.lt            2f

// make 2 iterations at once
1:
        ld1             {v4.8b}, [x1], x3
        ld1             {v20.8b}, [x1], x3
        ld1             {v6.8b}, [x2], x3
        ext             v5.8b, v4.8b, v4.8b, #1         // x1 + stride + 1
        ext             v21.8b, v20.8b, v20.8b, #1
        ld1             {v22.8b}, [x2], x3
        ext             v7.8b, v6.8b, v6.8b, #1         // x2 + stride + 1
        usubl           v31.8h, v0.8b, v4.8b            // p1[x] - p1[x + stride]
        ext             v23.8b, v22.8b, v22.8b, #1
        usubl           v29.8h, v1.8b, v5.8b            // p1[x + 1] - p1[x + stride + 1]
        usubl           v27.8h, v2.8b, v6.8b            // same two differences for pix2
        usubl           v25.8h, v3.8b, v7.8b
        saba            v16.8h, v31.8h, v29.8h          // v16 += |v31 - v29|
        usubl           v31.8h, v4.8b, v20.8b           // second row pair starts here
        saba            v18.8h, v27.8h, v25.8h          // v18 += |v27 - v25|
        sub             w4, w4, #2
        usubl           v29.8h, v5.8b, v21.8b
        mov             v0.16b, v20.16b                 // last loaded rows become "previous"
        mov             v1.16b, v21.16b
        saba            v16.8h, v31.8h, v29.8h
        usubl           v27.8h, v6.8b, v22.8b
        usubl           v25.8h, v7.8b, v23.8b
        mov             v2.16b, v22.16b
        mov             v3.16b, v23.16b
        cmp             w4, #2
        saba            v18.8h, v27.8h, v25.8h

        b.ge            1b
        cbz             w4, 3f

// iterate by one
2:
        ld1             {v4.8b}, [x1], x3
        subs            w4, w4, #1
        ext             v5.8b, v4.8b, v4.8b, #1         // x1 + stride + 1
        ld1             {v6.8b}, [x2], x3
        usubl           v31.8h, v0.8b, v4.8b
        ext             v7.8b, v6.8b, v6.8b, #1         // x2 + stride + 1
        usubl           v29.8h, v1.8b, v5.8b
        saba            v16.8h, v31.8h, v29.8h
        usubl           v27.8h, v2.8b, v6.8b
        usubl           v25.8h, v3.8b, v7.8b
        saba            v18.8h, v27.8h, v25.8h
        mov             v0.16b, v4.16b
        mov             v1.16b, v5.16b
        mov             v2.16b, v6.16b
        mov             v3.16b, v7.16b

        cbnz            w4, 2b

3:
        sqsub           v16.8h, v16.8h, v18.8h          // per-lane pix1 term - pix2 term
        // Lane 7 was built from the byte that ext wrapped around from
        // position 0, so it is not a real x + 1 neighbour: discard it.
        ins             v16.h[7], wzr
        saddlv          s16, v16.8h                     // score2 = signed sum of all lanes
        sqabs           s16, s16                        // |score2|
        fmov            w0, s16
        mul             w0, w0, w5                      // |score2| * multiplier
        add             w0, w0, w9                      // + score1

        ret
endfunc
function pix_median_abs16_neon, export=1
// x0 unused
// x1 uint8_t *pix1