lavc/aarch64: Add neon implementation for vsse_intra16

Provide optimized implementation for vsse_intra16 for arm64. Performance tests are shown below. - vsse_4_c: 155.2 - vsse_4_neon: 36.2 Benchmarks and tests are run with checkasm tool on AWS Graviton 3. Signed-off-by: Hubert Mazur <hum@semihalf.com> Signed-off-by: Martin Storsjö <martin@martin.st>
2025-08-10 06:10:52 +02:00 · 2022-09-08 11:25:06 +02:00
parent ce03ea3e79
commit 908abe8032
2 changed files with 66 additions and 0 deletions
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -47,6 +47,8 @@ int vsad_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
                      ptrdiff_t stride, int h) ;
 int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                ptrdiff_t stride, int h);
 /* NEON vertical SSE over a single 16-byte-wide source block; implemented in
  * me_cmp_neon.S. The assembly never reads the `dummy` pointer. */
 int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
                      ptrdiff_t stride, int h);
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -69,5 +71,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
        c->vsad[4] = vsad_intra16_neon;
        c->vsse[0] = vsse16_neon;
        c->vsse[4] = vsse_intra16_neon;
    }
 }
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -784,3 +784,66 @@ function vsad_intra16_neon, export=1
        ret
 endfunc
 function vsse_intra16_neon, export=1
        // int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s,
        //                       const uint8_t *dummy, ptrdiff_t stride, int h)
        //
        // Sum of squared differences between every pair of vertically
        // adjacent rows of a 16-byte-wide block:
        //   score = sum over y in [1, h), x in [0, 16) of
        //           (pix1[(y - 1) * stride + x] - pix1[y * stride + x])^2
        // Returns 0 without touching memory when h <= 1 (no row pairs).
        //
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *dummy (never read)
        // x3           ptrdiff_t stride
        // w4           int h
        // w0           returned score
        //
        // Scratch registers: v0-v3, v16, v17, v24-v30.
        movi            v16.4s, #0              // accumulator for columns 0-7
        movi            v17.4s, #0              // accumulator for columns 8-15
        subs            w4, w4, #1              // we need to make h-1 iterations
        b.le            3f                      // h <= 1: nothing to compare, score stays 0
        ld1             {v0.16b}, [x1], x3      // v0 = first row ("previous" row)
        cmp             w4, #3
        b.lt            2f                      // fewer than 3 row pairs: tail loop only
 // iterate by three row pairs
 1:
        // v = abs( pix1[0] - pix1[0 + stride] )
        // score = sum( v * v )
        ld1             {v1.16b}, [x1], x3
        ld1             {v2.16b}, [x1], x3
        uabd            v30.16b, v0.16b, v1.16b // |row0 - row1|
        ld1             {v3.16b}, [x1], x3
        umull           v29.8h, v30.8b, v30.8b
        umull2          v28.8h, v30.16b, v30.16b
        uabd            v27.16b, v1.16b, v2.16b // |row1 - row2|
        uadalp          v16.4s, v29.8h
        umull           v26.8h, v27.8b, v27.8b
        umull2          v27.8h, v27.16b, v27.16b // low half of v27 already consumed above
        uadalp          v17.4s, v28.8h
        uabd            v25.16b, v2.16b, v3.16b // |row2 - row3|
        uadalp          v16.4s, v26.8h
        umull           v24.8h, v25.8b, v25.8b
        umull2          v25.8h, v25.16b, v25.16b // low half of v25 already consumed above
        uadalp          v17.4s, v27.8h
        sub             w4, w4, #3
        uadalp          v16.4s, v24.8h
        cmp             w4, #3                  // flags are consumed by b.ge below;
        uadalp          v17.4s, v25.8h          // the vector ops in between do not alter them
        mov             v0.16b, v3.16b          // last row becomes the next "previous" row
        b.ge            1b
        cbz             w4, 3f                  // no leftover row pairs
 // iterate by one
 2:
        ld1             {v1.16b}, [x1], x3
        subs            w4, w4, #1              // flags are consumed by b.ne below
        uabd            v30.16b, v0.16b, v1.16b
        mov             v0.16b, v1.16b
        umull           v29.8h, v30.8b, v30.8b
        umull2          v30.8h, v30.16b, v30.16b
        uadalp          v16.4s, v29.8h
        uadalp          v17.4s, v30.8h
        b.ne            2b
 3:
        // reduce both accumulators to one scalar and return its low 32 bits
        add             v16.4s, v16.4s, v17.4s
        uaddlv          d17, v16.4s
        fmov            w0, s17
        ret
 endfunc