avcodec/vc1: Arm 32-bit NEON unescape fast path

checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows.

vc1dsp.vc1_unescape_buffer_c: 918624.7
vc1dsp.vc1_unescape_buffer_neon: 142958.0

Signed-off-by: Ben Avison <bavison@riscosopen.org>
Signed-off-by: Martin Storsjö <martin@martin.st>
2025-12-09 22:02:17 +02:00 · 2022-03-31 18:23:51 +01:00
parent 6eee650289
commit 23c92e14f5
2 changed files with 179 additions and 0 deletions
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -19,6 +19,7 @@
 #include <stdint.h>
 #include "libavutil/attributes.h"
 #include "libavutil/intreadwrite.h"
 #include "libavcodec/vc1dsp.h"
 #include "vc1dsp.h"
@@ -84,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int h, int x, int y);
 int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);

 /* Nonzero when the four bytes at p, read as a 32-bit little-endian word with
  * bits 24-25 masked off, equal 0x00030000 - i.e. the bytes 00 00 03 followed
  * by a byte whose upper six bits are all clear. Reads exactly four bytes. */
 static inline int vc1_escape_starts_at(const uint8_t *p)
 {
    return (AV_RL32(p) & ~0x03000000) == 0x00030000;
 }

 /* Copy size bytes from src to dst, dropping the 03 byte of every escape
  * sequence recognised by vc1_escape_starts_at(). Returns the number of bytes
  * written to dst, which never exceeds size.
  *
  * Starting, stopping and the actual removal of escape bytes are comparatively
  * cheap, so they are handled here in C; only the bulk copy of escape-free
  * data is delegated to the assembly helper, which reports how many bytes it
  * left uncopied. Note that we assume a little-endian machine that supports
  * unaligned loads. */
 static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
 {
    int written = 0;

    /* The escape test needs four readable bytes, hence the >= 4 bounds. */
    while (size >= 4) {
        int at_escape = 0;

        /* Advance one byte at a time until dst sits on an 8-byte boundary
         * (the helper's preferred alignment) or an escape turns up. */
        while (((uintptr_t) dst & 7) != 0 && size >= 4) {
            if (vc1_escape_starts_at(src)) {
                at_escape = 1;
                break;
            }
            *dst++ = *src++;
            size--;
            written++;
        }

        if (!at_escape) {
            /* Let the assembly loop copy as much escape-free data as it can. */
            int bulk = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);

            src     += bulk;
            dst     += bulk;
            size    -= bulk;
            written += bulk;

            /* Walk the remainder byte-wise up to the next escape, or until
             * fewer than four bytes are left. */
            while (size >= 4) {
                if (vc1_escape_starts_at(src)) {
                    at_escape = 1;
                    break;
                }
                *dst++ = *src++;
                size--;
                written++;
            }
        }

        if (at_escape) {
            /* Keep the two leading bytes, skip the third (the 03). The
             * byte after it is handled by the next pass. */
            dst[0]   = src[0];
            dst[1]   = src[1];
            dst     += 2;
            src     += 3;
            size    -= 3;
            written += 2;
        }
    }

    /* Fewer than four bytes left: too short to hold an escape, copy as is. */
    for (; size > 0; size--) {
        *dst++ = *src++;
        written++;
    }

    return written;
 }
 /* Fill entry X+4*Y of both put_vc1_mspel_pixels_tab tables: table 0 gets
  * ff_put_vc1_mspel_mc<X><Y>_16_neon and table 1 gets
  * ff_put_vc1_mspel_mc<X><Y>_neon.
  *
  * X and Y are token-pasted into the function names, so they must be the
  * literal tokens used in those names, not arbitrary expressions. The
  * expansion refers to a variable named dsp, consists of two statements, and
  * leaves the final semicolon to the caller - so it is not safe as the sole
  * body of an unbraced if/else/loop. */
 #define FN_ASSIGN(X, Y) \
    dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
    dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
@@ -130,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
    dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
    dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
    dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
    dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
 }
--- a/libavcodec/arm/vc1dsp_neon.S
+++ b/libavcodec/arm/vc1dsp_neon.S
@@ -1804,3 +1804,121 @@ function ff_vc1_h_loop_filter16_neon, export=1
 4:      vpop            {d8-d15}
        pop             {r4-r6,pc}
 endfunc
@ Copy at most the specified number of bytes from source to destination buffer,
@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence
@
@ The input is examined in 16-byte chunks. A chunk is stored to the
@ destination only if none of its 16 byte positions is the start of a 4-byte
@ group which, taken as one 32-bit vector lane, satisfies
@     (word & ~0x03000000) == 0x00030000
@ Testing the last three positions of a chunk needs the first three bytes of
@ the chunk after it, so the code always has the following chunk loaded before
@ it decides whether to store the current one.
@
@ If fewer than 48 bytes are offered, nothing is read or written and r0 is
@ returned equal to the incoming r1.
@
@ On entry:
@   r0 -> source buffer
@   r1 = max number of bytes to copy
@   r2 -> destination buffer, optimally 8-byte aligned
@ On exit:
@   r0 = number of bytes not copied
@ Registers written: r0-r3, r12, q0, q1, q3, q8-q15. The stack is not used.
 function ff_vc1_unescape_buffer_helper_neon, export=1
        @ Offset by 48 to screen out cases that are too short for us to handle,
        @ and also make it easy to test for loop termination, or to determine
        @ whether we need an odd number of half-iterations of the loop.
        subs    r1, r1, #48
        bmi     90f
        @ Set up useful constants
        @ q0: bits to ignore in every 32-bit lane; q1: value each masked lane
        @ is compared against.
        vmov.i32        q0, #0x3000000
        vmov.i32        q1, #0x30000
        @ Each pass of the main loop (label 2) handles two chunks and takes 32
        @ off r1. Bit 4 of the offset count picks the entry point: if it is
        @ set, start at 1 and fall into the top of the loop; if it is clear,
        @ run the prologue below and join the loop at its midpoint (label 3).
        tst             r1, #16
        bne             1f
          @ Midpoint-entry prologue: load two chunks and test the first (q8).
          @ q8 itself covers byte offsets 0, 4, 8, 12; the vext results are the
          @ 16-byte windows starting 1, 2 and 3 bytes into q8:q9 and cover the
          @ remaining offsets. After vbic/veor/vceq a lane of q12-q15 is
          @ all-ones exactly where the masked word equals 0x00030000.
          vld1.8          {q8, q9}, [r0]!
          vbic            q12, q8, q0
          vext.8          q13, q8, q9, #1
          vext.8          q14, q8, q9, #2
          vext.8          q15, q8, q9, #3
          veor            q12, q12, q1
          vbic            q13, q13, q0
          vbic            q14, q14, q0
          vbic            q15, q15, q0
          vceq.i32        q12, q12, #0
          veor            q13, q13, q1
          veor            q14, q14, q1
          veor            q15, q15, q1
          vceq.i32        q13, q13, #0
          vceq.i32        q14, q14, #0
          vceq.i32        q15, q15, #0
          @ The first half of the loop body is being skipped, so no chunk has
          @ been stored for it; add 16 so that the "sub 16" at 91 and the
          @ "subs 32" at the bottom of the loop still leave r1 correct.
          add             r1, r1, #16
          b               3f
        @ Top-entry prologue: same test as above, applied to q10 with q11 as
        @ the look-ahead chunk.
 1:      vld1.8          {q10, q11}, [r0]!
        vbic            q12, q10, q0
        vext.8          q13, q10, q11, #1
        vext.8          q14, q10, q11, #2
        vext.8          q15, q10, q11, #3
        veor            q12, q12, q1
        vbic            q13, q13, q0
        vbic            q14, q14, q0
        vbic            q15, q15, q0
        vceq.i32        q12, q12, #0
        veor            q13, q13, q1
        veor            q14, q14, q1
        veor            q15, q15, q1
        vceq.i32        q13, q13, #0
        vceq.i32        q14, q14, #0
        vceq.i32        q15, q15, #0
        @ Drop through...
        @ Main loop, first half. Two chunks are in flight and their
        @ instructions are interleaved; the indentation level shows which
        @ chunk an instruction belongs to. At the outer level the four match
        @ masks for q10 are ORed together, folded to 64 bits and moved to
        @ r3/r12; a nonzero result means some position in q10 matched, so q10
        @ is not stored and the routine exits. One level in, the next chunk
        @ (q8, with q9 freshly loaded as look-ahead) is being tested.
 2:        vmov            q8, q11
          vld1.8          {q9}, [r0]!
        vorr            q13, q12, q13
        vorr            q15, q14, q15
          vbic            q12, q8, q0
        vorr            q3, q13, q15
          vext.8          q13, q8, q9, #1
          vext.8          q14, q8, q9, #2
          vext.8          q15, q8, q9, #3
          veor            q12, q12, q1
        vorr            d6, d6, d7
          vbic            q13, q13, q0
          vbic            q14, q14, q0
          vbic            q15, q15, q0
          vceq.i32        q12, q12, #0
        vmov            r3, r12, d6
          veor            q13, q13, q1
          veor            q14, q14, q1
          veor            q15, q15, q1
          vceq.i32        q13, q13, #0
          vceq.i32        q14, q14, #0
          vceq.i32        q15, q15, #0
        orrs            r3, r3, r12
        bne             90f
        vst1.64         {q10}, [r2]!
        @ Main loop, second half (also the midpoint entry). The roles swap:
        @ the masks for q8 are reduced and q8 is stored if none matched,
        @ while q10 (with q11 freshly loaded as look-ahead) is tested for the
        @ next pass. A match leaves via 91 rather than 90 because r1 has not
        @ yet been reduced for the chunk handled in the first half.
 3:          vmov            q10, q9
            vld1.8          {q11}, [r0]!
          vorr            q13, q12, q13
          vorr            q15, q14, q15
            vbic            q12, q10, q0
          vorr            q3, q13, q15
            vext.8          q13, q10, q11, #1
            vext.8          q14, q10, q11, #2
            vext.8          q15, q10, q11, #3
            veor            q12, q12, q1
          vorr            d6, d6, d7
            vbic            q13, q13, q0
            vbic            q14, q14, q0
            vbic            q15, q15, q0
            vceq.i32        q12, q12, #0
          vmov            r3, r12, d6
            veor            q13, q13, q1
            veor            q14, q14, q1
            veor            q15, q15, q1
            vceq.i32        q13, q13, #0
            vceq.i32        q14, q14, #0
            vceq.i32        q15, q15, #0
          orrs            r3, r3, r12
          bne             91f
          vst1.64         {q8}, [r2]!
        @ Two chunks done; loop while the offset count has not gone negative.
        subs            r1, r1, #32
        bpl             2b
        @ Common exit: undo the initial offset of 48 so that r0 is the number
        @ of bytes that were not copied.
 90:     add             r0, r1, #48
        bx              lr
        @ Exit from the second half of the loop: account for the 16 bytes of
        @ the first half (or cancel the prologue's "add 16" on a midpoint
        @ entry) before taking the common exit.
 91:     sub             r1, r1, #16
        b               90b
 endfunc