swscale/aarch64/yuv2rgb: add neon yuv42{0,2}p -> gbrp unscaled colorspace converters

checkasm --bench on a Raspberry Pi 5 Model B Rev 1.0:

yuv420p_gbrp_128_c:       1243.0
yuv420p_gbrp_128_neon:     453.5
yuv420p_gbrp_1920_c:     18165.5
yuv420p_gbrp_1920_neon:   6700.0
yuv422p_gbrp_128_c:       1463.5
yuv422p_gbrp_128_neon:     471.5
yuv422p_gbrp_1920_c:     21343.7
yuv422p_gbrp_1920_neon:   6743.5
2025-01-29 22:00:58 +02:00 · 2024-08-06 12:51:06 +02:00 · 2024-08-06 12:51:06 +02:00 · 181cd260db
commit 181cd260db
parent 8744764a4c
2 changed files with 118 additions and 13 deletions
--- a/libswscale/aarch64/swscale_unscaled.c
+++ b/libswscale/aarch64/swscale_unscaled.c
@ -52,11 +52,41 @@ static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],
                                        c->yuv2rgb_y_coeff);                                \
 }                                                                                           \

+#define DECLARE_FF_YUVX_TO_GBRP_FUNCS(ifmt, ofmt) /* prototype + wrapper, 3 dst planes */   \
+int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                              \
+                                 uint8_t *dst, int linesize,                                \
+                                 const uint8_t *srcY, int linesizeY,                        \
+                                 const uint8_t *srcU, int linesizeU,                        \
+                                 const uint8_t *srcV, int linesizeV,                        \
+                                 const int16_t *table,                                      \
+                                 int y_offset,                                              \
+                                 int y_coeff,                                               \
+                                 uint8_t *dst1, int linesize1, /* second output plane */    \
+                                 uint8_t *dst2, int linesize2); /* third output plane */    \
+/* Wrapper: unpacks SwsContext and the plane arrays into the flat NEON argument list. */    \
+static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],             \
+                                           int srcStride[], int srcSliceY, int srcSliceH,   \
+                                           uint8_t *dst[], int dstStride[]) {               \
+    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                                   \
+    /* Each dst plane starts at row srcSliceY and is passed together with its stride. */    \
+    return ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                                 \
+                                        dst[0] + srcSliceY * dstStride[0], dstStride[0],    \
+                                        src[0], srcStride[0],                               \
+                                        src[1], srcStride[1],                               \
+                                        src[2], srcStride[2],                               \
+                                        yuv2rgb_table,                                      \
+                                        c->yuv2rgb_y_offset >> 6,                           \
+                                        c->yuv2rgb_y_coeff,                                 \
+                                        dst[1] + srcSliceY * dstStride[1], dstStride[1],    \
+                                        dst[2] + srcSliceY * dstStride[2], dstStride[2]);   \
+}                                                                                           \
+
 #define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx)                                             \
 DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb)                                                   \
 DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba)                                                   \
 DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr)                                                   \
 DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra)                                                   \
+DECLARE_FF_YUVX_TO_GBRP_FUNCS(yuvx, gbrp) /* planar output: dst, dst1, dst2 */              \

 DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p)
 DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p)
@ -83,11 +113,38 @@ static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],
                                        c->yuv2rgb_y_coeff);                                \
 }                                                                                           \

+#define DECLARE_FF_NVX_TO_GBRP_FUNCS(ifmt, ofmt) /* prototype + wrapper, 3 dst planes */    \
+int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                              \
+                                 uint8_t *dst, int linesize,                                \
+                                 const uint8_t *srcY, int linesizeY,                        \
+                                 const uint8_t *srcC, int linesizeC,                        \
+                                 const int16_t *table,                                      \
+                                 int y_offset,                                              \
+                                 int y_coeff,                                               \
+                                 uint8_t *dst1, int linesize1, /* second output plane */    \
+                                 uint8_t *dst2, int linesize2); /* third output plane */    \
+/* Wrapper: unpacks SwsContext and the plane arrays into the flat NEON argument list. */    \
+static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],             \
+                                           int srcStride[], int srcSliceY, int srcSliceH,   \
+                                           uint8_t *dst[], int dstStride[]) {               \
+    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                                   \
+    /* src[1] is passed as srcC; each dst plane starts at row srcSliceY. */                 \
+    return ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                                 \
+                                        dst[0] + srcSliceY * dstStride[0], dstStride[0],    \
+                                        src[0], srcStride[0], src[1], srcStride[1],         \
+                                        yuv2rgb_table,                                      \
+                                        c->yuv2rgb_y_offset >> 6,                           \
+                                        c->yuv2rgb_y_coeff,                                 \
+                                        dst[1] + srcSliceY * dstStride[1], dstStride[1],    \
+                                        dst[2] + srcSliceY * dstStride[2], dstStride[2]);   \
+}                                                                                           \
+
 #define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx)                                               \
 DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb)                                                     \
 DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba)                                                     \
 DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr)                                                     \
 DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra)                                                     \
+DECLARE_FF_NVX_TO_GBRP_FUNCS(nvx, gbrp) /* planar output: dst, dst1, dst2 */                \

 DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12)
 DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
@ -110,6 +167,7 @@ DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgba, RGBA, accurate_rnd);                            \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR, accurate_rnd);                            \
    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA, accurate_rnd);                            \
+    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, gbrp, GBRP, accurate_rnd);                            \
 } while (0)

 static void get_unscaled_swscale_neon(SwsContext *c) {
--- a/libswscale/aarch64/yuv2rgb_neon.S
+++ b/libswscale/aarch64/yuv2rgb_neon.S
@ -30,23 +30,43 @@
 #endif
 .endm

-.macro load_args_nv12
+.macro load_dst1_dst2 dst1 linesize1 dst2 linesize2                     // load the 2nd/3rd dst plane pointers and paddings from the stack
+#if defined(__APPLE__)
+#define DST_OFFSET 8
+#else
+#define DST_OFFSET 0
+#endif
+        ldr             x10, [sp, #\dst1      - DST_OFFSET]             // x10 = dst1      (offsets shrink by 8 when __APPLE__ is defined)
+        ldr             w12, [sp, #\linesize1 - DST_OFFSET]             // w12 = linesize1 (NOTE(review): presumably Apple packs the two int args
+        ldr             x15, [sp, #\dst2      - DST_OFFSET]             // x15 = dst2       before dst1 into one 8-byte slot - confirm in the ABI docs)
+        ldr             w16, [sp, #\linesize2 - DST_OFFSET]             // w16 = linesize2
+#undef DST_OFFSET
+        sub             w12, w12, w0                                    // w12 = linesize1 - width     (padding1)
+        sub             w16, w16, w0                                    // w16 = linesize2 - width     (padding2)
+.endm
+
+.macro load_args_nv12 ofmt                                              // ofmt selects the gbrp-specific setup below
        ldr             x8,  [sp]                                       // table
        load_yoff_ycoeff 8, 16                                           // y_offset, y_coeff
        ld1             {v1.1d}, [x8]
        dup             v0.8h, w10
        dup             v3.8h, w9
+.ifc \ofmt,gbrp
+        load_dst1_dst2  24, 32, 40, 48                                  // stack offsets of dst1, linesize1, dst2, linesize2
+        sub             w3, w3, w0                                      // w3 = linesize  - width     (padding)
+.else
        sub             w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding)
+.endif
        sub             w5, w5, w0                                      // w5 = linesizeY - width     (paddingY)
        sub             w7, w7, w0                                      // w7 = linesizeC - width     (paddingC)
        neg             w11, w0
 .endm

-.macro load_args_nv21
-    load_args_nv12
+.macro load_args_nv21 ofmt
+    load_args_nv12 \ofmt                                                // nv21 reuses the nv12 argument loading
 .endm

-.macro load_args_yuv420p
+.macro load_args_yuv420p ofmt
        ldr             x13, [sp]                                       // srcV
        ldr             w14, [sp, #8]                                   // linesizeV
        ldr             x8,  [sp, #16]                                  // table
@ -54,7 +74,12 @@
        ld1             {v1.1d}, [x8]
        dup             v0.8h, w10
        dup             v3.8h, w9
+.ifc \ofmt,gbrp
+        load_dst1_dst2  40, 48, 56, 64
+        sub             w3, w3, w0                                      // w3 = linesize  - width     (padding)
+.else
        sub             w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding)
+.endif
        sub             w5, w5, w0                                      // w5 = linesizeY - width     (paddingY)
        sub             w7,  w7,  w0, lsr #1                            // w7  = linesizeU - width / 2 (paddingU)
        sub             w14, w14, w0, lsr #1                            // w14 = linesizeV - width / 2 (paddingV)
@ -62,7 +87,7 @@
        neg             w11, w11
 .endm

-.macro load_args_yuv422p
+.macro load_args_yuv422p ofmt
        ldr             x13, [sp]                                       // srcV
        ldr             w14, [sp, #8]                                   // linesizeV
        ldr             x8,  [sp, #16]                                  // table
@ -70,7 +95,12 @@
        ld1             {v1.1d}, [x8]
        dup             v0.8h, w10
        dup             v3.8h, w9
+.ifc \ofmt,gbrp
+        load_dst1_dst2  40, 48, 56, 64
+        sub             w3, w3, w0                                      // w3 = linesize  - width     (padding)
+.else
        sub             w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding)
+.endif
        sub             w5, w5, w0                                      // w5 = linesizeY - width     (paddingY)
        sub             w7,  w7,  w0, lsr #1                            // w7  = linesizeU - width / 2 (paddingU)
        sub             w14, w14, w0, lsr #1                            // w14 = linesizeV - width / 2 (paddingV)
@ -100,9 +130,9 @@
 .endm

 .macro increment_nv12
-        ands            w15, w1, #1
-        csel            w16, w7, w11, ne                                // incC = (h & 1) ? paddincC : -width
-        add             x6,  x6, w16, sxtw                              // srcC += incC
+        ands            w17, w1, #1                                     // flags = h & 1, w17 is scratch (x15/w16 hold dst2/padding2 for gbrp)
+        csel            w17, w7, w11, ne                                // incC = (h & 1) ? paddingC : -width
+        add             x6,  x6, w17, sxtw                              // srcC += incC
 .endm

 .macro increment_nv21
@ -110,10 +140,10 @@
 .endm

 .macro increment_yuv420p
-        ands            w15, w1, #1
-        csel            w16,  w7, w11, ne                               // incU = (h & 1) ? paddincU : -width/2
+        ands            w17, w1, #1                                     // flags = h & 1, w17 is scratch (x15/w16 hold dst2/padding2 for gbrp)
+        csel            w17,  w7, w11, ne                               // incU = (h & 1) ? paddingU : -width/2
+        add             x6,  x6,  w17, sxtw                             // srcU += incU (add leaves the flags intact for the next csel)
         csel            w17, w14, w11, ne                               // incV = (h & 1) ? paddincV : -width/2
-        add             x6,  x6,  w16, sxtw                             // srcU += incU
         add             x13, x13, w17, sxtw                             // srcV += incV
 .endm

@ -122,7 +152,7 @@
        add             x13, x13, w14, sxtw                             // srcV += incV
 .endm

-.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
+.macro compute_rgb r1 g1 b1 r2 g2 b2
        add             v20.8h, v26.8h, v20.8h                          // Y1 + R1
        add             v21.8h, v27.8h, v21.8h                          // Y2 + R2
        add             v22.8h, v26.8h, v22.8h                          // Y1 + G1
@ -135,13 +165,18 @@
        sqrshrun        \g2, v23.8h, #1                                 // clip_u8((Y2 + G1) >> 1)
        sqrshrun        \b1, v24.8h, #1                                 // clip_u8((Y1 + B1) >> 1)
        sqrshrun        \b2, v25.8h, #1                                 // clip_u8((Y2 + B1) >> 1)
+.endm
+
+.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
+        compute_rgb     \r1, \g1, \b1, \r2, \g2, \b2                    // color channels, alpha is filled in below
         movi            \a1, #255
         movi            \a2, #255
 .endm

 .macro declare_func ifmt ofmt
 function ff_\ifmt\()_to_\ofmt\()_neon, export=1
-    load_args_\ifmt
+    load_args_\ifmt \ofmt
+
        mov             w9, w1
 1:
        mov             w8, w0                                          // w8 = width
@ -185,11 +220,22 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
        compute_rgba    v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
 .endif

+.ifc \ofmt,gbrp
+        compute_rgb     v18.8b,v4.8b,v6.8b, v19.8b,v5.8b,v7.8b
+        st1             {  v4.8b,  v5.8b }, [x2],  #16
+        st1             {  v6.8b,  v7.8b }, [x10], #16
+        st1             { v18.8b, v19.8b }, [x15], #16
+.else
        st4             { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
        st4             {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
+.endif
        subs            w8, w8, #16                                     // width -= 16
        b.gt            2b
        add             x2, x2, w3, sxtw                                // dst  += padding
+.ifc \ofmt,gbrp
+        add             x10, x10, w12, sxtw                             // dst1 += padding1
+        add             x15, x15, w16, sxtw                             // dst2 += padding2
+.endif
        add             x4, x4, w5, sxtw                                // srcY += paddingY
    increment_\ifmt
        subs            w1, w1, #1                                      // height -= 1
@ -204,6 +250,7 @@ endfunc
        declare_func    \ifmt, rgba
        declare_func    \ifmt, abgr
        declare_func    \ifmt, bgra
+        declare_func    \ifmt, gbrp
 .endm

 declare_rgb_funcs nv12