swscale: [LA] Optimize yuv2plane1_8_c.

Reviewed-by: colleague of Shiyou Yin
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
2025-01-24 13:56:33 +02:00 · 2024-03-16 11:03:32 +08:00 · 2024-03-16 11:03:32 +08:00 · 8b76df9142
commit 8b76df9142
parent f3fe2cb5f7
5 changed files with 323 additions and 15 deletions
--- a/libswscale/loongarch/output.S
+++ b/libswscale/loongarch/output.S
@ -23,11 +23,11 @@

 #include "libavcodec/loongarch/loongson_asm.S"

-/* static void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+/* void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
 *                                 const int16_t **src, uint8_t *dest, int dstW,
 *                                 const uint8_t *dither, int offset)
 */
-function ff_yuv2planeX_8_lsx
+function yuv2planeX_8_lsx
    addi.w          t1,     a6,     1
    addi.w          t2,     a6,     2
    addi.w          t3,     a6,     3
@ -136,3 +136,253 @@ function ff_yuv2planeX_8_lsx
    blt             zero,   a4,     .DEST
 .END:
 endfunc
+
+/*
+ * void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
+ *                       const uint8_t *dither, int offset)
+ *
+ * For every i in [0, dstW):
+ *     dest[i] = clip_to_0_255((src[i] + dither[(i + offset) & 7]) >> 7)
+ *
+ * Arguments arrive as a0 = src, a1 = dest, a2 = dstW, a3 = dither,
+ * a4 = offset.
+ *
+ * Layout:
+ *   1: main loop, 8 pixels per iteration.
+ *   3: tail for dstW >= 8 with dstW % 8 != 0. The last 8 pixels are redone
+ *      as one vector that overlaps pixels already written by the main loop.
+ *   5: scalar loop for dstW < 8. The overlapping tail steps src back by
+ *      16 bytes and dest back by 8 bytes, which only stays inside the
+ *      buffers once at least one full vector has been processed, so short
+ *      rows must not take that path.
+ */
+function yuv2plane1_8_lsx
+    /* t0..t7 = dither[(offset + j) & 7] for j = 0..7 */
+    addi.w       t1,    a4,    1
+    addi.w       t2,    a4,    2
+    addi.w       t3,    a4,    3
+    addi.w       t4,    a4,    4
+    addi.w       t5,    a4,    5
+    addi.w       t6,    a4,    6
+    addi.w       t7,    a4,    7
+    andi         t0,    a4,    7
+    andi         t1,    t1,    7
+    andi         t2,    t2,    7
+    andi         t3,    t3,    7
+    andi         t4,    t4,    7
+    andi         t5,    t5,    7
+    andi         t6,    t6,    7
+    andi         t7,    t7,    7
+    ldx.bu       t0,    a3,    t0
+    ldx.bu       t1,    a3,    t1
+    ldx.bu       t2,    a3,    t2
+    ldx.bu       t3,    a3,    t3
+    ldx.bu       t4,    a3,    t4
+    ldx.bu       t5,    a3,    t5
+    ldx.bu       t6,    a3,    t6
+    ldx.bu       t7,    a3,    t7
+    /* vr1 = the eight dither values as 16-bit lanes */
+    vinsgr2vr.h  vr1,   t0,    0
+    vinsgr2vr.h  vr1,   t1,    1
+    vinsgr2vr.h  vr1,   t2,    2
+    vinsgr2vr.h  vr1,   t3,    3
+    vinsgr2vr.h  vr1,   t4,    4
+    vinsgr2vr.h  vr1,   t5,    5
+    vinsgr2vr.h  vr1,   t6,    6
+    vinsgr2vr.h  vr1,   t7,    7
+    /* interleave with zero: vr2 = dither 0..3, vr3 = dither 4..7 (32-bit) */
+    vsub.h       vr0,   vr0,   vr0
+    vilvl.h      vr2,   vr0,   vr1
+    vilvh.h      vr3,   vr0,   vr1
+
+    andi         t8,    a2,    7        /* t8 = dstW % 8 */
+    srli.d       a2,    a2,    3        /* a2 = dstW / 8 */
+    beqz         a2,    5f              /* dstW < 8: scalar path, no overlap */
+1:
+    vld          vr1,   a0,    0        /* 8 x int16 source samples */
+    addi.d       a0,    a0,    16
+    vshuf4i.d    vr0,   vr1,   8        /* high half of vr0 = samples 0..3 */
+    vexth.w.h    vr4,   vr0             /* samples 0..3, sign-extended */
+    vexth.w.h    vr5,   vr1             /* samples 4..7, sign-extended */
+
+    vadd.w       vr4,   vr2,   vr4
+    vadd.w       vr5,   vr3,   vr5
+    vsrai.w      vr4,   vr4,   7
+    vsrai.w      vr5,   vr5,   7
+    vclip255.w   vr4,   vr4
+    vclip255.w   vr5,   vr5
+    /* narrow 32 -> 16 -> 8 bits; the 8 result bytes land in the low 64 bits */
+    vpickev.h    vr1,   vr5,   vr4
+    vpickev.b    vr1,   vr1,   vr1
+    fst.d        f1,    a1,    0
+    addi.d       a1,    a1,    8
+    addi.d       a2,    a2,    -1
+    bnez         a2,    1b
+2:
+    beqz         t8,    4f
+3:
+    /*
+     * Overlapping tail: the vector starts at pixel dstW - 8, so the dither
+     * phase advances by t8 (dstW - 8 and t8 are congruent modulo 8).
+     */
+    add.w        a4,    a4,    t8
+    addi.w       t1,    a4,    1
+    addi.w       t2,    a4,    2
+    addi.w       t3,    a4,    3
+    addi.w       t4,    a4,    4
+    addi.w       t5,    a4,    5
+    addi.w       t6,    a4,    6
+    addi.w       t7,    a4,    7
+    andi         t0,    a4,    7
+    andi         t1,    t1,    7
+    andi         t2,    t2,    7
+    andi         t3,    t3,    7
+    andi         t4,    t4,    7
+    andi         t5,    t5,    7
+    andi         t6,    t6,    7
+    andi         t7,    t7,    7
+    ldx.bu       t0,    a3,    t0
+    ldx.bu       t1,    a3,    t1
+    ldx.bu       t2,    a3,    t2
+    ldx.bu       t3,    a3,    t3
+    ldx.bu       t4,    a3,    t4
+    ldx.bu       t5,    a3,    t5
+    ldx.bu       t6,    a3,    t6
+    ldx.bu       t7,    a3,    t7
+    vinsgr2vr.h  vr1,   t0,    0
+    vinsgr2vr.h  vr1,   t1,    1
+    vinsgr2vr.h  vr1,   t2,    2
+    vinsgr2vr.h  vr1,   t3,    3
+    vinsgr2vr.h  vr1,   t4,    4
+    vinsgr2vr.h  vr1,   t5,    5
+    vinsgr2vr.h  vr1,   t6,    6
+    vinsgr2vr.h  vr1,   t7,    7
+    vsub.h       vr0,   vr0,   vr0
+    vilvl.h      vr2,   vr0,   vr1
+    vilvh.h      vr3,   vr0,   vr1
+
+    /* a0 = &src[dstW - 8], a1 = &dest[dstW - 8] */
+    addi.d       a0,    a0,    -16
+    add.d        a0,    a0,    t8
+    add.d        a0,    a0,    t8
+    addi.d       a1,    a1,    -8
+    add.d        a1,    a1,    t8
+
+    vld          vr1,   a0,    0
+    vshuf4i.d    vr0,   vr1,   8
+    vexth.w.h    vr4,   vr0
+    vexth.w.h    vr5,   vr1
+
+    vadd.w       vr4,   vr2,   vr4
+    vadd.w       vr5,   vr3,   vr5
+    vsrai.w      vr4,   vr4,   7
+    vsrai.w      vr5,   vr5,   7
+    vclip255.w   vr4,   vr4
+    vclip255.w   vr5,   vr5
+    vpickev.h    vr1,   vr5,   vr4
+    vpickev.b    vr1,   vr1,   vr1
+    fst.d        f1,    a1,    0
+    b            4f
+5:
+    /* Scalar path for dstW < 8: t8 pixels, one per iteration. */
+    beqz         t8,    4f
+    addi.w       t3,    zero,  255      /* upper clip bound */
+6:
+    andi         t0,    a4,    7
+    ldx.bu       t0,    a3,    t0       /* dither[(i + offset) & 7] */
+    ld.h         t1,    a0,    0        /* src[i], sign-extended */
+    add.w        t0,    t0,    t1
+    srai.w       t0,    t0,    7
+    slt          t2,    t0,    zero     /* value < 0 ? */
+    masknez      t0,    t0,    t2       /*   -> 0 */
+    slt          t2,    t3,    t0       /* value > 255 ? */
+    masknez      t0,    t0,    t2       /*   -> drop the value ... */
+    maskeqz      t2,    t3,    t2       /*   ... and select 255 */
+    or           t0,    t0,    t2
+    st.b         t0,    a1,    0
+    addi.d       a0,    a0,    2
+    addi.d       a1,    a1,    1
+    addi.w       a4,    a4,    1
+    addi.w       t8,    t8,    -1
+    bnez         t8,    6b
+4:
+endfunc
+
+/*
+ * void yuv2plane1_8_lasx(const int16_t *src, uint8_t *dest, int dstW,
+ *                        const uint8_t *dither, int offset)
+ *
+ * For every i in [0, dstW):
+ *     dest[i] = clip_to_0_255((src[i] + dither[(i + offset) & 7]) >> 7)
+ *
+ * Arguments arrive as a0 = src, a1 = dest, a2 = dstW, a3 = dither,
+ * a4 = offset.
+ *
+ * Layout:
+ *   1: main loop, 16 pixels per iteration.
+ *   3: tail for dstW >= 16 with dstW % 16 != 0. The last 16 pixels are
+ *      redone as one vector that overlaps pixels already written.
+ *   5: scalar loop for dstW < 16. The overlapping tail steps src back by
+ *      32 bytes and dest back by 16 bytes, which only stays inside the
+ *      buffers once at least one full vector has been processed, so short
+ *      rows must not take that path.
+ */
+function yuv2plane1_8_lasx
+    /* t0..t7 = dither[(offset + j) & 7] for j = 0..7 */
+    addi.w       t1,    a4,    1
+    addi.w       t2,    a4,    2
+    addi.w       t3,    a4,    3
+    addi.w       t4,    a4,    4
+    addi.w       t5,    a4,    5
+    addi.w       t6,    a4,    6
+    addi.w       t7,    a4,    7
+    andi         t0,    a4,    7
+    andi         t1,    t1,    7
+    andi         t2,    t2,    7
+    andi         t3,    t3,    7
+    andi         t4,    t4,    7
+    andi         t5,    t5,    7
+    andi         t6,    t6,    7
+    andi         t7,    t7,    7
+    ldx.bu       t0,    a3,    t0
+    ldx.bu       t1,    a3,    t1
+    ldx.bu       t2,    a3,    t2
+    ldx.bu       t3,    a3,    t3
+    ldx.bu       t4,    a3,    t4
+    ldx.bu       t5,    a3,    t5
+    ldx.bu       t6,    a3,    t6
+    ldx.bu       t7,    a3,    t7
+    /* low 128 bits of xr1 = the eight dither values as 16-bit lanes */
+    vinsgr2vr.h  vr1,   t0,    0
+    vinsgr2vr.h  vr1,   t1,    1
+    vinsgr2vr.h  vr1,   t2,    2
+    vinsgr2vr.h  vr1,   t3,    3
+    vinsgr2vr.h  vr1,   t4,    4
+    vinsgr2vr.h  vr1,   t5,    5
+    vinsgr2vr.h  vr1,   t6,    6
+    vinsgr2vr.h  vr1,   t7,    7
+    xvpermi.q    xr1,   xr1,   0        /* copy them into the high 128 bits */
+    /* per 128-bit lane: xr2 = dither 0..3, xr3 = dither 4..7 (32-bit) */
+    xvsub.h      xr0,   xr0,   xr0
+    xvilvl.h     xr2,   xr0,   xr1
+    xvilvh.h     xr3,   xr0,   xr1
+
+    andi         t8,    a2,    15       /* t8 = dstW % 16 */
+    srli.d       a2,    a2,    4        /* a2 = dstW / 16 */
+    beqz         a2,    5f              /* dstW < 16: scalar path, no overlap */
+1:
+    xvld         xr1,   a0,    0        /* 16 x int16 source samples */
+    addi.d       a0,    a0,    32
+    xvpermi.d    xr0,   xr1,   0xa0     /* per lane: low 64 bits duplicated */
+    xvexth.w.h   xr4,   xr0             /* samples 0..3 | 8..11 */
+    xvexth.w.h   xr5,   xr1             /* samples 4..7 | 12..15 */
+
+    xvadd.w      xr4,   xr2,   xr4
+    xvadd.w      xr5,   xr3,   xr5
+    xvsrai.w     xr4,   xr4,   7
+    xvsrai.w     xr5,   xr5,   7
+    xvclip255.w  xr4,   xr4
+    xvclip255.w  xr5,   xr5
+    /* narrow 32 -> 16 -> 8 bits; each lane holds its 8 result bytes */
+    xvpickev.h   xr1,   xr5,   xr4
+    xvpickev.b   xr0,   xr1,   xr1
+    xvpermi.q    xr1,   xr0,   1        /* bring pixels 8..15 to the low lane */
+    fst.d        f0,    a1,    0
+    fst.d        f1,    a1,    8
+    addi.d       a1,    a1,    16
+    addi.d       a2,    a2,    -1
+    bnez         a2,    1b
+2:
+    beqz         t8,    4f
+3:
+    /*
+     * Overlapping tail: the vector starts at pixel dstW - 16, so the dither
+     * phase advances by t8 (dstW - 16 and t8 are congruent modulo 8).
+     */
+    add.w        a4,    a4,    t8
+    addi.w       t1,    a4,    1
+    addi.w       t2,    a4,    2
+    addi.w       t3,    a4,    3
+    addi.w       t4,    a4,    4
+    addi.w       t5,    a4,    5
+    addi.w       t6,    a4,    6
+    addi.w       t7,    a4,    7
+    andi         t0,    a4,    7
+    andi         t1,    t1,    7
+    andi         t2,    t2,    7
+    andi         t3,    t3,    7
+    andi         t4,    t4,    7
+    andi         t5,    t5,    7
+    andi         t6,    t6,    7
+    andi         t7,    t7,    7
+    ldx.bu       t0,    a3,    t0
+    ldx.bu       t1,    a3,    t1
+    ldx.bu       t2,    a3,    t2
+    ldx.bu       t3,    a3,    t3
+    ldx.bu       t4,    a3,    t4
+    ldx.bu       t5,    a3,    t5
+    ldx.bu       t6,    a3,    t6
+    ldx.bu       t7,    a3,    t7
+    vinsgr2vr.h  vr1,   t0,    0
+    vinsgr2vr.h  vr1,   t1,    1
+    vinsgr2vr.h  vr1,   t2,    2
+    vinsgr2vr.h  vr1,   t3,    3
+    vinsgr2vr.h  vr1,   t4,    4
+    vinsgr2vr.h  vr1,   t5,    5
+    vinsgr2vr.h  vr1,   t6,    6
+    vinsgr2vr.h  vr1,   t7,    7
+    xvpermi.q    xr1,   xr1,   0
+    xvsub.h      xr0,   xr0,   xr0
+    xvilvl.h     xr2,   xr0,   xr1
+    xvilvh.h     xr3,   xr0,   xr1
+
+    /* a0 = &src[dstW - 16], a1 = &dest[dstW - 16] */
+    addi.d       a0,    a0,    -32
+    add.d        a0,    a0,    t8
+    add.d        a0,    a0,    t8
+    addi.d       a1,    a1,    -16
+    add.d        a1,    a1,    t8
+
+    xvld         xr1,   a0,    0
+    xvpermi.d    xr0,   xr1,   0xa0
+    xvexth.w.h   xr4,   xr0
+    xvexth.w.h   xr5,   xr1
+
+    xvadd.w      xr4,   xr2,   xr4
+    xvadd.w      xr5,   xr3,   xr5
+    xvsrai.w     xr4,   xr4,   7
+    xvsrai.w     xr5,   xr5,   7
+    xvclip255.w  xr4,   xr4
+    xvclip255.w  xr5,   xr5
+    xvpickev.h   xr1,   xr5,   xr4
+    xvpickev.b   xr0,   xr1,   xr1
+    xvpermi.q    xr1,   xr0,   1
+    fst.d        f0,    a1,    0
+    fst.d        f1,    a1,    8
+    b            4f
+5:
+    /* Scalar path for dstW < 16: t8 pixels, one per iteration. */
+    beqz         t8,    4f
+    addi.w       t3,    zero,  255      /* upper clip bound */
+6:
+    andi         t0,    a4,    7
+    ldx.bu       t0,    a3,    t0       /* dither[(i + offset) & 7] */
+    ld.h         t1,    a0,    0        /* src[i], sign-extended */
+    add.w        t0,    t0,    t1
+    srai.w       t0,    t0,    7
+    slt          t2,    t0,    zero     /* value < 0 ? */
+    masknez      t0,    t0,    t2       /*   -> 0 */
+    slt          t2,    t3,    t0       /* value > 255 ? */
+    masknez      t0,    t0,    t2       /*   -> drop the value ... */
+    maskeqz      t2,    t3,    t2       /*   ... and select 255 */
+    or           t0,    t0,    t2
+    st.b         t0,    a1,    0
+    addi.d       a0,    a0,    2
+    addi.d       a1,    a1,    1
+    addi.w       a4,    a4,    1
+    addi.w       t8,    t8,    -1
+    bnez         t8,    6b
+4:
+endfunc
--- a/libswscale/loongarch/output_lasx.c
+++ b/libswscale/loongarch/output_lasx.c
@ -22,7 +22,7 @@
 #include "swscale_loongarch.h"
 #include "libavutil/loongarch/loongson_intrinsics.h"

-void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
+void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
                          const int16_t **src, uint8_t *dest, int dstW,
                          const uint8_t *dither, int offset)
 {
@ -1775,8 +1775,27 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full,   AV_PIX_FMT_BGR8,  0)
 YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full,   AV_PIX_FMT_RGB8,  0)


-av_cold void ff_sws_init_output_lasx(SwsContext *c)
+av_cold void ff_sws_init_output_lasx(SwsContext *c,
+                                     yuv2planar1_fn *yuv2plane1,
+                                     yuv2planarX_fn *yuv2planeX,
+                                     yuv2interleavedX_fn *yuv2nv12cX,
+                                     yuv2packed1_fn *yuv2packed1,
+                                     yuv2packed2_fn *yuv2packed2,
+                                     yuv2packedX_fn *yuv2packedX,
+                                     yuv2anyX_fn *yuv2anyX)
 {
+    enum AVPixelFormat dstFormat = c->dstFormat;
+
+    /* Add initialization once optimized */
+    if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
+    } else if (is16BPS(dstFormat)) {
+    } else if (isNBPS(dstFormat)) {
+    } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
+    } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
+    } else {
+        *yuv2plane1 = yuv2plane1_8_lasx;
+        *yuv2planeX = yuv2planeX_8_lasx;
+    }

    if(c->flags & SWS_FULL_CHR_H_INT) {
        switch (c->dstFormat) {
--- a/libswscale/loongarch/output_lsx.c
+++ b/libswscale/loongarch/output_lsx.c
@ -1624,8 +1624,28 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full,   AV_PIX_FMT_BGR8,  0)
 YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full,   AV_PIX_FMT_RGB8,  0)


-av_cold void ff_sws_init_output_lsx(SwsContext *c)
+av_cold void ff_sws_init_output_lsx(SwsContext *c,
+                                    yuv2planar1_fn *yuv2plane1,
+                                    yuv2planarX_fn *yuv2planeX,
+                                    yuv2interleavedX_fn *yuv2nv12cX,
+                                    yuv2packed1_fn *yuv2packed1,
+                                    yuv2packed2_fn *yuv2packed2,
+                                    yuv2packedX_fn *yuv2packedX,
+                                    yuv2anyX_fn *yuv2anyX)
 {
+    enum AVPixelFormat dstFormat = c->dstFormat;
+
+    /* Add initialization once optimized */
+    if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
+    } else if (is16BPS(dstFormat)) {
+    } else if (isNBPS(dstFormat)) {
+    } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
+    } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
+    } else {
+        *yuv2plane1 = yuv2plane1_8_lsx;
+        *yuv2planeX = yuv2planeX_8_lsx;
+    }
+
    if(c->flags & SWS_FULL_CHR_H_INT) {
        switch (c->dstFormat) {
        case AV_PIX_FMT_RGBA:
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@ -60,7 +60,9 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
 {
    int cpu_flags = av_get_cpu_flags();
    if (have_lsx(cpu_flags)) {
-        ff_sws_init_output_lsx(c);
+        ff_sws_init_output_lsx(c, &c->yuv2plane1, &c->yuv2planeX,
+                               &c->yuv2nv12cX, &c->yuv2packed1,
+                               &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
        if (c->srcBpc == 8) {
            if (c->dstBpc <= 14) {
                c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx;
@ -80,12 +82,12 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
            }
            break;
        }
-        if (c->dstBpc == 8)
-            c->yuv2planeX = ff_yuv2planeX_8_lsx;
    }
 #if HAVE_LASX
    if (have_lasx(cpu_flags)) {
-        ff_sws_init_output_lasx(c);
+        ff_sws_init_output_lasx(c, &c->yuv2plane1, &c->yuv2planeX,
+                                &c->yuv2nv12cX, &c->yuv2packed1,
+                                &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
        if (c->srcBpc == 8) {
            if (c->dstBpc <= 14) {
                c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
@ -105,8 +107,6 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
            }
            break;
        }
-        if (c->dstBpc == 8)
-            c->yuv2planeX = ff_yuv2planeX_8_lasx;
    }
 #endif // #if HAVE_LASX
    ff_sws_init_range_convert_loongarch(c);
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@ -61,11 +61,21 @@ void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
 void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4], int width,
                         int32_t *rgb2yuv, void *opq);

-void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
                         const int16_t **src, uint8_t *dest, int dstW,
                         const uint8_t *dither, int offset);

-av_cold void ff_sws_init_output_lsx(SwsContext *c);
+void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
+                      const uint8_t *dither, int offset);
+
+av_cold void ff_sws_init_output_lsx(SwsContext *c,
+                                    yuv2planar1_fn *yuv2plane1,
+                                    yuv2planarX_fn *yuv2planeX,
+                                    yuv2interleavedX_fn *yuv2nv12cX,
+                                    yuv2packed1_fn *yuv2packed1,
+                                    yuv2packed2_fn *yuv2packed2,
+                                    yuv2packedX_fn *yuv2packedX,
+                                    yuv2anyX_fn *yuv2anyX);

 int yuv420_rgb24_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
                     int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
@ -135,12 +145,21 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
                              uint8_t *dest, int width, int height,
                              int src1Stride, int src2Stride, int dstStride);

-void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
+void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
                          const int16_t **src, uint8_t *dest, int dstW,
                          const uint8_t *dither, int offset);

-av_cold void ff_sws_init_output_lasx(SwsContext *c);
+void yuv2plane1_8_lasx(const int16_t *src, uint8_t *dest, int dstW,
+                      const uint8_t *dither, int offset);

+av_cold void ff_sws_init_output_lasx(SwsContext *c,
+                                     yuv2planar1_fn *yuv2plane1,
+                                     yuv2planarX_fn *yuv2planeX,
+                                     yuv2interleavedX_fn *yuv2nv12cX,
+                                     yuv2packed1_fn *yuv2packed1,
+                                     yuv2packed2_fn *yuv2packed2,
+                                     yuv2packedX_fn *yuv2packedX,
+                                     yuv2anyX_fn *yuv2anyX);
 #endif // #if HAVE_LASX

 #endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */