avfilter/vf_colordetect: add aarch64 asm

| rpi5 gcc 12 | m1 clang -fno-vectorize | m1 clang --------------------------------------------------------------------------- alpha_8_full_c: | 32159.2 ( 1.00x) | 135.8 ( 1.00x) | 26.4 ( 1.00x) alpha_8_full_neon: | 1266.0 (25.40x) | 8.0 (17.03x) | 8.4 ( 3.15x) alpha_8_limited_c: | 37561.9 ( 1.00x) | 169.1 ( 1.00x) | 47.7 ( 1.00x) alpha_8_limited_neon: | 3967.0 ( 9.47x) | 12.5 (13.53x) | 13.3 ( 3.59x) alpha_16_full_c: | 15867.9 ( 1.00x) | 64.5 ( 1.00x) | 13.7 ( 1.00x) alpha_16_full_neon: | 1256.9 (12.62x) | 7.9 ( 8.15x) | 8.3 ( 1.64x) alpha_16_limited_c: | 16723.7 ( 1.00x) | 88.7 ( 1.00x) | 103.3 ( 1.00x) alpha_16_limited_neon: | 4031.3 ( 4.15x) | 12.5 ( 7.08x) | 13.2 ( 7.86x) range_8_c: | 21819.7 ( 1.00x) | 120.0 ( 1.00x) | 9.4 ( 1.00x) range_8_neon: | 1148.3 (19.00x) | 4.3 (27.60x) | 4.8 ( 1.97x) range_16_c: | 10757.1 ( 1.00x) | 45.7 ( 1.00x) | 7.9 ( 1.00x) range_16_neon: | 1141.5 ( 9.42x) | 4.4 (10.38x) | 4.6 ( 1.72x)
2025-09-16 08:36:51 +02:00 · 2025-08-21 20:44:37 +08:00
parent 6450e01446
commit eb14d45824
5 changed files with 551 additions and 1 deletions
--- a/libavfilter/aarch64/Makefile
+++ b/libavfilter/aarch64/Makefile
@@ -1,5 +1,7 @@
 OBJS-$(CONFIG_BWDIF_FILTER)                  += aarch64/vf_bwdif_init_aarch64.o
+OBJS-$(CONFIG_COLORDETECT_FILTER)            += aarch64/vf_colordetect_init.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o

 NEON-OBJS-$(CONFIG_BWDIF_FILTER)             += aarch64/vf_bwdif_neon.o
+NEON-OBJS-$(CONFIG_COLORDETECT_FILTER)       += aarch64/vf_colordetect_neon.o
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
--- a/libavfilter/aarch64/vf_colordetect_init.c
+++ b/libavfilter/aarch64/vf_colordetect_init.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2025 Zhao Zhili <quinkblack@foxmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavfilter/vf_colordetect.h"
+
+int ff_detect_alpha_full_neon(const uint8_t *color, ptrdiff_t color_stride,
+                              const uint8_t *alpha, ptrdiff_t alpha_stride,
+                              ptrdiff_t width, ptrdiff_t height,
+                              int alpha_max, int mpeg_range, int offset);
+
+int ff_detect_alpha16_full_neon(const uint8_t *color, ptrdiff_t color_stride,
+                                const uint8_t *alpha, ptrdiff_t alpha_stride,
+                                ptrdiff_t width, ptrdiff_t height,
+                                int alpha_max, int mpeg_range, int offset);
+
+int ff_detect_alpha_limited_neon(const uint8_t *color, ptrdiff_t color_stride,
+                                 const uint8_t *alpha, ptrdiff_t alpha_stride,
+                                 ptrdiff_t width, ptrdiff_t height,
+                                 int alpha_max, int mpeg_range, int offset);
+
+int ff_detect_alpha16_limited_neon(const uint8_t *color, ptrdiff_t color_stride,
+                                   const uint8_t *alpha, ptrdiff_t alpha_stride,
+                                   ptrdiff_t width, ptrdiff_t height,
+                                   int alpha_max, int mpeg_range, int offset);
+
+int ff_detect_range_neon(const uint8_t *data, ptrdiff_t stride,
+                         ptrdiff_t width, ptrdiff_t height,
+                         int mpeg_min, int mpeg_max);
+
+int ff_detect_range16_neon(const uint8_t *data, ptrdiff_t stride,
+                           ptrdiff_t width, ptrdiff_t height,
+                           int mpeg_min, int mpeg_max);
+
+av_cold void ff_color_detect_dsp_init_aarch64(FFColorDetectDSPContext *dsp, int depth,
+                                          enum AVColorRange color_range)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        dsp->detect_range = depth > 8 ? ff_detect_range16_neon : ff_detect_range_neon;
+        if (color_range == AVCOL_RANGE_JPEG)
+            dsp->detect_alpha = depth > 8 ? ff_detect_alpha16_full_neon : ff_detect_alpha_full_neon;
+        else
+            dsp->detect_alpha = depth > 8 ? ff_detect_alpha16_limited_neon : ff_detect_alpha_limited_neon;
+    }
+}
--- a/libavfilter/aarch64/vf_colordetect_neon.S
+++ b/libavfilter/aarch64/vf_colordetect_neon.S
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2025 Zhao Zhili <quinkblack@foxmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define FF_ALPHA_TRANSPARENT        (1 << 0)
+#define FF_ALPHA_STRAIGHT           ((1 << 1) | FF_ALPHA_TRANSPARENT)
+
+const mask
+        .byte           255, 255, 255, 255, 255, 255, 255, 255
+        .byte           255, 255, 255, 255, 255, 255, 255, 255
+mask_start:
+        .byte           0, 0, 0, 0, 0, 0, 0, 0
+        .byte           0, 0, 0, 0, 0, 0, 0, 0
+        .byte           255, 255, 255, 255, 255, 255, 255, 255
+        .byte           255, 255, 255, 255, 255, 255, 255, 255
+endconst
+
+.macro load_mask_zero, shift=0
+        movrel          x9, mask_start
+        sub             x9, x9, x7, lsl #(\shift)
+        ldr             q3, [x9]
+.endm
+
+.macro load_mask, shift=0
+        movrel          x9, mask_start
+        sub             x9, x9, x7, lsl #(\shift)
+        ld1             {v3.16b, v4.16b}, [x9]
+.endm
+
+/* x0: const uint8_t *data
+ * x1: ptrdiff_t stride
+ * x2: ptrdiff_t width
+ * x3: ptrdiff_t height
+ * w4: int mpeg_min
+ * w5: int mpeg_max
+ */
+function ff_detect_range_neon, export=1
+        ands            x7, x2, #15                 // width % 16
+        bic             x8, x2, #15                 // width / 16 * 16
+        bic             x6, x2, #31                 // width / 32 * 32
+        and             x10, x2, #16                // check x8 != x6
+        dup             v0.16b, w4                  // mpeg_min
+        dup             v1.16b, w5                  // mpeg_max
+        movi            v2.16b, #0                  // cond
+        sub             x1, x1, x8
+        b.eq            1f
+        load_mask_zero
+1:
+        cbz             x6, 20f                     // width < 32
+        mov             x12, x6
+2:
+        ld1             {v5.16b, v6.16b}, [x0], #32
+        cmhi            v16.16b, v0.16b, v5.16b
+        cmhi            v17.16b, v5.16b, v1.16b
+        cmhi            v18.16b, v0.16b, v6.16b
+        cmhi            v19.16b, v6.16b, v1.16b
+        orr             v20.16b, v16.16b, v17.16b
+        orr             v21.16b, v18.16b, v19.16b
+        subs            x12, x12, #32
+        orr             v20.16b, v20.16b, v21.16b
+        orr             v2.16b, v2.16b, v20.16b
+        b.gt            2b
+20:
+        cbz             x10, 3f                     // width < 16
+        ldr             q20, [x0], #16
+        cmhi            v16.16b, v0.16b, v20.16b
+        cmhi            v17.16b, v20.16b, v1.16b
+        orr             v16.16b, v16.16b, v17.16b
+        orr             v2.16b, v2.16b, v16.16b
+3:
+        cbz             x7, 4f
+        ldr             q21, [x0]
+        cmhi            v18.16b, v0.16b, v21.16b
+        cmhi            v19.16b, v21.16b, v1.16b
+        orr             v16.16b, v18.16b, v19.16b
+        and             v16.16b, v16.16b, v3.16b
+        orr             v2.16b, v2.16b, v16.16b
+4:
+        umaxv           b4, v2.16b
+        subs            x3, x3, #1
+        umov            w9, v4.b[0]
+        add             x0, x0, x1
+        cbnz            w9, 8f
+        b.gt            1b
+        mov             x0, #0
+        ret
+8:
+        mov             x0, #1
+        ret
+endfunc
+
+/* x0: const uint8_t *data
+ * x1: ptrdiff_t stride
+ * x2: ptrdiff_t width
+ * x3: ptrdiff_t height
+ * w4: int mpeg_min
+ * w5: int mpeg_max
+ */
+function ff_detect_range16_neon, export=1
+        ands            x7, x2, #7                  // width % 7
+        bic             x8, x2, #7                  // width / 8 * 8
+        bic             x6, x2, #15                 // width / 16 * 16
+        and             x10, x2, #8                 // check x8 != x6
+        dup             v0.8h, w4                   // mpeg_min
+        dup             v1.8h, w5                   // mpeg_max
+        movi            v2.16b, #0                  // cond
+        sub             x1, x1, x8, lsl #1
+        b.eq            1f
+        load_mask_zero  shift=1
+1:
+        cbz             x6, 20f                     // width < 16
+        mov             x12, x6
+2:
+        ld1             {v5.8h, v6.8h}, [x0], #32
+        cmhi            v16.8h, v0.8h, v5.8h
+        cmhi            v17.8h, v5.8h, v1.8h
+        cmhi            v18.8h, v0.8h, v6.8h
+        cmhi            v19.8h, v6.8h, v1.8h
+        orr             v20.16b, v16.16b, v17.16b
+        orr             v21.16b, v18.16b, v19.16b
+        subs            x12, x12, #16
+        orr             v20.16b, v20.16b, v21.16b
+        orr             v2.16b, v2.16b, v20.16b
+        b.gt            2b
+20:
+        cbz             x10, 3f                     // width < 8
+        ldr             q20, [x0], #16
+        cmhi            v16.8h, v0.8h, v20.8h
+        cmhi            v17.8h, v20.8h, v1.8h
+        orr             v16.16b, v16.16b, v17.16b
+        orr             v2.16b, v2.16b, v16.16b
+3:
+        cbz             x7, 4f
+        ldr             q21, [x0]
+        cmhi            v18.8h, v0.8h, v21.8h
+        cmhi            v19.8h, v21.8h, v1.8h
+        orr             v16.16b, v18.16b, v19.16b
+        and             v16.16b, v16.16b, v3.16b
+        orr             v2.16b, v2.16b, v16.16b
+4:
+        umaxv           h4, v2.8h
+        subs            x3, x3, #1
+        umov            w9, v4.h[0]
+        add             x0, x0, x1
+        cbnz            w9, 8f
+        b.gt            1b
+        mov             x0, #0
+        ret
+8:
+        mov             x0, #1
+        ret
+endfunc
+
+/*
+ * x0: const uint8_t *color,
+ * x1: ptrdiff_t color_stride,
+ * x2: const uint8_t *alpha,
+ * x3: ptrdiff_t alpha_stride,
+ * x4: ptrdiff_t width,
+ * x5: ptrdiff_t height,
+ * w6: int alpha_max,
+ */
+function ff_detect_alpha_full_neon, export=1
+        ands            x7, x4, #15             // width % 16
+        bic             x8, x4, #15             // width / 16 * 16
+        movi            v0.16b, #0
+        movi            v1.16b, #255
+        dup             v2.16b, w6              // alpha_max
+        sub             x1, x1, x8              // color_stride - aligned_width
+        sub             x3, x3, x8              // alpha_stride - aligned_width
+        b.eq            1f
+
+        // Create mask for non-aligned width
+        load_mask
+1:
+        cbz             x8, 20f                 // width < 16
+        mov             x12, x8                 // w12: aligned_width
+2:
+        ldr             q5, [x0], #16
+        ldr             q6, [x2], #16
+        subs            x12, x12, #16
+        cmhi            v7.16b, v5.16b, v6.16b
+        cmeq            v16.16b, v6.16b, v2.16b
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+        b.gt            2b
+20:
+        cbz             w7, 3f
+        // handle loop tail
+        ldr             q5, [x0]
+        ldr             q6, [x2]
+        cmhi            v7.16b, v5.16b, v6.16b
+        cmeq            v16.16b, v6.16b, v2.16b
+        and             v7.16b, v7.16b, v3.16b
+        orr             v16.16b, v16.16b, v4.16b
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+3:
+        umaxv           b17, v0.16b
+        subs            x5, x5, #1
+        umov            w9, v17.b[0]
+        add             x0, x0, x1
+        add             x2, x2, x3
+        cbnz            w9, 4f
+        b.gt            1b
+
+        uminv           b1, v1.16b
+        umov            w9, v1.b[0]
+        mov             x0, #0
+        cbnz            w9, 5f
+        mov             x0, #FF_ALPHA_TRANSPARENT
+        ret
+4:
+        mov             x0, #FF_ALPHA_STRAIGHT
+5:
+        ret
+endfunc
+
+/*
+ * x0: const uint8_t *color,
+ * x1: ptrdiff_t color_stride,
+ * x2: const uint8_t *alpha,
+ * x3: ptrdiff_t alpha_stride,
+ * x4: ptrdiff_t width,
+ * x5: ptrdiff_t height,
+ * w6: int alpha_max,
+ */
+function ff_detect_alpha16_full_neon, export=1
+        ands            x7, x4, #7              // width % 8
+        bic             x8, x4, #7              // width / 8 * 8
+        movi            v0.8h, #0
+        movi            v1.16b, #255
+        dup             v2.8h, w6               // alpha_max
+        sub             x1, x1, x8, lsl #1      // color_stride - (aligned_width * 2)
+        sub             x3, x3, x8, lsl #1      // alpha_stride - (aligned_width * 2)
+        b.eq            1f
+
+        // Create mask for non-aligned width
+        load_mask       shift=1
+1:
+        cbz             x8, 20f                 // width < 8
+        mov             x12, x8                 // w12: aligned_width
+2:
+        ldr             q5, [x0], #16
+        ldr             q6, [x2], #16
+        subs            x12, x12, #8
+        cmhi            v7.8h, v5.8h, v6.8h
+        cmeq            v16.8h, v6.8h, v2.8h
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+        b.gt            2b
+20:
+        cbz             w7, 3f
+        // handle loop tail
+        ldr             q5, [x0]
+        ldr             q6, [x2]
+        cmhi            v7.8h, v5.8h, v6.8h
+        cmeq            v16.8h, v6.8h, v2.8h
+        and             v7.16b, v7.16b, v3.16b
+        orr             v16.16b, v16.16b, v4.16b
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+3:
+        umaxv           h17, v0.8h
+        subs            x5, x5, #1
+        umov            w9, v17.h[0]
+        add             x0, x0, x1
+        add             x2, x2, x3
+        cbnz            w9, 4f
+        b.gt            1b
+
+        uminv           h1, v1.8h
+        umov            w9, v1.h[0]
+        mov             x0, #0
+        cbnz            w9, 5f
+        mov             x0, #FF_ALPHA_TRANSPARENT
+        ret
+4:
+        mov             x0, #FF_ALPHA_STRAIGHT
+5:
+        ret
+endfunc
+
+/*
+ * x0: const uint8_t *color,
+ * x1: ptrdiff_t color_stride,
+ * x2: const uint8_t *alpha,
+ * x3: ptrdiff_t alpha_stride,
+ * x4: ptrdiff_t width,
+ * x5: ptrdiff_t height,
+ * w6: int alpha_max,
+ * w7: int mpeg_range
+ * [sp]: int offset
+ */
+function ff_detect_alpha_limited_neon, export=1
+        dup             v17.16b, w7             // mpeg_range
+        ldr             w13, [sp]
+        movi            v0.16b, #0
+        movi            v1.16b, #255
+        dup             v2.16b, w6              // alpha_max
+        ands            x7, x4, #15             // width % 16
+        bic             x8, x4, #15             // width / 16 * 16
+        dup             v18.8h, w13             // offset
+        sub             x1, x1, x8              // color_stride - aligned_width
+        sub             x3, x3, x8              // alpha_stride - aligned_width
+        b.eq            1f
+
+        // Create mask for non-aligned width
+        load_mask
+1:
+        cbz             x8, 20f                     // width < 16
+        mov             x12, x8                     // w12: aligned_width
+2:
+        ldr             q5, [x0], #16               // color
+        ldr             q6, [x2], #16               // alpha
+        umull           v19.8h, v2.8b, v5.8b        // alpha_max * color
+        umull2          v20.8h, v2.16b, v5.16b      // alpha_max * color
+        umull           v21.8h, v17.8b, v6.8b       // range * alpha
+        umull2          v22.8h, v17.16b, v6.16b     // range * alpha
+        cmeq            v16.16b, v6.16b, v2.16b
+        subs            x12, x12, #16
+        uqsub           v19.8h, v19.8h, v18.8h      // alpha_max * color - offset
+        uqsub           v20.8h, v20.8h, v18.8h      // alpha_max * color - offset
+
+        cmhi            v19.8h, v19.8h, v21.8h
+        cmhi            v20.8h, v20.8h, v22.8h
+        orr             v7.16b, v19.16b, v20.16b
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+        b.gt            2b
+20:
+        cbz             w7, 3f
+        // handle loop tail
+        ldr             q5, [x0]
+        ldr             q6, [x2]
+        umull           v19.8h, v2.8b, v5.8b        // alpha_max * color
+        umull2          v20.8h, v2.16b, v5.16b      // alpha_max * color
+        umull           v21.8h, v17.8b, v6.8b       // range * alpha
+        umull2          v22.8h, v17.16b, v6.16b     // range * alpha
+        uqsub           v19.8h, v19.8h, v18.8h      // alpha_max * color - offset
+        uqsub           v20.8h, v20.8h, v18.8h      // alpha_max * color - offset
+
+        cmhi            v19.8h, v19.8h, v21.8h
+        cmhi            v20.8h, v20.8h, v22.8h
+        uqxtn           v7.8b, v19.8h
+        uqxtn2          v7.16b, v20.8h
+        cmeq            v16.16b, v6.16b, v2.16b
+
+        and             v7.16b, v7.16b, v3.16b
+        orr             v16.16b, v16.16b, v4.16b
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+3:
+        umaxv           b23, v0.16b
+        subs            x5, x5, #1
+        umov            w9, v23.b[0]
+        add             x0, x0, x1
+        add             x2, x2, x3
+        cbnz            w9, 4f
+        b.gt            1b
+
+        uminv           b1, v1.16b
+        umov            w9, v1.b[0]
+        mov             x0, #0
+        cbnz            w9, 5f
+        mov             x0, #FF_ALPHA_TRANSPARENT
+        ret
+4:
+        mov             x0, #FF_ALPHA_STRAIGHT
+5:
+        ret
+endfunc
+
+/*
+ * x0: const uint8_t *color,
+ * x1: ptrdiff_t color_stride,
+ * x2: const uint8_t *alpha,
+ * x3: ptrdiff_t alpha_stride,
+ * x4: ptrdiff_t width,
+ * x5: ptrdiff_t height,
+ * w6: int alpha_max,
+ * w7: int mpeg_range
+ * [sp]: int offset
+ */
+function ff_detect_alpha16_limited_neon, export=1
+        dup             v17.8h, w7                  // mpeg_range
+        ldr             w13, [sp]
+        movi            v0.8h, #0
+        movi            v1.16b, #255
+        dup             v2.8h, w6                   // alpha_max
+        ands            x7, x4, #7                  // width % 8
+        bic             x8, x4, #7                  // width / 8 * 8
+        dup             v18.4s, w13                 // offset
+        sub             x1, x1, x8, lsl #1          // color_stride - (aligned_width * 2)
+        sub             x3, x3, x8, lsl #1          // alpha_stride - (aligned_width * 2)
+        b.eq            1f
+
+        // Create mask for non-aligned width
+        load_mask       shift=1
+1:
+        cbz             x8, 20f                     // width < 8
+        mov             x12, x8                     // w12: aligned_width
+2:
+        ldr             q5, [x0], #16
+        ldr             q6, [x2], #16
+        umull           v19.4s, v2.4h, v5.4h        // alpha_max * color
+        umull2          v20.4s, v2.8h, v5.8h        // alpha_max * color
+        umull           v21.4s, v17.4h, v6.4h       // range * alpha
+        umull2          v22.4s, v17.8h, v6.8h       // range * alpha
+        cmeq            v16.8h, v6.8h, v2.8h
+        subs            x12, x12, #8
+        uqsub           v19.4s, v19.4s, v18.4s      // alpha_max * color - offset
+        uqsub           v20.4s, v20.4s, v18.4s      // alpha_max * color - offset
+
+        cmhi            v19.4s, v19.4s, v21.4s
+        cmhi            v20.4s, v20.4s, v22.4s
+        orr             v7.16b, v19.16b, v20.16b
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+        b.gt            2b
+20:
+        cbz             w7, 3f
+        // handle loop tail
+        ldr             q5, [x0]
+        ldr             q6, [x2]
+        umull           v19.4s, v2.4h, v5.4h        // alpha_max * color
+        umull2          v20.4s, v2.8h, v5.8h        // alpha_max * color
+        umull           v21.4s, v17.4h, v6.4h       // range * alpha
+        umull2          v22.4s, v17.8h, v6.8h       // range * alpha
+        uqsub           v19.4s, v19.4s, v18.4s      // alpha_max * color - offset
+        uqsub           v20.4s, v20.4s, v18.4s      // alpha_max * color - offset
+
+        cmhi            v19.4s, v19.4s, v21.4s
+        cmhi            v20.4s, v20.4s, v22.4s
+        uqxtn           v7.4h, v19.4s
+        uqxtn2          v7.8h, v20.4s
+        cmeq            v16.8h, v6.8h, v2.8h
+
+        and             v7.16b, v7.16b, v3.16b
+        orr             v16.16b, v16.16b, v4.16b
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+3:
+        umaxv           s23, v0.4s
+        subs            x5, x5, #1
+        umov            w9, v23.s[0]
+        add             x0, x0, x1
+        add             x2, x2, x3
+        cbnz            w9, 4f
+        b.gt            1b
+
+        uminv           h1, v1.8h
+        umov            w9, v1.h[0]
+        mov             x0, #0
+        cbnz            w9, 5f
+        mov             x0, #FF_ALPHA_TRANSPARENT
+        ret
+4:
+        mov             x0, #FF_ALPHA_STRAIGHT
+5:
+        ret
+endfunc
--- a/libavfilter/vf_colordetect.c
+++ b/libavfilter/vf_colordetect.c
@@ -236,7 +236,9 @@ av_cold void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth,
        dsp->detect_alpha = depth > 8 ? ff_detect_alpha16_limited_c : ff_detect_alpha_limited_c;
    }

-#if ARCH_X86
+#if ARCH_AARCH64
+    ff_color_detect_dsp_init_aarch64(dsp, depth, color_range);
+#elif ARCH_X86
    ff_color_detect_dsp_init_x86(dsp, depth, color_range);
 #endif
 }
--- a/libavfilter/vf_colordetect.h
+++ b/libavfilter/vf_colordetect.h
@@ -50,6 +50,8 @@ typedef struct FFColorDetectDSPContext {
 void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth,
                              enum AVColorRange color_range);

+void ff_color_detect_dsp_init_aarch64(FFColorDetectDSPContext *dsp, int depth,
+                                      enum AVColorRange color_range);
 void ff_color_detect_dsp_init_x86(FFColorDetectDSPContext *dsp, int depth,
                                  enum AVColorRange color_range);