1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-08-04 22:03:09 +02:00

avfilter/vf_colordetect: add x86 SIMD implementation

alphadetect8_full_c:                                  5658.2 ( 1.00x)
alphadetect8_full_avx2:                                215.1 (26.31x)
alphadetect8_full_avx512:                              133.5 (42.40x)
alphadetect8_limited_c:                               7391.5 ( 1.00x)
alphadetect8_limited_avx2:                             649.3 (11.38x)
alphadetect8_limited_avx512:                           330.5 (22.36x)
alphadetect16_full_c:                                 3027.4 ( 1.00x)
alphadetect16_full_avx2:                               209.4 (14.46x)
alphadetect16_full_avx512:                             141.4 (21.41x)
alphadetect16_limited_c:                              3880.9 ( 1.00x)
alphadetect16_limited_avx2:                            734.9 ( 5.28x)
alphadetect16_limited_avx512:                          349.2 (11.11x)
rangedetect8_c:                                       5854.2 ( 1.00x)
rangedetect8_avx2:                                     138.9 (42.15x)
rangedetect8_avx512:                                   106.2 (55.12x)
rangedetect16_c:                                      4122.0 ( 1.00x)
rangedetect16_avx2:                                    138.6 (29.74x)
rangedetect16_avx512:                                  104.1 (39.60x)
This commit is contained in:
Niklas Haas
2025-07-16 14:02:43 +02:00
parent 545f721b44
commit 8b647b3f8a
5 changed files with 264 additions and 0 deletions

View File

@ -232,6 +232,10 @@ static av_cold void uninit(AVFilterContext *ctx)
av_cold void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth, av_cold void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth,
enum AVColorRange color_range) enum AVColorRange color_range)
{ {
#if ARCH_X86
ff_color_detect_dsp_init_x86(dsp, depth, color_range);
#endif
if (!dsp->detect_range) if (!dsp->detect_range)
dsp->detect_range = depth > 8 ? ff_detect_range16_c : ff_detect_range_c; dsp->detect_range = depth > 8 ? ff_detect_range16_c : ff_detect_range_c;
if (!dsp->detect_alpha) { if (!dsp->detect_alpha) {

View File

@ -41,6 +41,9 @@ typedef struct FFColorDetectDSPContext {
void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth, void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth,
enum AVColorRange color_range); enum AVColorRange color_range);
void ff_color_detect_dsp_init_x86(FFColorDetectDSPContext *dsp, int depth,
enum AVColorRange color_range);
static inline int ff_detect_range_c(const uint8_t *data, ptrdiff_t stride, static inline int ff_detect_range_c(const uint8_t *data, ptrdiff_t stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t width, ptrdiff_t height,
int mpeg_min, int mpeg_max) int mpeg_min, int mpeg_max)

View File

@ -6,6 +6,7 @@ OBJS-$(CONFIG_ATADENOISE_FILTER) += x86/vf_atadenoise_init.o
OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect_init.o OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect_init.o
OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o
OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o
OBJS-$(CONFIG_COLORDETECT_FILTER) += x86/vf_colordetect_init.o
OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o
OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution_init.o OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution_init.o
OBJS-$(CONFIG_EBUR128_FILTER) += x86/f_ebur128_init.o OBJS-$(CONFIG_EBUR128_FILTER) += x86/f_ebur128_init.o
@ -53,6 +54,7 @@ X86ASM-OBJS-$(CONFIG_ATADENOISE_FILTER) += x86/vf_atadenoise.o
X86ASM-OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect.o X86ASM-OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect.o
X86ASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o X86ASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o
X86ASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o X86ASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o
X86ASM-OBJS-$(CONFIG_COLORDETECT_FILTER) += x86/vf_colordetect.o
X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o
X86ASM-OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution.o X86ASM-OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution.o
X86ASM-OBJS-$(CONFIG_EBUR128_FILTER) += x86/f_ebur128.o X86ASM-OBJS-$(CONFIG_EBUR128_FILTER) += x86/f_ebur128.o

View File

@ -0,0 +1,150 @@
;*****************************************************************************
;* x86-optimized functions for colordetect filter
;*
;* Copyright (C) 2025 Niklas Haas
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
;------------------------------------------------------------------------------
; detect_range_fn <suffix>
;
; Emits detect_range<suffix>_<cpu>(data, stride, width, height,
;                                  mpeg_min, mpeg_max)
;
;   suffix   - element size used by pminu/pmaxu/vpbroadcast: b (bytes) or
;              w (words)
;   width    - row length in BYTES; the inner loop is bottom-tested and
;              advances by mmsize, so it must be a non-zero multiple of mmsize
;   height   - number of rows; the row loop is bottom-tested as well
;
; Returns 1 in eax/rax as soon as a row contains an element below mpeg_min or
; above mpeg_max, and 0 once every row has been scanned without a hit.
;
; mpeg_min/mpeg_max are only needed for the initial broadcast, so just the
; first four arguments are loaded into GPRs.  The limits are then read through
; their "m" aliases: where the ABI passes them on the stack this gives a memory
; operand, which is a valid broadcast source for both AVX2 and AVX-512.  A GPR
; source only exists in the EVEX (AVX-512BW) encoding, so it must never be
; used by the AVX2 instantiation.
;------------------------------------------------------------------------------
%macro detect_range_fn 1 ; suffix
cglobal detect_range%1, 4, 7, 5, data, stride, width, height, mpeg_min, mpeg_max, x
%if UNIX64 && notcpuflag(avx512)
    ; UNIX64 passes both limits in registers; AVX2 can only broadcast from an
    ; xmm register or memory, so bounce them through xm0/xm1.
    movd            xm0, mpeg_mind
    movd            xm1, mpeg_maxd
    vpbroadcast%1    m0, xm0
    vpbroadcast%1    m1, xm1
%else
    ; AVX-512 on UNIX64: the "m" alias is the GPR itself (EVEX GPR broadcast).
    ; Everywhere else: the "m" alias is the argument's stack slot.
    vpbroadcast%1    m0, mpeg_minm
    vpbroadcast%1    m1, mpeg_maxm
%endif
    ; Point data at the end of the row and index it with a negative offset
    ; that counts up towards zero.
    add           dataq, widthq
    neg          widthq
.lineloop:
    mova             m2, m0             ; running minimum, seeded with mpeg_min
    mova             m3, m1             ; running maximum, seeded with mpeg_max
    mov              xq, widthq
.loop:
    movu             m4, [dataq + xq]
    pminu%1          m2, m4
    pmaxu%1          m3, m4
    add              xq, mmsize
    jl .loop
    ; test if the data is out of range: the accumulators only differ from
    ; their seeds if some element went below mpeg_min or above mpeg_max
    pxor             m2, m0
%if cpuflag(avx512)
    vpternlogq       m2, m3, m1, 0xF6   ; m2 |= m3 ^ m1
    vptestmq         k1, m2, m2
    kortestb         k1, k1
%else
    pxor             m3, m1
    por              m2, m3
    ptest            m2, m2
%endif
    jnz .end                            ; ZF clear -> out-of-range data found
    add           dataq, strideq
    dec         heightq
    jg .lineloop
.end:
    ; ZF is clear when arriving via "jnz .end" and set when falling through
    ; after "dec heightq" reached zero, so setnz produces the return value.
    setnz            al
    movzx           rax, al
    RET
%endmacro
;------------------------------------------------------------------------------
; detect_alpha_fn <suffix>, <hsuffix>, <range>
;
; Emits detect_alpha<suffix>_<range>_<cpu>(color, color_stride,
;                                          alpha, alpha_stride,
;                                          width, height, p, q, k)
;
;   suffix  - element size of the input samples: b (bytes) or w (words)
;   hsuffix - the next wider element size (w or d), used for the widened
;             arithmetic of the "limited" variant
;   range   - "full":    flags any element where color > alpha
;             "limited": flags any element where
;                        saturating(color * p - k) > alpha * q,
;                        evaluated in hsuffix-sized lanes
;
; width is a byte count. Both loops are bottom-tested, so one vector per row
; and one row are always processed.
; NOTE(review): callers presumably have to pass a non-zero width that is a
; multiple of the per-iteration step, and height >= 1 - confirm against the
; C wrapper.
;
; Returns 1 in rax if any row contains a flagged element, otherwise 0.
;------------------------------------------------------------------------------
%macro detect_alpha_fn 3 ; suffix, hsuffix, range
cglobal detect_alpha%1_%3, 6, 7, 6, color, color_stride, alpha, alpha_stride, width, height, x
; m0 accumulates the "color exceeds alpha" bits for the current row; it is
; only ever OR-ed into, and a non-zero value leaves the function right away.
pxor m0, m0
; Point both planes at the end of the row and index them with a negative
; offset that counts up towards zero.
add colorq, widthq
add alphaq, widthq
neg widthq
%ifidn %3, limited
; p, q and k are the 7th..9th arguments and are read through their r6m..r8m
; argument aliases.
vpbroadcast%2 m3, r6m ; p
vpbroadcast%2 m4, r7m ; q
vpbroadcast%2 m5, r8m ; k
%endif
.lineloop:
mov xq, widthq
.loop:
%ifidn %3, full
; full range: m1 = max(color, alpha); it differs from alpha (m2) exactly
; where color > alpha
movu m1, [colorq + xq]
movu m2, [alphaq + xq]
pmaxu%1 m1, m2
%else
; limited range: zero-extend both inputs to the wider lane size, then scale
; color by p and alpha by q
pmovzx%1%2 m1, [colorq + xq]
pmovzx%1%2 m2, [alphaq + xq]
pmull%2 m1, m3
pmull%2 m2, m4
%ifidn %1, b
; unsigned saturating subtract of k on word lanes
psubusw m1, m5
%else
; there is no unsigned saturating subtract for dwords, so emulate it as
; max(m1, k) - k
pmaxud m1, m5
psubd m1, m5
%endif
; m1 = max(scaled color, scaled alpha); differs from m2 exactly where the
; scaled color is larger
pmaxu%2 m1, m2
%endif
%if cpuflag(avx512)
vpternlogq m0, m1, m2, 0xF6 ; m0 |= m1 ^ m2
%else
pxor m1, m2
por m0, m1
%endif
%ifidn %3, full
add xq, mmsize
%else
; the widening load consumes only half a vector of source bytes per iteration
add xq, mmsize >> 1
%endif
jl .loop
; end of row: leave as soon as any accumulated bit is set
%if cpuflag(avx512)
vptestmq k1, m0, m0
kortestb k1, k1
%else
ptest m0, m0
%endif
jnz .found
add colorq, color_strideq
add alphaq, alpha_strideq
dec heightq
jg .lineloop
; all rows scanned without a hit
xor rax, rax
RET
.found:
mov rax, 1
RET
%endmacro
; Instantiate every colordetect kernel for the instruction set selected by the
; preceding INIT_* directive: range detection for byte (b) and word (w)
; samples, followed by full- and limited-range alpha detection for the same
; two sample sizes.
%macro colordetect_kernels 0
    detect_range_fn b
    detect_range_fn w
    detect_alpha_fn b, w, full
    detect_alpha_fn w, d, full
    detect_alpha_fn b, w, limited
    detect_alpha_fn w, d, limited
%endmacro

INIT_YMM avx2
colordetect_kernels

INIT_ZMM avx512
colordetect_kernels

View File

@ -0,0 +1,105 @@
/*
* Copyright (c) 2025 Niklas Haas
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavfilter/vf_colordetect.h"
/**
 * Declare the assembly kernel ASM_FUNC_NAME and define FUNC_NAME, a wrapper
 * with the same signature that splits every row into a vectorized head and a
 * scalar tail.
 *
 * @param FUNC_NAME     name of the static wrapper to define
 * @param ASM_FUNC_NAME external SIMD kernel; receives the row width in BYTES
 * @param C_FUNC_NAME   scalar implementation; receives the width in samples
 * @param SHIFT         log2 of the sample size in bytes (0: 8 bit, 1: 16 bit)
 * @param MMSIZE        vector size of the kernel in bytes (power of two)
 *
 * The kernel is only given the part of each row that is a whole number of
 * vectors. Its loops are bottom-tested, so it always processes at least one
 * full vector per row; it is therefore skipped entirely when the row is
 * narrower than one vector. Otherwise it would read past the end of the row
 * and could report a hit caused by bytes that are not part of the image.
 *
 * The wrapper returns the kernel's result if it is non-zero; otherwise it
 * returns the result of C_FUNC_NAME on the remaining columns.
 */
#define DETECT_RANGE_FUNC(FUNC_NAME, ASM_FUNC_NAME, C_FUNC_NAME, SHIFT, MMSIZE) \
int ASM_FUNC_NAME(const uint8_t *src, ptrdiff_t stride,                         \
                  ptrdiff_t width, ptrdiff_t height, int min, int max);         \
                                                                                \
static int FUNC_NAME(const uint8_t *src, ptrdiff_t stride,                      \
                     ptrdiff_t width, ptrdiff_t height, int min, int max)       \
{                                                                               \
    const ptrdiff_t bytes = (width << (SHIFT)) & ~((MMSIZE) - 1);               \
                                                                                \
    /* No complete vector in the row: leave everything to the C code. */        \
    if (bytes > 0) {                                                            \
        int ret = ASM_FUNC_NAME(src, stride, bytes, height, min, max);          \
        if (ret)                                                                \
            return ret;                                                         \
    }                                                                           \
                                                                                \
    return C_FUNC_NAME(src + bytes, stride, width - (bytes >> (SHIFT)),         \
                       height, min, max);                                       \
}
/**
 * Declare the assembly kernel ASM_FUNC_NAME and define FUNC_NAME, a wrapper
 * with the same signature that splits every row of the color and alpha planes
 * into a vectorized head and a scalar tail.
 *
 * @param FUNC_NAME     name of the static wrapper to define
 * @param ASM_FUNC_NAME external SIMD kernel; receives the row width in BYTES
 * @param C_FUNC_NAME   scalar implementation; receives the width in samples
 * @param SHIFT         log2 of the sample size in bytes (0: 8 bit, 1: 16 bit)
 * @param MMSIZE        vector size of the kernel in bytes (power of two)
 *
 * The kernel is only given the part of each row that is a whole number of
 * vectors. Its loops are bottom-tested, so it always processes at least one
 * iteration per row; it is therefore skipped entirely when the row is
 * narrower than one vector. Otherwise it would read past the end of both
 * rows and could report a hit caused by bytes that are not part of the image.
 *
 * The wrapper returns the kernel's result if it is non-zero; otherwise it
 * returns the result of C_FUNC_NAME on the remaining columns.
 */
#define DETECT_ALPHA_FUNC(FUNC_NAME, ASM_FUNC_NAME, C_FUNC_NAME, SHIFT, MMSIZE) \
int ASM_FUNC_NAME(const uint8_t *color, ptrdiff_t color_stride,                 \
                  const uint8_t *alpha, ptrdiff_t alpha_stride,                 \
                  ptrdiff_t width, ptrdiff_t height, int p, int q, int k);      \
                                                                                \
static int FUNC_NAME(const uint8_t *color, ptrdiff_t color_stride,              \
                     const uint8_t *alpha, ptrdiff_t alpha_stride,              \
                     ptrdiff_t width, ptrdiff_t height, int p, int q, int k)    \
{                                                                               \
    const ptrdiff_t bytes = (width << (SHIFT)) & ~((MMSIZE) - 1);               \
                                                                                \
    /* No complete vector in the row: leave everything to the C code. */        \
    if (bytes > 0) {                                                            \
        int ret = ASM_FUNC_NAME(color, color_stride, alpha, alpha_stride,       \
                                bytes, height, p, q, k);                        \
        if (ret)                                                                \
            return ret;                                                         \
    }                                                                           \
                                                                                \
    return C_FUNC_NAME(color + bytes, color_stride,                             \
                       alpha + bytes, alpha_stride,                             \
                       width - (bytes >> (SHIFT)), height, p, q, k);            \
}
/*
 * Wrapper instantiations, one group per instruction set. The last two macro
 * arguments are the sample-size shift (0: 8 bit, 1: 16 bit) and the vector
 * width in bytes of the corresponding kernel (32 for AVX2, 64 for AVX-512).
 */
#if HAVE_X86ASM

#if HAVE_AVX2_EXTERNAL
DETECT_RANGE_FUNC(detect_range_avx2,   ff_detect_rangeb_avx2, ff_detect_range_c,   0, 32)
DETECT_RANGE_FUNC(detect_range16_avx2, ff_detect_rangew_avx2, ff_detect_range16_c, 1, 32)
DETECT_ALPHA_FUNC(detect_alpha_full_avx2,      ff_detect_alphab_full_avx2,    ff_detect_alpha_full_c,      0, 32)
DETECT_ALPHA_FUNC(detect_alpha16_full_avx2,    ff_detect_alphaw_full_avx2,    ff_detect_alpha16_full_c,    1, 32)
DETECT_ALPHA_FUNC(detect_alpha_limited_avx2,   ff_detect_alphab_limited_avx2, ff_detect_alpha_limited_c,   0, 32)
DETECT_ALPHA_FUNC(detect_alpha16_limited_avx2, ff_detect_alphaw_limited_avx2, ff_detect_alpha16_limited_c, 1, 32)
#endif /* HAVE_AVX2_EXTERNAL */

#if HAVE_AVX512_EXTERNAL
DETECT_RANGE_FUNC(detect_range_avx512,   ff_detect_rangeb_avx512, ff_detect_range_c,   0, 64)
DETECT_RANGE_FUNC(detect_range16_avx512, ff_detect_rangew_avx512, ff_detect_range16_c, 1, 64)
DETECT_ALPHA_FUNC(detect_alpha_full_avx512,      ff_detect_alphab_full_avx512,    ff_detect_alpha_full_c,      0, 64)
DETECT_ALPHA_FUNC(detect_alpha16_full_avx512,    ff_detect_alphaw_full_avx512,    ff_detect_alpha16_full_c,    1, 64)
DETECT_ALPHA_FUNC(detect_alpha_limited_avx512,   ff_detect_alphab_limited_avx512, ff_detect_alpha_limited_c,   0, 64)
DETECT_ALPHA_FUNC(detect_alpha16_limited_avx512, ff_detect_alphaw_limited_avx512, ff_detect_alpha16_limited_c, 1, 64)
#endif /* HAVE_AVX512_EXTERNAL */

#endif /* HAVE_X86ASM */
/**
 * Install the x86 SIMD colordetect routines supported by the running CPU.
 *
 * @param dsp         context whose detect_range / detect_alpha pointers are
 *                    set; both are left untouched when no supported
 *                    instruction set is available
 * @param depth       bit depth; values above 8 select the 16-bit routines
 * @param color_range AVCOL_RANGE_JPEG selects the full-range alpha check,
 *                    any other value the limited-range one
 *
 * AVX-512 is probed after AVX2, so it takes precedence when both are usable.
 */
av_cold void ff_color_detect_dsp_init_x86(FFColorDetectDSPContext *dsp, int depth,
                                          enum AVColorRange color_range)
{
#if HAVE_X86ASM
    int flags = av_get_cpu_flags();

#if HAVE_AVX2_EXTERNAL
    if (EXTERNAL_AVX2(flags)) {
        if (depth > 8) {
            dsp->detect_range = detect_range16_avx2;
            if (color_range == AVCOL_RANGE_JPEG)
                dsp->detect_alpha = detect_alpha16_full_avx2;
            else
                dsp->detect_alpha = detect_alpha16_limited_avx2;
        } else {
            dsp->detect_range = detect_range_avx2;
            if (color_range == AVCOL_RANGE_JPEG)
                dsp->detect_alpha = detect_alpha_full_avx2;
            else
                dsp->detect_alpha = detect_alpha_limited_avx2;
        }
    }
#endif

#if HAVE_AVX512_EXTERNAL
    if (EXTERNAL_AVX512(flags)) {
        if (depth > 8) {
            dsp->detect_range = detect_range16_avx512;
            if (color_range == AVCOL_RANGE_JPEG)
                dsp->detect_alpha = detect_alpha16_full_avx512;
            else
                dsp->detect_alpha = detect_alpha16_limited_avx512;
        } else {
            dsp->detect_range = detect_range_avx512;
            if (color_range == AVCOL_RANGE_JPEG)
                dsp->detect_alpha = detect_alpha_full_avx512;
            else
                dsp->detect_alpha = detect_alpha_limited_avx512;
        }
    }
#endif
#endif
}