You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-08-04 22:03:09 +02:00
avfilter/vf_colordetect: add x86 SIMD implementation
alphadetect8_full_c: 5658.2 ( 1.00x) alphadetect8_full_avx2: 215.1 (26.31x) alphadetect8_full_avx512: 133.5 (42.40x) alphadetect8_limited_c: 7391.5 ( 1.00x) alphadetect8_limited_avx2: 649.3 (11.38x) alphadetect8_limited_avx512: 330.5 (22.36x) alphadetect16_full_c: 3027.4 ( 1.00x) alphadetect16_full_avx2: 209.4 (14.46x) alphadetect16_full_avx512: 141.4 (21.41x) alphadetect16_limited_c: 3880.9 ( 1.00x) alphadetect16_limited_avx2: 734.9 ( 5.28x) alphadetect16_limited_avx512: 349.2 (11.11x) rangedetect8_c: 5854.2 ( 1.00x) rangedetect8_avx2: 138.9 (42.15x) rangedetect8_avx512: 106.2 (55.12x) rangedetect16_c: 4122.0 ( 1.00x) rangedetect16_avx2: 138.6 (29.74x) rangedetect16_avx512: 104.1 (39.60x)
This commit is contained in:
@ -232,6 +232,10 @@ static av_cold void uninit(AVFilterContext *ctx)
|
|||||||
av_cold void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth,
|
av_cold void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth,
|
||||||
enum AVColorRange color_range)
|
enum AVColorRange color_range)
|
||||||
{
|
{
|
||||||
|
#if ARCH_X86
|
||||||
|
ff_color_detect_dsp_init_x86(dsp, depth, color_range);
|
||||||
|
#endif
|
||||||
|
|
||||||
if (!dsp->detect_range)
|
if (!dsp->detect_range)
|
||||||
dsp->detect_range = depth > 8 ? ff_detect_range16_c : ff_detect_range_c;
|
dsp->detect_range = depth > 8 ? ff_detect_range16_c : ff_detect_range_c;
|
||||||
if (!dsp->detect_alpha) {
|
if (!dsp->detect_alpha) {
|
||||||
|
@ -41,6 +41,9 @@ typedef struct FFColorDetectDSPContext {
|
|||||||
void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth,
|
void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth,
|
||||||
enum AVColorRange color_range);
|
enum AVColorRange color_range);
|
||||||
|
|
||||||
|
void ff_color_detect_dsp_init_x86(FFColorDetectDSPContext *dsp, int depth,
|
||||||
|
enum AVColorRange color_range);
|
||||||
|
|
||||||
static inline int ff_detect_range_c(const uint8_t *data, ptrdiff_t stride,
|
static inline int ff_detect_range_c(const uint8_t *data, ptrdiff_t stride,
|
||||||
ptrdiff_t width, ptrdiff_t height,
|
ptrdiff_t width, ptrdiff_t height,
|
||||||
int mpeg_min, int mpeg_max)
|
int mpeg_min, int mpeg_max)
|
||||||
|
@ -6,6 +6,7 @@ OBJS-$(CONFIG_ATADENOISE_FILTER) += x86/vf_atadenoise_init.o
|
|||||||
OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect_init.o
|
OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect_init.o
|
||||||
OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o
|
OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o
|
||||||
OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o
|
OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o
|
||||||
|
OBJS-$(CONFIG_COLORDETECT_FILTER) += x86/vf_colordetect_init.o
|
||||||
OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o
|
OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o
|
||||||
OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution_init.o
|
OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution_init.o
|
||||||
OBJS-$(CONFIG_EBUR128_FILTER) += x86/f_ebur128_init.o
|
OBJS-$(CONFIG_EBUR128_FILTER) += x86/f_ebur128_init.o
|
||||||
@ -53,6 +54,7 @@ X86ASM-OBJS-$(CONFIG_ATADENOISE_FILTER) += x86/vf_atadenoise.o
|
|||||||
X86ASM-OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect.o
|
X86ASM-OBJS-$(CONFIG_BLACKDETECT_FILTER) += x86/vf_blackdetect.o
|
||||||
X86ASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o
|
X86ASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o
|
||||||
X86ASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o
|
X86ASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o
|
||||||
|
X86ASM-OBJS-$(CONFIG_COLORDETECT_FILTER) += x86/vf_colordetect.o
|
||||||
X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o
|
X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o
|
||||||
X86ASM-OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution.o
|
X86ASM-OBJS-$(CONFIG_CONVOLUTION_FILTER) += x86/vf_convolution.o
|
||||||
X86ASM-OBJS-$(CONFIG_EBUR128_FILTER) += x86/f_ebur128.o
|
X86ASM-OBJS-$(CONFIG_EBUR128_FILTER) += x86/f_ebur128.o
|
||||||
|
150
libavfilter/x86/vf_colordetect.asm
Normal file
150
libavfilter/x86/vf_colordetect.asm
Normal file
@ -0,0 +1,150 @@
|
|||||||
|
;*****************************************************************************
|
||||||
|
;* x86-optimized functions for colordetect filter
|
||||||
|
;*
|
||||||
|
;* Copyright (C) 2025 Niklas Haas
|
||||||
|
;*
|
||||||
|
;* This file is part of FFmpeg.
|
||||||
|
;*
|
||||||
|
;* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
;* modify it under the terms of the GNU Lesser General Public
|
||||||
|
;* License as published by the Free Software Foundation; either
|
||||||
|
;* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
;*
|
||||||
|
;* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
;* Lesser General Public License for more details.
|
||||||
|
;*
|
||||||
|
;* You should have received a copy of the GNU Lesser General Public
|
||||||
|
;* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
;*****************************************************************************
|
||||||
|
|
||||||
|
%include "libavutil/x86/x86util.asm"
|
||||||
|
|
||||||
|
SECTION .text
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------------
; int detect_range<suffix>(const uint8_t *data, ptrdiff_t stride,
;                          ptrdiff_t width, ptrdiff_t height,
;                          int mpeg_min, int mpeg_max)
;
; %1 (suffix) selects the element size: b = bytes, w = words.
;
; Scans `height` rows of `width` BYTES each and returns 1 as soon as a row
; contains an element below mpeg_min or above mpeg_max, 0 otherwise.
;
; Both loops are do-while loops: at least one full vector and one row are
; always processed, so the caller must pass a width that is a non-zero
; multiple of mmsize and a height >= 1.
;-----------------------------------------------------------------------------
%macro detect_range_fn 1 ; suffix
cglobal detect_range%1, 6, 7, 5, data, stride, width, height, mpeg_min, mpeg_max, x
%if notcpuflag(avx512)
    ; The VEX (AVX2) encodings of vpbroadcastb/w only accept an xmm register
    ; or a memory operand as source; broadcasting straight from a GPR is an
    ; EVEX-only (AVX-512) form. All six arguments were loaded into GPRs by
    ; cglobal, so bounce them through xmm registers on every ABI.
    movd            xm0, mpeg_mind
    movd            xm1, mpeg_maxd
    vpbroadcast%1   m0, xm0
    vpbroadcast%1   m1, xm1
%else
    vpbroadcast%1   m0, mpeg_mind
    vpbroadcast%1   m1, mpeg_maxd
%endif
    ; point data at the end of the row and index it with a negative offset
    ; that counts up towards zero
    add             dataq, widthq
    neg             widthq
.lineloop:
    ; m2 = running minimum (seeded with mpeg_min)
    ; m3 = running maximum (seeded with mpeg_max)
    mova            m2, m0
    mova            m3, m1
    mov             xq, widthq
.loop:
    movu            m4, [dataq + xq]
    pminu%1         m2, m4
    pmaxu%1         m3, m4
    add             xq, mmsize
    jl .loop

    ; test if the data is out of range: the running min/max differ from
    ; their seeds only if some element fell outside [mpeg_min, mpeg_max]
    pxor            m2, m0
%if cpuflag(avx512)
    vpternlogq      m2, m3, m1, 0xF6 ; m2 |= m3 ^ m1
    vptestmq        k1, m2, m2
    kortestb        k1, k1
%else
    pxor            m3, m1
    por             m2, m3
    ptest           m2, m2
%endif
    jnz .end
    add             dataq, strideq
    dec             heightq
    jg .lineloop
.end:
    ; ZF is clear when we got here through `jnz .end` (out-of-range data),
    ; and set when the row counter reached zero in `dec heightq`
    setnz           al
    movzx           eax, al
    RET
%endmacro
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------------
; int detect_alpha<suffix>_<range>(const uint8_t *color, ptrdiff_t color_stride,
;                                  const uint8_t *alpha, ptrdiff_t alpha_stride,
;                                  ptrdiff_t width, ptrdiff_t height,
;                                  int p, int q, int k)
;
; %1 (suffix)  element size of the input: b = bytes, w = words
; %2 (hsuffix) widened element size used by the limited variant (w or d)
; %3 (range)   full or limited
;
; full:    flags any element where color > alpha.
; limited: widens both inputs and flags any element where
;          max(p * color - k, 0) > q * alpha.
; p, q and k are read from the stack and are unused by the full variant.
;
; Returns 1 if any element was flagged, 0 otherwise. `width` is in bytes.
; Both loops are do-while loops, so one vector and one row are always
; processed.
;-----------------------------------------------------------------------------
%macro detect_alpha_fn 3 ; suffix, hsuffix, range
cglobal detect_alpha%1_%3, 6, 7, 6, color, color_stride, alpha, alpha_stride, width, height, x
%ifidn %3, limited
    vpbroadcast%2   m3, r6m ; p
    vpbroadcast%2   m4, r7m ; q
    vpbroadcast%2   m5, r8m ; k
%endif
    ; point both planes at the end of the row and walk a negative index
    ; up towards zero
    add             alphaq, widthq
    add             colorq, widthq
    neg             widthq
    pxor            m0, m0 ; m0 accumulates every mismatch seen so far
.lineloop:
    mov             xq, widthq
.loop:
%ifidn %3, full
    movu            m1, [colorq + xq]
    movu            m2, [alphaq + xq]
    pmaxu%1         m1, m2           ; m1 = max(color, alpha)
%else
    pmovzx%1%2      m1, [colorq + xq]
    pmovzx%1%2      m2, [alphaq + xq]
    pmull%2         m1, m3           ; m1 = p * color
    pmull%2         m2, m4           ; m2 = q * alpha
%ifidn %1, b
    psubusw         m1, m5           ; m1 = max(m1 - k, 0)
%else
    ; no unsigned saturating dword subtract: max(m1, k) - k is equivalent
    pmaxud          m1, m5
    psubd           m1, m5
%endif
    pmaxu%2         m1, m2           ; m1 = max(m1, q * alpha)
%endif
    ; m1 != m2 exactly where the color term exceeded the alpha term
%if cpuflag(avx512)
    vpternlogq      m0, m1, m2, 0xF6 ; m0 |= m1 ^ m2
%else
    pxor            m1, m2
    por             m0, m1
%endif
%ifidn %3, full
    add             xq, mmsize
%else
    ; the widening loads consume only half a vector of input per iteration
    add             xq, mmsize >> 1
%endif
    jl .loop

    ; bail out after the first row that produced a mismatch
%if cpuflag(avx512)
    vptestmq        k1, m0, m0
    kortestb        k1, k1
%else
    ptest           m0, m0
%endif
    jnz .found

    add             colorq, color_strideq
    add             alphaq, alpha_strideq
    dec             heightq
    jg .lineloop
    xor             eax, eax
    RET

.found:
    mov             eax, 1
    RET
%endmacro
|
||||||
|
|
||||||
|
; Emit the complete set of colordetect functions (range detection for 8-bit
; and 16-bit samples, alpha detection for both sample sizes and both ranges)
; for whichever instruction set is currently selected by INIT_*.
%macro colordetect_fns 0
    detect_range_fn b
    detect_range_fn w
    detect_alpha_fn b, w, full
    detect_alpha_fn w, d, full
    detect_alpha_fn b, w, limited
    detect_alpha_fn w, d, limited
%endmacro

; 256-bit AVX2 versions
INIT_YMM avx2
colordetect_fns

; 512-bit AVX-512 versions
INIT_ZMM avx512
colordetect_fns
|
105
libavfilter/x86/vf_colordetect_init.c
Normal file
105
libavfilter/x86/vf_colordetect_init.c
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2025 Niklas Haas
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "libavutil/attributes.h"
|
||||||
|
#include "libavutil/x86/cpu.h"
|
||||||
|
#include "libavfilter/vf_colordetect.h"
|
||||||
|
|
||||||
|
/**
 * Declare the assembly routine ASM_FUNC_NAME and define FUNC_NAME, a wrapper
 * with the same signature that splits every row in two parts:
 *  - the leading part, whose size in bytes is the largest multiple of MMSIZE
 *    not exceeding (width << SHIFT), is handled by ASM_FUNC_NAME;
 *  - the remaining elements are handled by C_FUNC_NAME.
 *
 * SHIFT is log2 of the element size in bytes, MMSIZE the vector size in bytes.
 *
 * The assembly loops always process at least one full vector and one row, so
 * the assembly routine is only called when there is at least one full vector
 * per row and at least one row; otherwise everything goes to C_FUNC_NAME.
 *
 * The wrapper returns the assembly result if it is non-zero, otherwise the
 * result of C_FUNC_NAME.
 */
#define DETECT_RANGE_FUNC(FUNC_NAME, ASM_FUNC_NAME, C_FUNC_NAME, SHIFT, MMSIZE) \
int ASM_FUNC_NAME(const uint8_t *src, ptrdiff_t stride,                         \
                  ptrdiff_t width, ptrdiff_t height, int min, int max);         \
                                                                                \
static int FUNC_NAME(const uint8_t *src, ptrdiff_t stride,                      \
                     ptrdiff_t width, ptrdiff_t height, int min, int max)       \
{                                                                               \
    const ptrdiff_t bytes = (width << (SHIFT)) & ~(ptrdiff_t)((MMSIZE) - 1);    \
                                                                                \
    if (bytes > 0 && height > 0) {                                              \
        int ret = ASM_FUNC_NAME(src, stride, bytes, height, min, max);          \
        if (ret)                                                                \
            return ret;                                                         \
    }                                                                           \
                                                                                \
    return C_FUNC_NAME(src + bytes, stride, width - (bytes >> (SHIFT)),         \
                       height, min, max);                                       \
}
|
||||||
|
|
||||||
|
/**
 * Declare the assembly routine ASM_FUNC_NAME and define FUNC_NAME, a wrapper
 * with the same signature that splits every row in two parts:
 *  - the leading part, whose size in bytes is the largest multiple of MMSIZE
 *    not exceeding (width << SHIFT), is handled by ASM_FUNC_NAME;
 *  - the remaining elements are handled by C_FUNC_NAME.
 *
 * SHIFT is log2 of the element size in bytes, MMSIZE the vector size in bytes.
 * The same byte offset is applied to the color and the alpha plane.
 *
 * The assembly loops always process at least one full vector and one row, so
 * the assembly routine is only called when there is at least one full vector
 * per row and at least one row; otherwise everything goes to C_FUNC_NAME.
 *
 * The wrapper returns the assembly result if it is non-zero, otherwise the
 * result of C_FUNC_NAME.
 */
#define DETECT_ALPHA_FUNC(FUNC_NAME, ASM_FUNC_NAME, C_FUNC_NAME, SHIFT, MMSIZE) \
int ASM_FUNC_NAME(const uint8_t *color, ptrdiff_t color_stride,                 \
                  const uint8_t *alpha, ptrdiff_t alpha_stride,                 \
                  ptrdiff_t width, ptrdiff_t height, int p, int q, int k);      \
                                                                                \
static int FUNC_NAME(const uint8_t *color, ptrdiff_t color_stride,              \
                     const uint8_t *alpha, ptrdiff_t alpha_stride,              \
                     ptrdiff_t width, ptrdiff_t height, int p, int q, int k)    \
{                                                                               \
    const ptrdiff_t bytes = (width << (SHIFT)) & ~(ptrdiff_t)((MMSIZE) - 1);    \
                                                                                \
    if (bytes > 0 && height > 0) {                                              \
        int ret = ASM_FUNC_NAME(color, color_stride, alpha, alpha_stride,       \
                                bytes, height, p, q, k);                        \
        if (ret)                                                                \
            return ret;                                                         \
    }                                                                           \
                                                                                \
    return C_FUNC_NAME(color + bytes, color_stride,                             \
                       alpha + bytes, alpha_stride,                             \
                       width - (bytes >> (SHIFT)), height, p, q, k);            \
}
|
||||||
|
|
||||||
|
/*
 * Instantiate one wrapper per assembly routine.
 * Arguments: wrapper name, assembly symbol, C function used for the row tail,
 * log2 of the element size in bytes, vector size in bytes.
 */
#if HAVE_X86ASM

#if HAVE_AVX2_EXTERNAL
/* 32-byte (AVX2) vectors */
DETECT_RANGE_FUNC(detect_range_avx2,           ff_detect_rangeb_avx2,         ff_detect_range_c,           0, 32)
DETECT_RANGE_FUNC(detect_range16_avx2,         ff_detect_rangew_avx2,         ff_detect_range16_c,         1, 32)
DETECT_ALPHA_FUNC(detect_alpha_full_avx2,      ff_detect_alphab_full_avx2,    ff_detect_alpha_full_c,      0, 32)
DETECT_ALPHA_FUNC(detect_alpha16_full_avx2,    ff_detect_alphaw_full_avx2,    ff_detect_alpha16_full_c,    1, 32)
DETECT_ALPHA_FUNC(detect_alpha_limited_avx2,   ff_detect_alphab_limited_avx2, ff_detect_alpha_limited_c,   0, 32)
DETECT_ALPHA_FUNC(detect_alpha16_limited_avx2, ff_detect_alphaw_limited_avx2, ff_detect_alpha16_limited_c, 1, 32)
#endif /* HAVE_AVX2_EXTERNAL */

#if HAVE_AVX512_EXTERNAL
/* 64-byte (AVX-512) vectors */
DETECT_RANGE_FUNC(detect_range_avx512,           ff_detect_rangeb_avx512,         ff_detect_range_c,           0, 64)
DETECT_RANGE_FUNC(detect_range16_avx512,         ff_detect_rangew_avx512,         ff_detect_range16_c,         1, 64)
DETECT_ALPHA_FUNC(detect_alpha_full_avx512,      ff_detect_alphab_full_avx512,    ff_detect_alpha_full_c,      0, 64)
DETECT_ALPHA_FUNC(detect_alpha16_full_avx512,    ff_detect_alphaw_full_avx512,    ff_detect_alpha16_full_c,    1, 64)
DETECT_ALPHA_FUNC(detect_alpha_limited_avx512,   ff_detect_alphab_limited_avx512, ff_detect_alpha_limited_c,   0, 64)
DETECT_ALPHA_FUNC(detect_alpha16_limited_avx512, ff_detect_alphaw_limited_avx512, ff_detect_alpha16_limited_c, 1, 64)
#endif /* HAVE_AVX512_EXTERNAL */

#endif /* HAVE_X86ASM */
|
||||||
|
|
||||||
|
/**
 * Install the x86 SIMD implementations supported by the running CPU.
 *
 * Function pointers in dsp are only written when a matching implementation
 * exists; otherwise they are left untouched. When both AVX2 and AVX-512 are
 * usable, the AVX-512 versions win because they are assigned last.
 *
 * @param dsp         context whose detect_range / detect_alpha are set
 * @param depth       bit depth; values above 8 select the 16-bit functions
 * @param color_range AVCOL_RANGE_JPEG selects the "full" alpha functions,
 *                    any other value the "limited" ones
 */
av_cold void ff_color_detect_dsp_init_x86(FFColorDetectDSPContext *dsp, int depth,
                                          enum AVColorRange color_range)
{
#if HAVE_X86ASM
    const int cpu_flags  = av_get_cpu_flags();
    const int high_depth = depth > 8;
    const int full_range = color_range == AVCOL_RANGE_JPEG;

#if HAVE_AVX2_EXTERNAL
    if (EXTERNAL_AVX2(cpu_flags)) {
        if (high_depth) {
            dsp->detect_range = detect_range16_avx2;
            dsp->detect_alpha = full_range ? detect_alpha16_full_avx2
                                           : detect_alpha16_limited_avx2;
        } else {
            dsp->detect_range = detect_range_avx2;
            dsp->detect_alpha = full_range ? detect_alpha_full_avx2
                                           : detect_alpha_limited_avx2;
        }
    }
#endif /* HAVE_AVX2_EXTERNAL */

#if HAVE_AVX512_EXTERNAL
    if (EXTERNAL_AVX512(cpu_flags)) {
        if (high_depth) {
            dsp->detect_range = detect_range16_avx512;
            dsp->detect_alpha = full_range ? detect_alpha16_full_avx512
                                           : detect_alpha16_limited_avx512;
        } else {
            dsp->detect_range = detect_range_avx512;
            dsp->detect_alpha = full_range ? detect_alpha_full_avx512
                                           : detect_alpha_limited_avx512;
        }
    }
#endif /* HAVE_AVX512_EXTERNAL */
#endif /* HAVE_X86ASM */
}
|
Reference in New Issue
Block a user