1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-11-23 21:54:53 +02:00

avfilter/vf_colordetect: optimize C functions a bit

They are used to process the tail, so it is still worth making them faster,
even when the AVX versions are used.

GCC 14.2.0 | x86_64 (default config) | Before:

detect_alpha_8_full_c:                                3803.0 ( 1.00x)
detect_alpha_8_full_avx2:                              166.4 (22.86x)
detect_alpha_8_full_avx512icl:                         144.2 (26.37x)
detect_alpha_8_limited_c:                            10454.4 ( 1.00x)
detect_alpha_8_limited_avx2:                           616.5 (16.96x)
detect_alpha_8_limited_avx512icl:                      509.4 (20.52x)
detect_alpha_16_full_c:                               1903.0 ( 1.00x)
detect_alpha_16_full_avx2:                             172.4 (11.04x)
detect_alpha_16_full_avx512icl:                        163.4 (11.65x)
detect_alpha_16_limited_c:                            3703.6 ( 1.00x)
detect_alpha_16_limited_avx2:                          644.4 ( 5.75x)
detect_alpha_16_limited_avx512icl:                     558.0 ( 6.64x)
detect_range_8_c:                                     5855.9 ( 1.00x)
detect_range_8_avx2:                                   150.4 (38.94x)
detect_range_8_avx512icl:                              146.7 (39.91x)
detect_range_16_c:                                    2702.2 ( 1.00x)
detect_range_16_avx2:                                  256.7 (10.53x)
detect_range_16_avx512icl:                             116.8 (23.13x)

GCC 14.2.0 | x86_64 (default config) | After:

detect_alpha_8_full_c:                                 376.3 ( 1.00x)
detect_alpha_8_full_avx2:                              169.2 ( 2.22x)
detect_alpha_8_full_avx512icl:                         134.6 ( 2.80x)
detect_alpha_8_limited_c:                             6024.1 ( 1.00x)
detect_alpha_8_limited_avx2:                           641.8 ( 9.39x)
detect_alpha_8_limited_avx512icl:                      493.0 (12.22x)
detect_alpha_16_full_c:                                436.4 ( 1.00x)
detect_alpha_16_full_avx2:                             156.3 ( 2.79x)
detect_alpha_16_full_avx512icl:                        151.8 ( 2.87x)
detect_alpha_16_limited_c:                            3679.9 ( 1.00x)
detect_alpha_16_limited_avx2:                          642.0 ( 5.73x)
detect_alpha_16_limited_avx512icl:                     555.2 ( 6.63x)
detect_range_8_c:                                      655.2 ( 1.00x)
detect_range_8_avx2:                                   153.9 ( 4.26x)
detect_range_8_avx512icl:                              147.4 ( 4.45x)
detect_range_16_c:                                     743.3 ( 1.00x)
detect_range_16_avx2:                                  258.6 ( 2.87x)
detect_range_16_avx512icl:                             107.7 ( 6.90x)

Clang 19.1.7 | x86_64 (default config) | Before:

detect_alpha_8_full_c:                                7013.4 ( 1.00x)
detect_alpha_8_full_avx2:                              141.8 (49.46x)
detect_alpha_8_full_avx512icl:                         133.8 (52.40x)
detect_alpha_8_limited_c:                             7038.8 ( 1.00x)
detect_alpha_8_limited_avx2:                           605.0 (11.63x)
detect_alpha_8_limited_avx512icl:                      506.5 (13.90x)
detect_alpha_16_full_c:                               1799.5 ( 1.00x)
detect_alpha_16_full_avx2:                             143.0 (12.59x)
detect_alpha_16_full_avx512icl:                        127.5 (14.12x)
detect_alpha_16_limited_c:                            3499.6 ( 1.00x)
detect_alpha_16_limited_avx2:                          633.6 ( 5.52x)
detect_alpha_16_limited_avx512icl:                     551.9 ( 6.34x)
detect_range_8_c:                                     5253.6 ( 1.00x)
detect_range_8_avx2:                                   125.0 (42.01x)
detect_range_8_avx512icl:                              123.2 (42.65x)
detect_range_16_c:                                    3055.2 ( 1.00x)
detect_range_16_avx2:                                  230.0 (13.28x)
detect_range_16_avx512icl:                              95.9 (31.86x)

Clang 19.1.7 | x86_64 (default config) | After:

detect_alpha_8_full_c:                                 323.3 ( 1.00x)
detect_alpha_8_full_avx2:                              149.7 ( 2.16x)
detect_alpha_8_full_avx512icl:                         127.7 ( 2.53x)
detect_alpha_8_limited_c:                             5075.9 ( 1.00x)
detect_alpha_8_limited_avx2:                           625.4 ( 8.12x)
detect_alpha_8_limited_avx512icl:                      493.0 (10.30x)
detect_alpha_16_full_c:                                421.0 ( 1.00x)
detect_alpha_16_full_avx2:                             238.8 ( 1.76x)
detect_alpha_16_full_avx512icl:                        126.0 ( 3.34x)
detect_alpha_16_limited_c:                            3516.8 ( 1.00x)
detect_alpha_16_limited_avx2:                          624.7 ( 5.63x)
detect_alpha_16_limited_avx512icl:                     544.7 ( 6.46x)
detect_range_8_c:                                      609.1 ( 1.00x)
detect_range_8_avx2:                                   239.4 ( 2.54x)
detect_range_8_avx512icl:                               89.0 ( 6.84x)
detect_range_16_c:                                     463.9 ( 1.00x)
detect_range_16_avx2:                                  127.4 ( 3.64x)
detect_range_16_avx512icl:                              86.4 ( 5.37x)

Signed-off-by: Kacper Michajłow <kasper93@gmail.com>
This commit is contained in:
Kacper Michajłow
2025-08-06 18:16:08 +02:00
parent 85e8e59001
commit e6635ada64

View File

@@ -22,6 +22,7 @@
#include <stddef.h>
#include <stdint.h>
#include <libavutil/avassert.h>
#include <libavutil/macros.h>
#include <libavutil/pixfmt.h>
@@ -44,16 +45,46 @@ void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth,
void ff_color_detect_dsp_init_x86(FFColorDetectDSPContext *dsp, int depth,
enum AVColorRange color_range);
/**
 * Report whether an 8-bit plane holds any sample outside
 * [mpeg_min, mpeg_max].
 *
 * Each row is read in full and the per-sample results are OR-ed into a
 * single flag, which is tested once per row. The function therefore returns
 * at a row boundary, never in the middle of a row.
 *
 * @param data      pointer to the first sample of the first row
 * @param stride    distance, in bytes, from one row to the next
 * @param width     number of samples examined in each row
 * @param height    number of rows to examine
 * @param mpeg_min  smallest value considered in range
 * @param mpeg_max  largest value considered in range
 * @return 1 if at least one examined sample is below mpeg_min or above
 *         mpeg_max, 0 otherwise (including when no sample is examined)
 */
static inline int ff_detect_range_impl_c(const uint8_t *data, ptrdiff_t stride,
                                         ptrdiff_t width, ptrdiff_t height,
                                         uint8_t mpeg_min, uint8_t mpeg_max)
{
    for (const uint8_t *row = data; height--; row += stride) {
        uint8_t out_of_range = 0;

        /* No early exit inside the row: every sample only updates the flag. */
        for (int col = 0; col < width; col++) {
            const uint8_t sample = row[col];
            out_of_range |= (sample < mpeg_min) | (sample > mpeg_max);
        }

        if (out_of_range)
            return 1;
    }

    return 0;
}
/**
 * 8-bit range check taking the limits as plain ints.
 *
 * Forwards all arguments to ff_detect_range_impl_c(). That function takes
 * its limits as uint8_t, so mpeg_min and mpeg_max are narrowed at the call.
 *
 * @param data      pointer to the first sample of the first row
 * @param stride    offset added to data to step from one row to the next
 * @param width     number of samples examined in each row
 * @param height    number of rows to examine
 * @param mpeg_min  smallest in-range value; stated below to be within
 *                  [0, UINT8_MAX]
 * @param mpeg_max  largest in-range value; stated below to be within
 *                  [0, UINT8_MAX]
 * @return the result of ff_detect_range_impl_c(): 1 if an examined sample
 *         lies outside [mpeg_min, mpeg_max], 0 otherwise
 */
static inline int ff_detect_range_c(const uint8_t *data, ptrdiff_t stride,
ptrdiff_t width, ptrdiff_t height,
int mpeg_min, int mpeg_max)
{
/* NOTE(review): av_assume() is defined outside this view (presumably in
 * libavutil/avassert.h). It appears to promise the compiler that both limits
 * fit in 8 bits, so the narrowing below loses nothing; confirm. */
av_assume(mpeg_min >= 0 && mpeg_min <= UINT8_MAX);
av_assume(mpeg_max >= 0 && mpeg_max <= UINT8_MAX);
return ff_detect_range_impl_c(data, stride, width, height, mpeg_min, mpeg_max);
}
static inline int ff_detect_range16_impl_c(const uint8_t *data, ptrdiff_t stride,
ptrdiff_t width, ptrdiff_t height,
uint16_t mpeg_min, uint16_t mpeg_max)
{
while (height--) {
const uint16_t *data16 = (const uint16_t *) data;
uint8_t cond = 0;
for (int x = 0; x < width; x++) {
const uint8_t val = data[x];
if (val < mpeg_min || val > mpeg_max)
return 1;
const uint16_t val = data16[x];
cond |= val < mpeg_min || val > mpeg_max;
}
if (cond)
return 1;
data += stride;
}
@@ -64,17 +95,9 @@ static inline int ff_detect_range16_c(const uint8_t *data, ptrdiff_t stride,
ptrdiff_t width, ptrdiff_t height,
int mpeg_min, int mpeg_max)
{
while (height--) {
const uint16_t *data16 = (const uint16_t *) data;
for (int x = 0; x < width; x++) {
const uint16_t val = data16[x];
if (val < mpeg_min || val > mpeg_max)
return 1;
}
data += stride;
}
return 0;
av_assume(mpeg_min >= 0 && mpeg_min <= UINT16_MAX);
av_assume(mpeg_max >= 0 && mpeg_max <= UINT16_MAX);
return ff_detect_range16_impl_c(data, stride, width, height, mpeg_min, mpeg_max);
}
static inline int
@@ -84,10 +107,11 @@ ff_detect_alpha_full_c(const uint8_t *color, ptrdiff_t color_stride,
int p, int q, int k)
{
while (height--) {
for (int x = 0; x < width; x++) {
if (color[x] > alpha[x])
return 1;
}
uint8_t cond = 0;
for (int x = 0; x < width; x++)
cond |= color[x] > alpha[x];
if (cond)
return 1;
color += color_stride;
alpha += alpha_stride;
}
@@ -101,10 +125,11 @@ ff_detect_alpha_limited_c(const uint8_t *color, ptrdiff_t color_stride,
int p, int q, int k)
{
while (height--) {
for (int x = 0; x < width; x++) {
if (p * color[x] - k > q * alpha[x])
return 1;
}
uint8_t cond = 0;
for (int x = 0; x < width; x++)
cond |= p * color[x] - k > q * alpha[x];
if (cond)
return 1;
color += color_stride;
alpha += alpha_stride;
}
@@ -120,10 +145,11 @@ ff_detect_alpha16_full_c(const uint8_t *color, ptrdiff_t color_stride,
while (height--) {
const uint16_t *color16 = (const uint16_t *) color;
const uint16_t *alpha16 = (const uint16_t *) alpha;
for (int x = 0; x < width; x++) {
if (color16[x] > alpha16[x])
return 1;
}
uint8_t cond = 0;
for (int x = 0; x < width; x++)
cond |= color16[x] > alpha16[x];
if (cond)
return 1;
color += color_stride;
alpha += alpha_stride;
}