From ab1114a0f5b32d91e7f56dc1183087a33a96153c Mon Sep 17 00:00:00 2001 From: Paul B Mahol Date: Wed, 2 May 2018 19:23:41 +0200 Subject: [PATCH] avfilter/vf_convolution: rewrite so it doesn't use temp buffers Signed-off-by: Paul B Mahol --- libavfilter/vf_convolution.c | 1005 ++++++++++------------------------ 1 file changed, 279 insertions(+), 726 deletions(-) diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c index e9b1f4abe8..026b881873 100644 --- a/libavfilter/vf_convolution.c +++ b/libavfilter/vf_convolution.c @@ -21,6 +21,7 @@ #include "libavutil/avstring.h" #include "libavutil/imgutils.h" +#include "libavutil/intreadwrite.h" #include "libavutil/opt.h" #include "libavutil/pixdesc.h" #include "avfilter.h" @@ -40,10 +41,8 @@ typedef struct ConvolutionContext { int size[4]; int depth; + int max; int bpc; - int bstride; - uint8_t *buffer; - uint8_t **bptrs; int nb_planes; int nb_threads; int planewidth[4]; @@ -52,7 +51,11 @@ typedef struct ConvolutionContext { int matrix_length[4]; int copy[4]; - int (*filter[4])(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs); + void (*setup[4])(const uint8_t *c[], const uint8_t *src, int stride, + int x, int width, int y, int height, int bpc); + void (*filter[4])(uint8_t *dst, const uint8_t *src, int width, + float rdiv, float bias, const int *const matrix, + const uint8_t *c[], int peak); } ConvolutionContext; #define OFFSET(x) offsetof(ConvolutionContext, x) @@ -120,743 +123,314 @@ static int query_formats(AVFilterContext *ctx) return ff_set_common_formats(ctx, ff_make_format_list(pix_fmts)); } -static inline void line_copy8(uint8_t *line, const uint8_t *srcp, int width, int mergin) -{ - int i; - - memcpy(line, srcp, width); - - for (i = mergin; i > 0; i--) { - line[-i] = line[i]; - line[width - 1 + i] = line[width - 1 - i]; - } -} - -static inline void line_copy16(uint16_t *line, const uint16_t *srcp, int width, int mergin) -{ - int i; - - memcpy(line, srcp, width * 2); - - for (i = mergin; i > 0; i--) { - line[-i] = line[i]; - line[width - 1 + i] = line[width - 1 - i]; - } -} - typedef struct ThreadData { AVFrame *in, *out; - int plane; } ThreadData; -static int filter16_prewitt(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) +static void filter16_prewitt(uint8_t *dstp, const uint8_t *src, int width, + float scale, float delta, const int *const matrix, + const uint8_t *c[], int peak) { - ConvolutionContext *s = ctx->priv; - ThreadData *td = arg; - AVFrame *in = td->in; - AVFrame *out = td->out; - const int plane = td->plane; - const int peak = (1 << s->depth) - 1; - const int stride = in->linesize[plane] / 2; - const int bstride = s->bstride; - const int height = s->planeheight[plane]; - const int width = s->planewidth[plane]; - const int slice_start = (height * jobnr) / nb_jobs; - const int slice_end = (height * (jobnr+1)) / nb_jobs; - const uint16_t *src = (const uint16_t *)in->data[plane] + slice_start * stride; - uint16_t *dst = (uint16_t *)out->data[plane] + slice_start * (out->linesize[plane] / 2); - const float scale = s->scale; - const float delta = s->delta; - uint16_t *p0 = (uint16_t *)s->bptrs[jobnr] + 16; - uint16_t *p1 = p0 + bstride; - uint16_t *p2 = p1 + bstride; - uint16_t *orig = p0, *end = p2; - int y, x; + uint16_t *dst = (uint16_t *)dstp; + int x; - line_copy16(p0, src + stride * (slice_start == 0 ? 1 : -1), width, 1); - line_copy16(p1, src, width, 1); + for (x = 0; x < width; x++) { + int suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) * -1 + AV_RN16A(&c[2][2 * x]) * -1 + + AV_RN16A(&c[6][2 * x]) * 1 + AV_RN16A(&c[7][2 * x]) * 1 + AV_RN16A(&c[8][2 * x]) * 1; + int sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x]) * 1 + AV_RN16A(&c[3][2 * x]) * -1 + + AV_RN16A(&c[5][2 * x]) * 1 + AV_RN16A(&c[6][2 * x]) * -1 + AV_RN16A(&c[8][2 * x]) * 1; - for (y = slice_start; y < slice_end; y++) { - src += stride * (y < height - 1 ? 1 : -1); - line_copy16(p2, src, width, 1); - - for (x = 0; x < width; x++) { - int suma = p0[x - 1] * -1 + - p0[x] * -1 + - p0[x + 1] * -1 + - p2[x - 1] * 1 + - p2[x] * 1 + - p2[x + 1] * 1; - int sumb = p0[x - 1] * -1 + - p0[x + 1] * 1 + - p1[x - 1] * -1 + - p1[x + 1] * 1 + - p2[x - 1] * -1 + - p2[x + 1] * 1; - - dst[x] = av_clip(sqrt(suma*suma + sumb*sumb) * scale + delta, 0, peak); - } - - p0 = p1; - p1 = p2; - p2 = (p2 == end) ? orig: p2 + bstride; - dst += out->linesize[plane] / 2; + dst[x] = av_clip(sqrt(suma*suma + sumb*sumb) * scale + delta, 0, peak); } - - return 0; } -static int filter16_roberts(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) +static void filter16_roberts(uint8_t *dstp, const uint8_t *src, int width, + float scale, float delta, const int *const matrix, + const uint8_t *c[], int peak) { - ConvolutionContext *s = ctx->priv; - ThreadData *td = arg; - AVFrame *in = td->in; - AVFrame *out = td->out; - const int plane = td->plane; - const int peak = (1 << s->depth) - 1; - const int stride = in->linesize[plane] / 2; - const int bstride = s->bstride; - const int height = s->planeheight[plane]; - const int width = s->planewidth[plane]; - const int slice_start = (height * jobnr) / nb_jobs; - const int slice_end = (height * (jobnr+1)) / nb_jobs; - const uint16_t *src = (const uint16_t *)in->data[plane] + slice_start * stride; - uint16_t *dst = (uint16_t *)out->data[plane] + slice_start * (out->linesize[plane] / 2); - const float scale = s->scale; - const float delta = s->delta; - uint16_t *p0 = (uint16_t *)s->bptrs[jobnr] + 16; - uint16_t *p1 = p0 + bstride; - uint16_t *p2 = p1 + bstride; - uint16_t *orig = p0, *end = p2; - int y, x; + uint16_t *dst = (uint16_t *)dstp; + int x; - line_copy16(p0, src + stride * (slice_start == 0 ? 1 : -1), width, 1); - line_copy16(p1, src, width, 1); + for (x = 0; x < width; x++) { + int suma = AV_RN16A(&c[0][2 * x]) * 1 + AV_RN16A(&c[4][2 * x]) * -1; + int sumb = AV_RN16A(&c[1][2 * x]) * 1 + AV_RN16A(&c[3][2 * x]) * -1; - for (y = slice_start; y < slice_end; y++) { - src += stride * (y < height - 1 ? 1 : -1); - line_copy16(p2, src, width, 1); - - for (x = 0; x < width; x++) { - int suma = p0[x - 1] * 1 + - p1[x ] * -1; - int sumb = p0[x ] * 1 + - p1[x - 1] * -1; - - dst[x] = av_clip(sqrt(suma*suma + sumb*sumb) * scale + delta, 0, peak); - } - - p0 = p1; - p1 = p2; - p2 = (p2 == end) ? orig: p2 + bstride; - dst += out->linesize[plane] / 2; + dst[x] = av_clip(sqrt(suma*suma + sumb*sumb) * scale + delta, 0, peak); } - - return 0; } -static int filter16_sobel(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) +static void filter16_sobel(uint8_t *dstp, const uint8_t *src, int width, + float scale, float delta, const int *const matrix, + const uint8_t *c[], int peak) { - ConvolutionContext *s = ctx->priv; - ThreadData *td = arg; - AVFrame *in = td->in; - AVFrame *out = td->out; - const int plane = td->plane; - const int peak = (1 << s->depth) - 1; - const int stride = in->linesize[plane] / 2; - const int bstride = s->bstride; - const int height = s->planeheight[plane]; - const int width = s->planewidth[plane]; - const int slice_start = (height * jobnr) / nb_jobs; - const int slice_end = (height * (jobnr+1)) / nb_jobs; - const uint16_t *src = (const uint16_t *)in->data[plane] + slice_start * stride; - uint16_t *dst = (uint16_t *)out->data[plane] + slice_start * (out->linesize[plane] / 2); - const float scale = s->scale; - const float delta = s->delta; - uint16_t *p0 = (uint16_t *)s->bptrs[jobnr] + 16; - uint16_t *p1 = p0 + bstride; - uint16_t *p2 = p1 + bstride; - uint16_t *orig = p0, *end = p2; - int y, x; + uint16_t *dst = (uint16_t *)dstp; + int x; - line_copy16(p0, src + stride * (slice_start == 0 ? 1 : -1), width, 1); - line_copy16(p1, src, width, 1); + for (x = 0; x < width; x++) { + int suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) * -2 + AV_RN16A(&c[2][2 * x]) * -1 + + AV_RN16A(&c[6][2 * x]) * 1 + AV_RN16A(&c[7][2 * x]) * 2 + AV_RN16A(&c[8][2 * x]) * 1; + int sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x]) * 1 + AV_RN16A(&c[3][2 * x]) * -2 + + AV_RN16A(&c[5][2 * x]) * 2 + AV_RN16A(&c[6][2 * x]) * -1 + AV_RN16A(&c[8][2 * x]) * 1; - for (y = slice_start; y < slice_end; y++) { - src += stride * (y < height - 1 ? 1 : -1); - line_copy16(p2, src, width, 1); - - for (x = 0; x < width; x++) { - int suma = p0[x - 1] * -1 + - p0[x] * -2 + - p0[x + 1] * -1 + - p2[x - 1] * 1 + - p2[x] * 2 + - p2[x + 1] * 1; - int sumb = p0[x - 1] * -1 + - p0[x + 1] * 1 + - p1[x - 1] * -2 + - p1[x + 1] * 2 + - p2[x - 1] * -1 + - p2[x + 1] * 1; - - dst[x] = av_clip(sqrt(suma*suma + sumb*sumb) * scale + delta, 0, peak); - } - - p0 = p1; - p1 = p2; - p2 = (p2 == end) ? orig: p2 + bstride; - dst += out->linesize[plane] / 2; + dst[x] = av_clip(sqrt(suma*suma + sumb*sumb) * scale + delta, 0, peak); } - - return 0; } -static int filter_prewitt(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) +static void filter_prewitt(uint8_t *dst, const uint8_t *src, int width, + float scale, float delta, const int *const matrix, + const uint8_t *c[], int peak) { - ConvolutionContext *s = ctx->priv; - ThreadData *td = arg; - AVFrame *in = td->in; - AVFrame *out = td->out; - const int plane = td->plane; - const int stride = in->linesize[plane]; - const int bstride = s->bstride; - const int height = s->planeheight[plane]; - const int width = s->planewidth[plane]; - const int slice_start = (height * jobnr) / nb_jobs; - const int slice_end = (height * (jobnr+1)) / nb_jobs; - const uint8_t *src = in->data[plane] + slice_start * stride; - uint8_t *dst = out->data[plane] + slice_start * out->linesize[plane]; - const float scale = s->scale; - const float delta = s->delta; - uint8_t *p0 = s->bptrs[jobnr] + 16; - uint8_t *p1 = p0 + bstride; - uint8_t *p2 = p1 + bstride; - uint8_t *orig = p0, *end = p2; - int y, x; + const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2]; + const uint8_t *c3 = c[3], *c5 = c[5]; + const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8]; + int x; - line_copy8(p0, src + stride * (slice_start == 0 ? 1 : -1), width, 1); - line_copy8(p1, src, width, 1); + for (x = 0; x < width; x++) { + int suma = c0[x] * -1 + c1[x] * -1 + c2[x] * -1 + + c6[x] * 1 + c7[x] * 1 + c8[x] * 1; + int sumb = c0[x] * -1 + c2[x] * 1 + c3[x] * -1 + + c5[x] * 1 + c6[x] * -1 + c8[x] * 1; - for (y = slice_start; y < slice_end; y++) { - src += stride * (y < height - 1 ? 1 : -1); - line_copy8(p2, src, width, 1); - - for (x = 0; x < width; x++) { - int suma = p0[x - 1] * -1 + - p0[x] * -1 + - p0[x + 1] * -1 + - p2[x - 1] * 1 + - p2[x] * 1 + - p2[x + 1] * 1; - int sumb = p0[x - 1] * -1 + - p0[x + 1] * 1 + - p1[x - 1] * -1 + - p1[x + 1] * 1 + - p2[x - 1] * -1 + - p2[x + 1] * 1; - - dst[x] = av_clip_uint8(sqrt(suma*suma + sumb*sumb) * scale + delta); - } - - p0 = p1; - p1 = p2; - p2 = (p2 == end) ? orig: p2 + bstride; - dst += out->linesize[plane]; + dst[x] = av_clip_uint8(sqrt(suma*suma + sumb*sumb) * scale + delta); } - - return 0; } -static int filter_roberts(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) +static void filter_roberts(uint8_t *dst, const uint8_t *src, int width, + float scale, float delta, const int *const matrix, + const uint8_t *c[], int peak) { - ConvolutionContext *s = ctx->priv; - ThreadData *td = arg; - AVFrame *in = td->in; - AVFrame *out = td->out; - const int plane = td->plane; - const int stride = in->linesize[plane]; - const int bstride = s->bstride; - const int height = s->planeheight[plane]; - const int width = s->planewidth[plane]; - const int slice_start = (height * jobnr) / nb_jobs; - const int slice_end = (height * (jobnr+1)) / nb_jobs; - const uint8_t *src = in->data[plane] + slice_start * stride; - uint8_t *dst = out->data[plane] + slice_start * out->linesize[plane]; - const float scale = s->scale; - const float delta = s->delta; - uint8_t *p0 = s->bptrs[jobnr] + 16; - uint8_t *p1 = p0 + bstride; - uint8_t *p2 = p1 + bstride; - uint8_t *orig = p0, *end = p2; - int y, x; + int x; - line_copy8(p0, src + stride * (slice_start == 0 ? 1 : -1), width, 1); - line_copy8(p1, src, width, 1); + for (x = 0; x < width; x++) { + int suma = c[0][x - 1] * 1 + c[4][x ] * -1; + int sumb = c[1][x ] * 1 + c[3][x - 1] * -1; - for (y = slice_start; y < slice_end; y++) { - src += stride * (y < height - 1 ? 1 : -1); - line_copy8(p2, src, width, 1); - - for (x = 0; x < width; x++) { - int suma = p0[x - 1] * 1 + - p1[x ] * -1; - int sumb = p0[x ] * 1 + - p1[x - 1] * -1; - - dst[x] = av_clip_uint8(sqrt(suma*suma + sumb*sumb) * scale + delta); - } - - p0 = p1; - p1 = p2; - p2 = (p2 == end) ? orig: p2 + bstride; - dst += out->linesize[plane]; + dst[x] = av_clip_uint8(sqrt(suma*suma + sumb*sumb) * scale + delta); } - - return 0; } -static int filter_sobel(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) +static void filter_sobel(uint8_t *dst, const uint8_t *src, int width, + float scale, float delta, const int *const matrix, + const uint8_t *c[], int peak) { - ConvolutionContext *s = ctx->priv; - ThreadData *td = arg; - AVFrame *in = td->in; - AVFrame *out = td->out; - const int plane = td->plane; - const int stride = in->linesize[plane]; - const int bstride = s->bstride; - const int height = s->planeheight[plane]; - const int width = s->planewidth[plane]; - const int slice_start = (height * jobnr) / nb_jobs; - const int slice_end = (height * (jobnr+1)) / nb_jobs; - const uint8_t *src = in->data[plane] + slice_start * stride; - uint8_t *dst = out->data[plane] + slice_start * out->linesize[plane]; - const float scale = s->scale; - const float delta = s->delta; - uint8_t *p0 = s->bptrs[jobnr] + 16; - uint8_t *p1 = p0 + bstride; - uint8_t *p2 = p1 + bstride; - uint8_t *orig = p0, *end = p2; - int y, x; + const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2]; + const uint8_t *c3 = c[3], *c5 = c[5]; + const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8]; + int x; - line_copy8(p0, src + stride * (slice_start == 0 ? 1 : -1), width, 1); - line_copy8(p1, src, width, 1); + for (x = 0; x < width; x++) { + int suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 + + c6[x] * 1 + c7[x] * 2 + c8[x] * 1; + int sumb = c0[x] * -1 + c2[x] * 1 + c3[x] * -2 + + c5[x] * 2 + c6[x] * -1 + c8[x] * 1; - for (y = slice_start; y < slice_end; y++) { - src += stride * (y < height - 1 ? 1 : -1); - line_copy8(p2, src, width, 1); - - for (x = 0; x < width; x++) { - int suma = p0[x - 1] * -1 + - p0[x] * -2 + - p0[x + 1] * -1 + - p2[x - 1] * 1 + - p2[x] * 2 + - p2[x + 1] * 1; - int sumb = p0[x - 1] * -1 + - p0[x + 1] * 1 + - p1[x - 1] * -2 + - p1[x + 1] * 2 + - p2[x - 1] * -1 + - p2[x + 1] * 1; - - dst[x] = av_clip_uint8(sqrt(suma*suma + sumb*sumb) * scale + delta); - } - - p0 = p1; - p1 = p2; - p2 = (p2 == end) ? orig: p2 + bstride; - dst += out->linesize[plane]; + dst[x] = av_clip_uint8(sqrt(suma*suma + sumb*sumb) * scale + delta); } - - return 0; } -static int filter16_3x3(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) +static void filter16_3x3(uint8_t *dstp, const uint8_t *src, int width, + float rdiv, float bias, const int *const matrix, + const uint8_t *c[], int peak) { - ConvolutionContext *s = ctx->priv; - ThreadData *td = arg; - AVFrame *in = td->in; - AVFrame *out = td->out; - const int plane = td->plane; - const int peak = (1 << s->depth) - 1; - const int stride = in->linesize[plane] / 2; - const int bstride = s->bstride; - const int height = s->planeheight[plane]; - const int width = s->planewidth[plane]; - const int slice_start = (height * jobnr) / nb_jobs; - const int slice_end = (height * (jobnr+1)) / nb_jobs; - const uint16_t *src = (const uint16_t *)in->data[plane] + slice_start * stride; - uint16_t *dst = (uint16_t *)out->data[plane] + slice_start * (out->linesize[plane] / 2); - uint16_t *p0 = (uint16_t *)s->bptrs[jobnr] + 16; - uint16_t *p1 = p0 + bstride; - uint16_t *p2 = p1 + bstride; - uint16_t *orig = p0, *end = p2; - const int *matrix = s->matrix[plane]; - const float rdiv = s->rdiv[plane]; - const float bias = s->bias[plane]; - int y, x; + uint16_t *dst = (uint16_t *)dstp; + int x; - line_copy16(p0, src + stride * (slice_start == 0 ? 1 : -1), width, 1); - line_copy16(p1, src, width, 1); - - for (y = slice_start; y < slice_end; y++) { - src += stride * (y < height - 1 ? 1 : -1); - line_copy16(p2, src, width, 1); - - for (x = 0; x < width; x++) { - int sum = p0[x - 1] * matrix[0] + - p0[x] * matrix[1] + - p0[x + 1] * matrix[2] + - p1[x - 1] * matrix[3] + - p1[x] * matrix[4] + - p1[x + 1] * matrix[5] + - p2[x - 1] * matrix[6] + - p2[x] * matrix[7] + - p2[x + 1] * matrix[8]; - sum = (int)(sum * rdiv + bias + 0.5f); - dst[x] = av_clip(sum, 0, peak); - } - - p0 = p1; - p1 = p2; - p2 = (p2 == end) ? orig: p2 + bstride; - dst += out->linesize[plane] / 2; + for (x = 0; x < width; x++) { + int sum = AV_RN16A(&c[0][2 * x]) * matrix[0] + + AV_RN16A(&c[1][2 * x]) * matrix[1] + + AV_RN16A(&c[2][2 * x]) * matrix[2] + + AV_RN16A(&c[3][2 * x]) * matrix[3] + + AV_RN16A(&c[4][2 * x]) * matrix[4] + + AV_RN16A(&c[5][2 * x]) * matrix[5] + + AV_RN16A(&c[6][2 * x]) * matrix[6] + + AV_RN16A(&c[7][2 * x]) * matrix[7] + + AV_RN16A(&c[8][2 * x]) * matrix[8]; + sum = (int)(sum * rdiv + bias + 0.5f); + dst[x] = av_clip(sum, 0, peak); } - - return 0; } -static int filter16_5x5(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) +static void filter16_5x5(uint8_t *dstp, const uint8_t *src, int width, + float rdiv, float bias, const int *const matrix, + const uint8_t *c[], int peak) +{ + uint16_t *dst = (uint16_t *)dstp; + int x; + + for (x = 0; x < width; x++) { + int i, sum = 0; + + for (i = 0; i < 25; i++) + sum += AV_RN16A(&c[i][2 * x]) * matrix[i]; + + sum = (int)(sum * rdiv + bias + 0.5f); + dst[x] = av_clip(sum, 0, peak); + } +} + +static void filter16_7x7(uint8_t *dstp, const uint8_t *src, int width, + float rdiv, float bias, const int *const matrix, + const uint8_t *c[], int peak) +{ + uint16_t *dst = (uint16_t *)dstp; + int x; + + for (x = 0; x < width; x++) { + int i, sum = 0; + + for (i = 0; i < 49; i++) + sum += AV_RN16A(&c[i][2 * x]) * matrix[i]; + + sum = (int)(sum * rdiv + bias + 0.5f); + dst[x] = av_clip(sum, 0, peak); + } +} + +static void filter_7x7(uint8_t *dst, const uint8_t *src, int width, + float rdiv, float bias, const int *const matrix, + const uint8_t *c[], int peak) +{ + int x; + + for (x = 0; x < width; x++) { + int i, sum = 0; + + for (i = 0; i < 49; i++) + sum += c[i][x] * matrix[i]; + + sum = (int)(sum * rdiv + bias + 0.5f); + dst[x] = av_clip_uint8(sum); + } +} + +static void filter_5x5(uint8_t *dst, const uint8_t *src, int width, + float rdiv, float bias, const int *const matrix, + const uint8_t *c[], int peak) +{ + int x; + + for (x = 0; x < width; x++) { + int i, sum = 0; + + for (i = 0; i < 25; i++) + sum += c[i][x] * matrix[i]; + + sum = (int)(sum * rdiv + bias + 0.5f); + dst[x] = av_clip_uint8(sum); + } +} + +static void filter_3x3(uint8_t *dst, const uint8_t *src, int width, + float rdiv, float bias, const int *const matrix, + const uint8_t *c[], int peak) +{ + const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2]; + const uint8_t *c3 = c[3], *c4 = c[4], *c5 = c[5]; + const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8]; + int x; + + for (x = 0; x < width; x++) { + int sum = c0[x] * matrix[0] + c1[x] * matrix[1] + c2[x] * matrix[2] + + c3[x] * matrix[3] + c4[x] * matrix[4] + c5[x] * matrix[5] + + c6[x] * matrix[6] + c7[x] * matrix[7] + c8[x] * matrix[8]; + sum = (int)(sum * rdiv + bias + 0.5f); + dst[x] = av_clip_uint8(sum); + } +} + +static void setup_3x3(const uint8_t *c[], const uint8_t *src, int stride, + int x, int w, int y, int h, int bpc) +{ + int i; + + for (i = 0; i < 9; i++) { + int xoff = FFABS(x + ((i % 3) - 1)); + int yoff = FFABS(y + (i / 3) - 1); + + xoff = xoff >= w ? 2 * w - 1 - xoff : xoff; + yoff = yoff >= h ? 2 * h - 1 - yoff : yoff; + + c[i] = src + xoff * bpc + yoff * stride; + } +} + +static void setup_5x5(const uint8_t *c[], const uint8_t *src, int stride, + int x, int w, int y, int h, int bpc) +{ + int i; + + for (i = 0; i < 25; i++) { + int xoff = FFABS(x + ((i % 5) - 2)); + int yoff = FFABS(y + (i / 5) - 2); + + xoff = xoff >= w ? 2 * w - 1 - xoff : xoff; + yoff = yoff >= h ? 2 * h - 1 - yoff : yoff; + + c[i] = src + xoff * bpc + yoff * stride; + } +} + +static void setup_7x7(const uint8_t *c[], const uint8_t *src, int stride, + int x, int w, int y, int h, int bpc) +{ + int i; + + for (i = 0; i < 49; i++) { + int xoff = FFABS(x + ((i % 7) - 3)); + int yoff = FFABS(y + (i / 7) - 3); + + xoff = xoff >= w ? 2 * w - 1 - xoff : xoff; + yoff = yoff >= h ? 2 * h - 1 - yoff : yoff; + + c[i] = src + xoff * bpc + yoff * stride; + } +} + +static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) { ConvolutionContext *s = ctx->priv; ThreadData *td = arg; AVFrame *in = td->in; AVFrame *out = td->out; - const int plane = td->plane; - const int peak = (1 << s->depth) - 1; - const int stride = in->linesize[plane] / 2; - const int bstride = s->bstride; - const int height = s->planeheight[plane]; - const int width = s->planewidth[plane]; - const int slice_start = (height * jobnr) / nb_jobs; - const int slice_end = (height * (jobnr+1)) / nb_jobs; - const uint16_t *src = (const uint16_t *)in->data[plane] + slice_start * stride; - uint16_t *dst = (uint16_t *)out->data[plane] + slice_start * (out->linesize[plane] / 2); - uint16_t *p0 = (uint16_t *)s->bptrs[jobnr] + 16; - uint16_t *p1 = p0 + bstride; - uint16_t *p2 = p1 + bstride; - uint16_t *p3 = p2 + bstride; - uint16_t *p4 = p3 + bstride; - uint16_t *orig = p0, *end = p4; - const int *matrix = s->matrix[plane]; - float rdiv = s->rdiv[plane]; - float bias = s->bias[plane]; - int y, x, i; + int plane; - line_copy16(p0, src + 2 * stride * (slice_start < 2 ? 1 : -1), width, 2); - line_copy16(p1, src + stride * (slice_start == 0 ? 1 : -1), width, 2); - line_copy16(p2, src, width, 2); - src += stride; - line_copy16(p3, src, width, 2); + for (plane = 0; plane < s->nb_planes; plane++) { + const int radius = s->size[plane] / 2; + const int height = s->planeheight[plane]; + const int width = s->planewidth[plane]; + const int stride = in->linesize[plane]; + const int dstride = out->linesize[plane]; + const int slice_start = (height * jobnr) / nb_jobs; + const int slice_end = (height * (jobnr+1)) / nb_jobs; + const float rdiv = s->rdiv[plane]; + const float bias = s->bias[plane]; + const uint8_t *src = in->data[plane]; + uint8_t *dst = out->data[plane] + slice_start * out->linesize[plane]; + const int *matrix = s->matrix[plane]; + const int bpc = s->bpc; + const uint8_t *c[49]; + int y, x; - for (y = slice_start; y < slice_end; y++) { - uint16_t *array[] = { - p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2, - p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2, - p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2, - p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2, - p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2 - }; + if (s->copy[plane]) { + av_image_copy_plane(dst, dstride, src + slice_start * stride, stride, + width * bpc, slice_end - slice_start); + continue; + } - src += stride * (y < height - 2 ? 1 : -1); - line_copy16(p4, src, width, 2); - - for (x = 0; x < width; x++) { - int sum = 0; - - for (i = 0; i < 25; i++) { - sum += *(array[i] + x) * matrix[i]; + for (y = slice_start; y < slice_end; y++) { + for (x = 0; x < radius; x++) { + s->setup[plane](c, src, stride, x, width, y, height, bpc); + s->filter[plane](dst + x * bpc, src, 1, rdiv, bias, matrix, c, s->max); } - sum = (int)(sum * rdiv + bias + 0.5f); - dst[x] = av_clip(sum, 0, peak); - } - - p0 = p1; - p1 = p2; - p2 = p3; - p3 = p4; - p4 = (p4 == end) ? orig: p4 + bstride; - dst += out->linesize[plane] / 2; - } - - return 0; -} - -static int filter16_7x7(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) -{ - ConvolutionContext *s = ctx->priv; - ThreadData *td = arg; - AVFrame *in = td->in; - AVFrame *out = td->out; - const int plane = td->plane; - const int peak = (1 << s->depth) - 1; - const int stride = in->linesize[plane] / 2; - const int bstride = s->bstride; - const int height = s->planeheight[plane]; - const int width = s->planewidth[plane]; - const int slice_start = (height * jobnr) / nb_jobs; - const int slice_end = (height * (jobnr+1)) / nb_jobs; - const uint16_t *src = (const uint16_t *)in->data[plane] + slice_start * stride; - uint16_t *dst = (uint16_t *)out->data[plane] + slice_start * (out->linesize[plane] / 2); - uint16_t *p0 = (uint16_t *)s->bptrs[jobnr] + 32; - uint16_t *p1 = p0 + bstride; - uint16_t *p2 = p1 + bstride; - uint16_t *p3 = p2 + bstride; - uint16_t *p4 = p3 + bstride; - uint16_t *p5 = p4 + bstride; - uint16_t *p6 = p5 + bstride; - uint16_t *orig = p0, *end = p6; - const int *matrix = s->matrix[plane]; - float rdiv = s->rdiv[plane]; - float bias = s->bias[plane]; - int y, x, i; - - line_copy16(p0, src + 3 * stride * (slice_start < 3 ? 1 : -1), width, 3); - line_copy16(p1, src + 2 * stride * (slice_start < 2 ? 1 : -1), width, 3); - line_copy16(p2, src + stride * (slice_start == 0 ? 1 : -1), width, 3); - line_copy16(p3, src, width, 3); - src += stride; - line_copy16(p4, src, width, 3); - src += stride; - line_copy16(p5, src, width, 3); - - for (y = slice_start; y < slice_end; y++) { - uint16_t *array[] = { - p0 - 3, p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2, p0 + 3, - p1 - 3, p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2, p1 + 3, - p2 - 3, p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2, p2 + 3, - p3 - 3, p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2, p3 + 3, - p4 - 3, p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2, p4 + 3, - p5 - 3, p5 - 2, p5 - 1, p5, p5 + 1, p5 + 2, p5 + 3, - p6 - 3, p6 - 2, p6 - 1, p6, p6 + 1, p6 + 2, p6 + 3, - }; - - src += stride * (y < height - 3 ? 1 : -1); - line_copy16(p6, src, width, 3); - - for (x = 0; x < width; x++) { - int sum = 0; - - for (i = 0; i < 25; i++) { - sum += *(array[i] + x) * matrix[i]; + s->setup[plane](c, src, stride, radius, width, y, height, bpc); + s->filter[plane](dst + radius * bpc, src, width - 2 * radius, + rdiv, bias, matrix, c, s->max); + for (x = width - radius; x < width; x++) { + s->setup[plane](c, src, stride, x, width, y, height, bpc); + s->filter[plane](dst + x * bpc, src, 1, rdiv, bias, matrix, c, s->max); } - sum = (int)(sum * rdiv + bias + 0.5f); - dst[x] = av_clip(sum, 0, peak); + dst += dstride; } - - p0 = p1; - p1 = p2; - p2 = p3; - p3 = p4; - p4 = p5; - p5 = p6; - p6 = (p6 == end) ? orig: p6 + bstride; - dst += out->linesize[plane] / 2; - } - - return 0; -} - -static int filter_3x3(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) -{ - ConvolutionContext *s = ctx->priv; - ThreadData *td = arg; - AVFrame *in = td->in; - AVFrame *out = td->out; - const int plane = td->plane; - const int stride = in->linesize[plane]; - const int bstride = s->bstride; - const int height = s->planeheight[plane]; - const int width = s->planewidth[plane]; - const int slice_start = (height * jobnr) / nb_jobs; - const int slice_end = (height * (jobnr+1)) / nb_jobs; - const uint8_t *src = in->data[plane] + slice_start * stride; - uint8_t *dst = out->data[plane] + slice_start * out->linesize[plane]; - uint8_t *p0 = s->bptrs[jobnr] + 16; - uint8_t *p1 = p0 + bstride; - uint8_t *p2 = p1 + bstride; - uint8_t *orig = p0, *end = p2; - const int *matrix = s->matrix[plane]; - const float rdiv = s->rdiv[plane]; - const float bias = s->bias[plane]; - int y, x; - - line_copy8(p0, src + stride * (slice_start == 0 ? 1 : -1), width, 1); - line_copy8(p1, src, width, 1); - - for (y = slice_start; y < slice_end; y++) { - src += stride * (y < height - 1 ? 1 : -1); - line_copy8(p2, src, width, 1); - - for (x = 0; x < width; x++) { - int sum = p0[x - 1] * matrix[0] + - p0[x] * matrix[1] + - p0[x + 1] * matrix[2] + - p1[x - 1] * matrix[3] + - p1[x] * matrix[4] + - p1[x + 1] * matrix[5] + - p2[x - 1] * matrix[6] + - p2[x] * matrix[7] + - p2[x + 1] * matrix[8]; - sum = (int)(sum * rdiv + bias + 0.5f); - dst[x] = av_clip_uint8(sum); - } - - p0 = p1; - p1 = p2; - p2 = (p2 == end) ? orig: p2 + bstride; - dst += out->linesize[plane]; - } - - return 0; -} - -static int filter_5x5(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) -{ - ConvolutionContext *s = ctx->priv; - ThreadData *td = arg; - AVFrame *in = td->in; - AVFrame *out = td->out; - const int plane = td->plane; - const int stride = in->linesize[plane]; - const int bstride = s->bstride; - const int height = s->planeheight[plane]; - const int width = s->planewidth[plane]; - const int slice_start = (height * jobnr) / nb_jobs; - const int slice_end = (height * (jobnr+1)) / nb_jobs; - const uint8_t *src = in->data[plane] + slice_start * stride; - uint8_t *dst = out->data[plane] + slice_start * out->linesize[plane]; - uint8_t *p0 = s->bptrs[jobnr] + 16; - uint8_t *p1 = p0 + bstride; - uint8_t *p2 = p1 + bstride; - uint8_t *p3 = p2 + bstride; - uint8_t *p4 = p3 + bstride; - uint8_t *orig = p0, *end = p4; - const int *matrix = s->matrix[plane]; - float rdiv = s->rdiv[plane]; - float bias = s->bias[plane]; - int y, x, i; - - line_copy8(p0, src + 2 * stride * (slice_start < 2 ? 1 : -1), width, 2); - line_copy8(p1, src + stride * (slice_start == 0 ? 1 : -1), width, 2); - line_copy8(p2, src, width, 2); - src += stride; - line_copy8(p3, src, width, 2); - - - for (y = slice_start; y < slice_end; y++) { - uint8_t *array[] = { - p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2, - p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2, - p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2, - p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2, - p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2 - }; - - src += stride * (y < height - 2 ? 1 : -1); - line_copy8(p4, src, width, 2); - - for (x = 0; x < width; x++) { - int sum = 0; - - for (i = 0; i < 25; i++) { - sum += *(array[i] + x) * matrix[i]; - } - sum = (int)(sum * rdiv + bias + 0.5f); - dst[x] = av_clip_uint8(sum); - } - - p0 = p1; - p1 = p2; - p2 = p3; - p3 = p4; - p4 = (p4 == end) ? orig: p4 + bstride; - dst += out->linesize[plane]; - } - - return 0; -} - -static int filter_7x7(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) -{ - ConvolutionContext *s = ctx->priv; - ThreadData *td = arg; - AVFrame *in = td->in; - AVFrame *out = td->out; - const int plane = td->plane; - const int stride = in->linesize[plane]; - const int bstride = s->bstride; - const int height = s->planeheight[plane]; - const int width = s->planewidth[plane]; - const int slice_start = (height * jobnr) / nb_jobs; - const int slice_end = (height * (jobnr+1)) / nb_jobs; - const uint8_t *src = in->data[plane] + slice_start * stride; - uint8_t *dst = out->data[plane] + slice_start * out->linesize[plane]; - uint8_t *p0 = s->bptrs[jobnr] + 32; - uint8_t *p1 = p0 + bstride; - uint8_t *p2 = p1 + bstride; - uint8_t *p3 = p2 + bstride; - uint8_t *p4 = p3 + bstride; - uint8_t *p5 = p4 + bstride; - uint8_t *p6 = p5 + bstride; - uint8_t *orig = p0, *end = p6; - const int *matrix = s->matrix[plane]; - float rdiv = s->rdiv[plane]; - float bias = s->bias[plane]; - int y, x, i; - - line_copy8(p0, src + 3 * stride * (slice_start < 3 ? 1 : -1), width, 3); - line_copy8(p1, src + 2 * stride * (slice_start < 2 ? 1 : -1), width, 3); - line_copy8(p2, src + stride * (slice_start == 0 ? 1 : -1), width, 3); - line_copy8(p3, src, width, 3); - src += stride; - line_copy8(p4, src, width, 3); - src += stride; - line_copy8(p5, src, width, 3); - - for (y = slice_start; y < slice_end; y++) { - uint8_t *array[] = { - p0 - 3, p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2, p0 + 3, - p1 - 3, p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2, p1 + 3, - p2 - 3, p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2, p2 + 3, - p3 - 3, p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2, p3 + 3, - p4 - 3, p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2, p4 + 3, - p5 - 3, p5 - 2, p5 - 1, p5, p5 + 1, p5 + 2, p5 + 3, - p6 - 3, p6 - 2, p6 - 1, p6, p6 + 1, p6 + 2, p6 + 3, - }; - - src += stride * (y < height - 3 ? 1 : -1); - line_copy8(p6, src, width, 3); - - for (x = 0; x < width; x++) { - int sum = 0; - - for (i = 0; i < 49; i++) { - sum += *(array[i] + x) * matrix[i]; - } - sum = (int)(sum * rdiv + bias + 0.5f); - dst[x] = av_clip_uint8(sum); - } - - p0 = p1; - p1 = p2; - p2 = p3; - p3 = p4; - p4 = p5; - p5 = p6; - p6 = (p6 == end) ? orig: p6 + bstride; - dst += out->linesize[plane]; } return 0; @@ -870,6 +444,7 @@ static int config_input(AVFilterLink *inlink) int p; s->depth = desc->comp[0].depth; + s->max = (1 << s->depth) - 1; s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w); s->planewidth[0] = s->planewidth[3] = inlink->w; @@ -878,19 +453,7 @@ static int config_input(AVFilterLink *inlink) s->nb_planes = av_pix_fmt_count_planes(inlink->format); s->nb_threads = ff_filter_get_nb_threads(ctx); - s->bptrs = av_calloc(s->nb_threads, sizeof(*s->bptrs)); - if (!s->bptrs) - return AVERROR(ENOMEM); - - s->bstride = s->planewidth[0] + 64; s->bpc = (s->depth + 7) / 8; - s->buffer = av_malloc_array(7 * s->bstride * s->nb_threads, s->bpc); - if (!s->buffer) - return AVERROR(ENOMEM); - - for (p = 0; p < s->nb_threads; p++) { - s->bptrs[p] = s->buffer + 7 * s->bstride * s->bpc * p; - } if (!strcmp(ctx->filter->name, "convolution")) { if (s->depth > 8) { @@ -926,7 +489,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) ConvolutionContext *s = ctx->priv; AVFilterLink *outlink = ctx->outputs[0]; AVFrame *out; - int plane; + ThreadData td; out = ff_get_video_buffer(outlink, outlink->w, outlink->h); if (!out) { @@ -935,22 +498,9 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) } av_frame_copy_props(out, in); - for (plane = 0; plane < s->nb_planes; plane++) { - ThreadData td; - - if (s->copy[plane]) { - av_image_copy_plane(out->data[plane], out->linesize[plane], - in->data[plane], in->linesize[plane], - s->planewidth[plane] * s->bpc, - s->planeheight[plane]); - continue; - } - - td.in = in; - td.out = out; - td.plane = plane; - ctx->internal->execute(ctx, s->filter[plane], &td, NULL, FFMIN(s->planeheight[plane], s->nb_threads)); - } + td.in = in; + td.out = out; + ctx->internal->execute(ctx, filter_slice, &td, NULL, FFMIN(s->planeheight[1], s->nb_threads)); av_frame_free(&in); return ff_filter_frame(outlink, out); @@ -984,18 +534,21 @@ static av_cold int init(AVFilterContext *ctx) s->copy[i] = 1; else s->filter[i] = filter_3x3; + s->setup[i] = setup_3x3; } else if (s->matrix_length[i] == 25) { s->size[i] = 5; if (!memcmp(matrix, same5x5, sizeof(same5x5))) s->copy[i] = 1; else s->filter[i] = filter_5x5; + s->setup[i] = setup_5x5; } else if (s->matrix_length[i] == 49) { s->size[i] = 7; if (!memcmp(matrix, same7x7, sizeof(same7x7))) s->copy[i] = 1; else s->filter[i] = filter_7x7; + s->setup[i] = setup_7x7; } else { return AVERROR(EINVAL); } @@ -1014,6 +567,10 @@ static av_cold int init(AVFilterContext *ctx) s->filter[i] = filter_prewitt; else s->copy[i] = 1; + s->size[i] = 3; + s->setup[i] = setup_3x3; + s->rdiv[i] = s->scale; + s->bias[i] = s->delta; } } else if (!strcmp(ctx->filter->name, "roberts")) { for (i = 0; i < 4; i++) { @@ -1021,6 +578,10 @@ static av_cold int init(AVFilterContext *ctx) s->filter[i] = filter_roberts; else s->copy[i] = 1; + s->size[i] = 3; + s->setup[i] = setup_3x3; + s->rdiv[i] = s->scale; + s->bias[i] = s->delta; } } else if (!strcmp(ctx->filter->name, "sobel")) { for (i = 0; i < 4; i++) { @@ -1028,20 +589,16 @@ static av_cold int init(AVFilterContext *ctx) s->filter[i] = filter_sobel; else s->copy[i] = 1; + s->size[i] = 3; + s->setup[i] = setup_3x3; + s->rdiv[i] = s->scale; + s->bias[i] = s->delta; } } return 0; } -static av_cold void uninit(AVFilterContext *ctx) -{ - ConvolutionContext *s = ctx->priv; - - av_freep(&s->bptrs); - av_freep(&s->buffer); -} - static const AVFilterPad convolution_inputs[] = { { .name = "default", @@ -1068,7 +625,6 @@ AVFilter ff_vf_convolution = { .priv_size = sizeof(ConvolutionContext), .priv_class = &convolution_class, .init = init, - .uninit = uninit, .query_formats = query_formats, .inputs = convolution_inputs, .outputs = convolution_outputs, @@ -1094,7 +650,6 @@ AVFilter ff_vf_prewitt = { .priv_size = sizeof(ConvolutionContext), .priv_class = &prewitt_class, .init = init, - .uninit = uninit, .query_formats = query_formats, .inputs = convolution_inputs, .outputs = convolution_outputs, @@ -1120,7 +675,6 @@ AVFilter ff_vf_sobel = { .priv_size = sizeof(ConvolutionContext), .priv_class = &sobel_class, .init = init, - .uninit = uninit, .query_formats = query_formats, .inputs = convolution_inputs, .outputs = convolution_outputs, @@ -1146,7 +700,6 @@ AVFilter ff_vf_roberts = { .priv_size = sizeof(ConvolutionContext), .priv_class = &roberts_class, .init = init, - .uninit = uninit, .query_formats = query_formats, .inputs = convolution_inputs, .outputs = convolution_outputs,