avfilter: add showcwt multimedia filter

2025-11-23 21:54:53 +02:00 · 2022-11-19 19:01:23 +01:00
parent 93810a625c
commit d34c1b389e
6 changed files with 938 additions and 2 deletions
--- a/1
+++ b/1
@@ -25,6 +25,7 @@ version <next>:
 - oneVPL support for QSV
 - QSV AV1 encoder
 - QSV decoding and encoding for 10/12bit 422, 10/12bit 444 HEVC and VP9
+- showcwt multimedia filter


 version 5.1:
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -29274,6 +29274,102 @@ axisfile=myaxis.png:basefreq=40:endfreq=10000
@end example
@end itemize

+@section showcwt
+
+Convert input audio to video output representing the frequency spectrum,
+using the Continuous Wavelet Transform with a Morlet wavelet.
+
+The filter accepts the following options:
+
+@table @option
+@item size, s
+Specify the video size for the output. For the syntax of this option,
+check the @ref{video size syntax,,"Video size" section in the ffmpeg-utils manual,ffmpeg-utils}.
+Default value is @code{640x512}.
+
+@item rate, r
+Set the output frame rate. Default value is @code{25}.
+
+@item scale
+Set the frequency scale used. Allowed values are:
+
+@table @option
+@item linear
+@item log2
+@item bark
+@item mel
+@item erbs
+@end table
+Default value is @code{linear}.
+
+@item min
+Set the minimum frequency that will be used in output.
+Default is @code{20} Hz.
+
+@item max
+Set the maximum frequency that will be used in output.
+Default is @code{20000} Hz. The effective upper limit depends on
+the input audio's sample rate: a value greater than the Nyquist
+frequency is clamped to the Nyquist frequency.
+
+@item logb
+Set the logarithmic basis for brightness strength when
+mapping calculated magnitude values to pixel values.
+Allowed range is from @code{0} to @code{1}.
+Default value is @code{0.0001}.
+
+@item deviation
+Set the frequency deviation.
+Values lower than @code{1} are more frequency oriented,
+while values higher than @code{1} are more time oriented.
+Allowed range is from @code{0} to @code{10}.
+Default value is @code{1}.
+
+@item pps
+Set the number of pixels output per second in one row.
+Allowed range is from @code{1} to @code{1024}.
+Default value is @code{64}.
+
+@item mode
+Set the output visual mode. Allowed values are:
+
+@table @option
+@item magnitude
+Show magnitude.
+@item phase
+Show only phase.
+@item magphase
+Show combination of magnitude and phase.
+Magnitude is mapped to brightness and phase to color.
+@item channel
+Show unique color per channel magnitude.
+@end table
+
+Default value is @code{magnitude}.
+
+@item slide
+Set the output slide method. Allowed values are:
+
+@table @option
+@item replace
+@item scroll
+@end table
+
+@item direction
+Set the direction in which the output slides. Allowed values are:
+
+@table @option
+@item lr
+Direction from left to right.
+@item rl
+Direction from right to left.
+@item ud
+Direction from up to down.
+@item du
+Direction from down to up.
+@end table
+@end table
+
@section showfreqs

 Convert input audio to video output representing the audio power spectrum.
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -593,6 +593,7 @@ OBJS-$(CONFIG_APHASEMETER_FILTER)            += avf_aphasemeter.o
 OBJS-$(CONFIG_AVECTORSCOPE_FILTER)           += avf_avectorscope.o
 OBJS-$(CONFIG_CONCAT_FILTER)                 += avf_concat.o
 OBJS-$(CONFIG_SHOWCQT_FILTER)                += avf_showcqt.o lswsutils.o lavfutils.o
+OBJS-$(CONFIG_SHOWCWT_FILTER)                += avf_showcwt.o
 OBJS-$(CONFIG_SHOWFREQS_FILTER)              += avf_showfreqs.o
 OBJS-$(CONFIG_SHOWSPATIAL_FILTER)            += avf_showspatial.o
 OBJS-$(CONFIG_SHOWSPECTRUM_FILTER)           += avf_showspectrum.o
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -558,6 +558,7 @@ extern const AVFilter ff_avf_aphasemeter;
 extern const AVFilter ff_avf_avectorscope;
 extern const AVFilter ff_avf_concat;
 extern const AVFilter ff_avf_showcqt;
+extern const AVFilter ff_avf_showcwt;
 extern const AVFilter ff_avf_showfreqs;
 extern const AVFilter ff_avf_showspatial;
 extern const AVFilter ff_avf_showspectrum;
--- a/libavfilter/avf_showcwt.c
+++ b/libavfilter/avf_showcwt.c
@@ -0,0 +1,837 @@
+/*
+ * Copyright (c) 2022 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <float.h>
+#include <math.h>
+
+#include "libavutil/tx.h"
+#include "libavutil/avassert.h"
+#include "libavutil/avstring.h"
+#include "libavutil/channel_layout.h"
+#include "libavutil/cpu.h"
+#include "libavutil/opt.h"
+#include "libavutil/parseutils.h"
+#include "audio.h"
+#include "video.h"
+#include "avfilter.h"
+#include "filters.h"
+#include "internal.h"
+
+enum FrequencyScale { /* frequency axis mappings selectable through the "scale" option */
+    FSCALE_LINEAR, /* no case in frequency_band(): band frequencies stay linearly spaced */
+    FSCALE_LOG2, /* frequency_band() maps bands with powf(2, x) */
+    FSCALE_BARK, /* frequency_band() maps bands with 600 * sinhf(x / 6) */
+    FSCALE_MEL, /* frequency_band() maps bands with 700 * (10^(x / 2595) - 1) */
+    FSCALE_ERBS, /* frequency_band() maps bands with its FSCALE_ERBS expression */
+    NB_FSCALE /* number of scales; NB_FSCALE-1 is the upper bound of the "scale" option */
+};
+
+enum DirectionMode { /* values of the "direction" option */
+    DIRECTION_LR, /* left to right: frequency bands are picture rows, pos is a column */
+    DIRECTION_RL, /* right to left: frequency bands are picture rows, pos is a column */
+    DIRECTION_UD, /* up to down: frequency bands run along the width, pos is a row */
+    DIRECTION_DU, /* down to up: frequency bands run along the width, pos is a row */
+    NB_DIRECTION /* number of directions; NB_DIRECTION-1 is the option's upper bound */
+};
+
+enum SlideMode { /* values of the "slide" option */
+    SLIDE_REPLACE, /* picture stays in place; activate() advances pos and wraps it around */
+    SLIDE_SCROLL, /* existing picture is shifted by one pixel and pos is reset to a fixed edge */
+    NB_SLIDE /* number of slide modes; NB_SLIDE-1 is the option's upper bound */
+};
+
+typedef struct ShowCWTContext { /* private state of the showcwt filter */
+    const AVClass *class; /* class pointer expected by the AVOption system (see .priv_class) */
+    int w, h; /* output video size ("size" option) */
+    int mode; /* output visual mode ("mode" option, 0..3), switched on in draw() */
+    char *rate_str; /* "rate" option as given by the user; "auto" selects auto_frame_rate */
+    AVRational auto_frame_rate; /* sample_rate / hop_size, computed in config_output() */
+    AVRational frame_rate; /* frame rate set on the output link */
+    AVTXContext *fft; /* forward transform context (length input_padding_size) */
+    AVTXContext **ifft; /* nb_threads inverse transform contexts, one per slice job */
+    av_tx_fn tx_fn; /* transform function belonging to fft */
+    av_tx_fn itx_fn; /* transform function belonging to the ifft contexts */
+    int fft_in_size; /* input_padding_size rounded up with FFALIGN(…, av_cpu_max_align()) */
+    int fft_out_size; /* same rounding as fft_in_size */
+    int ifft_in_size; /* output_padding_size rounded up with FFALIGN(…, av_cpu_max_align()) */
+    int ifft_out_size; /* same rounding as ifft_in_size */
+    int pos; /* current drawing column (LR/RL) or row (UD/DU) in outpicref */
+    int in_nb_samples; /* nb_samples of the last consumed input frame */
+    int64_t in_pts; /* pts of the last consumed input frame */
+    int64_t old_pts; /* pts of the last picture sent downstream; AV_NOPTS_VALUE initially */
+    float *frequency_band; /* two floats per band: [2*y] frequency, [2*y+1] derivative * deviation */
+    AVFrame *kernel; /* one plane per band holding the real-valued kernel weights */
+    unsigned *index; /* index[n] = n % output_sample_count, filled by compute_kernel() */
+    int *kernel_start; /* per band: first bin with a non-zero kernel weight */
+    int *kernel_stop; /* per band: one past the last bin with a non-zero kernel weight */
+    AVFrame *overlap; /* per channel: sliding window of the last input_padding_size samples */
+    AVFrame *outpicref; /* persistent picture drawn into; copies of it are sent downstream */
+    AVFrame *fft_in; /* per channel: complex input of the forward transform */
+    AVFrame *fft_out; /* per channel: complex output of the forward transform */
+    AVFrame *ifft_in; /* per band: complex input of the inverse transform */
+    AVFrame *ifft_out; /* per band: complex output of the inverse transform */
+    AVFrame *ch_out; /* per channel: ihop_size complex values for each band */
+    int nb_threads; /* FFMIN(frequency_band_count, ff_filter_get_nb_threads(ctx)) */
+    int nb_channels; /* channel count of the input link */
+    int nb_consumed_samples; /* set to 65536 in config_output() */
+    int pps; /* "pps" option: pixels per second */
+    int slide; /* "slide" option, an enum SlideMode value */
+    int direction; /* "direction" option, an enum DirectionMode value */
+    int hop_size; /* nb_consumed_samples >> 1; samples requested from the input per step */
+    int ihop_size; /* output_padding_size >> 1; values kept per band from each inverse transform */
+    int ihop_index; /* index of the value within the current hop that draw() renders next */
+    int input_padding_size; /* forward transform length (65536) */
+    int input_sample_count; /* equals nb_consumed_samples; kernel length used by compute_kernel() */
+    int output_padding_size; /* inverse transform length: FFMAX(16, input_padding_size * pps / sample_rate) */
+    int output_sample_count; /* equals output_padding_size */
+    int frequency_band_count; /* h for DIRECTION_LR/RL, w for DIRECTION_UD/DU */
+    float logarithmic_basis; /* "logb" option */
+    int frequency_scale; /* "scale" option, an enum FrequencyScale value */
+    float minimum_frequency; /* "min" option, in Hz */
+    float maximum_frequency; /* "max" option, in Hz; limited to sample_rate / 2 in config_output() */
+    float deviation; /* "deviation" option */
+} ShowCWTContext;
+
+#define OFFSET(x) offsetof(ShowCWTContext, x) /* byte offset of an option's backing field */
+#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM /* flags shared by every option below */
+
+static const AVOption showcwt_options[] = { /* user-visible options; short names alias the long ones */
+    { "size", "set video size", OFFSET(w), AV_OPT_TYPE_IMAGE_SIZE, {.str = "640x512"}, 0, 0, FLAGS },
+    { "s",    "set video size", OFFSET(w), AV_OPT_TYPE_IMAGE_SIZE, {.str = "640x512"}, 0, 0, FLAGS },
+    { "rate", "set video rate",  OFFSET(rate_str), AV_OPT_TYPE_STRING, {.str = "25"}, 0, 0, FLAGS }, /* kept as a string: config_output() also accepts "auto" */
+    { "r",    "set video rate",  OFFSET(rate_str), AV_OPT_TYPE_STRING, {.str = "25"}, 0, 0, FLAGS },
+    { "scale", "set frequency scale", OFFSET(frequency_scale), AV_OPT_TYPE_INT,  {.i64=0}, 0, NB_FSCALE-1, FLAGS, "scale" },
+    {  "linear",  "linear",           0,                       AV_OPT_TYPE_CONST,{.i64=FSCALE_LINEAR}, 0, 0, FLAGS, "scale" },
+    {  "log2",    "logarithmic",      0,                       AV_OPT_TYPE_CONST,{.i64=FSCALE_LOG2},   0, 0, FLAGS, "scale" },
+    {  "bark",    "bark",             0,                       AV_OPT_TYPE_CONST,{.i64=FSCALE_BARK},   0, 0, FLAGS, "scale" },
+    {  "mel",     "mel",              0,                       AV_OPT_TYPE_CONST,{.i64=FSCALE_MEL},    0, 0, FLAGS, "scale" },
+    {  "erbs",    "erbs",             0,                       AV_OPT_TYPE_CONST,{.i64=FSCALE_ERBS},   0, 0, FLAGS, "scale" },
+    { "min",  "set minimum frequency", OFFSET(minimum_frequency), AV_OPT_TYPE_FLOAT, {.dbl = 20.}, 1, 2000, FLAGS },
+    { "max",  "set maximum frequency", OFFSET(maximum_frequency), AV_OPT_TYPE_FLOAT, {.dbl = 20000.}, 0, 192000, FLAGS },
+    { "logb", "set logarithmic basis", OFFSET(logarithmic_basis), AV_OPT_TYPE_FLOAT, {.dbl = 0.0001}, 0, 1, FLAGS },
+    { "deviation", "set frequency deviation", OFFSET(deviation), AV_OPT_TYPE_FLOAT, {.dbl = 1.}, 0, 10, FLAGS },
+    { "pps",  "set pixels per second", OFFSET(pps), AV_OPT_TYPE_INT, {.i64 = 64}, 1, 1024, FLAGS },
+    { "mode", "set output mode", OFFSET(mode), AV_OPT_TYPE_INT,  {.i64=0}, 0, 3, FLAGS, "mode" }, /* values 0..3 match the switch in draw() */
+    {  "magnitude", "magnitude",         0, AV_OPT_TYPE_CONST,{.i64=0}, 0, 0, FLAGS, "mode" },
+    {  "phase",     "phase",             0, AV_OPT_TYPE_CONST,{.i64=1}, 0, 0, FLAGS, "mode" },
+    {  "magphase",  "magnitude+phase",   0, AV_OPT_TYPE_CONST,{.i64=2}, 0, 0, FLAGS, "mode" },
+    {  "channel",   "color per channel", 0, AV_OPT_TYPE_CONST,{.i64=3}, 0, 0, FLAGS, "mode" },
+    { "slide", "set slide mode", OFFSET(slide), AV_OPT_TYPE_INT,  {.i64=0}, 0, NB_SLIDE-1, FLAGS, "slide" },
+    {  "replace", "replace", 0, AV_OPT_TYPE_CONST,{.i64=SLIDE_REPLACE},0, 0, FLAGS, "slide" },
+    {  "scroll",  "scroll",  0, AV_OPT_TYPE_CONST,{.i64=SLIDE_SCROLL}, 0, 0, FLAGS, "slide" },
+    { "direction", "set direction mode", OFFSET(direction), AV_OPT_TYPE_INT,  {.i64=0}, 0, NB_DIRECTION-1, FLAGS, "direction" },
+    {  "lr", "left to right", 0, AV_OPT_TYPE_CONST,{.i64=DIRECTION_LR}, 0, 0, FLAGS, "direction" },
+    {  "rl", "right to left", 0, AV_OPT_TYPE_CONST,{.i64=DIRECTION_RL}, 0, 0, FLAGS, "direction" },
+    {  "ud", "up to down",    0, AV_OPT_TYPE_CONST,{.i64=DIRECTION_UD}, 0, 0, FLAGS, "direction" },
+    {  "du", "down to up",    0, AV_OPT_TYPE_CONST,{.i64=DIRECTION_DU}, 0, 0, FLAGS, "direction" },
+    { NULL } /* table terminator */
+};
+
+AVFILTER_DEFINE_CLASS(showcwt); /* provides showcwt_class, referenced by .priv_class */
+
+/**
+ * Release every buffer, frame and transform context owned by the filter.
+ *
+ * Also called at the start of config_output(), so it must leave the context
+ * in a state from which everything can be allocated again: all pointers are
+ * reset to NULL by the av_freep()/av_frame_free()/av_tx_uninit() calls.
+ *
+ * @param ctx filter context whose private data is a ShowCWTContext
+ */
+static av_cold void uninit(AVFilterContext *ctx)
+{
+    ShowCWTContext *s = ctx->priv;
+
+    av_freep(&s->frequency_band);
+    av_freep(&s->kernel_start);
+    av_freep(&s->kernel_stop);
+    av_freep(&s->index);
+
+    av_frame_free(&s->kernel);
+    av_frame_free(&s->overlap);
+    av_frame_free(&s->outpicref);
+    av_frame_free(&s->fft_in);
+    av_frame_free(&s->fft_out);
+    av_frame_free(&s->ifft_in);
+    av_frame_free(&s->ifft_out);
+    av_frame_free(&s->ch_out);
+    av_tx_uninit(&s->fft);
+
+    if (s->ifft) {
+        /* nb_threads still holds the count the array was allocated with. */
+        for (int n = 0; n < s->nb_threads; n++)
+            av_tx_uninit(&s->ifft[n]);
+        /* The pointer array itself comes from av_calloc() in config_output();
+         * without freeing it here it leaks on teardown and on every
+         * reconfiguration. */
+        av_freep(&s->ifft);
+    }
+}
+
+/**
+ * Declare the formats supported on both pads.
+ *
+ * Input: planar float samples, any channel count, any sample rate.
+ * Output: YUV444P, YUVJ444P or YUVA444P.
+ *
+ * @param ctx filter context
+ * @return 0 on success, a negative error code from the *_ref() helpers otherwise
+ */
+static int query_formats(AVFilterContext *ctx)
+{
+    static const enum AVSampleFormat sample_fmts[] = { AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_NONE };
+    static const enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVA444P, AV_PIX_FMT_NONE };
+    AVFilterLink *audio_in = ctx->inputs[0];
+    AVFilterLink *video_out = ctx->outputs[0];
+    AVFilterChannelLayouts *channel_counts = NULL;
+    AVFilterFormats *list = NULL;
+    int ret;
+
+    /* audio side: sample format */
+    list = ff_make_format_list(sample_fmts);
+    ret = ff_formats_ref(list, &audio_in->outcfg.formats);
+    if (ret < 0)
+        return ret;
+
+    /* audio side: channel layouts */
+    channel_counts = ff_all_channel_counts();
+    ret = ff_channel_layouts_ref(channel_counts, &audio_in->outcfg.channel_layouts);
+    if (ret < 0)
+        return ret;
+
+    /* audio side: sample rates */
+    list = ff_all_samplerates();
+    ret = ff_formats_ref(list, &audio_in->outcfg.samplerates);
+    if (ret < 0)
+        return ret;
+
+    /* video side: pixel format */
+    list = ff_make_format_list(pix_fmts);
+    ret = ff_formats_ref(list, &video_out->incfg.formats);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+/**
+ * Fill the per-band table of centre frequency and scaled frequency derivative.
+ *
+ * Band y is placed at frequency_range * (1 - y / count) + frequency_offset in
+ * the domain of the selected scale and then mapped back through the inverse
+ * of that scale. Band 0 therefore gets the highest value.
+ *
+ * @param frequency_band       output, 2 * frequency_band_count floats:
+ *                             [2*y] frequency, [2*y+1] derivative * deviation
+ * @param frequency_band_count number of bands
+ * @param frequency_range      span covered by the bands, in the scale's domain
+ * @param frequency_offset     lowest value, in the scale's domain
+ * @param frequency_scale      one of enum FrequencyScale
+ * @param deviation            user deviation factor
+ */
+static void frequency_band(float *frequency_band,
+                           int frequency_band_count,
+                           float frequency_range,
+                           float frequency_offset,
+                           int frequency_scale, float deviation)
+{
+    /* Heisenberg Gabor Limit */
+    const float band_deviation = deviation * sqrtf(1.f / (4.f * M_PI));
+    /* spacing between neighbouring bands in the scale's domain */
+    const float step = frequency_range / frequency_band_count;
+    float *entry = frequency_band;
+
+    for (int band = 0; band < frequency_band_count; band++, entry += 2) {
+        float freq = frequency_range * (1.f - (float)band / frequency_band_count) + frequency_offset;
+        float derivative = step;
+
+        /* Undo the scale mapping; the derivative is evaluated at the
+         * already mapped frequency. */
+        switch (frequency_scale) {
+        case FSCALE_LOG2:
+            freq = powf(2.f, freq);
+            derivative = derivative * (logf(2.f) * freq);
+            break;
+        case FSCALE_BARK:
+            freq = 600.f * sinhf(freq / 6.f);
+            derivative = derivative * (sqrtf(freq * freq + 360000.f) / 6.f);
+            break;
+        case FSCALE_MEL:
+            freq = 700.f * (powf(10.f, freq / 2595.f) - 1.f);
+            derivative = derivative * ((freq + 700.f) * logf(10.f) / 2595.f);
+            break;
+        case FSCALE_ERBS:
+            freq = 676170.4f / (47.06538f - expf(freq * 0.08950404f)) - 14678.49f;
+            derivative = derivative * ((freq * freq + 14990.4 * freq + 4577850.f) / 160514.f);
+            break;
+        default:
+            /* FSCALE_LINEAR: nothing to undo */
+            break;
+        }
+
+        entry[0] = freq;
+        entry[1] = derivative * band_deviation;
+    }
+}
+
+#define cmul(operator, index) {           \
+    const float ff = kernel[index];       \
+    isrc[n].re operator ff*dst[index].re; \
+    isrc[n].im operator ff*dst[index].im; \
+} /* scales complex dst[index] by real kernel[index] and applies `operator` to isrc[n]; relies on kernel, dst, isrc and n at the expansion site */
+
+/**
+ * Map a value onto [0, 1] using the logarithm of its absolute value.
+ *
+ * With log_factor = 1 / logf(basis), as computed in draw(), and basis < 1,
+ * magnitudes of 1 and above give 1.0 and magnitudes at or below the basis
+ * give 0.0.
+ *
+ * @param value      input value; only its magnitude matters
+ * @param log_factor multiplier applied to logf(|value|)
+ * @return 1 - clip(logf(|value|) * log_factor, 0, 1)
+ */
+static float remap_log(float value, float log_factor)
+{
+    const float magnitude = fabsf(value);
+    const float scaled = logf(magnitude) * log_factor;
+
+    return 1.f - av_clipf(scaled, 0.f, 1.f);
+}
+
+/**
+ * Feed one channel of a new input frame into the sliding window and run the
+ * forward transform on it.
+ *
+ * The window is shifted left by hop_size samples, the new samples are
+ * appended, and any shortfall up to hop_size is zero-filled.
+ *
+ * @param ctx filter context
+ * @param arg the input AVFrame
+ * @param ch  channel to process
+ * @return always 0
+ */
+static int run_channel_cwt_prepare(AVFilterContext *ctx, void *arg, int ch)
+{
+    ShowCWTContext *s = ctx->priv;
+    AVFrame *fin = arg;
+    const int hop_size = s->hop_size;
+    const int kept = s->input_padding_size - hop_size;
+    const int nb_window_samples = s->nb_consumed_samples;
+    const float *samples = (const float *)fin->extended_data[ch];
+    float *window = (float *)s->overlap->extended_data[ch];
+    float *tail = window + kept;
+    AVComplexFloat *tx_in = (AVComplexFloat *)s->fft_in->extended_data[ch];
+    AVComplexFloat *tx_out = (AVComplexFloat *)s->fft_out->extended_data[ch];
+
+    /* drop the oldest hop, append the new samples, pad a short frame */
+    memmove(window, window + hop_size, kept * sizeof(float));
+    memcpy(tail, samples, fin->nb_samples * sizeof(float));
+    memset(tail + fin->nb_samples, 0,
+           (hop_size - fin->nb_samples) * sizeof(float));
+
+    /* real window -> complex transform input */
+    for (int i = 0; i < nb_window_samples; i++) {
+        tx_in[i].re = window[i];
+        tx_in[i].im = 0.f;
+    }
+
+    s->tx_fn(s->fft, tx_out, tx_in, sizeof(*tx_in));
+
+    return 0;
+}
+
+/**
+ * Slice job: render one line of the spectrogram into s->outpicref.
+ *
+ * Each job handles a contiguous range of frequency bands. For every band the
+ * complex value at s->ihop_index is read from s->ch_out and converted to
+ * pixel values according to s->mode. For DIRECTION_LR/RL a band is a picture
+ * row and s->pos the column; for DIRECTION_UD/DU s->pos is the row and the
+ * band selects the column, counted from the right edge.
+ *
+ * @param ctx     filter context
+ * @param arg     unused
+ * @param jobnr   index of this job
+ * @param nb_jobs total number of jobs
+ * @return always 0
+ */
+static int draw(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+{
+    ShowCWTContext *s = ctx->priv;
+    AVFrame *pic = s->outpicref;
+    const ptrdiff_t stride_y = pic->linesize[0];
+    const ptrdiff_t stride_u = pic->linesize[1];
+    const ptrdiff_t stride_v = pic->linesize[2];
+    const float log_factor = 1.f/logf(s->logarithmic_basis);
+    const int nb_bands = s->frequency_band_count;
+    const int first_band = (nb_bands * jobnr) / nb_jobs;
+    const int last_band = (nb_bands * (jobnr+1)) / nb_jobs;
+    const int column = s->ihop_index;
+    const int band_stride = s->ihop_size;
+    const int direction = s->direction;
+    const int horizontal = direction == DIRECTION_LR || direction == DIRECTION_RL;
+    const int mode = s->mode;
+    const int last_x = s->w - 1;
+    const int pos = s->pos;
+
+    for (int band = first_band; band < last_band; band++) {
+        const AVComplexFloat *px = ((const AVComplexFloat *)s->ch_out->extended_data[0]) +
+                                   band * band_stride + column;
+        uint8_t *py, *pu, *pv;
+        float Y, U, V;
+
+        if (horizontal) {
+            py = pic->data[0] + band * stride_y;
+            pu = pic->data[1] + band * stride_u;
+            pv = pic->data[2] + band * stride_v;
+
+            /* Horizontal scrolling shifts this band's row by one pixel;
+             * SLIDE_REPLACE needs no preparation. */
+            if (s->slide == SLIDE_SCROLL) {
+                if (direction == DIRECTION_RL) {
+                    memmove(py, py + 1, last_x);
+                    memmove(pu, pu + 1, last_x);
+                    memmove(pv, pv + 1, last_x);
+                } else {
+                    memmove(py + 1, py, last_x);
+                    memmove(pu + 1, pu, last_x);
+                    memmove(pv + 1, pv, last_x);
+                }
+            }
+
+            py += pos;
+            pu += pos;
+            pv += pos;
+        } else {
+            py = pic->data[0] + pos * stride_y + last_x - band;
+            pu = pic->data[1] + pos * stride_u + last_x - band;
+            pv = pic->data[2] + pos * stride_v + last_x - band;
+        }
+
+        switch (mode) {
+        case 3: /* channel: one hue per channel, weighted by its magnitude */
+            {
+                const int nb_channels = s->nb_channels;
+                const float weight = 1.f / nb_channels;
+
+                Y = 0.f;
+                U = V = 0.5f;
+                for (int ch = 0; ch < nb_channels; ch++) {
+                    const AVComplexFloat *cpx = ((const AVComplexFloat *)s->ch_out->extended_data[ch]) +
+                                                band * band_stride + column;
+                    float z;
+
+                    z = hypotf(cpx[0].re, cpx[0].im);
+                    z = remap_log(z, log_factor);
+
+                    Y += z * weight;
+                    U += z * weight * sinf(2.f * M_PI * ch * weight);
+                    V += z * weight * cosf(2.f * M_PI * ch * weight);
+                }
+
+                py[0] = av_clip_uint8(lrintf(Y * 255.f));
+                pu[0] = av_clip_uint8(lrintf(U * 255.f));
+                pv[0] = av_clip_uint8(lrintf(V * 255.f));
+            }
+            break;
+        case 2: /* magphase: magnitude as brightness, phase as colour */
+            Y = hypotf(px[0].re, px[0].im);
+            Y = remap_log(Y, log_factor);
+            U = atan2f(px[0].im, px[0].re);
+            U = 0.5f + 0.5f * U * Y / M_PI;
+            V = 1.f - U;
+
+            py[0] = av_clip_uint8(lrintf(Y * 255.f));
+            pu[0] = av_clip_uint8(lrintf(U * 255.f));
+            pv[0] = av_clip_uint8(lrintf(V * 255.f));
+            break;
+        case 1: /* phase: only the luma plane is written */
+            Y = atan2f(px[0].im, px[0].re);
+            Y = 0.5f + 0.5f * Y / M_PI;
+
+            py[0] = av_clip_uint8(lrintf(Y * 255.f));
+            break;
+        case 0: /* magnitude: only the luma plane is written */
+            Y = hypotf(px[0].re, px[0].im);
+            Y = remap_log(Y, log_factor);
+
+            py[0] = av_clip_uint8(lrintf(Y * 255.f));
+            break;
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Slice job: compute the wavelet coefficients of one channel for a range of
+ * frequency bands.
+ *
+ * For every band the forward-transform output is weighted by the band's
+ * kernel and folded into output_sample_count bins through s->index. The
+ * result is inverse-transformed and ihop_size values, starting at
+ * (output_padding_size - ihop_size) / 2, are stored in s->ch_out.
+ *
+ * @param ctx     filter context
+ * @param arg     pointer to an int holding the channel index
+ * @param jobnr   index of this job; selects the inverse transform context
+ * @param nb_jobs total number of jobs
+ * @return always 0
+ */
+static int run_channel_cwt(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+{
+    ShowCWTContext *s = ctx->priv;
+    const int ch = *(int *)arg;
+    const AVComplexFloat *spectrum = (const AVComplexFloat *)s->fft_out->extended_data[ch];
+    const unsigned *fold = (const unsigned *)s->index;
+    const int nb_bins = s->output_sample_count;
+    const int keep = s->ihop_size;
+    const int skip = (s->output_padding_size - keep) >> 1;
+    const int nb_bands = s->frequency_band_count;
+    const int first_band = (nb_bands * jobnr) / nb_jobs;
+    const int last_band = (nb_bands * (jobnr+1)) / nb_jobs;
+
+    for (int band = first_band; band < last_band; band++) {
+        AVComplexFloat *acc = (AVComplexFloat *)s->ifft_in->extended_data[band];
+        AVComplexFloat *time = (AVComplexFloat *)s->ifft_out->extended_data[band];
+        AVComplexFloat *result = ((AVComplexFloat *)s->ch_out->extended_data[ch]) + band * keep;
+        const float *weights = (const float *)s->kernel->extended_data[band];
+        const int from = s->kernel_start[band];
+        const int to = s->kernel_stop[band];
+
+        memset(acc, 0, sizeof(*acc) * nb_bins);
+
+        /* only the non-zero part of the kernel contributes */
+        for (int i = from; i < to; i++) {
+            const unsigned bin = fold[i];
+            const float w = weights[i];
+
+            acc[bin].re += w*spectrum[i].re;
+            acc[bin].im += w*spectrum[i].im;
+        }
+
+        s->itx_fn(s->ifft[jobnr], time, acc, sizeof(*acc));
+
+        for (int i = 0; i < keep; i++)
+            result[i] = time[skip + i];
+    }
+
+    return 0;
+}
+
+/**
+ * Build the per-band kernels and the bin folding table.
+ *
+ * For every band a Gaussian centred on the band frequency, with wrap-around
+ * distance over input_sample_count bins, is written to s->kernel. The first
+ * and one-past-last non-zero bins are recorded in s->kernel_start and
+ * s->kernel_stop; they are left untouched when the whole kernel is zero.
+ * Finally s->index[n] is set to n % output_sample_count.
+ *
+ * @param ctx filter context; s->frequency_band must already be filled
+ */
+static void compute_kernel(AVFilterContext *ctx)
+{
+    ShowCWTContext *s = ctx->priv;
+    const int size = s->input_sample_count;
+    const int nb_bins = s->output_sample_count;
+    const int nb_bands = s->frequency_band_count;
+    const float scale_factor = 1.f/(float)size;
+
+    for (int band = 0; band < nb_bands; band++) {
+        float *weights = (float *)s->kernel->extended_data[band];
+        const float centre = s->frequency_band[band*2];
+        const float deviation = 1.f / (s->frequency_band[band*2+1] *
+                                       nb_bins);
+
+        for (int n = 0; n < size; n++) {
+            float dist = fabsf(n-centre);
+
+            /* distance on the circular bin axis */
+            dist = size - fabsf(dist - size);
+            weights[n] = expf(-dist*dist*deviation) * scale_factor;
+        }
+
+        /* first bin that contributes */
+        for (int n = 0; n < size; n++) {
+            if (weights[n] == 0.f)
+                continue;
+            s->kernel_start[band] = n;
+            break;
+        }
+
+        /* one past the last bin that contributes */
+        for (int n = size; n > 0; n--) {
+            if (weights[n - 1] == 0.f)
+                continue;
+            s->kernel_stop[band] = n;
+            break;
+        }
+    }
+
+    for (int n = 0; n < size; n++)
+        s->index[n] = n % nb_bins;
+}
+
+/**
+ * Configure the video output link and (re)allocate all processing state.
+ *
+ * Derives the transform sizes from the input sample rate and the "pps"
+ * option, creates the forward and per-thread inverse transforms, allocates
+ * the working frames, clears the output picture, computes the frequency band
+ * table and kernels, and sets the output frame rate and time base.
+ *
+ * @param outlink the filter's video output link
+ * @return 0 on success, AVERROR(ENOMEM) or another negative error code on
+ *         failure (including an unparsable "rate" option)
+ */
+static int config_output(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    AVFilterLink *inlink = ctx->inputs[0];
+    ShowCWTContext *s = ctx->priv;
+    /* nothing above the Nyquist frequency can be shown */
+    float maximum_frequency = fminf(s->maximum_frequency, inlink->sample_rate * 0.5f);
+    float minimum_frequency = s->minimum_frequency;
+    float scale = 1.f, factor;
+    int ret;
+
+    /* drop whatever a previous configuration left behind */
+    uninit(ctx);
+
+    /* bands run along the axis perpendicular to the time direction */
+    switch (s->direction) {
+    case DIRECTION_LR:
+    case DIRECTION_RL:
+        s->frequency_band_count = s->h;
+        break;
+    case DIRECTION_UD:
+    case DIRECTION_DU:
+        s->frequency_band_count = s->w;
+        break;
+    }
+
+    s->nb_threads = FFMIN(s->frequency_band_count, ff_filter_get_nb_threads(ctx));
+    s->nb_channels = inlink->ch_layout.nb_channels;
+    s->old_pts = AV_NOPTS_VALUE;
+    s->nb_consumed_samples = 65536;
+
+    s->input_sample_count = s->nb_consumed_samples;
+    s->hop_size = s->nb_consumed_samples >> 1;
+    s->input_padding_size = 65536;
+    s->output_padding_size = FFMAX(16, s->input_padding_size * s->pps / inlink->sample_rate);
+
+    outlink->w = s->w;
+    outlink->h = s->h;
+    outlink->sample_aspect_ratio = (AVRational){1,1};
+
+    s->fft_in_size  = FFALIGN(s->input_padding_size, av_cpu_max_align());
+    s->fft_out_size = FFALIGN(s->input_padding_size, av_cpu_max_align());
+
+    s->output_sample_count = s->output_padding_size;
+
+    s->ifft_in_size = FFALIGN(s->output_padding_size, av_cpu_max_align());
+    s->ifft_out_size = FFALIGN(s->output_padding_size, av_cpu_max_align());
+    s->ihop_size = s->output_padding_size >> 1;
+
+    ret = av_tx_init(&s->fft, &s->tx_fn, AV_TX_FLOAT_FFT, 0, s->input_padding_size, &scale, 0);
+    if (ret < 0)
+        return ret;
+
+    /* one inverse transform context per slice job, see run_channel_cwt() */
+    s->ifft = av_calloc(s->nb_threads, sizeof(*s->ifft));
+    if (!s->ifft)
+        return AVERROR(ENOMEM);
+
+    for (int n = 0; n < s->nb_threads; n++) {
+        ret = av_tx_init(&s->ifft[n], &s->itx_fn, AV_TX_FLOAT_FFT, 1, s->output_padding_size, &scale, 0);
+        if (ret < 0)
+            return ret;
+    }
+
+    s->frequency_band = av_calloc(s->frequency_band_count,
+                                  sizeof(*s->frequency_band) * 2);
+    s->outpicref = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+    /* complex data is stored in float frames, hence twice the sample count */
+    s->fft_in = ff_get_audio_buffer(inlink, s->fft_in_size * 2);
+    s->fft_out = ff_get_audio_buffer(inlink, s->fft_out_size * 2);
+    s->overlap = ff_get_audio_buffer(inlink, s->input_padding_size);
+    s->ch_out = ff_get_audio_buffer(inlink, s->frequency_band_count * 2 * s->ihop_size);
+    s->ifft_in = av_frame_alloc();
+    s->ifft_out = av_frame_alloc();
+    s->kernel = av_frame_alloc();
+    s->index = av_calloc(s->input_padding_size, sizeof(*s->index));
+    s->kernel_start = av_calloc(s->frequency_band_count, sizeof(*s->kernel_start));
+    s->kernel_stop = av_calloc(s->frequency_band_count, sizeof(*s->kernel_stop));
+    /* ch_out is dereferenced by run_channel_cwt() and draw(), so it has to
+     * be part of the allocation check as well */
+    if (!s->outpicref || !s->fft_in || !s->fft_out || !s->ch_out ||
+        !s->ifft_in || !s->ifft_out || !s->kernel_start || !s->kernel_stop ||
+        !s->frequency_band || !s->kernel || !s->overlap || !s->index)
+        return AVERROR(ENOMEM);
+
+    /* the following frames use one "channel" per frequency band */
+    s->ifft_in->format     = inlink->format;
+    s->ifft_in->nb_samples = s->ifft_in_size * 2;
+    s->ifft_in->ch_layout.nb_channels = s->frequency_band_count;
+    ret = av_frame_get_buffer(s->ifft_in, 0);
+    if (ret < 0)
+        return ret;
+
+    s->ifft_out->format     = inlink->format;
+    s->ifft_out->nb_samples = s->ifft_out_size * 2;
+    s->ifft_out->ch_layout.nb_channels = s->frequency_band_count;
+    ret = av_frame_get_buffer(s->ifft_out, 0);
+    if (ret < 0)
+        return ret;
+
+    s->kernel->format     = inlink->format;
+    s->kernel->nb_samples = s->input_padding_size;
+    s->kernel->ch_layout.nb_channels = s->frequency_band_count;
+    ret = av_frame_get_buffer(s->kernel, 0);
+    if (ret < 0)
+        return ret;
+
+    s->outpicref->sample_aspect_ratio = (AVRational){1,1};
+
+    /* start from a black picture (and zeroed alpha when present) */
+    for (int y = 0; y < outlink->h; y++) {
+        memset(s->outpicref->data[0] + y * s->outpicref->linesize[0],   0, outlink->w);
+        memset(s->outpicref->data[1] + y * s->outpicref->linesize[1], 128, outlink->w);
+        memset(s->outpicref->data[2] + y * s->outpicref->linesize[2], 128, outlink->w);
+        if (s->outpicref->data[3])
+            memset(s->outpicref->data[3] + y * s->outpicref->linesize[3], 0, outlink->w);
+    }
+
+    s->outpicref->color_range = AVCOL_RANGE_JPEG;
+
+    /* convert Hz into transform bin units */
+    factor = s->nb_consumed_samples / (float)inlink->sample_rate;
+    minimum_frequency *= factor;
+    maximum_frequency *= factor;
+
+    /* move both limits into the domain of the selected scale;
+     * frequency_band() applies the inverse mapping per band */
+    switch (s->frequency_scale) {
+    case FSCALE_LOG2:
+        minimum_frequency = logf(minimum_frequency) / logf(2.f);
+        maximum_frequency = logf(maximum_frequency) / logf(2.f);
+        break;
+    case FSCALE_BARK:
+        minimum_frequency = 6.f * asinhf(minimum_frequency / 600.f);
+        maximum_frequency = 6.f * asinhf(maximum_frequency / 600.f);
+        break;
+    case FSCALE_MEL:
+        minimum_frequency = 2595.f * log10f(1.f + minimum_frequency / 700.f);
+        maximum_frequency = 2595.f * log10f(1.f + maximum_frequency / 700.f);
+        break;
+    case FSCALE_ERBS:
+        minimum_frequency = 11.17268f * log(1.f + (46.06538f * minimum_frequency) / (minimum_frequency + 14678.49f));
+        maximum_frequency = 11.17268f * log(1.f + (46.06538f * maximum_frequency) / (maximum_frequency + 14678.49f));
+        break;
+    }
+
+    frequency_band(s->frequency_band,
+                   s->frequency_band_count, maximum_frequency - minimum_frequency,
+                   minimum_frequency, s->frequency_scale, s->deviation);
+
+    av_log(ctx, AV_LOG_DEBUG, "input_sample_count: %d\n", s->input_sample_count);
+    av_log(ctx, AV_LOG_DEBUG, "output_sample_count: %d\n", s->output_sample_count);
+
+    /* initial drawing position */
+    switch (s->direction) {
+    case DIRECTION_LR:
+        s->pos = 0;
+        break;
+    case DIRECTION_RL:
+        s->pos = s->w - 1;
+        break;
+    case DIRECTION_UD:
+        s->pos = 0;
+        break;
+    case DIRECTION_DU:
+        s->pos = s->h - 1;
+        break;
+    }
+
+    s->auto_frame_rate = av_make_q(inlink->sample_rate, s->hop_size);
+    if (strcmp(s->rate_str, "auto")) {
+        /* an invalid rate string must fail the configuration instead of
+         * leaving frame_rate unset for the time base computation below */
+        ret = av_parse_video_rate(&s->frame_rate, s->rate_str);
+        if (ret < 0)
+            return ret;
+    } else {
+        s->frame_rate = s->auto_frame_rate;
+    }
+    outlink->frame_rate = s->frame_rate;
+    outlink->time_base = av_inv_q(outlink->frame_rate);
+
+    compute_kernel(ctx);
+
+    return 0;
+}
+
+/**
+ * Filter scheduling callback.
+ *
+ * When a new hop starts (ihop_index == 0), hop_size samples are consumed
+ * from the input and transformed for every channel. Each call then renders
+ * one line of the picture for the current ihop_index, advances the drawing
+ * position, and sends a copy of the picture downstream whenever its
+ * timestamp is greater than the previously sent one.
+ *
+ * @param ctx filter context
+ * @return 0 or a positive value on success, FFERROR_NOT_READY when there is
+ *         nothing to do, a negative error code on failure
+ */
+static int activate(AVFilterContext *ctx)
+{
+    AVFilterLink *inlink = ctx->inputs[0];
+    AVFilterLink *outlink = ctx->outputs[0];
+    ShowCWTContext *s = ctx->priv;
+    int ret = 0, status;
+    int64_t pts;
+
+    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
+
+    if (s->outpicref) {
+        AVFrame *fin;
+
+        /* new input is only needed at the start of a hop */
+        if (s->ihop_index == 0) {
+            ret = ff_inlink_consume_samples(inlink, s->hop_size, s->hop_size, &fin);
+            if (ret < 0)
+                return ret;
+            if (ret > 0) {
+                for (int ch = 0; ch < s->nb_channels; ch++)
+                    run_channel_cwt_prepare(ctx, fin, ch);
+
+                s->in_pts = fin->pts;
+                s->in_nb_samples = fin->nb_samples;
+                av_frame_free(&fin);
+            }
+        }
+
+        if (ret > 0 || s->ihop_index > 0) {
+            int64_t pts_offset;
+
+            /* Vertical scrolling moves whole rows and is done here;
+             * horizontal scrolling is handled per band in draw(). */
+            switch (s->slide) {
+            case SLIDE_SCROLL:
+                switch (s->direction) {
+                case DIRECTION_UD:
+                    for (int p = 0; p < 3; p++) {
+                        ptrdiff_t linesize = s->outpicref->linesize[p];
+
+                        for (int y = s->h - 1; y > 0; y--) {
+                            uint8_t *dst = s->outpicref->data[p] + y * linesize;
+
+                            memmove(dst, dst - linesize, s->w);
+                        }
+                    }
+                    break;
+                case DIRECTION_DU:
+                    for (int p = 0; p < 3; p++) {
+                        ptrdiff_t linesize = s->outpicref->linesize[p];
+
+                        for (int y = 0; y < s->h - 1; y++) {
+                            uint8_t *dst = s->outpicref->data[p] + y * linesize;
+
+                            memmove(dst, dst + linesize, s->w);
+                        }
+                    }
+                    break;
+                }
+                break;
+            }
+
+            /* coefficients are computed once per hop, for all channels */
+            for (int ch = 0; ch < s->nb_channels && s->ihop_index == 0; ch++) {
+                ff_filter_execute(ctx, run_channel_cwt, (void *)&ch, NULL,
+                                  s->nb_threads);
+            }
+
+            ff_filter_execute(ctx, draw, NULL, NULL, s->nb_threads);
+
+            /* timestamp of this line: position within the hop, in input samples */
+            pts_offset = av_rescale_q(s->ihop_index, av_make_q(1, s->ihop_size), av_make_q(1, s->in_nb_samples));
+            s->outpicref->pts = av_rescale_q(s->in_pts + pts_offset, inlink->time_base, outlink->time_base);
+
+            s->ihop_index++;
+            if (s->ihop_index >= s->ihop_size)
+                s->ihop_index = 0;
+
+            switch (s->slide) {
+            case SLIDE_REPLACE:
+                /* advance the drawing position and wrap around */
+                switch (s->direction) {
+                case DIRECTION_LR:
+                    s->pos++;
+                    if (s->pos >= s->w)
+                        s->pos = 0;
+                    break;
+                case DIRECTION_RL:
+                    s->pos--;
+                    if (s->pos < 0)
+                        s->pos = s->w - 1;
+                    break;
+                case DIRECTION_UD:
+                    s->pos++;
+                    if (s->pos >= s->h)
+                        s->pos = 0;
+                    break;
+                case DIRECTION_DU:
+                    s->pos--;
+                    if (s->pos < 0)
+                        s->pos = s->h - 1;
+                    break;
+                }
+                break;
+            case SLIDE_SCROLL:
+                /* the picture moves, the drawing position stays at its edge */
+                switch (s->direction) {
+                case DIRECTION_LR:
+                    s->pos = 0;
+                    break;
+                case DIRECTION_RL:
+                    s->pos = s->w - 1;
+                    break;
+                case DIRECTION_UD:
+                    s->pos = 0;
+                    break;
+                case DIRECTION_DU:
+                    s->pos = s->h - 1;
+                    break;
+                }
+                break;
+            }
+
+            /* only emit a picture when the output timestamp has advanced */
+            if (s->old_pts < s->outpicref->pts) {
+                AVFrame *out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+                if (!out)
+                    return AVERROR(ENOMEM);
+                ret = av_frame_copy_props(out, s->outpicref);
+                if (ret >= 0)
+                    ret = av_frame_copy(out, s->outpicref);
+                if (ret < 0) {
+                    /* out is still owned here, release it */
+                    av_frame_free(&out);
+                    return ret;
+                }
+                s->old_pts = s->outpicref->pts;
+                /* ff_filter_frame() takes ownership of out whatever it
+                 * returns, so out must not be freed after this call */
+                return ff_filter_frame(outlink, out);
+            }
+        }
+    }
+
+    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
+        if (status == AVERROR_EOF) {
+            ff_outlink_set_status(outlink, status, pts);
+            return 0;
+        }
+    }
+
+    /* more work is pending: a full hop is queued or the current one is unfinished */
+    if (ff_inlink_queued_samples(inlink) >= s->hop_size || s->ihop_index) {
+        ff_filter_set_ready(ctx, 10);
+        return 0;
+    }
+
+    if (ff_outlink_frame_wanted(outlink)) {
+        ff_inlink_request_frame(inlink);
+        return 0;
+    }
+
+    return FFERROR_NOT_READY;
+}
+
+static const AVFilterPad showcwt_inputs[] = { /* single audio input; samples are pulled from it in activate() */
+    {
+        .name = "default",
+        .type = AVMEDIA_TYPE_AUDIO,
+    },
+};
+
+static const AVFilterPad showcwt_outputs[] = { /* single video output */
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = config_output, /* sizes buffers and sets frame rate / time base */
+    },
+};
+
+const AVFilter ff_avf_showcwt = { /* filter definition, declared extern in allfilters.c */
+    .name          = "showcwt",
+    .description   = NULL_IF_CONFIG_SMALL("Convert input audio to a CWT (Continuous Wavelet Transform) spectrum video output."),
+    .uninit        = uninit,
+    .priv_size     = sizeof(ShowCWTContext),
+    FILTER_INPUTS(showcwt_inputs),
+    FILTER_OUTPUTS(showcwt_outputs),
+    FILTER_QUERY_FUNC(query_formats),
+    .activate      = activate, /* activate callback instead of per-pad filter_frame/request_frame */
+    .priv_class    = &showcwt_class,
+    .flags         = AVFILTER_FLAG_SLICE_THREADS, /* draw() and run_channel_cwt() run through ff_filter_execute() */
+};
--- a/libavfilter/version.h
+++ b/libavfilter/version.h
@@ -31,8 +31,8 @@

 #include "version_major.h"

-#define LIBAVFILTER_VERSION_MINOR  50
-#define LIBAVFILTER_VERSION_MICRO 101
+#define LIBAVFILTER_VERSION_MINOR  51
+#define LIBAVFILTER_VERSION_MICRO 100


 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \