FFmpeg/libavfilter/af_adeclick.c

/*
 * Copyright (c) 2018 Paul B Mahol
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/audio_fifo.h"
#include "libavutil/opt.h"
#include "avfilter.h"
#include "audio.h"
#include "formats.h"

/*
 * Per-channel working state.
 *
 * The fixed-size arrays are allocated in config_input(). matrix, vector, y
 * and histogram are grown on demand through av_fast_malloc(), which keeps
 * track of each capacity in an unsigned int; the matching *_size fields are
 * therefore unsigned so that their address has the type that function expects.
 */
typedef struct DeclickChannel {
    double *auxiliary;       /* autocorrelation of acoefficients, ar_order + 1 entries */
    double *detection;       /* per-sample detection signal, window_size entries */
    double *acoefficients;   /* AR coefficients, ar_order + 1 entries */
    double *acorrelation;    /* autocorrelation of the input window, ar_order + 1 entries */
    double *tmp;             /* scratch for autoregression(), ar_order entries */
    double *interpolated;    /* replacement samples, window_size entries */
    double *matrix;          /* nb_errors x nb_errors system matrix */
    unsigned matrix_size;    /* allocated size of matrix in bytes, owned by av_fast_malloc() */
    double *vector;          /* right-hand side of the system, nb_errors entries */
    unsigned vector_size;    /* allocated size of vector in bytes, owned by av_fast_malloc() */
    double *y;               /* intermediate solution used by do_interpolation() */
    unsigned y_size;         /* allocated size of y in bytes, owned by av_fast_malloc() */
    uint8_t *click;          /* per-sample "damaged" flag, window_size entries */
    int *index;              /* positions of the flagged samples, ascending */
    unsigned *histogram;     /* amplitude histogram used by detect_clips() */
    unsigned histogram_size; /* allocated size of histogram in bytes, owned by av_fast_malloc() */
} DeclickChannel;

/*
 * Private context shared by the adeclick and adeclip filters.
 * The first group of fields is filled from the option tables, the rest is
 * derived in init()/config_input() or updated while filtering.
 */
typedef struct AudioDeclickContext {
    const AVClass *class;

    /* user options */
    double w;            /* window size; config_input() computes sample_rate * w / 1000 samples */
    double overlap;      /* window overlap, used as a percentage (divided by 100) */
    double threshold;    /* detection threshold, interpreted by the active detector */
    double ar;           /* autoregression order as a percentage of window_size */
    double burst;        /* burst fusion; window_size * burst / 1000 gives nb_burst_samples */
    int method;          /* 0: overlap-add, 1: overlap-save */
    int nb_hbins;        /* number of histogram bins (only exposed as an adeclip option) */

    /* values derived from the options */
    int is_declip;       /* non-zero when the filter instance is named "adeclip" */
    int ar_order;        /* AR model order in coefficients, at least 1 */
    int nb_burst_samples;/* maximum gap between two clicks that gets filled in */
    int window_size;     /* analysis window length in samples */
    int hop_size;        /* samples emitted (and drained from the FIFO) per window */
    int overlap_skip;    /* (window_size - hop_size) / 2 for overlap-save, else 0 */

    AVFrame *in;         /* current window peeked from the FIFO, window_size samples */
    AVFrame *out;        /* per-window result of detection/interpolation */
    AVFrame *buffer;     /* 2 * window_size samples; accumulates/shifts the output */
    AVFrame *is;         /* per-sample marker set to 1 where a sample was interpolated */

    DeclickChannel *chan; /* nb_channels per-channel states */

    int64_t pts;             /* pts of the next output frame, advanced by hop_size */
    int nb_channels;         /* number of entries in chan */
    uint64_t nb_samples;     /* running total of output samples times channels */
    uint64_t detected_errors;/* running total of samples marked in "is" */
    int samples_left;        /* samples still to be output while flushing at EOF */

    AVAudioFifo *fifo;       /* input sample queue */
    double *window_func_lut; /* window_size precomputed window weights */

    /* detect_clicks() or detect_clips(), selected in init(); returns the
     * number of flagged samples written to index, or a negative AVERROR */
    int (*detector)(struct AudioDeclickContext *s, DeclickChannel *c,
                    double sigmae, double *detection,
                    double *acoefficients, uint8_t *click, int *index,
                    const double *src, double *dst);
} AudioDeclickContext;

/* Byte offset of an option's storage inside AudioDeclickContext. */
#define OFFSET(x) offsetof(AudioDeclickContext, x)
/* Flags shared by every option in this file: audio + filtering parameter. */
#define AF AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM

/*
 * Options of the adeclick filter.
 * Columns: name, help, offset, type, default, min, max, flags[, unit].
 * The two AV_OPT_TYPE_CONST rows are the named values of "m" (unit "m").
 */
static const AVOption adeclick_options[] = {
    { "w", "set window size",          OFFSET(w),         AV_OPT_TYPE_DOUBLE, {.dbl=55}, 10,  100, AF },
    { "o", "set window overlap",       OFFSET(overlap),   AV_OPT_TYPE_DOUBLE, {.dbl=75}, 50,   95, AF },
    { "a", "set autoregression order", OFFSET(ar),        AV_OPT_TYPE_DOUBLE, {.dbl=2},   0,   25, AF },
    { "t", "set threshold",            OFFSET(threshold), AV_OPT_TYPE_DOUBLE, {.dbl=2},   1,  100, AF },
    { "b", "set burst fusion",         OFFSET(burst),     AV_OPT_TYPE_DOUBLE, {.dbl=2},   0,   10, AF },
    { "m", "set overlap method",       OFFSET(method),    AV_OPT_TYPE_INT,    {.i64=0},   0,    1, AF, "m" },
    { "a", "overlap-add",              0,                 AV_OPT_TYPE_CONST,  {.i64=0},   0,    0, AF, "m" },
    { "s", "overlap-save",             0,                 AV_OPT_TYPE_CONST,  {.i64=1},   0,    0, AF, "m" },
    { NULL }
};

/* Defines adeclick_class, referenced by ff_af_adeclick below. */
AVFILTER_DEFINE_CLASS(adeclick);

/*
 * Advertise the supported formats: planar double samples only, any channel
 * count and any sample rate.
 *
 * Returns 0 (or whatever ff_set_common_samplerates() returns) on success,
 * a negative AVERROR code otherwise.
 */
static int query_formats(AVFilterContext *ctx)
{
    static const enum AVSampleFormat sample_fmts[] = {
        AV_SAMPLE_FMT_DBLP,
        AV_SAMPLE_FMT_NONE
    };
    AVFilterFormats *fmt_list, *rate_list;
    AVFilterChannelLayouts *layout_list;
    int err;

    /* sample formats */
    if (!(fmt_list = ff_make_format_list(sample_fmts)))
        return AVERROR(ENOMEM);
    if ((err = ff_set_common_formats(ctx, fmt_list)) < 0)
        return err;

    /* channel layouts / counts */
    if (!(layout_list = ff_all_channel_counts()))
        return AVERROR(ENOMEM);
    if ((err = ff_set_common_channel_layouts(ctx, layout_list)) < 0)
        return err;

    /* sample rates; the list is handed over unchecked, as before */
    rate_list = ff_all_samplerates();
    return ff_set_common_samplerates(ctx, rate_list);
}

/*
 * Configure the filter once the input link parameters are known.
 *
 * Derives window_size, ar_order, nb_burst_samples, hop_size and overlap_skip
 * from the user options and the link sample rate, builds the window lookup
 * table, allocates the working frames, the FIFO and the per-channel state.
 *
 * Returns 0 on success, AVERROR(EINVAL) when the resulting window is shorter
 * than 100 samples or the hop is shorter than 1 sample, AVERROR(ENOMEM) on
 * allocation failure, or the error reported while priming the FIFO.
 * Anything allocated before a failure is left in the context and released
 * by uninit().
 */
static int config_input(AVFilterLink *inlink)
{
    AVFilterContext *ctx = inlink->dst;
    AudioDeclickContext *s = ctx->priv;
    int i, ret;

    s->pts = AV_NOPTS_VALUE;
    s->window_size = inlink->sample_rate * s->w / 1000.;
    if (s->window_size < 100)
        return AVERROR(EINVAL);
    s->ar_order = FFMAX(s->window_size * s->ar / 100., 1);
    s->nb_burst_samples = s->window_size * s->burst / 1000.;
    s->hop_size = s->window_size * (1. - (s->overlap / 100.));
    if (s->hop_size < 1)
        return AVERROR(EINVAL);

    /* Sine window, scaled by the hop/window ratio and pi/2. */
    s->window_func_lut = av_calloc(s->window_size, sizeof(*s->window_func_lut));
    if (!s->window_func_lut)
        return AVERROR(ENOMEM);
    for (i = 0; i < s->window_size; i++)
        s->window_func_lut[i] = sin(M_PI * i / s->window_size) *
                                (1. - (s->overlap / 100.)) * M_PI_2;

    av_frame_free(&s->in);
    av_frame_free(&s->out);
    av_frame_free(&s->buffer);
    av_frame_free(&s->is);
    s->in = ff_get_audio_buffer(inlink, s->window_size);
    s->out = ff_get_audio_buffer(inlink, s->window_size);
    s->buffer = ff_get_audio_buffer(inlink, s->window_size * 2);
    s->is = ff_get_audio_buffer(inlink, s->window_size);
    if (!s->in || !s->out || !s->buffer || !s->is)
        return AVERROR(ENOMEM);

    s->fifo = av_audio_fifo_alloc(inlink->format, inlink->channels, s->window_size);
    if (!s->fifo)
        return AVERROR(ENOMEM);

    /* Overlap-save discards the first overlap_skip samples of every window,
     * so the FIFO is primed with that many samples from the freshly
     * allocated input frame. A failed write must not go unnoticed. */
    s->overlap_skip = s->method ? (s->window_size - s->hop_size) / 2 : 0;
    if (s->overlap_skip > 0) {
        ret = av_audio_fifo_write(s->fifo, (void **)s->in->extended_data,
                                  s->overlap_skip);
        if (ret < 0)
            return ret;
    }

    s->nb_channels = inlink->channels;
    s->chan = av_calloc(inlink->channels, sizeof(*s->chan));
    if (!s->chan)
        return AVERROR(ENOMEM);

    for (i = 0; i < inlink->channels; i++) {
        DeclickChannel *c = &s->chan[i];

        c->detection = av_calloc(s->window_size, sizeof(*c->detection));
        c->auxiliary = av_calloc(s->ar_order + 1, sizeof(*c->auxiliary));
        c->acoefficients = av_calloc(s->ar_order + 1, sizeof(*c->acoefficients));
        c->acorrelation = av_calloc(s->ar_order + 1, sizeof(*c->acorrelation));
        c->tmp = av_calloc(s->ar_order, sizeof(*c->tmp));
        c->click = av_calloc(s->window_size, sizeof(*c->click));
        c->index = av_calloc(s->window_size, sizeof(*c->index));
        c->interpolated = av_calloc(s->window_size, sizeof(*c->interpolated));
        if (!c->auxiliary || !c->acoefficients || !c->detection || !c->click ||
            !c->index || !c->interpolated || !c->acorrelation || !c->tmp)
            return AVERROR(ENOMEM);
    }

    return 0;
}

/*
 * Compute the scaled autocorrelation of input for lags 0..order.
 *
 * input  - size samples
 * order  - highest lag to evaluate (inclusive)
 * output - receives order + 1 values; output[lag] is
 *          scale * sum(input[n] * input[n - lag]) for n = lag..size-1
 */
static void autocorrelation(const double *input, int order, int size,
                            double *output, double scale)
{
    int lag;

    for (lag = 0; lag <= order; lag++) {
        double acc = 0.;
        int n = lag;

        while (n < size) {
            acc += input[n] * input[n - lag];
            n++;
        }

        output[lag] = acc * scale;
    }
}

/*
 * Fit an autoregressive model of order ar_order to a window of samples.
 *
 * samples    - input window, nb_samples entries
 * ar_order   - model order
 * nb_samples - number of input samples
 * k          - work and output array with ar_order + 1 entries; on return
 *              k[0] is 1 and k[1..ar_order] hold the final contents of a[]
 * r          - receives the autocorrelation of samples for lags 0..ar_order,
 *              scaled by 1 / nb_samples
 * a          - scratch array with ar_order entries
 *
 * Returns sqrt() of the final alpha, which starts as r[0] * (1 - k[0]^2) and
 * is multiplied by (1 - k[i]^2) at every step (the caller in this file stores
 * it as "sigmae", presumably the prediction-error deviation).
 *
 * There is no guard against r[0] or alpha being zero, so the result and the
 * coefficients may be non-finite; filter_channel() checks the coefficients
 * with isfinite_array() before using them.
 */
static double autoregression(const double *samples, int ar_order,
                             int nb_samples, double *k, double *r, double *a)
{
    double alpha;
    int i, j;

    memset(a, 0, ar_order * sizeof(*a));

    autocorrelation(samples, ar_order, nb_samples, r, 1. / nb_samples);

    /* Levinson-Durbin algorithm */
    k[0] = a[0] = -r[1] / r[0];
    alpha = r[0] * (1. - k[0] * k[0]);
    for (i = 1; i < ar_order; i++) {
        double epsilon = 0.;

        for (j = 0; j < i; j++)
            epsilon += a[j] * r[i - j];
        epsilon += r[i + 1];

        k[i] = -epsilon / alpha;
        alpha *= (1. - k[i] * k[i]);
        /* k[0..i-1] is reused as scratch for the updated coefficients:
         * they are built from the previous a[] and then copied back. */
        for (j = i - 1; j >= 0; j--)
            k[j] = a[j] + k[i] * a[i - j - 1];
        for (j = 0; j <= i; j++)
            a[j] = k[j];
    }

    /* Final layout of the output: leading 1 followed by a[0..ar_order-1]. */
    k[0] = 1.;
    for (i = 1; i <= ar_order; i++)
        k[i] = a[i - 1];

    return sqrt(alpha);
}

/*
 * Return 1 when every one of the nb_samples values is finite (neither NaN
 * nor infinite), 0 as soon as a non-finite value is met. An empty array
 * yields 1.
 */
static int isfinite_array(double *samples, int nb_samples)
{
    int pos = 0;

    while (pos < nb_samples) {
        if (!isfinite(samples[pos]))
            return 0;
        pos++;
    }

    return 1;
}

/*
 * Binary search for value in the ascending array index[0..size-1].
 *
 * Note the inverted convention: returns 0 when value IS present and 1 when
 * it is not. size must be at least 1, the first and last entries are read
 * unconditionally.
 */
static int find_index(int *index, int value, int size)
{
    int lo = 0, hi = size - 1;

    /* Outside the range covered by the array: cannot be present. */
    if (value < index[lo] || value > index[hi])
        return 1;

    while (lo <= hi) {
        const int mid   = (hi + lo) / 2;
        const int probe = index[mid];

        if (probe == value)
            return 0;

        if (value < probe)
            hi = mid - 1;
        else
            lo = mid + 1;
    }

    return 1;
}

/*
 * In-place factorization of the n x n row-major matrix.
 *
 * Only the diagonal and the entries below it are read and rewritten: after
 * the call the diagonal holds the pivots and the lower triangle holds the
 * multipliers (entry divided by the pivot of its column). Entries above the
 * diagonal are left untouched.
 *
 * Returns 0 on success, -1 as soon as a pivot is exactly zero; in that case
 * the matrix is left partially processed.
 */
static int factorization(double *matrix, int n)
{
    int row, below, col;

    for (row = 0; row < n; row++) {
        double *cur = matrix + row * n;
        double pivot = cur[row];

        /* Remove the contribution of the columns already processed. */
        for (col = 0; col < row; col++)
            pivot -= matrix[col * n + col] * cur[col] * cur[col];

        if (pivot == 0.)
            return -1;
        cur[row] = pivot;

        /* Fill column "row" of every following line. */
        for (below = row + 1; below < n; below++) {
            double *other = matrix + below * n;
            double acc = other[row];

            for (col = 0; col < row; col++)
                acc -= matrix[col * n + col] * cur[col] * other[col];
            other[row] = acc / cur[row];
        }
    }

    return 0;
}

/*
 * Solve the n x n linear system described by matrix and vector, writing the
 * n solutions to out.
 *
 * matrix is factorized in place by factorization(); c->y is (re)allocated
 * through av_fast_malloc() to hold the intermediate result.
 *
 * Returns 0 on success, the negative value from factorization() when a zero
 * pivot is hit, or AVERROR(ENOMEM) when c->y cannot be allocated.
 */
static int do_interpolation(DeclickChannel *c, double *matrix,
                            double *vector, int n, double *out)
{
    double *fwd;
    int row, col, err;

    err = factorization(matrix, n);
    if (err < 0)
        return err;

    av_fast_malloc(&c->y, &c->y_size, n * sizeof(*c->y));
    if (!c->y)
        return AVERROR(ENOMEM);
    fwd = c->y;

    /* First pass, top to bottom, using the entries left of the diagonal. */
    for (row = 0; row < n; row++) {
        const double *line = matrix + row * n;
        double acc = vector[row];

        for (col = 0; col < row; col++)
            acc -= line[col] * fwd[col];
        fwd[row] = acc;
    }

    /* Second pass, bottom to top: divide by the pivot, then subtract the
     * already solved unknowns weighted by the entries below the diagonal
     * in this column. */
    row = n;
    while (row-- > 0) {
        out[row] = fwd[row] / matrix[row * n + row];
        for (col = row + 1; col < n; col++)
            out[row] -= matrix[col * n + row] * out[col];
    }

    return 0;
}

/*
 * Compute replacement values for the nb_errors samples listed in index.
 *
 * c             - channel state providing the reusable matrix/vector buffers
 * src           - input window
 * ar_order      - order of the AR model
 * acoefficients - ar_order + 1 AR coefficients
 * index         - ascending positions of the damaged samples
 * nb_errors     - number of entries in index
 * auxiliary     - ar_order + 1 entries, overwritten with the autocorrelation
 *                 of acoefficients
 * interpolated  - receives nb_errors replacement values, in index order
 *
 * Builds a symmetric system whose entry (i, j) is auxiliary[|index[j] -
 * index[i]|] when the two positions are at most ar_order apart and 0
 * otherwise; the right-hand side accumulates the undamaged neighbours of
 * each damaged sample. The system is then solved by do_interpolation().
 *
 * Returns 0 on success, AVERROR(ENOMEM) on allocation failure or the error
 * from do_interpolation().
 */
static int interpolation(DeclickChannel *c, const double *src, int ar_order,
                         double *acoefficients, int *index, int nb_errors,
                         double *auxiliary, double *interpolated)
{
    double *vector, *matrix;
    int i, j;

    /* The element count is computed in size_t: nb_errors * nb_errors done in
     * int can overflow for large windows and would then request a buffer
     * that is too small for the loops below. */
    av_fast_malloc(&c->matrix, &c->matrix_size,
                   (size_t)nb_errors * nb_errors * sizeof(*c->matrix));
    matrix = c->matrix;
    if (!matrix)
        return AVERROR(ENOMEM);

    av_fast_malloc(&c->vector, &c->vector_size, nb_errors * sizeof(*c->vector));
    vector = c->vector;
    if (!vector)
        return AVERROR(ENOMEM);

    autocorrelation(acoefficients, ar_order, ar_order + 1, auxiliary, 1.);

    /* Fill both triangles at once, the matrix is symmetric. */
    for (i = 0; i < nb_errors; i++) {
        const int im = i * nb_errors;

        for (j = i; j < nb_errors; j++) {
            if (abs(index[j] - index[i]) <= ar_order) {
                matrix[j * nb_errors + i] = matrix[im + j] = auxiliary[abs(index[j] - index[i])];
            } else {
                matrix[j * nb_errors + i] = matrix[im + j] = 0;
            }
        }
    }

    for (i = 0; i < nb_errors; i++) {
        double value = 0.;

        /* find_index() returns non-zero when the position is NOT in index,
         * i.e. only undamaged neighbours contribute. */
        for (j = -ar_order; j <= ar_order; j++)
            if (find_index(index, index[i] - j, nb_errors))
                value -= src[index[i] - j] * auxiliary[abs(j)];

        vector[i] = value;
    }

    return do_interpolation(c, matrix, vector, nb_errors, interpolated);
}

/*
 * Detector used by adeclip: flag samples that sit at the clipping level.
 *
 * A histogram of |src| (saturated at 1) with s->nb_hbins bins is built. The
 * highest occupied bin above bin 1 is compared with the bin just below it;
 * when it holds more than s->threshold times as many samples, its lower
 * edge becomes the clipping amplitude and every sample at or above it is
 * flagged. The first and last s->ar_order samples of the window are never
 * flagged.
 *
 * src is copied to dst, clip receives the per-sample flags and index the
 * positions of the flagged samples. The three "unused" parameters exist
 * only to match the detector callback signature.
 *
 * Returns the number of flagged samples, or AVERROR(ENOMEM) when the
 * histogram cannot be allocated (nothing is written in that case).
 */
static int detect_clips(AudioDeclickContext *s, DeclickChannel *c,
                        double unused0,
                        double *unused1, double *unused2,
                        uint8_t *clip, int *index,
                        const double *src, double *dst)
{
    const double threshold = s->threshold;
    const int nb_bins = s->nb_hbins;
    const int first = s->ar_order;
    const int last = s->window_size - s->ar_order;
    double max_amplitude = 0;
    unsigned *histogram;
    int n, bin, nb_clips = 0;

    av_fast_malloc(&c->histogram, &c->histogram_size, nb_bins * sizeof(*c->histogram));
    if (!c->histogram)
        return AVERROR(ENOMEM);
    histogram = c->histogram;
    memset(histogram, 0, sizeof(*histogram) * nb_bins);

    /* Histogram of the magnitudes; also passes the input through. */
    for (n = 0; n < s->window_size; n++) {
        const unsigned slot = fmin(fabs(src[n]), 1) * (nb_bins - 1);

        histogram[slot]++;
        dst[n] = src[n];
        clip[n] = 0;
    }

    /* Only the highest occupied bin is examined. */
    for (bin = nb_bins - 1; bin > 1; bin--) {
        if (!histogram[bin])
            continue;

        if (histogram[bin] / (double)FFMAX(histogram[bin - 1], 1) > threshold)
            max_amplitude = bin / (double)nb_bins;
        break;
    }

    if (max_amplitude > 0.) {
        for (n = 0; n < s->window_size; n++)
            clip[n] = fabs(src[n]) >= max_amplitude;
    }

    /* The window borders are left alone. */
    memset(clip, 0, first * sizeof(*clip));
    memset(clip + last, 0, first * sizeof(*clip));

    for (n = first; n < last; n++) {
        if (clip[n])
            index[nb_clips++] = n;
    }

    return nb_clips;
}

/*
 * Detector used by adeclick: flag samples that the AR model predicts badly.
 *
 * detection[n] is the sum of acoefficients[j] * src[n - j] for
 * j = 0..ar_order (left at 0 for the first ar_order samples). A sample is
 * flagged when |detection[n]| exceeds sigmae * s->threshold. Unflagged
 * samples lying between two flagged ones that are at most
 * s->nb_burst_samples apart are flagged as well. The first and last
 * s->ar_order samples of the window are never flagged.
 *
 * src is copied to dst, click receives the per-sample flags and index the
 * positions of the flagged samples.
 *
 * Returns the number of flagged samples.
 */
static int detect_clicks(AudioDeclickContext *s, DeclickChannel *c,
                         double sigmae,
                         double *detection, double *acoefficients,
                         uint8_t *click, int *index,
                         const double *src, double *dst)
{
    const double threshold = s->threshold;
    const int first = s->ar_order;
    const int last = s->window_size - s->ar_order;
    int n, tap, nb_clicks = 0, last_click = -1;

    memset(detection, 0, s->window_size * sizeof(*detection));

    /* Detection signal from the AR coefficients. */
    for (n = first; n < s->window_size; n++) {
        for (tap = 0; tap <= s->ar_order; tap++)
            detection[n] += acoefficients[tap] * src[n - tap];
    }

    for (n = 0; n < s->window_size; n++) {
        dst[n] = src[n];
        click[n] = fabs(detection[n]) > sigmae * threshold;
    }

    /* Burst fusion: close short gaps between consecutive clicks. */
    for (n = 0; n < s->window_size; n++) {
        if (click[n]) {
            const int gap_start = last_click + 1;

            if (last_click >= 0 && n > gap_start &&
                n <= s->nb_burst_samples + last_click)
                memset(click + gap_start, 1, n - gap_start);
            last_click = n;
        }
    }

    /* The window borders are left alone. */
    memset(click, 0, first * sizeof(*click));
    memset(click + last, 0, first * sizeof(*click));

    for (n = first; n < last; n++) {
        if (click[n])
            index[nb_clicks++] = n;
    }

    return nb_clicks;
}

/* Argument handed to filter_channel() for each job. */
typedef struct ThreadData {
    AVFrame *out; /* frame receiving hop_size output samples per channel */
} ThreadData;

/*
 * Process one window of one channel; run once per channel through
 * ctx->internal->execute().
 *
 * ctx     - filter context
 * arg     - ThreadData carrying the output frame
 * ch      - channel (job) number
 * nb_jobs - total number of jobs, unused
 *
 * Steps: fit the AR model to the window in s->in, run the configured
 * detector, replace the flagged samples by interpolated values, then merge
 * the window into s->buffer (windowed overlap-add for method 0, plain copy
 * of the central part for overlap-save), emit hop_size samples to the
 * output frame and shift s->buffer and s->is by hop_size.
 *
 * Returns 0 on success or a negative AVERROR from the detector or the
 * interpolation; in that case nothing is written to the output frame.
 */
static int filter_channel(AVFilterContext *ctx, void *arg, int ch, int nb_jobs)
{
    AudioDeclickContext *s = ctx->priv;
    ThreadData *td = arg;
    AVFrame *out = td->out;
    const double *src = (const double *)s->in->extended_data[ch];
    double *is = (double *)s->is->extended_data[ch];
    double *dst = (double *)s->out->extended_data[ch];
    double *ptr = (double *)out->extended_data[ch];
    double *buf = (double *)s->buffer->extended_data[ch];
    const double *w = s->window_func_lut;
    DeclickChannel *c = &s->chan[ch];
    double sigmae;
    int j, ret;

    sigmae = autoregression(src, s->ar_order, s->window_size, c->acoefficients, c->acorrelation, c->tmp);

    if (isfinite_array(c->acoefficients, s->ar_order + 1)) {
        double *interpolated = c->interpolated;
        int *index = c->index;
        int nb_errors;

        nb_errors = s->detector(s, c, sigmae, c->detection, c->acoefficients,
                                c->click, index, src, dst);
        /* A negative count is an error code from the detector (it may fail
         * before having filled dst); report it instead of silently
         * continuing with whatever dst contained. */
        if (nb_errors < 0)
            return nb_errors;
        if (nb_errors > 0) {
            ret = interpolation(c, src, s->ar_order, c->acoefficients, index,
                                nb_errors, c->auxiliary, interpolated);
            if (ret < 0)
                return ret;

            for (j = 0; j < nb_errors; j++) {
                dst[index[j]] = interpolated[j];
                is[index[j]] = 1; /* counted later by filter_frame() */
            }
        }
    } else {
        /* The model could not be fitted (non-finite coefficients):
         * pass the window through unchanged. */
        memcpy(dst, src, s->window_size * sizeof(*dst));
    }

    if (s->method == 0) {
        /* overlap-add */
        for (j = 0; j < s->window_size; j++)
            buf[j] += dst[j] * w[j];
    } else {
        /* overlap-save: keep hop_size samples starting at overlap_skip */
        const int skip = s->overlap_skip;

        for (j = 0; j < s->hop_size; j++)
            buf[j] = dst[skip + j];
    }
    for (j = 0; j < s->hop_size; j++)
        ptr[j] = buf[j];

    /* Advance both sliding buffers by one hop and clear the freed tail. */
    memmove(buf, buf + s->hop_size, (s->window_size * 2 - s->hop_size) * sizeof(*buf));
    memmove(is, is + s->hop_size, (s->window_size - s->hop_size) * sizeof(*is));
    memset(buf + s->window_size * 2 - s->hop_size, 0, s->hop_size * sizeof(*buf));
    memset(is + s->window_size - s->hop_size, 0, s->hop_size * sizeof(*is));

    return 0;
}

/*
 * Input pad callback: queue the incoming samples and emit one output frame
 * of hop_size samples for every full window available in the FIFO.
 *
 * inlink - input link
 * in     - input frame; always consumed (freed) by this function
 *
 * While flushing (s->samples_left > 0, set by request_frame()) the output
 * frames are trimmed to the samples that remain and the FIFO is emptied
 * once they have all been sent.
 *
 * Returns a non-negative value on success, a negative AVERROR otherwise.
 */
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
    AVFilterContext *ctx = inlink->dst;
    AVFilterLink *outlink = ctx->outputs[0];
    AudioDeclickContext *s = ctx->priv;
    AVFrame *out = NULL;
    int ret = 0;

    if (s->pts == AV_NOPTS_VALUE)
        s->pts = in->pts;

    ret = av_audio_fifo_write(s->fifo, (void **)in->extended_data,
                              in->nb_samples);
    av_frame_free(&in);
    /* Do not go on with a FIFO that failed to take the input. */
    if (ret < 0)
        return ret;

    while (av_audio_fifo_size(s->fifo) >= s->window_size) {
        int j, ch, detected_errors = 0;
        ThreadData td;

        out = ff_get_audio_buffer(outlink, s->hop_size);
        if (!out)
            return AVERROR(ENOMEM);

        ret = av_audio_fifo_peek(s->fifo, (void **)s->in->extended_data,
                                 s->window_size);
        if (ret < 0)
            break;

        td.out = out;
        ret = ctx->internal->execute(ctx, filter_channel, &td, NULL, inlink->channels);
        if (ret < 0)
            goto fail;

        /* Count the interpolated samples inside the hop being emitted. */
        for (ch = 0; ch < s->in->channels; ch++) {
            double *is = (double *)s->is->extended_data[ch];

            for (j = 0; j < s->hop_size; j++) {
                if (is[j])
                    detected_errors++;
            }
        }

        av_audio_fifo_drain(s->fifo, s->hop_size);

        if (s->samples_left > 0)
            out->nb_samples = FFMIN(s->hop_size, s->samples_left);

        out->pts = s->pts;
        s->pts += s->hop_size;

        s->detected_errors += detected_errors;
        s->nb_samples += out->nb_samples * inlink->channels;

        ret = ff_filter_frame(outlink, out);
        /* Ownership of the frame went to ff_filter_frame(), whether it
         * succeeded or not; forget the pointer so that the cleanup below
         * cannot free it a second time. */
        out = NULL;
        if (ret < 0)
            break;

        if (s->samples_left > 0) {
            s->samples_left -= s->hop_size;
            if (s->samples_left <= 0)
                av_audio_fifo_drain(s->fifo, av_audio_fifo_size(s->fifo));
        }
    }

fail:
    /* Only reached with a frame still owned here when peek/execute failed. */
    if (ret < 0)
        av_frame_free(&out);
    return ret;
}

/*
 * Output pad callback: forward the request upstream and, once the input
 * reports EOF while samples are still queued, flush them.
 *
 * On the first flush s->samples_left is set to the queued sample count minus
 * overlap_skip. A frame of window_size - samples_left samples obtained from
 * ff_get_audio_buffer() is then pushed through filter_frame() so that the
 * FIFO holds a full window again.
 *
 * Returns the upstream status when there is nothing to flush, otherwise the
 * result of filter_frame() or AVERROR(ENOMEM).
 */
static int request_frame(AVFilterLink *outlink)
{
    AVFilterContext *ctx = outlink->src;
    AudioDeclickContext *s = ctx->priv;
    AVFrame *padding;
    int status;

    status = ff_request_frame(ctx->inputs[0]);

    /* Nothing special unless the input ended with samples still queued. */
    if (status != AVERROR_EOF || av_audio_fifo_size(s->fifo) <= 0)
        return status;

    if (!s->samples_left)
        s->samples_left = av_audio_fifo_size(s->fifo) - s->overlap_skip;

    if (s->samples_left <= 0)
        return status;

    padding = ff_get_audio_buffer(outlink, s->window_size - s->samples_left);
    if (!padding)
        return AVERROR(ENOMEM);

    return filter_frame(ctx->inputs[0], padding);
}

/*
 * Select the detector from the name the filter was instantiated under:
 * "adeclip" uses detect_clips(), anything else uses detect_clicks().
 * Always returns 0.
 */
static av_cold int init(AVFilterContext *ctx)
{
    AudioDeclickContext *s = ctx->priv;

    s->is_declip = strcmp(ctx->filter->name, "adeclip") == 0;
    s->detector  = s->is_declip ? detect_clips : detect_clicks;

    return 0;
}

/*
 * Log the detection statistics and release everything owned by the context.
 *
 * The statistics line is skipped when no sample was processed, which would
 * otherwise print a meaningless "nan%" ratio. Both counters are uint64_t and
 * are therefore printed with PRIu64.
 */
static av_cold void uninit(AVFilterContext *ctx)
{
    AudioDeclickContext *s = ctx->priv;
    int i;

    if (s->nb_samples > 0)
        av_log(ctx, AV_LOG_INFO, "Detected %s in %"PRIu64" of %"PRIu64" samples (%g%%).\n",
               s->is_declip ? "clips" : "clicks", s->detected_errors,
               s->nb_samples, 100. * s->detected_errors / s->nb_samples);

    av_audio_fifo_free(s->fifo);
    av_freep(&s->window_func_lut);
    av_frame_free(&s->in);
    av_frame_free(&s->out);
    av_frame_free(&s->buffer);
    av_frame_free(&s->is);

    if (s->chan) {
        for (i = 0; i < s->nb_channels; i++) {
            DeclickChannel *c = &s->chan[i];

            av_freep(&c->detection);
            av_freep(&c->auxiliary);
            av_freep(&c->acoefficients);
            av_freep(&c->acorrelation);
            av_freep(&c->tmp);
            av_freep(&c->click);
            av_freep(&c->index);
            av_freep(&c->interpolated);
            /* Buffers managed by av_fast_malloc(): reset the recorded
             * capacity together with the pointer. */
            av_freep(&c->matrix);
            c->matrix_size = 0;
            av_freep(&c->histogram);
            c->histogram_size = 0;
            av_freep(&c->vector);
            c->vector_size = 0;
            av_freep(&c->y);
            c->y_size = 0;
        }
    }
    av_freep(&s->chan);
    s->nb_channels = 0;
}

/* Single audio input pad, shared by adeclick and adeclip: the link is set up
 * by config_input() and frames are delivered to filter_frame(). */
static const AVFilterPad inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_AUDIO,
        .filter_frame = filter_frame,
        .config_props = config_input,
    },
    { NULL }
};

/* Single audio output pad, shared by adeclick and adeclip; request_frame()
 * takes care of flushing the queued samples at end of stream. */
static const AVFilterPad outputs[] = {
    {
        .name          = "default",
        .type          = AVMEDIA_TYPE_AUDIO,
        .request_frame = request_frame,
    },
    { NULL }
};

/* The adeclick filter definition. init() selects detect_clicks() for this
 * name; filter_channel() is run through the slice-threading execute hook. */
AVFilter ff_af_adeclick = {
    .name          = "adeclick",
    .description   = NULL_IF_CONFIG_SMALL("Remove impulsive noise from input audio."),
    .query_formats = query_formats,
    .priv_size     = sizeof(AudioDeclickContext),
    .priv_class    = &adeclick_class,
    .init          = init,
    .uninit        = uninit,
    .inputs        = inputs,
    .outputs       = outputs,
    .flags         = AVFILTER_FLAG_SLICE_THREADS,
};

/*
 * Options of the adeclip filter.
 * Same layout as adeclick_options, with different defaults for "a" and "t",
 * no burst option and an additional histogram size "n" (read by
 * detect_clips() as s->nb_hbins).
 */
static const AVOption adeclip_options[] = {
    { "w", "set window size",          OFFSET(w),              AV_OPT_TYPE_DOUBLE, {.dbl=55},     10,  100, AF },
    { "o", "set window overlap",       OFFSET(overlap),        AV_OPT_TYPE_DOUBLE, {.dbl=75},     50,   95, AF },
    { "a", "set autoregression order", OFFSET(ar),             AV_OPT_TYPE_DOUBLE, {.dbl=8},       0,   25, AF },
    { "t", "set threshold",            OFFSET(threshold),      AV_OPT_TYPE_DOUBLE, {.dbl=10},      1,  100, AF },
    { "n", "set histogram size",       OFFSET(nb_hbins),       AV_OPT_TYPE_INT,    {.i64=1000},  100, 9999, AF },
    { "m", "set overlap method",       OFFSET(method),         AV_OPT_TYPE_INT,    {.i64=0},       0,    1, AF, "m" },
    { "a", "overlap-add",              0,                      AV_OPT_TYPE_CONST,  {.i64=0},       0,    0, AF, "m" },
    { "s", "overlap-save",             0,                      AV_OPT_TYPE_CONST,  {.i64=1},       0,    0, AF, "m" },
    { NULL }
};

/* Defines adeclip_class, referenced by ff_af_adeclip below. */
AVFILTER_DEFINE_CLASS(adeclip);

/* The adeclip filter definition. It shares every callback with adeclick;
 * init() recognises the "adeclip" name and selects detect_clips(). */
AVFilter ff_af_adeclip = {
    .name          = "adeclip",
    .description   = NULL_IF_CONFIG_SMALL("Remove clipping from input audio."),
    .query_formats = query_formats,
    .priv_size     = sizeof(AudioDeclickContext),
    .priv_class    = &adeclip_class,
    .init          = init,
    .uninit        = uninit,
    .inputs        = inputs,
    .outputs       = outputs,
    .flags         = AVFILTER_FLAG_SLICE_THREADS,
};