/*
 * Copyright (c) 2025 Vittorio Palmisano
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <whisper.h>

#include "libavutil/avstring.h"
#include "libavutil/avutil.h"
#include "libavutil/channel_layout.h"
#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/opt.h"
#include "libavutil/samplefmt.h"
#include "libavutil/thread.h"
#include "libavfilter/avfilter.h"
#include "libavfilter/audio.h"
#include "libavformat/avio.h"
#include "formats.h"
/**
 * Private state of the whisper transcription filter.
 */
typedef struct WhisperContext {
    const AVClass *class;

    /* Options */
    char *model_path;                 // path to the whisper.cpp model file
    char *language;                   // transcription language ("auto" for auto-detect)
    bool use_gpu;                     // offload inference to the GPU
    int gpu_device;                   // GPU device index
    char *vad_model_path;             // optional VAD model; non-NULL enables VAD segmentation
    float vad_threshold;              // VAD speech-probability threshold [0..1]
    int64_t vad_min_speech_duration;  // minimum speech duration (AV_TIME_BASE units)
    int64_t vad_min_silence_duration; // minimum silence duration (AV_TIME_BASE units)
    int64_t queue;                    // audio queue size (AV_TIME_BASE units)
    char *destination;                // output URL ("" disables, "-" maps to pipe:1)
    char *format;                     // output format: "text", "srt" or "json"

    /* State */
    struct whisper_context *ctx_wsp;      // whisper.cpp inference context
    struct whisper_vad_context *ctx_vad;  // VAD context (NULL when VAD is disabled)
    struct whisper_vad_params vad_params; // parameters handed to the VAD on each run

    float *audio_buffer;           // queued mono float samples at WHISPER_SAMPLE_RATE
    int audio_buffer_queue_size;   // capacity of audio_buffer, in samples
    int audio_buffer_fill_size;    // number of currently buffered samples
    int audio_buffer_vad_size;     // fill size at the time of the last VAD run
    int64_t audio_buffer_start_ms; // timestamp (ms) of the first buffered sample

    int eof;                 // input EOF has been acknowledged
    int64_t next_pts;        // pts for the next (flush) output frame
    AVIOContext *avio_context; // destination writer (NULL when unused)
    int index;               // SRT cue counter
} WhisperContext;
/**
 * Forward whisper.cpp / GGML log messages into the filter's av_log stream,
 * mapping GGML severities onto the closest AV_LOG level.
 */
static void cb_log(enum ggml_log_level level, const char *text, void *user_data)
{
    AVFilterContext *ctx = user_data;
    int log_level;

    switch (level) {
    case GGML_LOG_LEVEL_ERROR:
        log_level = AV_LOG_ERROR;
        break;
    case GGML_LOG_LEVEL_WARN:
        log_level = AV_LOG_WARNING;
        break;
    default:
        log_level = AV_LOG_DEBUG;
        break;
    }

    av_log(ctx, log_level, "%s", text);
}
/**
 * Initialize the filter: load the whisper.cpp model, allocate the audio
 * queue, optionally set up the VAD context and open the output destination.
 *
 * @return 0 on success, a negative AVERROR code on failure
 */
static int init(AVFilterContext *ctx)
{
    WhisperContext *wctx = ctx->priv;
    static AVOnce init_static_once = AV_ONCE_INIT;

    // GGML backend registration must happen exactly once per process
    ff_thread_once(&init_static_once, ggml_backend_load_all);
    whisper_log_set(cb_log, ctx);

    // Init whisper context
    if (!wctx->model_path) {
        av_log(ctx, AV_LOG_ERROR, "No whisper model path specified. Use the 'model' option.\n");
        return AVERROR(EINVAL);
    }

    struct whisper_context_params params = whisper_context_default_params();
    params.use_gpu = wctx->use_gpu;
    params.gpu_device = wctx->gpu_device;

    wctx->ctx_wsp = whisper_init_from_file_with_params(wctx->model_path, params);
    if (wctx->ctx_wsp == NULL) {
        av_log(ctx, AV_LOG_ERROR, "Failed to initialize whisper context from model: %s\n",
               wctx->model_path);
        return AVERROR(EIO);
    }

    // Init buffer: 'queue' is in AV_TIME_BASE (microsecond) units
    wctx->audio_buffer_queue_size = av_rescale(wctx->queue, WHISPER_SAMPLE_RATE, AV_TIME_BASE);
    wctx->audio_buffer = av_malloc_array(wctx->audio_buffer_queue_size, sizeof(*wctx->audio_buffer));
    if (!wctx->audio_buffer)
        return AVERROR(ENOMEM);

    // Init VAD model context
    if (wctx->vad_model_path) {
        struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
        ctx_params.n_threads = ff_filter_get_nb_threads(ctx);
        // ctx_params.use_gpu = wctx->use_gpu; TODO (see: whisper_vad_init_context)
        ctx_params.gpu_device = wctx->gpu_device;

        wctx->ctx_vad = whisper_vad_init_from_file_with_params(wctx->vad_model_path, ctx_params);
        // BUG FIX: a failed VAD init was previously ignored, leaving a NULL
        // context that silently disabled VAD instead of reporting the error
        if (!wctx->ctx_vad) {
            av_log(ctx, AV_LOG_ERROR, "Failed to initialize VAD context from model: %s\n",
                   wctx->vad_model_path);
            return AVERROR(EIO);
        }

        wctx->vad_params = whisper_vad_default_params();
        wctx->vad_params.threshold = wctx->vad_threshold;
        wctx->vad_params.min_speech_duration_ms = av_rescale(wctx->vad_min_speech_duration, 1000, AV_TIME_BASE);
        wctx->vad_params.min_silence_duration_ms = av_rescale(wctx->vad_min_silence_duration, 1000, AV_TIME_BASE);
        wctx->vad_params.max_speech_duration_s = av_rescale(wctx->queue, 1, AV_TIME_BASE);
        wctx->vad_params.speech_pad_ms = 0;
        wctx->vad_params.samples_overlap = 0;
    }

    wctx->next_pts = AV_NOPTS_VALUE;

    if (wctx->destination && strcmp("", wctx->destination)) {
        const char *dst = wctx->destination;
        if (!strcmp("-", dst))
            dst = "pipe:1";
        int ret = avio_open(&wctx->avio_context, dst, AVIO_FLAG_WRITE);
        if (ret < 0) {
            av_log(ctx, AV_LOG_ERROR, "Could not open %s: %s\n", wctx->destination, av_err2str(ret));
            return ret;
        }
        wctx->avio_context->direct = AVIO_FLAG_DIRECT;
    }

    // BUG FIX: %ld is the wrong specifier for int64_t on ILP32/LLP64
    // targets (undefined behavior); use PRId64
    av_log(ctx, AV_LOG_INFO,
           "Whisper filter initialized: model: %s lang: %s queue: %"PRId64" ms\n",
           wctx->model_path, wctx->language, wctx->queue / 1000);

    return 0;
}
/**
 * Release everything acquired in init(): the whisper and VAD contexts, the
 * sample queue and the destination writer. Warns if audio was still queued
 * (i.e. never transcribed) when the filter stopped.
 */
static void uninit(AVFilterContext *ctx)
{
    WhisperContext *wctx = ctx->priv;
    const int leftover = wctx->audio_buffer_fill_size;

    if (leftover > 0)
        av_log(ctx, AV_LOG_WARNING,
               "Remaining audio buffer %d samples (%d seconds) after stopping\n",
               leftover, leftover / WHISPER_SAMPLE_RATE);

    if (wctx->ctx_vad) {
        whisper_vad_free(wctx->ctx_vad);
        wctx->ctx_vad = NULL;
    }

    if (wctx->ctx_wsp) {
        whisper_free(wctx->ctx_wsp);
        wctx->ctx_wsp = NULL;
    }

    av_freep(&wctx->audio_buffer);

    if (wctx->avio_context)
        avio_closep(&wctx->avio_context);
}
/**
 * Transcribe the first @p samples buffered samples.
 *
 * Runs whisper.cpp inference, attaches the concatenated segment text and the
 * processed duration to @p frame metadata ("lavfi.whisper.text" /
 * "lavfi.whisper.duration"), optionally writes each segment to the configured
 * destination (text, SRT or JSON), and finally drops the consumed samples
 * from the front of the queue.
 */
static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples)
{
    WhisperContext *wctx = ctx->priv;
    samples = FFMAX(0, FFMIN(samples, wctx->audio_buffer_fill_size));

    if (!wctx->ctx_wsp || samples == 0)
        return;

    const int64_t timestamp_ms = wctx->audio_buffer_start_ms;
    const float duration = (float) samples / WHISPER_SAMPLE_RATE;

    // BUG FIX: %ld is the wrong specifier for int64_t on ILP32/LLP64
    // targets (undefined behavior); use PRId64 throughout
    av_log(ctx, AV_LOG_INFO,
           "run transcription at %"PRId64" ms, %d/%d samples (%.2f seconds)...\n",
           timestamp_ms, samples, wctx->audio_buffer_fill_size, duration);

    struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    params.language = wctx->language;
    params.n_threads = ff_filter_get_nb_threads(ctx);
    params.print_special = 0;
    params.print_progress = 0;
    params.print_realtime = 0;
    params.print_timestamps = 0;

    if (whisper_full(wctx->ctx_wsp, params, wctx->audio_buffer, samples) != 0) {
        av_log(ctx, AV_LOG_ERROR, "Failed to process audio with whisper.cpp\n");
        return;
    }

    const int n_segments = whisper_full_n_segments(wctx->ctx_wsp);
    char *segments_text = NULL;

    for (int i = 0; i < n_segments; ++i) {
        const char *text = whisper_full_get_segment_text(wctx->ctx_wsp, i);

        if (av_isspace(text[0]))
            text++;

        char *text_cleaned = av_strireplace(text, "[BLANK_AUDIO]", "");

        // Skip segments that are empty after cleanup
        if (av_strnlen(text_cleaned, 1) == 0) {
            av_freep(&text_cleaned);
            continue;
        }

        const bool turn = whisper_full_get_segment_speaker_turn_next(wctx->ctx_wsp, i);
        // whisper.cpp timestamps are in centiseconds
        const int64_t t0_ms = whisper_full_get_segment_t0(wctx->ctx_wsp, i) * 10;
        const int64_t t1_ms = whisper_full_get_segment_t1(wctx->ctx_wsp, i) * 10;

        av_log(ctx, AV_LOG_DEBUG, "[%"PRId64"-%"PRId64"%s]: \"%s\"\n",
               timestamp_ms + t0_ms, timestamp_ms + t1_ms,
               turn ? " (turn)" : "", text_cleaned);

        if (segments_text) {
            char *new_text = av_asprintf("%s%s", segments_text, text_cleaned);
            av_freep(&segments_text);
            segments_text = new_text;
        } else
            segments_text = av_strdup(text_cleaned);

        if (wctx->avio_context) {
            const int64_t start_t = timestamp_ms + t0_ms;
            const int64_t end_t = timestamp_ms + t1_ms;
            char *buf = NULL;

            if (!av_strcasecmp(wctx->format, "srt")) {
                // NOTE(review): the SRT spec separates milliseconds with a
                // comma, not a period — confirm intended output format
                buf = av_asprintf("%d\n%02"PRId64":%02"PRId64":%02"PRId64".%03"PRId64
                                  " --> %02"PRId64":%02"PRId64":%02"PRId64".%03"PRId64"\n%s\n\n",
                                  wctx->index,
                                  start_t / 3600000, (start_t / 60000) % 60,
                                  (start_t / 1000) % 60, start_t % 1000,
                                  end_t / 3600000, (end_t / 60000) % 60,
                                  (end_t / 1000) % 60, end_t % 1000,
                                  text_cleaned);
                // BUG FIX: the cue counter must advance per segment, not once
                // per transcription run, or consecutive cues share one index
                wctx->index++;
            } else if (!av_strcasecmp(wctx->format, "json")) {
                // NOTE(review): text_cleaned is not JSON-escaped; quotes or
                // backslashes in the transcript would produce invalid JSON
                buf = av_asprintf("{\"start\":%"PRId64",\"end\":%"PRId64",\"text\":\"%s\"}\n",
                                  start_t, end_t, text_cleaned);
            } else
                buf = av_strdup(text_cleaned);

            if (buf) {
                avio_write(wctx->avio_context, (const unsigned char *) buf, strlen(buf));
                av_freep(&buf);
            }
        }

        av_freep(&text_cleaned);
    }

    AVDictionary **metadata = &frame->metadata;
    if (metadata && segments_text) {
        av_dict_set(metadata, "lavfi.whisper.text", segments_text, 0);
        char *duration_text = av_asprintf("%f", duration);
        av_dict_set(metadata, "lavfi.whisper.duration", duration_text, AV_DICT_DONT_STRDUP_VAL);
    }
    av_freep(&segments_text);

    if (wctx->audio_buffer_fill_size > samples) {
        // BUG FIX: source and destination overlap, so memcpy() is undefined
        // behavior here — memmove() is required
        memmove(wctx->audio_buffer, wctx->audio_buffer + samples,
                (wctx->audio_buffer_fill_size - samples) * sizeof(*wctx->audio_buffer));
        wctx->audio_buffer_start_ms += duration * 1000;
    }
    wctx->audio_buffer_fill_size -= samples;
    wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size;
}
/**
 * Buffer the incoming frame's samples, trigger queue-driven or VAD-driven
 * transcription, and pass the frame (annotated with metadata by
 * run_transcription()) downstream unchanged.
 */
static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
{
    AVFilterContext *ctx = inlink->dst;
    WhisperContext *wctx = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];
    const int samples = frame->nb_samples;
    const float *input_data = (const float *) frame->data[0];

    // Make room first: if the new frame would overflow the queue,
    // transcribe everything buffered so far
    if (wctx->audio_buffer_fill_size + samples > wctx->audio_buffer_queue_size) {
        run_transcription(ctx, frame, wctx->audio_buffer_fill_size);
    }

    // Empty buffer: record its start time in milliseconds
    // (pts * 1000 / (den/num) == pts * time_base * 1000)
    if (!wctx->audio_buffer_fill_size)
        wctx->audio_buffer_start_ms = av_rescale_q(frame->pts,
                                                   (AVRational) { 1000, 1 },
                                                   (AVRational) { inlink->time_base.den, inlink->time_base.num });

    memcpy(wctx->audio_buffer + wctx->audio_buffer_fill_size, input_data,
           samples * sizeof(*wctx->audio_buffer));
    wctx->audio_buffer_fill_size += samples;

    // Run the VAD only when enough new audio has accumulated since the last
    // VAD pass to possibly contain a complete speech+silence interval
    if (wctx->ctx_vad
        && (wctx->audio_buffer_fill_size - wctx->audio_buffer_vad_size) >=
        av_rescale(wctx->vad_min_speech_duration + wctx->vad_min_silence_duration, WHISPER_SAMPLE_RATE, AV_TIME_BASE)) {
        struct whisper_vad_segments *segments = whisper_vad_segments_from_samples(wctx->ctx_vad,
                                                                                 wctx->vad_params,
                                                                                 wctx->audio_buffer,
                                                                                 wctx->audio_buffer_fill_size);
        wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size;

        if (!segments) {
            av_log(ctx, AV_LOG_ERROR, "failed to detect VAD\n");
        } else {
            int n_segments = whisper_vad_segments_n_segments(segments);

            if (n_segments > 0) {
                // VAD timestamps are in centiseconds
                const float start_ms = whisper_vad_segments_get_segment_t0(segments, 0) * 10.0;
                const float end_ms = whisper_vad_segments_get_segment_t1(segments, n_segments - 1) * 10.0;
                int end_pos = (int) (end_ms * WHISPER_SAMPLE_RATE / 1000);

                // Transcribe only if the last detected speech ends at least
                // vad_min_silence_duration before the end of the buffer,
                // i.e. speech has plausibly finished
                if (end_pos <= wctx->audio_buffer_fill_size -
                    av_rescale(wctx->vad_min_silence_duration, WHISPER_SAMPLE_RATE, AV_TIME_BASE)) {
                    // NOTE(review): 1000 * fill_size may overflow int for
                    // very large queues — TODO confirm bounds
                    av_log(ctx, AV_LOG_INFO,
                           "VAD detected %d segments, start: %.0f ms, end: %.0f ms (buffer: %d ms)\n",
                           n_segments, start_ms, end_ms, 1000 * wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);
                    run_transcription(ctx, frame, end_pos);
                }
            }
            whisper_vad_free_segments(segments);
        }
    } else if (wctx->audio_buffer_fill_size >= wctx->audio_buffer_queue_size)
        // No VAD: transcribe whenever the queue is full
        run_transcription(ctx, frame, wctx->audio_buffer_fill_size);

    // Remember where the next (flush) frame's pts should start
    wctx->next_pts = frame->pts + av_rescale_q(samples, (AVRational) { 1, inlink->sample_rate },
                                               inlink->time_base);

    return ff_filter_frame(outlink, frame);
}
/**
 * EOF flush: emit a single silent sample whose metadata carries the
 * transcription of whatever audio is still buffered, so the final text is
 * not lost when the stream ends.
 */
static int push_last_frame(AVFilterLink *outlink)
{
    AVFilterContext *ctx = outlink->src;
    WhisperContext *wctx = ctx->priv;
    const int n_out = 1;
    AVFrame *frame;

    if (ctx->is_disabled || wctx->audio_buffer_fill_size == 0)
        return 0;

    frame = ff_get_audio_buffer(outlink, n_out);
    if (!frame)
        return AVERROR(ENOMEM);

    av_samples_set_silence(frame->extended_data, 0, n_out,
                           frame->ch_layout.nb_channels, frame->format);

    frame->pts = wctx->next_pts;
    if (wctx->next_pts != AV_NOPTS_VALUE)
        wctx->next_pts += av_rescale_q(n_out,
                                       (AVRational) { 1, outlink->sample_rate },
                                       outlink->time_base);

    run_transcription(ctx, frame, wctx->audio_buffer_fill_size);

    return ff_filter_frame(outlink, frame);
}
/**
 * Activation callback: consume queued input frames, acknowledge EOF, and on
 * EOF flush the remaining buffered audio before propagating the status.
 */
static int activate(AVFilterContext *ctx)
{
    AVFilterLink *inlink = ctx->inputs[0];
    AVFilterLink *outlink = ctx->outputs[0];
    WhisperContext *wctx = ctx->priv;
    int64_t pts;
    int status;

    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

    if (!wctx->eof && ff_inlink_queued_frames(inlink)) {
        AVFrame *frame = NULL;
        int ret;

        ret = ff_inlink_consume_frame(inlink, &frame);
        if (ret < 0)
            return ret;
        if (ret > 0)
            return filter_frame(inlink, frame);
    }

    if (!wctx->eof && ff_inlink_acknowledge_status(inlink, &status, &pts))
        wctx->eof = status == AVERROR_EOF;

    if (wctx->eof) {
        // BUG FIX: push_last_frame() can fail (ENOMEM, downstream error);
        // previously the return value was silently discarded
        int ret = push_last_frame(outlink);
        if (ret < 0)
            return ret;
        ff_outlink_set_status(outlink, AVERROR_EOF, wctx->next_pts);
        return 0;
    }

    FF_FILTER_FORWARD_WANTED(outlink, inlink);

    return FFERROR_NOT_READY;
}
/**
 * Constrain both link sides to what whisper.cpp consumes: mono,
 * packed float samples at WHISPER_SAMPLE_RATE.
 */
static int query_formats(const AVFilterContext *ctx,
                         AVFilterFormatsConfig **cfg_in,
                         AVFilterFormatsConfig **cfg_out)
{
    static const enum AVSampleFormat sample_fmts[] = { AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_NONE };
    AVChannelLayout chlayouts[] = { FF_COUNT2LAYOUT(1), { 0 } };
    int sample_rates[] = { WHISPER_SAMPLE_RATE, -1 };
    int ret;

    if ((ret = ff_set_common_formats_from_list2(ctx, cfg_in, cfg_out, sample_fmts)) < 0)
        return ret;

    if ((ret = ff_set_common_channel_layouts_from_list2(ctx, cfg_in, cfg_out, chlayouts)) < 0)
        return ret;

    return ff_set_common_samplerates_from_list2(ctx, cfg_in, cfg_out, sample_rates);
}
#define OFFSET(x) offsetof(WhisperContext, x)
#define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM
// One hour expressed in AV_TIME_BASE (microsecond) units; upper bound
// for all duration options below
#define HOURS 3600000000

static const AVOption whisper_options[] = {
    { "model", "Path to the whisper.cpp model file", OFFSET(model_path), AV_OPT_TYPE_STRING, .flags = FLAGS },
    { "language", "Language for transcription ('auto' for auto-detect)", OFFSET(language), AV_OPT_TYPE_STRING, { .str = "auto" }, .flags = FLAGS },
    { "queue", "Audio queue size", OFFSET(queue), AV_OPT_TYPE_DURATION, { .i64 = 3000000 }, 20000, HOURS, .flags = FLAGS },
    { "use_gpu", "Use GPU for processing", OFFSET(use_gpu), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, .flags = FLAGS },
    { "gpu_device", "GPU device to use", OFFSET(gpu_device), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, .flags = FLAGS },
    { "destination", "Output destination", OFFSET(destination), AV_OPT_TYPE_STRING, { .str = "" }, .flags = FLAGS },
    { "format", "Output format (text|srt|json)", OFFSET(format), AV_OPT_TYPE_STRING, { .str = "text" }, .flags = FLAGS },
    { "vad_model", "Path to the VAD model file", OFFSET(vad_model_path), AV_OPT_TYPE_STRING, .flags = FLAGS },
    { "vad_threshold", "VAD threshold", OFFSET(vad_threshold), AV_OPT_TYPE_FLOAT, { .dbl = 0.5 }, 0.0, 1.0, .flags = FLAGS },
    { "vad_min_speech_duration", "Minimum speech duration for VAD", OFFSET(vad_min_speech_duration), AV_OPT_TYPE_DURATION, { .i64 = 100000 }, 20000, HOURS, .flags = FLAGS },
    { "vad_min_silence_duration", "Minimum silence duration for VAD", OFFSET(vad_min_silence_duration), AV_OPT_TYPE_DURATION, { .i64 = 500000 }, 0, HOURS, .flags = FLAGS },
    { NULL }
};
// AVClass exposing the filter's options to the AVOption API
static const AVClass whisper_class = {
    .class_name = "whisper",
    .item_name  = av_default_item_name,
    .option     = whisper_options,
    .version    = LIBAVUTIL_VERSION_INT,
};
// Filter definition: audio passthrough, only frame metadata is modified
const FFFilter ff_af_whisper = {
    .p.name        = "whisper",
    .p.description = NULL_IF_CONFIG_SMALL("Transcribe audio using whisper.cpp."),
    .p.priv_class  = &whisper_class,
    .p.flags       = AVFILTER_FLAG_METADATA_ONLY,
    .init          = init,
    .uninit        = uninit,
    .activate      = activate,
    .priv_size     = sizeof(WhisperContext),
    FILTER_INPUTS(ff_audio_default_filterpad),
    FILTER_OUTPUTS(ff_audio_default_filterpad),
    FILTER_QUERY_FUNC2(query_formats),
};