lavc: use a separate field for exporting audio encoder padding

Currently, the amount of padding inserted at the beginning by some audio encoders is exported through AVCodecContext.delay. However:
- the term 'delay' is heavily overloaded and can have multiple different meanings even in the case of audio encoding;
- this field has entirely different meanings depending on whether the codec context is used for encoding or decoding (and has yet another different meaning for video), preventing generic handling of the codec context.

Therefore, add a new field -- AVCodecContext.initial_padding. It could conceivably be used for decoding as well at a later point.
2024-12-23 12:43:46 +02:00 · 2014-08-23 12:40:50 +00:00 · 2014-08-23 12:40:50 +00:00 · 2df0c32ea1
commit 2df0c32ea1
parent c80a816142
22 changed files with 63 additions and 40 deletions
--- a/doc/APIchanges
+++ b/doc/APIchanges
@ -13,6 +13,10 @@ libavutil:     2014-08-09

 API changes, most recent first:

+2014-10-13 - xxxxxxx - lavc 56.03.0 - avcodec.h
+  Add AVCodecContext.initial_padding. Deprecate the use of AVCodecContext.delay
+  for audio encoding.
+
 2014-09-xx - xxxxxxx - lavu 54.04.0 - pixdesc.h
  Add API to return the name of frame and context color properties.

--- a/libavcodec/aacenc.c
+++ b/libavcodec/aacenc.c
@ -777,7 +777,7 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
    for (i = 0; i < 428; i++)
        ff_aac_pow34sf_tab[i] = sqrt(ff_aac_pow2sf_tab[i] * sqrt(ff_aac_pow2sf_tab[i]));

-    avctx->delay = 1024;
+    avctx->initial_padding = 1024;
    ff_af_queue_init(avctx, &s->afq);

    return 0;
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@ -2436,7 +2436,7 @@ av_cold int ff_ac3_encode_init(AVCodecContext *avctx)
        return ret;

    avctx->frame_size = AC3_BLOCK_SIZE * s->num_blocks;
-    avctx->delay      = AC3_BLOCK_SIZE;
+    avctx->initial_padding = AC3_BLOCK_SIZE;

    s->bitstream_mode = avctx->audio_service_type;
    if (s->bitstream_mode == AV_AUDIO_SERVICE_TYPE_KARAOKE)
--- a/libavcodec/audio_frame_queue.c
+++ b/libavcodec/audio_frame_queue.c
@ -29,8 +29,8 @@ av_cold void ff_af_queue_init(AVCodecContext *avctx, AudioFrameQueue *afq)
 {
    afq->avctx             = avctx;
    afq->next_pts          = AV_NOPTS_VALUE;
-    afq->remaining_delay   = avctx->delay;
-    afq->remaining_samples = avctx->delay;
+    afq->remaining_delay   = avctx->initial_padding;
+    afq->remaining_samples = avctx->initial_padding;
    afq->frame_queue       = NULL;
 }

--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@ -1191,16 +1191,7 @@ typedef struct AVCodecContext {
     *   encoded input.
     *
     * Audio:
-     *   For encoding, this is the number of "priming" samples added by the
-     *   encoder to the beginning of the stream. The decoded output will be
-     *   delayed by this many samples relative to the input to the encoder (or
-     *   more, if the decoder adds its own padding).
-     *   The timestamps on the output packets are adjusted by the encoder so
-     *   that they always refer to the first sample of the data actually
-     *   contained in the packet, including any added padding.
-     *   E.g. if the timebase is 1/samplerate and the timestamp of the first
-     *   input sample is 0, the timestamp of the first output packet will be
-     *   -delay.
+     *   For encoding, this field is unused (see initial_padding).
     *
     *   For decoding, this is the number of samples the decoder needs to
     *   output before the decoder's output is valid. When seeking, you should
@ -2780,6 +2771,23 @@ typedef struct AVCodecContext {
     * use AVOptions to set this field.
     */
    int side_data_only_packets;
+
+    /**
+     * Audio only. The number of "priming" samples (padding) inserted by the
+     * encoder at the beginning of the audio. I.e. this number of leading
+     * decoded samples must be discarded by the caller to get the original audio
+     * without leading padding.
+     *
+     * - decoding: unused
+     * - encoding: Set by libavcodec. The timestamps on the output packets are
+     *             adjusted by the encoder so that they always refer to the
+     *             first sample of the data actually contained in the packet,
+     *             including any added padding.  E.g. if the timebase is
+     *             1/samplerate and the timestamp of the first input sample is
+     *             0, the timestamp of the first output packet will be
+     *             -initial_padding.
+     */
+    int initial_padding;
 } AVCodecContext;

 /**
--- a/libavcodec/g722enc.c
+++ b/libavcodec/g722enc.c
@ -106,7 +106,7 @@ static av_cold int g722_encode_init(AVCodecContext * avctx)
           a common packet size for VoIP applications */
        avctx->frame_size = 320;
    }
-    avctx->delay = 22;
+    avctx->initial_padding = 22;

    if (avctx->trellis) {
        /* validate trellis */
@ -375,7 +375,7 @@ static int g722_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
    }

    if (frame->pts != AV_NOPTS_VALUE)
-        avpkt->pts = frame->pts - ff_samples_to_time_base(avctx, avctx->delay);
+        avpkt->pts = frame->pts - ff_samples_to_time_base(avctx, avctx->initial_padding);
    *got_packet_ptr = 1;
    return 0;
 }
--- a/libavcodec/libfaac.c
+++ b/libavcodec/libfaac.c
@ -157,7 +157,7 @@ static av_cold int Faac_encode_init(AVCodecContext *avctx)
        goto error;
    }

-    avctx->delay = FAAC_DELAY_SAMPLES;
+    avctx->initial_padding = FAAC_DELAY_SAMPLES;
    ff_af_queue_init(avctx, &s->afq);

    return 0;
--- a/libavcodec/libfdk-aacenc.c
+++ b/libavcodec/libfdk-aacenc.c
@ -286,7 +286,7 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
    }

    avctx->frame_size = info.frameLength;
-    avctx->delay      = info.encoderDelay;
+    avctx->initial_padding = info.encoderDelay;
    ff_af_queue_init(avctx, &s->afq);

    if (avctx->flags & CODEC_FLAG_GLOBAL_HEADER) {
--- a/libavcodec/libmp3lame.c
+++ b/libavcodec/libmp3lame.c
@ -137,7 +137,7 @@ static av_cold int mp3lame_encode_init(AVCodecContext *avctx)
    }

    /* get encoder delay */
-    avctx->delay = lame_get_encoder_delay(s->gfp) + 528 + 1;
+    avctx->initial_padding = lame_get_encoder_delay(s->gfp) + 528 + 1;
    ff_af_queue_init(avctx, &s->afq);

    avctx->frame_size  = lame_get_framesize(s->gfp);
--- a/libavcodec/libopencore-amr.c
+++ b/libavcodec/libopencore-amr.c
@ -200,7 +200,7 @@ static av_cold int amr_nb_encode_init(AVCodecContext *avctx)
    }

    avctx->frame_size  = 160;
-    avctx->delay       =  50;
+    avctx->initial_padding = 50;
    ff_af_queue_init(avctx, &s->afq);

    s->enc_state = Encoder_Interface_init(s->enc_dtx);
@ -250,7 +250,7 @@ static int amr_nb_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                return AVERROR(ENOMEM);
            memcpy(flush_buf, samples, frame->nb_samples * sizeof(*flush_buf));
            samples = flush_buf;
-            if (frame->nb_samples < avctx->frame_size - avctx->delay)
+            if (frame->nb_samples < avctx->frame_size - avctx->initial_padding)
                s->enc_last_frame = -1;
        }
        if ((ret = ff_af_queue_add(&s->afq, frame)) < 0) {
--- a/libavcodec/libopusenc.c
+++ b/libavcodec/libopusenc.c
@ -87,7 +87,7 @@ static void libopus_write_header(AVCodecContext *avctx, int stream_count,
    bytestream_put_buffer(&p, "OpusHead", 8);
    bytestream_put_byte(&p, 1); /* Version */
    bytestream_put_byte(&p, channels);
-    bytestream_put_le16(&p, avctx->delay); /* Lookahead samples at 48kHz */
+    bytestream_put_le16(&p, avctx->initial_padding); /* Lookahead samples at 48kHz */
    bytestream_put_le32(&p, avctx->sample_rate); /* Original sample rate */
    bytestream_put_le16(&p, 0); /* Gain of 0dB is recommended. */

@ -277,7 +277,7 @@ static int av_cold libopus_encode_init(AVCodecContext *avctx)
        goto fail;
    }

-    ret = opus_multistream_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&avctx->delay));
+    ret = opus_multistream_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&avctx->initial_padding));
    if (ret != OPUS_OK)
        av_log(avctx, AV_LOG_WARNING,
               "Unable to get number of lookahead samples: %s\n",
--- a/libavcodec/libspeexenc.c
+++ b/libavcodec/libspeexenc.c
@ -235,7 +235,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
    s->header.frames_per_packet = s->frames_per_packet;

    /* set encoding delay */
-    speex_encoder_ctl(s->enc_state, SPEEX_GET_LOOKAHEAD, &avctx->delay);
+    speex_encoder_ctl(s->enc_state, SPEEX_GET_LOOKAHEAD, &avctx->initial_padding);
    ff_af_queue_init(avctx, &s->afq);

    /* create header packet bytes from header struct */
--- a/libavcodec/libtwolame.c
+++ b/libavcodec/libtwolame.c
@ -60,7 +60,7 @@ static av_cold int twolame_encode_init(AVCodecContext *avctx)
    int ret;

    avctx->frame_size = TWOLAME_SAMPLES_PER_FRAME;
-    avctx->delay      = 512 - 32 + 1;
+    avctx->initial_padding = 512 - 32 + 1;

    s->glopts = twolame_init();
    if (!s->glopts)
@ -151,7 +151,7 @@ static int twolame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
    avpkt->duration = ff_samples_to_time_base(avctx, frame->nb_samples);
    if (frame) {
        if (frame->pts != AV_NOPTS_VALUE)
-            avpkt->pts = frame->pts - ff_samples_to_time_base(avctx, avctx->delay);
+            avpkt->pts = frame->pts - ff_samples_to_time_base(avctx, avctx->initial_padding);
    } else {
        avpkt->pts = s->next_pts;
    }
--- a/libavcodec/libvo-aacenc.c
+++ b/libavcodec/libvo-aacenc.c
@ -61,7 +61,7 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
    int index, ret;

    avctx->frame_size = FRAME_SIZE;
-    avctx->delay      = ENC_DELAY;
+    avctx->initial_padding = ENC_DELAY;
    s->last_frame     = 2;
    ff_af_queue_init(avctx, &s->afq);

--- a/libavcodec/libvo-amrwbenc.c
+++ b/libavcodec/libvo-amrwbenc.c
@ -93,7 +93,7 @@ static av_cold int amr_wb_encode_init(AVCodecContext *avctx)
    s->last_bitrate    = avctx->bit_rate;

    avctx->frame_size  = 320;
-    avctx->delay       =  80;
+    avctx->initial_padding =  80;

    s->state     = E_IF_init();

@ -131,7 +131,7 @@ static int amr_wb_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
    }

    if (frame->pts != AV_NOPTS_VALUE)
-        avpkt->pts = frame->pts - ff_samples_to_time_base(avctx, avctx->delay);
+        avpkt->pts = frame->pts - ff_samples_to_time_base(avctx, avctx->initial_padding);

    avpkt->size = size;
    *got_packet_ptr = 1;
--- a/libavcodec/libvorbis.c
+++ b/libavcodec/libvorbis.c
@ -322,8 +322,8 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
    if (duration > 0) {
        /* we do not know encoder delay until we get the first packet from
         * libvorbis, so we have to update the AudioFrameQueue counts */
-        if (!avctx->delay) {
-            avctx->delay              = duration;
+        if (!avctx->initial_padding) {
+            avctx->initial_padding    = duration;
            s->afq.remaining_delay   += duration;
            s->afq.remaining_samples += duration;
        }
--- a/libavcodec/mpegaudioenc.c
+++ b/libavcodec/mpegaudioenc.c
@ -84,7 +84,7 @@ static av_cold int MPA_encode_init(AVCodecContext *avctx)
    bitrate = bitrate / 1000;
    s->nb_channels = channels;
    avctx->frame_size = MPA_FRAME_SIZE;
-    avctx->delay      = 512 - 32 + 1;
+    avctx->initial_padding = 512 - 32 + 1;

    /* encoding freq */
    s->lsf = 0;
@ -735,7 +735,7 @@ static int MPA_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
    encode_frame(s, bit_alloc, padding);

    if (frame->pts != AV_NOPTS_VALUE)
-        avpkt->pts = frame->pts - ff_samples_to_time_base(avctx, avctx->delay);
+        avpkt->pts = frame->pts - ff_samples_to_time_base(avctx, avctx->initial_padding);

    avpkt->size = put_bits_count(&s->pb) / 8;
    *got_packet_ptr = 1;
--- a/libavcodec/nellymoserenc.c
+++ b/libavcodec/nellymoserenc.c
@ -165,7 +165,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
    }

    avctx->frame_size = NELLY_SAMPLES;
-    avctx->delay      = NELLY_BUF_LEN;
+    avctx->initial_padding = NELLY_BUF_LEN;
    ff_af_queue_init(avctx, &s->afq);
    s->avctx = avctx;
    if ((ret = ff_mdct_init(&s->mdct_ctx, 8, 0, 32768.0)) < 0)
--- a/libavcodec/ra144enc.c
+++ b/libavcodec/ra144enc.c
@ -56,7 +56,7 @@ static av_cold int ra144_encode_init(AVCodecContext * avctx)
        return -1;
    }
    avctx->frame_size = NBLOCKS * BLOCKSIZE;
-    avctx->delay      = avctx->frame_size;
+    avctx->initial_padding = avctx->frame_size;
    avctx->bit_rate = 8000;
    ractx = avctx->priv_data;
    ractx->lpc_coef[0] = ractx->lpc_tables[0];
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@ -1240,6 +1240,11 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
        }
    }

+#if FF_API_AUDIOENC_DELAY
+    if (av_codec_is_encoder(avctx->codec))
+        avctx->delay = avctx->initial_padding;
+#endif
+
    if (av_codec_is_decoder(avctx->codec)) {
        /* validate channel layout from the decoder */
        if (avctx->channel_layout) {
@ -1447,6 +1452,10 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
 end:
    av_frame_free(&padded_frame);

+#if FF_API_AUDIOENC_DELAY
+    avctx->delay = avctx->initial_padding;
+#endif
+
    return ret;
 }

--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@ -29,8 +29,8 @@
 #include "libavutil/version.h"

 #define LIBAVCODEC_VERSION_MAJOR 56
-#define LIBAVCODEC_VERSION_MINOR  2
-#define LIBAVCODEC_VERSION_MICRO  2
+#define LIBAVCODEC_VERSION_MINOR  3
+#define LIBAVCODEC_VERSION_MICRO  0

 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
                                               LIBAVCODEC_VERSION_MINOR, \
@ -153,5 +153,8 @@
 #ifndef FF_API_AFD
 #define FF_API_AFD               (LIBAVCODEC_VERSION_MAJOR < 57)
 #endif
+#ifndef FF_API_AUDIOENC_DELAY
+#define FF_API_AUDIOENC_DELAY    (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif

 #endif /* AVCODEC_VERSION_H */
--- a/libavcodec/wmaenc.c
+++ b/libavcodec/wmaenc.c
@ -92,8 +92,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
    avctx->block_align = block_align;
    avctx->bit_rate    = avctx->block_align * 8LL * avctx->sample_rate /
                         s->frame_len;
-    avctx->frame_size  =
-    avctx->delay       = s->frame_len;
+    avctx->frame_size = avctx->initial_padding = s->frame_len;

    return 0;
 }
@ -420,7 +419,7 @@ static int encode_superframe(AVCodecContext *avctx, AVPacket *avpkt,
    flush_put_bits(&s->pb);

    if (frame->pts != AV_NOPTS_VALUE)
-        avpkt->pts = frame->pts - ff_samples_to_time_base(avctx, avctx->delay);
+        avpkt->pts = frame->pts - ff_samples_to_time_base(avctx, avctx->initial_padding);

    avpkt->size     = avctx->block_align;
    *got_packet_ptr = 1;