From 3deb4b54a24f8cddce463d9f5751b01efeb976af Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Tue, 20 Dec 2016 17:14:26 -0500
Subject: [PATCH] wmavoice: disable bitstream checking.

The checked bitstream reader does that already. To allow parsing of
superframes split over a packet boundary, we always decode the last
superframe in each packet at the start of the next packet, even if
theoretically we could have decoded it. The last superframe in the
last packet is decoded using AV_CODEC_CAP_DELAY.
---
 libavcodec/wmavoice.c | 144 +++++++++---------------------------------
 1 file changed, 29 insertions(+), 115 deletions(-)

diff --git a/libavcodec/wmavoice.c b/libavcodec/wmavoice.c
index 4b3ab431c3..ae100fb39b 100644
--- a/libavcodec/wmavoice.c
+++ b/libavcodec/wmavoice.c
@@ -251,6 +251,7 @@ typedef struct WMAVoiceContext {
 
     int frame_cntr;               ///< current frame index [0 - 0xFFFE]; is
                                   ///< only used for comfort noise in #pRNG()
+    int nb_superframes;           ///< number of superframes in current packet
     float gain_pred_err[6];       ///< cache for gain prediction
     float excitation_history[MAX_SIGNAL_HISTORY];
                                   ///< cache of the signal of previous
@@ -875,7 +876,6 @@ static void dequant_lsps(double *lsps, int num,
 /**
  * @name LSP dequantization routines
  * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
- * @note we assume enough bits are available, caller should check.
  * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
  * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
  * @{
@@ -1419,7 +1419,6 @@ static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
 
 /**
  * Parse data in a single block.
- * @note we assume enough bits are available, caller should check.
  *
  * @param s WMA Voice decoding context private data
  * @param gb bit I/O context
@@ -1463,7 +1462,6 @@ static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
 
 /**
  * Synthesize output samples for a single frame.
- * @note we assume enough bits are available, caller should check.
  *
  * @param ctx WMA Voice decoder context
  * @param gb bit I/O context (s->gb or one for cross-packet superframes)
@@ -1681,83 +1679,6 @@ static void stabilize_lsps(double *lsps, int num)
     }
 }
 
-/**
- * Test if there's enough bits to read 1 superframe.
- *
- * @param orig_gb bit I/O context used for reading. This function
- *                does not modify the state of the bitreader; it
- *                only uses it to copy the current stream position
- * @param s WMA Voice decoding context private data
- * @return < 0 on error, 1 on not enough bits or 0 if OK.
- */
-static int check_bits_for_superframe(GetBitContext *orig_gb,
-                                     WMAVoiceContext *s)
-{
-    GetBitContext s_gb, *gb = &s_gb;
-    int n, need_bits, bd_idx;
-    const struct frame_type_desc *frame_desc;
-
-    /* initialize a copy */
-    init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
-    skip_bits_long(gb, get_bits_count(orig_gb));
-    av_assert1(get_bits_left(gb) == get_bits_left(orig_gb));
-
-    /* superframe header */
-    if (get_bits_left(gb) < 14)
-        return 1;
-    if (!get_bits1(gb))
-        return AVERROR(ENOSYS);           // WMAPro-in-WMAVoice superframe
-    if (get_bits1(gb)) skip_bits(gb, 12); // number of  samples in superframe
-    if (s->has_residual_lsps) {           // residual LSPs (for all frames)
-        if (get_bits_left(gb) < s->sframe_lsp_bitsize)
-            return 1;
-        skip_bits_long(gb, s->sframe_lsp_bitsize);
-    }
-
-    /* frames */
-    for (n = 0; n < MAX_FRAMES; n++) {
-        int aw_idx_is_ext = 0;
-
-        if (!s->has_residual_lsps) {     // independent LSPs (per-frame)
-           if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
-           skip_bits_long(gb, s->frame_lsp_bitsize);
-        }
-        bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)];
-        if (bd_idx < 0)
-            return AVERROR_INVALIDDATA; // invalid frame type VLC code
-        frame_desc = &frame_descs[bd_idx];
-        if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
-            if (get_bits_left(gb) < s->pitch_nbits)
-                return 1;
-            skip_bits_long(gb, s->pitch_nbits);
-        }
-        if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
-            skip_bits(gb, 8);
-        } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
-            int tmp = get_bits(gb, 6);
-            if (tmp >= 0x36) {
-                skip_bits(gb, 2);
-                aw_idx_is_ext = 1;
-            }
-        }
-
-        /* blocks */
-        if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
-            need_bits = s->block_pitch_nbits +
-                (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
-        } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
-            need_bits = 2 * !aw_idx_is_ext;
-        } else
-            need_bits = 0;
-        need_bits += frame_desc->frame_size;
-        if (get_bits_left(gb) < need_bits)
-            return 1;
-        skip_bits_long(gb, need_bits);
-    }
-
-    return 0;
-}
-
 /**
  * Synthesize output samples for a single superframe. If we have any data
  * cached in s->sframe_cache, that will be used instead of whatever is loaded
@@ -1780,7 +1701,7 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
 {
     WMAVoiceContext *s = ctx->priv_data;
     GetBitContext *gb = &s->gb, s_gb;
-    int n, res, n_samples = 480;
+    int n, res, n_samples = MAX_SFRAMESIZE;
     double lsps[MAX_FRAMES][MAX_LSPS];
     const double *mean_lsf = s->lsps == 16 ?
         wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
@@ -1799,12 +1720,6 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
         s->sframe_cache_size = 0;
     }
 
-    if ((res = check_bits_for_superframe(gb, s)) == 1) {
-        *got_frame_ptr = 0;
-        return 1;
-    } else if (res < 0)
-        return res;
-
     /* First bit is speech/music bit, it differentiates between WMAVoice
      * speech samples (the actual codec) and WMAVoice music samples, which
      * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
@@ -1816,13 +1731,14 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
 
     /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
     if (get_bits1(gb)) {
-        if ((n_samples = get_bits(gb, 12)) > 480) {
+        if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
             av_log(ctx, AV_LOG_ERROR,
-                   "Superframe encodes >480 samples (%d), not allowed\n",
-                   n_samples);
+                   "Superframe encodes > %d samples (%d), not allowed\n",
+                   MAX_SFRAMESIZE, n_samples);
             return AVERROR_INVALIDDATA;
         }
     }
+
     /* Parse LSPs, if global for the superframe (can also be per-frame). */
     if (s->has_residual_lsps) {
         double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
@@ -1845,7 +1761,7 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
     }
 
     /* get output buffer */
-    frame->nb_samples = 480;
+    frame->nb_samples = MAX_SFRAMESIZE;
     if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
         return res;
     frame->nb_samples = n_samples;
@@ -1905,26 +1821,23 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
  * decoder).
  *
  * @param s WMA Voice decoding context private data
- * @return 1 if not enough bits were available, or 0 on success.
+ * @return <0 on error, nb_superframes on success.
  */
 static int parse_packet_header(WMAVoiceContext *s)
 {
     GetBitContext *gb = &s->gb;
-    unsigned int res;
+    unsigned int res, n_superframes = 0;
 
-    if (get_bits_left(gb) < 11)
-        return 1;
     skip_bits(gb, 4);          // packet sequence number
     s->has_residual_lsps = get_bits1(gb);
     do {
         res = get_bits(gb, 6); // number of superframes per packet
                                // (minus first one if there is spillover)
-        if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize)
-            return 1;
+        n_superframes += res;
     } while (res == 0x3F);
     s->spillover_nbits   = get_bits(gb, s->spillover_bitsize);
 
-    return 0;
+    return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
 }
 
 /**
@@ -1984,23 +1897,24 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
      * in a single "muxer" packet, so we artificially emulate that by
      * capping the packet size at ctx->block_align. */
     for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
-    if (!size) {
-        *got_frame_ptr = 0;
-        return 0;
-    }
     init_get_bits(&s->gb, avpkt->data, size << 3);
 
     /* size == ctx->block_align is used to indicate whether we are dealing with
      * a new packet or a packet of which we already read the packet header
      * previously. */
-    if (size == ctx->block_align) { // new packet header
-        if ((res = parse_packet_header(s)) < 0)
-            return res;
+    if (!(size % ctx->block_align)) { // new packet header
+        if (!size) {
+            s->spillover_nbits = 0;
+            s->nb_superframes = 0;
+        } else {
+            if ((res = parse_packet_header(s)) < 0)
+                return res;
+            s->nb_superframes = res;
+        }
 
         /* If the packet header specifies a s->spillover_nbits, then we want
          * to push out all data of the previous packet (+ spillover) before
          * continuing to parse new superframes in the current packet. */
-        if (s->spillover_nbits > 0) {
             if (s->sframe_cache_size > 0) {
                 int cnt = get_bits_count(gb);
                 copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
@@ -2021,9 +1935,9 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
                 } else
                     skip_bits_long (gb, s->spillover_nbits - cnt +
                                     get_bits_count(gb)); // resync
-            } else
+            } else if (s->spillover_nbits) {
                 skip_bits_long(gb, s->spillover_nbits);  // resync
-        }
+            }
     } else if (s->skip_bits_next)
         skip_bits(gb, s->skip_bits_next);
 
@@ -2031,6 +1945,10 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
     s->sframe_cache_size = 0;
     s->skip_bits_next = 0;
     pos = get_bits_left(gb);
+    if (s->nb_superframes-- == 0) {
+        *got_frame_ptr = 0;
+        return size;
+    } else if (s->nb_superframes > 0) {
     if ((res = synth_superframe(ctx, data, got_frame_ptr)) < 0) {
         return res;
     } else if (*got_frame_ptr) {
@@ -2044,13 +1962,9 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
             return AVERROR_INVALIDDATA;
         }
         return res;
+    }
     } else if ((s->sframe_cache_size = pos) > 0) {
-        /* rewind bit reader to start of last (incomplete) superframe... */
-        init_get_bits(gb, avpkt->data, size << 3);
-        skip_bits_long(gb, (size << 3) - pos);
-        av_assert1(get_bits_left(gb) == pos);
-
-        /* ...and cache it for spillover in next packet */
+        /* ... cache it for spillover in next packet */
         init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
         copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
         // FIXME bad - just copy bytes as whole and add use the
@@ -2084,6 +1998,6 @@ AVCodec ff_wmavoice_decoder = {
     .init_static_data = wmavoice_init_static_data,
     .close            = wmavoice_decode_end,
     .decode           = wmavoice_decode_packet,
-    .capabilities     = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1,
+    .capabilities     = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
     .flush            = wmavoice_flush,
 };