diff --git a/avconv.c b/avconv.c
index 6c100ff42d..64017602cb 100644
--- a/avconv.c
+++ b/avconv.c
@@ -192,6 +192,13 @@ static void avconv_cleanup(int ret)
 
         avcodec_free_context(&ost->enc_ctx);
 
+        while (av_fifo_size(ost->muxing_queue)) {
+            AVPacket pkt;
+            av_fifo_generic_read(ost->muxing_queue, &pkt, sizeof(pkt), NULL);
+            av_packet_unref(&pkt);
+        }
+        av_fifo_free(ost->muxing_queue);
+
         av_freep(&output_streams[i]);
     }
     for (i = 0; i < nb_input_files; i++) {
@@ -255,11 +262,33 @@ static void abort_codec_experimental(AVCodec *c, int encoder)
     exit_program(1);
 }
 
-static void write_packet(AVFormatContext *s, AVPacket *pkt, OutputStream *ost)
+static void write_packet(OutputFile *of, AVPacket *pkt, OutputStream *ost)
 {
+    AVFormatContext *s = of->ctx;
     AVStream *st = ost->st;
     int ret;
 
+    if (!of->header_written) {
+        AVPacket tmp_pkt;
+        /* the muxer is not initialized yet, buffer the packet */
+        if (!av_fifo_space(ost->muxing_queue)) {
+            int new_size = FFMIN(2 * av_fifo_size(ost->muxing_queue),
+                                 ost->max_muxing_queue_size);
+            if (new_size <= av_fifo_size(ost->muxing_queue)) {
+                av_log(NULL, AV_LOG_ERROR,
+                       "Too many packets buffered for output stream %d:%d.\n",
+                       ost->file_index, ost->st->index);
+                exit_program(1);
+            }
+            ret = av_fifo_realloc2(ost->muxing_queue, new_size);
+            if (ret < 0)
+                exit_program(1);
+        }
+        av_packet_move_ref(&tmp_pkt, pkt);
+        av_fifo_generic_write(ost->muxing_queue, &tmp_pkt, sizeof(tmp_pkt), NULL);
+        return;
+    }
+
     /*
      * Audio encoders may split the packets --  #frames in != #packets out.
      * But there is no reordering, so we can limit the number of output packets
@@ -315,7 +344,7 @@ static void write_packet(AVFormatContext *s, AVPacket *pkt, OutputStream *ost)
     }
 }
 
-static void output_packet(AVFormatContext *s, AVPacket *pkt, OutputStream *ost)
+static void output_packet(OutputFile *of, AVPacket *pkt, OutputStream *ost)
 {
     int ret = 0;
 
@@ -345,10 +374,10 @@ static void output_packet(AVFormatContext *s, AVPacket *pkt, OutputStream *ost)
                     goto finish;
                 idx++;
             } else
-                write_packet(s, pkt, ost);
+                write_packet(of, pkt, ost);
         }
     } else
-        write_packet(s, pkt, ost);
+        write_packet(of, pkt, ost);
 
 finish:
     if (ret < 0 && ret != AVERROR_EOF) {
@@ -371,7 +400,7 @@ static int check_recording_time(OutputStream *ost)
     return 1;
 }
 
-static void do_audio_out(AVFormatContext *s, OutputStream *ost,
+static void do_audio_out(OutputFile *of, OutputStream *ost,
                          AVFrame *frame)
 {
     AVCodecContext *enc = ost->enc_ctx;
@@ -401,7 +430,7 @@ static void do_audio_out(AVFormatContext *s, OutputStream *ost,
             goto error;
 
         av_packet_rescale_ts(&pkt, enc->time_base, ost->st->time_base);
-        output_packet(s, &pkt, ost);
+        output_packet(of, &pkt, ost);
     }
 
     return;
@@ -410,7 +439,7 @@ error:
     exit_program(1);
 }
 
-static void do_subtitle_out(AVFormatContext *s,
+static void do_subtitle_out(OutputFile *of,
                             OutputStream *ost,
                             InputStream *ist,
                             AVSubtitle *sub,
@@ -475,11 +504,11 @@ static void do_subtitle_out(AVFormatContext *s,
             else
                 pkt.pts += 90 * sub->end_display_time;
         }
-        output_packet(s, &pkt, ost);
+        output_packet(of, &pkt, ost);
     }
 }
 
-static void do_video_out(AVFormatContext *s,
+static void do_video_out(OutputFile *of,
                          OutputStream *ost,
                          AVFrame *in_picture,
                          int *frame_size)
@@ -492,8 +521,8 @@ static void do_video_out(AVFormatContext *s,
 
     format_video_sync = video_sync_method;
     if (format_video_sync == VSYNC_AUTO)
-        format_video_sync = (s->oformat->flags & AVFMT_NOTIMESTAMPS) ? VSYNC_PASSTHROUGH :
-                            (s->oformat->flags & AVFMT_VARIABLE_FPS) ? VSYNC_VFR : VSYNC_CFR;
+        format_video_sync = (of->ctx->oformat->flags & AVFMT_NOTIMESTAMPS) ? VSYNC_PASSTHROUGH :
+                            (of->ctx->oformat->flags & AVFMT_VARIABLE_FPS) ? VSYNC_VFR : VSYNC_CFR;
     if (format_video_sync != VSYNC_PASSTHROUGH &&
         ost->frame_number &&
         in_picture->pts != AV_NOPTS_VALUE &&
@@ -552,7 +581,7 @@ static void do_video_out(AVFormatContext *s,
             goto error;
 
         av_packet_rescale_ts(&pkt, enc->time_base, ost->st->time_base);
-        output_packet(s, &pkt, ost);
+        output_packet(of, &pkt, ost);
         *frame_size = pkt.size;
 
         /* if two pass, output log */
@@ -662,12 +691,12 @@ static int poll_filter(OutputStream *ost)
         if (!ost->frame_aspect_ratio)
             ost->enc_ctx->sample_aspect_ratio = filtered_frame->sample_aspect_ratio;
 
-        do_video_out(of->ctx, ost, filtered_frame, &frame_size);
+        do_video_out(of, ost, filtered_frame, &frame_size);
         if (vstats_filename && frame_size)
             do_video_stats(ost, frame_size);
         break;
     case AVMEDIA_TYPE_AUDIO:
-        do_audio_out(of->ctx, ost, filtered_frame);
+        do_audio_out(of, ost, filtered_frame);
         break;
     default:
         // TODO support subtitle filters
@@ -975,7 +1004,7 @@ static void flush_encoders(void)
     for (i = 0; i < nb_output_streams; i++) {
         OutputStream   *ost = output_streams[i];
         AVCodecContext *enc = ost->enc_ctx;
-        AVFormatContext *os = output_files[ost->file_index]->ctx;
+        OutputFile      *of = output_files[ost->file_index];
         int stop_encoding = 0;
 
         if (!ost->encoding_needed)
@@ -1022,7 +1051,7 @@ static void flush_encoders(void)
                     break;
                 }
                 av_packet_rescale_ts(&pkt, enc->time_base, ost->st->time_base);
-                output_packet(os, &pkt, ost);
+                output_packet(of, &pkt, ost);
             }
 
             if (stop_encoding)
@@ -1115,7 +1144,7 @@ static void do_streamcopy(InputStream *ist, OutputStream *ost, const AVPacket *p
         opkt.size = pkt->size;
     }
 
-    output_packet(of->ctx, &opkt, ost);
+    output_packet(of, &opkt, ost);
 }
 
 // This does not quite work like avcodec_decode_audio4/avcodec_decode_video2.
@@ -1353,7 +1382,7 @@ static int transcode_subtitles(InputStream *ist, AVPacket *pkt, int *got_output)
         if (!check_output_constraints(ist, ost) || !ost->encoding_needed)
             continue;
 
-        do_subtitle_out(output_files[ost->file_index]->ctx, ost, ist, &subtitle, pkt->pts);
+        do_subtitle_out(output_files[ost->file_index], ost, ist, &subtitle, pkt->pts);
     }
 
     avsubtitle_free(&subtitle);
@@ -1657,6 +1686,17 @@ static int check_init_output_file(OutputFile *of, int file_index)
     if (want_sdp)
         print_sdp();
 
+    /* flush the muxing queues */
+    for (i = 0; i < of->ctx->nb_streams; i++) {
+        OutputStream *ost = output_streams[of->ost_index + i];
+
+        while (av_fifo_size(ost->muxing_queue)) {
+            AVPacket pkt;
+            av_fifo_generic_read(ost->muxing_queue, &pkt, sizeof(pkt), NULL);
+            write_packet(of, &pkt, ost);
+        }
+    }
+
     return 0;
 }
 
diff --git a/avconv.h b/avconv.h
index bc3b6c3948..60729c3491 100644
--- a/avconv.h
+++ b/avconv.h
@@ -190,6 +190,8 @@ typedef struct OptionsContext {
     int        nb_pass;
     SpecifierOpt *passlogfiles;
     int        nb_passlogfiles;
+    SpecifierOpt *max_muxing_queue_size;
+    int        nb_max_muxing_queue_size;
 } OptionsContext;
 
 typedef struct InputFilter {
@@ -391,6 +393,11 @@ typedef struct OutputStream {
 
     /* packet quality factor */
     int quality;
+
+    int max_muxing_queue_size;
+
+    /* the packets are buffered here until the muxer is ready to be initialized */
+    AVFifoBuffer *muxing_queue;
 } OutputStream;
 
 typedef struct OutputFile {
diff --git a/avconv_opt.c b/avconv_opt.c
index 2a4f71a6ff..a1729c30e5 100644
--- a/avconv_opt.c
+++ b/avconv_opt.c
@@ -1073,6 +1073,10 @@ static OutputStream *new_output_stream(OptionsContext *o, AVFormatContext *oc, e
         ost->enc_ctx->global_quality = FF_QP2LAMBDA * qscale;
     }
 
+    ost->max_muxing_queue_size = 128;
+    MATCH_PER_STREAM_OPT(max_muxing_queue_size, i, ost->max_muxing_queue_size, oc, st);
+    ost->max_muxing_queue_size *= sizeof(AVPacket);
+
     if (oc->oformat->flags & AVFMT_GLOBALHEADER)
         ost->enc_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
 
@@ -1083,6 +1087,10 @@ static OutputStream *new_output_stream(OptionsContext *o, AVFormatContext *oc, e
     ost->pix_fmts[0] = ost->pix_fmts[1] = AV_PIX_FMT_NONE;
     ost->last_mux_dts = AV_NOPTS_VALUE;
 
+    ost->muxing_queue = av_fifo_alloc(8 * sizeof(AVPacket));
+    if (!ost->muxing_queue)
+        exit_program(1);
+
     return ost;
 }
 
@@ -2648,6 +2656,9 @@ const OptionDef options[] = {
     { "bsf", HAS_ARG | OPT_STRING | OPT_SPEC | OPT_EXPERT | OPT_OUTPUT, { .off = OFFSET(bitstream_filters) },
         "A comma-separated list of bitstream filters", "bitstream_filters" },
 
+    { "max_muxing_queue_size", HAS_ARG | OPT_INT | OPT_SPEC | OPT_EXPERT | OPT_OUTPUT, { .off = OFFSET(max_muxing_queue_size) },
+        "maximum number of packets that can be buffered while waiting for all streams to initialize", "packets" },
+
     /* data codec support */
     { "dcodec", HAS_ARG | OPT_DATA | OPT_PERFILE | OPT_EXPERT | OPT_INPUT | OPT_OUTPUT, { .func_arg = opt_data_codec },
         "force data codec ('copy' to copy stream)", "codec" },
diff --git a/doc/avconv.texi b/doc/avconv.texi
index cdfef55478..9f2f295bd0 100644
--- a/doc/avconv.texi
+++ b/doc/avconv.texi
@@ -956,6 +956,15 @@ This option enables or disables accurate seeking in input files with the
 transcoding. Use @option{-noaccurate_seek} to disable it, which may be useful
 e.g. when copying some streams and transcoding the others.
 
+@item -max_muxing_queue_size @var{packets} (@emph{output,per-stream})
+When transcoding audio and/or video streams, avconv will not begin writing into
+the output until it has one packet for each such stream. While waiting for that
+to happen, packets for other streams are buffered. This option sets the size of
+this buffer, in packets, for the matching output stream.
+
+The default value of this option should be high enough for most uses, so only
+touch this option if you are sure that you need it.
+
 @end table
 @c man end OPTIONS