diff --git a/Changelog b/Changelog index 70c9054f92..8b59ac8ff4 100644 --- a/Changelog +++ b/Changelog @@ -145,7 +145,7 @@ easier to use. The changes are: - pan audio filter - IFF Amiga Continuous Bitmap (ACBM) decoder - ass filter -- CRI ADX audio format demuxer +- CRI ADX audio format muxer and demuxer - Playstation Portable PMP format demuxer - Microsoft Windows ICO demuxer - life source diff --git a/avconv.c b/avconv.c index a8f3d70339..e02139db71 100644 --- a/avconv.c +++ b/avconv.c @@ -88,6 +88,11 @@ #include "libavutil/avassert.h" +#define VSYNC_AUTO -1 +#define VSYNC_PASSTHROUGH 0 +#define VSYNC_CFR 1 +#define VSYNC_VFR 2 + const char program_name[] = "avconv"; const int program_birth_year = 2000; @@ -127,7 +132,7 @@ static int do_hex_dump = 0; static int do_pkt_dump = 0; static int do_pass = 0; static const char *pass_logfilename_prefix; -static int video_sync_method = -1; +static int video_sync_method = VSYNC_AUTO; static int audio_sync_method = 0; static float audio_drift_threshold = 0.1; static int copy_ts = 0; @@ -1390,16 +1395,16 @@ static void do_video_out(AVFormatContext *s, *frame_size = 0; format_video_sync = video_sync_method; - if (format_video_sync < 0) - format_video_sync = (s->oformat->flags & AVFMT_NOTIMESTAMPS) ? 0 : - (s->oformat->flags & AVFMT_VARIABLE_FPS) ? 2 : 1; + if (format_video_sync == VSYNC_AUTO) + format_video_sync = (s->oformat->flags & AVFMT_NOTIMESTAMPS) ? VSYNC_PASSTHROUGH : + (s->oformat->flags & AVFMT_VARIABLE_FPS) ? VSYNC_VFR : VSYNC_CFR; - if (format_video_sync) { + if (format_video_sync != VSYNC_PASSTHROUGH) { double vdelta = sync_ipts - ost->sync_opts; // FIXME set to 0.5 after we fix some dts/pts bugs like in avidec.c if (vdelta < -1.1) nb_frames = 0; - else if (format_video_sync == 2) { + else if (format_video_sync == VSYNC_VFR) { if (vdelta <= -0.6) { nb_frames = 0; } else if (vdelta > 0.6) @@ -4426,6 +4431,17 @@ static int opt_video_filters(OptionsContext *o, const char *opt, const char *arg return parse_option(o, "filter:v", arg, options); } +static int opt_vsync(const char *opt, const char *arg) +{ + if (!av_strcasecmp(arg, "cfr")) video_sync_method = VSYNC_CFR; + else if (!av_strcasecmp(arg, "vfr")) video_sync_method = VSYNC_VFR; + else if (!av_strcasecmp(arg, "passthrough")) video_sync_method = VSYNC_PASSTHROUGH; + + if (video_sync_method == VSYNC_AUTO) + video_sync_method = parse_number_or_die("vsync", arg, OPT_INT, VSYNC_AUTO, VSYNC_VFR); + return 0; +} + #define OFFSET(x) offsetof(OptionsContext, x) static const OptionDef options[] = { /* main options */ @@ -4457,7 +4473,7 @@ static const OptionDef options[] = { "when dumping packets, also dump the payload" }, { "re", OPT_BOOL | OPT_EXPERT | OPT_OFFSET, {.off = OFFSET(rate_emu)}, "read input at native frame rate", "" }, { "target", HAS_ARG | OPT_FUNC2, {(void*)opt_target}, "specify target file type (\"vcd\", \"svcd\", \"dvd\", \"dv\", \"dv50\", \"pal-vcd\", \"ntsc-svcd\", ...)", "type" }, - { "vsync", HAS_ARG | OPT_INT | OPT_EXPERT, {(void*)&video_sync_method}, "video sync method", "" }, + { "vsync", HAS_ARG | OPT_EXPERT, {(void*)opt_vsync}, "video sync method", "" }, { "async", HAS_ARG | OPT_INT | OPT_EXPERT, {(void*)&audio_sync_method}, "audio sync method", "" }, { "adrift_threshold", HAS_ARG | OPT_FLOAT | OPT_EXPERT, {(void*)&audio_drift_threshold}, "audio drift threshold", "threshold" }, { "copyts", OPT_BOOL | OPT_EXPERT, {(void*)©_ts}, "copy timestamps" }, diff --git a/configure b/configure index cd5fe36fa9..43b63bb6fa 100755 --- a/configure +++ b/configure @@ -1226,6 +1226,7 @@ 
HAVE_LIST=" struct_sockaddr_in6 struct_sockaddr_sa_len struct_sockaddr_storage + struct_v4l2_frmivalenum_discrete symver symver_asm_label symver_gnu_asm @@ -3174,6 +3175,8 @@ makeinfo --version > /dev/null 2>&1 && enable makeinfo || disable makeinfo check_header linux/fb.h check_header linux/videodev.h check_header linux/videodev2.h +check_struct linux/videodev2.h "struct v4l2_frmivalenum" discrete + check_header sys/videoio.h check_func_headers "windows.h vfw.h" capCreateCaptureWindow "$vfwcap_indev_extralibs" diff --git a/doc/APIchanges b/doc/APIchanges index 9fa0e07640..b146cb00ad 100644 --- a/doc/APIchanges +++ b/doc/APIchanges @@ -174,6 +174,10 @@ API changes, most recent first: 2011-08-14 - 323b930 - lavu 51.12.0 Add av_fifo_peek2(), deprecate av_fifo_peek(). +2011-08-26 - lavu 51.9.0 + - add41de..abc78a5 Do not include intfloat_readwrite.h, + mathematics.h, rational.h, pixfmt.h, or log.h from avutil.h. + 2011-08-16 - 48f9e45 - lavf 53.8.0 Add avformat_query_codec(). diff --git a/doc/avconv.texi b/doc/avconv.texi index e6ebe71d9d..85c1457621 100644 --- a/doc/avconv.texi +++ b/doc/avconv.texi @@ -749,15 +749,15 @@ Thread count. Video sync method. @table @option -@item 0 +@item passthrough Each frame is passed with its timestamp from the demuxer to the muxer. -@item 1 +@item cfr Frames will be duplicated and dropped to achieve exactly the requested constant framerate. -@item 2 +@item vfr Frames are passed through with their timestamp or dropped so as to prevent 2 frames from having the same timestamp. -@item -1 +@item auto Chooses between 1 and 2 depending on muxer capabilities. This is the default method. @end table diff --git a/doc/general.texi b/doc/general.texi index 1a9e6cbe0b..2f18e4c126 100644 --- a/doc/general.texi +++ b/doc/general.texi @@ -134,7 +134,7 @@ library: @item Brute Force & Ignorance @tab @tab X @tab Used in the game Flash Traffic: City of Angels. @item BWF @tab X @tab X -@item CRI ADX @tab @tab X +@item CRI ADX @tab X @tab X @tab Audio-only format used in console video games. @item Discworld II BMV @tab @tab X @item Interplay C93 @tab @tab X diff --git a/doc/indevs.texi b/doc/indevs.texi index 038d253da0..3b2c86290c 100644 --- a/doc/indevs.texi +++ b/doc/indevs.texi @@ -515,9 +515,9 @@ kind @file{/dev/video@var{N}}, where @var{N} is a number associated to the device. Video4Linux and Video4Linux2 devices only support a limited set of -@var{width}x@var{height} sizes and frame rates. You can check which are +@var{width}x@var{height} sizes and framerates. You can check which are supported for example with the command @command{dov4l} for Video4Linux -devices and the command @command{v4l-info} for Video4Linux2 devices. +devices and using @command{-list_formats all} for Video4Linux2 devices. If the size for the device is set to 0x0, the input device will try to auto-detect the size to use. 
diff --git a/ffmpeg.c b/ffmpeg.c index 2174b93640..e23ea6505c 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -93,6 +93,11 @@ #include "libavutil/avassert.h" +#define VSYNC_AUTO -1 +#define VSYNC_PASSTHROUGH 0 +#define VSYNC_CFR 1 +#define VSYNC_VFR 2 + const char program_name[] = "ffmpeg"; const int program_birth_year = 2000; @@ -144,7 +149,7 @@ static int do_pkt_dump = 0; static int do_psnr = 0; static int do_pass = 0; static const char *pass_logfilename_prefix; -static int video_sync_method = -1; +static int video_sync_method = VSYNC_AUTO; static int audio_sync_method = 0; static float audio_drift_threshold = 0.1; static int copy_ts = 0; @@ -1433,15 +1438,15 @@ static void do_video_out(AVFormatContext *s, *frame_size = 0; format_video_sync = video_sync_method; - if (format_video_sync < 0) - format_video_sync = (s->oformat->flags & AVFMT_VARIABLE_FPS) ? ((s->oformat->flags & AVFMT_NOTIMESTAMPS) ? 0 : 2) : 1; + if (format_video_sync == VSYNC_AUTO) + format_video_sync = (s->oformat->flags & AVFMT_VARIABLE_FPS) ? ((s->oformat->flags & AVFMT_NOTIMESTAMPS) ? VSYNC_PASSTHROUGH : VSYNC_VFR) : 1; - if (format_video_sync) { + if (format_video_sync != VSYNC_PASSTHROUGH) { double vdelta = sync_ipts - ost->sync_opts + duration; // FIXME set to 0.5 after we fix some dts/pts bugs like in avidec.c if (vdelta < -1.1) nb_frames = 0; - else if (format_video_sync == 2) { + else if (format_video_sync == VSYNC_VFR) { if (vdelta <= -0.6) { nb_frames = 0; } else if (vdelta > 0.6) @@ -4873,6 +4878,17 @@ static int opt_video_filters(OptionsContext *o, const char *opt, const char *arg return parse_option(o, "filter:v", arg, options); } +static int opt_vsync(const char *opt, const char *arg) +{ + if (!av_strcasecmp(arg, "cfr")) video_sync_method = VSYNC_CFR; + else if (!av_strcasecmp(arg, "vfr")) video_sync_method = VSYNC_VFR; + else if (!av_strcasecmp(arg, "passthrough")) video_sync_method = VSYNC_PASSTHROUGH; + + if (video_sync_method == VSYNC_AUTO) + video_sync_method = parse_number_or_die("vsync", arg, OPT_INT, VSYNC_AUTO, VSYNC_VFR); + return 0; +} + #define OFFSET(x) offsetof(OptionsContext, x) static const OptionDef options[] = { /* main options */ @@ -4908,7 +4924,7 @@ static const OptionDef options[] = { { "loop_input", OPT_BOOL | OPT_EXPERT, {(void*)&loop_input}, "deprecated, use -loop" }, { "loop_output", HAS_ARG | OPT_INT | OPT_EXPERT, {(void*)&loop_output}, "deprecated, use -loop", "" }, { "target", HAS_ARG | OPT_FUNC2, {(void*)opt_target}, "specify target file type (\"vcd\", \"svcd\", \"dvd\", \"dv\", \"dv50\", \"pal-vcd\", \"ntsc-svcd\", ...)", "type" }, - { "vsync", HAS_ARG | OPT_INT | OPT_EXPERT, {(void*)&video_sync_method}, "video sync method", "" }, + { "vsync", HAS_ARG | OPT_EXPERT, {(void*)opt_vsync}, "video sync method", "" }, { "async", HAS_ARG | OPT_INT | OPT_EXPERT, {(void*)&audio_sync_method}, "audio sync method", "" }, { "adrift_threshold", HAS_ARG | OPT_FLOAT | OPT_EXPERT, {(void*)&audio_drift_threshold}, "audio drift threshold", "threshold" }, { "copyts", OPT_BOOL | OPT_EXPERT, {(void*)©_ts}, "copy timestamps" }, diff --git a/libavcodec/adxdec.c b/libavcodec/adxdec.c index fdff6875e1..ec4b1041af 100644 --- a/libavcodec/adxdec.c +++ b/libavcodec/adxdec.c @@ -165,6 +165,13 @@ static int adx_decode_frame(AVCodecContext *avctx, void *data, return buf - avpkt->data; } +static void adx_decode_flush(AVCodecContext *avctx) +{ + ADXContext *c = avctx->priv_data; + memset(c->prev, 0, sizeof(c->prev)); + c->eof = 0; +} + AVCodec ff_adpcm_adx_decoder = { .name = "adpcm_adx", .type = AVMEDIA_TYPE_AUDIO, @@ 
-172,6 +179,7 @@ AVCodec ff_adpcm_adx_decoder = { .priv_data_size = sizeof(ADXContext), .init = adx_decode_init, .decode = adx_decode_frame, + .flush = adx_decode_flush, .capabilities = CODEC_CAP_DR1, .long_name = NULL_IF_CONFIG_SMALL("SEGA CRI ADX ADPCM"), }; diff --git a/libavcodec/adxenc.c b/libavcodec/adxenc.c index 51545fa06c..b029e2eb78 100644 --- a/libavcodec/adxenc.c +++ b/libavcodec/adxenc.c @@ -19,9 +19,9 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "libavutil/intreadwrite.h" #include "avcodec.h" #include "adx.h" +#include "bytestream.h" #include "put_bits.h" /** @@ -33,167 +33,135 @@ * adx2wav & wav2adx http://www.geocities.co.jp/Playtown/2004/ */ -/* 18 bytes <-> 32 samples */ - -static void adx_encode(ADXContext *c, unsigned char *adx, const short *wav, - ADXChannelState *prev) +static void adx_encode(ADXContext *c, uint8_t *adx, const int16_t *wav, + ADXChannelState *prev, int channels) { PutBitContext pb; int scale; - int i; - int s0,s1,s2,d; - int max=0; - int min=0; - int data[32]; + int i, j; + int s0, s1, s2, d; + int max = 0; + int min = 0; + int data[BLOCK_SAMPLES]; s1 = prev->s1; s2 = prev->s2; - for(i=0;i<32;i++) { + for (i = 0, j = 0; j < 32; i += channels, j++) { s0 = wav[i]; d = ((s0 << COEFF_BITS) - c->coeff[0] * s1 - c->coeff[1] * s2) >> COEFF_BITS; - data[i]=d; - if (maxd) min=d; + data[j] = d; + if (max < d) + max = d; + if (min > d) + min = d; s2 = s1; s1 = s0; } prev->s1 = s1; prev->s2 = s2; - /* -8..+7 */ - - if (max==0 && min==0) { - memset(adx,0,18); + if (max == 0 && min == 0) { + memset(adx, 0, BLOCK_SIZE); return; } - if (max/7>-min/8) scale = max/7; - else scale = -min/8; + if (max / 7 > -min / 8) + scale = max / 7; + else + scale = -min / 8; - if (scale==0) scale=1; + if (scale == 0) + scale = 1; AV_WB16(adx, scale); init_put_bits(&pb, adx + 2, 16); - for (i = 0; i < 32; i++) - put_sbits(&pb, 4, av_clip(data[i]/scale, -8, 7)); + for (i = 0; i < BLOCK_SAMPLES; i++) + put_sbits(&pb, 4, av_clip(data[i] / scale, -8, 7)); flush_put_bits(&pb); } -static int adx_encode_header(AVCodecContext *avctx,unsigned char *buf,size_t bufsize) -{ -#if 0 - struct { - uint32_t offset; /* 0x80000000 + sample start - 4 */ - unsigned char unknown1[3]; /* 03 12 04 */ - unsigned char channel; /* 1 or 2 */ - uint32_t freq; - uint32_t size; - uint32_t unknown2; /* 01 f4 03 00 */ - uint32_t unknown3; /* 00 00 00 00 */ - uint32_t unknown4; /* 00 00 00 00 */ +#define HEADER_SIZE 36 - /* if loop - unknown3 00 15 00 01 - unknown4 00 00 00 01 - long loop_start_sample; - long loop_start_byte; - long loop_end_sample; - long loop_end_byte; - long - */ - } adxhdr; /* big endian */ - /* offset-6 "(c)CRI" */ -#endif +static int adx_encode_header(AVCodecContext *avctx, uint8_t *buf, int bufsize) +{ ADXContext *c = avctx->priv_data; - AV_WB32(buf+0x00,0x80000000|0x20); - AV_WB32(buf+0x04,0x03120400|avctx->channels); - AV_WB32(buf+0x08,avctx->sample_rate); - AV_WB32(buf+0x0c,0); /* FIXME: set after */ - AV_WB16(buf + 0x10, c->cutoff); - AV_WB32(buf + 0x12, 0x03000000); - AV_WB32(buf + 0x16, 0x00000000); - AV_WB32(buf + 0x1a, 0x00000000); - memcpy (buf + 0x1e, "(c)CRI", 6); - return 0x20+4; + if (bufsize < HEADER_SIZE) + return AVERROR(EINVAL); + + bytestream_put_be16(&buf, 0x8000); /* header signature */ + bytestream_put_be16(&buf, HEADER_SIZE - 4); /* copyright offset */ + bytestream_put_byte(&buf, 3); /* encoding */ + bytestream_put_byte(&buf, BLOCK_SIZE); /* block size */ + bytestream_put_byte(&buf, 4); /* sample size */ + 
bytestream_put_byte(&buf, avctx->channels); /* channels */ + bytestream_put_be32(&buf, avctx->sample_rate); /* sample rate */ + bytestream_put_be32(&buf, 0); /* total sample count */ + bytestream_put_be16(&buf, c->cutoff); /* cutoff frequency */ + bytestream_put_byte(&buf, 3); /* version */ + bytestream_put_byte(&buf, 0); /* flags */ + bytestream_put_be32(&buf, 0); /* unknown */ + bytestream_put_be32(&buf, 0); /* loop enabled */ + bytestream_put_be16(&buf, 0); /* padding */ + bytestream_put_buffer(&buf, "(c)CRI", 6); /* copyright signature */ + + return HEADER_SIZE; } static av_cold int adx_encode_init(AVCodecContext *avctx) { ADXContext *c = avctx->priv_data; - if (avctx->channels > 2) - return -1; /* only stereo or mono =) */ - avctx->frame_size = 32; + if (avctx->channels > 2) { + av_log(avctx, AV_LOG_ERROR, "Invalid number of channels\n"); + return AVERROR(EINVAL); + } + avctx->frame_size = BLOCK_SAMPLES; - avctx->coded_frame= avcodec_alloc_frame(); - avctx->coded_frame->key_frame= 1; - -// avctx->bit_rate = avctx->sample_rate*avctx->channels*18*8/32; + avctx->coded_frame = avcodec_alloc_frame(); /* the cutoff can be adjusted, but this seems to work pretty well */ c->cutoff = 500; ff_adx_calculate_coeffs(c->cutoff, avctx->sample_rate, COEFF_BITS, c->coeff); - av_log(avctx, AV_LOG_DEBUG, "adx encode init\n"); - return 0; } static av_cold int adx_encode_close(AVCodecContext *avctx) { av_freep(&avctx->coded_frame); - return 0; } -static int adx_encode_frame(AVCodecContext *avctx, - uint8_t *frame, int buf_size, void *data) +static int adx_encode_frame(AVCodecContext *avctx, uint8_t *frame, + int buf_size, void *data) { - ADXContext *c = avctx->priv_data; - const short *samples = data; - unsigned char *dst = frame; - int rest = avctx->frame_size; + ADXContext *c = avctx->priv_data; + const int16_t *samples = data; + uint8_t *dst = frame; + int ch; -/* - input data size = - ffmpeg.c: do_audio_out() - frame_bytes = enc->frame_size * 2 * enc->channels; -*/ - -// printf("sz=%d ",buf_size); fflush(stdout); if (!c->header_parsed) { - int hdrsize = adx_encode_header(avctx,dst,buf_size); - dst+=hdrsize; + int hdrsize; + if ((hdrsize = adx_encode_header(avctx, dst, buf_size)) < 0) { + av_log(avctx, AV_LOG_ERROR, "output buffer is too small\n"); + return AVERROR(EINVAL); + } + dst += hdrsize; + buf_size -= hdrsize; c->header_parsed = 1; } - - if (avctx->channels==1) { - while(rest>=32) { - adx_encode(c, dst, samples, c->prev); - dst+=18; - samples+=32; - rest-=32; - } - } else { - while(rest>=32*2) { - short tmpbuf[32*2]; - int i; - - for(i=0;i<32;i++) { - tmpbuf[i] = samples[i*2]; - tmpbuf[i+32] = samples[i*2+1]; - } - - adx_encode(c, dst, tmpbuf, c->prev); - adx_encode(c, dst + 18, tmpbuf + 32, c->prev + 1); - dst+=18*2; - samples+=32*2; - rest-=32*2; - } + if (buf_size < BLOCK_SIZE * avctx->channels) { + av_log(avctx, AV_LOG_ERROR, "output buffer is too small\n"); + return AVERROR(EINVAL); } - return dst-frame; + + for (ch = 0; ch < avctx->channels; ch++) { + adx_encode(c, dst, samples + ch, &c->prev[ch], avctx->channels); + dst += BLOCK_SIZE; + } + return dst - frame; } AVCodec ff_adpcm_adx_encoder = { @@ -204,6 +172,7 @@ AVCodec ff_adpcm_adx_encoder = { .init = adx_encode_init, .encode = adx_encode_frame, .close = adx_encode_close, - .sample_fmts = (const enum AVSampleFormat[]){AV_SAMPLE_FMT_S16,AV_SAMPLE_FMT_NONE}, - .long_name = NULL_IF_CONFIG_SMALL("SEGA CRI ADX ADPCM"), + .sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16, + AV_SAMPLE_FMT_NONE }, + .long_name = 
NULL_IF_CONFIG_SMALL("SEGA CRI ADX ADPCM"), }; diff --git a/libavcodec/arm/rv34dsp_init_neon.c b/libavcodec/arm/rv34dsp_init_neon.c index acf2a7dcd3..9a09fde7a9 100644 --- a/libavcodec/arm/rv34dsp_init_neon.c +++ b/libavcodec/arm/rv34dsp_init_neon.c @@ -25,12 +25,9 @@ void ff_rv34_inv_transform_neon(DCTELEM *block); void ff_rv34_inv_transform_noround_neon(DCTELEM *block); -void ff_rv34_dequant4x4_neon(DCTELEM *block, int Qdc, int Q); void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp) { c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon; c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon; - - c->rv34_dequant4x4 = ff_rv34_dequant4x4_neon; } diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S index 423b537fb9..f700f5c321 100644 --- a/libavcodec/arm/rv34dsp_neon.S +++ b/libavcodec/arm/rv34dsp_neon.S @@ -107,27 +107,3 @@ function ff_rv34_inv_transform_noround_neon, export=1 vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1 bx lr endfunc - -function ff_rv34_dequant4x4_neon, export=1 - mov r3, r0 - mov r12, #16 - vdup.16 q0, r2 - vmov.16 d0[0], r1 - vld1.16 {d2}, [r0,:64], r12 - vld1.16 {d4}, [r0,:64], r12 - vld1.16 {d6}, [r0,:64], r12 - vld1.16 {d16}, [r0,:64], r12 - vmull.s16 q1, d2, d0 - vmull.s16 q2, d4, d1 - vmull.s16 q3, d6, d1 - vmull.s16 q8, d16, d1 - vqrshrn.s32 d2, q1, #4 - vqrshrn.s32 d4, q2, #4 - vqrshrn.s32 d6, q3, #4 - vqrshrn.s32 d16, q8, #4 - vst1.16 {d2}, [r3,:64], r12 - vst1.16 {d4}, [r3,:64], r12 - vst1.16 {d6}, [r3,:64], r12 - vst1.16 {d16}, [r3,:64], r12 - bx lr -endfunc diff --git a/libavcodec/bytestream.h b/libavcodec/bytestream.h index 7ca36f8ad3..73ea0c880f 100644 --- a/libavcodec/bytestream.h +++ b/libavcodec/bytestream.h @@ -39,11 +39,15 @@ static av_always_inline void bytestream_put_ ##name(uint8_t **b, const type valu write(*b, value);\ (*b) += bytes;\ }\ +static av_always_inline type bytestream2_get_ ## name ## u(GetByteContext *g)\ +{\ + return bytestream_get_ ## name(&g->buffer);\ +}\ static av_always_inline type bytestream2_get_ ## name(GetByteContext *g)\ {\ if (g->buffer_end - g->buffer < bytes)\ return 0;\ - return bytestream_get_ ## name(&g->buffer);\ + return bytestream2_get_ ## name ## u(g);\ }\ static av_always_inline type bytestream2_peek_ ## name(GetByteContext *g)\ {\ diff --git a/libavcodec/cabac.c b/libavcodec/cabac.c index e03043f91f..c603dafddd 100644 --- a/libavcodec/cabac.c +++ b/libavcodec/cabac.c @@ -109,10 +109,6 @@ void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size){ c->low= 0; c->range= 0x1FE; c->outstanding_count= 0; -#ifdef STRICT_LIMITS - c->sym_count =0; -#endif - c->pb.bit_left++; //avoids firstBitFlag } @@ -183,10 +179,6 @@ static void put_cabac(CABACContext *c, uint8_t * const state, int bit){ } renorm_cabac_encoder(c); - -#ifdef STRICT_LIMITS - c->symCount++; -#endif } /** @@ -208,10 +200,6 @@ static void put_cabac_bypass(CABACContext *c, int bit){ put_cabac_bit(c, 1); c->low -= 0x400; } - -#ifdef STRICT_LIMITS - c->symCount++; -#endif } /** @@ -236,10 +224,6 @@ static int put_cabac_terminate(CABACContext *c, int bit){ flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong } -#ifdef STRICT_LIMITS - c->symCount++; -#endif - return (put_bits_count(&c->pb)+7)>>3; } @@ -365,21 +349,6 @@ START_TIMER av_log(NULL, AV_LOG_ERROR, "CABAC failure at %d\n", i); STOP_TIMER("get_cabac") } -#if 0 - for(i=0; i>=1){ - v+= v + get_cabac_bypass(c); - } - i += v; - - if(is_signed && get_cabac_bypass(c)){ - return -i; - }else - return i; -} -#endif /* 0 */ - #endif /* 
AVCODEC_CABAC_H */ diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c index 7021a70650..e4c6274ae4 100644 --- a/libavcodec/dnxhdenc.c +++ b/libavcodec/dnxhdenc.c @@ -578,9 +578,8 @@ static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg, int jobnr, int for (i = 0; i < 8; i++) { DCTELEM *block = ctx->blocks[i]; - int last_index, overflow; - int n = dnxhd_switch_matrix(ctx, i); - last_index = ctx->m.dct_quantize(&ctx->m, block, 4&(2*i), qscale, &overflow); + int overflow, n = dnxhd_switch_matrix(ctx, i); + int last_index = ctx->m.dct_quantize(&ctx->m, block, 4&(2*i), qscale, &overflow); //START_TIMER; dnxhd_encode_block(ctx, block, last_index, n); //STOP_TIMER("encode_block"); diff --git a/libavcodec/h264.c b/libavcodec/h264.c index 8a79311f7c..8cd9fe72ef 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -4051,7 +4051,7 @@ static int decode_frame(AVCodecContext *avctx, H264Context *h = avctx->priv_data; MpegEncContext *s = &h->s; AVFrame *pict = data; - int buf_index; + int buf_index = 0; Picture *out; int i, out_idx; @@ -4081,7 +4081,7 @@ static int decode_frame(AVCodecContext *avctx, *pict= *(AVFrame*)out; } - return buf_size; + return buf_index; } if(h->is_avc && buf_size >= 9 && buf[0]==1 && buf[2]==0 && (buf[4]&0xFC)==0xFC && (buf[5]&0x1F) && buf[8]==0x67){ int cnt= buf[5]&0x1f; @@ -4112,7 +4112,6 @@ not_extra: if (!s->current_picture_ptr && h->nal_unit_type == NAL_END_SEQUENCE) { av_assert0(buf_index <= buf_size); - buf_size = buf_index; goto out; } @@ -4193,9 +4192,7 @@ int main(void){ init_get_bits(&gb, temp, 8*SIZE); for(i=0; iframe_type != FRAMETYPE_NULL) { for (p = 0; p < 3; p++) { @@ -775,7 +775,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, } } - //STOP_TIMER("decode_planes"); + //STOP_TIMER("decode_planes"); } if (ctx->frame.data[0]) avctx->release_buffer(avctx, &ctx->frame); diff --git a/libavcodec/libspeexenc.c b/libavcodec/libspeexenc.c index a3176d1388..c89f0748a1 100644 --- a/libavcodec/libspeexenc.c +++ b/libavcodec/libspeexenc.c @@ -83,7 +83,8 @@ typedef struct { int abr; ///< flag to enable ABR int pkt_frame_count; ///< frame count for the current packet int lookahead; ///< encoder delay - int sample_count; ///< total sample count (used for pts) + int64_t next_pts; ///< next pts, in sample_rate time base + int pkt_sample_count; ///< sample count in the current packet } LibSpeexEncContext; static av_cold void print_enc_params(AVCodecContext *avctx, @@ -201,7 +202,7 @@ static av_cold int encode_init(AVCodecContext *avctx) /* set encoding delay */ speex_encoder_ctl(s->enc_state, SPEEX_GET_LOOKAHEAD, &s->lookahead); - s->sample_count = -s->lookahead; + s->next_pts = -s->lookahead; /* create header packet bytes from header struct */ /* note: libspeex allocates the memory for header_data, which is freed @@ -235,7 +236,6 @@ static int encode_frame(AVCodecContext *avctx, uint8_t *frame, int buf_size, { LibSpeexEncContext *s = avctx->priv_data; int16_t *samples = data; - int sample_count = s->sample_count; if (data) { /* encode Speex frame */ @@ -243,7 +243,7 @@ static int encode_frame(AVCodecContext *avctx, uint8_t *frame, int buf_size, speex_encode_stereo_int(samples, s->header.frame_size, &s->bits); speex_encode_int(s->enc_state, samples, &s->bits); s->pkt_frame_count++; - s->sample_count += avctx->frame_size; + s->pkt_sample_count += avctx->frame_size; } else { /* handle end-of-stream */ if (!s->pkt_frame_count) @@ -259,8 +259,10 @@ static int encode_frame(AVCodecContext *avctx, uint8_t *frame, int buf_size, if 
(s->pkt_frame_count == s->frames_per_packet) { s->pkt_frame_count = 0; avctx->coded_frame->pts = - av_rescale_q(sample_count, (AVRational){ 1, avctx->sample_rate }, + av_rescale_q(s->next_pts, (AVRational){ 1, avctx->sample_rate }, avctx->time_base); + s->next_pts += s->pkt_sample_count; + s->pkt_sample_count = 0; if (buf_size > speex_bits_nbytes(&s->bits)) { int ret = speex_bits_write(&s->bits, frame, buf_size); speex_bits_reset(&s->bits); diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c index afe8c1f874..12a539e89a 100644 --- a/libavcodec/rv34.c +++ b/libavcodec/rv34.c @@ -212,7 +212,7 @@ static int rv34_decode_cbp(GetBitContext *gb, RV34VLC *vlc, int table) /** * Get one coefficient value from the bistream and store it. */ -static inline void decode_coeff(DCTELEM *dst, int coef, int esc, GetBitContext *gb, VLC* vlc) +static inline void decode_coeff(DCTELEM *dst, int coef, int esc, GetBitContext *gb, VLC* vlc, int q) { if(coef){ if(coef == esc){ @@ -225,14 +225,14 @@ static inline void decode_coeff(DCTELEM *dst, int coef, int esc, GetBitContext * } if(get_bits1(gb)) coef = -coef; - *dst = coef; + *dst = (coef*q + 8) >> 4; } } /** * Decode 2x2 subblock of coefficients. */ -static inline void decode_subblock(DCTELEM *dst, int code, const int is_block2, GetBitContext *gb, VLC *vlc) +static inline void decode_subblock(DCTELEM *dst, int code, const int is_block2, GetBitContext *gb, VLC *vlc, int q) { int coeffs[4]; @@ -240,15 +240,35 @@ static inline void decode_subblock(DCTELEM *dst, int code, const int is_block2, coeffs[1] = modulo_three_table[code][1]; coeffs[2] = modulo_three_table[code][2]; coeffs[3] = modulo_three_table[code][3]; - decode_coeff(dst , coeffs[0], 3, gb, vlc); + decode_coeff(dst , coeffs[0], 3, gb, vlc, q); if(is_block2){ - decode_coeff(dst+8, coeffs[1], 2, gb, vlc); - decode_coeff(dst+1, coeffs[2], 2, gb, vlc); + decode_coeff(dst+8, coeffs[1], 2, gb, vlc, q); + decode_coeff(dst+1, coeffs[2], 2, gb, vlc, q); }else{ - decode_coeff(dst+1, coeffs[1], 2, gb, vlc); - decode_coeff(dst+8, coeffs[2], 2, gb, vlc); + decode_coeff(dst+1, coeffs[1], 2, gb, vlc, q); + decode_coeff(dst+8, coeffs[2], 2, gb, vlc, q); } - decode_coeff(dst+9, coeffs[3], 2, gb, vlc); + decode_coeff(dst+9, coeffs[3], 2, gb, vlc, q); +} + +static inline void decode_subblock3(DCTELEM *dst, int code, const int is_block2, GetBitContext *gb, VLC *vlc, + int q_dc, int q_ac1, int q_ac2) +{ + int coeffs[4]; + + coeffs[0] = modulo_three_table[code][0]; + coeffs[1] = modulo_three_table[code][1]; + coeffs[2] = modulo_three_table[code][2]; + coeffs[3] = modulo_three_table[code][3]; + decode_coeff(dst , coeffs[0], 3, gb, vlc, q_dc); + if(is_block2){ + decode_coeff(dst+8, coeffs[1], 2, gb, vlc, q_ac1); + decode_coeff(dst+1, coeffs[2], 2, gb, vlc, q_ac1); + }else{ + decode_coeff(dst+1, coeffs[1], 2, gb, vlc, q_ac1); + decode_coeff(dst+8, coeffs[2], 2, gb, vlc, q_ac1); + } + decode_coeff(dst+9, coeffs[3], 2, gb, vlc, q_ac2); } /** @@ -262,7 +282,7 @@ static inline void decode_subblock(DCTELEM *dst, int code, const int is_block2, * o--o */ -static inline void rv34_decode_block(DCTELEM *dst, GetBitContext *gb, RV34VLC *rvlc, int fc, int sc) +static inline void rv34_decode_block(DCTELEM *dst, GetBitContext *gb, RV34VLC *rvlc, int fc, int sc, int q_dc, int q_ac1, int q_ac2) { int code, pattern; @@ -271,39 +291,23 @@ static inline void rv34_decode_block(DCTELEM *dst, GetBitContext *gb, RV34VLC *r pattern = code & 0x7; code >>= 3; - decode_subblock(dst, code, 0, gb, &rvlc->coefficient); + decode_subblock3(dst, code, 0, gb, 
&rvlc->coefficient, q_dc, q_ac1, q_ac2); if(pattern & 4){ code = get_vlc2(gb, rvlc->second_pattern[sc].table, 9, 2); - decode_subblock(dst + 2, code, 0, gb, &rvlc->coefficient); + decode_subblock(dst + 2, code, 0, gb, &rvlc->coefficient, q_ac2); } if(pattern & 2){ // Looks like coefficients 1 and 2 are swapped for this block code = get_vlc2(gb, rvlc->second_pattern[sc].table, 9, 2); - decode_subblock(dst + 8*2, code, 1, gb, &rvlc->coefficient); + decode_subblock(dst + 8*2, code, 1, gb, &rvlc->coefficient, q_ac2); } if(pattern & 1){ code = get_vlc2(gb, rvlc->third_pattern[sc].table, 9, 2); - decode_subblock(dst + 8*2+2, code, 0, gb, &rvlc->coefficient); + decode_subblock(dst + 8*2+2, code, 0, gb, &rvlc->coefficient, q_ac2); } } -/** - * Dequantize 4x4 block of DC values for 16x16 macroblock. - * @todo optimize - */ -static inline void rv34_dequant4x4_16x16(DCTELEM *block, int Qdc, int Q) -{ - int i; - - for(i = 0; i < 3; i++) - block[rv34_dezigzag[i]] = (block[rv34_dezigzag[i]] * Qdc + 8) >> 4; - for(; i < 16; i++) - block[rv34_dezigzag[i]] = (block[rv34_dezigzag[i]] * Q + 8) >> 4; -} -/** @} */ //block functions - - /** * @name RV30/40 bitstream parsing * @{ @@ -676,8 +680,9 @@ static inline void rv34_mc(RV34DecContext *r, const int block_type, srcY += src_y * s->linesize + src_x; srcU += uvsrc_y * s->uvlinesize + uvsrc_x; srcV += uvsrc_y * s->uvlinesize + uvsrc_x; - if( (unsigned)(src_x - !!lx*2) > s->h_edge_pos - !!lx*2 - (width <<3) - 4 - || (unsigned)(src_y - !!ly*2) > s->v_edge_pos - !!ly*2 - (height<<3) - 4){ + if(s->h_edge_pos - (width << 3) < 6 || s->v_edge_pos - (height << 3) < 6 || + (unsigned)(src_x - !!lx*2) > s->h_edge_pos - !!lx*2 - (width <<3) - 4 || + (unsigned)(src_y - !!ly*2) > s->v_edge_pos - !!ly*2 - (height<<3) - 4) { uint8_t *uvbuf = s->edge_emu_buffer + 22 * s->linesize; srcY -= 2 + 2*s->linesize; @@ -1097,6 +1102,7 @@ static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types) MpegEncContext *s = &r->s; GetBitContext *gb = &s->gb; int cbp, cbp2; + int q_dc, q_ac; int i, blknum, blkoff; LOCAL_ALIGNED_16(DCTELEM, block16, [64]); int luma_dc_quant; @@ -1133,31 +1139,34 @@ static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types) luma_dc_quant = r->block_type == RV34_MB_P_MIX16x16 ? 
r->luma_dc_quant_p[s->qscale] : r->luma_dc_quant_i[s->qscale]; if(r->is16){ + q_dc = rv34_qscale_tab[luma_dc_quant]; + q_ac = rv34_qscale_tab[s->qscale]; memset(block16, 0, 64 * sizeof(*block16)); - rv34_decode_block(block16, gb, r->cur_vlcs, 3, 0); - rv34_dequant4x4_16x16(block16, rv34_qscale_tab[luma_dc_quant],rv34_qscale_tab[s->qscale]); + rv34_decode_block(block16, gb, r->cur_vlcs, 3, 0, q_dc, q_dc, q_ac); r->rdsp.rv34_inv_transform_tab[1](block16); } + q_ac = rv34_qscale_tab[s->qscale]; for(i = 0; i < 16; i++, cbp >>= 1){ if(!r->is16 && !(cbp & 1)) continue; blknum = ((i & 2) >> 1) + ((i & 8) >> 2); blkoff = ((i & 1) << 2) + ((i & 4) << 3); if(cbp & 1) - rv34_decode_block(s->block[blknum] + blkoff, gb, r->cur_vlcs, r->luma_vlc, 0); - r->rdsp.rv34_dequant4x4(s->block[blknum] + blkoff, rv34_qscale_tab[s->qscale],rv34_qscale_tab[s->qscale]); + rv34_decode_block(s->block[blknum] + blkoff, gb, + r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac); if(r->is16) //FIXME: optimize s->block[blknum][blkoff] = block16[(i & 3) | ((i & 0xC) << 1)]; r->rdsp.rv34_inv_transform_tab[0](s->block[blknum] + blkoff); } if(r->block_type == RV34_MB_P_MIX16x16) r->cur_vlcs = choose_vlc_set(r->si.quant, r->si.vlc_set, 1); + q_dc = rv34_qscale_tab[rv34_chroma_quant[1][s->qscale]]; + q_ac = rv34_qscale_tab[rv34_chroma_quant[0][s->qscale]]; for(; i < 24; i++, cbp >>= 1){ if(!(cbp & 1)) continue; blknum = ((i & 4) >> 2) + 4; blkoff = ((i & 1) << 2) + ((i & 2) << 4); - rv34_decode_block(s->block[blknum] + blkoff, gb, r->cur_vlcs, r->chroma_vlc, 1); - r->rdsp.rv34_dequant4x4(s->block[blknum] + blkoff, rv34_qscale_tab[rv34_chroma_quant[1][s->qscale]],rv34_qscale_tab[rv34_chroma_quant[0][s->qscale]]); + rv34_decode_block(s->block[blknum] + blkoff, gb, r->cur_vlcs, r->chroma_vlc, 1, q_dc, q_ac, q_ac); r->rdsp.rv34_inv_transform_tab[0](s->block[blknum] + blkoff); } if (IS_INTRA(s->current_picture_ptr->f.mb_type[mb_pos])) diff --git a/libavcodec/rv34data.h b/libavcodec/rv34data.h index 2155084d09..1b608e7e35 100644 --- a/libavcodec/rv34data.h +++ b/libavcodec/rv34data.h @@ -100,16 +100,6 @@ static const uint16_t rv34_qscale_tab[32] = { 963, 1074, 1212, 1392, 1566, 1708, 1978, 2211 }; -/** - * 4x4 dezigzag pattern - */ -static const uint8_t rv34_dezigzag[16] = { - 0, 1, 8, 16, - 9, 2, 3, 10, - 17, 24, 25, 18, - 11, 19, 26, 27 -}; - /** * tables used to translate a quantizer value into a VLC set for decoding * The first table is used for intraframes. diff --git a/libavcodec/rv34dsp.c b/libavcodec/rv34dsp.c index 974bf9ec16..1f4cea8544 100644 --- a/libavcodec/rv34dsp.c +++ b/libavcodec/rv34dsp.c @@ -100,26 +100,10 @@ static void rv34_inv_transform_noround_c(DCTELEM *block){ /** @} */ // transform -/** - * Dequantize ordinary 4x4 block. 
- */ -void ff_rv34_dequant4x4_neon(DCTELEM *block, int Qdc, int Q); -static void rv34_dequant4x4_c(DCTELEM *block, int Qdc, int Q) -{ - int i, j; - - block[0] = (block[0] * Qdc + 8) >> 4; - for (i = 0; i < 4; i++) - for (j = !i; j < 4; j++) - block[j + i*8] = (block[j + i*8] * Q + 8) >> 4; -} - av_cold void ff_rv34dsp_init(RV34DSPContext *c, DSPContext* dsp) { c->rv34_inv_transform_tab[0] = rv34_inv_transform_c; c->rv34_inv_transform_tab[1] = rv34_inv_transform_noround_c; - c->rv34_dequant4x4 = rv34_dequant4x4_c; - if (HAVE_NEON) ff_rv34dsp_init_neon(c, dsp); } diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h index 01352ea793..f2bc20e911 100644 --- a/libavcodec/rv34dsp.h +++ b/libavcodec/rv34dsp.h @@ -56,7 +56,6 @@ typedef struct RV34DSPContext { h264_chroma_mc_func avg_chroma_pixels_tab[3]; rv40_weight_func rv40_weight_pixels_tab[2]; rv34_inv_transform_func rv34_inv_transform_tab[2]; - void (*rv34_dequant4x4)(DCTELEM *block, int Qdc, int Q); rv40_weak_loop_filter_func rv40_weak_loop_filter[2]; rv40_strong_loop_filter_func rv40_strong_loop_filter[2]; rv40_loop_filter_strength_func rv40_loop_filter_strength[2]; diff --git a/libavcodec/ulti.c b/libavcodec/ulti.c index 4f0d90f665..6e7ba35f9d 100644 --- a/libavcodec/ulti.c +++ b/libavcodec/ulti.c @@ -38,16 +38,9 @@ typedef struct UltimotionDecodeContext { int width, height, blocks; AVFrame frame; const uint8_t *ulti_codebook; + GetByteContext gb; } UltimotionDecodeContext; -#define CHECK_OVERREAD_SIZE(size) \ - do { \ - if (buf_end - buf < (size)) { \ - av_log(avctx, AV_LOG_ERROR, "Insufficient data\n"); \ - return AVERROR_INVALIDDATA; \ - } \ - } while(0) - static av_cold int ulti_decode_init(AVCodecContext *avctx) { UltimotionDecodeContext *s = avctx->priv_data; @@ -232,7 +225,6 @@ static int ulti_decode_frame(AVCodecContext *avctx, int i; int skip; int tmp; - const uint8_t *buf_end = buf + buf_size; s->frame.reference = 3; s->frame.buffer_hints = FF_BUFFER_HINTS_VALID | FF_BUFFER_HINTS_PRESERVE | FF_BUFFER_HINTS_REUSABLE; @@ -241,18 +233,20 @@ static int ulti_decode_frame(AVCodecContext *avctx, return -1; } + bytestream2_init(&s->gb, buf, buf_size); + while(!done) { int idx; if(blocks >= s->blocks || y >= s->height) break;//all blocks decoded - CHECK_OVERREAD_SIZE(1); - idx = *buf++; + if (bytestream2_get_bytes_left(&s->gb) < 1) + goto err; + idx = bytestream2_get_byteu(&s->gb); if((idx & 0xF8) == 0x70) { switch(idx) { case 0x70: //change modifier - CHECK_OVERREAD_SIZE(1); - modifier = *buf++; + modifier = bytestream2_get_byte(&s->gb); if(modifier>1) av_log(avctx, AV_LOG_INFO, "warning: modifier must be 0 or 1, got %i\n", modifier); break; @@ -266,8 +260,7 @@ static int ulti_decode_frame(AVCodecContext *avctx, done = 1; break; case 0x74: //skip some blocks - CHECK_OVERREAD_SIZE(1); - skip = *buf++; + skip = bytestream2_get_byte(&s->gb); if ((blocks + skip) >= s->blocks) break; blocks += skip; @@ -294,8 +287,7 @@ static int ulti_decode_frame(AVCodecContext *avctx, } else { cf = 0; if (idx) { - CHECK_OVERREAD_SIZE(1); - chroma = *buf++; + chroma = bytestream2_get_byte(&s->gb); } } for (i = 0; i < 4; i++) { // for every subblock @@ -303,15 +295,13 @@ static int ulti_decode_frame(AVCodecContext *avctx, if(!code) //skip subblock continue; if(cf) { - CHECK_OVERREAD_SIZE(1); - chroma = *buf++; + chroma = bytestream2_get_byte(&s->gb); } tx = x + block_coords[i * 2]; ty = y + block_coords[(i * 2) + 1]; switch(code) { case 1: - CHECK_OVERREAD_SIZE(1); - tmp = *buf++; + tmp = bytestream2_get_byte(&s->gb); angle = angle_by_index[(tmp >> 6) & 
0x3]; @@ -331,8 +321,7 @@ static int ulti_decode_frame(AVCodecContext *avctx, case 2: if (modifier) { // unpack four luma samples - CHECK_OVERREAD_SIZE(3); - tmp = bytestream_get_be24(&buf); + tmp = bytestream2_get_be24(&s->gb); Y[0] = (tmp >> 18) & 0x3F; Y[1] = (tmp >> 12) & 0x3F; @@ -340,8 +329,7 @@ static int ulti_decode_frame(AVCodecContext *avctx, Y[3] = tmp & 0x3F; angle = 16; } else { // retrieve luma samples from codebook - CHECK_OVERREAD_SIZE(2); - tmp = bytestream_get_be16(&buf); + tmp = bytestream2_get_be16(&s->gb); angle = (tmp >> 12) & 0xF; tmp &= 0xFFF; @@ -357,27 +345,27 @@ static int ulti_decode_frame(AVCodecContext *avctx, if (modifier) { // all 16 luma samples uint8_t Luma[16]; - CHECK_OVERREAD_SIZE(12); - - tmp = bytestream_get_be24(&buf); + if (bytestream2_get_bytes_left(&s->gb) < 12) + goto err; + tmp = bytestream2_get_be24u(&s->gb); Luma[0] = (tmp >> 18) & 0x3F; Luma[1] = (tmp >> 12) & 0x3F; Luma[2] = (tmp >> 6) & 0x3F; Luma[3] = tmp & 0x3F; - tmp = bytestream_get_be24(&buf); + tmp = bytestream2_get_be24u(&s->gb); Luma[4] = (tmp >> 18) & 0x3F; Luma[5] = (tmp >> 12) & 0x3F; Luma[6] = (tmp >> 6) & 0x3F; Luma[7] = tmp & 0x3F; - tmp = bytestream_get_be24(&buf); + tmp = bytestream2_get_be24u(&s->gb); Luma[8] = (tmp >> 18) & 0x3F; Luma[9] = (tmp >> 12) & 0x3F; Luma[10] = (tmp >> 6) & 0x3F; Luma[11] = tmp & 0x3F; - tmp = bytestream_get_be24(&buf); + tmp = bytestream2_get_be24u(&s->gb); Luma[12] = (tmp >> 18) & 0x3F; Luma[13] = (tmp >> 12) & 0x3F; Luma[14] = (tmp >> 6) & 0x3F; @@ -385,22 +373,23 @@ static int ulti_decode_frame(AVCodecContext *avctx, ulti_convert_yuv(&s->frame, tx, ty, Luma, chroma); } else { - CHECK_OVERREAD_SIZE(4); - tmp = *buf++; + if (bytestream2_get_bytes_left(&s->gb) < 4) + goto err; + tmp = bytestream2_get_byteu(&s->gb); if(tmp & 0x80) { angle = (tmp >> 4) & 0x7; - tmp = (tmp << 8) + *buf++; + tmp = (tmp << 8) + bytestream2_get_byteu(&s->gb); Y[0] = (tmp >> 6) & 0x3F; Y[1] = tmp & 0x3F; - Y[2] = (*buf++) & 0x3F; - Y[3] = (*buf++) & 0x3F; + Y[2] = bytestream2_get_byteu(&s->gb) & 0x3F; + Y[3] = bytestream2_get_byteu(&s->gb) & 0x3F; ulti_grad(&s->frame, tx, ty, Y, chroma, angle); //draw block } else { // some patterns int f0, f1; - f0 = *buf++; + f0 = bytestream2_get_byteu(&s->gb); f1 = tmp; - Y[0] = (*buf++) & 0x3F; - Y[1] = (*buf++) & 0x3F; + Y[0] = bytestream2_get_byteu(&s->gb) & 0x3F; + Y[1] = bytestream2_get_byteu(&s->gb) & 0x3F; ulti_pattern(&s->frame, tx, ty, f1, f0, Y[0], Y[1], chroma); } } @@ -422,6 +411,11 @@ static int ulti_decode_frame(AVCodecContext *avctx, *(AVFrame*)data= s->frame; return buf_size; + +err: + av_log(avctx, AV_LOG_ERROR, + "Insufficient data\n"); + return AVERROR_INVALIDDATA; } AVCodec ff_ulti_decoder = { diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c index e4ecf14b5b..1469d815ce 100644 --- a/libavcodec/vc1dec.c +++ b/libavcodec/vc1dec.c @@ -568,6 +568,7 @@ static void vc1_mc_1mv(VC1Context *v, int dir) } if (v->rangeredfrm || (v->mv_mode == MV_PMODE_INTENSITY_COMP) + || s->h_edge_pos < 22 || v_edge_pos < 22 || (unsigned)(src_x - s->mspel) > s->h_edge_pos - (mx&3) - 16 - s->mspel * 3 || (unsigned)(src_y - s->mspel) > v_edge_pos - (my&3) - 16 - s->mspel * 3) { uint8_t *uvbuf = s->edge_emu_buffer + 19 * s->linesize; @@ -799,6 +800,7 @@ static void vc1_mc_4mv_luma(VC1Context *v, int n, int dir) if (fieldmv && (src_y & 1) && src_y < 4) src_y--; if (v->rangeredfrm || (v->mv_mode == MV_PMODE_INTENSITY_COMP) + || s->h_edge_pos < 13 || v_edge_pos < 23 || (unsigned)(src_x - s->mspel) > s->h_edge_pos - (mx & 3) - 8 - s->mspel * 
2 || (unsigned)(src_y - (s->mspel << fieldmv)) > v_edge_pos - (my & 3) - ((8 + s->mspel * 2) << fieldmv)) { srcY -= s->mspel * (1 + (s->linesize << fieldmv)); @@ -998,6 +1000,7 @@ static void vc1_mc_4mv_chroma(VC1Context *v, int dir) } if (v->rangeredfrm || (v->mv_mode == MV_PMODE_INTENSITY_COMP) + || s->h_edge_pos < 18 || v_edge_pos < 18 || (unsigned)uvsrc_x > (s->h_edge_pos >> 1) - 9 || (unsigned)uvsrc_y > (v_edge_pos >> 1) - 9) { s->dsp.emulated_edge_mc(s->edge_emu_buffer , srcU, s->uvlinesize, @@ -1102,6 +1105,7 @@ static void vc1_mc_4mv_chroma4(VC1Context *v) if (fieldmv && (uvsrc_y & 1) && uvsrc_y < 2) uvsrc_y--; if ((v->mv_mode == MV_PMODE_INTENSITY_COMP) + || s->h_edge_pos < 10 || v_edge_pos < (5 << fieldmv) || (unsigned)uvsrc_x > (s->h_edge_pos >> 1) - 5 || (unsigned)uvsrc_y > v_edge_pos - (5 << fieldmv)) { s->dsp.emulated_edge_mc(s->edge_emu_buffer, srcU, s->uvlinesize, @@ -2006,7 +2010,7 @@ static void vc1_interp_mc(VC1Context *v) srcV = s->edge_emu_buffer + 18 * s->linesize; } - if (v->rangeredfrm + if (v->rangeredfrm || s->h_edge_pos < 22 || v_edge_pos < 22 || (unsigned)(src_x - s->mspel) > s->h_edge_pos - (mx & 3) - 16 - s->mspel * 3 || (unsigned)(src_y - s->mspel) > v_edge_pos - (my & 3) - 16 - s->mspel * 3) { uint8_t *uvbuf = s->edge_emu_buffer + 19 * s->linesize; diff --git a/libavdevice/v4l2.c b/libavdevice/v4l2.c index 15356fe980..0b00abbf10 100644 --- a/libavdevice/v4l2.c +++ b/libavdevice/v4l2.c @@ -69,19 +69,17 @@ static const int desired_video_buffers = 256; -enum io_method { - io_read, - io_mmap, - io_userptr -}; +#define V4L_ALLFORMATS 3 +#define V4L_RAWFORMATS 1 +#define V4L_COMPFORMATS 2 struct video_data { AVClass *class; int fd; int frame_format; /* V4L2_PIX_FMT_* */ - enum io_method io_method; int width, height; int frame_size; + int interlaced; int top_field_first; int buffers; @@ -89,8 +87,10 @@ struct video_data { unsigned int *buf_len; char *standard; int channel; - char *video_size; /**< String describing video size, set by a private option. */ + char *video_size; /**< String describing video size, + set by a private option. */ char *pixel_format; /**< Set by a private option. */ + int list_format; /**< Set by a private option. */ char *framerate; /**< Set by a private option. 
*/ }; @@ -124,7 +124,7 @@ static struct fmt_map fmt_conversion_table[] = { { PIX_FMT_NONE, CODEC_ID_MJPEG, V4L2_PIX_FMT_JPEG }, }; -static int device_open(AVFormatContext *ctx, uint32_t *capabilities) +static int device_open(AVFormatContext *ctx) { struct v4l2_capability cap; int fd; @@ -137,11 +137,15 @@ static int device_open(AVFormatContext *ctx, uint32_t *capabilities) if (ctx->flags & AVFMT_FLAG_NONBLOCK) { flags |= O_NONBLOCK; } + fd = v4l2_open(ctx->filename, flags, 0); if (fd < 0) { + err = errno; + av_log(ctx, AV_LOG_ERROR, "Cannot open video device %s : %s\n", - ctx->filename, strerror(errno)); - return AVERROR(errno); + ctx->filename, strerror(err)); + + return AVERROR(err); } #if CONFIG_LIBV4L2 fd_libv4l = v4l2_fd_open(fd, 0); @@ -155,53 +159,80 @@ static int device_open(AVFormatContext *ctx, uint32_t *capabilities) #endif res = v4l2_ioctl(fd, VIDIOC_QUERYCAP, &cap); - // ENOIOCTLCMD definition only availble on __KERNEL__ - if (res < 0 && ((err = errno) == 515)) { - av_log(ctx, AV_LOG_ERROR, "QUERYCAP not implemented, probably V4L device but not supporting V4L2\n"); - v4l2_close(fd); - - return AVERROR(515); - } if (res < 0) { + err = errno; av_log(ctx, AV_LOG_ERROR, "ioctl(VIDIOC_QUERYCAP): %s\n", - strerror(errno)); - v4l2_close(fd); - return AVERROR(err); + strerror(err)); + + goto fail; } - if ((cap.capabilities & V4L2_CAP_VIDEO_CAPTURE) == 0) { - av_log(ctx, AV_LOG_ERROR, "Not a video capture device\n"); - v4l2_close(fd); - return AVERROR(ENODEV); + + av_log(ctx, AV_LOG_VERBOSE, "[%d]Capabilities: %x\n", + fd, cap.capabilities); + + if (!(cap.capabilities & V4L2_CAP_VIDEO_CAPTURE)) { + av_log(ctx, AV_LOG_ERROR, "Not a video capture device.\n"); + err = ENODEV; + + goto fail; + } + + if (!(cap.capabilities & V4L2_CAP_STREAMING)) { + av_log(ctx, AV_LOG_ERROR, + "The device does not support the streaming I/O method.\n"); + err = ENOSYS; + + goto fail; } - *capabilities = cap.capabilities; return fd; + +fail: + v4l2_close(fd); + return AVERROR(err); } -static int device_init(AVFormatContext *ctx, int *width, int *height, uint32_t pix_fmt) +static int device_init(AVFormatContext *ctx, int *width, int *height, + uint32_t pix_fmt) { struct video_data *s = ctx->priv_data; int fd = s->fd; - struct v4l2_format fmt = {0}; + struct v4l2_format fmt; + struct v4l2_pix_format *pix = &fmt.fmt.pix; + int res; + memset(&fmt, 0, sizeof(struct v4l2_format)); + fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - fmt.fmt.pix.width = *width; - fmt.fmt.pix.height = *height; - fmt.fmt.pix.pixelformat = pix_fmt; - fmt.fmt.pix.field = V4L2_FIELD_INTERLACED; + pix->width = *width; + pix->height = *height; + pix->pixelformat = pix_fmt; + pix->field = V4L2_FIELD_ANY; + res = v4l2_ioctl(fd, VIDIOC_S_FMT, &fmt); + if ((*width != fmt.fmt.pix.width) || (*height != fmt.fmt.pix.height)) { - av_log(ctx, AV_LOG_INFO, "The V4L2 driver changed the video from %dx%d to %dx%d\n", *width, *height, fmt.fmt.pix.width, fmt.fmt.pix.height); + av_log(ctx, AV_LOG_INFO, + "The V4L2 driver changed the video from %dx%d to %dx%d\n", + *width, *height, fmt.fmt.pix.width, fmt.fmt.pix.height); *width = fmt.fmt.pix.width; *height = fmt.fmt.pix.height; } if (pix_fmt != fmt.fmt.pix.pixelformat) { - av_log(ctx, AV_LOG_DEBUG, "The V4L2 driver changed the pixel format from 0x%08X to 0x%08X\n", pix_fmt, fmt.fmt.pix.pixelformat); + av_log(ctx, AV_LOG_DEBUG, + "The V4L2 driver changed the pixel format " + "from 0x%08X to 0x%08X\n", + pix_fmt, fmt.fmt.pix.pixelformat); res = -1; } + if (fmt.fmt.pix.field == V4L2_FIELD_INTERLACED) { + av_log(ctx, 
AV_LOG_DEBUG, "The V4L2 driver using the interlaced mode"); + s->interlaced = 1; + } + return res; } @@ -264,6 +295,71 @@ static enum CodecID fmt_v4l2codec(uint32_t v4l2_fmt) return CODEC_ID_NONE; } +#if HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE +static void list_framesizes(AVFormatContext *ctx, int fd, uint32_t pixelformat) +{ + struct v4l2_frmsizeenum vfse = { .pixel_format = pixelformat }; + + while(!ioctl(fd, VIDIOC_ENUM_FRAMESIZES, &vfse)) { + switch (vfse.type) { + case V4L2_FRMSIZE_TYPE_DISCRETE: + av_log(ctx, AV_LOG_INFO, " %ux%u", + vfse.discrete.width, vfse.discrete.height); + break; + case V4L2_FRMSIZE_TYPE_CONTINUOUS: + case V4L2_FRMSIZE_TYPE_STEPWISE: + av_log(ctx, AV_LOG_INFO, " {%u-%u, %u}x{%u-%u, %u}", + vfse.stepwise.min_width, + vfse.stepwise.max_width, + vfse.stepwise.step_width, + vfse.stepwise.min_height, + vfse.stepwise.max_height, + vfse.stepwise.step_height); + } + vfse.index++; + } +} +#endif + +static void list_formats(AVFormatContext *ctx, int fd, int type) +{ + struct v4l2_fmtdesc vfd = { .type = V4L2_BUF_TYPE_VIDEO_CAPTURE }; + + while(!ioctl(fd, VIDIOC_ENUM_FMT, &vfd)) { + enum CodecID codec_id = fmt_v4l2codec(vfd.pixelformat); + enum PixelFormat pix_fmt = fmt_v4l2ff(vfd.pixelformat, codec_id); + + vfd.index++; + + if (!(vfd.flags & V4L2_FMT_FLAG_COMPRESSED) && + type & V4L_RAWFORMATS) { + const char *fmt_name = av_get_pix_fmt_name(pix_fmt); + av_log(ctx, AV_LOG_INFO, "R : %9s : %20s :", + fmt_name ? fmt_name : "Unsupported", + vfd.description); + } else if (vfd.flags & V4L2_FMT_FLAG_COMPRESSED && + type & V4L_COMPFORMATS) { + AVCodec *codec = avcodec_find_encoder(codec_id); + av_log(ctx, AV_LOG_INFO, "C : %9s : %20s :", + codec ? codec->name : "Unsupported", + vfd.description); + } else { + continue; + } + +#ifdef V4L2_FMT_FLAG_EMULATED + if (vfd.flags & V4L2_FMT_FLAG_EMULATED) { + av_log(ctx, AV_LOG_WARNING, "%s", "Emulated"); + continue; + } +#endif +#if HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE + list_framesizes(ctx, fd, vfd.pixelformat); +#endif + av_log(ctx, AV_LOG_INFO, "\n"); + } +} + static int mmap_init(AVFormatContext *ctx) { struct video_data *s = ctx->priv_data; @@ -314,12 +410,16 @@ static int mmap_init(AVFormatContext *ctx) s->buf_len[i] = buf.length; if (s->frame_size > 0 && s->buf_len[i] < s->frame_size) { - av_log(ctx, AV_LOG_ERROR, "Buffer len [%d] = %d != %d\n", i, s->buf_len[i], s->frame_size); + av_log(ctx, AV_LOG_ERROR, + "Buffer len [%d] = %d != %d\n", + i, s->buf_len[i], s->frame_size); return -1; } s->buf_start[i] = v4l2_mmap(NULL, buf.length, - PROT_READ | PROT_WRITE, MAP_SHARED, s->fd, buf.m.offset); + PROT_READ | PROT_WRITE, MAP_SHARED, + s->fd, buf.m.offset); + if (s->buf_start[i] == MAP_FAILED) { av_log(ctx, AV_LOG_ERROR, "mmap: %s\n", strerror(errno)); return AVERROR(errno); @@ -329,20 +429,14 @@ static int mmap_init(AVFormatContext *ctx) return 0; } -static int read_init(AVFormatContext *ctx) -{ - return -1; -} - static void mmap_release_buffer(AVPacket *pkt) { struct v4l2_buffer buf = {0}; int res, fd; struct buff_data *buf_descriptor = pkt->priv; - if (pkt->data == NULL) { - return; - } + if (pkt->data == NULL) + return; buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; buf.memory = V4L2_MEMORY_MMAP; @@ -351,9 +445,10 @@ static void mmap_release_buffer(AVPacket *pkt) av_free(buf_descriptor); res = v4l2_ioctl(fd, VIDIOC_QBUF, &buf); - if (res < 0) { - av_log(NULL, AV_LOG_ERROR, "ioctl(VIDIOC_QBUF): %s\n", strerror(errno)); - } + if (res < 0) + av_log(NULL, AV_LOG_ERROR, "ioctl(VIDIOC_QBUF): %s\n", + strerror(errno)); + pkt->data = NULL; 
pkt->size = 0; } @@ -375,13 +470,17 @@ static int mmap_read_frame(AVFormatContext *ctx, AVPacket *pkt) pkt->size = 0; return AVERROR(EAGAIN); } - av_log(ctx, AV_LOG_ERROR, "ioctl(VIDIOC_DQBUF): %s\n", strerror(errno)); + av_log(ctx, AV_LOG_ERROR, "ioctl(VIDIOC_DQBUF): %s\n", + strerror(errno)); return AVERROR(errno); } assert(buf.index < s->buffers); if (s->frame_size > 0 && buf.bytesused != s->frame_size) { - av_log(ctx, AV_LOG_ERROR, "The v4l2 frame is %d bytes, but %d bytes are expected\n", buf.bytesused, s->frame_size); + av_log(ctx, AV_LOG_ERROR, + "The v4l2 frame is %d bytes, but %d bytes are expected\n", + buf.bytesused, s->frame_size); + return AVERROR_INVALIDDATA; } @@ -407,11 +506,6 @@ static int mmap_read_frame(AVFormatContext *ctx, AVPacket *pkt) return s->buf_len[buf.index]; } -static int read_frame(AVFormatContext *ctx, AVPacket *pkt) -{ - return -1; -} - static int mmap_start(AVFormatContext *ctx) { struct video_data *s = ctx->priv_data; @@ -427,7 +521,9 @@ static int mmap_start(AVFormatContext *ctx) res = v4l2_ioctl(s->fd, VIDIOC_QBUF, &buf); if (res < 0) { - av_log(ctx, AV_LOG_ERROR, "ioctl(VIDIOC_QBUF): %s\n", strerror(errno)); + av_log(ctx, AV_LOG_ERROR, "ioctl(VIDIOC_QBUF): %s\n", + strerror(errno)); + return AVERROR(errno); } } @@ -435,7 +531,9 @@ static int mmap_start(AVFormatContext *ctx) type = V4L2_BUF_TYPE_VIDEO_CAPTURE; res = v4l2_ioctl(s->fd, VIDIOC_STREAMON, &type); if (res < 0) { - av_log(ctx, AV_LOG_ERROR, "ioctl(VIDIOC_STREAMON): %s\n", strerror(errno)); + av_log(ctx, AV_LOG_ERROR, "ioctl(VIDIOC_STREAMON): %s\n", + strerror(errno)); + return AVERROR(errno); } @@ -471,8 +569,10 @@ static int v4l2_set_parameters(AVFormatContext *s1, AVFormatParameters *ap) streamparm.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - if (s->framerate && (ret = av_parse_video_rate(&framerate_q, s->framerate)) < 0) { - av_log(s1, AV_LOG_ERROR, "Could not parse framerate '%s'.\n", s->framerate); + if (s->framerate && + (ret = av_parse_video_rate(&framerate_q, s->framerate)) < 0) { + av_log(s1, AV_LOG_ERROR, "Could not parse framerate '%s'.\n", + s->framerate); return ret; } @@ -486,7 +586,8 @@ static int v4l2_set_parameters(AVFormatContext *s1, AVFormatParameters *ap) av_log(s1, AV_LOG_DEBUG, "The V4L2 driver set input_id: %d, input: %s\n", s->channel, input.name); if (v4l2_ioctl(s->fd, VIDIOC_S_INPUT, &input.index) < 0) { - av_log(s1, AV_LOG_ERROR, "The V4L2 driver ioctl set input(%d) failed\n", + av_log(s1, AV_LOG_ERROR, + "The V4L2 driver ioctl set input(%d) failed\n", s->channel); return AVERROR(EIO); } @@ -506,10 +607,12 @@ static int v4l2_set_parameters(AVFormatContext *s1, AVFormatParameters *ap) return ret; } - av_log(s1, AV_LOG_DEBUG, "The V4L2 driver set standard: %s, id: %"PRIu64"\n", + av_log(s1, AV_LOG_DEBUG, + "The V4L2 driver set standard: %s, id: %"PRIu64"\n", s->standard, (uint64_t)standard.id); if (v4l2_ioctl(s->fd, VIDIOC_S_STD, &standard.id) < 0) { - av_log(s1, AV_LOG_ERROR, "The V4L2 driver ioctl set standard(%s) failed\n", + av_log(s1, AV_LOG_ERROR, + "The V4L2 driver ioctl set standard(%s) failed\n", s->standard); return AVERROR(EIO); } @@ -520,6 +623,7 @@ static int v4l2_set_parameters(AVFormatContext *s1, AVFormatParameters *ap) framerate_q.den, framerate_q.num); tpf->numerator = framerate_q.den; tpf->denominator = framerate_q.num; + if (v4l2_ioctl(s->fd, VIDIOC_S_PARM, &streamparm) != 0) { av_log(s1, AV_LOG_ERROR, "ioctl set time per frame(%d/%d) failed\n", @@ -530,14 +634,15 @@ static int v4l2_set_parameters(AVFormatContext *s1, AVFormatParameters *ap) if 
(framerate_q.num != tpf->denominator || framerate_q.den != tpf->numerator) { av_log(s1, AV_LOG_INFO, - "The driver changed the time per frame from %d/%d to %d/%d\n", + "The driver changed the time per frame from " + "%d/%d to %d/%d\n", framerate_q.den, framerate_q.num, tpf->numerator, tpf->denominator); } } else { - /* if timebase value is not set, read the timebase value from the driver */ if (v4l2_ioctl(s->fd, VIDIOC_G_PARM, &streamparm) != 0) { - av_log(s1, AV_LOG_ERROR, "ioctl(VIDIOC_G_PARM): %s\n", strerror(errno)); + av_log(s1, AV_LOG_ERROR, "ioctl(VIDIOC_G_PARM): %s\n", + strerror(errno)); return AVERROR(errno); } } @@ -571,6 +676,7 @@ static uint32_t device_try_init(AVFormatContext *s1, } } } + if (desired_format != 0) { *codec_id = fmt_v4l2codec(desired_format); assert(*codec_id != CODEC_ID_NONE); @@ -584,7 +690,7 @@ static int v4l2_read_header(AVFormatContext *s1, AVFormatParameters *ap) struct video_data *s = s1->priv_data; AVStream *st; int res = 0; - uint32_t desired_format, capabilities; + uint32_t desired_format; enum CodecID codec_id; enum PixelFormat pix_fmt = PIX_FMT_NONE; @@ -593,42 +699,62 @@ static int v4l2_read_header(AVFormatContext *s1, AVFormatParameters *ap) res = AVERROR(ENOMEM); goto out; } + + s->fd = device_open(s1); + if (s->fd < 0) { + res = s->fd; + goto out; + } + + if (s->list_format) { + list_formats(s1, s->fd, s->list_format); + res = AVERROR_EXIT; + goto out; + } + avpriv_set_pts_info(st, 64, 1, 1000000); /* 64 bits pts in us */ - if (s->video_size && (res = av_parse_video_size(&s->width, &s->height, s->video_size)) < 0) { - av_log(s1, AV_LOG_ERROR, "Could not parse video size '%s'.\n", s->video_size); - goto out; - } - if (s->pixel_format && (pix_fmt = av_get_pix_fmt(s->pixel_format)) == PIX_FMT_NONE) { - av_log(s1, AV_LOG_ERROR, "No such pixel format: %s.\n", s->pixel_format); - res = AVERROR(EINVAL); + if (s->video_size && + (res = av_parse_video_size(&s->width, &s->height, s->video_size)) < 0) { + av_log(s1, AV_LOG_ERROR, "Could not parse video size '%s'.\n", + s->video_size); goto out; } - capabilities = 0; - s->fd = device_open(s1, &capabilities); - if (s->fd < 0) { - res = AVERROR(EIO); - goto out; + if (s->pixel_format) { + + pix_fmt = av_get_pix_fmt(s->pixel_format); + + if (pix_fmt == PIX_FMT_NONE) { + av_log(s1, AV_LOG_ERROR, "No such pixel format: %s.\n", + s->pixel_format); + + res = AVERROR(EINVAL); + goto out; + } } - av_log(s1, AV_LOG_VERBOSE, "[%d]Capabilities: %x\n", s->fd, capabilities); if (!s->width && !s->height) { struct v4l2_format fmt; - av_log(s1, AV_LOG_VERBOSE, "Querying the device for the current frame size\n"); + av_log(s1, AV_LOG_VERBOSE, + "Querying the device for the current frame size\n"); fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; if (v4l2_ioctl(s->fd, VIDIOC_G_FMT, &fmt) < 0) { - av_log(s1, AV_LOG_ERROR, "ioctl(VIDIOC_G_FMT): %s\n", strerror(errno)); + av_log(s1, AV_LOG_ERROR, "ioctl(VIDIOC_G_FMT): %s\n", + strerror(errno)); res = AVERROR(errno); goto out; } + s->width = fmt.fmt.pix.width; s->height = fmt.fmt.pix.height; - av_log(s1, AV_LOG_VERBOSE, "Setting frame size to %dx%d\n", s->width, s->height); + av_log(s1, AV_LOG_VERBOSE, + "Setting frame size to %dx%d\n", s->width, s->height); } - desired_format = device_try_init(s1, pix_fmt, &s->width, &s->height, &codec_id); + desired_format = device_try_init(s1, pix_fmt, &s->width, &s->height, + &codec_id); if (desired_format == 0) { av_log(s1, AV_LOG_ERROR, "Cannot find a proper format for " "codec_id %d, pix_fmt %d.\n", s1->video_codec_id, pix_fmt); @@ -639,32 +765,29 @@ 
static int v4l2_read_header(AVFormatContext *s1, AVFormatParameters *ap) } if ((res = av_image_check_size(s->width, s->height, 0, s1)) < 0) goto out; + s->frame_format = desired_format; if ((res = v4l2_set_parameters(s1, ap)) < 0) goto out; st->codec->pix_fmt = fmt_v4l2ff(desired_format, codec_id); - s->frame_size = avpicture_get_size(st->codec->pix_fmt, s->width, s->height); - if (capabilities & V4L2_CAP_STREAMING) { - s->io_method = io_mmap; - res = mmap_init(s1); - if (res == 0) { - res = mmap_start(s1); - } - } else { - s->io_method = io_read; - res = read_init(s1); - } - if (res < 0) { + s->frame_size = + avpicture_get_size(st->codec->pix_fmt, s->width, s->height); + + if ((res = mmap_init(s1)) || + (res = mmap_start(s1)) < 0) { v4l2_close(s->fd); - res = AVERROR(EIO); goto out; } + s->top_field_first = first_field(s->fd); st->codec->codec_type = AVMEDIA_TYPE_VIDEO; st->codec->codec_id = codec_id; + if (codec_id == CODEC_ID_RAWVIDEO) + st->codec->codec_tag = + avcodec_pix_fmt_to_codec_tag(st->codec->pix_fmt); st->codec->width = s->width; st->codec->height = s->height; st->codec->bit_rate = s->frame_size * 1/av_q2d(st->codec->time_base) * 8; @@ -676,26 +799,17 @@ out: static int v4l2_read_packet(AVFormatContext *s1, AVPacket *pkt) { struct video_data *s = s1->priv_data; + AVFrame *frame = s1->streams[0]->codec->coded_frame; int res; - if (s->io_method == io_mmap) { - av_init_packet(pkt); - res = mmap_read_frame(s1, pkt); - } else if (s->io_method == io_read) { - if (av_new_packet(pkt, s->frame_size) < 0) - return AVERROR(EIO); - - res = read_frame(s1, pkt); - } else { - return AVERROR(EIO); - } - if (res < 0) { + av_init_packet(pkt); + if ((res = mmap_read_frame(s1, pkt)) < 0) { return res; } - if (s1->streams[0]->codec->coded_frame) { - s1->streams[0]->codec->coded_frame->interlaced_frame = 1; - s1->streams[0]->codec->coded_frame->top_field_first = s->top_field_first; + if (frame && s->interlaced) { + frame->interlaced_frame = 1; + frame->top_field_first = s->top_field_first; } return pkt->size; @@ -705,9 +819,7 @@ static int v4l2_read_close(AVFormatContext *s1) { struct video_data *s = s1->priv_data; - if (s->io_method == io_mmap) { - mmap_close(s); - } + mmap_close(s); v4l2_close(s->fd); return 0; @@ -717,11 +829,15 @@ static int v4l2_read_close(AVFormatContext *s1) #define DEC AV_OPT_FLAG_DECODING_PARAM static const AVOption options[] = { - { "standard", "", OFFSET(standard), AV_OPT_TYPE_STRING, {.str = NULL }, 0, 0, AV_OPT_FLAG_DECODING_PARAM }, - { "channel", "", OFFSET(channel), AV_OPT_TYPE_INT, {.dbl = 0 }, 0, INT_MAX, AV_OPT_FLAG_DECODING_PARAM }, - { "video_size", "A string describing frame size, such as 640x480 or hd720.", OFFSET(video_size), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, DEC }, - { "pixel_format", "", OFFSET(pixel_format), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, DEC }, - { "framerate", "", OFFSET(framerate), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, DEC }, + { "standard", "TV standard, used only by analog frame grabber", OFFSET(standard), AV_OPT_TYPE_STRING, {.str = NULL }, 0, 0, DEC }, + { "channel", "TV channel, used only by frame grabber", OFFSET(channel), AV_OPT_TYPE_INT, {.dbl = 0 }, 0, INT_MAX, DEC }, + { "video_size", "A string describing frame size, such as 640x480 or hd720.", OFFSET(video_size), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, DEC }, + { "pixel_format", "", OFFSET(pixel_format), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, DEC }, + { "framerate", "", OFFSET(framerate), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, DEC }, + { "list_formats", "List 
available formats and exit", OFFSET(list_format), AV_OPT_TYPE_INT, {.dbl = 0 }, 0, INT_MAX, DEC, "list_formats" }, + { "all", "Show all available formats", OFFSET(list_format), AV_OPT_TYPE_CONST, {.dbl = V4L_ALLFORMATS }, 0, INT_MAX, DEC, "list_formats" }, + { "raw", "Show only non-compressed formats", OFFSET(list_format), AV_OPT_TYPE_CONST, {.dbl = V4L_RAWFORMATS }, 0, INT_MAX, DEC, "list_formats" }, + { "compressed", "Show only compressed formats", OFFSET(list_format), AV_OPT_TYPE_CONST, {.dbl = V4L_COMPFORMATS }, 0, INT_MAX, DEC, "list_formats" }, { NULL }, }; diff --git a/libavformat/Makefile b/libavformat/Makefile index 877a86cac9..1afaddca1e 100644 --- a/libavformat/Makefile +++ b/libavformat/Makefile @@ -26,6 +26,7 @@ OBJS-$(CONFIG_AC3_MUXER) += rawenc.o OBJS-$(CONFIG_ACT_DEMUXER) += act.o OBJS-$(CONFIG_ADF_DEMUXER) += bintext.o sauce.o OBJS-$(CONFIG_ADX_DEMUXER) += adxdec.o +OBJS-$(CONFIG_ADX_MUXER) += rawenc.o OBJS-$(CONFIG_ADTS_MUXER) += adtsenc.o OBJS-$(CONFIG_AEA_DEMUXER) += aea.o pcm.o OBJS-$(CONFIG_AIFF_DEMUXER) += aiffdec.o riff.o pcm.o isom.o diff --git a/libavformat/adxdec.c b/libavformat/adxdec.c index dca9748301..ab11d832d8 100644 --- a/libavformat/adxdec.c +++ b/libavformat/adxdec.c @@ -109,4 +109,5 @@ AVInputFormat ff_adx_demuxer = { .read_packet = adx_read_packet, .extensions = "adx", .value = CODEC_ID_ADPCM_ADX, + .flags = AVFMT_GENERIC_INDEX, }; diff --git a/libavformat/allformats.c b/libavformat/allformats.c index 615ae574d0..d8ca0d50ea 100644 --- a/libavformat/allformats.c +++ b/libavformat/allformats.c @@ -54,7 +54,7 @@ void av_register_all(void) REGISTER_DEMUXER (ACT, act); REGISTER_DEMUXER (ADF, adf); REGISTER_MUXER (ADTS, adts); - REGISTER_DEMUXER (ADX, adx); + REGISTER_MUXDEMUX (ADX, adx); REGISTER_DEMUXER (AEA, aea); REGISTER_MUXDEMUX (AIFF, aiff); REGISTER_MUXDEMUX (AMR, amr); diff --git a/libavformat/aviobuf.c b/libavformat/aviobuf.c index d9d012ee90..cac6b1f5eb 100644 --- a/libavformat/aviobuf.c +++ b/libavformat/aviobuf.c @@ -574,6 +574,10 @@ static void fill_buffer(AVIOContext *s) int len= s->buffer_size - (dst - s->buffer); int max_buffer_size = s->max_packet_size ? 
s->max_packet_size : IO_BUFFER_SIZE; + /* can't fill the buffer without read_packet, just set EOF if appropiate */ + if (!s->read_packet && s->buf_ptr >= s->buf_end) + s->eof_reached = 1; + /* no need to do anything if EOF already reached */ if (s->eof_reached) return; diff --git a/libavformat/mtv.c b/libavformat/mtv.c index 4252309a6e..b4b06d96cd 100644 --- a/libavformat/mtv.c +++ b/libavformat/mtv.c @@ -112,10 +112,12 @@ static int mtv_read_header(AVFormatContext *s, AVFormatParameters *ap) avio_skip(pb, 4); audio_subsegments = avio_rl16(pb); - if(!audio_subsegments){ - av_log(s, AV_LOG_ERROR, "audio_subsegments is 0\n"); - return AVERROR(EINVAL); + + if (audio_subsegments == 0) { + av_log_ask_for_sample(s, "MTV files without audio are not supported\n"); + return AVERROR_INVALIDDATA; } + mtv->full_segment_size = audio_subsegments * (MTV_AUDIO_PADDING_SIZE + MTV_ASUBCHUNK_DATA_SIZE) + mtv->img_segment_size; diff --git a/libavformat/rawenc.c b/libavformat/rawenc.c index b2ff79abf3..4c1ede4928 100644 --- a/libavformat/rawenc.c +++ b/libavformat/rawenc.c @@ -45,6 +45,18 @@ AVOutputFormat ff_ac3_muxer = { }; #endif +#if CONFIG_ADX_MUXER +AVOutputFormat ff_adx_muxer = { + .name = "adx", + .long_name = NULL_IF_CONFIG_SMALL("CRI ADX"), + .extensions = "adx", + .audio_codec = CODEC_ID_ADPCM_ADX, + .video_codec = CODEC_ID_NONE, + .write_packet = ff_raw_write_packet, + .flags = AVFMT_NOTIMESTAMPS, +}; +#endif + #if CONFIG_DIRAC_MUXER AVOutputFormat ff_dirac_muxer = { .name = "dirac", diff --git a/libpostproc/postprocess_template.c b/libpostproc/postprocess_template.c index 924a39b9f1..4b8184c4f4 100644 --- a/libpostproc/postprocess_template.c +++ b/libpostproc/postprocess_template.c @@ -2470,7 +2470,7 @@ static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int st int64_t dc_mask, eq_mask, both_masks; int64_t sums[10*8*2]; src+= step*3; // src points to begin of the 8x8 Block -//START_TIMER + //{ START_TIMER __asm__ volatile( "movq %0, %%mm7 \n\t" "movq %1, %%mm6 \n\t" @@ -2995,7 +2995,8 @@ static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int st STOP_TIMER("step16") }else{ STOP_TIMER("stepX") -}*/ +} + } */ } #endif //HAVE_MMX diff --git a/libswscale/Makefile b/libswscale/Makefile index 6f25145c3a..78d0112c8e 100644 --- a/libswscale/Makefile +++ b/libswscale/Makefile @@ -19,7 +19,8 @@ OBJS-$(HAVE_MMX) += x86/rgb2rgb.o \ x86/swscale_mmx.o \ x86/yuv2rgb_mmx.o OBJS-$(HAVE_VIS) += sparc/yuv2rgb_vis.o -OBJS-$(HAVE_YASM) += x86/scale.o +MMX-OBJS-$(HAVE_YASM) += x86/output.o \ + x86/scale.o $(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS) diff --git a/libswscale/ppc/yuv2rgb_altivec.h b/libswscale/ppc/yuv2rgb_altivec.h index 163eba6eb7..7c2a7e547b 100644 --- a/libswscale/ppc/yuv2rgb_altivec.h +++ b/libswscale/ppc/yuv2rgb_altivec.h @@ -21,8 +21,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef PPC_YUV2RGB_ALTIVEC_H -#define PPC_YUV2RGB_ALTIVEC_H 1 +#ifndef SWSCALE_PPC_YUV2RGB_ALTIVEC_H +#define SWSCALE_PPC_YUV2RGB_ALTIVEC_H #define YUV2PACKEDX_HEADER(suffix) \ void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, const int16_t *lumFilter, \ @@ -39,4 +39,4 @@ YUV2PACKEDX_HEADER(rgba); YUV2PACKEDX_HEADER(rgb24); YUV2PACKEDX_HEADER(bgr24); -#endif /* PPC_YUV2RGB_ALTIVEC_H */ +#endif /* SWSCALE_PPC_YUV2RGB_ALTIVEC_H */ diff --git a/libswscale/swscale.c b/libswscale/swscale.c index 72f6d58144..ff0656ee20 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -18,39 +18,6 @@ * Foundation, Inc., 51 Franklin 
Street, Fifth Floor, Boston, MA 02110-1301 USA */ -/* - supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8 - supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09 - {BGR,RGB}{1,4,8,15,16} support dithering - - unscaled special converters (YV12=I420=IYUV, Y800=Y8) - YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32} - x -> x - YUV9 -> YV12 - YUV9/YV12 -> Y800 - Y800 -> YUV9/YV12 - BGR24 -> BGR32 & RGB24 -> RGB32 - BGR32 -> BGR24 & RGB32 -> RGB24 - BGR15 -> BGR16 -*/ - -/* -tested special converters (most are tested actually, but I did not write it down ...) - YV12 -> BGR12/BGR16 - YV12 -> YV12 - BGR15 -> BGR16 - BGR16 -> BGR16 - YVU9 -> YV12 - -untested special converters - YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK) - YV12/I420 -> YV12/I420 - YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format - BGR24 -> BGR32 & RGB24 -> RGB32 - BGR32 -> BGR24 & RGB32 -> RGB24 - BGR24 -> YV12 -*/ - #include #include #include @@ -2371,36 +2338,6 @@ find_c_packed_planar_out_funcs(SwsContext *c, } else { YUV_PACKED: switch (dstFormat) { - case PIX_FMT_GRAY16BE: - *yuv2packed1 = yuv2gray16BE_1_c; - *yuv2packed2 = yuv2gray16BE_2_c; - *yuv2packedX = yuv2gray16BE_X_c; - break; - case PIX_FMT_GRAY16LE: - *yuv2packed1 = yuv2gray16LE_1_c; - *yuv2packed2 = yuv2gray16LE_2_c; - *yuv2packedX = yuv2gray16LE_X_c; - break; - case PIX_FMT_MONOWHITE: - *yuv2packed1 = yuv2monowhite_1_c; - *yuv2packed2 = yuv2monowhite_2_c; - *yuv2packedX = yuv2monowhite_X_c; - break; - case PIX_FMT_MONOBLACK: - *yuv2packed1 = yuv2monoblack_1_c; - *yuv2packed2 = yuv2monoblack_2_c; - *yuv2packedX = yuv2monoblack_X_c; - break; - case PIX_FMT_YUYV422: - *yuv2packed1 = yuv2yuyv422_1_c; - *yuv2packed2 = yuv2yuyv422_2_c; - *yuv2packedX = yuv2yuyv422_X_c; - break; - case PIX_FMT_UYVY422: - *yuv2packed1 = yuv2uyvy422_1_c; - *yuv2packed2 = yuv2uyvy422_2_c; - *yuv2packedX = yuv2uyvy422_X_c; - break; case PIX_FMT_RGB48LE: *yuv2packed1 = yuv2rgb48le_1_c; *yuv2packed2 = yuv2rgb48le_2_c; @@ -2517,6 +2454,38 @@ find_c_packed_planar_out_funcs(SwsContext *c, break; } } + switch (dstFormat) { + case PIX_FMT_GRAY16BE: + *yuv2packed1 = yuv2gray16BE_1_c; + *yuv2packed2 = yuv2gray16BE_2_c; + *yuv2packedX = yuv2gray16BE_X_c; + break; + case PIX_FMT_GRAY16LE: + *yuv2packed1 = yuv2gray16LE_1_c; + *yuv2packed2 = yuv2gray16LE_2_c; + *yuv2packedX = yuv2gray16LE_X_c; + break; + case PIX_FMT_MONOWHITE: + *yuv2packed1 = yuv2monowhite_1_c; + *yuv2packed2 = yuv2monowhite_2_c; + *yuv2packedX = yuv2monowhite_X_c; + break; + case PIX_FMT_MONOBLACK: + *yuv2packed1 = yuv2monoblack_1_c; + *yuv2packed2 = yuv2monoblack_2_c; + *yuv2packedX = yuv2monoblack_X_c; + break; + case PIX_FMT_YUYV422: + *yuv2packed1 = yuv2yuyv422_1_c; + *yuv2packed2 = yuv2yuyv422_2_c; + *yuv2packedX = yuv2yuyv422_X_c; + break; + case PIX_FMT_UYVY422: + *yuv2packed1 = yuv2uyvy422_1_c; + *yuv2packed2 = yuv2uyvy422_2_c; + *yuv2packedX = yuv2uyvy422_X_c; + break; + } } #define DEBUG_SWSCALE_BUFFERS 0 diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm new file mode 100644 index 0000000000..0ec2038d1d --- /dev/null +++ b/libswscale/x86/output.asm @@ -0,0 +1,409 @@ +;****************************************************************************** +;* x86-optimized vertical line scaling functions +;* Copyright (c) 2011 Ronald S. Bultje +;* Kieran Kunhya +;* +;* This file is part of Libav. 
+;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +minshort: times 8 dw 0x8000 +yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000 +yuv2yuvX_10_start: times 4 dd 0x10000 +yuv2yuvX_9_start: times 4 dd 0x20000 +yuv2yuvX_10_upper: times 8 dw 0x3ff +yuv2yuvX_9_upper: times 8 dw 0x1ff +pd_4: times 4 dd 4 +pd_4min0x40000:times 4 dd 4 - (0x40000) +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 +pw_512: times 8 dw 512 +pw_1024: times 8 dw 1024 + +SECTION .text + +;----------------------------------------------------------------------------- +; vertical line scaling +; +; void yuv2plane1__(const int16_t *src, uint8_t *dst, int dstW, +; const uint8_t *dither, int offset) +; and +; void yuv2planeX__(const int16_t *filter, int filterSize, +; const int16_t **src, uint8_t *dst, int dstW, +; const uint8_t *dither, int offset) +; +; Scale one or $filterSize lines of source data to generate one line of output +; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in +; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple +; of 2. $offset is either 0 or 3. $dither holds 8 values. +;----------------------------------------------------------------------------- + +%macro yuv2planeX_fn 4 + +%ifdef ARCH_X86_32 +%define cntr_reg r1 +%define movsx mov +%else +%define cntr_reg r11 +%define movsx movsxd +%endif + +cglobal yuv2planeX_%2_%1, %4, 7, %3 +%if %2 == 8 || %2 == 9 || %2 == 10 + pxor m6, m6 +%endif ; %2 == 8/9/10 + +%if %2 == 8 +%ifdef ARCH_X86_32 +%assign pad 0x2c - (stack_offset & 15) + SUB rsp, pad +%define m_dith m7 +%else ; x86-64 +%define m_dith m9 +%endif ; x86-32 + + ; create registers holding dither + movq m_dith, [r5] ; dither + test r6d, r6d + jz .no_rot +%if mmsize == 16 + punpcklqdq m_dith, m_dith +%endif ; mmsize == 16 + PALIGNR m_dith, m_dith, 3, m0 +.no_rot: +%if mmsize == 16 + punpcklbw m_dith, m6 +%ifdef ARCH_X86_64 + punpcklwd m8, m_dith, m6 + pslld m8, 12 +%else ; x86-32 + punpcklwd m5, m_dith, m6 + pslld m5, 12 +%endif ; x86-32/64 + punpckhwd m_dith, m6 + pslld m_dith, 12 +%ifdef ARCH_X86_32 + mova [rsp+ 0], m5 + mova [rsp+16], m_dith +%endif +%else ; mmsize == 8 + punpcklbw m5, m_dith, m6 + punpckhbw m_dith, m6 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpcklwd m3, m_dith, m6 + punpckhwd m_dith, m6 + pslld m4, 12 + pslld m5, 12 + pslld m3, 12 + pslld m_dith, 12 + mova [rsp+ 0], m4 + mova [rsp+ 8], m5 + mova [rsp+16], m3 + mova [rsp+24], m_dith +%endif ; mmsize == 8/16 +%endif ; %2 == 8 + + xor r5, r5 + +.pixelloop: +%assign %%i 0 + ; the rep here is for the 8bit output mmx case, where dither covers + ; 8 pixels but we can only handle 2 pixels per register, and thus 4 + ; pixels per iteration. 
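The descriptive block at the top of this file pins down the contract of the vertical scalers. As a rough C model of the filtered 8-bit path only (not code added by this patch; the clip helper and loop layout are illustrative assumptions), the assembly computes the equivalent of: accumulate filterSize 15-bit input lines weighted by 12-bit coefficients, seed the accumulator with the dither value pre-shifted by 12, then shift down by 27 - 8 = 19 and saturate.

#include <stdint.h>

static uint8_t clip_uint8(int v)
{
    /* simple saturation helper, stands in for av_clip_uint8() */
    if (v < 0)   return 0;
    if (v > 255) return 255;
    return v;
}

/* Sketch of the yuv2planeX 8-bit output case described in the comment above. */
static void yuv2planeX_8_model(const int16_t *filter, int filterSize,
                               const int16_t **src, uint8_t *dest,
                               int dstW, const uint8_t *dither, int offset)
{
    int i, j;
    for (i = 0; i < dstW; i++) {
        int val = dither[(i + offset) & 7] << 12;
        for (j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];
        dest[i] = clip_uint8(val >> 19);
    }
}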
In order to not have to keep track of where + ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2. +%if %2 == 8 +%rep 16/mmsize +%endif ; %2 == 8 + +%if %2 == 8 +%ifdef ARCH_X86_32 + mova m2, [rsp+mmsize*(0+%%i)] + mova m1, [rsp+mmsize*(1+%%i)] +%else ; x86-64 + mova m2, m8 + mova m1, m_dith +%endif ; x86-32/64 +%else ; %2 == 9/10/16 + mova m1, [yuv2yuvX_%2_start] + mova m2, m1 +%endif ; %2 == 8/9/10/16 + movsx cntr_reg, r1m +.filterloop_ %+ %%i: + ; input pixels + mov r6, [r2+gprsize*cntr_reg-2*gprsize] +%if %2 == 16 + mova m3, [r6+r5*4] + mova m5, [r6+r5*4+mmsize] +%else ; %2 == 8/9/10 + mova m3, [r6+r5*2] +%endif ; %2 == 8/9/10/16 + mov r6, [r2+gprsize*cntr_reg-gprsize] +%if %2 == 16 + mova m4, [r6+r5*4] + mova m6, [r6+r5*4+mmsize] +%else ; %2 == 8/9/10 + mova m4, [r6+r5*2] +%endif ; %2 == 8/9/10/16 + + ; coefficients + movd m0, [r0+2*cntr_reg-4]; coeff[0], coeff[1] +%if %2 == 16 + pshuflw m7, m0, 0 ; coeff[0] + pshuflw m0, m0, 0x55 ; coeff[1] + pmovsxwd m7, m7 ; word -> dword + pmovsxwd m0, m0 ; word -> dword + + pmulld m3, m7 + pmulld m5, m7 + pmulld m4, m0 + pmulld m6, m0 + + paddd m2, m3 + paddd m1, m5 + paddd m2, m4 + paddd m1, m6 +%else ; %2 == 10/9/8 + punpcklwd m5, m3, m4 + punpckhwd m3, m4 + SPLATD m0, m0 + + pmaddwd m5, m0 + pmaddwd m3, m0 + + paddd m2, m5 + paddd m1, m3 +%endif ; %2 == 8/9/10/16 + + sub cntr_reg, 2 + jg .filterloop_ %+ %%i + +%if %2 == 16 + psrad m2, 31 - %2 + psrad m1, 31 - %2 +%else ; %2 == 10/9/8 + psrad m2, 27 - %2 + psrad m1, 27 - %2 +%endif ; %2 == 8/9/10/16 + +%if %2 == 8 + packssdw m2, m1 + packuswb m2, m2 + movh [r3+r5*1], m2 +%else ; %2 == 9/10/16 +%if %2 == 16 + packssdw m2, m1 + paddw m2, [minshort] +%else ; %2 == 9/10 +%ifidn %1, sse4 + packusdw m2, m1 +%elifidn %1, avx + packusdw m2, m1 +%else ; mmx2/sse2 + packssdw m2, m1 + pmaxsw m2, m6 +%endif ; mmx2/sse2/sse4/avx + pminsw m2, [yuv2yuvX_%2_upper] +%endif ; %2 == 9/10/16 + mova [r3+r5*2], m2 +%endif ; %2 == 8/9/10/16 + + add r5, mmsize/2 + sub r4d, mmsize/2 +%if %2 == 8 +%assign %%i %%i+2 +%endrep +%endif ; %2 == 8 + jg .pixelloop + +%if %2 == 8 +%ifdef ARCH_X86_32 + ADD rsp, pad + RET +%else ; x86-64 + REP_RET +%endif ; x86-32/64 +%else ; %2 == 9/10/16 + REP_RET +%endif ; %2 == 8/9/10/16 +%endmacro + +%define PALIGNR PALIGNR_MMX +%ifdef ARCH_X86_32 +INIT_MMX +yuv2planeX_fn mmx2, 8, 0, 7 +yuv2planeX_fn mmx2, 9, 0, 5 +yuv2planeX_fn mmx2, 10, 0, 5 +%endif + +INIT_XMM +yuv2planeX_fn sse2, 8, 10, 7 +yuv2planeX_fn sse2, 9, 7, 5 +yuv2planeX_fn sse2, 10, 7, 5 + +%define PALIGNR PALIGNR_SSSE3 +yuv2planeX_fn sse4, 8, 10, 7 +yuv2planeX_fn sse4, 9, 7, 5 +yuv2planeX_fn sse4, 10, 7, 5 +yuv2planeX_fn sse4, 16, 8, 5 + +INIT_AVX +yuv2planeX_fn avx, 8, 10, 7 +yuv2planeX_fn avx, 9, 7, 5 +yuv2planeX_fn avx, 10, 7, 5 + +; %1=outout-bpc, %2=alignment (u/a) +%macro yuv2plane1_mainloop 2 +.loop_%2: +%if %1 == 8 + paddsw m0, m2, [r0+r2*2+mmsize*0] + paddsw m1, m3, [r0+r2*2+mmsize*1] + psraw m0, 7 + psraw m1, 7 + packuswb m0, m1 + mov%2 [r1+r2], m0 +%elif %1 == 16 + paddd m0, m4, [r0+r2*4+mmsize*0] + paddd m1, m4, [r0+r2*4+mmsize*1] + paddd m2, m4, [r0+r2*4+mmsize*2] + paddd m3, m4, [r0+r2*4+mmsize*3] + psrad m0, 3 + psrad m1, 3 + psrad m2, 3 + psrad m3, 3 +%if cpuflag(sse4) ; avx/sse4 + packusdw m0, m1 + packusdw m2, m3 +%else ; mmx/sse2 + packssdw m0, m1 + packssdw m2, m3 + paddw m0, m5 + paddw m2, m5 +%endif ; mmx/sse2/sse4/avx + mov%2 [r1+r2*2], m0 + mov%2 [r1+r2*2+mmsize], m2 +%else + paddsw m0, m2, [r0+r2*2+mmsize*0] + paddsw m1, m2, [r0+r2*2+mmsize*1] + psraw m0, 15 - %1 + psraw m1, 15 - %1 + pmaxsw m0, 
m4 + pmaxsw m1, m4 + pminsw m0, m3 + pminsw m1, m3 + mov%2 [r1+r2*2], m0 + mov%2 [r1+r2*2+mmsize], m1 +%endif + add r2, mmsize + jl .loop_%2 +%endmacro + +%macro yuv2plane1_fn 3 +cglobal yuv2plane1_%1, %3, %3, %2 + add r2, mmsize - 1 + and r2, ~(mmsize - 1) +%if %1 == 8 + add r1, r2 +%else ; %1 != 8 + lea r1, [r1+r2*2] +%endif ; %1 == 8 +%if %1 == 16 + lea r0, [r0+r2*4] +%else ; %1 != 16 + lea r0, [r0+r2*2] +%endif ; %1 == 16 + neg r2 + +%if %1 == 8 + pxor m4, m4 ; zero + + ; create registers holding dither + movq m3, [r3] ; dither + test r4d, r4d + jz .no_rot +%if mmsize == 16 + punpcklqdq m3, m3 +%endif ; mmsize == 16 + PALIGNR_MMX m3, m3, 3, m2 +.no_rot: +%if mmsize == 8 + mova m2, m3 + punpckhbw m3, m4 ; byte->word + punpcklbw m2, m4 ; byte->word +%else + punpcklbw m3, m4 + mova m2, m3 +%endif +%elif %1 == 9 + pxor m4, m4 + mova m3, [pw_512] + mova m2, [pw_32] +%elif %1 == 10 + pxor m4, m4 + mova m3, [pw_1024] + mova m2, [pw_16] +%else ; %1 == 16 +%if cpuflag(sse4) ; sse4/avx + mova m4, [pd_4] +%else ; mmx/sse2 + mova m4, [pd_4min0x40000] + mova m5, [minshort] +%endif ; mmx/sse2/sse4/avx +%endif ; %1 == .. + + ; actual pixel scaling +%if mmsize == 8 + yuv2plane1_mainloop %1, a +%else ; mmsize == 16 + test r1, 15 + jnz .unaligned + yuv2plane1_mainloop %1, a + REP_RET +.unaligned: + yuv2plane1_mainloop %1, u +%endif ; mmsize == 8/16 + REP_RET +%endmacro + +%ifdef ARCH_X86_32 +INIT_MMX mmx +yuv2plane1_fn 8, 0, 5 +yuv2plane1_fn 16, 0, 3 + +INIT_MMX mmx2 +yuv2plane1_fn 9, 0, 3 +yuv2plane1_fn 10, 0, 3 +%endif + +INIT_XMM sse2 +yuv2plane1_fn 8, 5, 5 +yuv2plane1_fn 9, 5, 3 +yuv2plane1_fn 10, 5, 3 +yuv2plane1_fn 16, 6, 3 + +INIT_XMM sse4 +yuv2plane1_fn 16, 5, 3 + +INIT_XMM avx +yuv2plane1_fn 8, 5, 5 +yuv2plane1_fn 9, 5, 3 +yuv2plane1_fn 10, 5, 3 +yuv2plane1_fn 16, 5, 3 diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm index 14e2fb8406..09313b926f 100644 --- a/libswscale/x86/scale.asm +++ b/libswscale/x86/scale.asm @@ -1,7 +1,6 @@ ;****************************************************************************** -;* x86-optimized horizontal/vertical line scaling functions +;* x86-optimized horizontal line scaling functions ;* Copyright (c) 2011 Ronald S. Bultje -;* Kieran Kunhya ;* ;* This file is part of Libav. ;* @@ -29,17 +28,6 @@ max_19bit_int: times 4 dd 0x7ffff max_19bit_flt: times 4 dd 524287.0 minshort: times 8 dw 0x8000 unicoeff: times 4 dd 0x20000000 -yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000 -yuv2yuvX_10_start: times 4 dd 0x10000 -yuv2yuvX_9_start: times 4 dd 0x20000 -yuv2yuvX_10_upper: times 8 dw 0x3ff -yuv2yuvX_9_upper: times 8 dw 0x1ff -pd_4: times 4 dd 4 -pd_4min0x40000:times 4 dd 4 - (0x40000) -pw_16: times 8 dw 16 -pw_32: times 8 dw 32 -pw_512: times 8 dw 512 -pw_1024: times 8 dw 1024 SECTION .text @@ -441,371 +429,3 @@ INIT_XMM SCALE_FUNCS2 sse2, 6, 7, 8 SCALE_FUNCS2 ssse3, 6, 6, 8 SCALE_FUNCS2 sse4, 6, 6, 8 - -;----------------------------------------------------------------------------- -; vertical line scaling -; -; void yuv2plane1__(const int16_t *src, uint8_t *dst, int dstW, -; const uint8_t *dither, int offset) -; and -; void yuv2planeX__(const int16_t *filter, int filterSize, -; const int16_t **src, uint8_t *dst, int dstW, -; const uint8_t *dither, int offset) -; -; Scale one or $filterSize lines of source data to generate one line of output -; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in -; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple -; of 2. $offset is either 0 or 3. 
$dither holds 8 values. -;----------------------------------------------------------------------------- - -%macro yuv2planeX_fn 4 - -%ifdef ARCH_X86_32 -%define cntr_reg r1 -%define movsx mov -%else -%define cntr_reg r11 -%define movsx movsxd -%endif - -cglobal yuv2planeX_%2_%1, %4, 7, %3 -%if %2 == 8 || %2 == 9 || %2 == 10 - pxor m6, m6 -%endif ; %2 == 8/9/10 - -%if %2 == 8 -%ifdef ARCH_X86_32 -%assign pad 0x2c - (stack_offset & 15) - SUB rsp, pad -%define m_dith m7 -%else ; x86-64 -%define m_dith m9 -%endif ; x86-32 - - ; create registers holding dither - movq m_dith, [r5] ; dither - test r6d, r6d - jz .no_rot -%if mmsize == 16 - punpcklqdq m_dith, m_dith -%endif ; mmsize == 16 - PALIGNR m_dith, m_dith, 3, m0 -.no_rot: -%if mmsize == 16 - punpcklbw m_dith, m6 -%ifdef ARCH_X86_64 - punpcklwd m8, m_dith, m6 - pslld m8, 12 -%else ; x86-32 - punpcklwd m5, m_dith, m6 - pslld m5, 12 -%endif ; x86-32/64 - punpckhwd m_dith, m6 - pslld m_dith, 12 -%ifdef ARCH_X86_32 - mova [rsp+ 0], m5 - mova [rsp+16], m_dith -%endif -%else ; mmsize == 8 - punpcklbw m5, m_dith, m6 - punpckhbw m_dith, m6 - punpcklwd m4, m5, m6 - punpckhwd m5, m6 - punpcklwd m3, m_dith, m6 - punpckhwd m_dith, m6 - pslld m4, 12 - pslld m5, 12 - pslld m3, 12 - pslld m_dith, 12 - mova [rsp+ 0], m4 - mova [rsp+ 8], m5 - mova [rsp+16], m3 - mova [rsp+24], m_dith -%endif ; mmsize == 8/16 -%endif ; %2 == 8 - - xor r5, r5 - -.pixelloop: -%assign %%i 0 - ; the rep here is for the 8bit output mmx case, where dither covers - ; 8 pixels but we can only handle 2 pixels per register, and thus 4 - ; pixels per iteration. In order to not have to keep track of where - ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2. -%if %2 == 8 -%rep 16/mmsize -%endif ; %2 == 8 - -%if %2 == 8 -%ifdef ARCH_X86_32 - mova m2, [rsp+mmsize*(0+%%i)] - mova m1, [rsp+mmsize*(1+%%i)] -%else ; x86-64 - mova m2, m8 - mova m1, m_dith -%endif ; x86-32/64 -%else ; %2 == 9/10/16 - mova m1, [yuv2yuvX_%2_start] - mova m2, m1 -%endif ; %2 == 8/9/10/16 - movsx cntr_reg, r1m -.filterloop_ %+ %%i: - ; input pixels - mov r6, [r2+gprsize*cntr_reg-2*gprsize] -%if %2 == 16 - mova m3, [r6+r5*4] - mova m5, [r6+r5*4+mmsize] -%else ; %2 == 8/9/10 - mova m3, [r6+r5*2] -%endif ; %2 == 8/9/10/16 - mov r6, [r2+gprsize*cntr_reg-gprsize] -%if %2 == 16 - mova m4, [r6+r5*4] - mova m6, [r6+r5*4+mmsize] -%else ; %2 == 8/9/10 - mova m4, [r6+r5*2] -%endif ; %2 == 8/9/10/16 - - ; coefficients - movd m0, [r0+2*cntr_reg-4]; coeff[0], coeff[1] -%if %2 == 16 - pshuflw m7, m0, 0 ; coeff[0] - pshuflw m0, m0, 0x55 ; coeff[1] - pmovsxwd m7, m7 ; word -> dword - pmovsxwd m0, m0 ; word -> dword - - pmulld m3, m7 - pmulld m5, m7 - pmulld m4, m0 - pmulld m6, m0 - - paddd m2, m3 - paddd m1, m5 - paddd m2, m4 - paddd m1, m6 -%else ; %2 == 10/9/8 - punpcklwd m5, m3, m4 - punpckhwd m3, m4 - SPLATD m0, m0 - - pmaddwd m5, m0 - pmaddwd m3, m0 - - paddd m2, m5 - paddd m1, m3 -%endif ; %2 == 8/9/10/16 - - sub cntr_reg, 2 - jg .filterloop_ %+ %%i - -%if %2 == 16 - psrad m2, 31 - %2 - psrad m1, 31 - %2 -%else ; %2 == 10/9/8 - psrad m2, 27 - %2 - psrad m1, 27 - %2 -%endif ; %2 == 8/9/10/16 - -%if %2 == 8 - packssdw m2, m1 - packuswb m2, m2 - movh [r3+r5*1], m2 -%else ; %2 == 9/10/16 -%if %2 == 16 - packssdw m2, m1 - paddw m2, [minshort] -%else ; %2 == 9/10 -%ifidn %1, sse4 - packusdw m2, m1 -%elifidn %1, avx - packusdw m2, m1 -%else ; mmx2/sse2 - packssdw m2, m1 - pmaxsw m2, m6 -%endif ; mmx2/sse2/sse4/avx - pminsw m2, [yuv2yuvX_%2_upper] -%endif ; %2 == 9/10/16 - mova [r3+r5*2], m2 -%endif ; %2 == 8/9/10/16 - - add 
r5, mmsize/2 - sub r4d, mmsize/2 -%if %2 == 8 -%assign %%i %%i+2 -%endrep -%endif ; %2 == 8 - jg .pixelloop - -%if %2 == 8 -%ifdef ARCH_X86_32 - ADD rsp, pad - RET -%else ; x86-64 - REP_RET -%endif ; x86-32/64 -%else ; %2 == 9/10/16 - REP_RET -%endif ; %2 == 8/9/10/16 -%endmacro - -%define PALIGNR PALIGNR_MMX -%ifdef ARCH_X86_32 -INIT_MMX -yuv2planeX_fn mmx2, 8, 0, 7 -yuv2planeX_fn mmx2, 9, 0, 5 -yuv2planeX_fn mmx2, 10, 0, 5 -%endif - -INIT_XMM -yuv2planeX_fn sse2, 8, 10, 7 -yuv2planeX_fn sse2, 9, 7, 5 -yuv2planeX_fn sse2, 10, 7, 5 - -%define PALIGNR PALIGNR_SSSE3 -yuv2planeX_fn sse4, 8, 10, 7 -yuv2planeX_fn sse4, 9, 7, 5 -yuv2planeX_fn sse4, 10, 7, 5 -yuv2planeX_fn sse4, 16, 8, 5 - -INIT_AVX -yuv2planeX_fn avx, 8, 10, 7 -yuv2planeX_fn avx, 9, 7, 5 -yuv2planeX_fn avx, 10, 7, 5 - -; %1=outout-bpc, %2=alignment (u/a) -%macro yuv2plane1_mainloop 2 -.loop_%2: -%if %1 == 8 - paddsw m0, m2, [r0+r2*2+mmsize*0] - paddsw m1, m3, [r0+r2*2+mmsize*1] - psraw m0, 7 - psraw m1, 7 - packuswb m0, m1 - mov%2 [r1+r2], m0 -%elif %1 == 16 - paddd m0, m4, [r0+r2*4+mmsize*0] - paddd m1, m4, [r0+r2*4+mmsize*1] - paddd m2, m4, [r0+r2*4+mmsize*2] - paddd m3, m4, [r0+r2*4+mmsize*3] - psrad m0, 3 - psrad m1, 3 - psrad m2, 3 - psrad m3, 3 -%if cpuflag(sse4) ; avx/sse4 - packusdw m0, m1 - packusdw m2, m3 -%else ; mmx/sse2 - packssdw m0, m1 - packssdw m2, m3 - paddw m0, m5 - paddw m2, m5 -%endif ; mmx/sse2/sse4/avx - mov%2 [r1+r2*2], m0 - mov%2 [r1+r2*2+mmsize], m2 -%else - paddsw m0, m2, [r0+r2*2+mmsize*0] - paddsw m1, m2, [r0+r2*2+mmsize*1] - psraw m0, 15 - %1 - psraw m1, 15 - %1 - pmaxsw m0, m4 - pmaxsw m1, m4 - pminsw m0, m3 - pminsw m1, m3 - mov%2 [r1+r2*2], m0 - mov%2 [r1+r2*2+mmsize], m1 -%endif - add r2, mmsize - jl .loop_%2 -%endmacro - -%macro yuv2plane1_fn 3 -cglobal yuv2plane1_%1, %3, %3, %2 - add r2, mmsize - 1 - and r2, ~(mmsize - 1) -%if %1 == 8 - add r1, r2 -%else ; %1 != 8 - lea r1, [r1+r2*2] -%endif ; %1 == 8 -%if %1 == 16 - lea r0, [r0+r2*4] -%else ; %1 != 16 - lea r0, [r0+r2*2] -%endif ; %1 == 16 - neg r2 - -%if %1 == 8 - pxor m4, m4 ; zero - - ; create registers holding dither - movq m3, [r3] ; dither - test r4d, r4d - jz .no_rot -%if mmsize == 16 - punpcklqdq m3, m3 -%endif ; mmsize == 16 - PALIGNR_MMX m3, m3, 3, m2 -.no_rot: -%if mmsize == 8 - mova m2, m3 - punpckhbw m3, m4 ; byte->word - punpcklbw m2, m4 ; byte->word -%else - punpcklbw m3, m4 - mova m2, m3 -%endif -%elif %1 == 9 - pxor m4, m4 - mova m3, [pw_512] - mova m2, [pw_32] -%elif %1 == 10 - pxor m4, m4 - mova m3, [pw_1024] - mova m2, [pw_16] -%else ; %1 == 16 -%if cpuflag(sse4) ; sse4/avx - mova m4, [pd_4] -%else ; mmx/sse2 - mova m4, [pd_4min0x40000] - mova m5, [minshort] -%endif ; mmx/sse2/sse4/avx -%endif ; %1 == .. 
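While reading the yuv2plane1 code being moved out of scale.asm here (it reappears in the new output.asm above), it may help to keep in mind that the unfiltered 8-bit case reduces to roughly the following C. This is a sketch for orientation only, with an assumed clip helper; it is not part of the patch.

#include <stdint.h>

static uint8_t clip_uint8_model(int v)
{
    /* stand-in for av_clip_uint8() */
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* One 15-bit input line, per-pixel dither, shift down by 7,
 * saturate to unsigned 8 bit. */
static void yuv2plane1_8_model(const int16_t *src, uint8_t *dest, int dstW,
                               const uint8_t *dither, int offset)
{
    int i;
    for (i = 0; i < dstW; i++) {
        int val = src[i] + dither[(i + offset) & 7];
        dest[i] = clip_uint8_model(val >> 7);
    }
}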
- - ; actual pixel scaling -%if mmsize == 8 - yuv2plane1_mainloop %1, a -%else ; mmsize == 16 - test r1, 15 - jnz .unaligned - yuv2plane1_mainloop %1, a - REP_RET -.unaligned: - yuv2plane1_mainloop %1, u -%endif ; mmsize == 8/16 - REP_RET -%endmacro - -%ifdef ARCH_X86_32 -INIT_MMX mmx -yuv2plane1_fn 8, 0, 5 -yuv2plane1_fn 16, 0, 3 - -INIT_MMX mmx2 -yuv2plane1_fn 9, 0, 3 -yuv2plane1_fn 10, 0, 3 -%endif - -INIT_XMM sse2 -yuv2plane1_fn 8, 5, 5 -yuv2plane1_fn 9, 5, 3 -yuv2plane1_fn 10, 5, 3 -yuv2plane1_fn 16, 6, 3 - -INIT_XMM sse4 -yuv2plane1_fn 16, 5, 3 - -INIT_XMM avx -yuv2plane1_fn 8, 5, 5 -yuv2plane1_fn 9, 5, 3 -yuv2plane1_fn 10, 5, 3 -yuv2plane1_fn 16, 5, 3 diff --git a/tests/codec-regression.sh b/tests/codec-regression.sh index f3ead20298..7b58f53072 100755 --- a/tests/codec-regression.sh +++ b/tests/codec-regression.sh @@ -369,6 +369,11 @@ do_audio_encoding g726.wav "-b:a 32k -ac 1 -ar 8000 -acodec g726" do_audio_decoding fi +if [ -n "$do_adpcm_adx" ] ; then +do_audio_encoding adpcm_adx.adx "-acodec adpcm_adx" +do_audio_decoding +fi + if [ -n "$do_adpcm_ima_wav" ] ; then do_audio_encoding adpcm_ima.wav "-acodec adpcm_ima_wav" do_audio_decoding diff --git a/tests/ref/acodec/adpcm_adx b/tests/ref/acodec/adpcm_adx new file mode 100644 index 0000000000..8d86698101 --- /dev/null +++ b/tests/ref/acodec/adpcm_adx @@ -0,0 +1,4 @@ +0a30509d9296b857e134b762b76dbc31 *./tests/data/acodec/adpcm_adx.adx +297720 ./tests/data/acodec/adpcm_adx.adx +2dbc601ed5259f4d74dc48ccd8da7eaf *./tests/data/adpcm_adx.acodec.out.wav +stddev: 6989.46 PSNR: 19.44 MAXDIFF:65398 bytes: 1058432/ 1058400
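Returning to the aviobuf.c hunk earlier in this patch: the new guard declares EOF only for contexts that have no read_packet callback, once everything already buffered has been consumed. A toy model of that decision, using a stand-in struct rather than the real AVIOContext layout, might look like this:

#include <stdint.h>

/* Stand-in for the few AVIOContext fields the check relies on;
 * an illustration, not the real structure. */
typedef struct ToyIOContext {
    uint8_t *buffer, *buf_ptr, *buf_end;
    int (*read_packet)(void *opaque, uint8_t *buf, int buf_size);
    int eof_reached;
} ToyIOContext;

static void toy_fill_buffer(ToyIOContext *s)
{
    /* without a read_packet callback the buffer can never be refilled,
     * so once the buffered data is exhausted we are at EOF */
    if (!s->read_packet && s->buf_ptr >= s->buf_end)
        s->eof_reached = 1;

    /* no need to do anything if EOF already reached */
    if (s->eof_reached)
        return;

    /* ... the real code would now call read_packet() to refill ... */
}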