1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-23 12:43:46 +02:00

Merge remote-tracking branch 'qatar/master'

* qatar/master: (35 commits)
  flvdec: Do not call parse_keyframes_index with a NULL stream
  libspeexdec: include system headers before local headers
  libspeexdec: return meaningful error codes
  libspeexdec: cosmetics: reindent
  libspeexdec: decode one frame at a time.
  swscale: fix signed shift overflows in ff_yuv2rgb_c_init_tables()
  Move timefilter code from lavf to lavd.
  mov: add support for hdvd and pgapmetadata atoms
  mov: rename function _stik, some indentation cosmetics
  mov: rename function _int8 to remove ambiguity, some indentation cosmetics
  mov: parse the gnre atom
  mp3on4: check for allocation failures in decode_init_mp3on4()
  mp3on4: create a separate flush function for MP3onMP4.
  mp3on4: ensure that the frame channel count does not exceed the codec channel count.
  mp3on4: set channel layout
  mp3on4: fix the output channel order
  mp3on4: allocate temp buffer with av_malloc() instead of on the stack.
  mp3on4: copy MPADSPContext from first context to all contexts.
  fmtconvert: port float_to_int16_interleave() 2-channel x86 inline asm to yasm
  fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm
  ...

Conflicts:
	libavcodec/arm/h264dsp_init_arm.c
	libavcodec/h264.c
	libavcodec/h264.h
	libavcodec/h264_cabac.c
	libavcodec/h264_cavlc.c
	libavcodec/h264_ps.c
	libavcodec/h264dsp_template.c
	libavcodec/h264idct_template.c
	libavcodec/h264pred.c
	libavcodec/h264pred_template.c
	libavcodec/x86/h264dsp_mmx.c
	libavdevice/Makefile
	libavdevice/jack_audio.c
	libavformat/Makefile
	libavformat/flvdec.c
	libavformat/flvenc.c
	libavutil/pixfmt.h
	libswscale/utils.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Michael Niedermayer 2011-10-22 01:03:27 +02:00
commit aedc908601
47 changed files with 1112 additions and 991 deletions

View File

@ -67,6 +67,7 @@ easier to use. The changes are:
- aevalsrc audio source added - aevalsrc audio source added
- Ut Video decoder - Ut Video decoder
- Speex encoding via libspeex - Speex encoding via libspeex
- 4:2:2 H.264 decoding support
version 0.8: version 0.8:

View File

@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0); int beta, int8_t *tc0);
void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den, void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
int weight, int offset); int log2_den, int weight, int offset);
void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den, void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
int weight, int offset); int log2_den, int weight, int offset);
void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den, void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
int weight, int offset); int log2_den, int weight, int offset);
void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride, void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights, int height, int log2_den, int weightd,
int offset); int weights, int offset);
void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride, void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights, int height, int log2_den, int weightd,
int offset); int weights, int offset);
void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride, void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights, int height, int log2_den, int weightd,
int offset); int weights, int offset);
void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
@ -101,23 +76,14 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
} }
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon; c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon; c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon; c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon; c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon; c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon; c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
c->h264_idct_add = ff_h264_idct_add_neon; c->h264_idct_add = ff_h264_idct_add_neon;
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;

View File

@ -1592,7 +1592,7 @@ endfunc
vdup.8 d1, r5 vdup.8 d1, r5
vmov q2, q8 vmov q2, q8
vmov q3, q8 vmov q3, q8
1: subs ip, ip, #2 1: subs r3, r3, #2
vld1.8 {d20-d21},[r0,:128], r2 vld1.8 {d20-d21},[r0,:128], r2
\macd q2, d0, d20 \macd q2, d0, d20
pld [r0] pld [r0]
@ -1632,7 +1632,7 @@ endfunc
vdup.8 d1, r5 vdup.8 d1, r5
vmov q1, q8 vmov q1, q8
vmov q10, q8 vmov q10, q8
1: subs ip, ip, #2 1: subs r3, r3, #2
vld1.8 {d4},[r0,:64], r2 vld1.8 {d4},[r0,:64], r2
\macd q1, d0, d4 \macd q1, d0, d4
pld [r0] pld [r0]
@ -1662,7 +1662,7 @@ endfunc
vdup.8 d1, r5 vdup.8 d1, r5
vmov q1, q8 vmov q1, q8
vmov q10, q8 vmov q10, q8
1: subs ip, ip, #4 1: subs r3, r3, #4
vld1.32 {d4[0]},[r0,:32], r2 vld1.32 {d4[0]},[r0,:32], r2
vld1.32 {d4[1]},[r0,:32], r2 vld1.32 {d4[1]},[r0,:32], r2
\macd q1, d0, d4 \macd q1, d0, d4
@ -1700,16 +1700,17 @@ endfunc
.endm .endm
.macro biweight_func w .macro biweight_func w
function biweight_h264_pixels_\w\()_neon function ff_biweight_h264_pixels_\w\()_neon, export=1
push {r4-r6, lr} push {r4-r6, lr}
add r4, sp, #16 ldr r12, [sp, #16]
add r4, sp, #20
ldm r4, {r4-r6} ldm r4, {r4-r6}
lsr lr, r4, #31 lsr lr, r4, #31
add r6, r6, #1 add r6, r6, #1
eors lr, lr, r5, lsr #30 eors lr, lr, r5, lsr #30
orr r6, r6, #1 orr r6, r6, #1
vdup.16 q9, r3 vdup.16 q9, r12
lsl r6, r6, r3 lsl r6, r6, r12
vmvn q9, q9 vmvn q9, q9
vdup.16 q8, r6 vdup.16 q8, r6
mov r6, r0 mov r6, r0
@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon
endfunc endfunc
.endm .endm
.macro biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
mov ip, #\h
.if \b
b biweight_h264_pixels_\w\()_neon
.endif
endfunc
.endm
biweight_entry 16, 8
biweight_entry 16, 16, b=0
biweight_func 16 biweight_func 16
biweight_entry 8, 16
biweight_entry 8, 4
biweight_entry 8, 8, b=0
biweight_func 8 biweight_func 8
biweight_entry 4, 8
biweight_entry 4, 2
biweight_entry 4, 4, b=0
biweight_func 4 biweight_func 4
@ Weighted prediction @ Weighted prediction
.macro weight_16 add .macro weight_16 add
vdup.8 d0, r3 vdup.8 d0, r12
1: subs ip, ip, #2 1: subs r2, r2, #2
vld1.8 {d20-d21},[r0,:128], r1 vld1.8 {d20-d21},[r0,:128], r1
vmull.u8 q2, d0, d20 vmull.u8 q2, d0, d20
pld [r0] pld [r0]
@ -1785,8 +1767,8 @@ endfunc
.endm .endm
.macro weight_8 add .macro weight_8 add
vdup.8 d0, r3 vdup.8 d0, r12
1: subs ip, ip, #2 1: subs r2, r2, #2
vld1.8 {d4},[r0,:64], r1 vld1.8 {d4},[r0,:64], r1
vmull.u8 q1, d0, d4 vmull.u8 q1, d0, d4
pld [r0] pld [r0]
@ -1806,10 +1788,10 @@ endfunc
.endm .endm
.macro weight_4 add .macro weight_4 add
vdup.8 d0, r3 vdup.8 d0, r12
vmov q1, q8 vmov q1, q8
vmov q10, q8 vmov q10, q8
1: subs ip, ip, #4 1: subs r2, r2, #4
vld1.32 {d4[0]},[r0,:32], r1 vld1.32 {d4[0]},[r0,:32], r1
vld1.32 {d4[1]},[r0,:32], r1 vld1.32 {d4[1]},[r0,:32], r1
vmull.u8 q1, d0, d4 vmull.u8 q1, d0, d4
@ -1842,50 +1824,32 @@ endfunc
.endm .endm
.macro weight_func w .macro weight_func w
function weight_h264_pixels_\w\()_neon function ff_weight_h264_pixels_\w\()_neon, export=1
push {r4, lr} push {r4, lr}
ldr r4, [sp, #8] ldr r12, [sp, #8]
cmp r2, #1 ldr r4, [sp, #12]
lsl r4, r4, r2 cmp r3, #1
lsl r4, r4, r3
vdup.16 q8, r4 vdup.16 q8, r4
mov r4, r0 mov r4, r0
ble 20f ble 20f
rsb lr, r2, #1 rsb lr, r3, #1
vdup.16 q9, lr vdup.16 q9, lr
cmp r3, #0 cmp r12, #0
blt 10f blt 10f
weight_\w vhadd.s16 weight_\w vhadd.s16
10: rsb r3, r3, #0 10: rsb r12, r12, #0
weight_\w vhsub.s16 weight_\w vhsub.s16
20: rsb lr, r2, #0 20: rsb lr, r3, #0
vdup.16 q9, lr vdup.16 q9, lr
cmp r3, #0 cmp r12, #0
blt 10f blt 10f
weight_\w vadd.s16 weight_\w vadd.s16
10: rsb r3, r3, #0 10: rsb r12, r12, #0
weight_\w vsub.s16 weight_\w vsub.s16
endfunc endfunc
.endm .endm
.macro weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
mov ip, #\h
.if \b
b weight_h264_pixels_\w\()_neon
.endif
endfunc
.endm
weight_entry 16, 8
weight_entry 16, 16, b=0
weight_func 16 weight_func 16
weight_entry 8, 16
weight_entry 8, 4
weight_entry 8, 8, b=0
weight_func 8 weight_func 8
weight_entry 4, 8
weight_entry 4, 2
weight_entry 4, 4, b=0
weight_func 4 weight_func 4

View File

@ -70,7 +70,15 @@ typedef struct FmtConvertContext {
long len, int channels); long len, int channels);
/** /**
* Convert an array of interleaved float to multiple arrays of float. * Convert multiple arrays of float to an array of interleaved float.
*
* @param dst destination array of interleaved float.
* constraints: 16-byte aligned
* @param src source array of float arrays, one for each channel.
* constraints: 16-byte aligned
* @param len number of elements to convert.
* constraints: multiple of 8
* @param channels number of channels
*/ */
void (*float_interleave)(float *dst, const float **src, unsigned int len, void (*float_interleave)(float *dst, const float **src, unsigned int len,
int channels); int channels);

View File

@ -460,11 +460,14 @@ static void chroma_dc_dct_c(DCTELEM *block){
} }
#endif #endif
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list, static av_always_inline void
mc_dir_part(H264Context *h, Picture *pic, int n, int square,
int height, int delta, int list,
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
int src_x_offset, int src_y_offset, int src_x_offset, int src_y_offset,
qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
int pixel_shift, int chroma444){ int pixel_shift, int chroma_idc)
{
MpegEncContext * const s = &h->s; MpegEncContext * const s = &h->s;
const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8; const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8; int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
@ -479,6 +482,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
const int full_my= my>>2; const int full_my= my>>2;
const int pic_width = 16*s->mb_width; const int pic_width = 16*s->mb_width;
const int pic_height = 16*s->mb_height >> MB_FIELD; const int pic_height = 16*s->mb_height >> MB_FIELD;
int ysh;
if(mx&7) extra_width -= 3; if(mx&7) extra_width -= 3;
if(my&7) extra_height -= 3; if(my&7) extra_height -= 3;
@ -487,7 +491,8 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
|| full_my < 0-extra_height || full_my < 0-extra_height
|| full_mx + 16/*FIXME*/ > pic_width + extra_width || full_mx + 16/*FIXME*/ > pic_width + extra_width
|| full_my + 16/*FIXME*/ > pic_height + extra_height){ || full_my + 16/*FIXME*/ > pic_height + extra_height){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height); s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize,
16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
src_y= s->edge_emu_buffer + (2 << pixel_shift) + 2*h->mb_linesize; src_y= s->edge_emu_buffer + (2 << pixel_shift) + 2*h->mb_linesize;
emu=1; emu=1;
} }
@ -499,7 +504,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return; if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
if(chroma444){ if(chroma_idc == 3 /* yuv444 */){
src_cb = pic->f.data[1] + offset; src_cb = pic->f.data[1] + offset;
if(emu){ if(emu){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize, s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize,
@ -524,42 +529,55 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
return; return;
} }
if(MB_FIELD){ ysh = 3 - (chroma_idc == 2 /* yuv422 */);
if(chroma_idc == 1 /* yuv420 */ && MB_FIELD){
// chroma offset when predicting from a field of opposite parity // chroma offset when predicting from a field of opposite parity
my += 2 * ((s->mb_y & 1) - (pic->f.reference - 1)); my += 2 * ((s->mb_y & 1) - (pic->f.reference - 1));
emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1); emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
} }
src_cb = pic->f.data[1] + ((mx >> 3) << pixel_shift) + (my >> 3) * h->mb_uvlinesize;
src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + (my >> 3) * h->mb_uvlinesize; src_cb = pic->f.data[1] + ((mx >> 3) << pixel_shift) + (my >> ysh) * h->mb_uvlinesize;
src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + (my >> ysh) * h->mb_uvlinesize;
if(emu){ if(emu){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1); s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize,
9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
src_cb= s->edge_emu_buffer; src_cb= s->edge_emu_buffer;
} }
chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7); chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
mx&7, (my << (chroma_idc == 2 /* yuv422 */)) &7);
if(emu){ if(emu){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1); s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize,
9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
src_cr= s->edge_emu_buffer; src_cr= s->edge_emu_buffer;
} }
chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7); chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
mx&7, (my << (chroma_idc == 2 /* yuv422 */)) &7);
} }
static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta, static av_always_inline void
mc_part_std(H264Context *h, int n, int square, int height, int delta,
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
int x_offset, int y_offset, int x_offset, int y_offset,
qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
int list0, int list1, int pixel_shift, int chroma444){ int list0, int list1, int pixel_shift, int chroma_idc)
{
MpegEncContext * const s = &h->s; MpegEncContext * const s = &h->s;
qpel_mc_func *qpix_op= qpix_put; qpel_mc_func *qpix_op= qpix_put;
h264_chroma_mc_func chroma_op= chroma_put; h264_chroma_mc_func chroma_op= chroma_put;
dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
if(chroma444){ if (chroma_idc == 3 /* yuv444 */) {
dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
}else{ } else if (chroma_idc == 2 /* yuv422 */) {
dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
} else /* yuv420 */ {
dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize;
dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize;
} }
@ -568,9 +586,9 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
if(list0){ if(list0){
Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ]; Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
mc_dir_part(h, ref, n, square, chroma_height, delta, 0, mc_dir_part(h, ref, n, square, height, delta, 0,
dest_y, dest_cb, dest_cr, x_offset, y_offset, dest_y, dest_cb, dest_cr, x_offset, y_offset,
qpix_op, chroma_op, pixel_shift, chroma444); qpix_op, chroma_op, pixel_shift, chroma_idc);
qpix_op= qpix_avg; qpix_op= qpix_avg;
chroma_op= chroma_avg; chroma_op= chroma_avg;
@ -578,28 +596,36 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
if(list1){ if(list1){
Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ]; Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
mc_dir_part(h, ref, n, square, chroma_height, delta, 1, mc_dir_part(h, ref, n, square, height, delta, 1,
dest_y, dest_cb, dest_cr, x_offset, y_offset, dest_y, dest_cb, dest_cr, x_offset, y_offset,
qpix_op, chroma_op, pixel_shift, chroma444); qpix_op, chroma_op, pixel_shift, chroma_idc);
} }
} }
static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta, static av_always_inline void
mc_part_weighted(H264Context *h, int n, int square, int height, int delta,
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
int x_offset, int y_offset, int x_offset, int y_offset,
qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
int list0, int list1, int pixel_shift, int chroma444){ int list0, int list1, int pixel_shift, int chroma_idc){
MpegEncContext * const s = &h->s; MpegEncContext * const s = &h->s;
int chroma_height;
dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
if(chroma444){ if (chroma_idc == 3 /* yuv444 */) {
chroma_height = height;
chroma_weight_avg = luma_weight_avg; chroma_weight_avg = luma_weight_avg;
chroma_weight_op = luma_weight_op; chroma_weight_op = luma_weight_op;
dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
}else{ } else if (chroma_idc == 2 /* yuv422 */) {
chroma_height = height;
dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
} else /* yuv420 */ {
chroma_height = height >> 1;
dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize;
dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize;
} }
@ -615,27 +641,32 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
int refn0 = h->ref_cache[0][ scan8[n] ]; int refn0 = h->ref_cache[0][ scan8[n] ];
int refn1 = h->ref_cache[1][ scan8[n] ]; int refn1 = h->ref_cache[1][ scan8[n] ];
mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0, mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0,
dest_y, dest_cb, dest_cr, dest_y, dest_cb, dest_cr,
x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); x_offset, y_offset, qpix_put, chroma_put,
mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1, pixel_shift, chroma_idc);
mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1,
tmp_y, tmp_cb, tmp_cr, tmp_y, tmp_cb, tmp_cr,
x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); x_offset, y_offset, qpix_put, chroma_put,
pixel_shift, chroma_idc);
if(h->use_weight == 2){ if(h->use_weight == 2){
int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1]; int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
int weight1 = 64 - weight0; int weight1 = 64 - weight0;
luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0); luma_weight_avg( dest_y, tmp_y, h-> mb_linesize,
chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0); height, 5, weight0, weight1, 0);
chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0); chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize,
chroma_height, 5, weight0, weight1, 0);
chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize,
chroma_height, 5, weight0, weight1, 0);
}else{ }else{
luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom, luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom,
h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0], h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0],
h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]); h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]);
chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0], h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]); h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
} }
@ -643,42 +674,46 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
int list = list1 ? 1 : 0; int list = list1 ? 1 : 0;
int refn = h->ref_cache[list][ scan8[n] ]; int refn = h->ref_cache[list][ scan8[n] ];
Picture *ref= &h->ref_list[list][refn]; Picture *ref= &h->ref_list[list][refn];
mc_dir_part(h, ref, n, square, chroma_height, delta, list, mc_dir_part(h, ref, n, square, height, delta, list,
dest_y, dest_cb, dest_cr, x_offset, y_offset, dest_y, dest_cb, dest_cr, x_offset, y_offset,
qpix_put, chroma_put, pixel_shift, chroma444); qpix_put, chroma_put, pixel_shift, chroma_idc);
luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom, luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom,
h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]); h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
if(h->use_weight_chroma){ if(h->use_weight_chroma){
chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
} }
} }
} }
static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta, static av_always_inline void
mc_part(H264Context *h, int n, int square, int height, int delta,
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
int x_offset, int y_offset, int x_offset, int y_offset,
qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
h264_weight_func *weight_op, h264_biweight_func *weight_avg, h264_weight_func *weight_op, h264_biweight_func *weight_avg,
int list0, int list1, int pixel_shift, int chroma444){ int list0, int list1, int pixel_shift, int chroma_idc)
{
if((h->use_weight==2 && list0 && list1 if((h->use_weight==2 && list0 && list1
&& (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32)) && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
|| h->use_weight==1) || h->use_weight==1)
mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
x_offset, y_offset, qpix_put, chroma_put, x_offset, y_offset, qpix_put, chroma_put,
weight_op[0], weight_op[3], weight_avg[0], weight_op[0], weight_op[1], weight_avg[0],
weight_avg[3], list0, list1, pixel_shift, chroma444); weight_avg[1], list0, list1, pixel_shift, chroma_idc);
else else
mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
x_offset, y_offset, qpix_put, chroma_put, qpix_avg, x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
chroma_avg, list0, list1, pixel_shift, chroma444); chroma_avg, list0, list1, pixel_shift, chroma_idc);
} }
static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma444){ static av_always_inline void
prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma_idc)
{
/* fetch pixels for estimated mv 4 macroblocks ahead /* fetch pixels for estimated mv 4 macroblocks ahead
* optimized for 64byte cache lines */ * optimized for 64byte cache lines */
MpegEncContext * const s = &h->s; MpegEncContext * const s = &h->s;
@ -689,7 +724,7 @@ static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, in
uint8_t **src = h->ref_list[list][refn].f.data; uint8_t **src = h->ref_list[list][refn].f.data;
int off= (mx << pixel_shift) + (my + (s->mb_x&3)*4)*h->mb_linesize + (64 << pixel_shift); int off= (mx << pixel_shift) + (my + (s->mb_x&3)*4)*h->mb_linesize + (64 << pixel_shift);
s->dsp.prefetch(src[0]+off, s->linesize, 4); s->dsp.prefetch(src[0]+off, s->linesize, 4);
if(chroma444){ if (chroma_idc == 3 /* yuv444 */) {
s->dsp.prefetch(src[1]+off, s->linesize, 4); s->dsp.prefetch(src[1]+off, s->linesize, 4);
s->dsp.prefetch(src[2]+off, s->linesize, 4); s->dsp.prefetch(src[2]+off, s->linesize, 4);
}else{ }else{
@ -703,7 +738,8 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put), qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg), qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
h264_weight_func *weight_op, h264_biweight_func *weight_avg, h264_weight_func *weight_op, h264_biweight_func *weight_avg,
int pixel_shift, int chroma444){ int pixel_shift, int chroma_idc)
{
MpegEncContext * const s = &h->s; MpegEncContext * const s = &h->s;
const int mb_xy= h->mb_xy; const int mb_xy= h->mb_xy;
const int mb_type = s->current_picture.f.mb_type[mb_xy]; const int mb_type = s->current_picture.f.mb_type[mb_xy];
@ -712,36 +748,36 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
if(HAVE_THREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME)) if(HAVE_THREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME))
await_references(h); await_references(h);
prefetch_motion(h, 0, pixel_shift, chroma444); prefetch_motion(h, 0, pixel_shift, chroma_idc);
if(IS_16X16(mb_type)){ if(IS_16X16(mb_type)){
mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0, mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
weight_op, weight_avg, weight_op, weight_avg,
IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma_idc);
}else if(IS_16X8(mb_type)){ }else if(IS_16X8(mb_type)){
mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0, mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
&weight_op[1], &weight_avg[1], weight_op, weight_avg,
IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma_idc);
mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4, mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
&weight_op[1], &weight_avg[1], weight_op, weight_avg,
IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
pixel_shift, chroma444); pixel_shift, chroma_idc);
}else if(IS_8X16(mb_type)){ }else if(IS_8X16(mb_type)){
mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
&weight_op[2], &weight_avg[2], &weight_op[1], &weight_avg[1],
IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma_idc);
mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
&weight_op[2], &weight_avg[2], &weight_op[1], &weight_avg[1],
IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
pixel_shift, chroma444); pixel_shift, chroma_idc);
}else{ }else{
int i; int i;
@ -754,50 +790,72 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
int y_offset= (i&2)<<1; int y_offset= (i&2)<<1;
if(IS_SUB_8X8(sub_mb_type)){ if(IS_SUB_8X8(sub_mb_type)){
mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
&weight_op[3], &weight_avg[3], &weight_op[1], &weight_avg[1],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma_idc);
}else if(IS_SUB_8X4(sub_mb_type)){ }else if(IS_SUB_8X4(sub_mb_type)){
mc_part(h, n , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset, mc_part(h, n , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
&weight_op[4], &weight_avg[4], &weight_op[1], &weight_avg[1],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma_idc);
mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
&weight_op[4], &weight_avg[4], &weight_op[1], &weight_avg[1],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma_idc);
}else if(IS_SUB_4X8(sub_mb_type)){ }else if(IS_SUB_4X8(sub_mb_type)){
mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, mc_part(h, n , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
&weight_op[5], &weight_avg[5], &weight_op[2], &weight_avg[2],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma_idc);
mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
&weight_op[5], &weight_avg[5], &weight_op[2], &weight_avg[2],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma_idc);
}else{ }else{
int j; int j;
assert(IS_SUB_4X4(sub_mb_type)); assert(IS_SUB_4X4(sub_mb_type));
for(j=0; j<4; j++){ for(j=0; j<4; j++){
int sub_x_offset= x_offset + 2*(j&1); int sub_x_offset= x_offset + 2*(j&1);
int sub_y_offset= y_offset + (j&2); int sub_y_offset= y_offset + (j&2);
mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
&weight_op[6], &weight_avg[6], &weight_op[2], &weight_avg[2],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma_idc);
} }
} }
} }
} }
prefetch_motion(h, 1, pixel_shift, chroma444); prefetch_motion(h, 1, pixel_shift, chroma_idc);
}
static av_always_inline void
hl_motion_420(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
h264_weight_func *weight_op, h264_biweight_func *weight_avg,
int pixel_shift)
{
hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put,
qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 1);
}
static av_always_inline void
hl_motion_422(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
h264_weight_func *weight_op, h264_biweight_func *weight_avg,
int pixel_shift)
{
hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put,
qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 2);
} }
static void free_tables(H264Context *h, int free_rbsp){ static void free_tables(H264Context *h, int free_rbsp){
@ -1468,7 +1526,10 @@ static void decode_postinit(H264Context *h, int setup_finished){
ff_thread_finish_setup(s->avctx); ff_thread_finish_setup(s->avctx);
} }
static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){ static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y,
uint8_t *src_cb, uint8_t *src_cr,
int linesize, int uvlinesize, int simple)
{
MpegEncContext * const s = &h->s; MpegEncContext * const s = &h->s;
uint8_t *top_border; uint8_t *top_border;
int top_idx = 1; int top_idx = 1;
@ -1813,7 +1874,8 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type,
} }
} }
static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift){ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift)
{
MpegEncContext * const s = &h->s; MpegEncContext * const s = &h->s;
const int mb_x= s->mb_x; const int mb_x= s->mb_x;
const int mb_y= s->mb_y; const int mb_y= s->mb_y;
@ -1827,7 +1889,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
/* is_h264 should always be true if SVQ3 is disabled. */ /* is_h264 should always be true if SVQ3 is disabled. */
const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264; const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
const int block_h = 16>>s->chroma_y_shift; const int block_h = 16 >> s->chroma_y_shift;
const int chroma422 = CHROMA422;
dest_y = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize ) * 16; dest_y = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize ) * 16;
dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift)*8 + mb_y * s->uvlinesize * block_h; dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift)*8 + mb_y * s->uvlinesize * block_h;
@ -1844,8 +1907,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
block_offset = &h->block_offset[48]; block_offset = &h->block_offset[48];
if(mb_y&1){ //FIXME move out of this function? if(mb_y&1){ //FIXME move out of this function?
dest_y -= s->linesize*15; dest_y -= s->linesize*15;
dest_cb-= s->uvlinesize*(block_h-1); dest_cb-= s->uvlinesize * (block_h - 1);
dest_cr-= s->uvlinesize*(block_h-1); dest_cr-= s->uvlinesize * (block_h - 1);
} }
if(FRAME_MBAFF) { if(FRAME_MBAFF) {
int list; int list;
@ -1884,7 +1947,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
} }
if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
if (!h->sps.chroma_format_idc) { if (!h->sps.chroma_format_idc) {
for (i = 0; i < 8; i++) { for (i = 0; i < block_h; i++) {
uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize); uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize);
uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize); uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize);
for (j = 0; j < 8; j++) { for (j = 0; j < 8; j++) {
@ -1911,13 +1974,13 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
if (!h->sps.chroma_format_idc) { if (!h->sps.chroma_format_idc) {
for (i=0; i<8; i++) { for (i=0; i<8; i++) {
memset(dest_cb+ i*uvlinesize, 1 << (bit_depth - 1), 8); memset(dest_cb + i*uvlinesize, 1 << (bit_depth - 1), 8);
memset(dest_cr+ i*uvlinesize, 1 << (bit_depth - 1), 8); memset(dest_cr + i*uvlinesize, 1 << (bit_depth - 1), 8);
} }
} else { } else {
for (i=0; i<block_h; i++) { for (i=0; i<block_h; i++) {
memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4, 8); memcpy(dest_cb + i*uvlinesize, h->mb + 128 + i*4, 8);
memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4, 8); memcpy(dest_cr + i*uvlinesize, h->mb + 160 + i*4, 8);
} }
} }
} }
@ -1937,11 +2000,21 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
if(h->deblocking_filter) if(h->deblocking_filter)
xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, 0, simple, pixel_shift); xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, 0, simple, pixel_shift);
}else if(is_h264){ }else if(is_h264){
hl_motion(h, dest_y, dest_cb, dest_cr, if (chroma422) {
hl_motion_422(h, dest_y, dest_cb, dest_cr,
s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
h->h264dsp.weight_h264_pixels_tab, h->h264dsp.weight_h264_pixels_tab,
h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 0); h->h264dsp.biweight_h264_pixels_tab,
pixel_shift);
} else {
hl_motion_420(h, dest_y, dest_cb, dest_cr,
s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
h->h264dsp.weight_h264_pixels_tab,
h->h264dsp.biweight_h264_pixels_tab,
pixel_shift);
}
} }
hl_decode_mb_idct_luma(h, mb_type, is_h264, simple, transform_bypass, pixel_shift, block_offset, linesize, dest_y, 0); hl_decode_mb_idct_luma(h, mb_type, is_h264, simple, transform_bypass, pixel_shift, block_offset, linesize, dest_y, 0);
@ -1959,14 +2032,20 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16)) if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
idct_add (dest[j-1] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize); idct_add (dest[j-1] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize);
} }
if (chroma422) {
for(i=j*16+4; i<j*16+8; i++){
if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
idct_add (dest[j-1] + block_offset[i+4], h->mb + (i*16 << pixel_shift), uvlinesize);
}
}
} }
} }
}else{ }else{
if(is_h264){ if(is_h264){
int qp[2]; int qp[2];
if (CHROMA422) { if (chroma422) {
qp[0] = h->chroma_qp[0]+3; qp[0] = h->chroma_qp[0] + 3;
qp[1] = h->chroma_qp[1]+3; qp[1] = h->chroma_qp[1] + 3;
} else { } else {
qp[0] = h->chroma_qp[0]; qp[0] = h->chroma_qp[0];
qp[1] = h->chroma_qp[1]; qp[1] = h->chroma_qp[1];
@ -2086,7 +2165,7 @@ static av_always_inline void hl_decode_mb_444_internal(H264Context *h, int simpl
s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
h->h264dsp.weight_h264_pixels_tab, h->h264dsp.weight_h264_pixels_tab,
h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 1); h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 3);
} }
for (p = 0; p < plane_count; p++) for (p = 0; p < plane_count; p++)
@ -2690,6 +2769,8 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
case 9 : case 9 :
if (CHROMA444) if (CHROMA444)
s->avctx->pix_fmt = PIX_FMT_YUV444P9; s->avctx->pix_fmt = PIX_FMT_YUV444P9;
else if (CHROMA422)
s->avctx->pix_fmt = PIX_FMT_YUV422P9;
else else
s->avctx->pix_fmt = PIX_FMT_YUV420P9; s->avctx->pix_fmt = PIX_FMT_YUV420P9;
break; break;
@ -2708,7 +2789,7 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
s->avctx->pix_fmt = PIX_FMT_GBR24P; s->avctx->pix_fmt = PIX_FMT_GBR24P;
av_log(h->s.avctx, AV_LOG_DEBUG, "Detected GBR colorspace.\n"); av_log(h->s.avctx, AV_LOG_DEBUG, "Detected GBR colorspace.\n");
} }
}else if (CHROMA422) { } else if (CHROMA422) {
s->avctx->pix_fmt = s->avctx->color_range == AVCOL_RANGE_JPEG ? PIX_FMT_YUVJ422P : PIX_FMT_YUV422P; s->avctx->pix_fmt = s->avctx->color_range == AVCOL_RANGE_JPEG ? PIX_FMT_YUVJ422P : PIX_FMT_YUV422P;
}else{ }else{
s->avctx->pix_fmt = s->avctx->get_format(s->avctx, s->avctx->pix_fmt = s->avctx->get_format(s->avctx,
@ -3384,7 +3465,7 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
const int end_mb_y= s->mb_y + FRAME_MBAFF; const int end_mb_y= s->mb_y + FRAME_MBAFF;
const int old_slice_type= h->slice_type; const int old_slice_type= h->slice_type;
const int pixel_shift = h->pixel_shift; const int pixel_shift = h->pixel_shift;
const int block_h = 16>>s->chroma_y_shift; const int block_h = 16 >> s->chroma_y_shift;
if(h->deblocking_filter) { if(h->deblocking_filter) {
for(mb_x= start_x; mb_x<end_x; mb_x++){ for(mb_x= start_x; mb_x<end_x; mb_x++){
@ -3401,8 +3482,8 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
s->mb_x= mb_x; s->mb_x= mb_x;
s->mb_y= mb_y; s->mb_y= mb_y;
dest_y = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize ) * 16; dest_y = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize ) * 16;
dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift)*(8<<CHROMA444) + mb_y * s->uvlinesize * block_h; dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift) * (8 << CHROMA444) + mb_y * s->uvlinesize * block_h;
dest_cr = s->current_picture.f.data[2] + (mb_x << pixel_shift)*(8<<CHROMA444) + mb_y * s->uvlinesize * block_h; dest_cr = s->current_picture.f.data[2] + (mb_x << pixel_shift) * (8 << CHROMA444) + mb_y * s->uvlinesize * block_h;
//FIXME simplify above //FIXME simplify above
if (MB_FIELD) { if (MB_FIELD) {
@ -3410,8 +3491,8 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2; uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
if(mb_y&1){ //FIXME move out of this function? if(mb_y&1){ //FIXME move out of this function?
dest_y -= s->linesize*15; dest_y -= s->linesize*15;
dest_cb-= s->uvlinesize*(block_h-1); dest_cb-= s->uvlinesize * (block_h - 1);
dest_cr-= s->uvlinesize*(block_h-1); dest_cr-= s->uvlinesize * (block_h - 1);
} }
} else { } else {
linesize = h->mb_linesize = s->linesize; linesize = h->mb_linesize = s->linesize;

View File

@ -1565,7 +1565,12 @@ DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = {
5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
}; };
static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) { static av_always_inline void
decode_cabac_residual_internal(H264Context *h, DCTELEM *block,
int cat, int n, const uint8_t *scantable,
const uint32_t *qmul, int max_coeff,
int is_dc, int chroma422)
{
static const int significant_coeff_flag_offset[2][14] = { static const int significant_coeff_flag_offset[2][14] = {
{ 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 }, { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 },
{ 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 } { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 }
@ -1593,7 +1598,10 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
* map node ctx => cabac ctx for level=1 */ * map node ctx => cabac ctx for level=1 */
static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
/* map node ctx => cabac ctx for level>1 */ /* map node ctx => cabac ctx for level>1 */
static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; static const uint8_t coeff_abs_levelgt1_ctx[2][8] = {
{ 5, 5, 5, 5, 6, 7, 8, 9 },
{ 5, 5, 5, 5, 6, 7, 8, 8 }, // 422/dc case
};
static const uint8_t coeff_abs_level_transition[2][8] = { static const uint8_t coeff_abs_level_transition[2][8] = {
/* update node ctx after decoding a level=1 */ /* update node ctx after decoding a level=1 */
{ 1, 2, 3, 3, 4, 5, 6, 7 }, { 1, 2, 3, 3, 4, 5, 6, 7 },
@ -1652,7 +1660,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index,
last_coeff_ctx_base, sig_off); last_coeff_ctx_base, sig_off);
} else { } else {
if (is_dc && max_coeff == 8) { // dc 422 if (is_dc && chroma422) { // dc 422
DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]); DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]);
} else { } else {
coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index, coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index,
@ -1661,7 +1669,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
#else #else
DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] ); DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
} else { } else {
if (is_dc && max_coeff == 8) { // dc 422 if (is_dc && chroma422) { // dc 422
DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]); DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]);
} else { } else {
DECODE_SIGNIFICANCE(max_coeff - 1, last, last); DECODE_SIGNIFICANCE(max_coeff - 1, last, last);
@ -1701,9 +1709,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
} \ } \
} else { \ } else { \
int coeff_abs = 2; \ int coeff_abs = 2; \
if (is_dc && max_coeff == 8) \ ctx = coeff_abs_levelgt1_ctx[is_dc && chroma422][node_ctx] + abs_level_m1_ctx_base; \
node_ctx = FFMIN(node_ctx, 6); \
ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base; \
node_ctx = coeff_abs_level_transition[1][node_ctx]; \ node_ctx = coeff_abs_level_transition[1][node_ctx]; \
\ \
while( coeff_abs < 15 && get_cabac( CC, ctx ) ) { \ while( coeff_abs < 15 && get_cabac( CC, ctx ) ) { \
@ -1745,11 +1751,18 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
} }
static void decode_cabac_residual_dc_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) { static void decode_cabac_residual_dc_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) {
decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1); decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1, 0);
}
static void decode_cabac_residual_dc_internal_422(H264Context *h, DCTELEM *block,
int cat, int n, const uint8_t *scantable,
int max_coeff)
{
decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1, 1);
} }
static void decode_cabac_residual_nondc_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) { static void decode_cabac_residual_nondc_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0); decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0, 0);
} }
/* cat: 0-> DC 16x16 n = 0 /* cat: 0-> DC 16x16 n = 0
@ -1773,6 +1786,19 @@ static av_always_inline void decode_cabac_residual_dc( H264Context *h, DCTELEM *
decode_cabac_residual_dc_internal( h, block, cat, n, scantable, max_coeff ); decode_cabac_residual_dc_internal( h, block, cat, n, scantable, max_coeff );
} }
static av_always_inline void
decode_cabac_residual_dc_422(H264Context *h, DCTELEM *block,
int cat, int n, const uint8_t *scantable,
int max_coeff)
{
/* read coded block flag */
if (get_cabac(&h->cabac, &h->cabac_state[get_cabac_cbf_ctx(h, cat, n, max_coeff, 1)]) == 0) {
h->non_zero_count_cache[scan8[n]] = 0;
return;
}
decode_cabac_residual_dc_internal_422(h, block, cat, n, scantable, max_coeff);
}
static av_always_inline void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) { static av_always_inline void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
/* read coded block flag */ /* read coded block flag */
if( (cat != 5 || CHROMA444) && get_cabac( &h->cabac, &h->cabac_state[get_cabac_cbf_ctx( h, cat, n, max_coeff, 0 ) ] ) == 0 ) { if( (cat != 5 || CHROMA444) && get_cabac( &h->cabac, &h->cabac_state[get_cabac_cbf_ctx( h, cat, n, max_coeff, 0 ) ] ) == 0 ) {
@ -2325,17 +2351,14 @@ decode_intra_mb:
if(CHROMA444){ if(CHROMA444){
decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 1); decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 1);
decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 2); decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 2);
} else { } else if (CHROMA422) {
const int num_c8x8 = h->sps.chroma_format_idc;
if( cbp&0x30 ){ if( cbp&0x30 ){
int c; int c;
for( c = 0; c < 2; c++ ) { for( c = 0; c < 2; c++ ) {
//av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c ); //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
decode_cabac_residual_dc(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3, decode_cabac_residual_dc_422(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3,
CHROMA_DC_BLOCK_INDEX+c, CHROMA_DC_BLOCK_INDEX + c,
CHROMA422 ? chroma422_dc_scan : chroma_dc_scan, chroma422_dc_scan, 8);
4*num_c8x8);
} }
} }
@ -2344,7 +2367,7 @@ decode_intra_mb:
for( c = 0; c < 2; c++ ) { for( c = 0; c < 2; c++ ) {
DCTELEM *mb = h->mb + (16*(16 + 16*c) << pixel_shift); DCTELEM *mb = h->mb + (16*(16 + 16*c) << pixel_shift);
qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]]; qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
for (i8x8 = 0; i8x8 < num_c8x8; i8x8++) { for (i8x8 = 0; i8x8 < 2; i8x8++) {
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
const int index = 16 + 16 * c + 8*i8x8 + i; const int index = 16 + 16 * c + 8*i8x8 + i;
//av_log(s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16); //av_log(s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16);
@ -2357,6 +2380,29 @@ decode_intra_mb:
fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1); fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1); fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
} }
} else /* yuv420 */ {
if( cbp&0x30 ){
int c;
for( c = 0; c < 2; c++ ) {
//av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
decode_cabac_residual_dc(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4);
}
}
if( cbp&0x20 ) {
int c, i;
for( c = 0; c < 2; c++ ) {
qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
for( i = 0; i < 4; i++ ) {
const int index = 16 + 16 * c + i;
//av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 4, index, scan + 1, qmul, 15);
}
}
} else {
fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
}
} }
} else { } else {
fill_rectangle(&h->non_zero_count_cache[scan8[ 0]], 4, 4, 8, 0, 1); fill_rectangle(&h->non_zero_count_cache[scan8[ 0]], 4, 4, 8, 0, 1);
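The split above mirrors how much chroma residual a macroblock carries in each subsampling mode; a minimal sketch of the per-plane counts (illustrative helper, not code from this change):

/* chroma_format_idc: 1 = 4:2:0, 2 = 4:2:2 (the old num_c8x8 value) */
static void chroma_residual_shape(int chroma_format_idc,
                                  int *dc_coeffs, int *ac_blocks)
{
    int num_c8x8 = chroma_format_idc;  /* 8x8 coefficient groups per chroma plane */
    *dc_coeffs = 4 * num_c8x8;         /* 4 (2x2 DC) for 4:2:0, 8 (2x4 DC) for 4:2:2 */
    *ac_blocks = 4 * num_c8x8;         /* 4x4 AC blocks decoded per plane */
}

This is why the 4:2:0 branch reads a 4-coefficient DC block with chroma_dc_scan, while the 4:2:2 branch calls the dedicated decode_cabac_residual_dc_422() with the 8-entry chroma422_dc_scan and walks two 8x8 AC groups per plane.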


@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo
else\ else\
c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\ c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\
\ \
c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\ c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\
c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\ c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\
c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\ c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\
c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\ c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\
c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\ c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\
c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\ c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\
c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\ c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\
c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\ c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\
c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\
c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\
c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\
c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\
c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\
c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\
c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\
c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\
c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\
c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\
c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\
c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\
\ \
c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\ c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\
c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\ c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\

View File

@ -31,16 +31,18 @@
#include "dsputil.h" #include "dsputil.h"
//typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); //typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset); typedef void (*h264_weight_func)(uint8_t *block, int stride, int height,
typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset); int log2_denom, int weight, int offset);
typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height,
int log2_denom, int weightd, int weights, int offset);
/** /**
* Context for storing H.264 DSP functions * Context for storing H.264 DSP functions
*/ */
typedef struct H264DSPContext{ typedef struct H264DSPContext{
/* weighted MC */ /* weighted MC */
h264_weight_func weight_h264_pixels_tab[10]; h264_weight_func weight_h264_pixels_tab[4];
h264_biweight_func biweight_h264_pixels_tab[10]; h264_biweight_func biweight_h264_pixels_tab[4];
/* loop filter */ /* loop filter */
void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);


@ -29,14 +29,16 @@
#define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom ) #define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) #define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \ #define H264_WEIGHT(W) \
static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *p_block, int stride, int log2_denom, int weight, int offset){ \ static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \
int log2_denom, int weight, int offset) \
{ \
int y; \ int y; \
pixel *block = (pixel*)p_block; \ pixel *block = (pixel*)_block; \
stride >>= sizeof(pixel)-1; \ stride >>= sizeof(pixel)-1; \
offset <<= (log2_denom + (BIT_DEPTH-8)); \ offset <<= (log2_denom + (BIT_DEPTH-8)); \
if(log2_denom) offset += 1<<(log2_denom-1); \ if(log2_denom) offset += 1<<(log2_denom-1); \
for(y=0; y<H; y++, block += stride){ \ for (y = 0; y < height; y++, block += stride) { \
op_scale1(0); \ op_scale1(0); \
op_scale1(1); \ op_scale1(1); \
if(W==2) continue; \ if(W==2) continue; \
@ -58,14 +60,16 @@ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *p_block, int strid
op_scale1(15); \ op_scale1(15); \
} \ } \
} \ } \
static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \ static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \
int log2_denom, int weightd, int weights, int offset) \
{ \
int y; \ int y; \
pixel *dst = (pixel*)_dst; \ pixel *dst = (pixel*)_dst; \
pixel *src = (pixel*)_src; \ pixel *src = (pixel*)_src; \
stride >>= sizeof(pixel)-1; \ stride >>= sizeof(pixel)-1; \
offset <<= (BIT_DEPTH-8); \ offset <<= (BIT_DEPTH-8); \
offset = ((offset + 1) | 1) << log2_denom; \ offset = ((offset + 1) | 1) << log2_denom; \
for(y=0; y<H; y++, dst += stride, src += stride){ \ for (y = 0; y < height; y++, dst += stride, src += stride) { \
op_scale2(0); \ op_scale2(0); \
op_scale2(1); \ op_scale2(1); \
if(W==2) continue; \ if(W==2) continue; \
@ -88,16 +92,10 @@ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_
} \ } \
} }
H264_WEIGHT(16,16) H264_WEIGHT(16)
H264_WEIGHT(16,8) H264_WEIGHT(8)
H264_WEIGHT(8,16) H264_WEIGHT(4)
H264_WEIGHT(8,8) H264_WEIGHT(2)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)
#undef op_scale1 #undef op_scale1
#undef op_scale2 #undef op_scale2
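With the WxH variants collapsed into width-only functions, the block height becomes a run-time argument: the DSP table now holds widths 16/8/4/2, and a caller handling a 16x16 partition would invoke weight_h264_pixels_tab[0](dst, stride, 16, log2_denom, weight, offset). A minimal 8-bit C sketch of what one such call computes (illustrative only; clip_uint8 stands in for av_clip_pixel at BIT_DEPTH 8, and the name weight_ref is hypothetical):

static unsigned char clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* Same arithmetic as the old WxH variants; only the row count is now a parameter. */
static void weight_ref(unsigned char *block, int stride, int width, int height,
                       int log2_denom, int weight, int offset)
{
    int x, y;
    offset <<= log2_denom;
    if (log2_denom)
        offset += 1 << (log2_denom - 1);   /* rounding term */
    for (y = 0; y < height; y++, block += stride)
        for (x = 0; x < width; x++)
            block[x] = clip_uint8((block[x] * weight + offset) >> log2_denom);
}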


@ -228,16 +228,6 @@ void FUNCC(ff_h264_idct_add8)(uint8_t **dest, const int *block_offset, DCTELEM *
void FUNCC(ff_h264_idct_add8_422)(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){ void FUNCC(ff_h264_idct_add8_422)(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
int i, j; int i, j;
#if 0
av_log(NULL, AV_LOG_INFO, "idct\n");
int32_t *b = block;
for (int i = 0; i < 256; i++) {
av_log(NULL, AV_LOG_INFO, "%5d ", b[i+256]);
if (!((i+1) % 16))
av_log(NULL, AV_LOG_INFO, "\n");
}
#endif
for(j=1; j<3; j++){ for(j=1; j<3; j++){
for(i=j*16; i<j*16+4; i++){ for(i=j*16; i<j*16+4; i++){
if(nnzc[ scan8[i] ]) if(nnzc[ scan8[i] ])
@ -296,13 +286,13 @@ void FUNCC(ff_h264_luma_dc_dequant_idct)(DCTELEM *p_output, DCTELEM *p_input, in
#undef stride #undef stride
} }
void FUNCC(ff_h264_chroma422_dc_dequant_idct)(DCTELEM *p_block, int qmul){ void FUNCC(ff_h264_chroma422_dc_dequant_idct)(DCTELEM *_block, int qmul){
const int stride= 16*2; const int stride= 16*2;
const int xStride= 16; const int xStride= 16;
int i; int i;
int temp[8]; int temp[8];
static const uint8_t x_offset[2]={0, 16}; static const uint8_t x_offset[2]={0, 16};
dctcoef *block = (dctcoef*)p_block; dctcoef *block = (dctcoef*)_block;
for(i=0; i<4; i++){ for(i=0; i<4; i++){
temp[2*i+0] = block[stride*i + xStride*0] + block[stride*i + xStride*1]; temp[2*i+0] = block[stride*i + xStride*0] + block[stride*i + xStride*1];
@ -321,22 +311,13 @@ void FUNCC(ff_h264_chroma422_dc_dequant_idct)(DCTELEM *p_block, int qmul){
block[stride*2+offset]= ((z1 - z2)*qmul + 128) >> 8; block[stride*2+offset]= ((z1 - z2)*qmul + 128) >> 8;
block[stride*3+offset]= ((z0 - z3)*qmul + 128) >> 8; block[stride*3+offset]= ((z0 - z3)*qmul + 128) >> 8;
} }
#if 0
av_log(NULL, AV_LOG_INFO, "after chroma dc\n");
for (int i = 0; i < 256; i++) {
av_log(NULL, AV_LOG_INFO, "%5d ", block[i]);
if (!((i+1) % 16))
av_log(NULL, AV_LOG_INFO, "\n");
}
#endif
} }
void FUNCC(ff_h264_chroma_dc_dequant_idct)(DCTELEM *p_block, int qmul){ void FUNCC(ff_h264_chroma_dc_dequant_idct)(DCTELEM *_block, int qmul){
const int stride= 16*2; const int stride= 16*2;
const int xStride= 16; const int xStride= 16;
int a,b,c,d,e; int a,b,c,d,e;
dctcoef *block = (dctcoef*)p_block; dctcoef *block = (dctcoef*)_block;
a= block[stride*0 + xStride*0]; a= block[stride*0 + xStride*0];
b= block[stride*0 + xStride*1]; b= block[stride*0 + xStride*1];


@ -462,10 +462,10 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, co
h->pred8x8[DC_PRED8x8 ]= FUNCC(pred8x16_dc , depth);\ h->pred8x8[DC_PRED8x8 ]= FUNCC(pred8x16_dc , depth);\
h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x16_left_dc , depth);\ h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x16_left_dc , depth);\
h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x16_top_dc , depth);\ h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x16_top_dc , depth);\
h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l0t, depth);\ h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_l0t, depth);\
h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0lt, depth);\ h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_0lt, depth);\
h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l00, depth);\ h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_l00, depth);\
h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0l0, depth);\ h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_0l0, depth);\
}\ }\
}else{\ }else{\
h->pred8x8[DC_PRED8x8 ]= FUNCD(pred8x8_dc_rv40);\ h->pred8x8[DC_PRED8x8 ]= FUNCD(pred8x8_dc_rv40);\
@ -510,8 +510,13 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, co
h->pred4x4_add [ HOR_PRED ]= FUNCC(pred4x4_horizontal_add , depth);\ h->pred4x4_add [ HOR_PRED ]= FUNCC(pred4x4_horizontal_add , depth);\
h->pred8x8l_add [VERT_PRED ]= FUNCC(pred8x8l_vertical_add , depth);\ h->pred8x8l_add [VERT_PRED ]= FUNCC(pred8x8l_vertical_add , depth);\
h->pred8x8l_add [ HOR_PRED ]= FUNCC(pred8x8l_horizontal_add , depth);\ h->pred8x8l_add [ HOR_PRED ]= FUNCC(pred8x8l_horizontal_add , depth);\
if (chroma_format_idc == 1) {\
h->pred8x8_add [VERT_PRED8x8]= FUNCC(pred8x8_vertical_add , depth);\ h->pred8x8_add [VERT_PRED8x8]= FUNCC(pred8x8_vertical_add , depth);\
h->pred8x8_add [ HOR_PRED8x8]= FUNCC(pred8x8_horizontal_add , depth);\ h->pred8x8_add [ HOR_PRED8x8]= FUNCC(pred8x8_horizontal_add , depth);\
} else {\
h->pred8x8_add [VERT_PRED8x8]= FUNCC(pred8x16_vertical_add , depth);\
h->pred8x8_add [ HOR_PRED8x8]= FUNCC(pred8x16_horizontal_add , depth);\
}\
h->pred16x16_add[VERT_PRED8x8]= FUNCC(pred16x16_vertical_add , depth);\ h->pred16x16_add[VERT_PRED8x8]= FUNCC(pred16x16_vertical_add , depth);\
h->pred16x16_add[ HOR_PRED8x8]= FUNCC(pred16x16_horizontal_add , depth);\ h->pred16x16_add[ HOR_PRED8x8]= FUNCC(pred16x16_horizontal_add , depth);\


@ -663,23 +663,45 @@ static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
FUNCC(pred4x4_dc)(src, NULL, stride); FUNCC(pred4x4_dc)(src, NULL, stride);
} }
static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, int stride){
FUNCC(pred8x16_top_dc)(src, stride);
FUNCC(pred4x4_dc)(src, NULL, stride);
}
static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){ static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){
FUNCC(pred8x8_dc)(src, stride); FUNCC(pred8x8_dc)(src, stride);
FUNCC(pred4x4_top_dc)(src, NULL, stride); FUNCC(pred4x4_top_dc)(src, NULL, stride);
} }
static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, int stride){
FUNCC(pred8x16_dc)(src, stride);
FUNCC(pred4x4_top_dc)(src, NULL, stride);
}
static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){ static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){
FUNCC(pred8x8_left_dc)(src, stride); FUNCC(pred8x8_left_dc)(src, stride);
FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride); FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride);
FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride); FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
} }
static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, int stride){
FUNCC(pred8x16_left_dc)(src, stride);
FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride);
FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
}
static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){ static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){
FUNCC(pred8x8_left_dc)(src, stride); FUNCC(pred8x8_left_dc)(src, stride);
FUNCC(pred4x4_128_dc)(src , NULL, stride); FUNCC(pred4x4_128_dc)(src , NULL, stride);
FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride); FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
} }
static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, int stride){
FUNCC(pred8x16_left_dc)(src, stride);
FUNCC(pred4x4_128_dc)(src , NULL, stride);
FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
}
static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){ static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){
int j, k; int j, k;
int a; int a;
@ -1126,8 +1148,24 @@ static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, c
FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
} }
static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
int i;
for(i=0; i<4; i++)
FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
for(i=4; i<8; i++)
FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
}
static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
int i; int i;
for(i=0; i<4; i++) for(i=0; i<4; i++)
FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
} }
static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
int i;
for(i=0; i<4; i++)
FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
for(i=4; i<8; i++)
FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
}


@ -18,11 +18,11 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/ */
#include "avcodec.h"
#include <speex/speex.h> #include <speex/speex.h>
#include <speex/speex_header.h> #include <speex/speex_header.h>
#include <speex/speex_stereo.h> #include <speex/speex_stereo.h>
#include <speex/speex_callbacks.h> #include <speex/speex_callbacks.h>
#include "avcodec.h"
typedef struct { typedef struct {
SpeexBits bits; SpeexBits bits;
@ -60,14 +60,14 @@ static av_cold int libspeex_decode_init(AVCodecContext *avctx)
mode = speex_lib_get_mode(s->header->mode); mode = speex_lib_get_mode(s->header->mode);
if (!mode) { if (!mode) {
av_log(avctx, AV_LOG_ERROR, "Unknown Speex mode %d", s->header->mode); av_log(avctx, AV_LOG_ERROR, "Unknown Speex mode %d", s->header->mode);
return -1; return AVERROR_INVALIDDATA;
} }
} else } else
av_log(avctx, AV_LOG_INFO, "Missing Speex header, assuming defaults.\n"); av_log(avctx, AV_LOG_INFO, "Missing Speex header, assuming defaults.\n");
if (avctx->channels > 2) { if (avctx->channels > 2) {
av_log(avctx, AV_LOG_ERROR, "Only stereo and mono are supported.\n"); av_log(avctx, AV_LOG_ERROR, "Only stereo and mono are supported.\n");
return -1; return AVERROR(EINVAL);
} }
speex_bits_init(&s->bits); speex_bits_init(&s->bits);
@ -99,32 +99,42 @@ static int libspeex_decode_frame(AVCodecContext *avctx,
uint8_t *buf = avpkt->data; uint8_t *buf = avpkt->data;
int buf_size = avpkt->size; int buf_size = avpkt->size;
LibSpeexContext *s = avctx->priv_data; LibSpeexContext *s = avctx->priv_data;
int16_t *output = data, *end; int16_t *output = data;
int i, num_samples; int out_size, ret, consumed = 0;
num_samples = s->frame_size * avctx->channels; /* check output buffer size */
end = output + *data_size / sizeof(*output); out_size = s->frame_size * avctx->channels *
av_get_bytes_per_sample(avctx->sample_fmt);
if (*data_size < out_size) {
av_log(avctx, AV_LOG_ERROR, "Output buffer is too small\n");
return AVERROR(EINVAL);
}
/* if there is not enough data left for the smallest possible frame,
reset the libspeex buffer using the current packet, otherwise ignore
the current packet and keep decoding frames from the libspeex buffer. */
if (speex_bits_remaining(&s->bits) < 43) {
/* check for flush packet */
if (!buf || !buf_size) {
*data_size = 0;
return buf_size;
}
/* set new buffer */
speex_bits_read_from(&s->bits, buf, buf_size); speex_bits_read_from(&s->bits, buf, buf_size);
consumed = buf_size;
}
for (i = 0; speex_bits_remaining(&s->bits) && output + num_samples < end; i++) { /* decode a single frame */
int ret = speex_decode_int(s->dec_state, &s->bits, output); ret = speex_decode_int(s->dec_state, &s->bits, output);
if (ret <= -2) { if (ret <= -2) {
av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n"); av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n");
return -1; return AVERROR_INVALIDDATA;
} else if (ret == -1) }
// end of stream
break;
if (avctx->channels == 2) if (avctx->channels == 2)
speex_decode_stereo_int(output, s->frame_size, &s->stereo); speex_decode_stereo_int(output, s->frame_size, &s->stereo);
output += num_samples; *data_size = out_size;
} return consumed;
avctx->frame_size = s->frame_size * i;
*data_size = avctx->channels * avctx->frame_size * sizeof(*output);
return buf_size;
} }
static av_cold int libspeex_decode_close(AVCodecContext *avctx) static av_cold int libspeex_decode_close(AVCodecContext *avctx)
@ -138,6 +148,12 @@ static av_cold int libspeex_decode_close(AVCodecContext *avctx)
return 0; return 0;
} }
static av_cold void libspeex_decode_flush(AVCodecContext *avctx)
{
LibSpeexContext *s = avctx->priv_data;
speex_bits_reset(&s->bits);
}
AVCodec ff_libspeex_decoder = { AVCodec ff_libspeex_decoder = {
.name = "libspeex", .name = "libspeex",
.type = AVMEDIA_TYPE_AUDIO, .type = AVMEDIA_TYPE_AUDIO,
@ -146,5 +162,7 @@ AVCodec ff_libspeex_decoder = {
.init = libspeex_decode_init, .init = libspeex_decode_init,
.close = libspeex_decode_close, .close = libspeex_decode_close,
.decode = libspeex_decode_frame, .decode = libspeex_decode_frame,
.flush = libspeex_decode_flush,
.capabilities = CODEC_CAP_SUBFRAMES | CODEC_CAP_DELAY,
.long_name = NULL_IF_CONFIG_SMALL("libspeex Speex"), .long_name = NULL_IF_CONFIG_SMALL("libspeex Speex"),
}; };


@ -1893,24 +1893,50 @@ typedef struct MP3On4DecodeContext {
int syncword; ///< syncword patch int syncword; ///< syncword patch
const uint8_t *coff; ///< channels offsets in output buffer const uint8_t *coff; ///< channels offsets in output buffer
MPADecodeContext *mp3decctx[5]; ///< MPADecodeContext for every decoder instance MPADecodeContext *mp3decctx[5]; ///< MPADecodeContext for every decoder instance
OUT_INT *decoded_buf; ///< output buffer for decoded samples
} MP3On4DecodeContext; } MP3On4DecodeContext;
#include "mpeg4audio.h" #include "mpeg4audio.h"
/* Next 3 arrays are indexed by channel config number (passed via codecdata) */ /* Next 3 arrays are indexed by channel config number (passed via codecdata) */
static const uint8_t mp3Frames[8] = {0,1,1,2,3,3,4,5}; /* number of mp3 decoder instances */ static const uint8_t mp3Frames[8] = {0,1,1,2,3,3,4,5}; /* number of mp3 decoder instances */
/* offsets into output buffer, assume output order is FL FR BL BR C LFE */ /* offsets into output buffer, assume output order is FL FR C LFE BL BR SL SR */
static const uint8_t chan_offset[8][5] = { static const uint8_t chan_offset[8][5] = {
{0}, {0},
{0}, // C {0}, // C
{0}, // FLR {0}, // FLR
{2,0}, // C FLR {2,0}, // C FLR
{2,0,3}, // C FLR BS {2,0,3}, // C FLR BS
{4,0,2}, // C FLR BLRS {2,0,3}, // C FLR BLRS
{4,0,2,5}, // C FLR BLRS LFE {2,0,4,3}, // C FLR BLRS LFE
{4,0,2,6,5}, // C FLR BLRS BLR LFE {2,0,6,4,3}, // C FLR BLRS BLR LFE
}; };
/* mp3on4 channel layouts */
static const int16_t chan_layout[8] = {
0,
AV_CH_LAYOUT_MONO,
AV_CH_LAYOUT_STEREO,
AV_CH_LAYOUT_SURROUND,
AV_CH_LAYOUT_4POINT0,
AV_CH_LAYOUT_5POINT0,
AV_CH_LAYOUT_5POINT1,
AV_CH_LAYOUT_7POINT1
};
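As a worked example of the corrected ordering: for channel config 6 (5.1), mp3Frames gives four MP3 sub-streams, carried in the order C, FLR, BLRS, LFE as the table comments put it. The new chan_offset[6] = {2,0,4,3} drops them at output channel indices 2, 0-1, 4-5 and 3, which is exactly the FL FR C LFE BL BR order stated above. The copy loop in decode_frame_mp3on4() then boils down to this sketch (simplified, hypothetical helper name):

#include <stdint.h>

/* Copy one decoded sub-stream (1 or 2 channels, interleaved within itself)
 * into the final interleaved output buffer at its channel offset. */
static void interleave_substream(int16_t *out, const int16_t *sub, int nb_samples,
                                 int sub_channels, int out_channels, int offset)
{
    int i, c;
    for (i = 0; i < nb_samples; i++)
        for (c = 0; c < sub_channels; c++)
            out[i * out_channels + offset + c] = sub[i * sub_channels + c];
}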
static av_cold int decode_close_mp3on4(AVCodecContext * avctx)
{
MP3On4DecodeContext *s = avctx->priv_data;
int i;
for (i = 0; i < s->frames; i++)
av_free(s->mp3decctx[i]);
av_freep(&s->decoded_buf);
return 0;
}
static int decode_init_mp3on4(AVCodecContext * avctx) static int decode_init_mp3on4(AVCodecContext * avctx)
{ {
@ -1931,6 +1957,7 @@ static int decode_init_mp3on4(AVCodecContext * avctx)
s->frames = mp3Frames[cfg.chan_config]; s->frames = mp3Frames[cfg.chan_config];
s->coff = chan_offset[cfg.chan_config]; s->coff = chan_offset[cfg.chan_config];
avctx->channels = ff_mpeg4audio_channels[cfg.chan_config]; avctx->channels = ff_mpeg4audio_channels[cfg.chan_config];
avctx->channel_layout = chan_layout[cfg.chan_config];
if (cfg.sample_rate < 16000) if (cfg.sample_rate < 16000)
s->syncword = 0xffe00000; s->syncword = 0xffe00000;
@ -1944,6 +1971,8 @@ static int decode_init_mp3on4(AVCodecContext * avctx)
*/ */
// Allocate zeroed memory for the first decoder context // Allocate zeroed memory for the first decoder context
s->mp3decctx[0] = av_mallocz(sizeof(MPADecodeContext)); s->mp3decctx[0] = av_mallocz(sizeof(MPADecodeContext));
if (!s->mp3decctx[0])
goto alloc_fail;
// Put decoder context in place to make init_decode() happy // Put decoder context in place to make init_decode() happy
avctx->priv_data = s->mp3decctx[0]; avctx->priv_data = s->mp3decctx[0];
decode_init(avctx); decode_init(avctx);
@ -1956,23 +1985,38 @@ static int decode_init_mp3on4(AVCodecContext * avctx)
*/ */
for (i = 1; i < s->frames; i++) { for (i = 1; i < s->frames; i++) {
s->mp3decctx[i] = av_mallocz(sizeof(MPADecodeContext)); s->mp3decctx[i] = av_mallocz(sizeof(MPADecodeContext));
if (!s->mp3decctx[i])
goto alloc_fail;
s->mp3decctx[i]->adu_mode = 1; s->mp3decctx[i]->adu_mode = 1;
s->mp3decctx[i]->avctx = avctx; s->mp3decctx[i]->avctx = avctx;
s->mp3decctx[i]->mpadsp = s->mp3decctx[0]->mpadsp;
}
/* Allocate buffer for multi-channel output if needed */
if (s->frames > 1) {
s->decoded_buf = av_malloc(MPA_FRAME_SIZE * MPA_MAX_CHANNELS *
sizeof(*s->decoded_buf));
if (!s->decoded_buf)
goto alloc_fail;
} }
return 0; return 0;
alloc_fail:
decode_close_mp3on4(avctx);
return AVERROR(ENOMEM);
} }
static av_cold int decode_close_mp3on4(AVCodecContext * avctx) static void flush_mp3on4(AVCodecContext *avctx)
{ {
MP3On4DecodeContext *s = avctx->priv_data;
int i; int i;
MP3On4DecodeContext *s = avctx->priv_data;
for (i = 0; i < s->frames; i++) for (i = 0; i < s->frames; i++) {
av_free(s->mp3decctx[i]); MPADecodeContext *m = s->mp3decctx[i];
memset(m->synth_buf, 0, sizeof(m->synth_buf));
return 0; m->last_buf_size = 0;
}
} }
@ -1987,12 +2031,13 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
int fsize, len = buf_size, out_size = 0; int fsize, len = buf_size, out_size = 0;
uint32_t header; uint32_t header;
OUT_INT *out_samples = data; OUT_INT *out_samples = data;
OUT_INT decoded_buf[MPA_FRAME_SIZE * MPA_MAX_CHANNELS];
OUT_INT *outptr, *bp; OUT_INT *outptr, *bp;
int fr, j, n; int fr, j, n, ch;
if(*data_size < MPA_FRAME_SIZE * MPA_MAX_CHANNELS * s->frames * sizeof(OUT_INT)) if (*data_size < MPA_FRAME_SIZE * avctx->channels * sizeof(OUT_INT)) {
return -1; av_log(avctx, AV_LOG_ERROR, "output buffer is too small\n");
return AVERROR(EINVAL);
}
*data_size = 0; *data_size = 0;
// Discard too short frames // Discard too short frames
@ -2000,10 +2045,11 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
return -1; return -1;
// If only one decoder interleave is not needed // If only one decoder interleave is not needed
outptr = s->frames == 1 ? out_samples : decoded_buf; outptr = s->frames == 1 ? out_samples : s->decoded_buf;
avctx->bit_rate = 0; avctx->bit_rate = 0;
ch = 0;
for (fr = 0; fr < s->frames; fr++) { for (fr = 0; fr < s->frames; fr++) {
fsize = AV_RB16(buf) >> 4; fsize = AV_RB16(buf) >> 4;
fsize = FFMIN3(fsize, len, MPA_MAX_CODED_FRAME_SIZE); fsize = FFMIN3(fsize, len, MPA_MAX_CODED_FRAME_SIZE);
@ -2016,6 +2062,14 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
break; break;
avpriv_mpegaudio_decode_header((MPADecodeHeader *)m, header); avpriv_mpegaudio_decode_header((MPADecodeHeader *)m, header);
if (ch + m->nb_channels > avctx->channels) {
av_log(avctx, AV_LOG_ERROR, "frame channel count exceeds codec "
"channel count\n");
return AVERROR_INVALIDDATA;
}
ch += m->nb_channels;
out_size += mp_decode_frame(m, outptr, buf, fsize); out_size += mp_decode_frame(m, outptr, buf, fsize);
buf += fsize; buf += fsize;
len -= fsize; len -= fsize;
@ -2026,13 +2080,13 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
bp = out_samples + s->coff[fr]; bp = out_samples + s->coff[fr];
if(m->nb_channels == 1) { if(m->nb_channels == 1) {
for(j = 0; j < n; j++) { for(j = 0; j < n; j++) {
*bp = decoded_buf[j]; *bp = s->decoded_buf[j];
bp += avctx->channels; bp += avctx->channels;
} }
} else { } else {
for(j = 0; j < n; j++) { for(j = 0; j < n; j++) {
bp[0] = decoded_buf[j++]; bp[0] = s->decoded_buf[j++];
bp[1] = decoded_buf[j]; bp[1] = s->decoded_buf[j];
bp += avctx->channels; bp += avctx->channels;
} }
} }
@ -2110,7 +2164,7 @@ AVCodec ff_mp3on4_decoder = {
.init = decode_init_mp3on4, .init = decode_init_mp3on4,
.close = decode_close_mp3on4, .close = decode_close_mp3on4,
.decode = decode_frame_mp3on4, .decode = decode_frame_mp3on4,
.flush = flush, .flush = flush_mp3on4,
.long_name = NULL_IF_CONFIG_SMALL("MP3onMP4"), .long_name = NULL_IF_CONFIG_SMALL("MP3onMP4"),
}; };
#endif #endif


@ -83,7 +83,7 @@ AVCodec ff_mp3on4float_decoder = {
.init = decode_init_mp3on4, .init = decode_init_mp3on4,
.close = decode_close_mp3on4, .close = decode_close_mp3on4,
.decode = decode_frame_mp3on4, .decode = decode_frame_mp3on4,
.flush = flush, .flush = flush_mp3on4,
.long_name = NULL_IF_CONFIG_SMALL("MP3onMP4"), .long_name = NULL_IF_CONFIG_SMALL("MP3onMP4"),
}; };
#endif #endif


@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
} }
static av_always_inline static av_always_inline
void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h) void weight_h264_W_altivec(uint8_t *block, int stride, int height,
int log2_denom, int weight, int offset, int w)
{ {
int y, aligned; int y, aligned;
vec_u8 vblock; vec_u8 vblock;
@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
voffset = vec_splat(vtemp, 5); voffset = vec_splat(vtemp, 5);
aligned = !((unsigned long)block & 0xf); aligned = !((unsigned long)block & 0xf);
for (y=0; y<h; y++) { for (y = 0; y < height; y++) {
vblock = vec_ld(0, block); vblock = vec_ld(0, block);
v0 = (vec_s16)vec_mergeh(zero_u8v, vblock); v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
@ -888,8 +889,8 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
} }
static av_always_inline static av_always_inline
void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
int weightd, int weights, int offset, int w, int h) int log2_denom, int weightd, int weights, int offset, int w)
{ {
int y, dst_aligned, src_aligned; int y, dst_aligned, src_aligned;
vec_u8 vsrc, vdst; vec_u8 vsrc, vdst;
@ -912,7 +913,7 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
dst_aligned = !((unsigned long)dst & 0xf); dst_aligned = !((unsigned long)dst & 0xf);
src_aligned = !((unsigned long)src & 0xf); src_aligned = !((unsigned long)src & 0xf);
for (y=0; y<h; y++) { for (y = 0; y < height; y++) {
vdst = vec_ld(0, dst); vdst = vec_ld(0, dst);
vsrc = vec_ld(0, src); vsrc = vec_ld(0, src);
@ -952,19 +953,18 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
} }
} }
#define H264_WEIGHT(W,H) \ #define H264_WEIGHT(W) \
static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \
weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \ int log2_denom, int weight, int offset){ \
weight_h264_W_altivec(block, stride, height, log2_denom, weight, offset, W); \
}\ }\
static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \
biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \ int log2_denom, int weightd, int weights, int offset){ \
biweight_h264_W_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
} }
H264_WEIGHT(16,16) H264_WEIGHT(16)
H264_WEIGHT(16, 8) H264_WEIGHT( 8)
H264_WEIGHT( 8,16)
H264_WEIGHT( 8, 8)
H264_WEIGHT( 8, 4)
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) { void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
const int high_bit_depth = avctx->bits_per_raw_sample > 8; const int high_bit_depth = avctx->bits_per_raw_sample > 8;
@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom
c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec; c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec; c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec; c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec;
c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec; c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec;
c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec; c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec;
c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec;
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
} }
} }
} }


@ -158,6 +158,8 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height, int l
case PIX_FMT_YUV420P9BE: case PIX_FMT_YUV420P9BE:
case PIX_FMT_YUV420P10LE: case PIX_FMT_YUV420P10LE:
case PIX_FMT_YUV420P10BE: case PIX_FMT_YUV420P10BE:
case PIX_FMT_YUV422P9LE:
case PIX_FMT_YUV422P9BE:
case PIX_FMT_YUV422P10LE: case PIX_FMT_YUV422P10LE:
case PIX_FMT_YUV422P10BE: case PIX_FMT_YUV422P10BE:
case PIX_FMT_YUV444P9LE: case PIX_FMT_YUV444P9LE:


@ -41,24 +41,57 @@ static void free_buffers(VP8Context *s)
av_freep(&s->top_nnz); av_freep(&s->top_nnz);
av_freep(&s->edge_emu_buffer); av_freep(&s->edge_emu_buffer);
av_freep(&s->top_border); av_freep(&s->top_border);
av_freep(&s->segmentation_map);
s->macroblocks = NULL; s->macroblocks = NULL;
} }
static void vp8_decode_flush(AVCodecContext *avctx) static int vp8_alloc_frame(VP8Context *s, AVFrame *f)
{
int ret;
if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0)
return ret;
if (!s->maps_are_invalid && s->num_maps_to_be_freed) {
f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed];
} else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) {
ff_thread_release_buffer(s->avctx, f);
return AVERROR(ENOMEM);
}
return 0;
}
static void vp8_release_frame(VP8Context *s, AVFrame *f, int is_close)
{
if (!is_close) {
if (f->ref_index[0]) {
assert(s->num_maps_to_be_freed < FF_ARRAY_ELEMS(s->segmentation_maps));
s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0];
f->ref_index[0] = NULL;
}
} else {
av_freep(&f->ref_index[0]);
}
ff_thread_release_buffer(s->avctx, f);
}
static void vp8_decode_flush_impl(AVCodecContext *avctx, int force, int is_close)
{ {
VP8Context *s = avctx->priv_data; VP8Context *s = avctx->priv_data;
int i; int i;
if (!avctx->is_copy) { if (!avctx->is_copy || force) {
for (i = 0; i < 5; i++) for (i = 0; i < 5; i++)
if (s->frames[i].data[0]) if (s->frames[i].data[0])
ff_thread_release_buffer(avctx, &s->frames[i]); vp8_release_frame(s, &s->frames[i], is_close);
} }
memset(s->framep, 0, sizeof(s->framep)); memset(s->framep, 0, sizeof(s->framep));
free_buffers(s); free_buffers(s);
s->maps_are_invalid = 1;
}
static void vp8_decode_flush(AVCodecContext *avctx)
{
vp8_decode_flush_impl(avctx, 0, 0);
} }
static int update_dimensions(VP8Context *s, int width, int height) static int update_dimensions(VP8Context *s, int width, int height)
@ -68,7 +101,7 @@ static int update_dimensions(VP8Context *s, int width, int height)
if (av_image_check_size(width, height, 0, s->avctx)) if (av_image_check_size(width, height, 0, s->avctx))
return AVERROR_INVALIDDATA; return AVERROR_INVALIDDATA;
vp8_decode_flush(s->avctx); vp8_decode_flush_impl(s->avctx, 1, 0);
avcodec_set_dimensions(s->avctx, width, height); avcodec_set_dimensions(s->avctx, width, height);
} }
@ -81,10 +114,9 @@ static int update_dimensions(VP8Context *s, int width, int height)
s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4); s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4);
s->top_nnz = av_mallocz(s->mb_width*sizeof(*s->top_nnz)); s->top_nnz = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
s->top_border = av_mallocz((s->mb_width+1)*sizeof(*s->top_border)); s->top_border = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
s->segmentation_map = av_mallocz(s->mb_width*s->mb_height);
if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top || if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top ||
!s->top_nnz || !s->top_border || !s->segmentation_map) !s->top_nnz || !s->top_border)
return AVERROR(ENOMEM); return AVERROR(ENOMEM);
s->macroblocks = s->macroblocks_base + 1; s->macroblocks = s->macroblocks_base + 1;
@ -1508,6 +1540,14 @@ static void filter_mb_row_simple(VP8Context *s, AVFrame *curframe, int mb_y)
} }
} }
static void release_queued_segmaps(VP8Context *s, int is_close)
{
int leave_behind = is_close ? 0 : !s->maps_are_invalid;
while (s->num_maps_to_be_freed > leave_behind)
av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]);
s->maps_are_invalid = 0;
}
static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
AVPacket *avpkt) AVPacket *avpkt)
{ {
@ -1516,6 +1556,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
enum AVDiscard skip_thresh; enum AVDiscard skip_thresh;
AVFrame *av_uninit(curframe), *prev_frame = s->framep[VP56_FRAME_CURRENT]; AVFrame *av_uninit(curframe), *prev_frame = s->framep[VP56_FRAME_CURRENT];
release_queued_segmaps(s, 0);
if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0) if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
return ret; return ret;
@ -1538,7 +1580,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
&s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] && &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
&s->frames[i] != s->framep[VP56_FRAME_GOLDEN] && &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
&s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
ff_thread_release_buffer(avctx, &s->frames[i]); vp8_release_frame(s, &s->frames[i], 0);
// find a free buffer // find a free buffer
for (i = 0; i < 5; i++) for (i = 0; i < 5; i++)
@ -1559,8 +1601,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
curframe->key_frame = s->keyframe; curframe->key_frame = s->keyframe;
curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P; curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
curframe->reference = referenced ? 3 : 0; curframe->reference = referenced ? 3 : 0;
curframe->ref_index[0] = s->segmentation_map; if ((ret = vp8_alloc_frame(s, curframe))) {
if ((ret = ff_thread_get_buffer(avctx, curframe))) {
av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n"); av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
return ret; return ret;
} }
@ -1652,8 +1693,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4); s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2); s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
decode_mb_mode(s, mb, mb_x, mb_y, s->segmentation_map + mb_xy, decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
prev_frame ? prev_frame->ref_index[0] + mb_xy : NULL); prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL);
prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS); prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
@ -1736,7 +1777,8 @@ static av_cold int vp8_decode_init(AVCodecContext *avctx)
static av_cold int vp8_decode_free(AVCodecContext *avctx) static av_cold int vp8_decode_free(AVCodecContext *avctx)
{ {
vp8_decode_flush(avctx); vp8_decode_flush_impl(avctx, 0, 1);
release_queued_segmaps(avctx->priv_data, 1);
return 0; return 0;
} }


@ -130,7 +130,6 @@ typedef struct {
uint8_t *intra4x4_pred_mode_top; uint8_t *intra4x4_pred_mode_top;
uint8_t intra4x4_pred_mode_left[4]; uint8_t intra4x4_pred_mode_left[4];
uint8_t *segmentation_map;
/** /**
* Macroblocks can have one of 4 different quants in a frame when * Macroblocks can have one of 4 different quants in a frame when
@ -237,6 +236,16 @@ typedef struct {
H264PredContext hpc; H264PredContext hpc;
vp8_mc_func put_pixels_tab[3][3][3]; vp8_mc_func put_pixels_tab[3][3][3];
AVFrame frames[5]; AVFrame frames[5];
/**
* A list of segmentation_map buffers that are to be free()'ed in
* the next decoding iteration. We can't free() them right away
* because the map may still be used by subsequent decoding threads.
* Unused if frame threading is off.
*/
uint8_t *segmentation_maps[5];
int num_maps_to_be_freed;
int maps_are_invalid;
} VP8Context; } VP8Context;
#endif /* AVCODEC_VP8_H */ #endif /* AVCODEC_VP8_H */
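The parking queue exists because, with frame multithreading, a later frame's decoding thread may still be reading this frame's map through prev_frame->ref_index[0] in decode_mb_mode(), so the map cannot be freed at release time; it is parked and reclaimed on a subsequent decode call. A generic sketch of that pattern (hypothetical names, not the FFmpeg API):

#include <stdlib.h>

#define MAX_PARKED 5   /* one slot per AVFrame, so the queue cannot overflow */

typedef struct {
    unsigned char *parked[MAX_PARKED];
    int            num_parked;
} MapPool;

/* Park a map instead of freeing it; another thread may still be reading it. */
static void park_map(MapPool *p, unsigned char **map)
{
    p->parked[p->num_parked++] = *map;
    *map = NULL;
}

/* Reuse a parked map for a new frame if one is available, else allocate. */
static unsigned char *take_or_alloc_map(MapPool *p, size_t size)
{
    return p->num_parked ? p->parked[--p->num_parked] : calloc(1, size);
}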


@ -1055,14 +1055,6 @@ emu_edge mmx
; int32_t max, unsigned int len) ; int32_t max, unsigned int len)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro SPLATD_MMX 1
punpckldq %1, %1
%endmacro
%macro SPLATD_SSE2 1
pshufd %1, %1, 0
%endmacro
%macro VECTOR_CLIP_INT32 4 %macro VECTOR_CLIP_INT32 4
cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
%ifidn %1, sse2 %ifidn %1, sse2


@ -24,6 +24,146 @@
SECTION_TEXT SECTION_TEXT
;---------------------------------------------------------------------------------
; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
;---------------------------------------------------------------------------------
%macro INT32_TO_FLOAT_FMUL_SCALAR 2
%ifdef ARCH_X86_64
cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
%else
cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
movss m0, mulm
%endif
SPLATD m0
shl lenq, 2
add srcq, lenq
add dstq, lenq
neg lenq
.loop:
%ifidn %1, sse2
cvtdq2ps m1, [srcq+lenq ]
cvtdq2ps m2, [srcq+lenq+16]
%else
cvtpi2ps m1, [srcq+lenq ]
cvtpi2ps m3, [srcq+lenq+ 8]
cvtpi2ps m2, [srcq+lenq+16]
cvtpi2ps m4, [srcq+lenq+24]
movlhps m1, m3
movlhps m2, m4
%endif
mulps m1, m0
mulps m2, m0
mova [dstq+lenq ], m1
mova [dstq+lenq+16], m2
add lenq, 32
jl .loop
REP_RET
%endmacro
INIT_XMM
%define SPLATD SPLATD_SSE
%define movdqa movaps
INT32_TO_FLOAT_FMUL_SCALAR sse, 5
%undef movdqa
%define SPLATD SPLATD_SSE2
INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
%undef SPLATD
;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;------------------------------------------------------------------------------
%macro FLOAT_TO_INT16 2
cglobal float_to_int16_%1, 3,3,%2, dst, src, len
add lenq, lenq
lea srcq, [srcq+2*lenq]
add dstq, lenq
neg lenq
.loop:
%ifidn %1, sse2
cvtps2dq m0, [srcq+2*lenq ]
cvtps2dq m1, [srcq+2*lenq+16]
packssdw m0, m1
mova [dstq+lenq], m0
%else
cvtps2pi m0, [srcq+2*lenq ]
cvtps2pi m1, [srcq+2*lenq+ 8]
cvtps2pi m2, [srcq+2*lenq+16]
cvtps2pi m3, [srcq+2*lenq+24]
packssdw m0, m1
packssdw m2, m3
mova [dstq+lenq ], m0
mova [dstq+lenq+8], m2
%endif
add lenq, 16
js .loop
%ifnidn %1, sse2
emms
%endif
REP_RET
%endmacro
INIT_XMM
FLOAT_TO_INT16 sse2, 2
INIT_MMX
FLOAT_TO_INT16 sse, 0
%define cvtps2pi pf2id
FLOAT_TO_INT16 3dnow, 0
%undef cvtps2pi
;-------------------------------------------------------------------------------
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
;-------------------------------------------------------------------------------
%macro FLOAT_TO_INT16_INTERLEAVE2 1
cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
lea lenq, [4*r2q]
mov src1q, [src0q+gprsize]
mov src0q, [src0q]
add dstq, lenq
add src0q, lenq
add src1q, lenq
neg lenq
.loop:
%ifidn %1, sse2
cvtps2dq m0, [src0q+lenq]
cvtps2dq m1, [src1q+lenq]
packssdw m0, m1
movhlps m1, m0
punpcklwd m0, m1
mova [dstq+lenq], m0
%else
cvtps2pi m0, [src0q+lenq ]
cvtps2pi m1, [src0q+lenq+8]
cvtps2pi m2, [src1q+lenq ]
cvtps2pi m3, [src1q+lenq+8]
packssdw m0, m1
packssdw m2, m3
mova m1, m0
punpcklwd m0, m2
punpckhwd m1, m2
mova [dstq+lenq ], m0
mova [dstq+lenq+8], m1
%endif
add lenq, 16
js .loop
%ifnidn %1, sse2
emms
%endif
REP_RET
%endmacro
INIT_MMX
%define cvtps2pi pf2id
FLOAT_TO_INT16_INTERLEAVE2 3dnow
%undef cvtps2pi
%define movdqa movaps
FLOAT_TO_INT16_INTERLEAVE2 sse
%undef movdqa
INIT_XMM
FLOAT_TO_INT16_INTERLEAVE2 sse2
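For reference, a scalar C sketch (hypothetical _ref names) of what int32_to_float_fmul_scalar and float_to_int16_interleave2 compute; ff_float_to_int16 is the same per-sample conversion without the two-channel interleave. Float-to-int rounding differs between this cast, SSE2 cvtps2dq and 3DNow! pf2id, so the sketch is not bit-exact with the asm:

#include <stdint.h>

static void int32_to_float_fmul_scalar_ref(float *dst, const int32_t *src,
                                           float mul, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;            /* convert and scale in one pass */
}

static void float_to_int16_interleave2_ref(int16_t *dst, const float *const *src,
                                           long len)
{
    long i;
    for (i = 0; i < len; i++) {           /* len samples per channel */
        float l = src[0][i], r = src[1][i];
        dst[2 * i]     = l < -32768 ? -32768 : l > 32767 ? 32767 : (int16_t)l;
        dst[2 * i + 1] = r < -32768 ? -32768 : r > 32767 ? 32767 : (int16_t)r;
    }
}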
%macro PSWAPD_SSE 2 %macro PSWAPD_SSE 2
pshufw %1, %2, 0x4e pshufw %1, %2, 0x4e
%endmacro %endmacro


@ -26,133 +26,32 @@
#include "libavutil/x86_cpu.h" #include "libavutil/x86_cpu.h"
#include "libavcodec/fmtconvert.h" #include "libavcodec/fmtconvert.h"
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) #if HAVE_YASM
{
x86_reg i = -4*len;
__asm__ volatile(
"movss %3, %%xmm4 \n"
"shufps $0, %%xmm4, %%xmm4 \n"
"1: \n"
"cvtpi2ps (%2,%0), %%xmm0 \n"
"cvtpi2ps 8(%2,%0), %%xmm1 \n"
"cvtpi2ps 16(%2,%0), %%xmm2 \n"
"cvtpi2ps 24(%2,%0), %%xmm3 \n"
"movlhps %%xmm1, %%xmm0 \n"
"movlhps %%xmm3, %%xmm2 \n"
"mulps %%xmm4, %%xmm0 \n"
"mulps %%xmm4, %%xmm2 \n"
"movaps %%xmm0, (%1,%0) \n"
"movaps %%xmm2, 16(%1,%0) \n"
"add $32, %0 \n"
"jl 1b \n"
:"+r"(i)
:"r"(dst+len), "r"(src+len), "m"(mul)
);
}
static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len);
{ void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len);
x86_reg i = -4*len;
__asm__ volatile(
"movss %3, %%xmm4 \n"
"shufps $0, %%xmm4, %%xmm4 \n"
"1: \n"
"cvtdq2ps (%2,%0), %%xmm0 \n"
"cvtdq2ps 16(%2,%0), %%xmm1 \n"
"mulps %%xmm4, %%xmm0 \n"
"mulps %%xmm4, %%xmm1 \n"
"movaps %%xmm0, (%1,%0) \n"
"movaps %%xmm1, 16(%1,%0) \n"
"add $32, %0 \n"
"jl 1b \n"
:"+r"(i)
:"r"(dst+len), "r"(src+len), "m"(mul)
);
}
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
x86_reg reglen = len; void ff_float_to_int16_sse (int16_t *dst, const float *src, long len);
// not bit-exact: pf2id uses different rounding than C and SSE void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
__asm__ volatile(
"add %0 , %0 \n\t"
"lea (%2,%0,2) , %2 \n\t"
"add %0 , %1 \n\t"
"neg %0 \n\t"
"1: \n\t"
"pf2id (%2,%0,2) , %%mm0 \n\t"
"pf2id 8(%2,%0,2) , %%mm1 \n\t"
"pf2id 16(%2,%0,2) , %%mm2 \n\t"
"pf2id 24(%2,%0,2) , %%mm3 \n\t"
"packssdw %%mm1 , %%mm0 \n\t"
"packssdw %%mm3 , %%mm2 \n\t"
"movq %%mm0 , (%1,%0) \n\t"
"movq %%mm2 , 8(%1,%0) \n\t"
"add $16 , %0 \n\t"
" js 1b \n\t"
"femms \n\t"
:"+r"(reglen), "+r"(dst), "+r"(src)
);
}
static void float_to_int16_sse(int16_t *dst, const float *src, long len){ void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
x86_reg reglen = len; void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len);
__asm__ volatile( void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
"add %0 , %0 \n\t"
"lea (%2,%0,2) , %2 \n\t"
"add %0 , %1 \n\t"
"neg %0 \n\t"
"1: \n\t"
"cvtps2pi (%2,%0,2) , %%mm0 \n\t"
"cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
"cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
"cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
"packssdw %%mm1 , %%mm0 \n\t"
"packssdw %%mm3 , %%mm2 \n\t"
"movq %%mm0 , (%1,%0) \n\t"
"movq %%mm2 , 8(%1,%0) \n\t"
"add $16 , %0 \n\t"
" js 1b \n\t"
"emms \n\t"
:"+r"(reglen), "+r"(dst), "+r"(src)
);
}
static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
x86_reg reglen = len;
__asm__ volatile(
"add %0 , %0 \n\t"
"lea (%2,%0,2) , %2 \n\t"
"add %0 , %1 \n\t"
"neg %0 \n\t"
"1: \n\t"
"cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
"cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
"packssdw %%xmm1 , %%xmm0 \n\t"
"movdqa %%xmm0 , (%1,%0) \n\t"
"add $16 , %0 \n\t"
" js 1b \n\t"
:"+r"(reglen), "+r"(dst), "+r"(src)
);
}
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
#if !HAVE_YASM
#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#endif
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ #define FLOAT_TO_INT16_INTERLEAVE(cpu) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
DECLARE_ALIGNED(16, int16_t, tmp)[len];\ DECLARE_ALIGNED(16, int16_t, tmp)[len];\
int i,j,c;\ int i,j,c;\
for(c=0; c<channels; c++){\ for(c=0; c<channels; c++){\
float_to_int16_##cpu(tmp, src[c], len);\ ff_float_to_int16_##cpu(tmp, src[c], len);\
for(i=0, j=c; i<len; i++, j+=channels)\ for(i=0, j=c; i<len; i++, j+=channels)\
dst[j] = tmp[i];\ dst[j] = tmp[i];\
}\ }\
@ -160,73 +59,18 @@ static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const
\ \
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\ static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
if(channels==1)\ if(channels==1)\
float_to_int16_##cpu(dst, src[0], len);\ ff_float_to_int16_##cpu(dst, src[0], len);\
else if(channels==2){\ else if(channels==2){\
x86_reg reglen = len; \ ff_float_to_int16_interleave2_##cpu(dst, src, len);\
const float *src0 = src[0];\
const float *src1 = src[1];\
__asm__ volatile(\
"shl $2, %0 \n"\
"add %0, %1 \n"\
"add %0, %2 \n"\
"add %0, %3 \n"\
"neg %0 \n"\
body\
:"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
);\
}else if(channels==6){\ }else if(channels==6){\
ff_float_to_int16_interleave6_##cpu(dst, src, len);\ ff_float_to_int16_interleave6_##cpu(dst, src, len);\
}else\ }else\
float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
} }
FLOAT_TO_INT16_INTERLEAVE(3dnow, FLOAT_TO_INT16_INTERLEAVE(3dnow)
"1: \n" FLOAT_TO_INT16_INTERLEAVE(sse)
"pf2id (%2,%0), %%mm0 \n" FLOAT_TO_INT16_INTERLEAVE(sse2)
"pf2id 8(%2,%0), %%mm1 \n"
"pf2id (%3,%0), %%mm2 \n"
"pf2id 8(%3,%0), %%mm3 \n"
"packssdw %%mm1, %%mm0 \n"
"packssdw %%mm3, %%mm2 \n"
"movq %%mm0, %%mm1 \n"
"punpcklwd %%mm2, %%mm0 \n"
"punpckhwd %%mm2, %%mm1 \n"
"movq %%mm0, (%1,%0)\n"
"movq %%mm1, 8(%1,%0)\n"
"add $16, %0 \n"
"js 1b \n"
"femms \n"
)
FLOAT_TO_INT16_INTERLEAVE(sse,
"1: \n"
"cvtps2pi (%2,%0), %%mm0 \n"
"cvtps2pi 8(%2,%0), %%mm1 \n"
"cvtps2pi (%3,%0), %%mm2 \n"
"cvtps2pi 8(%3,%0), %%mm3 \n"
"packssdw %%mm1, %%mm0 \n"
"packssdw %%mm3, %%mm2 \n"
"movq %%mm0, %%mm1 \n"
"punpcklwd %%mm2, %%mm0 \n"
"punpckhwd %%mm2, %%mm1 \n"
"movq %%mm0, (%1,%0)\n"
"movq %%mm1, 8(%1,%0)\n"
"add $16, %0 \n"
"js 1b \n"
"emms \n"
)
FLOAT_TO_INT16_INTERLEAVE(sse2,
"1: \n"
"cvtps2dq (%2,%0), %%xmm0 \n"
"cvtps2dq (%3,%0), %%xmm1 \n"
"packssdw %%xmm1, %%xmm0 \n"
"movhlps %%xmm0, %%xmm1 \n"
"punpcklwd %%xmm1, %%xmm0 \n"
"movdqa %%xmm0, (%1,%0) \n"
"add $16, %0 \n"
"js 1b \n"
)
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
if(channels==6) if(channels==6)
@ -235,7 +79,6 @@ static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long
float_to_int16_interleave_3dnow(dst, src, len, channels); float_to_int16_interleave_3dnow(dst, src, len, channels);
} }
#if HAVE_YASM
void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len); void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len); void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
@ -269,34 +112,32 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{ {
int mm_flags = av_get_cpu_flags(); int mm_flags = av_get_cpu_flags();
if (mm_flags & AV_CPU_FLAG_MMX) {
#if HAVE_YASM #if HAVE_YASM
if (mm_flags & AV_CPU_FLAG_MMX) {
c->float_interleave = float_interleave_mmx; c->float_interleave = float_interleave_mmx;
#endif
if(mm_flags & AV_CPU_FLAG_3DNOW){ if (HAVE_AMD3DNOW && mm_flags & AV_CPU_FLAG_3DNOW) {
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->float_to_int16 = float_to_int16_3dnow; c->float_to_int16 = ff_float_to_int16_3dnow;
c->float_to_int16_interleave = float_to_int16_interleave_3dnow; c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
} }
} }
if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ if (HAVE_AMD3DNOWEXT && mm_flags & AV_CPU_FLAG_3DNOWEXT) {
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->float_to_int16_interleave = float_to_int16_interleave_3dn2; c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
} }
} }
if(mm_flags & AV_CPU_FLAG_SSE){ if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) {
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
c->float_to_int16 = float_to_int16_sse; c->float_to_int16 = ff_float_to_int16_sse;
c->float_to_int16_interleave = float_to_int16_interleave_sse; c->float_to_int16_interleave = float_to_int16_interleave_sse;
#if HAVE_YASM
c->float_interleave = float_interleave_sse; c->float_interleave = float_interleave_sse;
#endif
} }
if(mm_flags & AV_CPU_FLAG_SSE2){ if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE2) {
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
c->float_to_int16 = float_to_int16_sse2; c->float_to_int16 = ff_float_to_int16_sse2;
c->float_to_int16_interleave = float_to_int16_interleave_sse2; c->float_to_int16_interleave = float_to_int16_interleave_sse2;
} }
} }
#endif
} }


@ -28,21 +28,20 @@ SECTION .text
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; biweight pred: ; biweight pred:
; ;
; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, ; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
; int log2_denom, int weightd, int weights, ; int height, int log2_denom, int weightd,
; int offset); ; int weights, int offset);
; and ; and
; void h264_weight_16x16_sse2(uint8_t *dst, int stride, ; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
; int log2_denom, int weight, ; int log2_denom, int weight, int offset);
; int offset);
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro WEIGHT_SETUP 0 %macro WEIGHT_SETUP 0
add r4, r4 add r5, r5
inc r4 inc r5
movd m3, r3d movd m3, r4d
movd m5, r4d movd m5, r5d
movd m6, r2d movd m6, r3d
pslld m5, m6 pslld m5, m6
psrld m5, 1 psrld m5, 1
%if mmsize == 16 %if mmsize == 16
@ -71,60 +70,41 @@ SECTION .text
packuswb m0, m1 packuswb m0, m1
%endmacro %endmacro
%macro WEIGHT_FUNC_DBL_MM 1 INIT_MMX
cglobal h264_weight_16x%1_mmx2, 5, 5, 0 cglobal h264_weight_16_mmx2, 6, 6, 0
WEIGHT_SETUP WEIGHT_SETUP
mov r2, %1
%if %1 == 16
.nextrow .nextrow
WEIGHT_OP 0, 4 WEIGHT_OP 0, 4
mova [r0 ], m0 mova [r0 ], m0
WEIGHT_OP 8, 12 WEIGHT_OP 8, 12
mova [r0+8], m0 mova [r0+8], m0
add r0, r1 add r0, r1
dec r2 dec r2d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
%endif
%endmacro
INIT_MMX %macro WEIGHT_FUNC_MM 3
WEIGHT_FUNC_DBL_MM 16 cglobal h264_weight_%1_%3, 6, 6, %2
WEIGHT_FUNC_DBL_MM 8
%macro WEIGHT_FUNC_MM 4
cglobal h264_weight_%1x%2_%4, 7, 7, %3
WEIGHT_SETUP WEIGHT_SETUP
mov r2, %2
%if %2 == 16
.nextrow .nextrow
WEIGHT_OP 0, mmsize/2 WEIGHT_OP 0, mmsize/2
mova [r0], m0 mova [r0], m0
add r0, r1 add r0, r1
dec r2 dec r2d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
%endif
%endmacro %endmacro
INIT_MMX INIT_MMX
WEIGHT_FUNC_MM 8, 16, 0, mmx2 WEIGHT_FUNC_MM 8, 0, mmx2
WEIGHT_FUNC_MM 8, 8, 0, mmx2
WEIGHT_FUNC_MM 8, 4, 0, mmx2
INIT_XMM INIT_XMM
WEIGHT_FUNC_MM 16, 16, 8, sse2 WEIGHT_FUNC_MM 16, 8, sse2
WEIGHT_FUNC_MM 16, 8, 8, sse2
%macro WEIGHT_FUNC_HALF_MM 5 %macro WEIGHT_FUNC_HALF_MM 3
cglobal h264_weight_%1x%2_%5, 5, 5, %4 cglobal h264_weight_%1_%3, 6, 6, %2
WEIGHT_SETUP WEIGHT_SETUP
mov r2, %2/2 sar r2d, 1
lea r3, [r1*2] lea r3, [r1*2]
%if %2 == mmsize
.nextrow .nextrow
WEIGHT_OP 0, r1 WEIGHT_OP 0, r1
movh [r0], m0 movh [r0], m0
@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4
movh [r0+r1], m0 movh [r0+r1], m0
%endif %endif
add r0, r3 add r0, r3
dec r2 dec r2d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
%endif
%endmacro %endmacro
INIT_MMX INIT_MMX
WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 WEIGHT_FUNC_HALF_MM 4, 0, mmx2
WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 WEIGHT_FUNC_HALF_MM 4, 0, mmx2
WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 WEIGHT_FUNC_HALF_MM 4, 0, mmx2
INIT_XMM INIT_XMM
WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 WEIGHT_FUNC_HALF_MM 8, 8, sse2
WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 WEIGHT_FUNC_HALF_MM 8, 8, sse2
WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 WEIGHT_FUNC_HALF_MM 8, 8, sse2
%macro BIWEIGHT_SETUP 0 %macro BIWEIGHT_SETUP 0
add r6, 1 %ifdef ARCH_X86_64
or r6, 1 %define off_regd r11d
add r3, 1 %else
movd m3, r4d %define off_regd r3d
movd m4, r5d %endif
movd m5, r6d mov off_regd, r7m
movd m6, r3d add off_regd, 1
or off_regd, 1
add r4, 1
movd m3, r5d
movd m4, r6d
movd m5, off_regd
movd m6, r4d
pslld m5, m6 pslld m5, m6
psrld m5, 1 psrld m5, 1
%if mmsize == 16 %if mmsize == 16
@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
packuswb m0, m1 packuswb m0, m1
%endmacro %endmacro
%macro BIWEIGHT_FUNC_DBL_MM 1 INIT_MMX
cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 cglobal h264_biweight_16_mmx2, 7, 7, 0
BIWEIGHT_SETUP BIWEIGHT_SETUP
mov r3, %1 movifnidn r3d, r3m
%if %1 == 16
.nextrow .nextrow
BIWEIGHT_STEPA 0, 1, 0 BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, 4 BIWEIGHT_STEPA 1, 2, 4
@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
mova [r0+8], m0 mova [r0+8], m0
add r0, r2 add r0, r2
add r1, r2 add r1, r2
dec r3 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
%endif
%endmacro
INIT_MMX %macro BIWEIGHT_FUNC_MM 3
BIWEIGHT_FUNC_DBL_MM 16 cglobal h264_biweight_%1_%3, 7, 7, %2
BIWEIGHT_FUNC_DBL_MM 8
%macro BIWEIGHT_FUNC_MM 4
cglobal h264_biweight_%1x%2_%4, 7, 7, %3
BIWEIGHT_SETUP BIWEIGHT_SETUP
mov r3, %2 movifnidn r3d, r3m
%if %2 == 16
.nextrow .nextrow
BIWEIGHT_STEPA 0, 1, 0 BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, mmsize/2 BIWEIGHT_STEPA 1, 2, mmsize/2
@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3
mova [r0], m0 mova [r0], m0
add r0, r2 add r0, r2
add r1, r2 add r1, r2
dec r3 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
%endif
%endmacro %endmacro
INIT_MMX INIT_MMX
BIWEIGHT_FUNC_MM 8, 16, 0, mmx2 BIWEIGHT_FUNC_MM 8, 0, mmx2
BIWEIGHT_FUNC_MM 8, 8, 0, mmx2
BIWEIGHT_FUNC_MM 8, 4, 0, mmx2
INIT_XMM INIT_XMM
BIWEIGHT_FUNC_MM 16, 16, 8, sse2 BIWEIGHT_FUNC_MM 16, 8, sse2
BIWEIGHT_FUNC_MM 16, 8, 8, sse2
%macro BIWEIGHT_FUNC_HALF_MM 5 %macro BIWEIGHT_FUNC_HALF_MM 3
cglobal h264_biweight_%1x%2_%5, 7, 7, %4 cglobal h264_biweight_%1_%3, 7, 7, %2
BIWEIGHT_SETUP BIWEIGHT_SETUP
mov r3, %2/2 movifnidn r3d, r3m
sar r3, 1
lea r4, [r2*2] lea r4, [r2*2]
%if %2 == mmsize
.nextrow .nextrow
BIWEIGHT_STEPA 0, 1, 0 BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, r2 BIWEIGHT_STEPA 1, 2, r2
@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4
%endif %endif
add r0, r4 add r0, r4
add r1, r4 add r1, r4
dec r3 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
%endif
%endmacro %endmacro
INIT_MMX INIT_MMX
BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2
BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2
BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2
INIT_XMM INIT_XMM
BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2
BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
%macro BIWEIGHT_SSSE3_SETUP 0 %macro BIWEIGHT_SSSE3_SETUP 0
add r6, 1 %ifdef ARCH_X86_64
or r6, 1 %define off_regd r11d
add r3, 1 %else
movd m4, r4d %define off_regd r3d
movd m0, r5d %endif
movd m5, r6d mov off_regd, r7m
movd m6, r3d add off_regd, 1
or off_regd, 1
add r4, 1
movd m4, r5d
movd m0, r6d
movd m5, off_regd
movd m6, r4d
pslld m5, m6 pslld m5, m6
psrld m5, 1 psrld m5, 1
punpcklbw m4, m0 punpcklbw m4, m0
@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
packuswb m0, m2 packuswb m0, m2
%endmacro %endmacro
%macro BIWEIGHT_SSSE3_16 1 INIT_XMM
cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 cglobal h264_biweight_16_ssse3, 7, 7, 8
BIWEIGHT_SSSE3_SETUP BIWEIGHT_SSSE3_SETUP
mov r3, %1 movifnidn r3d, r3m
%if %1 == 16
.nextrow .nextrow
movh m0, [r0] movh m0, [r0]
movh m2, [r0+8] movh m2, [r0+8]
@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
mova [r0], m0 mova [r0], m0
add r0, r2 add r0, r2
add r1, r2 add r1, r2
dec r3 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
%endif
%endmacro
INIT_XMM INIT_XMM
BIWEIGHT_SSSE3_16 16 cglobal h264_biweight_8_ssse3, 7, 7, 8
BIWEIGHT_SSSE3_16 8
%macro BIWEIGHT_SSSE3_8 1
cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
BIWEIGHT_SSSE3_SETUP BIWEIGHT_SSSE3_SETUP
mov r3, %1/2 movifnidn r3d, r3m
sar r3, 1
lea r4, [r2*2] lea r4, [r2*2]
%if %1 == 16
.nextrow .nextrow
movh m0, [r0] movh m0, [r0]
movh m1, [r1] movh m1, [r1]
@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
movhps [r0+r2], m0 movhps [r0+r2], m0
add r0, r4 add r0, r4
add r1, r4 add r1, r4
dec r3 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
%endif
%endmacro
INIT_XMM
BIWEIGHT_SSSE3_8 16
BIWEIGHT_SSSE3_8 8
BIWEIGHT_SSSE3_8 4


@ -36,33 +36,26 @@ cextern pw_1
SECTION .text SECTION .text
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_weight(uint8_t *dst, int stride, int log2_denom, ; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
; int weight, int offset); ; int weight, int offset);
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%ifdef ARCH_X86_32 %macro WEIGHT_PROLOGUE 0
DECLARE_REG_TMP 2
%else
DECLARE_REG_TMP 10
%endif
%macro WEIGHT_PROLOGUE 1
mov t0, %1
.prologue .prologue
PROLOGUE 0,5,8 PROLOGUE 0,6,8
movifnidn r0, r0mp movifnidn r0, r0mp
movifnidn r1d, r1m movifnidn r1d, r1m
movifnidn r3d, r3m
movifnidn r4d, r4m movifnidn r4d, r4m
movifnidn r5d, r5m
%endmacro %endmacro
%macro WEIGHT_SETUP 1 %macro WEIGHT_SETUP 1
mova m0, [pw_1] mova m0, [pw_1]
movd m2, r2m movd m2, r3m
pslld m0, m2 ; 1<<log2_denom pslld m0, m2 ; 1<<log2_denom
SPLATW m0, m0 SPLATW m0, m0
shl r4, 19 ; *8, move to upper half of dword shl r5, 19 ; *8, move to upper half of dword
lea r4, [r4+r3*2+0x10000] lea r5, [r5+r4*2+0x10000]
movd m3, r4d ; weight<<1 | 1+(offset<<(3)) movd m3, r5d ; weight<<1 | 1+(offset<<(3))
pshufd m3, m3, 0 pshufd m3, m3, 0
mova m4, [pw_pixel_max] mova m4, [pw_pixel_max]
paddw m2, [sq_1] ; log2_denom+1 paddw m2, [sq_1] ; log2_denom+1
@ -96,8 +89,8 @@ DECLARE_REG_TMP 10
%endmacro %endmacro
%macro WEIGHT_FUNC_DBL 1 %macro WEIGHT_FUNC_DBL 1
cglobal h264_weight_16x16_10_%1 cglobal h264_weight_16_10_%1
WEIGHT_PROLOGUE 16 WEIGHT_PROLOGUE
WEIGHT_SETUP %1 WEIGHT_SETUP %1
.nextrow .nextrow
WEIGHT_OP %1, 0 WEIGHT_OP %1, 0
@ -105,13 +98,9 @@ cglobal h264_weight_16x16_10_%1
WEIGHT_OP %1, 16 WEIGHT_OP %1, 16
mova [r0+16], m5 mova [r0+16], m5
add r0, r1 add r0, r1
dec t0 dec r2d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
cglobal h264_weight_16x8_10_%1
mov t0, 8
jmp mangle(ff_h264_weight_16x16_10_%1.prologue)
%endmacro %endmacro
INIT_XMM INIT_XMM
@ -120,24 +109,16 @@ WEIGHT_FUNC_DBL sse4
%macro WEIGHT_FUNC_MM 1 %macro WEIGHT_FUNC_MM 1
cglobal h264_weight_8x16_10_%1 cglobal h264_weight_8_10_%1
WEIGHT_PROLOGUE 16 WEIGHT_PROLOGUE
WEIGHT_SETUP %1 WEIGHT_SETUP %1
.nextrow .nextrow
WEIGHT_OP %1, 0 WEIGHT_OP %1, 0
mova [r0], m5 mova [r0], m5
add r0, r1 add r0, r1
dec t0 dec r2d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
cglobal h264_weight_8x8_10_%1
mov t0, 8
jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
cglobal h264_weight_8x4_10_%1
mov t0, 4
jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
%endmacro %endmacro
INIT_XMM INIT_XMM
@ -146,8 +127,9 @@ WEIGHT_FUNC_MM sse4
%macro WEIGHT_FUNC_HALF_MM 1 %macro WEIGHT_FUNC_HALF_MM 1
cglobal h264_weight_4x8_10_%1 cglobal h264_weight_4_10_%1
WEIGHT_PROLOGUE 4 WEIGHT_PROLOGUE
sar r2d, 1
WEIGHT_SETUP %1 WEIGHT_SETUP %1
lea r3, [r1*2] lea r3, [r1*2]
.nextrow .nextrow
@ -155,17 +137,9 @@ cglobal h264_weight_4x8_10_%1
movh [r0], m5 movh [r0], m5
movhps [r0+r1], m5 movhps [r0+r1], m5
add r0, r3 add r0, r3
dec t0 dec r2d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
cglobal h264_weight_4x4_10_%1
mov t0, 2
jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
cglobal h264_weight_4x2_10_%1
mov t0, 1
jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
%endmacro %endmacro
INIT_XMM INIT_XMM
@ -174,40 +148,40 @@ WEIGHT_FUNC_HALF_MM sse4
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom, ; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
; int weightd, int weights, int offset); ; int log2_denom, int weightd, int weights, int offset);
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
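For the bidirectional case, a scalar sketch of the per-pixel math follows, derived from the BIWEIGHT_SETUP code below; the stride is given in samples here for simplicity and the names are illustrative, not part of the tree:

#include <stdint.h>

static inline int clip_pixel10_ref(int v)
{
    return v < 0 ? 0 : v > 1023 ? 1023 : v;   /* clamp to the 10-bit range */
}

/* Scalar model of h264_biweight_W_10(dst, src, stride, height, log2_denom,
 * weightd, weights, offset): dst holds the list-0 prediction, src the list-1
 * prediction, both stored as 16-bit samples. The rounding term mirrors what
 * BIWEIGHT_SETUP builds: (((offset << 2) + 1) | 1) << log2_denom. */
static void h264_biweight10_ref(uint16_t *dst, const uint16_t *src, int stride,
                                int width, int height, int log2_denom,
                                int weightd, int weights, int offset)
{
    int rnd = (((offset << 2) + 1) | 1) << log2_denom;
    for (int y = 0; y < height; y++, dst += stride, src += stride)
        for (int x = 0; x < width; x++)
            dst[x] = clip_pixel10_ref((dst[x] * weightd + src[x] * weights + rnd)
                                      >> (log2_denom + 1));
}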
%ifdef ARCH_X86_32 %ifdef ARCH_X86_32
DECLARE_REG_TMP 2,3 DECLARE_REG_TMP 3
%else %else
DECLARE_REG_TMP 10,2 DECLARE_REG_TMP 10
%endif %endif
%macro BIWEIGHT_PROLOGUE 1 %macro BIWEIGHT_PROLOGUE 0
mov t0, %1
.prologue .prologue
PROLOGUE 0,7,8 PROLOGUE 0,7,8
movifnidn r0, r0mp movifnidn r0, r0mp
movifnidn r1, r1mp movifnidn r1, r1mp
movifnidn t1d, r2m movifnidn r2d, r2m
movifnidn r4d, r4m
movifnidn r5d, r5m movifnidn r5d, r5m
movifnidn r6d, r6m movifnidn r6d, r6m
movifnidn t0d, r7m
%endmacro %endmacro
%macro BIWEIGHT_SETUP 1 %macro BIWEIGHT_SETUP 1
lea r6, [r6*4+1] ; (offset<<2)+1 lea t0, [t0*4+1] ; (offset<<2)+1
or r6, 1 or t0, 1
shl r5, 16 shl r6, 16
or r4, r5 or r5, r6
movd m4, r4d ; weightd | weights movd m4, r5d ; weightd | weights
movd m5, r6d ; (offset+1)|1 movd m5, t0d ; (offset+1)|1
movd m6, r3m ; log2_denom movd m6, r4m ; log2_denom
pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom
paddd m6, [sq_1] paddd m6, [sq_1]
pshufd m4, m4, 0 pshufd m4, m4, 0
pshufd m5, m5, 0 pshufd m5, m5, 0
mova m3, [pw_pixel_max] mova m3, [pw_pixel_max]
movifnidn r3d, r3m
%ifnidn %1, sse4 %ifnidn %1, sse4
pxor m7, m7 pxor m7, m7
%endif %endif
@ -243,23 +217,19 @@ DECLARE_REG_TMP 10,2
%endmacro %endmacro
%macro BIWEIGHT_FUNC_DBL 1 %macro BIWEIGHT_FUNC_DBL 1
cglobal h264_biweight_16x16_10_%1 cglobal h264_biweight_16_10_%1
BIWEIGHT_PROLOGUE 16 BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP %1 BIWEIGHT_SETUP %1
.nextrow .nextrow
BIWEIGHT %1, 0 BIWEIGHT %1, 0
mova [r0 ], m0 mova [r0 ], m0
BIWEIGHT %1, 16 BIWEIGHT %1, 16
mova [r0+16], m0 mova [r0+16], m0
add r0, t1 add r0, r2
add r1, t1 add r1, r2
dec t0 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
cglobal h264_biweight_16x8_10_%1
mov t0, 8
jmp mangle(ff_h264_biweight_16x16_10_%1.prologue)
%endmacro %endmacro
INIT_XMM INIT_XMM
@ -267,25 +237,17 @@ BIWEIGHT_FUNC_DBL sse2
BIWEIGHT_FUNC_DBL sse4 BIWEIGHT_FUNC_DBL sse4
%macro BIWEIGHT_FUNC 1 %macro BIWEIGHT_FUNC 1
cglobal h264_biweight_8x16_10_%1 cglobal h264_biweight_8_10_%1
BIWEIGHT_PROLOGUE 16 BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP %1 BIWEIGHT_SETUP %1
.nextrow .nextrow
BIWEIGHT %1, 0 BIWEIGHT %1, 0
mova [r0], m0 mova [r0], m0
add r0, t1 add r0, r2
add r1, t1 add r1, r2
dec t0 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
cglobal h264_biweight_8x8_10_%1
mov t0, 8
jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
cglobal h264_biweight_8x4_10_%1
mov t0, 4
jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
%endmacro %endmacro
INIT_XMM INIT_XMM
@ -293,27 +255,20 @@ BIWEIGHT_FUNC sse2
BIWEIGHT_FUNC sse4 BIWEIGHT_FUNC sse4
%macro BIWEIGHT_FUNC_HALF 1 %macro BIWEIGHT_FUNC_HALF 1
cglobal h264_biweight_4x8_10_%1 cglobal h264_biweight_4_10_%1
BIWEIGHT_PROLOGUE 4 BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP %1 BIWEIGHT_SETUP %1
lea r4, [t1*2] sar r3d, 1
lea r4, [r2*2]
.nextrow .nextrow
BIWEIGHT %1, 0, t1 BIWEIGHT %1, 0, r2
movh [r0 ], m0 movh [r0 ], m0
movhps [r0+t1], m0 movhps [r0+r2], m0
add r0, r4 add r0, r4
add r1, r4 add r1, r4
dec t0 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
cglobal h264_biweight_4x4_10_%1
mov t0, 2
jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
cglobal h264_biweight_4x2_10_%1
mov t0, 1
jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
%endmacro %endmacro
INIT_XMM INIT_XMM


@ -298,63 +298,53 @@ LF_IFUNC(v, luma_intra, 10, mmxext)
/***********************************/ /***********************************/
/* weighted prediction */ /* weighted prediction */
#define H264_WEIGHT(W, H, OPT) \ #define H264_WEIGHT(W, OPT) \
void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \
int stride, int log2_denom, int weight, int offset); int stride, int height, int log2_denom, int weight, int offset);
#define H264_BIWEIGHT(W, H, OPT) \ #define H264_BIWEIGHT(W, OPT) \
void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \
uint8_t *src, int stride, int log2_denom, int weightd, \ uint8_t *src, int stride, int height, int log2_denom, int weightd, \
int weights, int offset); int weights, int offset);
#define H264_BIWEIGHT_MMX(W,H) \ #define H264_BIWEIGHT_MMX(W) \
H264_WEIGHT (W, H, mmx2) \ H264_WEIGHT (W, mmx2) \
H264_BIWEIGHT(W, H, mmx2) H264_BIWEIGHT(W, mmx2)
#define H264_BIWEIGHT_MMX_SSE(W,H) \ #define H264_BIWEIGHT_MMX_SSE(W) \
H264_BIWEIGHT_MMX(W, H) \ H264_BIWEIGHT_MMX(W) \
H264_WEIGHT (W, H, sse2) \ H264_WEIGHT (W, sse2) \
H264_BIWEIGHT (W, H, sse2) \ H264_BIWEIGHT (W, sse2) \
H264_BIWEIGHT (W, H, ssse3) H264_BIWEIGHT (W, ssse3)
H264_BIWEIGHT_MMX_SSE(16, 16) H264_BIWEIGHT_MMX_SSE(16)
H264_BIWEIGHT_MMX_SSE(16, 8) H264_BIWEIGHT_MMX_SSE( 8)
H264_BIWEIGHT_MMX_SSE( 8, 16) H264_BIWEIGHT_MMX ( 4)
H264_BIWEIGHT_MMX_SSE( 8, 8)
H264_BIWEIGHT_MMX_SSE( 8, 4)
H264_BIWEIGHT_MMX ( 4, 8)
H264_BIWEIGHT_MMX ( 4, 4)
H264_BIWEIGHT_MMX ( 4, 2)
#define H264_WEIGHT_10(W, H, DEPTH, OPT) \ #define H264_WEIGHT_10(W, DEPTH, OPT) \
void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
int stride, int log2_denom, int weight, int offset); int stride, int height, int log2_denom, int weight, int offset);
#define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \ #define H264_BIWEIGHT_10(W, DEPTH, OPT) \
void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \ void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, uint8_t *src, int stride, int log2_denom, \ (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \
int weightd, int weights, int offset); int weightd, int weights, int offset);
#define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \ #define H264_BIWEIGHT_10_SSE(W, DEPTH) \
H264_WEIGHT_10 (W, H, DEPTH, sse2) \ H264_WEIGHT_10 (W, DEPTH, sse2) \
H264_WEIGHT_10 (W, H, DEPTH, sse4) \ H264_WEIGHT_10 (W, DEPTH, sse4) \
H264_BIWEIGHT_10(W, H, DEPTH, sse2) \ H264_BIWEIGHT_10(W, DEPTH, sse2) \
H264_BIWEIGHT_10(W, H, DEPTH, sse4) H264_BIWEIGHT_10(W, DEPTH, sse4)
H264_BIWEIGHT_10_SSE(16, 16, 10) H264_BIWEIGHT_10_SSE(16, 10)
H264_BIWEIGHT_10_SSE(16, 8, 10) H264_BIWEIGHT_10_SSE( 8, 10)
H264_BIWEIGHT_10_SSE( 8, 16, 10) H264_BIWEIGHT_10_SSE( 4, 10)
H264_BIWEIGHT_10_SSE( 8, 8, 10)
H264_BIWEIGHT_10_SSE( 8, 4, 10)
H264_BIWEIGHT_10_SSE( 4, 8, 10)
H264_BIWEIGHT_10_SSE( 4, 4, 10)
H264_BIWEIGHT_10_SSE( 4, 2, 10)
void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
{ {
int mm_flags = av_get_cpu_flags(); int mm_flags = av_get_cpu_flags();
if (mm_flags & AV_CPU_FLAG_MMX2) { if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMX2) {
c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
} }
@ -394,23 +384,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext; c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif #endif
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2;
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2;
c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2;
c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2;
c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2;
c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2;
c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
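With the height folded into a run-time argument, the dsp tables shrink from eight size-specific entries to three width-specific ones. A hedged illustration of how a caller now selects a kernel (the helper and its parameters are hypothetical; the real call sites live in h264.c):

#include "h264dsp.h"   /* assuming compilation inside libavcodec */

/* Illustration only: tab[0] = 16-wide, tab[1] = 8-wide, tab[2] = 4-wide,
 * with the block height passed at call time. */
static void weight_block(H264DSPContext *c, uint8_t *dst, int stride,
                         int width, int height,
                         int log2_denom, int weight, int offset)
{
    int idx = width == 16 ? 0 : width == 8 ? 1 : 2;
    c->weight_h264_pixels_tab[idx](dst, stride, height, log2_denom, weight, offset);
}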
if (mm_flags&AV_CPU_FLAG_SSE2) { if (mm_flags&AV_CPU_FLAG_SSE2) {
c->h264_idct8_add = ff_h264_idct8_add_8_sse2; c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2;
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2; c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2;
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2; c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2;
c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2; c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2;
c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2; c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2;
c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
#if HAVE_ALIGNED_STACK #if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
#endif #endif
} }
if (mm_flags&AV_CPU_FLAG_SSSE3) { if (mm_flags&AV_CPU_FLAG_SSSE3) {
c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3;
c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3; c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3;
c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
} }
if (HAVE_AVX && mm_flags&AV_CPU_FLAG_AVX) { if (HAVE_AVX && mm_flags&AV_CPU_FLAG_AVX) {
#if HAVE_ALIGNED_STACK #if HAVE_ALIGNED_STACK
@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
#endif #endif
c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2; c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2; c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2; c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2;
c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2;
c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2;
c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2;
c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2; c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2; c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2; c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2;
c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2;
c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2;
c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2;
c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2;
c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2; c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2; c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
#endif #endif
} }
if (mm_flags&AV_CPU_FLAG_SSE4) { if (mm_flags&AV_CPU_FLAG_SSE4) {
c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4; c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4; c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4; c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4;
c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4;
c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4;
c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4;
c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4; c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4; c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4; c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4;
c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4;
c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4;
c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4;
c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4;
} }
#if HAVE_AVX #if HAVE_AVX
if (mm_flags&AV_CPU_FLAG_AVX) { if (mm_flags&AV_CPU_FLAG_AVX) {


@ -10,7 +10,7 @@ OBJS = alldevices.o avdevice.o
# input/output devices # input/output devices
OBJS-$(CONFIG_ALSA_INDEV) += alsa-audio-common.o \ OBJS-$(CONFIG_ALSA_INDEV) += alsa-audio-common.o \
alsa-audio-dec.o alsa-audio-dec.o timefilter.o
OBJS-$(CONFIG_ALSA_OUTDEV) += alsa-audio-common.o \ OBJS-$(CONFIG_ALSA_OUTDEV) += alsa-audio-common.o \
alsa-audio-enc.o alsa-audio-enc.o
OBJS-$(CONFIG_BKTR_INDEV) += bktr.o OBJS-$(CONFIG_BKTR_INDEV) += bktr.o
@ -19,7 +19,7 @@ OBJS-$(CONFIG_DSHOW_INDEV) += dshow.o dshow_enummediatypes.o \
dshow_pin.o dshow_common.o dshow_pin.o dshow_common.o
OBJS-$(CONFIG_DV1394_INDEV) += dv1394.o OBJS-$(CONFIG_DV1394_INDEV) += dv1394.o
OBJS-$(CONFIG_FBDEV_INDEV) += fbdev.o OBJS-$(CONFIG_FBDEV_INDEV) += fbdev.o
OBJS-$(CONFIG_JACK_INDEV) += jack_audio.o OBJS-$(CONFIG_JACK_INDEV) += jack_audio.o timefilter.o
OBJS-$(CONFIG_LAVFI_INDEV) += lavfi.o OBJS-$(CONFIG_LAVFI_INDEV) += lavfi.o
OBJS-$(CONFIG_OPENAL_INDEV) += openal-dec.o OBJS-$(CONFIG_OPENAL_INDEV) += openal-dec.o
OBJS-$(CONFIG_OSS_INDEV) += oss_audio.o OBJS-$(CONFIG_OSS_INDEV) += oss_audio.o
@ -39,4 +39,6 @@ OBJS-$(CONFIG_LIBDC1394_INDEV) += libdc1394.o
SKIPHEADERS-$(HAVE_ALSA_ASOUNDLIB_H) += alsa-audio.h SKIPHEADERS-$(HAVE_ALSA_ASOUNDLIB_H) += alsa-audio.h
SKIPHEADERS-$(HAVE_SNDIO_H) += sndio_common.h SKIPHEADERS-$(HAVE_SNDIO_H) += sndio_common.h
TESTPROGS = timefilter
include $(SRC_PATH)/subdir.mak include $(SRC_PATH)/subdir.mak


@ -33,7 +33,7 @@
#include <alsa/asoundlib.h> #include <alsa/asoundlib.h>
#include "config.h" #include "config.h"
#include "libavutil/log.h" #include "libavutil/log.h"
#include "libavformat/timefilter.h" #include "timefilter.h"
#include "avdevice.h" #include "avdevice.h"
/* XXX: we make the assumption that the soundcard accepts this format */ /* XXX: we make the assumption that the soundcard accepts this format */


@ -28,7 +28,8 @@
#include "libavutil/fifo.h" #include "libavutil/fifo.h"
#include "libavutil/opt.h" #include "libavutil/opt.h"
#include "libavcodec/avcodec.h" #include "libavcodec/avcodec.h"
#include "libavformat/timefilter.h" #include "libavformat/avformat.h"
#include "timefilter.h"
#include "avdevice.h" #include "avdevice.h"
/** /**


@ -24,8 +24,8 @@
#include "config.h" #include "config.h"
#include "avformat.h"
#include "timefilter.h" #include "timefilter.h"
#include "libavutil/mem.h"
struct TimeFilter { struct TimeFilter {
/// Delay Locked Loop data. These variables refer to mathematical /// Delay Locked Loop data. These variables refer to mathematical


@ -22,8 +22,8 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/ */
#ifndef AVFORMAT_TIMEFILTER_H #ifndef AVDEVICE_TIMEFILTER_H
#define AVFORMAT_TIMEFILTER_H #define AVDEVICE_TIMEFILTER_H
/** /**
* Opaque type representing a time filter state * Opaque type representing a time filter state
@ -94,4 +94,4 @@ void ff_timefilter_reset(TimeFilter *);
*/ */
void ff_timefilter_destroy(TimeFilter *); void ff_timefilter_destroy(TimeFilter *);
#endif /* AVFORMAT_TIMEFILTER_H */ #endif /* AVDEVICE_TIMEFILTER_H */


@ -354,11 +354,8 @@ OBJS-$(CONFIG_RTP_PROTOCOL) += rtpproto.o
OBJS-$(CONFIG_TCP_PROTOCOL) += tcp.o OBJS-$(CONFIG_TCP_PROTOCOL) += tcp.o
OBJS-$(CONFIG_UDP_PROTOCOL) += udp.o OBJS-$(CONFIG_UDP_PROTOCOL) += udp.o
# libavdevice dependencies
OBJS-$(CONFIG_ALSA_INDEV) += timefilter.o
OBJS-$(CONFIG_JACK_INDEV) += timefilter.o
TESTPROGS = seek timefilter TESTPROGS = seek
TOOLS = pktdumper probetest TOOLS = pktdumper probetest
include $(SRC_PATH)/subdir.mak include $(SRC_PATH)/subdir.mak


@ -228,8 +228,9 @@ static int amf_parse_object(AVFormatContext *s, AVStream *astream, AVStream *vst
case AMF_DATA_TYPE_OBJECT: { case AMF_DATA_TYPE_OBJECT: {
unsigned int keylen; unsigned int keylen;
if (vstream && ioc->seekable && key && !strcmp(KEYFRAMES_TAG, key) && depth == 1) if ((vstream || astream) && ioc->seekable && key && !strcmp(KEYFRAMES_TAG, key) && depth == 1)
if (parse_keyframes_index(s, ioc, vstream, max_pos) < 0) if (parse_keyframes_index(s, ioc, vstream ? vstream : astream,
max_pos) < 0)
av_log(s, AV_LOG_ERROR, "Keyframe index parsing failed\n"); av_log(s, AV_LOG_ERROR, "Keyframe index parsing failed\n");
while(avio_tell(ioc) < max_pos - 2 && (keylen = avio_rb16(ioc))) { while(avio_tell(ioc) < max_pos - 2 && (keylen = avio_rb16(ioc))) {


@ -60,10 +60,13 @@ typedef struct FLVContext {
int64_t duration_offset; int64_t duration_offset;
int64_t filesize_offset; int64_t filesize_offset;
int64_t duration; int64_t duration;
int delay; ///< first dts delay for AVC
int64_t last_ts;
} FLVContext; } FLVContext;
typedef struct FLVStreamContext {
int delay; ///< first dts delay for each stream (needed for AVC & Speex)
int64_t last_ts; ///< last timestamp for each stream
} FLVStreamContext;
static int get_audio_flags(AVCodecContext *enc){ static int get_audio_flags(AVCodecContext *enc){
int flags = (enc->bits_per_coded_sample == 16) ? FLV_SAMPLESSIZE_16BIT : FLV_SAMPLESSIZE_8BIT; int flags = (enc->bits_per_coded_sample == 16) ? FLV_SAMPLESSIZE_16BIT : FLV_SAMPLESSIZE_8BIT;
@ -182,6 +185,7 @@ static int flv_write_header(AVFormatContext *s)
for(i=0; i<s->nb_streams; i++){ for(i=0; i<s->nb_streams; i++){
AVCodecContext *enc = s->streams[i]->codec; AVCodecContext *enc = s->streams[i]->codec;
FLVStreamContext *sc;
if (enc->codec_type == AVMEDIA_TYPE_VIDEO) { if (enc->codec_type == AVMEDIA_TYPE_VIDEO) {
if (s->streams[i]->r_frame_rate.den && s->streams[i]->r_frame_rate.num) { if (s->streams[i]->r_frame_rate.den && s->streams[i]->r_frame_rate.num) {
framerate = av_q2d(s->streams[i]->r_frame_rate); framerate = av_q2d(s->streams[i]->r_frame_rate);
@ -199,6 +203,12 @@ static int flv_write_header(AVFormatContext *s)
return -1; return -1;
} }
av_set_pts_info(s->streams[i], 32, 1, 1000); /* 32 bit pts in ms */ av_set_pts_info(s->streams[i], 32, 1, 1000); /* 32 bit pts in ms */
sc = av_mallocz(sizeof(FLVStreamContext));
if (!sc)
return AVERROR(ENOMEM);
s->streams[i]->priv_data = sc;
sc->last_ts = -1;
} }
avio_write(pb, "FLV", 3); avio_write(pb, "FLV", 3);
avio_w8(pb,1); avio_w8(pb,1);
@ -218,8 +228,6 @@ static int flv_write_header(AVFormatContext *s)
} }
} }
flv->last_ts = -1;
/* write meta_tag */ /* write meta_tag */
avio_w8(pb, 18); // tag type META avio_w8(pb, 18); // tag type META
metadata_size_pos= avio_tell(pb); metadata_size_pos= avio_tell(pb);
@ -361,9 +369,10 @@ static int flv_write_trailer(AVFormatContext *s)
/* Add EOS tag */ /* Add EOS tag */
for (i = 0; i < s->nb_streams; i++) { for (i = 0; i < s->nb_streams; i++) {
AVCodecContext *enc = s->streams[i]->codec; AVCodecContext *enc = s->streams[i]->codec;
FLVStreamContext *sc = s->streams[i]->priv_data;
if (enc->codec_type == AVMEDIA_TYPE_VIDEO && if (enc->codec_type == AVMEDIA_TYPE_VIDEO &&
(enc->codec_id == CODEC_ID_H264 || enc->codec_id == CODEC_ID_MPEG4)) { (enc->codec_id == CODEC_ID_H264 || enc->codec_id == CODEC_ID_MPEG4)) {
put_avc_eos_tag(pb, flv->last_ts); put_avc_eos_tag(pb, sc->last_ts);
} }
} }
@ -384,6 +393,7 @@ static int flv_write_packet(AVFormatContext *s, AVPacket *pkt)
AVIOContext *pb = s->pb; AVIOContext *pb = s->pb;
AVCodecContext *enc = s->streams[pkt->stream_index]->codec; AVCodecContext *enc = s->streams[pkt->stream_index]->codec;
FLVContext *flv = s->priv_data; FLVContext *flv = s->priv_data;
FLVStreamContext *sc = s->streams[pkt->stream_index]->priv_data;
unsigned ts; unsigned ts;
int size= pkt->size; int size= pkt->size;
uint8_t *data= NULL; uint8_t *data= NULL;
@ -434,20 +444,20 @@ static int flv_write_packet(AVFormatContext *s, AVPacket *pkt)
av_log(s, AV_LOG_ERROR, "malformated aac bitstream, use -absf aac_adtstoasc\n"); av_log(s, AV_LOG_ERROR, "malformated aac bitstream, use -absf aac_adtstoasc\n");
return -1; return -1;
} }
if (!flv->delay && pkt->dts < 0) if (!sc->delay && pkt->dts < 0)
flv->delay = -pkt->dts; sc->delay = -pkt->dts;
ts = pkt->dts + flv->delay; // add delay to force positive dts ts = pkt->dts + sc->delay; // add delay to force positive dts
/* check Speex packet duration */ /* check Speex packet duration */
if (enc->codec_id == CODEC_ID_SPEEX && ts - flv->last_ts > 160) { if (enc->codec_id == CODEC_ID_SPEEX && ts - sc->last_ts > 160) {
av_log(s, AV_LOG_WARNING, "Warning: Speex stream has more than " av_log(s, AV_LOG_WARNING, "Warning: Speex stream has more than "
"8 frames per packet. Adobe Flash " "8 frames per packet. Adobe Flash "
"Player cannot handle this!\n"); "Player cannot handle this!\n");
} }
if (flv->last_ts < ts) if (sc->last_ts < ts)
flv->last_ts = ts; sc->last_ts = ts;
avio_wb24(pb,size + flags_size); avio_wb24(pb,size + flags_size);
avio_wb24(pb,ts); avio_wb24(pb,ts);
@ -471,7 +481,7 @@ static int flv_write_packet(AVFormatContext *s, AVPacket *pkt)
avio_write(pb, data ? data : pkt->data, size); avio_write(pb, data ? data : pkt->data, size);
avio_wb32(pb,size+flags_size+11); // previous tag size avio_wb32(pb,size+flags_size+11); // previous tag size
flv->duration = FFMAX(flv->duration, pkt->pts + flv->delay + pkt->duration); flv->duration = FFMAX(flv->duration, pkt->pts + sc->delay + pkt->duration);
avio_flush(pb); avio_flush(pb);


@ -35,6 +35,7 @@
#include "riff.h" #include "riff.h"
#include "isom.h" #include "isom.h"
#include "libavcodec/get_bits.h" #include "libavcodec/get_bits.h"
#include "id3v1.h"
#if CONFIG_ZLIB #if CONFIG_ZLIB
#include <zlib.h> #include <zlib.h>
@ -99,7 +100,7 @@ static int mov_metadata_track_or_disc_number(MOVContext *c, AVIOContext *pb,
return 0; return 0;
} }
static int mov_metadata_int8(MOVContext *c, AVIOContext *pb, static int mov_metadata_int8_bypass_padding(MOVContext *c, AVIOContext *pb,
unsigned len, const char *key) unsigned len, const char *key)
{ {
char buf[16]; char buf[16];
@ -115,7 +116,7 @@ static int mov_metadata_int8(MOVContext *c, AVIOContext *pb,
return 0; return 0;
} }
static int mov_metadata_stik(MOVContext *c, AVIOContext *pb, static int mov_metadata_int8_no_padding(MOVContext *c, AVIOContext *pb,
unsigned len, const char *key) unsigned len, const char *key)
{ {
char buf[16]; char buf[16];
@ -126,6 +127,23 @@ static int mov_metadata_stik(MOVContext *c, AVIOContext *pb,
return 0; return 0;
} }
static int mov_metadata_gnre(MOVContext *c, AVIOContext *pb,
unsigned len, const char *key)
{
short genre;
char buf[20];
avio_r8(pb); // unknown
genre = avio_r8(pb);
if (genre < 1 || genre > ID3v1_GENRE_MAX)
return 0;
snprintf(buf, sizeof(buf), "%s", ff_id3v1_genre_str[genre-1]);
av_dict_set(&c->fc->metadata, key, buf, 0);
return 0;
}
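The 'gnre' data atom carries a 16-bit value whose low byte is a 1-based index into the ID3v1 genre table, which is why the parser skips the first byte and subtracts one before the lookup. A small standalone sketch of that mapping (illustrative, not part of the patch):

/* Map a 1-based 'gnre' value onto a 0-based genre table; returns NULL for
 * out-of-range values, matching the early return above. */
static const char *gnre_to_name(unsigned genre, const char *const *table,
                                unsigned table_size)
{
    if (genre < 1 || genre > table_size)
        return NULL;
    return table[genre - 1];   /* 1-based atom value -> 0-based table index */
}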
static const uint32_t mac_to_unicode[128] = { static const uint32_t mac_to_unicode[128] = {
0x00C4,0x00C5,0x00C7,0x00C9,0x00D1,0x00D6,0x00DC,0x00E1, 0x00C4,0x00C5,0x00C7,0x00C9,0x00D1,0x00D6,0x00DC,0x00E1,
0x00E0,0x00E2,0x00E4,0x00E3,0x00E5,0x00E7,0x00E9,0x00E8, 0x00E0,0x00E2,0x00E4,0x00E3,0x00E5,0x00E7,0x00E9,0x00E8,
@ -189,6 +207,8 @@ static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom)
case MKTAG(0xa9,'a','l','b'): key = "album"; break; case MKTAG(0xa9,'a','l','b'): key = "album"; break;
case MKTAG(0xa9,'d','a','y'): key = "date"; break; case MKTAG(0xa9,'d','a','y'): key = "date"; break;
case MKTAG(0xa9,'g','e','n'): key = "genre"; break; case MKTAG(0xa9,'g','e','n'): key = "genre"; break;
case MKTAG( 'g','n','r','e'): key = "genre";
parse = mov_metadata_gnre; break;
case MKTAG(0xa9,'t','o','o'): case MKTAG(0xa9,'t','o','o'):
case MKTAG(0xa9,'s','w','r'): key = "encoder"; break; case MKTAG(0xa9,'s','w','r'): key = "encoder"; break;
case MKTAG(0xa9,'e','n','c'): key = "encoder"; break; case MKTAG(0xa9,'e','n','c'): key = "encoder"; break;
@ -202,11 +222,15 @@ static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom)
case MKTAG( 'd','i','s','k'): key = "disc"; case MKTAG( 'd','i','s','k'): key = "disc";
parse = mov_metadata_track_or_disc_number; break; parse = mov_metadata_track_or_disc_number; break;
case MKTAG( 't','v','e','s'): key = "episode_sort"; case MKTAG( 't','v','e','s'): key = "episode_sort";
parse = mov_metadata_int8; break; parse = mov_metadata_int8_bypass_padding; break;
case MKTAG( 't','v','s','n'): key = "season_number"; case MKTAG( 't','v','s','n'): key = "season_number";
parse = mov_metadata_int8; break; parse = mov_metadata_int8_bypass_padding; break;
case MKTAG( 's','t','i','k'): key = "media_type"; case MKTAG( 's','t','i','k'): key = "media_type";
parse = mov_metadata_stik; break; parse = mov_metadata_int8_no_padding; break;
case MKTAG( 'h','d','v','d'): key = "hd_video";
parse = mov_metadata_int8_no_padding; break;
case MKTAG( 'p','g','a','p'): key = "gapless_playback";
parse = mov_metadata_int8_no_padding; break;
} }
if (c->itunes_metadata && atom.size > 8) { if (c->itunes_metadata && atom.size > 8) {


@ -859,6 +859,29 @@ const AVPixFmtDescriptor av_pix_fmt_descriptors[PIX_FMT_NB] = {
}, },
.flags = PIX_FMT_BE, .flags = PIX_FMT_BE,
}, },
[PIX_FMT_YUV422P9LE] = {
.name = "yuv422p9le",
.nb_components= 3,
.log2_chroma_w= 1,
.log2_chroma_h= 0,
.comp = {
{0,1,1,0,8}, /* Y */
{1,1,1,0,8}, /* U */
{2,1,1,0,8}, /* V */
},
},
[PIX_FMT_YUV422P9BE] = {
.name = "yuv422p9be",
.nb_components= 3,
.log2_chroma_w= 1,
.log2_chroma_h= 0,
.comp = {
{0,1,1,0,8}, /* Y */
{1,1,1,0,8}, /* U */
{2,1,1,0,8}, /* V */
},
.flags = PIX_FMT_BE,
},
[PIX_FMT_YUV422P10LE] = { [PIX_FMT_YUV422P10LE] = {
.name = "yuv422p10le", .name = "yuv422p10le",
.nb_components= 3, .nb_components= 3,


@ -149,12 +149,15 @@ enum PixelFormat {
PIX_FMT_YUV444P9LE, ///< planar YUV 4:4:4, 27bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian PIX_FMT_YUV444P9LE, ///< planar YUV 4:4:4, 27bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
PIX_FMT_YUV444P10BE,///< planar YUV 4:4:4, 30bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian PIX_FMT_YUV444P10BE,///< planar YUV 4:4:4, 30bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian
PIX_FMT_YUV444P10LE,///< planar YUV 4:4:4, 30bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian PIX_FMT_YUV444P10LE,///< planar YUV 4:4:4, 30bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
PIX_FMT_YUV422P9BE, ///< planar YUV 4:2:2, 18bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
PIX_FMT_YUV422P9LE, ///< planar YUV 4:2:2, 18bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
PIX_FMT_RGBA64BE, ///< packed RGBA 16:16:16:16, 64bpp, 16R, 16G, 16B, 16A, the 2-byte value for each R/G/B/A component is stored as big-endian PIX_FMT_RGBA64BE, ///< packed RGBA 16:16:16:16, 64bpp, 16R, 16G, 16B, 16A, the 2-byte value for each R/G/B/A component is stored as big-endian
PIX_FMT_RGBA64LE, ///< packed RGBA 16:16:16:16, 64bpp, 16R, 16G, 16B, 16A, the 2-byte value for each R/G/B/A component is stored as little-endian PIX_FMT_RGBA64LE, ///< packed RGBA 16:16:16:16, 64bpp, 16R, 16G, 16B, 16A, the 2-byte value for each R/G/B/A component is stored as little-endian
PIX_FMT_BGRA64BE, ///< packed RGBA 16:16:16:16, 64bpp, 16B, 16G, 16R, 16A, the 2-byte value for each R/G/B/A component is stored as big-endian PIX_FMT_BGRA64BE, ///< packed RGBA 16:16:16:16, 64bpp, 16B, 16G, 16R, 16A, the 2-byte value for each R/G/B/A component is stored as big-endian
PIX_FMT_BGRA64LE, ///< packed RGBA 16:16:16:16, 64bpp, 16B, 16G, 16R, 16A, the 2-byte value for each R/G/B/A component is stored as little-endian PIX_FMT_BGRA64LE, ///< packed RGBA 16:16:16:16, 64bpp, 16B, 16G, 16R, 16A, the 2-byte value for each R/G/B/A component is stored as little-endian
PIX_FMT_GBR24P, ///< planar GBR, 24bpp, 8G, 8B, 8R. PIX_FMT_GBR24P, ///< planar GBR, 24bpp, 8G, 8B, 8R.
PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
}; };
@ -182,6 +185,7 @@ enum PixelFormat {
#define PIX_FMT_BGR444 PIX_FMT_NE(BGR444BE, BGR444LE) #define PIX_FMT_BGR444 PIX_FMT_NE(BGR444BE, BGR444LE)
#define PIX_FMT_YUV420P9 PIX_FMT_NE(YUV420P9BE , YUV420P9LE) #define PIX_FMT_YUV420P9 PIX_FMT_NE(YUV420P9BE , YUV420P9LE)
#define PIX_FMT_YUV422P9 PIX_FMT_NE(YUV422P9BE , YUV422P9LE)
#define PIX_FMT_YUV444P9 PIX_FMT_NE(YUV444P9BE , YUV444P9LE) #define PIX_FMT_YUV444P9 PIX_FMT_NE(YUV444P9BE , YUV444P9LE)
#define PIX_FMT_YUV420P10 PIX_FMT_NE(YUV420P10BE, YUV420P10LE) #define PIX_FMT_YUV420P10 PIX_FMT_NE(YUV420P10BE, YUV420P10LE)
#define PIX_FMT_YUV422P10 PIX_FMT_NE(YUV422P10BE, YUV422P10LE) #define PIX_FMT_YUV422P10 PIX_FMT_NE(YUV422P10BE, YUV422P10LE)


@ -536,6 +536,18 @@
%endif %endif
%endmacro %endmacro
%macro SPLATD_MMX 1
punpckldq %1, %1
%endmacro
%macro SPLATD_SSE 1
shufps %1, %1, 0
%endmacro
%macro SPLATD_SSE2 1
pshufd %1, %1, 0
%endmacro
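The new SPLATD helpers broadcast the low 32-bit element of a register into every lane. For reference, a rough C-intrinsics equivalent of the SSE2 flavour (illustrative only):

#include <emmintrin.h>   /* SSE2 intrinsics */

/* What "pshufd v, v, 0" (SPLATD_SSE2) does: replicate dword 0 into all lanes. */
static inline __m128i splatd_sse2(__m128i v)
{
    return _mm_shuffle_epi32(v, 0);
}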
%macro CLIPW 3 ;(dst, min, max) %macro CLIPW 3 ;(dst, min, max)
pmaxsw %1, %2 pmaxsw %1, %2
pminsw %1, %3 pminsw %1, %3


@ -2843,6 +2843,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break; case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
#if HAVE_BIGENDIAN #if HAVE_BIGENDIAN
case PIX_FMT_YUV444P9LE: case PIX_FMT_YUV444P9LE:
case PIX_FMT_YUV422P9LE:
case PIX_FMT_YUV420P9LE: case PIX_FMT_YUV420P9LE:
case PIX_FMT_YUV422P10LE: case PIX_FMT_YUV422P10LE:
case PIX_FMT_YUV420P10LE: case PIX_FMT_YUV420P10LE:
@ -2852,6 +2853,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break; case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
#else #else
case PIX_FMT_YUV444P9BE: case PIX_FMT_YUV444P9BE:
case PIX_FMT_YUV422P9BE:
case PIX_FMT_YUV420P9BE: case PIX_FMT_YUV420P9BE:
case PIX_FMT_YUV444P10BE: case PIX_FMT_YUV444P10BE:
case PIX_FMT_YUV422P10BE: case PIX_FMT_YUV422P10BE:
@ -2912,6 +2914,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
switch (srcFormat) { switch (srcFormat) {
#if HAVE_BIGENDIAN #if HAVE_BIGENDIAN
case PIX_FMT_YUV444P9LE: case PIX_FMT_YUV444P9LE:
case PIX_FMT_YUV422P9LE:
case PIX_FMT_YUV420P9LE: case PIX_FMT_YUV420P9LE:
case PIX_FMT_YUV422P10LE: case PIX_FMT_YUV422P10LE:
case PIX_FMT_YUV420P10LE: case PIX_FMT_YUV420P10LE:
@ -2922,6 +2925,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break; case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
#else #else
case PIX_FMT_YUV444P9BE: case PIX_FMT_YUV444P9BE:
case PIX_FMT_YUV422P9BE:
case PIX_FMT_YUV420P9BE: case PIX_FMT_YUV420P9BE:
case PIX_FMT_YUV444P10BE: case PIX_FMT_YUV444P10BE:
case PIX_FMT_YUV422P10BE: case PIX_FMT_YUV422P10BE:


@ -547,6 +547,8 @@ const char *sws_format_name(enum PixelFormat format);
#define isNBPS(x) ( \ #define isNBPS(x) ( \
(x)==PIX_FMT_YUV420P9LE \ (x)==PIX_FMT_YUV420P9LE \
|| (x)==PIX_FMT_YUV420P9BE \ || (x)==PIX_FMT_YUV420P9BE \
|| (x)==PIX_FMT_YUV422P9LE \
|| (x)==PIX_FMT_YUV422P9BE \
|| (x)==PIX_FMT_YUV444P9BE \ || (x)==PIX_FMT_YUV444P9BE \
|| (x)==PIX_FMT_YUV444P9LE \ || (x)==PIX_FMT_YUV444P9LE \
|| (x)==PIX_FMT_YUV422P10BE \ || (x)==PIX_FMT_YUV422P10BE \
@ -574,6 +576,7 @@ const char *sws_format_name(enum PixelFormat format);
#define isPlanarYUV(x) ( \ #define isPlanarYUV(x) ( \
isPlanar8YUV(x) \ isPlanar8YUV(x) \
|| (x)==PIX_FMT_YUV420P9LE \ || (x)==PIX_FMT_YUV420P9LE \
|| (x)==PIX_FMT_YUV422P9LE \
|| (x)==PIX_FMT_YUV444P9LE \ || (x)==PIX_FMT_YUV444P9LE \
|| (x)==PIX_FMT_YUV420P10LE \ || (x)==PIX_FMT_YUV420P10LE \
|| (x)==PIX_FMT_YUV422P10LE \ || (x)==PIX_FMT_YUV422P10LE \
@ -583,6 +586,7 @@ const char *sws_format_name(enum PixelFormat format);
|| (x)==PIX_FMT_YUV422P16LE \ || (x)==PIX_FMT_YUV422P16LE \
|| (x)==PIX_FMT_YUV444P16LE \ || (x)==PIX_FMT_YUV444P16LE \
|| (x)==PIX_FMT_YUV420P9BE \ || (x)==PIX_FMT_YUV420P9BE \
|| (x)==PIX_FMT_YUV422P9BE \
|| (x)==PIX_FMT_YUV444P9BE \ || (x)==PIX_FMT_YUV444P9BE \
|| (x)==PIX_FMT_YUV420P10BE \ || (x)==PIX_FMT_YUV420P10BE \
|| (x)==PIX_FMT_YUV422P10BE \ || (x)==PIX_FMT_YUV422P10BE \


@ -136,6 +136,8 @@ const static FormatEntry format_entries[PIX_FMT_NB] = {
[PIX_FMT_YUV420P9LE] = { 1 , 1 }, [PIX_FMT_YUV420P9LE] = { 1 , 1 },
[PIX_FMT_YUV420P10BE] = { 1 , 1 }, [PIX_FMT_YUV420P10BE] = { 1 , 1 },
[PIX_FMT_YUV420P10LE] = { 1 , 1 }, [PIX_FMT_YUV420P10LE] = { 1 , 1 },
[PIX_FMT_YUV422P9BE] = { 1 , 1 },
[PIX_FMT_YUV422P9LE] = { 1 , 1 },
[PIX_FMT_YUV422P10BE] = { 1 , 1 }, [PIX_FMT_YUV422P10BE] = { 1 , 1 },
[PIX_FMT_YUV422P10LE] = { 1 , 1 }, [PIX_FMT_YUV422P10LE] = { 1 , 1 },
[PIX_FMT_YUV444P9BE] = { 1 , 1 }, [PIX_FMT_YUV444P9BE] = { 1 , 1 },
@ -280,15 +282,18 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
if (flags & SWS_BICUBIC) { if (flags & SWS_BICUBIC) {
int64_t B= (param[0] != SWS_PARAM_DEFAULT ? param[0] : 0) * (1<<24); int64_t B= (param[0] != SWS_PARAM_DEFAULT ? param[0] : 0) * (1<<24);
int64_t C= (param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6) * (1<<24); int64_t C= (param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6) * (1<<24);
int64_t dd = ( d*d)>>30;
int64_t ddd= (dd*d)>>30; if (d >= 1LL<<31) {
coeff = 0.0;
} else {
int64_t dd = (d * d) >> 30;
int64_t ddd = (dd * d) >> 30;
if (d < 1LL<<30) if (d < 1LL<<30)
coeff = (12*(1<<24)-9*B-6*C)*ddd + (-18*(1<<24)+12*B+6*C)*dd + (6*(1<<24)-2*B)*(1<<30); coeff = (12*(1<<24)-9*B-6*C)*ddd + (-18*(1<<24)+12*B+6*C)*dd + (6*(1<<24)-2*B)*(1<<30);
else if (d < 1LL<<31)
coeff = (-B-6*C)*ddd + (6*B+30*C)*dd + (-12*B-48*C)*d + (8*B+24*C)*(1<<30);
else else
coeff=0.0; coeff = (-B-6*C)*ddd + (6*B+30*C)*dd + (-12*B-48*C)*d + (8*B+24*C)*(1<<30);
}
coeff *= fone>>(30+24); coeff *= fone>>(30+24);
} }
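The coefficients in the branch above are the standard bicubic (Mitchell-Netravali) kernel evaluated at distance d, with B and C in 24-bit fixed point and d scaled by 2^30; the rewrite just avoids computing d*d for out-of-range distances, where the multiply could overflow. A floating-point reference of the same kernel, up to the normalization applied later (illustrative):

#include <math.h>

/* Mitchell-Netravali bicubic kernel; B and C are the filter parameters,
 * d the non-negative distance from the sample position. */
static double bicubic_coeff(double d, double B, double C)
{
    if (d < 1.0)
        return ((12 - 9*B - 6*C) * d*d*d
              + (-18 + 12*B + 6*C) * d*d
              + (6 - 2*B)) / 6.0;
    if (d < 2.0)
        return ((-B - 6*C) * d*d*d
              + (6*B + 30*C) * d*d
              + (-12*B - 48*C) * d
              + (8*B + 24*C)) / 6.0;
    return 0.0;
}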
/* else if (flags & SWS_X) { /* else if (flags & SWS_X) {


@ -790,8 +790,8 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4], int
y_table32 = c->yuvTable; y_table32 = c->yuvTable;
yb = -(384<<16) - oy; yb = -(384<<16) - oy;
for (i = 0; i < 1024; i++) { for (i = 0; i < 1024; i++) {
uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16); unsigned yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table32[i ] = (yval << rbase) + (needAlpha ? 0 : (255 << abase)); y_table32[i ] = (yval << rbase) + (needAlpha ? 0 : (255u << abase));
y_table32[i+1024] = yval << gbase; y_table32[i+1024] = yval << gbase;
y_table32[i+2048] = yval << bbase; y_table32[i+2048] = yval << bbase;
yb += cy; yb += cy;
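The switch to an unsigned yval and the 255u constant avoids left-shifting a signed int past its value range: abase can be 24, and 255 << 24 does not fit in a 32-bit signed int, which is undefined behaviour in C. A minimal illustration of the difference:

#include <stdint.h>

/* With a 32-bit int, 255 << 24 == 4278190080 exceeds INT_MAX, so the signed
 * shift is undefined behaviour; the unsigned form yields the intended bits. */
static uint32_t alpha_fill(unsigned abase)
{
    /* return 255 << abase;      would be UB once abase reaches 24 */
    return 255u << abase;        /* well-defined for abase <= 31 */
}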


@ -42,6 +42,8 @@ yuv422p10be bdc13b630fd668b34c6fe1aae28dfc71
yuv422p10le d0607c260a45c973e6639f4e449730ad yuv422p10le d0607c260a45c973e6639f4e449730ad
yuv422p16be 4e9b3b3467aeebb6a528cee5966800ed yuv422p16be 4e9b3b3467aeebb6a528cee5966800ed
yuv422p16le f87c81bf16916b64d201359be0b4b6f4 yuv422p16le f87c81bf16916b64d201359be0b4b6f4
yuv422p9be 29b71579946940a8c00fa844c9dff507
yuv422p9le 062b7f9cbb972bf36b5bdb1a7623701a
yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf
yuv444p 0a98447b78fd476aa39686da6a74fa2e yuv444p 0a98447b78fd476aa39686da6a74fa2e
yuv444p10be e65cbae7e4f1892c23defbc8e8052cf6 yuv444p10be e65cbae7e4f1892c23defbc8e8052cf6

View File

@ -42,6 +42,8 @@ yuv422p10be bdc13b630fd668b34c6fe1aae28dfc71
yuv422p10le d0607c260a45c973e6639f4e449730ad yuv422p10le d0607c260a45c973e6639f4e449730ad
yuv422p16be 4e9b3b3467aeebb6a528cee5966800ed yuv422p16be 4e9b3b3467aeebb6a528cee5966800ed
yuv422p16le f87c81bf16916b64d201359be0b4b6f4 yuv422p16le f87c81bf16916b64d201359be0b4b6f4
yuv422p9be 29b71579946940a8c00fa844c9dff507
yuv422p9le 062b7f9cbb972bf36b5bdb1a7623701a
yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf
yuv444p 0a98447b78fd476aa39686da6a74fa2e yuv444p 0a98447b78fd476aa39686da6a74fa2e
yuv444p10be e65cbae7e4f1892c23defbc8e8052cf6 yuv444p10be e65cbae7e4f1892c23defbc8e8052cf6

View File

@ -42,6 +42,8 @@ yuv422p10be bdc13b630fd668b34c6fe1aae28dfc71
yuv422p10le d0607c260a45c973e6639f4e449730ad yuv422p10le d0607c260a45c973e6639f4e449730ad
yuv422p16be 4e9b3b3467aeebb6a528cee5966800ed yuv422p16be 4e9b3b3467aeebb6a528cee5966800ed
yuv422p16le f87c81bf16916b64d201359be0b4b6f4 yuv422p16le f87c81bf16916b64d201359be0b4b6f4
yuv422p9be 29b71579946940a8c00fa844c9dff507
yuv422p9le 062b7f9cbb972bf36b5bdb1a7623701a
yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf
yuv444p 0a98447b78fd476aa39686da6a74fa2e yuv444p 0a98447b78fd476aa39686da6a74fa2e
yuv444p10be e65cbae7e4f1892c23defbc8e8052cf6 yuv444p10be e65cbae7e4f1892c23defbc8e8052cf6


@ -42,6 +42,8 @@ yuv422p10be cea7ca6b0e66d6f29539885896c88603
yuv422p10le a10c4a5837547716f13cd61918b145f9 yuv422p10le a10c4a5837547716f13cd61918b145f9
yuv422p16be 285993ee0c0f4f8e511ee46f93c5f38c yuv422p16be 285993ee0c0f4f8e511ee46f93c5f38c
yuv422p16le 61bfcee8e54465f760164f5a75d40b5e yuv422p16le 61bfcee8e54465f760164f5a75d40b5e
yuv422p9be 82494823944912f73cebc58ad2979bbd
yuv422p9le fc69c8a21f473916a4b4225636b97e06
yuv440p 461503fdb9b90451020aa3b25ddf041c yuv440p 461503fdb9b90451020aa3b25ddf041c
yuv444p 81b2eba962d12e8d64f003ac56f6faf2 yuv444p 81b2eba962d12e8d64f003ac56f6faf2
yuv444p10be e9d3c8e744b8b0d8187ca092fa203fc9 yuv444p10be e9d3c8e744b8b0d8187ca092fa203fc9


@ -42,6 +42,8 @@ yuv422p10be 588fe319b96513c32e21d3e32b45447f
yuv422p10le 11b57f2bd9661024153f3973b9090cdb yuv422p10le 11b57f2bd9661024153f3973b9090cdb
yuv422p16be c092d083548c2a144c372a98c46875c7 yuv422p16be c092d083548c2a144c372a98c46875c7
yuv422p16le c071b9397a416d51cbe339345cbcba84 yuv422p16le c071b9397a416d51cbe339345cbcba84
yuv422p9be 7c6f1e140b3999ee7d923854e507752a
yuv422p9le 51f10d79c07989060dd06e767e6d7d60
yuv440p 876385e96165acf51271b20e5d85a416 yuv440p 876385e96165acf51271b20e5d85a416
yuv444p 9c3c667d1613b72d15bc6d851c5eb8f7 yuv444p 9c3c667d1613b72d15bc6d851c5eb8f7
yuv444p10be 944a4997c4edb3a8dd0f0493cfd5a1fd yuv444p10be 944a4997c4edb3a8dd0f0493cfd5a1fd