Add the notion of pixel size in h264 related functions.

In high bit depth the pixels will not be stored in uint8_t like in the normal case, but in uint16_t. The pixel size is thus 1 in normal bit depth and 2 in high bit depth. Preparatory patch for high bit depth h264 decoding support. Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
2024-12-23 12:43:46 +02:00 · 2011-03-29 17:48:57 +02:00 · 2011-03-29 17:48:57 +02:00 · dc172ecc6e
commit dc172ecc6e
parent 86b0d9cd58
7 changed files with 238 additions and 149 deletions
--- a/ffplay.c
+++ b/ffplay.c
@ -1582,6 +1582,7 @@ static int input_get_buffer(AVCodecContext *codec, AVFrame *pic)
    int perms = AV_PERM_WRITE;
    int i, w, h, stride[4];
    unsigned edge;
+    int pixel_size;

    av_assert0(codec->flags & CODEC_FLAG_EMU_EDGE);

@ -1609,6 +1610,7 @@ static int input_get_buffer(AVCodecContext *codec, AVFrame *pic)
    if(!(ref = avfilter_get_video_buffer(ctx->outputs[0], perms, w, h)))
        return -1;

+    pixel_size = av_pix_fmt_descriptors[ref->format].comp[0].step_minus1+1;
    ref->video->w = codec->width;
    ref->video->h = codec->height;
    for(i = 0; i < 4; i ++) {
@ -1616,7 +1618,7 @@ static int input_get_buffer(AVCodecContext *codec, AVFrame *pic)
        unsigned vshift = (i == 1 || i == 2) ? av_pix_fmt_descriptors[ref->format].log2_chroma_h : 0;

        if (ref->data[i]) {
-            ref->data[i]    += (edge >> hshift) + ((edge * ref->linesize[i]) >> vshift);
+            ref->data[i]    += ((edge * pixel_size) >> hshift) + ((edge * ref->linesize[i]) >> vshift);
        }
        pic->data[i]     = ref->data[i];
        pic->linesize[i] = ref->linesize[i];
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@ -459,7 +459,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
    const int luma_xy= (mx&3) + ((my&3)<<2);
-    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
+    uint8_t * src_y = pic->data[0] + (mx>>2)*h->pixel_size + (my>>2)*h->mb_linesize;
    uint8_t * src_cb, * src_cr;
    int extra_width= h->emu_edge_width;
    int extra_height= h->emu_edge_height;
@ -476,8 +476,8 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
       || full_my < 0-extra_height
       || full_mx + 16/*FIXME*/ > pic_width + extra_width
       || full_my + 16/*FIXME*/ > pic_height + extra_height){
-        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
-            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
+        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - 2*h->pixel_size - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
+            src_y= s->edge_emu_buffer + 2*h->pixel_size + 2*h->mb_linesize;
        emu=1;
    }

@ -493,8 +493,8 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
        my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
    }
-    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
-    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
+    src_cb= pic->data[1] + (mx>>3)*h->pixel_size + (my>>3)*h->mb_uvlinesize;
+    src_cr= pic->data[2] + (mx>>3)*h->pixel_size + (my>>3)*h->mb_uvlinesize;

    if(emu){
        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
@ -519,9 +519,9 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
    qpel_mc_func *qpix_op=  qpix_put;
    h264_chroma_mc_func chroma_op= chroma_put;

-    dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
-    dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
-    dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
+    dest_y  += 2*x_offset*h->pixel_size + 2*y_offset*h->  mb_linesize;
+    dest_cb +=   x_offset*h->pixel_size +   y_offset*h->mb_uvlinesize;
+    dest_cr +=   x_offset*h->pixel_size +   y_offset*h->mb_uvlinesize;
    x_offset += 8*s->mb_x;
    y_offset += 8*(s->mb_y >> MB_FIELD);

@ -552,9 +552,9 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
                           int list0, int list1){
    MpegEncContext * const s = &h->s;

-    dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
-    dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
-    dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
+    dest_y  += 2*x_offset*h->pixel_size + 2*y_offset*h->  mb_linesize;
+    dest_cb +=   x_offset*h->pixel_size +   y_offset*h->mb_uvlinesize;
+    dest_cr +=   x_offset*h->pixel_size +   y_offset*h->mb_uvlinesize;
    x_offset += 8*s->mb_x;
    y_offset += 8*(s->mb_y >> MB_FIELD);

@ -562,7 +562,7 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
        /* don't optimize for luma-only case, since B-frames usually
         * use implicit weights => chroma too. */
        uint8_t *tmp_cb = s->obmc_scratchpad;
-        uint8_t *tmp_cr = s->obmc_scratchpad + 8;
+        uint8_t *tmp_cr = s->obmc_scratchpad + 8*h->pixel_size;
        uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
        int refn0 = h->ref_cache[0][ scan8[n] ];
        int refn1 = h->ref_cache[1][ scan8[n] ];
@ -637,9 +637,9 @@ static inline void prefetch_motion(H264Context *h, int list){
        const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
        const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
        uint8_t **src= h->ref_list[list][refn].data;
-        int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
+        int off= mx*h->pixel_size + (my + (s->mb_x&3)*4)*h->mb_linesize + 64*h->pixel_size;
        s->dsp.prefetch(src[0]+off, s->linesize, 4);
-        off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
+        off= (mx>>1)*h->pixel_size + ((my>>1)*h->pixel_size + (s->mb_x&7))*s->uvlinesize + 64*h->pixel_size;
        s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
    }
 }
@ -664,11 +664,11 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
                weight_op, weight_avg,
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
-        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
+        mc_part(h, 0, 0, 4, 8*h->pixel_size, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
-        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
+        mc_part(h, 8, 0, 4, 8*h->pixel_size, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
@ -698,11 +698,11 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
-                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                mc_part(h, n  , 0, 2, 4*h->pixel_size, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
-                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
+                mc_part(h, n+2, 0, 2, 4*h->pixel_size, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
@ -900,8 +900,8 @@ static void clone_tables(H264Context *dst, H264Context *src, int i){
 * Allocate buffers which are not shared amongst multiple threads.
 */
 static int context_init(H264Context *h){
-    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t), fail)
-    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t), fail)
+    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t)*2, fail)
+    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t)*2, fail)

    h->ref_cache[0][scan8[5 ]+1] = h->ref_cache[0][scan8[7 ]+1] = h->ref_cache[0][scan8[13]+1] =
    h->ref_cache[1][scan8[5 ]+1] = h->ref_cache[1][scan8[7 ]+1] = h->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE;
@ -1003,6 +1003,8 @@ av_cold int ff_h264_decode_init(AVCodecContext *avctx){

    ff_h264_decode_init_vlc();

+    h->pixel_size = 1;
+
    h->thread_context[0] = h;
    h->outputed_poc = h->next_outputed_poc = INT_MIN;
    h->prev_poc_msb= 1<<16;
@ -1165,14 +1167,14 @@ int ff_h264_frame_start(H264Context *h){
    assert(s->linesize && s->uvlinesize);

    for(i=0; i<16; i++){
-        h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
-        h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[i]= 4*((scan8[i] - scan8[0])&7)*h->pixel_size + 4*s->linesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7)*h->pixel_size + 8*s->linesize*((scan8[i] - scan8[0])>>3);
    }
    for(i=0; i<4; i++){
        h->block_offset[16+i]=
-        h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7)*h->pixel_size + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
        h->block_offset[24+16+i]=
-        h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7)*h->pixel_size + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
    }

    /* can't be in alloc_tables because linesize isn't known there.
@ -1372,9 +1374,16 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
            if(!MB_MBAFF){
                top_border = h->top_borders[0][s->mb_x];
                AV_COPY128(top_border, src_y + 15*linesize);
+                if (h->pixel_size == 2)
+                    AV_COPY128(top_border+16, src_y+15*linesize+16);
                if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
+                    if (h->pixel_size == 2) {
+                        AV_COPY128(top_border+32, src_cb+7*uvlinesize);
+                        AV_COPY128(top_border+48, src_cr+7*uvlinesize);
+                    } else {
                    AV_COPY64(top_border+16, src_cb+7*uvlinesize);
                    AV_COPY64(top_border+24, src_cr+7*uvlinesize);
+                    }
                }
            }
        }else if(MB_MBAFF){
@ -1387,10 +1396,17 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
    // There are two lines saved, the line above the the top macroblock of a pair,
    // and the line above the bottom macroblock
    AV_COPY128(top_border, src_y + 16*linesize);
+    if (h->pixel_size == 2)
+        AV_COPY128(top_border+16, src_y+16*linesize+16);

    if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
+        if (h->pixel_size == 2) {
+            AV_COPY128(top_border+32, src_cb+8*uvlinesize);
+            AV_COPY128(top_border+48, src_cr+8*uvlinesize);
+        } else {
        AV_COPY64(top_border+16, src_cb+8*uvlinesize);
        AV_COPY64(top_border+24, src_cr+8*uvlinesize);
+        }
    }
 }

@ -1419,40 +1435,61 @@ static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_c
        deblock_top =  (s->mb_y > !!MB_FIELD);
    }

-    src_y  -=   linesize + 1;
-    src_cb -= uvlinesize + 1;
-    src_cr -= uvlinesize + 1;
+    src_y  -=   linesize + h->pixel_size;
+    src_cb -= uvlinesize + h->pixel_size;
+    src_cr -= uvlinesize + h->pixel_size;

    top_border_m1 = h->top_borders[top_idx][s->mb_x-1];
    top_border    = h->top_borders[top_idx][s->mb_x];

 #define XCHG(a,b,xchg)\
+    if (h->pixel_size == 2) {\
+        if (xchg) {\
+            AV_SWAP64(b+0,a+0);\
+            AV_SWAP64(b+8,a+8);\
+        } else {\
+            AV_COPY128(b,a); \
+        }\
+    } else \
 if (xchg) AV_SWAP64(b,a);\
 else      AV_COPY64(b,a);

    if(deblock_top){
        if(deblock_left){
-            XCHG(top_border_m1+8, src_y -7, 1);
+            XCHG(top_border_m1+8*h->pixel_size, src_y -7*h->pixel_size, 1);
        }
-        XCHG(top_border+0, src_y +1, xchg);
-        XCHG(top_border+8, src_y +9, 1);
+        XCHG(top_border+0*h->pixel_size, src_y +1*h->pixel_size, xchg);
+        XCHG(top_border+8*h->pixel_size, src_y +9*h->pixel_size, 1);
        if(s->mb_x+1 < s->mb_width){
-            XCHG(h->top_borders[top_idx][s->mb_x+1], src_y +17, 1);
+            XCHG(h->top_borders[top_idx][s->mb_x+1], src_y +17*h->pixel_size, 1);
        }
    }
-
    if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        if(deblock_top){
            if(deblock_left){
-                XCHG(top_border_m1+16, src_cb -7, 1);
-                XCHG(top_border_m1+24, src_cr -7, 1);
+                XCHG(top_border_m1+16*h->pixel_size, src_cb -7*h->pixel_size, 1);
+                XCHG(top_border_m1+24*h->pixel_size, src_cr -7*h->pixel_size, 1);
            }
-            XCHG(top_border+16, src_cb+1, 1);
-            XCHG(top_border+24, src_cr+1, 1);
+            XCHG(top_border+16*h->pixel_size, src_cb+h->pixel_size, 1);
+            XCHG(top_border+24*h->pixel_size, src_cr+h->pixel_size, 1);
        }
    }
 }

+static av_always_inline int dctcoef_get(H264Context *h, DCTELEM *mb, int index) { /* Return coefficient number `index` of block `mb`; the element width is selected by h->pixel_size. */
+    if (h->pixel_size == 1)
+        return mb[index]; /* pixel_size 1: index in DCTELEM units */
+    else
+        return ((int32_t*)mb)[index]; /* otherwise the same buffer is viewed as an int32_t array, so `index` counts int32_t elements */
+}
+
+static av_always_inline void dctcoef_set(H264Context *h, DCTELEM *mb, int index, int value) { /* Store `value` as coefficient number `index` of block `mb`; the element width is selected by h->pixel_size. */
+    if (h->pixel_size == 1)
+        mb[index] = value; /* pixel_size 1: stored as a DCTELEM */
+    else
+        ((int32_t*)mb)[index] = value; /* otherwise stored through an int32_t view of the same buffer */
+}
+
 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
    MpegEncContext * const s = &h->s;
    const int mb_x= s->mb_x;
@ -1469,12 +1506,12 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);

-    dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
-    dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
-    dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
+    dest_y  = s->current_picture.data[0] + (mb_x*h->pixel_size + mb_y * s->linesize  ) * 16;
+    dest_cb = s->current_picture.data[1] + (mb_x*h->pixel_size + mb_y * s->uvlinesize) * 8;
+    dest_cr = s->current_picture.data[2] + (mb_x*h->pixel_size + mb_y * s->uvlinesize) * 8;

-    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
-    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
+    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64*h->pixel_size, s->linesize, 4);
+    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64*h->pixel_size, dest_cr - dest_cb, 2);

    h->list_counts[mb_xy]= h->list_count;

@ -1511,6 +1548,28 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
    }

    if (!simple && IS_INTRA_PCM(mb_type)) {
+        if (h->pixel_size == 2) {
+            const int bit_depth = h->sps.bit_depth_luma;
+            int j;
+            GetBitContext gb;
+            init_get_bits(&gb, (uint8_t*)h->mb, 384*bit_depth);
+
+            for (i = 0; i < 16; i++) {
+                uint16_t *tmp_y  = (uint16_t*)(dest_y  + i*linesize);
+                for (j = 0; j < 16; j++)
+                    tmp_y[j] = get_bits(&gb, bit_depth);
+            }
+            for (i = 0; i < 8; i++) {
+                uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize);
+                for (j = 0; j < 8; j++)
+                    tmp_cb[j] = get_bits(&gb, bit_depth);
+            }
+            for (i = 0; i < 8; i++) {
+                uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize);
+                for (j = 0; j < 8; j++)
+                    tmp_cr[j] = get_bits(&gb, bit_depth);
+            }
+        } else {
        for (i=0; i<16; i++) {
            memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
        }
@ -1518,6 +1577,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
            memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
            memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
        }
+        }
    } else {
        if(IS_INTRA(mb_type)){
            if(h->deblocking_filter)
@ -1542,16 +1602,16 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
-                                h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
+                                h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16*h->pixel_size, linesize);
                            }else{
                                const int nnz = h->non_zero_count_cache[ scan8[i] ];
                                h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                            (h->topright_samples_available<<i)&0x4000, linesize);
                                if(nnz){
-                                    if(nnz == 1 && h->mb[i*16])
-                                        idct_dc_add(ptr, h->mb + i*16, linesize);
+                                    if(nnz == 1 && dctcoef_get(h, h->mb, i*16))
+                                        idct_dc_add(ptr, h->mb + i*16*h->pixel_size, linesize);
                                    else
-                                        idct_add   (ptr, h->mb + i*16, linesize);
+                                        idct_add   (ptr, h->mb + i*16*h->pixel_size, linesize);
                                }
                            }
                        }
@ -1568,18 +1628,24 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];

                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
-                                h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
+                                h->hpc.pred4x4_add[dir](ptr, h->mb + i*16*h->pixel_size, linesize);
                            }else{
                                uint8_t *topright;
                                int nnz, tr;
+                                uint64_t tr_high;
                                if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                                    const int topright_avail= (h->topright_samples_available<<i)&0x8000;
                                    assert(mb_y || linesize <= block_offset[i]);
                                    if(!topright_avail){
+                                        if (h->pixel_size == 2) {
+                                            tr_high= ((uint16_t*)ptr)[3 - linesize/2]*0x0001000100010001ULL;
+                                            topright= (uint8_t*) &tr_high;
+                                        } else {
                                        tr= ptr[3 - linesize]*0x01010101;
                                        topright= (uint8_t*) &tr;
+                                        }
                                    }else
-                                        topright= ptr + 4 - linesize;
+                                        topright= ptr + 4*h->pixel_size - linesize;
                                }else
                                    topright= NULL;

@ -1587,10 +1653,10 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
                                nnz = h->non_zero_count_cache[ scan8[i] ];
                                if(nnz){
                                    if(is_h264){
-                                        if(nnz == 1 && h->mb[i*16])
-                                            idct_dc_add(ptr, h->mb + i*16, linesize);
+                                        if(nnz == 1 && dctcoef_get(h, h->mb, i*16))
+                                            idct_dc_add(ptr, h->mb + i*16*h->pixel_size, linesize);
                                        else
-                                            idct_add   (ptr, h->mb + i*16, linesize);
+                                            idct_add   (ptr, h->mb + i*16*h->pixel_size, linesize);
                                    }
 #if CONFIG_SVQ3_DECODER
                                    else
@ -1611,7 +1677,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
                            static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
                                                                    8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
                            for(i = 0; i < 16; i++)
-                                h->mb[dc_mapping[i]] = h->mb_luma_dc[i];
+                                dctcoef_set(h, h->mb, dc_mapping[i], dctcoef_get(h, h->mb_luma_dc, i));
                        }
                    }
                }
@ -1638,8 +1704,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
                            h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
                        }else{
                            for(i=0; i<16; i++){
-                                if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
-                                    s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
+                                if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h, h->mb, i*16))
+                                    s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16*h->pixel_size, linesize);
                            }
                        }
                    }else{
@ -1651,7 +1717,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
                        idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
                        for(i=0; i<16; i+=di){
                            if(h->non_zero_count_cache[ scan8[i] ]){
-                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
+                                idct_add(dest_y + block_offset[i], h->mb + i*16*h->pixel_size, linesize);
                            }
                        }
                    }else{
@ -1679,21 +1745,21 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
            uint8_t *dest[2] = {dest_cb, dest_cr};
            if(transform_bypass){
                if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
+                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16*h->pixel_size, uvlinesize);
+                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16*h->pixel_size, uvlinesize);
                }else{
                    idct_add = s->dsp.add_pixels4;
                    for(i=16; i<16+8; i++){
-                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
-                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
+                        if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h, h->mb, i*16))
+                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16*h->pixel_size, uvlinesize);
                    }
                }
            }else{
                if(is_h264){
                    if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+0] ])
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16     , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16*h->pixel_size       , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                    if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+1] ])
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16+4*16, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16+4*16)*h->pixel_size, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
                    h->h264dsp.h264_idct_add8(dest, block_offset,
                                              h->mb, uvlinesize,
                                              h->non_zero_count_cache);
@ -2906,9 +2972,9 @@ static void loop_filter(H264Context *h){

                s->mb_x= mb_x;
                s->mb_y= mb_y;
-                dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
-                dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
-                dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
+                dest_y  = s->current_picture.data[0] + (mb_x*h->pixel_size + mb_y * s->linesize  ) * 16;
+                dest_cb = s->current_picture.data[1] + (mb_x*h->pixel_size + mb_y * s->uvlinesize) * 8;
+                dest_cr = s->current_picture.data[2] + (mb_x*h->pixel_size + mb_y * s->uvlinesize) * 8;
                    //FIXME simplify above

                if (MB_FIELD) {
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@ -265,6 +265,7 @@ typedef struct MMCO{
 typedef struct H264Context{
    MpegEncContext s;
    H264DSPContext h264dsp;
+    int pixel_size;
    int chroma_qp[2]; //QPc

    int qp_thresh;      ///< QP threshold to skip loopfilter
@ -296,7 +297,7 @@ typedef struct H264Context{
    unsigned int top_samples_available;
    unsigned int topright_samples_available;
    unsigned int left_samples_available;
-    uint8_t (*top_borders[2])[16+2*8];
+    uint8_t (*top_borders[2])[(16+2*8)*2];

    /**
     * non zero coeff count cache.
@ -406,9 +407,9 @@ typedef struct H264Context{
    GetBitContext *intra_gb_ptr;
    GetBitContext *inter_gb_ptr;

-    DECLARE_ALIGNED(16, DCTELEM, mb)[16*24];
-    DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16];
-    DCTELEM mb_padding[256];        ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
+    DECLARE_ALIGNED(16, DCTELEM, mb)[16*24*2]; ///< as a DCT coefficient is int32_t in high bit depth, we need to reserve twice the space.
+    DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16*2]; ///< NOTE(review): presumably doubled for the same int32_t reason as mb -- confirm
+    DCTELEM mb_padding[256*2];        ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb

    /**
     * Cabac
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@ -1105,40 +1105,47 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT

        int j= scantable[index[--coeff_count]];

-        if( get_cabac( CC, ctx ) == 0 ) {
-            node_ctx = coeff_abs_level_transition[0][node_ctx];
-            if( is_dc ) {
-                block[j] = get_cabac_bypass_sign( CC, -1);
-            }else{
-                block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
-            }
+#define STORE_BLOCK(type) \
+        if( get_cabac( CC, ctx ) == 0 ) { \
+            node_ctx = coeff_abs_level_transition[0][node_ctx]; \
+            if( is_dc ) { \
+                ((type*)block)[j] = get_cabac_bypass_sign( CC, -1); \
+            }else{ \
+                ((type*)block)[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6; \
+            } \
+        } else { \
+            int coeff_abs = 2; \
+            ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base; \
+            node_ctx = coeff_abs_level_transition[1][node_ctx]; \
+\
+            while( coeff_abs < 15 && get_cabac( CC, ctx ) ) { \
+                coeff_abs++; \
+            } \
+\
+            if( coeff_abs >= 15 ) { \
+                int j = 0; \
+                while( get_cabac_bypass( CC ) ) { \
+                    j++; \
+                } \
+\
+                coeff_abs=1; \
+                while( j-- ) { \
+                    coeff_abs += coeff_abs + get_cabac_bypass( CC ); \
+                } \
+                coeff_abs+= 14; \
+            } \
+\
+            if( is_dc ) { \
+                ((type*)block)[j] = get_cabac_bypass_sign( CC, -coeff_abs ); \
+            }else{ \
+                ((type*)block)[j] = ((int)(get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32)) >> 6; \
+            } \
+        }
+
+        if (h->pixel_size == 2) {
+            STORE_BLOCK(int32_t)
        } else {
-            int coeff_abs = 2;
-            ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
-            node_ctx = coeff_abs_level_transition[1][node_ctx];
-
-            while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
-                coeff_abs++;
-            }
-
-            if( coeff_abs >= 15 ) {
-                int j = 0;
-                while( get_cabac_bypass( CC ) ) {
-                    j++;
-                }
-
-                coeff_abs=1;
-                while( j-- ) {
-                    coeff_abs += coeff_abs + get_cabac_bypass( CC );
-                }
-                coeff_abs+= 14;
-            }
-
-            if( is_dc ) {
-                block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
-            }else{
-                block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
-            }
+            STORE_BLOCK(int16_t)
        }
    } while( coeff_count );
 #ifdef CABAC_ON_STACK
@ -1304,6 +1311,7 @@ decode_intra_mb:
    h->slice_table[ mb_xy ]= h->slice_num;

    if(IS_INTRA_PCM(mb_type)) {
+        const int mb_size = 384*h->sps.bit_depth_luma/8;
        const uint8_t *ptr;

        // We assume these blocks are very rare so we do not optimize it.
@ -1316,9 +1324,9 @@ decode_intra_mb:
        }

        // The pixels are stored in the same order as levels in h->mb array.
-        memcpy(h->mb, ptr, 256); ptr+=256;
+        memcpy(h->mb, ptr, 2*mb_size/3); ptr+=2*mb_size/3;
        if(CHROMA){
-            memcpy(h->mb+128, ptr, 128); ptr+=128;
+            memcpy(h->mb+mb_size/3, ptr, mb_size/3); ptr+=mb_size/3;
        }

        ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
@ -1652,13 +1660,15 @@ decode_intra_mb:
            //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
            AV_ZERO128(h->mb_luma_dc+0);
            AV_ZERO128(h->mb_luma_dc+8);
+            AV_ZERO128(h->mb_luma_dc+16);
+            AV_ZERO128(h->mb_luma_dc+24);
            decode_cabac_residual_dc( h, h->mb_luma_dc, 0, LUMA_DC_BLOCK_INDEX, scan, 16);

            if( cbp&15 ) {
                qmul = h->dequant4_coeff[0][s->qscale];
                for( i = 0; i < 16; i++ ) {
                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
-                    decode_cabac_residual_nondc(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
+                    decode_cabac_residual_nondc(h, h->mb + 16*i*h->pixel_size, 1, i, scan + 1, qmul, 15);
                }
            } else {
                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
@ -1668,7 +1678,7 @@ decode_intra_mb:
            for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
                if( cbp & (1<<i8x8) ) {
                    if( IS_8x8DCT(mb_type) ) {
-                        decode_cabac_residual_nondc(h, h->mb + 64*i8x8, 5, 4*i8x8,
+                        decode_cabac_residual_nondc(h, h->mb + 64*i8x8*h->pixel_size, 5, 4*i8x8,
                            scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
                    } else {
                        qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
@ -1676,7 +1686,7 @@ decode_intra_mb:
                            const int index = 4*i8x8 + i4x4;
                            //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
 //START_TIMER
-                            decode_cabac_residual_nondc(h, h->mb + 16*index, 2, index, scan, qmul, 16);
+                            decode_cabac_residual_nondc(h, h->mb + 16*index*h->pixel_size, 2, index, scan, qmul, 16);
 //STOP_TIMER("decode_residual")
                        }
                    }
@ -1691,7 +1701,7 @@ decode_intra_mb:
            int c;
            for( c = 0; c < 2; c++ ) {
                //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
-                decode_cabac_residual_dc(h, h->mb + 256 + 16*4*c, 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4);
+                decode_cabac_residual_dc(h, h->mb + (256 + 16*4*c)*h->pixel_size, 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4);
            }
        }

@ -1702,7 +1712,7 @@ decode_intra_mb:
                for( i = 0; i < 4; i++ ) {
                    const int index = 16 + 4 * c + i;
                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
-                    decode_cabac_residual_nondc(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
+                    decode_cabac_residual_nondc(h, h->mb + 16*index*h->pixel_size, 4, index, scan + 1, qmul, 15);
                }
            }
        } else {
--- a/libavcodec/h264_cavlc.c
+++ b/libavcodec/h264_cavlc.c
@ -488,37 +488,44 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
            zeros_left= get_vlc2(gb, (total_zeros_vlc-1)[ total_coeff ].table, TOTAL_ZEROS_VLC_BITS, 1);
    }

-    scantable += zeros_left + total_coeff - 1;
-    if(n >= LUMA_DC_BLOCK_INDEX){
-        block[*scantable] = level[0];
-        for(i=1;i<total_coeff && zeros_left > 0;i++) {
-            if(zeros_left < 7)
-                run_before= get_vlc2(gb, (run_vlc-1)[zeros_left].table, RUN_VLC_BITS, 1);
-            else
-                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
-            zeros_left -= run_before;
-            scantable -= 1 + run_before;
-            block[*scantable]= level[i];
-        }
-        for(;i<total_coeff;i++) {
-            scantable--;
-            block[*scantable]= level[i];
-        }
-    }else{
-        block[*scantable] = (level[0] * qmul[*scantable] + 32)>>6;
-        for(i=1;i<total_coeff && zeros_left > 0;i++) {
-            if(zeros_left < 7)
-                run_before= get_vlc2(gb, (run_vlc-1)[zeros_left].table, RUN_VLC_BITS, 1);
-            else
-                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
-            zeros_left -= run_before;
-            scantable -= 1 + run_before;
-            block[*scantable]= (level[i] * qmul[*scantable] + 32)>>6;
-        }
-        for(;i<total_coeff;i++) {
-            scantable--;
-            block[*scantable]= (level[i] * qmul[*scantable] + 32)>>6;
-        }
+#define STORE_BLOCK(type) \
+    scantable += zeros_left + total_coeff - 1; \
+    if(n >= LUMA_DC_BLOCK_INDEX){ \
+        ((type*)block)[*scantable] = level[0]; \
+        for(i=1;i<total_coeff && zeros_left > 0;i++) { \
+            if(zeros_left < 7) \
+                run_before= get_vlc2(gb, (run_vlc-1)[zeros_left].table, RUN_VLC_BITS, 1); \
+            else \
+                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2); \
+            zeros_left -= run_before; \
+            scantable -= 1 + run_before; \
+            ((type*)block)[*scantable]= level[i]; \
+        } \
+        for(;i<total_coeff;i++) { \
+            scantable--; \
+            ((type*)block)[*scantable]= level[i]; \
+        } \
+    }else{ \
+        ((type*)block)[*scantable] = ((int)(level[0] * qmul[*scantable] + 32))>>6; \
+        for(i=1;i<total_coeff && zeros_left > 0;i++) { \
+            if(zeros_left < 7) \
+                run_before= get_vlc2(gb, (run_vlc-1)[zeros_left].table, RUN_VLC_BITS, 1); \
+            else \
+                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2); \
+            zeros_left -= run_before; \
+            scantable -= 1 + run_before; \
+            ((type*)block)[*scantable]= ((int)(level[i] * qmul[*scantable] + 32))>>6; \
+        } \
+        for(;i<total_coeff;i++) { \
+            scantable--; \
+            ((type*)block)[*scantable]= ((int)(level[i] * qmul[*scantable] + 32))>>6; \
+        } \
+    }
+
+    if (h->pixel_size == 2) {
+        STORE_BLOCK(int32_t)
+    } else {
+        STORE_BLOCK(int16_t)
    }

    if(zeros_left<0){
@ -605,7 +612,7 @@ decode_intra_mb:
        align_get_bits(&s->gb);

        // The pixels are stored in the same order as levels in h->mb array.
-        for(x=0; x < (CHROMA ? 384 : 256); x++){
+        for(x=0; x < (CHROMA ? 384 : 256)*h->sps.bit_depth_luma/8; x++){
            ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
        }

@ -941,6 +948,8 @@ decode_intra_mb:
        if(IS_INTRA16x16(mb_type)){
            AV_ZERO128(h->mb_luma_dc+0);
            AV_ZERO128(h->mb_luma_dc+8);
+            AV_ZERO128(h->mb_luma_dc+16);
+            AV_ZERO128(h->mb_luma_dc+24);
            if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc, LUMA_DC_BLOCK_INDEX, scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
                return -1; //FIXME continue if partitioned and other return -1 too
            }
@ -951,7 +960,7 @@ decode_intra_mb:
                for(i8x8=0; i8x8<4; i8x8++){
                    for(i4x4=0; i4x4<4; i4x4++){
                        const int index= i4x4 + 4*i8x8;
-                        if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
+                        if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index*h->pixel_size, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
                            return -1;
                        }
                    }
@ -963,7 +972,7 @@ decode_intra_mb:
            for(i8x8=0; i8x8<4; i8x8++){
                if(cbp & (1<<i8x8)){
                    if(IS_8x8DCT(mb_type)){
-                        DCTELEM *buf = &h->mb[64*i8x8];
+                        DCTELEM *buf = &h->mb[64*i8x8*h->pixel_size];
                        uint8_t *nnz;
                        for(i4x4=0; i4x4<4; i4x4++){
                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
@ -976,7 +985,7 @@ decode_intra_mb:
                        for(i4x4=0; i4x4<4; i4x4++){
                            const int index= i4x4 + 4*i8x8;

-                            if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
+                            if( decode_residual(h, gb, h->mb + 16*index*h->pixel_size, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
                                return -1;
                            }
                        }
@ -990,7 +999,7 @@ decode_intra_mb:

        if(cbp&0x30){
            for(chroma_idx=0; chroma_idx<2; chroma_idx++)
-                if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){
+                if( decode_residual(h, gb, h->mb + (256 + 16*4*chroma_idx)*h->pixel_size, CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){
                    return -1;
                }
        }
@ -1000,7 +1009,7 @@ decode_intra_mb:
                const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
                for(i4x4=0; i4x4<4; i4x4++){
                    const int index= 16 + 4*chroma_idx + i4x4;
-                    if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
+                    if( decode_residual(h, gb, h->mb + 16*index*h->pixel_size, index, scan + 1, qmul, 15) < 0){
                        return -1;
                    }
                }
--- a/libavcodec/h264_loopfilter.c
+++ b/libavcodec/h264_loopfilter.c
@ -650,10 +650,10 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
        tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
        //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
        if( dir == 0 ) {
-            filter_mb_edgev( &img_y[4*edge], linesize, bS, qp, h );
+            filter_mb_edgev( &img_y[4*edge*h->pixel_size], linesize, bS, qp, h );
            if( (edge&1) == 0 ) {
-                filter_mb_edgecv( &img_cb[2*edge], uvlinesize, bS, h->chroma_qp[0], h);
-                filter_mb_edgecv( &img_cr[2*edge], uvlinesize, bS, h->chroma_qp[1], h);
+                filter_mb_edgecv( &img_cb[2*edge*h->pixel_size], uvlinesize, bS, h->chroma_qp[0], h);
+                filter_mb_edgecv( &img_cr[2*edge*h->pixel_size], uvlinesize, bS, h->chroma_qp[1], h);
            }
        } else {
            filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, h );
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@ -286,6 +286,7 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){
        int unaligned;
        AVPicture picture;
        int stride_align[4];
+        const int pixel_size = av_pix_fmt_descriptors[s->pix_fmt].comp[0].step_minus1+1;

        avcodec_get_chroma_sub_sample(s->pix_fmt, &h_chroma_shift, &v_chroma_shift);

@ -335,7 +336,7 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){
            if((s->flags&CODEC_FLAG_EMU_EDGE) || !size[2])
                buf->data[i] = buf->base[i];
            else
-                buf->data[i] = buf->base[i] + FFALIGN((buf->linesize[i]*EDGE_WIDTH>>v_shift) + (EDGE_WIDTH>>h_shift), stride_align[i]);
+                buf->data[i] = buf->base[i] + FFALIGN((buf->linesize[i]*EDGE_WIDTH>>v_shift) + (pixel_size*EDGE_WIDTH>>h_shift), stride_align[i]);
        }
        if(size[1] && !size[2])
            ff_set_systematic_pal2((uint32_t*)buf->data[1], s->pix_fmt);