From a50f0bea25a3da605cd547fe3bdfd36c8764b847 Mon Sep 17 00:00:00 2001 From: Michael Niedermayer Date: Sun, 10 Apr 2011 17:04:13 +0200 Subject: [PATCH] H264: Split out hl_motion and template it, this seems a bit faster Signed-off-by: Michael Niedermayer --- libavcodec/Makefile | 6 +- libavcodec/h264.c | 419 +----------------------------------- libavcodec/h264.h | 6 + libavcodec/h264_hl_motion.c | 164 ++++++++++++++ libavcodec/h264_hl_motion.h | 282 ++++++++++++++++++++++++ 5 files changed, 456 insertions(+), 421 deletions(-) create mode 100644 libavcodec/h264_hl_motion.c create mode 100644 libavcodec/h264_hl_motion.h diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 0730124870..1cb8ee5264 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -168,7 +168,7 @@ OBJS-$(CONFIG_H263_ENCODER) += mpegvideo_enc.o mpeg4video.o \ ratecontrol.o h263.o ituh263enc.o \ flvenc.o mpeg12data.o \ mpegvideo.o error_resilience.o -OBJS-$(CONFIG_H264_DECODER) += h264.o \ +OBJS-$(CONFIG_H264_DECODER) += h264.o h264_hl_motion.o \ h264_loopfilter.o h264_direct.o \ cabac.o h264_sei.o h264_ps.o \ h264_refs.o h264_cavlc.o h264_cabac.o\ @@ -356,7 +356,7 @@ OBJS-$(CONFIG_SVQ1_ENCODER) += svq1enc.o svq1.o \ mpegvideo.o error_resilience.o \ ituh263enc.o mpegvideo_enc.o \ ratecontrol.o mpeg12data.o -OBJS-$(CONFIG_SVQ3_DECODER) += h264.o svq3.o \ +OBJS-$(CONFIG_SVQ3_DECODER) += h264.o svq3.o h264_hl_motion.o \ h264_loopfilter.o h264_direct.o \ h264_sei.o h264_ps.o h264_refs.o \ h264_cavlc.o h264_cabac.o cabac.o \ @@ -594,7 +594,7 @@ OBJS-$(CONFIG_DVDSUB_PARSER) += dvdsub_parser.o OBJS-$(CONFIG_FLAC_PARSER) += flac_parser.o flacdata.o flac.o OBJS-$(CONFIG_H261_PARSER) += h261_parser.o OBJS-$(CONFIG_H263_PARSER) += h263_parser.o -OBJS-$(CONFIG_H264_PARSER) += h264_parser.o h264.o \ +OBJS-$(CONFIG_H264_PARSER) += h264_parser.o h264.o h264_hl_motion.o \ cabac.o \ h264_refs.o h264_sei.o h264_direct.o \ h264_loopfilter.o h264_cabac.o \ diff --git a/libavcodec/h264.c b/libavcodec/h264.c index 1fa333c83e..6365afb303 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -250,141 +250,6 @@ static int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){ return 0; } -static inline int get_lowest_part_list_y(H264Context *h, Picture *pic, int n, int height, - int y_offset, int list){ - int raw_my= h->mv_cache[list][ scan8[n] ][1]; - int filter_height= (raw_my&3) ? 2 : 0; - int full_my= (raw_my>>2) + y_offset; - int top = full_my - filter_height, bottom = full_my + height + filter_height; - - return FFMAX(abs(top), bottom); -} - -static inline void get_lowest_part_y(H264Context *h, int refs[2][48], int n, int height, - int y_offset, int list0, int list1, int *nrefs){ - MpegEncContext * const s = &h->s; - int my; - - y_offset += 16*(s->mb_y >> MB_FIELD); - - if(list0){ - int ref_n = h->ref_cache[0][ scan8[n] ]; - Picture *ref= &h->ref_list[0][ref_n]; - - // Error resilience puts the current picture in the ref list. - // Don't try to wait on these as it will cause a deadlock. - // Fields can wait on each other, though. - if(ref->thread_opaque != s->current_picture.thread_opaque || - (ref->reference&3) != s->picture_structure) { - my = get_lowest_part_list_y(h, ref, n, height, y_offset, 0); - if (refs[0][ref_n] < 0) nrefs[0] += 1; - refs[0][ref_n] = FFMAX(refs[0][ref_n], my); - } - } - - if(list1){ - int ref_n = h->ref_cache[1][ scan8[n] ]; - Picture *ref= &h->ref_list[1][ref_n]; - - if(ref->thread_opaque != s->current_picture.thread_opaque || - (ref->reference&3) != s->picture_structure) { - my = get_lowest_part_list_y(h, ref, n, height, y_offset, 1); - if (refs[1][ref_n] < 0) nrefs[1] += 1; - refs[1][ref_n] = FFMAX(refs[1][ref_n], my); - } - } -} - -/** - * Wait until all reference frames are available for MC operations. - * - * @param h the H264 context - */ -static void await_references(H264Context *h){ - MpegEncContext * const s = &h->s; - const int mb_xy= h->mb_xy; - const int mb_type= s->current_picture.mb_type[mb_xy]; - int refs[2][48]; - int nrefs[2] = {0}; - int ref, list; - - memset(refs, -1, sizeof(refs)); - - if(IS_16X16(mb_type)){ - get_lowest_part_y(h, refs, 0, 16, 0, - IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs); - }else if(IS_16X8(mb_type)){ - get_lowest_part_y(h, refs, 0, 8, 0, - IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs); - get_lowest_part_y(h, refs, 8, 8, 8, - IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs); - }else if(IS_8X16(mb_type)){ - get_lowest_part_y(h, refs, 0, 16, 0, - IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs); - get_lowest_part_y(h, refs, 4, 16, 0, - IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs); - }else{ - int i; - - assert(IS_8X8(mb_type)); - - for(i=0; i<4; i++){ - const int sub_mb_type= h->sub_mb_type[i]; - const int n= 4*i; - int y_offset= (i&2)<<2; - - if(IS_SUB_8X8(sub_mb_type)){ - get_lowest_part_y(h, refs, n , 8, y_offset, - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs); - }else if(IS_SUB_8X4(sub_mb_type)){ - get_lowest_part_y(h, refs, n , 4, y_offset, - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs); - get_lowest_part_y(h, refs, n+2, 4, y_offset+4, - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs); - }else if(IS_SUB_4X8(sub_mb_type)){ - get_lowest_part_y(h, refs, n , 8, y_offset, - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs); - get_lowest_part_y(h, refs, n+1, 8, y_offset, - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs); - }else{ - int j; - assert(IS_SUB_4X4(sub_mb_type)); - for(j=0; j<4; j++){ - int sub_y_offset= y_offset + 2*(j&2); - get_lowest_part_y(h, refs, n+j, 4, sub_y_offset, - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs); - } - } - } - } - - for(list=h->list_count-1; list>=0; list--){ - for(ref=0; ref<48 && nrefs[list]; ref++){ - int row = refs[list][ref]; - if(row >= 0){ - Picture *ref_pic = &h->ref_list[list][ref]; - int ref_field = ref_pic->reference - 1; - int ref_field_picture = ref_pic->field_picture; - int pic_height = 16*s->mb_height >> ref_field_picture; - - row <<= MB_MBAFF; - nrefs[list]--; - - if(!FIELD_PICTURE && ref_field_picture){ // frame referencing two fields - ff_thread_await_progress((AVFrame*)ref_pic, FFMIN((row >> 1) - !(row&1), pic_height-1), 1); - ff_thread_await_progress((AVFrame*)ref_pic, FFMIN((row >> 1) , pic_height-1), 0); - }else if(FIELD_PICTURE && !ref_field_picture){ // field referencing one field of a frame - ff_thread_await_progress((AVFrame*)ref_pic, FFMIN(row*2 + ref_field , pic_height-1), 0); - }else if(FIELD_PICTURE){ - ff_thread_await_progress((AVFrame*)ref_pic, FFMIN(row, pic_height-1), ref_field); - }else{ - ff_thread_await_progress((AVFrame*)ref_pic, FFMIN(row, pic_height-1), 0); - } - } - } - } -} - #if 0 /** * DCT transforms the 16 dc values. @@ -451,288 +316,6 @@ static void chroma_dc_dct_c(DCTELEM *block){ } #endif -static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int src_x_offset, int src_y_offset, - qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){ - MpegEncContext * const s = &h->s; - const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8; - int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8; - const int luma_xy= (mx&3) + ((my&3)<<2); - uint8_t * src_y = pic->data[0] + ((mx>>2)<pixel_shift) + (my>>2)*h->mb_linesize; - uint8_t * src_cb, * src_cr; - int extra_width= h->emu_edge_width; - int extra_height= h->emu_edge_height; - int emu=0; - const int full_mx= mx>>2; - const int full_my= my>>2; - const int pic_width = 16*s->mb_width; - const int pic_height = 16*s->mb_height >> MB_FIELD; - - if(mx&7) extra_width -= 3; - if(my&7) extra_height -= 3; - - if( full_mx < 0-extra_width - || full_my < 0-extra_height - || full_mx + 16/*FIXME*/ > pic_width + extra_width - || full_my + 16/*FIXME*/ > pic_height + extra_height){ - s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2<pixel_shift) - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height); - src_y= s->edge_emu_buffer + (2<pixel_shift) + 2*h->mb_linesize; - emu=1; - } - - qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps? - if(!square){ - qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize); - } - - if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return; - - if(MB_FIELD){ - // chroma offset when predicting from a field of opposite parity - my += 2 * ((s->mb_y & 1) - (pic->reference - 1)); - emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1); - } - src_cb= pic->data[1] + ((mx>>3)<pixel_shift) + (my>>3)*h->mb_uvlinesize; - src_cr= pic->data[2] + ((mx>>3)<pixel_shift) + (my>>3)*h->mb_uvlinesize; - - if(emu){ - s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1); - src_cb= s->edge_emu_buffer; - } - chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7); - - if(emu){ - s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1); - src_cr= s->edge_emu_buffer; - } - chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7); -} - -static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int x_offset, int y_offset, - qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, - qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, - int list0, int list1){ - MpegEncContext * const s = &h->s; - qpel_mc_func *qpix_op= qpix_put; - h264_chroma_mc_func chroma_op= chroma_put; - - dest_y += (2*x_offset<pixel_shift) + 2*y_offset*h-> mb_linesize; - dest_cb += ( x_offset<pixel_shift) + y_offset*h->mb_uvlinesize; - dest_cr += ( x_offset<pixel_shift) + y_offset*h->mb_uvlinesize; - x_offset += 8*s->mb_x; - y_offset += 8*(s->mb_y >> MB_FIELD); - - if(list0){ - Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ]; - mc_dir_part(h, ref, n, square, chroma_height, delta, 0, - dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_op, chroma_op); - - qpix_op= qpix_avg; - chroma_op= chroma_avg; - } - - if(list1){ - Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ]; - mc_dir_part(h, ref, n, square, chroma_height, delta, 1, - dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_op, chroma_op); - } -} - -static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int x_offset, int y_offset, - qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, - h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, - h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, - int list0, int list1){ - MpegEncContext * const s = &h->s; - - dest_y += (2*x_offset<pixel_shift) + 2*y_offset*h-> mb_linesize; - dest_cb += ( x_offset<pixel_shift) + y_offset*h->mb_uvlinesize; - dest_cr += ( x_offset<pixel_shift) + y_offset*h->mb_uvlinesize; - x_offset += 8*s->mb_x; - y_offset += 8*(s->mb_y >> MB_FIELD); - - if(list0 && list1){ - /* don't optimize for luma-only case, since B-frames usually - * use implicit weights => chroma too. */ - uint8_t *tmp_cb = s->obmc_scratchpad; - uint8_t *tmp_cr = s->obmc_scratchpad + (8<pixel_shift); - uint8_t *tmp_y = s->obmc_scratchpad + 8*h->mb_uvlinesize; - int refn0 = h->ref_cache[0][ scan8[n] ]; - int refn1 = h->ref_cache[1][ scan8[n] ]; - - mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0, - dest_y, dest_cb, dest_cr, - x_offset, y_offset, qpix_put, chroma_put); - mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1, - tmp_y, tmp_cb, tmp_cr, - x_offset, y_offset, qpix_put, chroma_put); - - if(h->use_weight == 2){ - int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1]; - int weight1 = 64 - weight0; - luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0); - chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0); - chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0); - }else{ - luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom, - h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0], - h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]); - chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, - h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0], - h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]); - chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, - h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], - h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); - } - }else{ - int list = list1 ? 1 : 0; - int refn = h->ref_cache[list][ scan8[n] ]; - Picture *ref= &h->ref_list[list][refn]; - mc_dir_part(h, ref, n, square, chroma_height, delta, list, - dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_put, chroma_put); - - luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom, - h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]); - if(h->use_weight_chroma){ - chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, - h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); - chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, - h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); - } - } -} - -static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int x_offset, int y_offset, - qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, - qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, - h264_weight_func *weight_op, h264_biweight_func *weight_avg, - int list0, int list1){ - if((h->use_weight==2 && list0 && list1 - && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32)) - || h->use_weight==1) - mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, - x_offset, y_offset, qpix_put, chroma_put, - weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1); - else - mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, - x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1); -} - -static inline void prefetch_motion(H264Context *h, int list){ - /* fetch pixels for estimated mv 4 macroblocks ahead - * optimized for 64byte cache lines */ - MpegEncContext * const s = &h->s; - const int refn = h->ref_cache[list][scan8[0]]; - if(refn >= 0){ - const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8; - const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y; - uint8_t **src= h->ref_list[list][refn].data; - int off= ((mx+64)<pixel_shift) + (my + (s->mb_x&3)*4)*h->mb_linesize; - s->dsp.prefetch(src[0]+off, s->linesize, 4); - off= (((mx>>1)+64)<pixel_shift) + ((my>>1) + (s->mb_x&7))*s->uvlinesize; - s->dsp.prefetch(src[1]+off, src[2]-src[1], 2); - } -} - -static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put), - qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg), - h264_weight_func *weight_op, h264_biweight_func *weight_avg){ - MpegEncContext * const s = &h->s; - const int mb_xy= h->mb_xy; - const int mb_type= s->current_picture.mb_type[mb_xy]; - - assert(IS_INTER(mb_type)); - - if(HAVE_PTHREADS && s->avctx->active_thread_type&FF_THREAD_FRAME) - await_references(h); - prefetch_motion(h, 0); - - if(IS_16X16(mb_type)){ - mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0, - qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], - weight_op, weight_avg, - IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); - }else if(IS_16X8(mb_type)){ - mc_part(h, 0, 0, 4, (8<pixel_shift), dest_y, dest_cb, dest_cr, 0, 0, - qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], - &weight_op[1], &weight_avg[1], - IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); - mc_part(h, 8, 0, 4, (8<pixel_shift), dest_y, dest_cb, dest_cr, 0, 4, - qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], - &weight_op[1], &weight_avg[1], - IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); - }else if(IS_8X16(mb_type)){ - mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, - qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], - &weight_op[2], &weight_avg[2], - IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); - mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, - qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], - &weight_op[2], &weight_avg[2], - IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); - }else{ - int i; - - assert(IS_8X8(mb_type)); - - for(i=0; i<4; i++){ - const int sub_mb_type= h->sub_mb_type[i]; - const int n= 4*i; - int x_offset= (i&1)<<2; - int y_offset= (i&2)<<1; - - if(IS_SUB_8X8(sub_mb_type)){ - mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], - &weight_op[3], &weight_avg[3], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - }else if(IS_SUB_8X4(sub_mb_type)){ - mc_part(h, n , 0, 2, (4<pixel_shift), dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], - &weight_op[4], &weight_avg[4], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - mc_part(h, n+2, 0, 2, (4<pixel_shift), dest_y, dest_cb, dest_cr, x_offset, y_offset+2, - qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], - &weight_op[4], &weight_avg[4], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - }else if(IS_SUB_4X8(sub_mb_type)){ - mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], - &weight_op[5], &weight_avg[5], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, - qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], - &weight_op[5], &weight_avg[5], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - }else{ - int j; - assert(IS_SUB_4X4(sub_mb_type)); - for(j=0; j<4; j++){ - int sub_x_offset= x_offset + 2*(j&1); - int sub_y_offset= y_offset + (j&2); - mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, - qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], - &weight_op[6], &weight_avg[6], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - } - } - } - } - - prefetch_motion(h, 1); -} - static void free_tables(H264Context *h, int free_rbsp){ int i; @@ -1692,7 +1275,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){ if(h->deblocking_filter) xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple); }else if(is_h264){ - hl_motion(h, dest_y, dest_cb, dest_cr, + ff_hl_motion(h, dest_y, dest_cb, dest_cr, s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, h->h264dsp.weight_h264_pixels_tab, h->h264dsp.biweight_h264_pixels_tab); diff --git a/libavcodec/h264.h b/libavcodec/h264.h index c9a2f0073b..ab02de1dfb 100644 --- a/libavcodec/h264.h +++ b/libavcodec/h264.h @@ -717,6 +717,12 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint void ff_h264_reset_sei(H264Context *h); +void ff_hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put), + qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg), + h264_weight_func *weight_op, h264_biweight_func *weight_avg); + + /* o-o o-o / / / diff --git a/libavcodec/h264_hl_motion.c b/libavcodec/h264_hl_motion.c new file mode 100644 index 0000000000..654b8b8444 --- /dev/null +++ b/libavcodec/h264_hl_motion.c @@ -0,0 +1,164 @@ + +#include "h264.h" +#include "thread.h" + +static inline int get_lowest_part_list_y(H264Context *h, Picture *pic, int n, int height, + int y_offset, int list){ + int raw_my= h->mv_cache[list][ scan8[n] ][1]; + int filter_height= (raw_my&3) ? 2 : 0; + int full_my= (raw_my>>2) + y_offset; + int top = full_my - filter_height, bottom = full_my + height + filter_height; + + return FFMAX(abs(top), bottom); +} + +static inline void get_lowest_part_y(H264Context *h, int refs[2][48], int n, int height, + int y_offset, int list0, int list1, int *nrefs){ + MpegEncContext * const s = &h->s; + int my; + + y_offset += 16*(s->mb_y >> MB_FIELD); + + if(list0){ + int ref_n = h->ref_cache[0][ scan8[n] ]; + Picture *ref= &h->ref_list[0][ref_n]; + + // Error resilience puts the current picture in the ref list. + // Don't try to wait on these as it will cause a deadlock. + // Fields can wait on each other, though. + if(ref->thread_opaque != s->current_picture.thread_opaque || + (ref->reference&3) != s->picture_structure) { + my = get_lowest_part_list_y(h, ref, n, height, y_offset, 0); + if (refs[0][ref_n] < 0) nrefs[0] += 1; + refs[0][ref_n] = FFMAX(refs[0][ref_n], my); + } + } + + if(list1){ + int ref_n = h->ref_cache[1][ scan8[n] ]; + Picture *ref= &h->ref_list[1][ref_n]; + + if(ref->thread_opaque != s->current_picture.thread_opaque || + (ref->reference&3) != s->picture_structure) { + my = get_lowest_part_list_y(h, ref, n, height, y_offset, 1); + if (refs[1][ref_n] < 0) nrefs[1] += 1; + refs[1][ref_n] = FFMAX(refs[1][ref_n], my); + } + } +} + +/** + * Wait until all reference frames are available for MC operations. + * + * @param h the H264 context + */ +static void await_references(H264Context *h){ + MpegEncContext * const s = &h->s; + const int mb_xy= h->mb_xy; + const int mb_type= s->current_picture.mb_type[mb_xy]; + int refs[2][48]; + int nrefs[2] = {0}; + int ref, list; + + memset(refs, -1, sizeof(refs)); + + if(IS_16X16(mb_type)){ + get_lowest_part_y(h, refs, 0, 16, 0, + IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs); + }else if(IS_16X8(mb_type)){ + get_lowest_part_y(h, refs, 0, 8, 0, + IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs); + get_lowest_part_y(h, refs, 8, 8, 8, + IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs); + }else if(IS_8X16(mb_type)){ + get_lowest_part_y(h, refs, 0, 16, 0, + IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs); + get_lowest_part_y(h, refs, 4, 16, 0, + IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs); + }else{ + int i; + + assert(IS_8X8(mb_type)); + + for(i=0; i<4; i++){ + const int sub_mb_type= h->sub_mb_type[i]; + const int n= 4*i; + int y_offset= (i&2)<<2; + + if(IS_SUB_8X8(sub_mb_type)){ + get_lowest_part_y(h, refs, n , 8, y_offset, + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs); + }else if(IS_SUB_8X4(sub_mb_type)){ + get_lowest_part_y(h, refs, n , 4, y_offset, + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs); + get_lowest_part_y(h, refs, n+2, 4, y_offset+4, + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs); + }else if(IS_SUB_4X8(sub_mb_type)){ + get_lowest_part_y(h, refs, n , 8, y_offset, + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs); + get_lowest_part_y(h, refs, n+1, 8, y_offset, + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs); + }else{ + int j; + assert(IS_SUB_4X4(sub_mb_type)); + for(j=0; j<4; j++){ + int sub_y_offset= y_offset + 2*(j&2); + get_lowest_part_y(h, refs, n+j, 4, sub_y_offset, + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs); + } + } + } + } + + for(list=h->list_count-1; list>=0; list--){ + for(ref=0; ref<48 && nrefs[list]; ref++){ + int row = refs[list][ref]; + if(row >= 0){ + Picture *ref_pic = &h->ref_list[list][ref]; + int ref_field = ref_pic->reference - 1; + int ref_field_picture = ref_pic->field_picture; + int pic_height = 16*s->mb_height >> ref_field_picture; + + row <<= MB_MBAFF; + nrefs[list]--; + + if(!FIELD_PICTURE && ref_field_picture){ // frame referencing two fields + ff_thread_await_progress((AVFrame*)ref_pic, FFMIN((row >> 1) - !(row&1), pic_height-1), 1); + ff_thread_await_progress((AVFrame*)ref_pic, FFMIN((row >> 1) , pic_height-1), 0); + }else if(FIELD_PICTURE && !ref_field_picture){ // field referencing one field of a frame + ff_thread_await_progress((AVFrame*)ref_pic, FFMIN(row*2 + ref_field , pic_height-1), 0); + }else if(FIELD_PICTURE){ + ff_thread_await_progress((AVFrame*)ref_pic, FFMIN(row, pic_height-1), ref_field); + }else{ + ff_thread_await_progress((AVFrame*)ref_pic, FFMIN(row, pic_height-1), 0); + } + } + } + } +} + +#define FUNC(a) a ## _8 +#define PIXEL_SHIFT 0 +#include "h264_hl_motion.h" + +#undef PIXEL_SHIFT +#undef FUNC +#define FUNC(a) a ## _16 +#define PIXEL_SHIFT 1 +#include "h264_hl_motion.h" + +void ff_hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put), + qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg), + h264_weight_func *weight_op, h264_biweight_func *weight_avg){ + if(h->pixel_shift){ + hl_motion_16(h, dest_y, dest_cb, dest_cr, + qpix_put, chroma_put, + qpix_avg, chroma_avg, + weight_op, weight_avg); + }else + hl_motion_8(h, dest_y, dest_cb, dest_cr, + qpix_put, chroma_put, + qpix_avg, chroma_avg, + weight_op, weight_avg); +} diff --git a/libavcodec/h264_hl_motion.h b/libavcodec/h264_hl_motion.h new file mode 100644 index 0000000000..f354251fc9 --- /dev/null +++ b/libavcodec/h264_hl_motion.h @@ -0,0 +1,282 @@ + +static inline void FUNC(mc_dir_part)(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list, + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int src_x_offset, int src_y_offset, + qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){ + MpegEncContext * const s = &h->s; + const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8; + int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8; + const int luma_xy= (mx&3) + ((my&3)<<2); + uint8_t * src_y = pic->data[0] + ((mx>>2)<>2)*h->mb_linesize; + uint8_t * src_cb, * src_cr; + int extra_width= h->emu_edge_width; + int extra_height= h->emu_edge_height; + int emu=0; + const int full_mx= mx>>2; + const int full_my= my>>2; + const int pic_width = 16*s->mb_width; + const int pic_height = 16*s->mb_height >> MB_FIELD; + + if(mx&7) extra_width -= 3; + if(my&7) extra_height -= 3; + + if( full_mx < 0-extra_width + || full_my < 0-extra_height + || full_mx + 16/*FIXME*/ > pic_width + extra_width + || full_my + 16/*FIXME*/ > pic_height + extra_height){ + s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2<mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height); + src_y= s->edge_emu_buffer + (2<mb_linesize; + emu=1; + } + + qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps? + if(!square){ + qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize); + } + + if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return; + + if(MB_FIELD){ + // chroma offset when predicting from a field of opposite parity + my += 2 * ((s->mb_y & 1) - (pic->reference - 1)); + emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1); + } + src_cb= pic->data[1] + ((mx>>3)<>3)*h->mb_uvlinesize; + src_cr= pic->data[2] + ((mx>>3)<>3)*h->mb_uvlinesize; + + if(emu){ + s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1); + src_cb= s->edge_emu_buffer; + } + chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7); + + if(emu){ + s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1); + src_cr= s->edge_emu_buffer; + } + chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7); +} + +static inline void FUNC(mc_part_std)(H264Context *h, int n, int square, int chroma_height, int delta, + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int x_offset, int y_offset, + qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, + qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, + int list0, int list1){ + MpegEncContext * const s = &h->s; + qpel_mc_func *qpix_op= qpix_put; + h264_chroma_mc_func chroma_op= chroma_put; + + dest_y += (2*x_offset< mb_linesize; + dest_cb += ( x_offset<mb_uvlinesize; + dest_cr += ( x_offset<mb_uvlinesize; + x_offset += 8*s->mb_x; + y_offset += 8*(s->mb_y >> MB_FIELD); + + if(list0){ + Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ]; + FUNC(mc_dir_part)(h, ref, n, square, chroma_height, delta, 0, + dest_y, dest_cb, dest_cr, x_offset, y_offset, + qpix_op, chroma_op); + + qpix_op= qpix_avg; + chroma_op= chroma_avg; + } + + if(list1){ + Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ]; + FUNC(mc_dir_part)(h, ref, n, square, chroma_height, delta, 1, + dest_y, dest_cb, dest_cr, x_offset, y_offset, + qpix_op, chroma_op); + } +} + +static inline void FUNC(mc_part_weighted)(H264Context *h, int n, int square, int chroma_height, int delta, + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int x_offset, int y_offset, + qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, + h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, + h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, + int list0, int list1){ + MpegEncContext * const s = &h->s; + + dest_y += (2*x_offset< mb_linesize; + dest_cb += ( x_offset<mb_uvlinesize; + dest_cr += ( x_offset<mb_uvlinesize; + x_offset += 8*s->mb_x; + y_offset += 8*(s->mb_y >> MB_FIELD); + + if(list0 && list1){ + /* don't optimize for luma-only case, since B-frames usually + * use implicit weights => chroma too. */ + uint8_t *tmp_cb = s->obmc_scratchpad; + uint8_t *tmp_cr = s->obmc_scratchpad + (8<obmc_scratchpad + 8*h->mb_uvlinesize; + int refn0 = h->ref_cache[0][ scan8[n] ]; + int refn1 = h->ref_cache[1][ scan8[n] ]; + + FUNC(mc_dir_part)(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0, + dest_y, dest_cb, dest_cr, + x_offset, y_offset, qpix_put, chroma_put); + FUNC(mc_dir_part)(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1, + tmp_y, tmp_cb, tmp_cr, + x_offset, y_offset, qpix_put, chroma_put); + + if(h->use_weight == 2){ + int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1]; + int weight1 = 64 - weight0; + luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0); + chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0); + chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0); + }else{ + luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom, + h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0], + h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]); + chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, + h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0], + h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]); + chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, + h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], + h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); + } + }else{ + int list = list1 ? 1 : 0; + int refn = h->ref_cache[list][ scan8[n] ]; + Picture *ref= &h->ref_list[list][refn]; + FUNC(mc_dir_part)(h, ref, n, square, chroma_height, delta, list, + dest_y, dest_cb, dest_cr, x_offset, y_offset, + qpix_put, chroma_put); + + luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom, + h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]); + if(h->use_weight_chroma){ + chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, + h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); + chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, + h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); + } + } +} + +static inline void FUNC(mc_part)(H264Context *h, int n, int square, int chroma_height, int delta, + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int x_offset, int y_offset, + qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, + qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, + h264_weight_func *weight_op, h264_biweight_func *weight_avg, + int list0, int list1){ + if((h->use_weight==2 && list0 && list1 + && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32)) + || h->use_weight==1) + FUNC(mc_part_weighted)(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, + x_offset, y_offset, qpix_put, chroma_put, + weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1); + else + FUNC(mc_part_std)(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, + x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1); +} + +static inline void FUNC(prefetch_motion)(H264Context *h, int list){ + /* fetch pixels for estimated mv 4 macroblocks ahead + * optimized for 64byte cache lines */ + MpegEncContext * const s = &h->s; + const int refn = h->ref_cache[list][scan8[0]]; + if(refn >= 0){ + const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8; + const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y; + uint8_t **src= h->ref_list[list][refn].data; + int off= ((mx+64)<mb_x&3)*4)*h->mb_linesize; + s->dsp.prefetch(src[0]+off, s->linesize, 4); + off= (((mx>>1)+64)<>1) + (s->mb_x&7))*s->uvlinesize; + s->dsp.prefetch(src[1]+off, src[2]-src[1], 2); + } +} + +static void FUNC(hl_motion)(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put), + qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg), + h264_weight_func *weight_op, h264_biweight_func *weight_avg){ + MpegEncContext * const s = &h->s; + const int mb_xy= h->mb_xy; + const int mb_type= s->current_picture.mb_type[mb_xy]; + + assert(IS_INTER(mb_type)); + + if(HAVE_PTHREADS && s->avctx->active_thread_type&FF_THREAD_FRAME) + await_references(h); + FUNC(prefetch_motion)(h, 0); + + if(IS_16X16(mb_type)){ + FUNC(mc_part)(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0, + qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], + weight_op, weight_avg, + IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); + }else if(IS_16X8(mb_type)){ + FUNC(mc_part)(h, 0, 0, 4, (8<mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, + qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], + &weight_op[2], &weight_avg[2], + IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); + FUNC(mc_part)(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, + qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], + &weight_op[2], &weight_avg[2], + IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); + }else{ + int i; + + assert(IS_8X8(mb_type)); + + for(i=0; i<4; i++){ + const int sub_mb_type= h->sub_mb_type[i]; + const int n= 4*i; + int x_offset= (i&1)<<2; + int y_offset= (i&2)<<1; + + if(IS_SUB_8X8(sub_mb_type)){ + FUNC(mc_part)(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, + qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], + &weight_op[3], &weight_avg[3], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + }else if(IS_SUB_8X4(sub_mb_type)){ + FUNC(mc_part)(h, n , 0, 2, (4<mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, + qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], + &weight_op[5], &weight_avg[5], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + FUNC(mc_part)(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, + qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], + &weight_op[5], &weight_avg[5], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + }else{ + int j; + assert(IS_SUB_4X4(sub_mb_type)); + for(j=0; j<4; j++){ + int sub_x_offset= x_offset + 2*(j&1); + int sub_y_offset= y_offset + (j&2); + FUNC(mc_part)(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, + qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], + &weight_op[6], &weight_avg[6], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + } + } + } + } + + FUNC(prefetch_motion)(h, 1); +}