From a579db0c4fe026d49c71d1ff64a2d1d07c152d68 Mon Sep 17 00:00:00 2001 From: Ivan Kalvachev Date: Mon, 27 Oct 2003 23:22:43 +0000 Subject: [PATCH] XvMC speedup by removing one memcpy and doing MB packing Originally committed as revision 2442 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/mpeg12.c | 67 +++++++++++++++++++++++++++++------------- libavcodec/mpegvideo.c | 8 +++-- libavcodec/mpegvideo.h | 2 ++ libavcodec/xvmcvideo.c | 63 +++++++++++++++++++++++++++++++-------- 4 files changed, 105 insertions(+), 35 deletions(-) diff --git a/libavcodec/mpeg12.c b/libavcodec/mpeg12.c index cd3a725aec..8eca32c626 100644 --- a/libavcodec/mpeg12.c +++ b/libavcodec/mpeg12.c @@ -72,6 +72,8 @@ static int mpeg_decode_motion(MpegEncContext *s, int fcode, int pred); #ifdef HAVE_XVMC extern int XVMC_field_start(MpegEncContext *s, AVCodecContext *avctx); extern int XVMC_field_end(MpegEncContext *s); +extern void XVMC_pack_pblocks(MpegEncContext *s,int cbp); +extern void XVMC_init_block(s);//set s->block #endif #ifdef CONFIG_ENCODERS @@ -1083,15 +1085,24 @@ static int mpeg_decode_mb(MpegEncContext *s, }else memset(s->last_mv, 0, sizeof(s->last_mv)); /* reset mv prediction */ s->mb_intra = 1; +#ifdef HAVE_XVMC + //one 1 we memcpy blocks in xvmcvideo + if(s->avctx->xvmc_acceleration > 1){ + XVMC_pack_pblocks(s,-1);//inter are always full blocks + if(s->swap_uv){ + exchange_uv(s); + } + } +#endif if (s->codec_id == CODEC_ID_MPEG2VIDEO) { for(i=0;i<6;i++) { - if (mpeg2_decode_block_intra(s, block[i], i) < 0) + if (mpeg2_decode_block_intra(s, s->pblocks[i], i) < 0) return -1; } } else { for(i=0;i<6;i++) { - if (mpeg1_decode_block_intra(s, block[i], i) < 0) + if (mpeg1_decode_block_intra(s, s->pblocks[i], i) < 0) return -1; } } @@ -1262,10 +1273,20 @@ static int mpeg_decode_mb(MpegEncContext *s, } cbp++; +#ifdef HAVE_XVMC + //on 1 we memcpy blocks in xvmcvideo + if(s->avctx->xvmc_acceleration > 1){ + XVMC_pack_pblocks(s,cbp); + if(s->swap_uv){ + exchange_uv(s); + } + } +#endif + if 
(s->codec_id == CODEC_ID_MPEG2VIDEO) { for(i=0;i<6;i++) { if (cbp & 32) { - if (mpeg2_decode_block_non_intra(s, block[i], i) < 0) + if (mpeg2_decode_block_non_intra(s, s->pblocks[i], i) < 0) return -1; } else { s->block_last_index[i] = -1; @@ -1275,7 +1296,7 @@ static int mpeg_decode_mb(MpegEncContext *s, } else { for(i=0;i<6;i++) { if (cbp & 32) { - if (mpeg1_decode_block_inter(s, block[i], i) < 0) + if (mpeg1_decode_block_inter(s, s->pblocks[i], i) < 0) return -1; } else { s->block_last_index[i] = -1; @@ -1960,10 +1981,12 @@ static void mpeg_decode_extension(AVCodecContext *avctx, } } -static void exchange_uv(AVFrame *f){ - uint8_t *t= f->data[1]; - f->data[1]= f->data[2]; - f->data[2]= t; +static void exchange_uv(MpegEncContext *s){ +short * tmp; + + tmp = s->pblocks[4]; + s->pblocks[4] = s->pblocks[5]; + s->pblocks[5] = tmp; } #define DECODE_SLICE_FATAL_ERROR -2 @@ -2093,6 +2116,12 @@ static int mpeg_decode_slice(AVCodecContext *avctx, ff_init_block_index(s); for(;;) { +#ifdef HAVE_XVMC + //one 1 we memcpy blocks in xvmcvideo + if(s->avctx->xvmc_acceleration > 1) + XVMC_init_block(s);//set s->block +#endif + s->dsp.clear_blocks(s->block[0]); ret = mpeg_decode_mb(s, s->block); @@ -2133,14 +2162,9 @@ static int mpeg_decode_slice(AVCodecContext *avctx, MPV_decode_mb(s, s->block); if (++s->mb_x >= s->mb_width) { - if(s->avctx->codec_tag == ff_get_fourcc("VCR2")) - exchange_uv((AVFrame*)s->current_picture_ptr); ff_draw_horiz_band(s, 16*s->mb_y, 16); - if(s->avctx->codec_tag == ff_get_fourcc("VCR2")) - exchange_uv((AVFrame*)s->current_picture_ptr); - s->mb_x = 0; s->mb_y++; @@ -2233,8 +2257,6 @@ static int slice_end(AVCodecContext *avctx, AVFrame *pict) ff_print_debug_info(s, s->last_picture_ptr); } } - if(s->avctx->codec_tag == ff_get_fourcc("VCR2")) - exchange_uv(pict); return 1; } else { @@ -2294,11 +2316,13 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx, //get_format() or set_video(width,height,aspect,pix_fmt); //until then pix_fmt may be changed right 
after codec init if( avctx->pix_fmt == PIX_FMT_XVMC_MPEG2_IDCT ) - avctx->idct_algo = FF_IDCT_SIMPLE; + if( avctx->idct_algo == FF_IDCT_AUTO ) + avctx->idct_algo = FF_IDCT_SIMPLE; if (MPV_common_init(s) < 0) return -1; s1->mpeg_enc_ctx_allocated = 1; + s->swap_uv = 0;//just in case vcr2 and mpeg2 stream have been concatinated } skip_bits(&s->gb, 10); /* vbv_buffer_size */ @@ -2378,10 +2402,13 @@ static int vcr2_init_sequence(AVCodecContext *avctx) //get_format() or set_video(width,height,aspect,pix_fmt); //until then pix_fmt may be changed right after codec init if( avctx->pix_fmt == PIX_FMT_XVMC_MPEG2_IDCT ) - avctx->idct_algo = FF_IDCT_SIMPLE; + if( avctx->idct_algo == FF_IDCT_AUTO ) + avctx->idct_algo = FF_IDCT_SIMPLE; if (MPV_common_init(s) < 0) return -1; + exchange_uv(s);//common init reset pblocks, so we swap them here + s->swap_uv = 1;// in case of xvmc we need to swap uv for each MB s1->mpeg_enc_ctx_allocated = 1; for(i=0;i<64;i++) { @@ -2634,14 +2661,14 @@ static int mpeg_mc_decode_init(AVCodecContext *avctx){ if( !(avctx->slice_flags & SLICE_FLAG_CODED_ORDER) ) return -1; - if( !(avctx->slice_flags & SLICE_FLAG_ALLOW_FIELD) ) + if( !(avctx->slice_flags & SLICE_FLAG_ALLOW_FIELD) ){ dprintf("mpeg12.c: XvMC decoder will work better if SLICE_FLAG_ALLOW_FIELD is set\n"); - + } mpeg_decode_init(avctx); s = avctx->priv_data; avctx->pix_fmt = PIX_FMT_XVMC_MPEG2_IDCT; - avctx->xvmc_acceleration = 1; + avctx->xvmc_acceleration = 2;//2 - the blocks are packed! 
return 0; } diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c index b65aec5b35..5a121a178c 100644 --- a/libavcodec/mpegvideo.c +++ b/libavcodec/mpegvideo.c @@ -56,7 +56,7 @@ static int sse_mb(MpegEncContext *s); #ifdef HAVE_XVMC extern int XVMC_field_start(MpegEncContext*s, AVCodecContext *avctx); extern void XVMC_field_end(MpegEncContext *s); -extern void XVMC_decode_mb(MpegEncContext *s, DCTELEM block[6][64]); +extern void XVMC_decode_mb(MpegEncContext *s); #endif void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w)= draw_edges_c; @@ -519,6 +519,10 @@ int MPV_common_init(MpegEncContext *s) s->block= s->blocks[0]; + for(i=0;i<12;i++){ + s->pblocks[i] = (short *)(&s->block[i]); + } + s->parse_context.state= -1; s->context_initialized = 1; @@ -2485,7 +2489,7 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]) const int mb_xy = s->mb_y * s->mb_stride + s->mb_x; #ifdef HAVE_XVMC if(s->avctx->xvmc_acceleration){ - XVMC_decode_mb(s,block); + XVMC_decode_mb(s);//xvmc uses pblocks return; } #endif diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h index 40a3bdfc04..6552931909 100644 --- a/libavcodec/mpegvideo.h +++ b/libavcodec/mpegvideo.h @@ -655,6 +655,8 @@ typedef struct MpegEncContext { int rtp_payload_size; void (*rtp_callback)(void *data, int size, int packet_number); uint8_t *ptr_lastgob; + int swap_uv;//vcr2 codec is mpeg2 varint with UV swaped + short * pblocks[12]; DCTELEM (*block)[64]; ///< points to one of the following blocks DCTELEM (*blocks)[6][64]; // for HQ mode we need to keep the best block diff --git a/libavcodec/xvmcvideo.c b/libavcodec/xvmcvideo.c index b13135b99b..6e6e58fd40 100644 --- a/libavcodec/xvmcvideo.c +++ b/libavcodec/xvmcvideo.c @@ -41,6 +41,33 @@ //#include "xvmc_debug.h" +//set s->block +inline void XVMC_init_block(MpegEncContext *s){ +xvmc_render_state_t * render; + render = (xvmc_render_state_t*)s->current_picture.data[2]; + assert(render != NULL); + if( (render == NULL) || 
(render->magic != MP_XVMC_RENDER_MAGIC) ){ + assert(0); + return;//make sure that this is render packet + } + s->block =(DCTELEM *)(render->data_blocks+(render->next_free_data_block_num)*64); +} + +void XVMC_pack_pblocks(MpegEncContext *s, int cbp){ +int i,j; +#define numblocks 6 + + j=0; + for(i=0;i<numblocks;i++){ + if(cbp & (1<<(numblocks-1-i)) ){ + s->pblocks[i] = (short *)(&s->block[(j++)]); + }else{ + s->pblocks[i] = NULL; + } +// printf("s->pblocks[%d]=%p ,s->block=%p cbp=%d\n",i,s->pblocks[i],s->block,cbp); + } +} + static int calc_cbp(MpegEncContext *s, int blocknum){ /* compute cbp */ // for I420 bit_offset=5 @@ -110,7 +137,7 @@ xvmc_render_state_t * render; } } -void XVMC_decode_mb(MpegEncContext *s, DCTELEM block[6][64]){ +void XVMC_decode_mb(MpegEncContext *s){ XvMCMacroBlock * mv_block; xvmc_render_state_t * render; int i,cbp,blocks_per_mb; @@ -242,14 +269,14 @@ const int mb_xy = s->mb_y * s->mb_stride + s->mb_x; */ if(s->flags & CODEC_FLAG_GRAY){ if(s->mb_intra){//intra frames are alwasy full chroma block - memset(block[4],0,sizeof(short)*8*8);//so we need to clear them - memset(block[5],0,sizeof(short)*8*8); - if(!render->unsigned_intra) - block[4][0] = block[5][0] = 1<<10; - } - else + for(i=4; i<blocks_per_mb; i++){ + memset(s->pblocks[i],0,sizeof(short)*8*8);//so we need to clear them + if(!render->unsigned_intra) + s->pblocks[i][0] = 1<<10; + } + }else blocks_per_mb = 4;//Luminance blocks only - }; + } cbp = calc_cbp(s,blocks_per_mb); mv_block->coded_block_pattern = cbp; if(cbp == 0) @@ -259,14 +286,24 @@ const int mb_xy = s->mb_y * s->mb_stride + s->mb_x; if(s->block_last_index[i] >= 0){ // i do not have unsigned_intra MOCO to test, hope it is OK if( (s->mb_intra) && ( render->idct || (!render->idct && !render->unsigned_intra)) ) - block[i][0]-=1<<10; + s->pblocks[i][0]-=1<<10; if(!render->idct){ - s->dsp.idct(block[i]); + s->dsp.idct(s->pblocks[i]); //!!TODO!clip!!! 
} -//TODO:avoid block copy by modifying s->block pointer - memcpy(&render->data_blocks[(render->next_free_data_block_num++)*64], - block[i],sizeof(short)*8*8); +//copy blocks only if the codec doesn't support pblocks reordering + if(s->avctx->xvmc_acceleration == 1){ + memcpy(&render->data_blocks[(render->next_free_data_block_num)*64], + s->pblocks[i],sizeof(short)*8*8); + }else{ +/* if(s->pblocks[i] != &render->data_blocks[ + (render->next_free_data_block_num)*64]){ + printf("ERROR mb(%d,%d) s->pblocks[i]=%p data_block[]=%p\n", + s->mb_x,s->mb_y, s->pblocks[i], + &render->data_blocks[(render->next_free_data_block_num)*64]); + }*/ + } + render->next_free_data_block_num++; } } render->filled_mv_blocks_num++;