diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index f54d74d285..2d15bd3538 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -486,6 +486,10 @@ typedef struct DSPContext { void (*vc1_inv_trans_8x4)(uint8_t *dest, int line_size, DCTELEM *block); void (*vc1_inv_trans_4x8)(uint8_t *dest, int line_size, DCTELEM *block); void (*vc1_inv_trans_4x4)(uint8_t *dest, int line_size, DCTELEM *block); + void (*vc1_inv_trans_8x8_dc)(uint8_t *dest, int line_size, DCTELEM *block); + void (*vc1_inv_trans_8x4_dc)(uint8_t *dest, int line_size, DCTELEM *block); + void (*vc1_inv_trans_4x8_dc)(uint8_t *dest, int line_size, DCTELEM *block); + void (*vc1_inv_trans_4x4_dc)(uint8_t *dest, int line_size, DCTELEM *block); void (*vc1_v_overlap)(uint8_t* src, int stride); void (*vc1_h_overlap)(uint8_t* src, int stride); void (*vc1_v_loop_filter4)(uint8_t *src, int stride, int pq); diff --git a/libavcodec/vc1.c b/libavcodec/vc1.c index 57982444fe..ec6a660ad9 100644 --- a/libavcodec/vc1.c +++ b/libavcodec/vc1.c @@ -337,6 +337,10 @@ int vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitConte v->s.dsp.vc1_inv_trans_8x4 = ff_simple_idct84_add; v->s.dsp.vc1_inv_trans_4x8 = ff_simple_idct48_add; v->s.dsp.vc1_inv_trans_4x4 = ff_simple_idct44_add; + v->s.dsp.vc1_inv_trans_8x8_dc = ff_simple_idct_add; + v->s.dsp.vc1_inv_trans_8x4_dc = ff_simple_idct84_add; + v->s.dsp.vc1_inv_trans_4x8_dc = ff_simple_idct48_add; + v->s.dsp.vc1_inv_trans_4x4_dc = ff_simple_idct44_add; } v->fastuvmc = get_bits1(gb); //common diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c index 5d4dd0633b..6172e0c047 100644 --- a/libavcodec/vc1dec.c +++ b/libavcodec/vc1dec.c @@ -2028,8 +2028,12 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan block[idx] += (block[idx] < 0) ? -mquant : mquant; } if(!skip_block){ - s->dsp.vc1_inv_trans_8x8(block); - s->dsp.add_pixels_clamped(block, dst, linesize); + if(i==1) + s->dsp.vc1_inv_trans_8x8_dc(dst, linesize, block); + else{ + s->dsp.vc1_inv_trans_8x8(block); + s->dsp.add_pixels_clamped(block, dst, linesize); + } if(apply_filter && cbp_top & 0xC) s->dsp.vc1_v_loop_filter8(dst, linesize, v->pq); if(apply_filter && cbp_left & 0xA) @@ -2053,7 +2057,10 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan block[idx + off] += (block[idx + off] < 0) ? -mquant : mquant; } if(!(subblkpat & (1 << (3 - j))) && !skip_block){ - s->dsp.vc1_inv_trans_4x4(dst + (j&1)*4 + (j&2)*2*linesize, linesize, block + off); + if(i==1) + s->dsp.vc1_inv_trans_4x4_dc(dst + (j&1)*4 + (j&2)*2*linesize, linesize, block + off); + else + s->dsp.vc1_inv_trans_4x4(dst + (j&1)*4 + (j&2)*2*linesize, linesize, block + off); if(apply_filter && (j&2 ? pat & (1<<(j-2)) : (cbp_top & (1 << (j + 2))))) s->dsp.vc1_v_loop_filter4(dst + (j&1)*4 + (j&2)*2*linesize, linesize, v->pq); if(apply_filter && (j&1 ? pat & (1<<(j-1)) : (cbp_left & (1 << (j + 1))))) @@ -2078,7 +2085,10 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan block[idx] += (block[idx] < 0) ? -mquant : mquant; } if(!(subblkpat & (1 << (1 - j))) && !skip_block){ - s->dsp.vc1_inv_trans_8x4(dst + j*4*linesize, linesize, block + off); + if(i==1) + s->dsp.vc1_inv_trans_8x4_dc(dst + j*4*linesize, linesize, block + off); + else + s->dsp.vc1_inv_trans_8x4(dst + j*4*linesize, linesize, block + off); if(apply_filter && j ? pat & 0x3 : (cbp_top & 0xC)) s->dsp.vc1_v_loop_filter8(dst + j*4*linesize, linesize, v->pq); if(apply_filter && cbp_left & (2 << j)) @@ -2103,7 +2113,10 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan block[idx] += (block[idx] < 0) ? -mquant : mquant; } if(!(subblkpat & (1 << (1 - j))) && !skip_block){ - s->dsp.vc1_inv_trans_4x8(dst + j*4, linesize, block + off); + if(i==1) + s->dsp.vc1_inv_trans_4x8_dc(dst + j*4, linesize, block + off); + else + s->dsp.vc1_inv_trans_4x8(dst + j*4, linesize, block + off); if(apply_filter && cbp_top & (2 << j)) s->dsp.vc1_v_loop_filter4(dst + j*4, linesize, v->pq); if(apply_filter && j ? pat & 0x5 : (cbp_left & 0xA)) diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c index 5773ab1e3c..5e79736a50 100644 --- a/libavcodec/vc1dsp.c +++ b/libavcodec/vc1dsp.c @@ -178,6 +178,26 @@ static void vc1_h_loop_filter16_c(uint8_t *src, int stride, int pq) /** Do inverse transform on 8x8 block */ +static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block) +{ + int i; + int dc = block[0]; + const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + dc = (3 * dc + 1) >> 1; + dc = (3 * dc + 16) >> 5; + for(i = 0; i < 8; i++){ + dest[0] = cm[dest[0]+dc]; + dest[1] = cm[dest[1]+dc]; + dest[2] = cm[dest[2]+dc]; + dest[3] = cm[dest[3]+dc]; + dest[4] = cm[dest[4]+dc]; + dest[5] = cm[dest[5]+dc]; + dest[6] = cm[dest[6]+dc]; + dest[7] = cm[dest[7]+dc]; + dest += linesize; + } +} + static void vc1_inv_trans_8x8_c(DCTELEM block[64]) { int i; @@ -249,6 +269,26 @@ static void vc1_inv_trans_8x8_c(DCTELEM block[64]) /** Do inverse transform on 8x4 part of block */ +static void vc1_inv_trans_8x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block) +{ + int i; + int dc = block[0]; + const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + dc = ( 3 * dc + 1) >> 1; + dc = (17 * dc + 64) >> 7; + for(i = 0; i < 4; i++){ + dest[0] = cm[dest[0]+dc]; + dest[1] = cm[dest[1]+dc]; + dest[2] = cm[dest[2]+dc]; + dest[3] = cm[dest[3]+dc]; + dest[4] = cm[dest[4]+dc]; + dest[5] = cm[dest[5]+dc]; + dest[6] = cm[dest[6]+dc]; + dest[7] = cm[dest[7]+dc]; + dest += linesize; + } +} + static void vc1_inv_trans_8x4_c(uint8_t *dest, int linesize, DCTELEM *block) { int i; @@ -306,6 +346,22 @@ static void vc1_inv_trans_8x4_c(uint8_t *dest, int linesize, DCTELEM *block) /** Do inverse transform on 4x8 parts of block */ +static void vc1_inv_trans_4x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block) +{ + int i; + int dc = block[0]; + const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + dc = (17 * dc + 4) >> 3; + dc = (12 * dc + 64) >> 7; + for(i = 0; i < 8; i++){ + dest[0] = cm[dest[0]+dc]; + dest[1] = cm[dest[1]+dc]; + dest[2] = cm[dest[2]+dc]; + dest[3] = cm[dest[3]+dc]; + dest += linesize; + } +} + static void vc1_inv_trans_4x8_c(uint8_t *dest, int linesize, DCTELEM *block) { int i; @@ -363,6 +419,22 @@ static void vc1_inv_trans_4x8_c(uint8_t *dest, int linesize, DCTELEM *block) /** Do inverse transform on 4x4 part of block */ +static void vc1_inv_trans_4x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block) +{ + int i; + int dc = block[0]; + const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + dc = (17 * dc + 4) >> 3; + dc = (17 * dc + 64) >> 7; + for(i = 0; i < 4; i++){ + dest[0] = cm[dest[0]+dc]; + dest[1] = cm[dest[1]+dc]; + dest[2] = cm[dest[2]+dc]; + dest[3] = cm[dest[3]+dc]; + dest += linesize; + } +} + static void vc1_inv_trans_4x4_c(uint8_t *dest, int linesize, DCTELEM *block) { int i; @@ -545,6 +617,10 @@ void ff_vc1dsp_init(DSPContext* dsp, AVCodecContext *avctx) { dsp->vc1_inv_trans_4x8 = vc1_inv_trans_4x8_c; dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_c; dsp->vc1_inv_trans_4x4 = vc1_inv_trans_4x4_c; + dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_c; + dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_c; + dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_c; + dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_c; dsp->vc1_h_overlap = vc1_h_overlap_c; dsp->vc1_v_overlap = vc1_v_overlap_c; dsp->vc1_v_loop_filter4 = vc1_v_loop_filter4_c; diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c index 2c98b37b97..3071d02808 100644 --- a/libavcodec/x86/vc1dsp_mmx.c +++ b/libavcodec/x86/vc1dsp_mmx.c @@ -494,6 +494,204 @@ DECLARE_FUNCTION(3, 1) DECLARE_FUNCTION(3, 2) DECLARE_FUNCTION(3, 3) +static void vc1_inv_trans_4x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block) +{ + int dc = block[0]; + dc = (17 * dc + 4) >> 3; + dc = (17 * dc + 64) >> 7; + __asm__ volatile( + "movd %0, %%mm0 \n\t" + "pshufw $0, %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "psubw %%mm0, %%mm1 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + ::"r"(dc) + ); + __asm__ volatile( + "movd %0, %%mm2 \n\t" + "movd %1, %%mm3 \n\t" + "movd %2, %%mm4 \n\t" + "movd %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movd %%mm2, %0 \n\t" + "movd %%mm3, %1 \n\t" + "movd %%mm4, %2 \n\t" + "movd %%mm5, %3 \n\t" + :"+m"(*(uint32_t*)(dest+0*linesize)), + "+m"(*(uint32_t*)(dest+1*linesize)), + "+m"(*(uint32_t*)(dest+2*linesize)), + "+m"(*(uint32_t*)(dest+3*linesize)) + ); +} + +static void vc1_inv_trans_4x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block) +{ + int dc = block[0]; + dc = (17 * dc + 4) >> 3; + dc = (12 * dc + 64) >> 7; + __asm__ volatile( + "movd %0, %%mm0 \n\t" + "pshufw $0, %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "psubw %%mm0, %%mm1 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + ::"r"(dc) + ); + __asm__ volatile( + "movd %0, %%mm2 \n\t" + "movd %1, %%mm3 \n\t" + "movd %2, %%mm4 \n\t" + "movd %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movd %%mm2, %0 \n\t" + "movd %%mm3, %1 \n\t" + "movd %%mm4, %2 \n\t" + "movd %%mm5, %3 \n\t" + :"+m"(*(uint32_t*)(dest+0*linesize)), + "+m"(*(uint32_t*)(dest+1*linesize)), + "+m"(*(uint32_t*)(dest+2*linesize)), + "+m"(*(uint32_t*)(dest+3*linesize)) + ); + dest += 4*linesize; + __asm__ volatile( + "movd %0, %%mm2 \n\t" + "movd %1, %%mm3 \n\t" + "movd %2, %%mm4 \n\t" + "movd %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movd %%mm2, %0 \n\t" + "movd %%mm3, %1 \n\t" + "movd %%mm4, %2 \n\t" + "movd %%mm5, %3 \n\t" + :"+m"(*(uint32_t*)(dest+0*linesize)), + "+m"(*(uint32_t*)(dest+1*linesize)), + "+m"(*(uint32_t*)(dest+2*linesize)), + "+m"(*(uint32_t*)(dest+3*linesize)) + ); +} + +static void vc1_inv_trans_8x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block) +{ + int dc = block[0]; + dc = ( 3 * dc + 1) >> 1; + dc = (17 * dc + 64) >> 7; + __asm__ volatile( + "movd %0, %%mm0 \n\t" + "pshufw $0, %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "psubw %%mm0, %%mm1 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + ::"r"(dc) + ); + __asm__ volatile( + "movq %0, %%mm2 \n\t" + "movq %1, %%mm3 \n\t" + "movq %2, %%mm4 \n\t" + "movq %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movq %%mm2, %0 \n\t" + "movq %%mm3, %1 \n\t" + "movq %%mm4, %2 \n\t" + "movq %%mm5, %3 \n\t" + :"+m"(*(uint32_t*)(dest+0*linesize)), + "+m"(*(uint32_t*)(dest+1*linesize)), + "+m"(*(uint32_t*)(dest+2*linesize)), + "+m"(*(uint32_t*)(dest+3*linesize)) + ); +} + +static void vc1_inv_trans_8x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block) +{ + int dc = block[0]; + dc = (3 * dc + 1) >> 1; + dc = (3 * dc + 16) >> 5; + __asm__ volatile( + "movd %0, %%mm0 \n\t" + "pshufw $0, %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "psubw %%mm0, %%mm1 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + ::"r"(dc) + ); + __asm__ volatile( + "movq %0, %%mm2 \n\t" + "movq %1, %%mm3 \n\t" + "movq %2, %%mm4 \n\t" + "movq %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movq %%mm2, %0 \n\t" + "movq %%mm3, %1 \n\t" + "movq %%mm4, %2 \n\t" + "movq %%mm5, %3 \n\t" + :"+m"(*(uint32_t*)(dest+0*linesize)), + "+m"(*(uint32_t*)(dest+1*linesize)), + "+m"(*(uint32_t*)(dest+2*linesize)), + "+m"(*(uint32_t*)(dest+3*linesize)) + ); + dest += 4*linesize; + __asm__ volatile( + "movq %0, %%mm2 \n\t" + "movq %1, %%mm3 \n\t" + "movq %2, %%mm4 \n\t" + "movq %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movq %%mm2, %0 \n\t" + "movq %%mm3, %1 \n\t" + "movq %%mm4, %2 \n\t" + "movq %%mm5, %3 \n\t" + :"+m"(*(uint32_t*)(dest+0*linesize)), + "+m"(*(uint32_t*)(dest+1*linesize)), + "+m"(*(uint32_t*)(dest+2*linesize)), + "+m"(*(uint32_t*)(dest+3*linesize)) + ); +} + void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { mm_flags = mm_support(); @@ -537,5 +735,10 @@ void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2; dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2; dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2; + + dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2; + dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2; + dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2; + dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2; } }