mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
h264_idct8_add_mmx
Originally committed as revision 5123 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
6da971f160
commit
548a1c8a35
@ -358,8 +358,12 @@ typedef struct H264Context{
|
||||
|
||||
uint8_t zigzag_scan[16];
|
||||
uint8_t field_scan[16];
|
||||
uint8_t zigzag_scan8x8[64];
|
||||
uint8_t zigzag_scan8x8_cavlc[64];
|
||||
const uint8_t *zigzag_scan_q0;
|
||||
const uint8_t *field_scan_q0;
|
||||
const uint8_t *zigzag_scan8x8_q0;
|
||||
const uint8_t *zigzag_scan8x8_cavlc_q0;
|
||||
|
||||
int x264_build;
|
||||
}H264Context;
|
||||
@ -2953,6 +2957,7 @@ static void free_tables(H264Context *h){
|
||||
|
||||
static void init_dequant8_coeff_table(H264Context *h){
|
||||
int i,q,x;
|
||||
const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
|
||||
h->dequant8_coeff[0] = h->dequant8_buffer[0];
|
||||
h->dequant8_coeff[1] = h->dequant8_buffer[1];
|
||||
|
||||
@ -2966,8 +2971,9 @@ static void init_dequant8_coeff_table(H264Context *h){
|
||||
int shift = div6[q];
|
||||
int idx = rem6[q];
|
||||
for(x=0; x<64; x++)
|
||||
h->dequant8_coeff[i][q][x] = ((uint32_t)dequant8_coeff_init[idx][
|
||||
dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] * h->pps.scaling_matrix8[i][x]) << shift;
|
||||
h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
|
||||
((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
|
||||
h->pps.scaling_matrix8[i][x]) << shift;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -4317,14 +4323,31 @@ static int decode_slice_header(H264Context *h){
|
||||
#define T(x) (x>>2) | ((x<<2) & 0xF)
|
||||
h->zigzag_scan[i] = T(zigzag_scan[i]);
|
||||
h-> field_scan[i] = T( field_scan[i]);
|
||||
#undef T
|
||||
}
|
||||
}
|
||||
if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
|
||||
memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t));
|
||||
memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
|
||||
}else{
|
||||
int i;
|
||||
for(i=0; i<64; i++){
|
||||
#define T(x) (x>>3) | ((x&7)<<3)
|
||||
h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]);
|
||||
h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
|
||||
#undef T
|
||||
}
|
||||
}
|
||||
if(h->sps.transform_bypass){ //FIXME same ugly
|
||||
h->zigzag_scan_q0 = zigzag_scan;
|
||||
h->field_scan_q0 = field_scan;
|
||||
h->zigzag_scan8x8_q0 = zigzag_scan8x8;
|
||||
h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
|
||||
}else{
|
||||
h->zigzag_scan_q0 = h->zigzag_scan;
|
||||
h->field_scan_q0 = h->field_scan;
|
||||
h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
|
||||
h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
|
||||
}
|
||||
|
||||
alloc_tables(h);
|
||||
@ -5101,7 +5124,7 @@ decode_intra_mb:
|
||||
int i8x8, i4x4, chroma_idx;
|
||||
int chroma_qp, dquant;
|
||||
GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
|
||||
const uint8_t *scan, *dc_scan;
|
||||
const uint8_t *scan, *scan8x8, *dc_scan;
|
||||
|
||||
// fill_non_zero_count_cache(h);
|
||||
|
||||
@ -5112,6 +5135,7 @@ decode_intra_mb:
|
||||
scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
|
||||
dc_scan= luma_dc_zigzag_scan;
|
||||
}
|
||||
scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
|
||||
|
||||
dquant= get_se_golomb(&s->gb);
|
||||
|
||||
@ -5153,7 +5177,7 @@ decode_intra_mb:
|
||||
DCTELEM *buf = &h->mb[64*i8x8];
|
||||
uint8_t *nnz;
|
||||
for(i4x4=0; i4x4<4; i4x4++){
|
||||
if( decode_residual(h, gb, buf, i4x4+4*i8x8, zigzag_scan8x8_cavlc+16*i4x4,
|
||||
if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
|
||||
h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
|
||||
return -1;
|
||||
}
|
||||
@ -6144,7 +6168,7 @@ decode_intra_mb:
|
||||
s->current_picture.mb_type[mb_xy]= mb_type;
|
||||
|
||||
if( cbp || IS_INTRA16x16( mb_type ) ) {
|
||||
const uint8_t *scan, *dc_scan;
|
||||
const uint8_t *scan, *scan8x8, *dc_scan;
|
||||
int dqp;
|
||||
|
||||
if(IS_INTERLACED(mb_type)){
|
||||
@ -6154,6 +6178,7 @@ decode_intra_mb:
|
||||
scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
|
||||
dc_scan= luma_dc_zigzag_scan;
|
||||
}
|
||||
scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
|
||||
|
||||
h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
|
||||
if( dqp == INT_MIN ){
|
||||
@ -6187,7 +6212,7 @@ decode_intra_mb:
|
||||
if( cbp & (1<<i8x8) ) {
|
||||
if( IS_8x8DCT(mb_type) ) {
|
||||
if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
|
||||
zigzag_scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
|
||||
scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
|
||||
return -1;
|
||||
} else
|
||||
for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
|
||||
|
@ -2734,6 +2734,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
|
||||
c->h264_idct_dc_add=
|
||||
c->h264_idct_add= ff_h264_idct_add_mmx;
|
||||
c->h264_idct8_dc_add=
|
||||
c->h264_idct8_add= ff_h264_idct8_add_mmx;
|
||||
|
||||
if (mm_flags & MM_MMXEXT) {
|
||||
c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
|
||||
|
@ -104,6 +104,133 @@ static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * One 1-D pass of the 8-point H.264 inverse transform, done with MMX on
 * four 16-bit lanes at a time.
 *
 * block: base of the data to transform. Eight quadwords are read from byte
 *        offsets 0, 16, 32, ... 112, i.e. 4 int16_t values taken every
 *        8 int16_t. They are called s0..s7 in the comments below.
 *
 * Nothing is written back to memory here. The eight results are left in
 * %mm0-%mm7 and are picked up by the asm statement that follows each call
 * in ff_h264_idct8_add_mmx().
 *
 * NOTE(review): the asm has no output operands and no clobber list, although
 * it reads memory through block and overwrites all eight MMX registers.
 * Callers depend on the register contents surviving until their next asm
 * statement. A "memory" clobber looks advisable so that earlier C stores to
 * *block cannot be moved past this statement -- confirm against the GCC
 * extended-asm rules.
 */
static inline void h264_idct8_1d(int16_t *block)
|
||||
{
|
||||
asm volatile(
|
||||
/* load the odd inputs: mm7 = s7, mm5 = s5, mm3 = s3, mm1 = s1 */
"movq 112(%0), %%mm7 \n\t"
|
||||
"movq 80(%0), %%mm5 \n\t"
|
||||
"movq 48(%0), %%mm3 \n\t"
|
||||
"movq 16(%0), %%mm1 \n\t"
|
||||
|
||||
/* mm0 = a1 = s5 - s7 - (s7>>1) - s3
   mm2 = a3 = s7 - (s3>>1) - s3 + s1 */
"movq %%mm7, %%mm4 \n\t"
|
||||
"movq %%mm3, %%mm6 \n\t"
|
||||
"movq %%mm5, %%mm0 \n\t"
|
||||
"movq %%mm7, %%mm2 \n\t"
|
||||
"psraw $1, %%mm4 \n\t"
|
||||
"psraw $1, %%mm6 \n\t"
|
||||
"psubw %%mm7, %%mm0 \n\t"
|
||||
"psubw %%mm6, %%mm2 \n\t"
|
||||
"psubw %%mm4, %%mm0 \n\t"
|
||||
"psubw %%mm3, %%mm2 \n\t"
|
||||
"psubw %%mm3, %%mm0 \n\t"
|
||||
"paddw %%mm1, %%mm2 \n\t"
|
||||
|
||||
/* mm4 = a5 = (s5>>1) + s5 + s7 - s1
   mm6 = a7 = (s1>>1) + s1 + s5 + s3 */
"movq %%mm5, %%mm4 \n\t"
|
||||
"movq %%mm1, %%mm6 \n\t"
|
||||
"psraw $1, %%mm4 \n\t"
|
||||
"psraw $1, %%mm6 \n\t"
|
||||
"paddw %%mm5, %%mm4 \n\t"
|
||||
"paddw %%mm1, %%mm6 \n\t"
|
||||
"paddw %%mm7, %%mm4 \n\t"
|
||||
"paddw %%mm5, %%mm6 \n\t"
|
||||
"psubw %%mm1, %%mm4 \n\t"
|
||||
"paddw %%mm3, %%mm6 \n\t"
|
||||
|
||||
/* combine the odd terms:
   mm1 = a1 + (a7>>2),  mm3 = a3 + (a5>>2),
   mm5 = (a3>>2) - a5,  mm7 = a7 - (a1>>2) */
"movq %%mm0, %%mm1 \n\t"
|
||||
"movq %%mm4, %%mm3 \n\t"
|
||||
"movq %%mm2, %%mm5 \n\t"
|
||||
"movq %%mm6, %%mm7 \n\t"
|
||||
"psraw $2, %%mm6 \n\t"
|
||||
"psraw $2, %%mm3 \n\t"
|
||||
"psraw $2, %%mm5 \n\t"
|
||||
"psraw $2, %%mm0 \n\t"
|
||||
"paddw %%mm6, %%mm1 \n\t"
|
||||
"paddw %%mm2, %%mm3 \n\t"
|
||||
"psubw %%mm4, %%mm5 \n\t"
|
||||
"psubw %%mm0, %%mm7 \n\t"
|
||||
|
||||
/* even inputs s2 and s6:
   mm4 = (s2>>1) - s6,  mm6 = (s6>>1) + s2 */
"movq 32(%0), %%mm2 \n\t"
|
||||
"movq 96(%0), %%mm6 \n\t"
|
||||
"movq %%mm2, %%mm4 \n\t"
|
||||
"movq %%mm6, %%mm0 \n\t"
|
||||
"psraw $1, %%mm4 \n\t"
|
||||
"psraw $1, %%mm6 \n\t"
|
||||
"psubw %%mm0, %%mm4 \n\t"
|
||||
"paddw %%mm2, %%mm6 \n\t"
|
||||
|
||||
/* mm2 = s0, mm0 = s4, then the butterfly stages.
   SUMSUB_BA is defined elsewhere in this file; presumably it replaces the
   register pair with its sum and difference -- TODO confirm operand roles. */
"movq (%0), %%mm2 \n\t"
|
||||
"movq 64(%0), %%mm0 \n\t"
|
||||
SUMSUB_BA( %%mm0, %%mm2 )
|
||||
SUMSUB_BA( %%mm6, %%mm0 )
|
||||
SUMSUB_BA( %%mm4, %%mm2 )
|
||||
SUMSUB_BA( %%mm7, %%mm6 )
|
||||
SUMSUB_BA( %%mm5, %%mm4 )
|
||||
SUMSUB_BA( %%mm3, %%mm2 )
|
||||
SUMSUB_BA( %%mm1, %%mm0 )
|
||||
/* only input operand: the base pointer, referenced as %0 above */
:: "r"(block)
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * 8x8 H.264 inverse transform using MMX, followed by adding the result to
 * the destination through add_pixels_clamped_mmx().
 *
 * dst:    destination pixels, passed unchanged to add_pixels_clamped_mmx().
 * block:  64 coefficients; block[0] is modified in place (32 is added).
 * stride: passed unchanged to add_pixels_clamped_mmx().
 *
 * The transform runs as two 1-D passes, each split into two halves of four
 * 16-bit lanes. The first pass reads block and writes a rearranged copy into
 * the local buffer b2; the second pass works in place on b2.
 *
 * NOTE(review): every asm statement below consumes %mm0-%mm7 exactly as
 * left by the preceding h264_idct8_1d() call, so the call/asm pairs must stay
 * adjacent and in this order.
 */
static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
|
||||
{
|
||||
int i;
|
||||
/* intermediate 8x8 buffer, 8-byte aligned for the movq accesses below */
int16_t __attribute__ ((aligned(8))) b2[64];
|
||||
|
||||
/* 32 is half of the 1<<6 divisor applied by the final psraw $6;
   presumably the rounding term -- TODO confirm. Note this changes the caller's block. */
block[0] += 32;
|
||||
|
||||
/* first pass: transform block four lanes at a time, store rearranged into b2 */
for(i=0; i<2; i++){
|
||||
/* spill slot for %mm7 while it is handed to TRANSPOSE4 as its fifth argument */
uint64_t tmp;
|
||||
|
||||
h264_idct8_1d(block+4*i);
|
||||
|
||||
asm volatile(
|
||||
/* TRANSPOSE4 is defined elsewhere in this file; presumably it transposes a
   4x4 block of words using its last argument as scratch -- TODO confirm.
   This first group is stored to the odd quadwords (byte offsets 8, 24, 40, 56). */
"movq %%mm7, %0 \n\t"
|
||||
TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
|
||||
"movq %%mm0, 8(%1) \n\t"
|
||||
"movq %%mm6, 24(%1) \n\t"
|
||||
"movq %%mm7, 40(%1) \n\t"
|
||||
"movq %%mm4, 56(%1) \n\t"
|
||||
/* restore the saved %mm7; the second group is stored to the even quadwords
   (byte offsets 0, 16, 32, 48) */
"movq %0, %%mm7 \n\t"
|
||||
TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
|
||||
"movq %%mm7, (%1) \n\t"
|
||||
"movq %%mm1, 16(%1) \n\t"
|
||||
"movq %%mm0, 32(%1) \n\t"
|
||||
"movq %%mm3, 48(%1) \n\t"
|
||||
: "=m"(tmp)
|
||||
/* each iteration fills 32 int16_t (half) of b2 */
: "r"(b2+32*i)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
/* second pass: transform b2 in place, four lanes at a time */
for(i=0; i<2; i++){
|
||||
h264_idct8_1d(b2+4*i);
|
||||
|
||||
asm volatile(
|
||||
/* arithmetic shift right by 6 on all eight result registers */
"psraw $6, %%mm7 \n\t"
|
||||
"psraw $6, %%mm6 \n\t"
|
||||
"psraw $6, %%mm5 \n\t"
|
||||
"psraw $6, %%mm4 \n\t"
|
||||
"psraw $6, %%mm3 \n\t"
|
||||
"psraw $6, %%mm2 \n\t"
|
||||
"psraw $6, %%mm1 \n\t"
|
||||
"psraw $6, %%mm0 \n\t"
|
||||
|
||||
/* write back at 16-byte steps in register order 7,5,3,1,0,2,4,6 */
"movq %%mm7, (%0) \n\t"
|
||||
"movq %%mm5, 16(%0) \n\t"
|
||||
"movq %%mm3, 32(%0) \n\t"
|
||||
"movq %%mm1, 48(%0) \n\t"
|
||||
"movq %%mm0, 64(%0) \n\t"
|
||||
"movq %%mm2, 80(%0) \n\t"
|
||||
"movq %%mm4, 96(%0) \n\t"
|
||||
"movq %%mm6, 112(%0) \n\t"
|
||||
:: "r"(b2+4*i)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
/* defined elsewhere; presumably adds b2 to dst with clamping to the pixel range -- TODO confirm */
add_pixels_clamped_mmx(b2, dst, stride);
|
||||
}
|
||||
|
||||
static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
|
||||
{
|
||||
int dc = (block[0] + 32) >> 6;
|
||||
|
Loading…
Reference in New Issue
Block a user