mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-04-02 20:35:37 +02:00
H.264: split luma dc idct out and implement MMX/SSE2 versions
About 2.5x the speed. NOTE: the way that the asm code handles large qmuls is a bit suboptimal. If x264-style dequant were used (separate shift and qmul values), it might be possible to get some extra speed. Originally committed as revision 26336 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
6c18f1cda2
commit
19fb234e4a
@ -64,6 +64,10 @@ void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *bl
|
|||||||
void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
|
void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
|
||||||
void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
|
void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
|
||||||
|
|
||||||
|
void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul);
|
||||||
|
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp);
|
||||||
|
void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
|
||||||
|
|
||||||
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
|
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
|
||||||
const float *win, float add_bias, int len);
|
const float *win, float add_bias, int len);
|
||||||
void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
|
void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
|
||||||
|
@ -246,46 +246,6 @@ int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* IDCT transforms the 16 dc values and dequantizes them.
|
|
||||||
* @param qp quantization parameter
|
|
||||||
*/
|
|
||||||
static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
|
|
||||||
#define stride 16
|
|
||||||
int i;
|
|
||||||
int temp[16]; //FIXME check if this is a good idea
|
|
||||||
static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
|
|
||||||
static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
|
|
||||||
|
|
||||||
//memset(block, 64, 2*256);
|
|
||||||
//return;
|
|
||||||
for(i=0; i<4; i++){
|
|
||||||
const int offset= y_offset[i];
|
|
||||||
const int z0= block[offset+stride*0] + block[offset+stride*4];
|
|
||||||
const int z1= block[offset+stride*0] - block[offset+stride*4];
|
|
||||||
const int z2= block[offset+stride*1] - block[offset+stride*5];
|
|
||||||
const int z3= block[offset+stride*1] + block[offset+stride*5];
|
|
||||||
|
|
||||||
temp[4*i+0]= z0+z3;
|
|
||||||
temp[4*i+1]= z1+z2;
|
|
||||||
temp[4*i+2]= z1-z2;
|
|
||||||
temp[4*i+3]= z0-z3;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(i=0; i<4; i++){
|
|
||||||
const int offset= x_offset[i];
|
|
||||||
const int z0= temp[4*0+i] + temp[4*2+i];
|
|
||||||
const int z1= temp[4*0+i] - temp[4*2+i];
|
|
||||||
const int z2= temp[4*1+i] - temp[4*3+i];
|
|
||||||
const int z3= temp[4*1+i] + temp[4*3+i];
|
|
||||||
|
|
||||||
block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
|
|
||||||
block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
|
|
||||||
block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
|
|
||||||
block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
/**
|
/**
|
||||||
* DCT transforms the 16 dc values.
|
* DCT transforms the 16 dc values.
|
||||||
@ -1245,9 +1205,15 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
|
|||||||
h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
|
h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
|
||||||
if(is_h264){
|
if(is_h264){
|
||||||
if(!transform_bypass)
|
if(!transform_bypass)
|
||||||
h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
|
h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]);
|
||||||
|
else{
|
||||||
|
static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
|
||||||
|
8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
|
||||||
|
for(i = 0; i < 16; i++)
|
||||||
|
h->mb[dc_mapping[i]] = h->mb_luma_dc[i];
|
||||||
|
}
|
||||||
}else
|
}else
|
||||||
ff_svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
|
ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale);
|
||||||
}
|
}
|
||||||
if(h->deblocking_filter)
|
if(h->deblocking_filter)
|
||||||
xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
|
xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
|
||||||
|
@ -406,6 +406,7 @@ typedef struct H264Context{
|
|||||||
GetBitContext *inter_gb_ptr;
|
GetBitContext *inter_gb_ptr;
|
||||||
|
|
||||||
DECLARE_ALIGNED(16, DCTELEM, mb)[16*24];
|
DECLARE_ALIGNED(16, DCTELEM, mb)[16*24];
|
||||||
|
DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16];
|
||||||
DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
|
DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -600,10 +601,6 @@ typedef struct H264Context{
|
|||||||
|
|
||||||
extern const uint8_t ff_h264_chroma_qp[52];
|
extern const uint8_t ff_h264_chroma_qp[52];
|
||||||
|
|
||||||
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
|
|
||||||
|
|
||||||
void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Decode SEI
|
* Decode SEI
|
||||||
*/
|
*/
|
||||||
|
@ -1597,17 +1597,15 @@ decode_intra_mb:
|
|||||||
s->current_picture.mb_type[mb_xy]= mb_type;
|
s->current_picture.mb_type[mb_xy]= mb_type;
|
||||||
|
|
||||||
if( cbp || IS_INTRA16x16( mb_type ) ) {
|
if( cbp || IS_INTRA16x16( mb_type ) ) {
|
||||||
const uint8_t *scan, *scan8x8, *dc_scan;
|
const uint8_t *scan, *scan8x8;
|
||||||
const uint32_t *qmul;
|
const uint32_t *qmul;
|
||||||
|
|
||||||
if(IS_INTERLACED(mb_type)){
|
if(IS_INTERLACED(mb_type)){
|
||||||
scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
|
scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
|
||||||
scan= s->qscale ? h->field_scan : h->field_scan_q0;
|
scan= s->qscale ? h->field_scan : h->field_scan_q0;
|
||||||
dc_scan= luma_dc_field_scan;
|
|
||||||
}else{
|
}else{
|
||||||
scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
|
scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
|
||||||
scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
|
scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
|
||||||
dc_scan= luma_dc_zigzag_scan;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// decode_cabac_mb_dqp
|
// decode_cabac_mb_dqp
|
||||||
@ -1642,7 +1640,9 @@ decode_intra_mb:
|
|||||||
if( IS_INTRA16x16( mb_type ) ) {
|
if( IS_INTRA16x16( mb_type ) ) {
|
||||||
int i;
|
int i;
|
||||||
//av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
|
//av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
|
||||||
decode_cabac_residual_dc( h, h->mb, 0, 0, dc_scan, 16);
|
AV_ZERO128(h->mb_luma_dc+0);
|
||||||
|
AV_ZERO128(h->mb_luma_dc+8);
|
||||||
|
decode_cabac_residual_dc( h, h->mb_luma_dc, 0, 0, scan, 16);
|
||||||
|
|
||||||
if( cbp&15 ) {
|
if( cbp&15 ) {
|
||||||
qmul = h->dequant4_coeff[0][s->qscale];
|
qmul = h->dequant4_coeff[0][s->qscale];
|
||||||
|
@ -911,16 +911,14 @@ decode_intra_mb:
|
|||||||
int i8x8, i4x4, chroma_idx;
|
int i8x8, i4x4, chroma_idx;
|
||||||
int dquant;
|
int dquant;
|
||||||
GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
|
GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
|
||||||
const uint8_t *scan, *scan8x8, *dc_scan;
|
const uint8_t *scan, *scan8x8;
|
||||||
|
|
||||||
if(IS_INTERLACED(mb_type)){
|
if(IS_INTERLACED(mb_type)){
|
||||||
scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
|
scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
|
||||||
scan= s->qscale ? h->field_scan : h->field_scan_q0;
|
scan= s->qscale ? h->field_scan : h->field_scan_q0;
|
||||||
dc_scan= luma_dc_field_scan;
|
|
||||||
}else{
|
}else{
|
||||||
scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
|
scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
|
||||||
scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
|
scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
|
||||||
dc_scan= luma_dc_zigzag_scan;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
dquant= get_se_golomb(&s->gb);
|
dquant= get_se_golomb(&s->gb);
|
||||||
@ -939,7 +937,9 @@ decode_intra_mb:
|
|||||||
h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
|
h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
|
||||||
h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
|
h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
|
||||||
if(IS_INTRA16x16(mb_type)){
|
if(IS_INTRA16x16(mb_type)){
|
||||||
if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
|
AV_ZERO128(h->mb_luma_dc+0);
|
||||||
|
AV_ZERO128(h->mb_luma_dc+8);
|
||||||
|
if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc, LUMA_DC_BLOCK_INDEX, scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
|
||||||
return -1; //FIXME continue if partitioned and other return -1 too
|
return -1; //FIXME continue if partitioned and other return -1 too
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -282,6 +282,7 @@ void ff_h264dsp_init(H264DSPContext *c)
|
|||||||
c->h264_idct8_add4 = ff_h264_idct8_add4_c;
|
c->h264_idct8_add4 = ff_h264_idct8_add4_c;
|
||||||
c->h264_idct_add8 = ff_h264_idct_add8_c;
|
c->h264_idct_add8 = ff_h264_idct_add8_c;
|
||||||
c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
|
c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
|
||||||
|
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_c;
|
||||||
|
|
||||||
c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
|
c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
|
||||||
c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
|
c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
|
||||||
|
@ -65,11 +65,13 @@ typedef struct H264DSPContext{
|
|||||||
void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
|
void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
|
||||||
void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
|
void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
|
||||||
void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
|
void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
|
||||||
|
|
||||||
void (*h264_dct)(DCTELEM block[4][4]);
|
void (*h264_dct)(DCTELEM block[4][4]);
|
||||||
void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
|
void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
|
||||||
void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
|
void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
|
||||||
void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
|
void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
|
||||||
void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
|
void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
|
||||||
|
void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul);
|
||||||
}H264DSPContext;
|
}H264DSPContext;
|
||||||
|
|
||||||
void ff_h264dsp_init(H264DSPContext *c);
|
void ff_h264dsp_init(H264DSPContext *c);
|
||||||
|
@ -216,3 +216,38 @@ void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block
|
|||||||
ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
|
ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* IDCT transforms the 16 dc values and dequantizes them.
|
||||||
|
* @param qmul dequantization multiplier; each transformed DC value is scaled as (value*qmul + 128) >> 8
|
||||||
|
*/
|
||||||
|
void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul){
|
||||||
|
#define stride 16
|
||||||
|
int i;
|
||||||
|
int temp[16];
|
||||||
|
static const uint8_t x_offset[4]={0, 2*stride, 8*stride, 10*stride};
|
||||||
|
|
||||||
|
for(i=0; i<4; i++){
|
||||||
|
const int z0= input[4*i+0] + input[4*i+1];
|
||||||
|
const int z1= input[4*i+0] - input[4*i+1];
|
||||||
|
const int z2= input[4*i+2] - input[4*i+3];
|
||||||
|
const int z3= input[4*i+2] + input[4*i+3];
|
||||||
|
|
||||||
|
temp[4*i+0]= z0+z3;
|
||||||
|
temp[4*i+1]= z0-z3;
|
||||||
|
temp[4*i+2]= z1-z2;
|
||||||
|
temp[4*i+3]= z1+z2;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(i=0; i<4; i++){
|
||||||
|
const int offset= x_offset[i];
|
||||||
|
const int z0= temp[4*0+i] + temp[4*2+i];
|
||||||
|
const int z1= temp[4*0+i] - temp[4*2+i];
|
||||||
|
const int z2= temp[4*1+i] - temp[4*3+i];
|
||||||
|
const int z3= temp[4*1+i] + temp[4*3+i];
|
||||||
|
|
||||||
|
output[stride* 0+offset]= ((((z0 + z3)*qmul + 128 ) >> 8));
|
||||||
|
output[stride* 1+offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
|
||||||
|
output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
|
||||||
|
output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -126,21 +126,19 @@ static const uint32_t svq3_dequant_coeff[32] = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp)
|
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp)
|
||||||
{
|
{
|
||||||
const int qmul = svq3_dequant_coeff[qp];
|
const int qmul = svq3_dequant_coeff[qp];
|
||||||
#define stride 16
|
#define stride 16
|
||||||
int i;
|
int i;
|
||||||
int temp[16];
|
int temp[16];
|
||||||
static const int x_offset[4] = {0, 1*stride, 4* stride, 5*stride};
|
static const int x_offset[4] = {0, 1*stride, 4* stride, 5*stride};
|
||||||
static const int y_offset[4] = {0, 2*stride, 8* stride, 10*stride};
|
|
||||||
|
|
||||||
for (i = 0; i < 4; i++){
|
for (i = 0; i < 4; i++){
|
||||||
const int offset = y_offset[i];
|
const int z0= 13*(input[4*i+0] + input[4*i+1]);
|
||||||
const int z0 = 13*(block[offset+stride*0] + block[offset+stride*4]);
|
const int z1= 13*(input[4*i+0] - input[4*i+1]);
|
||||||
const int z1 = 13*(block[offset+stride*0] - block[offset+stride*4]);
|
const int z2= 7* input[4*i+2] - 17*input[4*i+3];
|
||||||
const int z2 = 7* block[offset+stride*1] - 17*block[offset+stride*5];
|
const int z3= 17* input[4*i+2] + 7*input[4*i+3];
|
||||||
const int z3 = 17* block[offset+stride*1] + 7*block[offset+stride*5];
|
|
||||||
|
|
||||||
temp[4*i+0] = z0+z3;
|
temp[4*i+0] = z0+z3;
|
||||||
temp[4*i+1] = z1+z2;
|
temp[4*i+1] = z1+z2;
|
||||||
@ -155,10 +153,10 @@ void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp)
|
|||||||
const int z2 = 7* temp[4*1+i] - 17*temp[4*3+i];
|
const int z2 = 7* temp[4*1+i] - 17*temp[4*3+i];
|
||||||
const int z3 = 17* temp[4*1+i] + 7*temp[4*3+i];
|
const int z3 = 17* temp[4*1+i] + 7*temp[4*3+i];
|
||||||
|
|
||||||
block[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20;
|
output[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20;
|
||||||
block[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20;
|
output[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20;
|
||||||
block[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20;
|
output[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20;
|
||||||
block[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20;
|
output[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#undef stride
|
#undef stride
|
||||||
|
@ -41,6 +41,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
|
|||||||
DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
|
DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
|
||||||
{0x8000000080000000ULL, 0x8000000080000000ULL};
|
{0x8000000080000000ULL, 0x8000000080000000ULL};
|
||||||
|
|
||||||
|
DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL;
|
||||||
DECLARE_ALIGNED(8, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
|
DECLARE_ALIGNED(8, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
|
||||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
|
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
|
||||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
|
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
|
||||||
|
@ -47,6 +47,7 @@ scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
|
|||||||
%endif
|
%endif
|
||||||
|
|
||||||
cextern pw_32
|
cextern pw_32
|
||||||
|
cextern pw_1
|
||||||
|
|
||||||
SECTION .text
|
SECTION .text
|
||||||
|
|
||||||
@ -854,3 +855,156 @@ cglobal h264_idct_add8_sse2, 5, 7, 8
|
|||||||
add8_sse2_cycle 2, 0x21
|
add8_sse2_cycle 2, 0x21
|
||||||
add8_sse2_cycle 3, 0x29
|
add8_sse2_cycle 3, 0x29
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
|
||||||
|
|
||||||
|
%macro WALSH4_1D 5
|
||||||
|
SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
|
||||||
|
SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
|
||||||
|
SWAP %1, %4, %3
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro DEQUANT_MMX 3
|
||||||
|
mova m7, [pw_1]
|
||||||
|
mova m4, %1
|
||||||
|
punpcklwd %1, m7
|
||||||
|
punpckhwd m4, m7
|
||||||
|
mova m5, %2
|
||||||
|
punpcklwd %2, m7
|
||||||
|
punpckhwd m5, m7
|
||||||
|
movd m7, t3d
|
||||||
|
punpckldq m7, m7
|
||||||
|
pmaddwd %1, m7
|
||||||
|
pmaddwd %2, m7
|
||||||
|
pmaddwd m4, m7
|
||||||
|
pmaddwd m5, m7
|
||||||
|
psrad %1, %3
|
||||||
|
psrad %2, %3
|
||||||
|
psrad m4, %3
|
||||||
|
psrad m5, %3
|
||||||
|
packssdw %1, m4
|
||||||
|
packssdw %2, m5
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro STORE_WORDS_MMX 5
|
||||||
|
movd t0d, %1
|
||||||
|
psrlq %1, 32
|
||||||
|
movd t1d, %1
|
||||||
|
mov [t2+%2*32], t0w
|
||||||
|
mov [t2+%4*32], t1w
|
||||||
|
shr t0d, 16
|
||||||
|
shr t1d, 16
|
||||||
|
mov [t2+%3*32], t0w
|
||||||
|
mov [t2+%5*32], t1w
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro DEQUANT_STORE_MMX 1
|
||||||
|
DEQUANT_MMX m0, m1, %1
|
||||||
|
STORE_WORDS_MMX m0, 0, 1, 4, 5
|
||||||
|
STORE_WORDS_MMX m1, 2, 3, 6, 7
|
||||||
|
|
||||||
|
DEQUANT_MMX m2, m3, %1
|
||||||
|
STORE_WORDS_MMX m2, 8, 9, 12, 13
|
||||||
|
STORE_WORDS_MMX m3, 10, 11, 14, 15
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro STORE_WORDS_SSE 9
|
||||||
|
movd t0d, %1
|
||||||
|
psrldq %1, 4
|
||||||
|
movd t1d, %1
|
||||||
|
psrldq %1, 4
|
||||||
|
mov [t2+%2*32], t0w
|
||||||
|
mov [t2+%4*32], t1w
|
||||||
|
shr t0d, 16
|
||||||
|
shr t1d, 16
|
||||||
|
mov [t2+%3*32], t0w
|
||||||
|
mov [t2+%5*32], t1w
|
||||||
|
movd t0d, %1
|
||||||
|
psrldq %1, 4
|
||||||
|
movd t1d, %1
|
||||||
|
mov [t2+%6*32], t0w
|
||||||
|
mov [t2+%8*32], t1w
|
||||||
|
shr t0d, 16
|
||||||
|
shr t1d, 16
|
||||||
|
mov [t2+%7*32], t0w
|
||||||
|
mov [t2+%9*32], t1w
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro DEQUANT_STORE_SSE2 1
|
||||||
|
movd xmm4, t3d
|
||||||
|
movq xmm5, [pw_1]
|
||||||
|
pshufd xmm4, xmm4, 0
|
||||||
|
movq2dq xmm0, m0
|
||||||
|
movq2dq xmm1, m1
|
||||||
|
movq2dq xmm2, m2
|
||||||
|
movq2dq xmm3, m3
|
||||||
|
punpcklwd xmm0, xmm5
|
||||||
|
punpcklwd xmm1, xmm5
|
||||||
|
punpcklwd xmm2, xmm5
|
||||||
|
punpcklwd xmm3, xmm5
|
||||||
|
pmaddwd xmm0, xmm4
|
||||||
|
pmaddwd xmm1, xmm4
|
||||||
|
pmaddwd xmm2, xmm4
|
||||||
|
pmaddwd xmm3, xmm4
|
||||||
|
psrad xmm0, %1
|
||||||
|
psrad xmm1, %1
|
||||||
|
psrad xmm2, %1
|
||||||
|
psrad xmm3, %1
|
||||||
|
packssdw xmm0, xmm1
|
||||||
|
packssdw xmm2, xmm3
|
||||||
|
STORE_WORDS_SSE xmm0, 0, 1, 4, 5, 2, 3, 6, 7
|
||||||
|
STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro IDCT_DC_DEQUANT 2
|
||||||
|
cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
|
||||||
|
movq m3, [r1+24]
|
||||||
|
movq m2, [r1+16]
|
||||||
|
movq m1, [r1+ 8]
|
||||||
|
movq m0, [r1+ 0]
|
||||||
|
WALSH4_1D 0,1,2,3,4
|
||||||
|
TRANSPOSE4x4W 0,1,2,3,4
|
||||||
|
WALSH4_1D 0,1,2,3,4
|
||||||
|
|
||||||
|
; shift, tmp, output, qmul
|
||||||
|
%ifdef WIN64
|
||||||
|
DECLARE_REG_TMP 0,3,1,2
|
||||||
|
; we can't avoid this, because r0 is the shift register (ecx) on win64
|
||||||
|
xchg r0, t2
|
||||||
|
%elifdef ARCH_X86_64
|
||||||
|
DECLARE_REG_TMP 3,1,0,2
|
||||||
|
%else
|
||||||
|
DECLARE_REG_TMP 1,3,0,2
|
||||||
|
%endif
|
||||||
|
|
||||||
|
cmp t3d, 32767
|
||||||
|
jg .big_qmul
|
||||||
|
add t3d, 128 << 16
|
||||||
|
%ifidn %1,mmx
|
||||||
|
DEQUANT_STORE_MMX 8
|
||||||
|
%else
|
||||||
|
DEQUANT_STORE_SSE2 8
|
||||||
|
%endif
|
||||||
|
RET
|
||||||
|
.big_qmul:
|
||||||
|
bsr t0d, t3d
|
||||||
|
add t3d, 128 << 16
|
||||||
|
mov t1d, 7
|
||||||
|
cmp t0d, t1d
|
||||||
|
cmovg t0d, t1d
|
||||||
|
inc t1d
|
||||||
|
shr t3d, t0b
|
||||||
|
sub t1d, t0d
|
||||||
|
%ifidn %1,mmx
|
||||||
|
movd m6, t1d
|
||||||
|
DEQUANT_STORE_MMX m6
|
||||||
|
%else
|
||||||
|
movd xmm6, t1d
|
||||||
|
DEQUANT_STORE_SSE2 xmm6
|
||||||
|
%endif
|
||||||
|
RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
INIT_MMX
|
||||||
|
IDCT_DC_DEQUANT mmx, 0
|
||||||
|
IDCT_DC_DEQUANT sse2, 7
|
||||||
|
@ -59,6 +59,8 @@ void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM
|
|||||||
int stride, const uint8_t nnzc[6*8]);
|
int stride, const uint8_t nnzc[6*8]);
|
||||||
void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTELEM *block,
|
void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTELEM *block,
|
||||||
int stride, const uint8_t nnzc[6*8]);
|
int stride, const uint8_t nnzc[6*8]);
|
||||||
|
void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
|
||||||
|
void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);
|
||||||
|
|
||||||
/***********************************/
|
/***********************************/
|
||||||
/* deblocking */
|
/* deblocking */
|
||||||
@ -301,6 +303,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
|
|||||||
c->h264_idct8_add4 = ff_h264_idct8_add4_mmx;
|
c->h264_idct8_add4 = ff_h264_idct8_add4_mmx;
|
||||||
c->h264_idct_add8 = ff_h264_idct_add8_mmx;
|
c->h264_idct_add8 = ff_h264_idct_add8_mmx;
|
||||||
c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
|
c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
|
||||||
|
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx;
|
||||||
|
|
||||||
if (mm_flags & AV_CPU_FLAG_MMX2) {
|
if (mm_flags & AV_CPU_FLAG_MMX2) {
|
||||||
c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
|
c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
|
||||||
@ -341,6 +344,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
|
|||||||
if (mm_flags&AV_CPU_FLAG_SSE2) {
|
if (mm_flags&AV_CPU_FLAG_SSE2) {
|
||||||
c->h264_idct8_add = ff_h264_idct8_add_sse2;
|
c->h264_idct8_add = ff_h264_idct8_add_sse2;
|
||||||
c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
|
c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
|
||||||
|
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
|
||||||
|
|
||||||
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
|
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
|
||||||
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
|
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user