mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
h264: Don't store intra pcm samples in h->mb
Instead, keep them in the bitstream buffer until we read them verbatim. This saves a memcpy() and a subsequent clearing of the target buffer. decode_cabac+decode_mb for a sample file (CAPM3_Sony_D.jsv) goes from 6121.4 to 6095.5 cycles, i.e. about 26 cycles faster. Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
9918f57dcf
commit
7ebfb466ae
@ -416,6 +416,7 @@ typedef struct H264Context {
|
|||||||
GetBitContext *intra_gb_ptr;
|
GetBitContext *intra_gb_ptr;
|
||||||
GetBitContext *inter_gb_ptr;
|
GetBitContext *inter_gb_ptr;
|
||||||
|
|
||||||
|
const uint8_t *intra_pcm_ptr;
|
||||||
DECLARE_ALIGNED(16, int16_t, mb)[16 * 48 * 2]; ///< as a dct coefficient is int32_t in high depth, we need to reserve twice the space.
|
DECLARE_ALIGNED(16, int16_t, mb)[16 * 48 * 2]; ///< as a dct coefficient is int32_t in high depth, we need to reserve twice the space.
|
||||||
DECLARE_ALIGNED(16, int16_t, mb_luma_dc)[3][16 * 2];
|
DECLARE_ALIGNED(16, int16_t, mb_luma_dc)[3][16 * 2];
|
||||||
int16_t mb_padding[256 * 2]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
|
int16_t mb_padding[256 * 2]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
|
||||||
|
@ -2004,7 +2004,8 @@ decode_intra_mb:
|
|||||||
// The pixels are stored in the same order as levels in h->mb array.
|
// The pixels are stored in the same order as levels in h->mb array.
|
||||||
if ((int) (h->cabac.bytestream_end - ptr) < mb_size)
|
if ((int) (h->cabac.bytestream_end - ptr) < mb_size)
|
||||||
return -1;
|
return -1;
|
||||||
memcpy(h->mb, ptr, mb_size); ptr+=mb_size;
|
h->intra_pcm_ptr = ptr;
|
||||||
|
ptr += mb_size;
|
||||||
|
|
||||||
ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
|
ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
|
||||||
|
|
||||||
|
@ -761,17 +761,12 @@ decode_intra_mb:
|
|||||||
h->slice_table[ mb_xy ]= h->slice_num;
|
h->slice_table[ mb_xy ]= h->slice_num;
|
||||||
|
|
||||||
if(IS_INTRA_PCM(mb_type)){
|
if(IS_INTRA_PCM(mb_type)){
|
||||||
unsigned int x;
|
|
||||||
const int mb_size = ff_h264_mb_sizes[h->sps.chroma_format_idc] *
|
const int mb_size = ff_h264_mb_sizes[h->sps.chroma_format_idc] *
|
||||||
h->sps.bit_depth_luma >> 3;
|
h->sps.bit_depth_luma;
|
||||||
|
|
||||||
// We assume these blocks are very rare so we do not optimize it.
|
// We assume these blocks are very rare so we do not optimize it.
|
||||||
align_get_bits(&h->gb);
|
h->intra_pcm_ptr = align_get_bits(&h->gb);
|
||||||
|
skip_bits_long(&h->gb, mb_size);
|
||||||
// The pixels are stored in the same order as levels in h->mb array.
|
|
||||||
for(x=0; x < mb_size; x++){
|
|
||||||
((uint8_t*)h->mb)[x]= get_bits(&h->gb, 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
// In deblocking, the quantizer is 0
|
// In deblocking, the quantizer is 0
|
||||||
h->cur_pic.f.qscale_table[mb_xy] = 0;
|
h->cur_pic.f.qscale_table[mb_xy] = 0;
|
||||||
|
@ -102,7 +102,7 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
|
|||||||
const int bit_depth = h->sps.bit_depth_luma;
|
const int bit_depth = h->sps.bit_depth_luma;
|
||||||
int j;
|
int j;
|
||||||
GetBitContext gb;
|
GetBitContext gb;
|
||||||
init_get_bits(&gb, (uint8_t *)h->mb,
|
init_get_bits(&gb, (uint8_t *)h->intra_pcm_ptr,
|
||||||
ff_h264_mb_sizes[h->sps.chroma_format_idc] * bit_depth);
|
ff_h264_mb_sizes[h->sps.chroma_format_idc] * bit_depth);
|
||||||
|
|
||||||
for (i = 0; i < 16; i++) {
|
for (i = 0; i < 16; i++) {
|
||||||
@ -137,7 +137,7 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (i = 0; i < 16; i++)
|
for (i = 0; i < 16; i++)
|
||||||
memcpy(dest_y + i * linesize, (uint8_t *)h->mb + i * 16, 16);
|
memcpy(dest_y + i * linesize, (uint8_t *)h->intra_pcm_ptr + i * 16, 16);
|
||||||
if (SIMPLE || !CONFIG_GRAY || !(h->flags & CODEC_FLAG_GRAY)) {
|
if (SIMPLE || !CONFIG_GRAY || !(h->flags & CODEC_FLAG_GRAY)) {
|
||||||
if (!h->sps.chroma_format_idc) {
|
if (!h->sps.chroma_format_idc) {
|
||||||
for (i = 0; i < block_h; i++) {
|
for (i = 0; i < block_h; i++) {
|
||||||
@ -145,8 +145,8 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
|
|||||||
memset(dest_cr + i * uvlinesize, 128, 8);
|
memset(dest_cr + i * uvlinesize, 128, 8);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
uint8_t *src_cb = (uint8_t *)h->mb + 256;
|
uint8_t *src_cb = (uint8_t *)h->intra_pcm_ptr + 256;
|
||||||
uint8_t *src_cr = (uint8_t *)h->mb + 256 + block_h * 8;
|
uint8_t *src_cr = (uint8_t *)h->intra_pcm_ptr + 256 + block_h * 8;
|
||||||
for (i = 0; i < block_h; i++) {
|
for (i = 0; i < block_h; i++) {
|
||||||
memcpy(dest_cb + i * uvlinesize, src_cb + i * 8, 8);
|
memcpy(dest_cb + i * uvlinesize, src_cb + i * 8, 8);
|
||||||
memcpy(dest_cr + i * uvlinesize, src_cr + i * 8, 8);
|
memcpy(dest_cr + i * uvlinesize, src_cr + i * 8, 8);
|
||||||
@ -261,10 +261,10 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
if (h->cbp || IS_INTRA(mb_type)) {
|
||||||
if (h->cbp || IS_INTRA(mb_type)) {
|
h->dsp.clear_blocks(h->mb);
|
||||||
h->dsp.clear_blocks(h->mb);
|
h->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
|
||||||
h->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -328,7 +328,7 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h)
|
|||||||
if (PIXEL_SHIFT) {
|
if (PIXEL_SHIFT) {
|
||||||
const int bit_depth = h->sps.bit_depth_luma;
|
const int bit_depth = h->sps.bit_depth_luma;
|
||||||
GetBitContext gb;
|
GetBitContext gb;
|
||||||
init_get_bits(&gb, (uint8_t *)h->mb, 768 * bit_depth);
|
init_get_bits(&gb, (uint8_t *)h->intra_pcm_ptr, 768 * bit_depth);
|
||||||
|
|
||||||
for (p = 0; p < plane_count; p++)
|
for (p = 0; p < plane_count; p++)
|
||||||
for (i = 0; i < 16; i++) {
|
for (i = 0; i < 16; i++) {
|
||||||
@ -340,7 +340,7 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h)
|
|||||||
for (p = 0; p < plane_count; p++)
|
for (p = 0; p < plane_count; p++)
|
||||||
for (i = 0; i < 16; i++)
|
for (i = 0; i < 16; i++)
|
||||||
memcpy(dest[p] + i * linesize,
|
memcpy(dest[p] + i * linesize,
|
||||||
(uint8_t *)h->mb + p * 256 + i * 16, 16);
|
(uint8_t *)h->intra_pcm_ptr + p * 256 + i * 16, 16);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (IS_INTRA(mb_type)) {
|
if (IS_INTRA(mb_type)) {
|
||||||
@ -368,10 +368,11 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h)
|
|||||||
hl_decode_mb_idct_luma(h, mb_type, 1, SIMPLE, transform_bypass,
|
hl_decode_mb_idct_luma(h, mb_type, 1, SIMPLE, transform_bypass,
|
||||||
PIXEL_SHIFT, block_offset, linesize,
|
PIXEL_SHIFT, block_offset, linesize,
|
||||||
dest[p], p);
|
dest[p], p);
|
||||||
}
|
|
||||||
if (h->cbp || IS_INTRA(mb_type)) {
|
if (h->cbp || IS_INTRA(mb_type)) {
|
||||||
h->dsp.clear_blocks(h->mb);
|
h->dsp.clear_blocks(h->mb);
|
||||||
h->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
|
h->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user