1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-01-13 21:28:01 +02:00

diracdec: rewrite HQ slice decoding

Now coefficients are written to a buffer and are then dequantized by the
new SIMD dequantization functions. For the lower bands without enough
coefficients to fill a register (and hence they overwrite) the C version
of the dequantization function is used.

The buffer is per-thread and will be realloc'd if anything changes.
This prevents regressions and having to limit slice size.

Signed-off-by: Rostislav Pehlivanov <rpehlivanov@obe.tv>
This commit is contained in:
Rostislav Pehlivanov 2016-06-23 18:07:00 +01:00 committed by Rostislav Pehlivanov
parent 09d89d9406
commit c43485f707

View File

@ -161,6 +161,10 @@ typedef struct DiracContext {
unsigned num_x; /* number of horizontal slices */
unsigned num_y; /* number of vertical slices */
uint8_t *thread_buf; /* Per-thread buffer for coefficient storage */
int threads_num_buf; /* Current # of buffers allocated */
int thread_buf_size; /* Each thread has a buffer this size */
struct {
unsigned width;
unsigned height;
@ -370,6 +374,10 @@ static av_cold int dirac_decode_init(AVCodecContext *avctx)
s->avctx = avctx;
s->frame_number = -1;
s->thread_buf = NULL;
s->threads_num_buf = -1;
s->thread_buf_size = -1;
ff_diracdsp_init(&s->diracdsp);
ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
ff_videodsp_init(&s->vdsp, 8);
@ -403,6 +411,8 @@ static av_cold int dirac_decode_end(AVCodecContext *avctx)
for (i = 0; i < MAX_FRAMES; i++)
av_frame_free(&s->all_frames[i].avframe);
av_freep(&s->thread_buf);
return 0;
}
@ -760,46 +770,108 @@ static int decode_lowdelay_slice(AVCodecContext *avctx, void *arg)
return 0;
}
typedef struct SliceCoeffs {
int left;
int top;
int tot_h;
int tot_v;
int tot;
} SliceCoeffs;
static int subband_coeffs(DiracContext *s, int x, int y, int p,
SliceCoeffs c[MAX_DWT_LEVELS])
{
int level, coef = 0;
for (level = 0; level < s->wavelet_depth; level++) {
SliceCoeffs *o = &c[level];
SubBand *b = &s->plane[p].band[level][3]; /* orientation doens't matter */
o->top = b->height * y / s->num_y;
o->left = b->width * x / s->num_x;
o->tot_h = ((b->width * (x + 1)) / s->num_x) - o->left;
o->tot_v = ((b->height * (y + 1)) / s->num_y) - o->top;
o->tot = o->tot_h*o->tot_v;
coef += o->tot * (4 - !!level);
}
return coef;
}
/**
* VC-2 Specification ->
* 13.5.3 hq_slice(sx,sy)
*/
static int decode_hq_slice(AVCodecContext *avctx, void *arg)
static int decode_hq_slice(DiracContext *s, DiracSlice *slice, uint8_t *tmp_buf)
{
int i, quant, level, orientation, quant_idx;
uint8_t quants[MAX_DWT_LEVELS][4];
DiracContext *s = avctx->priv_data;
DiracSlice *slice = arg;
int i, level, orientation, quant_idx;
int qfactor[MAX_DWT_LEVELS][4], qoffset[MAX_DWT_LEVELS][4];
GetBitContext *gb = &slice->gb;
SliceCoeffs coeffs_num[MAX_DWT_LEVELS];
skip_bits_long(gb, 8*s->highquality.prefix_bytes);
quant_idx = get_bits(gb, 8);
if (quant_idx > DIRAC_MAX_QUANT_INDEX) {
av_log(s->avctx, AV_LOG_ERROR, "Invalid quantization index - %i\n", quant_idx);
return AVERROR_INVALIDDATA;
}
/* Slice quantization (slice_quantizers() in the specs) */
for (level = 0; level < s->wavelet_depth; level++) {
for (orientation = !!level; orientation < 4; orientation++) {
quant = FFMAX(quant_idx - s->lowdelay.quant[level][orientation], 0);
quants[level][orientation] = quant;
const int quant = FFMAX(quant_idx - s->lowdelay.quant[level][orientation], 0);
qfactor[level][orientation] = ff_dirac_qscale_tab[quant];
qoffset[level][orientation] = ff_dirac_qoffset_intra_tab[quant] + 2;
}
}
/* Luma + 2 Chroma planes */
for (i = 0; i < 3; i++) {
int c, coef_num, coef_par, off = 0;
int64_t length = s->highquality.size_scaler*get_bits(gb, 8);
int64_t bits_left = 8 * length;
int64_t bits_end = get_bits_count(gb) + bits_left;
int64_t start = get_bits_count(gb);
int64_t bits_end = start + 8*length;
if (bits_end >= INT_MAX) {
av_log(s->avctx, AV_LOG_ERROR, "end too far away\n");
return AVERROR_INVALIDDATA;
}
coef_num = subband_coeffs(s, slice->slice_x, slice->slice_y, i, coeffs_num);
if (s->pshift) {
int32_t *dst = (int32_t *)tmp_buf;
for (c = 0; c < coef_num; c++)
dst[c] = dirac_get_se_golomb(gb);
coef_par = c;
} else {
int16_t *dst = (int16_t *)tmp_buf;
for (c = 0; c < coef_num; c++)
dst[c] = dirac_get_se_golomb(gb);
coef_par = c;
}
if (coef_num > coef_par) {
const int start_b = coef_par * (4 >> s->pshift);
const int end_b = coef_num * (4 >> s->pshift);
memset(&tmp_buf[start_b], 0, end_b - start_b);
}
for (level = 0; level < s->wavelet_depth; level++) {
const SliceCoeffs *c = &coeffs_num[level];
for (orientation = !!level; orientation < 4; orientation++) {
decode_subband(s, gb, quants[level][orientation], slice->slice_x, slice->slice_y, bits_end,
&s->plane[i].band[level][orientation], NULL);
const SubBand *b1 = &s->plane[i].band[level][orientation];
uint8_t *buf = b1->ibuf + c->top * b1->stride + (c->left << (s->pshift + 1));
/* Change to c->tot_h <= 4 for AVX2 dequantization */
const int qfunc = s->pshift + 2*(c->tot_h <= 2);
s->diracdsp.dequant_subband[qfunc](&tmp_buf[off], buf, b1->stride,
qfactor[level][orientation],
qoffset[level][orientation],
c->tot_v, c->tot_h);
off += c->tot << (s->pshift + 1);
}
}
skip_bits_long(gb, bits_end - get_bits_count(gb));
}
@ -811,8 +883,9 @@ static int decode_hq_slice_row(AVCodecContext *avctx, void *arg, int jobnr, int
int i;
DiracContext *s = avctx->priv_data;
DiracSlice *slices = ((DiracSlice *)arg) + s->num_x*jobnr;
uint8_t *thread_buf = &s->thread_buf[s->thread_buf_size*threadnr];
for (i = 0; i < s->num_x; i++)
decode_hq_slice(avctx, &slices[i]);
decode_hq_slice(s, &slices[i], thread_buf);
return 0;
}
@ -824,15 +897,32 @@ static int decode_lowdelay(DiracContext *s)
{
AVCodecContext *avctx = s->avctx;
int slice_x, slice_y, bufsize;
int64_t bytes = 0;
int64_t coef_buf_size, bytes = 0;
const uint8_t *buf;
DiracSlice *slices;
SliceCoeffs tmp[MAX_DWT_LEVELS];
int slice_num = 0;
slices = av_mallocz_array(s->num_x, s->num_y * sizeof(DiracSlice));
if (!slices)
return AVERROR(ENOMEM);
/* 8 becacuse that's how much the golomb reader could overread junk data
* from another plane/slice at most, and 512 because SIMD */
coef_buf_size = subband_coeffs(s, s->num_x - 1, s->num_y - 1, 0, tmp) + 8;
coef_buf_size = (coef_buf_size << (1 + s->pshift)) + 512;
if (s->threads_num_buf != avctx->thread_count ||
s->thread_buf_size != coef_buf_size) {
s->threads_num_buf = avctx->thread_count;
s->thread_buf_size = coef_buf_size;
s->thread_buf = av_realloc_f(s->thread_buf, avctx->thread_count, s->thread_buf_size);
if (!s->thread_buf) {
av_log(s->avctx, AV_LOG_ERROR, "thread buffer allocation failure\n");
return AVERROR(ENOMEM);
}
}
align_get_bits(&s->gb);
/*[DIRAC_STD] 13.5.2 Slices. slice(sx,sy) */
buf = s->gb.buffer + get_bits_count(&s->gb)/8;
@ -848,7 +938,7 @@ static int decode_lowdelay(DiracContext *s)
if (bytes <= bufsize/8)
bytes += buf[bytes] * s->highquality.size_scaler + 1;
}
if (bytes >= INT_MAX) {
if (bytes >= INT_MAX || bytes*8 > bufsize) {
av_log(s->avctx, AV_LOG_ERROR, "too many bytes\n");
av_free(slices);
return AVERROR_INVALIDDATA;
@ -867,6 +957,12 @@ static int decode_lowdelay(DiracContext *s)
bufsize = 0;
}
}
if (s->num_x*s->num_y != slice_num) {
av_log(s->avctx, AV_LOG_ERROR, "too few slices\n");
return AVERROR_INVALIDDATA;
}
avctx->execute2(avctx, decode_hq_slice_row, slices, NULL, s->num_y);
} else {
for (slice_y = 0; bufsize > 0 && slice_y < s->num_y; slice_y++) {