mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-13 21:28:01 +02:00
diracdec: rewrite HQ slice decoding
Now coefficients are written to a buffer and are then dequantized by the new SIMD dequantization functions. For the lower bands without enough coefficients to fill a register (and hence they overwrite) the C version of the dequantization function is used. The buffer is per-thread and will be realloc'd if anything changes. This prevents regressions and having to limit slice size. Signed-off-by: Rostislav Pehlivanov <rpehlivanov@obe.tv>
This commit is contained in:
parent
09d89d9406
commit
c43485f707
@ -161,6 +161,10 @@ typedef struct DiracContext {
|
||||
unsigned num_x; /* number of horizontal slices */
|
||||
unsigned num_y; /* number of vertical slices */
|
||||
|
||||
uint8_t *thread_buf; /* Per-thread buffer for coefficient storage */
|
||||
int threads_num_buf; /* Current # of buffers allocated */
|
||||
int thread_buf_size; /* Each thread has a buffer this size */
|
||||
|
||||
struct {
|
||||
unsigned width;
|
||||
unsigned height;
|
||||
@ -370,6 +374,10 @@ static av_cold int dirac_decode_init(AVCodecContext *avctx)
|
||||
s->avctx = avctx;
|
||||
s->frame_number = -1;
|
||||
|
||||
s->thread_buf = NULL;
|
||||
s->threads_num_buf = -1;
|
||||
s->thread_buf_size = -1;
|
||||
|
||||
ff_diracdsp_init(&s->diracdsp);
|
||||
ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
|
||||
ff_videodsp_init(&s->vdsp, 8);
|
||||
@ -403,6 +411,8 @@ static av_cold int dirac_decode_end(AVCodecContext *avctx)
|
||||
for (i = 0; i < MAX_FRAMES; i++)
|
||||
av_frame_free(&s->all_frames[i].avframe);
|
||||
|
||||
av_freep(&s->thread_buf);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -760,46 +770,108 @@ static int decode_lowdelay_slice(AVCodecContext *avctx, void *arg)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Position and coefficient counts of one slice within a subband at a single
 * wavelet level; filled in per level by subband_coeffs(). */
typedef struct SliceCoeffs {
|
||||
int left; /* horizontal offset of the slice inside the subband: width * x / num_x */
|
||||
int top; /* vertical offset of the slice inside the subband: height * y / num_y */
|
||||
int tot_h; /* number of coefficients the slice spans horizontally */
|
||||
int tot_v; /* number of coefficients the slice spans vertically */
|
||||
int tot; /* tot_h * tot_v, coefficients per subband orientation */
|
||||
} SliceCoeffs;
|
||||
|
||||
/**
 * Compute, for every wavelet level below s->wavelet_depth, the area of
 * plane p that slice (x, y) covers and store it in c[level].
 *
 * @param s decoder context (reads wavelet_depth, num_x, num_y and the plane's bands)
 * @param x horizontal slice index
 * @param y vertical slice index
 * @param p plane index
 * @param c output array, one entry written per wavelet level
 * @return total number of coefficients of the slice in this plane, counting
 *         4 subbands at level 0 and 3 subbands at every higher level
 */
static int subband_coeffs(DiracContext *s, int x, int y, int p,
|
||||
SliceCoeffs c[MAX_DWT_LEVELS])
|
||||
{
|
||||
int level, coef = 0;
|
||||
for (level = 0; level < s->wavelet_depth; level++) {
|
||||
SliceCoeffs *o = &c[level];
|
||||
SubBand *b = &s->plane[p].band[level][3]; /* only width/height are read here, so the orientation doesn't matter */
|
||||
o->top = b->height * y / s->num_y;
|
||||
o->left = b->width * x / s->num_x;
|
||||
o->tot_h = ((b->width * (x + 1)) / s->num_x) - o->left; /* start of the next slice minus start of this one */
|
||||
o->tot_v = ((b->height * (y + 1)) / s->num_y) - o->top;
|
||||
o->tot = o->tot_h*o->tot_v;
|
||||
coef += o->tot * (4 - !!level); /* 4 subbands counted at level 0, 3 at every other level */
|
||||
}
|
||||
return coef;
|
||||
}
|
||||
|
||||
/**
|
||||
* VC-2 Specification ->
|
||||
* 13.5.3 hq_slice(sx,sy)
|
||||
*/
|
||||
static int decode_hq_slice(AVCodecContext *avctx, void *arg)
|
||||
static int decode_hq_slice(DiracContext *s, DiracSlice *slice, uint8_t *tmp_buf)
|
||||
{
|
||||
int i, quant, level, orientation, quant_idx;
|
||||
uint8_t quants[MAX_DWT_LEVELS][4];
|
||||
DiracContext *s = avctx->priv_data;
|
||||
DiracSlice *slice = arg;
|
||||
int i, level, orientation, quant_idx;
|
||||
int qfactor[MAX_DWT_LEVELS][4], qoffset[MAX_DWT_LEVELS][4];
|
||||
GetBitContext *gb = &slice->gb;
|
||||
SliceCoeffs coeffs_num[MAX_DWT_LEVELS];
|
||||
|
||||
skip_bits_long(gb, 8*s->highquality.prefix_bytes);
|
||||
quant_idx = get_bits(gb, 8);
|
||||
|
||||
if (quant_idx > DIRAC_MAX_QUANT_INDEX) {
|
||||
av_log(s->avctx, AV_LOG_ERROR, "Invalid quantization index - %i\n", quant_idx);
|
||||
return AVERROR_INVALIDDATA;
|
||||
}
|
||||
|
||||
/* Slice quantization (slice_quantizers() in the specs) */
|
||||
for (level = 0; level < s->wavelet_depth; level++) {
|
||||
for (orientation = !!level; orientation < 4; orientation++) {
|
||||
quant = FFMAX(quant_idx - s->lowdelay.quant[level][orientation], 0);
|
||||
quants[level][orientation] = quant;
|
||||
const int quant = FFMAX(quant_idx - s->lowdelay.quant[level][orientation], 0);
|
||||
qfactor[level][orientation] = ff_dirac_qscale_tab[quant];
|
||||
qoffset[level][orientation] = ff_dirac_qoffset_intra_tab[quant] + 2;
|
||||
}
|
||||
}
|
||||
|
||||
/* Luma + 2 Chroma planes */
|
||||
for (i = 0; i < 3; i++) {
|
||||
int c, coef_num, coef_par, off = 0;
|
||||
int64_t length = s->highquality.size_scaler*get_bits(gb, 8);
|
||||
int64_t bits_left = 8 * length;
|
||||
int64_t bits_end = get_bits_count(gb) + bits_left;
|
||||
int64_t start = get_bits_count(gb);
|
||||
int64_t bits_end = start + 8*length;
|
||||
|
||||
if (bits_end >= INT_MAX) {
|
||||
av_log(s->avctx, AV_LOG_ERROR, "end too far away\n");
|
||||
return AVERROR_INVALIDDATA;
|
||||
}
|
||||
|
||||
coef_num = subband_coeffs(s, slice->slice_x, slice->slice_y, i, coeffs_num);
|
||||
|
||||
if (s->pshift) {
|
||||
int32_t *dst = (int32_t *)tmp_buf;
|
||||
for (c = 0; c < coef_num; c++)
|
||||
dst[c] = dirac_get_se_golomb(gb);
|
||||
coef_par = c;
|
||||
} else {
|
||||
int16_t *dst = (int16_t *)tmp_buf;
|
||||
for (c = 0; c < coef_num; c++)
|
||||
dst[c] = dirac_get_se_golomb(gb);
|
||||
coef_par = c;
|
||||
}
|
||||
|
||||
if (coef_num > coef_par) {
|
||||
const int start_b = coef_par * (4 >> s->pshift);
|
||||
const int end_b = coef_num * (4 >> s->pshift);
|
||||
memset(&tmp_buf[start_b], 0, end_b - start_b);
|
||||
}
|
||||
|
||||
for (level = 0; level < s->wavelet_depth; level++) {
|
||||
const SliceCoeffs *c = &coeffs_num[level];
|
||||
for (orientation = !!level; orientation < 4; orientation++) {
|
||||
decode_subband(s, gb, quants[level][orientation], slice->slice_x, slice->slice_y, bits_end,
|
||||
&s->plane[i].band[level][orientation], NULL);
|
||||
const SubBand *b1 = &s->plane[i].band[level][orientation];
|
||||
uint8_t *buf = b1->ibuf + c->top * b1->stride + (c->left << (s->pshift + 1));
|
||||
|
||||
/* Change to c->tot_h <= 4 for AVX2 dequantization */
|
||||
const int qfunc = s->pshift + 2*(c->tot_h <= 2);
|
||||
s->diracdsp.dequant_subband[qfunc](&tmp_buf[off], buf, b1->stride,
|
||||
qfactor[level][orientation],
|
||||
qoffset[level][orientation],
|
||||
c->tot_v, c->tot_h);
|
||||
|
||||
off += c->tot << (s->pshift + 1);
|
||||
}
|
||||
}
|
||||
|
||||
skip_bits_long(gb, bits_end - get_bits_count(gb));
|
||||
}
|
||||
|
||||
@ -811,8 +883,9 @@ static int decode_hq_slice_row(AVCodecContext *avctx, void *arg, int jobnr, int
|
||||
int i;
|
||||
DiracContext *s = avctx->priv_data;
|
||||
DiracSlice *slices = ((DiracSlice *)arg) + s->num_x*jobnr;
|
||||
uint8_t *thread_buf = &s->thread_buf[s->thread_buf_size*threadnr];
|
||||
for (i = 0; i < s->num_x; i++)
|
||||
decode_hq_slice(avctx, &slices[i]);
|
||||
decode_hq_slice(s, &slices[i], thread_buf);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -824,15 +897,32 @@ static int decode_lowdelay(DiracContext *s)
|
||||
{
|
||||
AVCodecContext *avctx = s->avctx;
|
||||
int slice_x, slice_y, bufsize;
|
||||
int64_t bytes = 0;
|
||||
int64_t coef_buf_size, bytes = 0;
|
||||
const uint8_t *buf;
|
||||
DiracSlice *slices;
|
||||
SliceCoeffs tmp[MAX_DWT_LEVELS];
|
||||
int slice_num = 0;
|
||||
|
||||
slices = av_mallocz_array(s->num_x, s->num_y * sizeof(DiracSlice));
|
||||
if (!slices)
|
||||
return AVERROR(ENOMEM);
|
||||
|
||||
/* 8 because that's how much the golomb reader could overread junk data
|
||||
* from another plane/slice at most, and 512 because SIMD */
|
||||
coef_buf_size = subband_coeffs(s, s->num_x - 1, s->num_y - 1, 0, tmp) + 8;
|
||||
coef_buf_size = (coef_buf_size << (1 + s->pshift)) + 512;
|
||||
|
||||
if (s->threads_num_buf != avctx->thread_count ||
|
||||
s->thread_buf_size != coef_buf_size) {
|
||||
s->threads_num_buf = avctx->thread_count;
|
||||
s->thread_buf_size = coef_buf_size;
|
||||
s->thread_buf = av_realloc_f(s->thread_buf, avctx->thread_count, s->thread_buf_size);
|
||||
if (!s->thread_buf) {
|
||||
av_log(s->avctx, AV_LOG_ERROR, "thread buffer allocation failure\n");
|
||||
return AVERROR(ENOMEM);
|
||||
}
|
||||
}
|
||||
|
||||
align_get_bits(&s->gb);
|
||||
/*[DIRAC_STD] 13.5.2 Slices. slice(sx,sy) */
|
||||
buf = s->gb.buffer + get_bits_count(&s->gb)/8;
|
||||
@ -848,7 +938,7 @@ static int decode_lowdelay(DiracContext *s)
|
||||
if (bytes <= bufsize/8)
|
||||
bytes += buf[bytes] * s->highquality.size_scaler + 1;
|
||||
}
|
||||
if (bytes >= INT_MAX) {
|
||||
if (bytes >= INT_MAX || bytes*8 > bufsize) {
|
||||
av_log(s->avctx, AV_LOG_ERROR, "too many bytes\n");
|
||||
av_free(slices);
|
||||
return AVERROR_INVALIDDATA;
|
||||
@ -867,6 +957,12 @@ static int decode_lowdelay(DiracContext *s)
|
||||
bufsize = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (s->num_x*s->num_y != slice_num) {
|
||||
av_log(s->avctx, AV_LOG_ERROR, "too few slices\n");
|
||||
return AVERROR_INVALIDDATA;
|
||||
}
|
||||
|
||||
avctx->execute2(avctx, decode_hq_slice_row, slices, NULL, s->num_y);
|
||||
} else {
|
||||
for (slice_y = 0; bufsize > 0 && slice_y < s->num_y; slice_y++) {
|
||||
|
Loading…
Reference in New Issue
Block a user