mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
vp9: split decode_coeff_b loop inside txsz branch.
The advantage of this is that the is32x32 division branch in decode_coeffs_b is removed from the inner loop to outside the block coef decoding loop in decode_coeffs. Also, it allows us to merge the txsz branches from the block coef decoding loop, the context merge and the context split.
This commit is contained in:
parent
fdb093c4e4
commit
21a0451167
155
libavcodec/vp9.c
155
libavcodec/vp9.c
@ -2011,12 +2011,13 @@ static void decode_mode(AVCodecContext *ctx)
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME remove tx argument, and merge cnt/eob arguments?
|
||||
static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
|
||||
enum TxfmMode tx, unsigned (*cnt)[6][3],
|
||||
unsigned (*eob)[6][2], uint8_t (*p)[6][11],
|
||||
int nnz, const int16_t *scan, const int16_t (*nb)[2],
|
||||
const int16_t *band_counts, const int16_t *qmul)
|
||||
// FIXME merge cnt/eob arguments?
|
||||
static av_always_inline int
|
||||
decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
|
||||
int is_tx32x32, unsigned (*cnt)[6][3],
|
||||
unsigned (*eob)[6][2], uint8_t (*p)[6][11],
|
||||
int nnz, const int16_t *scan, const int16_t (*nb)[2],
|
||||
const int16_t *band_counts, const int16_t *qmul)
|
||||
{
|
||||
int i = 0, band = 0, band_left = band_counts[band];
|
||||
uint8_t *tp = p[0][nnz];
|
||||
@ -2108,7 +2109,7 @@ static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
|
||||
}
|
||||
if (!--band_left)
|
||||
band_left = band_counts[++band];
|
||||
if (tx == TX_32X32) // FIXME slow
|
||||
if (is_tx32x32)
|
||||
coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
|
||||
else
|
||||
coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
|
||||
@ -2119,6 +2120,26 @@ static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
|
||||
return i;
|
||||
}
|
||||
|
||||
static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
|
||||
unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
|
||||
uint8_t (*p)[6][11], int nnz, const int16_t *scan,
|
||||
const int16_t (*nb)[2], const int16_t *band_counts,
|
||||
const int16_t *qmul)
|
||||
{
|
||||
return decode_coeffs_b_generic(c, coef, n_coeffs, 0, cnt, eob, p,
|
||||
nnz, scan, nb, band_counts, qmul);
|
||||
}
|
||||
|
||||
static int decode_coeffs_b32(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
|
||||
unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
|
||||
uint8_t (*p)[6][11], int nnz, const int16_t *scan,
|
||||
const int16_t (*nb)[2], const int16_t *band_counts,
|
||||
const int16_t *qmul)
|
||||
{
|
||||
return decode_coeffs_b_generic(c, coef, n_coeffs, 1, cnt, eob, p,
|
||||
nnz, scan, nb, band_counts, qmul);
|
||||
}
|
||||
|
||||
static void decode_coeffs(AVCodecContext *ctx)
|
||||
{
|
||||
VP9Context *s = ctx->priv_data;
|
||||
@ -2130,8 +2151,7 @@ static void decode_coeffs(AVCodecContext *ctx)
|
||||
int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
|
||||
int end_x = FFMIN(2 * (s->cols - col), w4);
|
||||
int end_y = FFMIN(2 * (s->rows - row), h4);
|
||||
int n, pl, x, y, step1d = 1 << b->tx, step = 1 << (b->tx * 2);
|
||||
int uvstep1d = 1 << b->uvtx, uvstep = 1 << (b->uvtx * 2), res;
|
||||
int n, pl, x, y, res;
|
||||
int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
|
||||
int tx = 4 * s->lossless + b->tx;
|
||||
const int16_t * const *yscans = vp9_scans[tx];
|
||||
@ -2158,29 +2178,22 @@ static void decode_coeffs(AVCodecContext *ctx)
|
||||
MERGE(a, end_x, step, rd); \
|
||||
} while (0)
|
||||
|
||||
/* y tokens */
|
||||
switch (b->tx) {
|
||||
case TX_8X8: MERGE_CTX(2, AV_RN16A); break;
|
||||
case TX_16X16: MERGE_CTX(4, AV_RN32A); break;
|
||||
case TX_32X32: MERGE_CTX(8, AV_RN64A); break;
|
||||
}
|
||||
for (n = 0, y = 0; y < end_y; y += step1d) {
|
||||
for (x = 0; x < end_x; x += step1d, n += step) {
|
||||
enum TxfmType txtp = vp9_intra_txfm_type[b->mode[b->tx == TX_4X4 &&
|
||||
b->bs > BS_8x8 ?
|
||||
n : 0]];
|
||||
int nnz = a[x] + l[y];
|
||||
res = decode_coeffs_b(&s->c, s->block + 16 * n, 16 * step,
|
||||
b->tx, c, e, p, nnz, yscans[txtp],
|
||||
ynbs[txtp], y_band_counts, qmul[0]);
|
||||
a[x] = l[y] = !!res;
|
||||
if (b->tx > TX_8X8) {
|
||||
AV_WN16A(&s->eob[n], res);
|
||||
} else {
|
||||
s->eob[n] = res;
|
||||
}
|
||||
}
|
||||
#define DECODE_Y_COEF_LOOP(step, mode_index, v) \
|
||||
for (n = 0, y = 0; y < end_y; y += step) { \
|
||||
for (x = 0; x < end_x; x += step, n += step * step) { \
|
||||
enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
|
||||
res = decode_coeffs_b##v(&s->c, s->block + 16 * n, 16 * step * step, \
|
||||
c, e, p, a[x] + l[y], yscans[txtp], \
|
||||
ynbs[txtp], y_band_counts, qmul[0]); \
|
||||
a[x] = l[y] = !!res; \
|
||||
if (step >= 4) { \
|
||||
AV_WN16A(&s->eob[n], res); \
|
||||
} else { \
|
||||
s->eob[n] = res; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define SPLAT(la, end, step, cond) \
|
||||
if (step == 2) { \
|
||||
for (n = 1; n < end; n += step) \
|
||||
@ -2215,10 +2228,38 @@ static void decode_coeffs(AVCodecContext *ctx)
|
||||
SPLAT(a, end_x, step, end_x == w4); \
|
||||
SPLAT(l, end_y, step, end_y == h4); \
|
||||
} while (0)
|
||||
|
||||
/* y tokens */
|
||||
switch (b->tx) {
|
||||
case TX_8X8: SPLAT_CTX(2); break;
|
||||
case TX_16X16: SPLAT_CTX(4); break;
|
||||
case TX_32X32: SPLAT_CTX(8); break;
|
||||
case TX_4X4:
|
||||
DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
|
||||
break;
|
||||
case TX_8X8:
|
||||
MERGE_CTX(2, AV_RN16A);
|
||||
DECODE_Y_COEF_LOOP(2, 0,);
|
||||
SPLAT_CTX(2);
|
||||
break;
|
||||
case TX_16X16:
|
||||
MERGE_CTX(4, AV_RN32A);
|
||||
DECODE_Y_COEF_LOOP(4, 0,);
|
||||
SPLAT_CTX(4);
|
||||
break;
|
||||
case TX_32X32:
|
||||
MERGE_CTX(8, AV_RN64A);
|
||||
DECODE_Y_COEF_LOOP(8, 0, 32);
|
||||
SPLAT_CTX(8);
|
||||
break;
|
||||
}
|
||||
|
||||
#define DECODE_UV_COEF_LOOP(step) \
|
||||
for (n = 0, y = 0; y < end_y; y += step) { \
|
||||
for (x = 0; x < end_x; x += step, n += step * step) { \
|
||||
res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \
|
||||
16 * step * step, c, e, p, a[x] + l[y], \
|
||||
uvscan, uvnb, uv_band_counts, qmul[1]); \
|
||||
a[x] = l[y] = !!res; \
|
||||
s->uveob[pl][n] = res; \
|
||||
} \
|
||||
}
|
||||
|
||||
p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
|
||||
@ -2232,28 +2273,30 @@ static void decode_coeffs(AVCodecContext *ctx)
|
||||
a = &s->above_uv_nnz_ctx[pl][col];
|
||||
l = &s->left_uv_nnz_ctx[pl][row & 7];
|
||||
switch (b->uvtx) {
|
||||
case TX_8X8: MERGE_CTX(2, AV_RN16A); break;
|
||||
case TX_16X16: MERGE_CTX(4, AV_RN32A); break;
|
||||
case TX_32X32: MERGE_CTX(8, AV_RN64A); break;
|
||||
}
|
||||
for (n = 0, y = 0; y < end_y; y += uvstep1d) {
|
||||
for (x = 0; x < end_x; x += uvstep1d, n += uvstep) {
|
||||
int nnz = a[x] + l[y];
|
||||
res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n,
|
||||
16 * uvstep, b->uvtx, c, e, p, nnz,
|
||||
uvscan, uvnb, uv_band_counts, qmul[1]);
|
||||
a[x] = l[y] = !!res;
|
||||
if (b->uvtx > TX_8X8) {
|
||||
AV_WN16A(&s->uveob[pl][n], res);
|
||||
} else {
|
||||
s->uveob[pl][n] = res;
|
||||
}
|
||||
}
|
||||
}
|
||||
switch (b->uvtx) {
|
||||
case TX_8X8: SPLAT_CTX(2); break;
|
||||
case TX_16X16: SPLAT_CTX(4); break;
|
||||
case TX_32X32: SPLAT_CTX(8); break;
|
||||
case TX_4X4:
|
||||
DECODE_UV_COEF_LOOP(1);
|
||||
break;
|
||||
case TX_8X8:
|
||||
MERGE_CTX(2, AV_RN16A);
|
||||
DECODE_UV_COEF_LOOP(2);
|
||||
SPLAT_CTX(2);
|
||||
break;
|
||||
case TX_16X16:
|
||||
MERGE_CTX(4, AV_RN32A);
|
||||
DECODE_UV_COEF_LOOP(4);
|
||||
SPLAT_CTX(4);
|
||||
break;
|
||||
case TX_32X32:
|
||||
MERGE_CTX(8, AV_RN64A);
|
||||
// a 64x64 (max) uv block can ever only contain 1 tx32x32 block
|
||||
// so there is no need to loop
|
||||
res = decode_coeffs_b32(&s->c, s->uvblock[pl],
|
||||
1024, c, e, p, a[0] + l[0],
|
||||
uvscan, uvnb, uv_band_counts, qmul[1]);
|
||||
a[0] = l[0] = !!res;
|
||||
AV_WN16A(&s->uveob[pl][0], res);
|
||||
SPLAT_CTX(8);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user