mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-08 13:22:53 +02:00
diracdec: implement a LUT-based Golomb code parser
Still much left to optimize, but it provides a significant performance improvement - 10% for 300Mbps (1080p30), 25% for 1.5Gbps (4k 60fps) in comparison with the default implementation. Signed-off-by: Rostislav Pehlivanov <rpehlivanov@obe.tv>
This commit is contained in:
parent
c43485f707
commit
0eb0f93109
@ -238,7 +238,8 @@ OBJS-$(CONFIG_DCA_DECODER) += dcadec.o dca.o dcadata.o dcahuff.o \
|
|||||||
OBJS-$(CONFIG_DCA_ENCODER) += dcaenc.o dca.o dcadata.o
|
OBJS-$(CONFIG_DCA_ENCODER) += dcaenc.o dca.o dcadata.o
|
||||||
OBJS-$(CONFIG_DDS_DECODER) += dds.o
|
OBJS-$(CONFIG_DDS_DECODER) += dds.o
|
||||||
OBJS-$(CONFIG_DIRAC_DECODER) += diracdec.o dirac.o diracdsp.o diractab.o \
|
OBJS-$(CONFIG_DIRAC_DECODER) += diracdec.o dirac.o diracdsp.o diractab.o \
|
||||||
dirac_arith.o mpeg12data.o dirac_dwt.o
|
dirac_arith.o mpeg12data.o dirac_dwt.o \
|
||||||
|
dirac_vlc.o
|
||||||
OBJS-$(CONFIG_DFA_DECODER) += dfa.o
|
OBJS-$(CONFIG_DFA_DECODER) += dfa.o
|
||||||
OBJS-$(CONFIG_DNXHD_DECODER) += dnxhddec.o dnxhddata.o
|
OBJS-$(CONFIG_DNXHD_DECODER) += dnxhddec.o dnxhddata.o
|
||||||
OBJS-$(CONFIG_DNXHD_ENCODER) += dnxhdenc.o dnxhddata.o
|
OBJS-$(CONFIG_DNXHD_ENCODER) += dnxhdenc.o dnxhddata.o
|
||||||
|
242
libavcodec/dirac_vlc.c
Normal file
242
libavcodec/dirac_vlc.c
Normal file
@ -0,0 +1,242 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2016 Open Broadcast Systems Ltd.
|
||||||
|
* Author 2016 Rostislav Pehlivanov <rpehlivanov@obe.tv>
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "dirac_vlc.h"
|
||||||
|
|
||||||
|
/* Number of entries in one lookup table: one per possible value of a
 * LUT_BITS wide chunk of input */
#define LUT_SIZE   (1 << LUT_BITS)

/* Width in bits of the residual accumulator type */
#define RSIZE_BITS (CHAR_BIT*sizeof(residual))

/*
 * Left-justify a bit string inside a residual: the value a is shifted up so
 * that bit (b - 1) of a becomes the most significant bit of the result.
 * A zero-width string (b == 0) yields 0; it must not reach the shift, because
 * shifting by the full width of the type is undefined behaviour in C.
 * Note: b is evaluated twice, so it must not have side effects.
 */
#define CONVERT_TO_RESIDUE(a, b)                                               \
    ((b) ? ((residual)(a)) << (RSIZE_BITS - (b)) : (residual)0)

/*
 * Declare a residual named N holding the B lowest bits of I (left-justified)
 * together with its bit counter N_bits. The counter is tagged av_unused as
 * not every user reads it.
 */
#define INIT_RESIDUE(N, I, B)                                                  \
    residual N = CONVERT_TO_RESIDUE(I, B);                                     \
    av_unused int32_t N ## _bits = (B)
|
||||||
|
|
||||||
|
/*
 * Decode signed interleaved exp-Golomb coefficients into 32-bit integers,
 * consuming the input one byte (one table lookup) at a time.
 *
 * lut_ctx  tables built by ff_dirac_golomb_reader_init(): four consecutive
 *          tables of LUT_SIZE entries each (0/1 parity, 2 reset, 3 sign)
 * buf      input bytes; exactly buf[0] .. buf[bytes - 1] are read
 * bytes    number of input bytes available
 * _dst     output buffer, accessed as int32_t[coeffs]
 * coeffs   maximum number of coefficients to store
 *
 * Returns the number of coefficients stored, never more than coeffs. Parsing
 * stops when either the input or the output space is exhausted.
 */
int ff_dirac_golomb_read_32bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
                               int bytes, uint8_t *_dst, int coeffs)
{
    int32_t *dst = (int32_t *)_dst;
    int c_idx = 0;
    /* Table used for the next byte; 2 is the reset table, i.e. no partially
     * read coefficient is pending */
    int table = 2;
    /* Bits of the coefficient straddling byte boundaries, left-justified */
    residual res = 0;
    int32_t res_bits = 0;

    for (int b = 0; b < bytes; b++) {
        const DiracGolombLUT *l = &lut_ctx[table*LUT_SIZE + buf[b]];
        int fit;

        if (c_idx >= coeffs)
            return c_idx;

        /* res_bits is a hint for better branch prediction */
        if (res_bits && l->sign) {
            int32_t coeff = 1;

            /* This byte terminates the pending coefficient: join the stored
             * bits with the head of this byte. The bit count is wrapped to
             * the accumulator width so that no later shift can reach
             * RSIZE_BITS (undefined behaviour); an over-long code decodes to
             * garbage instead. */
            res     |= l->preamble >> res_bits;
            res_bits = (res_bits + l->preamble_bits) & (int32_t)(RSIZE_BITS - 1);

            /* Data bits sit at every second position, after each 0 prefix */
            for (int i = 0; i < (res_bits >> 1) - 1; i++) {
                coeff <<= 1;
                coeff  |= (res >> (RSIZE_BITS - 2*i - 2)) & 1;
            }
            dst[c_idx++] = l->sign * (coeff - 1);
            res      = 0;
            res_bits = 0;
        }

        /* Coefficients fully contained in this byte; copy only what still
         * fits so dst is never written beyond coeffs entries */
        fit = coeffs - c_idx;
        if (fit > l->ready_num)
            fit = l->ready_num;
        memcpy(&dst[c_idx], l->ready, fit*sizeof(*dst));
        c_idx += fit;

        /* Keep the unfinished tail of this byte for the next iteration */
        res     |= l->leftover >> res_bits;
        res_bits = (res_bits + l->leftover_bits) & (int32_t)(RSIZE_BITS - 1);

        /* 3: only a sign bit is missing, 2: nothing pending,
         * 0/1: parity of the number of pending bits */
        table = l->need_s ? 3 : !res_bits ? 2 : res_bits & 1;
    }

    return c_idx;
}
|
||||||
|
|
||||||
|
/*
 * Decode signed interleaved exp-Golomb coefficients into 16-bit integers,
 * consuming the input one byte (one table lookup) at a time.
 *
 * lut_ctx  tables built by ff_dirac_golomb_reader_init(): four consecutive
 *          tables of LUT_SIZE entries each (0/1 parity, 2 reset, 3 sign)
 * buf      input bytes; exactly buf[0] .. buf[bytes - 1] are read
 * bytes    number of input bytes available
 * _dst     output buffer, accessed as int16_t[coeffs]
 * coeffs   maximum number of coefficients to store
 *
 * Returns the number of coefficients stored, never more than coeffs. Parsing
 * stops when either the input or the output space is exhausted. Values are
 * converted to int16_t on store.
 */
int ff_dirac_golomb_read_16bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
                               int bytes, uint8_t *_dst, int coeffs)
{
    int16_t *dst = (int16_t *)_dst;
    int c_idx = 0;
    /* Table used for the next byte; 2 is the reset table, i.e. no partially
     * read coefficient is pending */
    int table = 2;
    /* Bits of the coefficient straddling byte boundaries, left-justified */
    residual res = 0;
    int32_t res_bits = 0;

    for (int b = 0; b < bytes; b++) {
        const DiracGolombLUT *l = &lut_ctx[table*LUT_SIZE + buf[b]];
        int fit;

        if (c_idx >= coeffs)
            return c_idx;

        /* res_bits is a hint for better branch prediction */
        if (res_bits && l->sign) {
            int32_t coeff = 1;

            /* This byte terminates the pending coefficient: join the stored
             * bits with the head of this byte. The bit count is wrapped to
             * the accumulator width so that no later shift can reach
             * RSIZE_BITS (undefined behaviour); an over-long code decodes to
             * garbage instead. */
            res     |= l->preamble >> res_bits;
            res_bits = (res_bits + l->preamble_bits) & (int32_t)(RSIZE_BITS - 1);

            /* Data bits sit at every second position, after each 0 prefix */
            for (int i = 0; i < (res_bits >> 1) - 1; i++) {
                coeff <<= 1;
                coeff  |= (res >> (RSIZE_BITS - 2*i - 2)) & 1;
            }
            dst[c_idx++] = l->sign * (coeff - 1);
            res      = 0;
            res_bits = 0;
        }

        /* Coefficients fully contained in this byte; store only what still
         * fits so dst is never written beyond coeffs entries */
        fit = coeffs - c_idx;
        if (fit > l->ready_num)
            fit = l->ready_num;
        for (int i = 0; i < fit; i++)
            dst[c_idx + i] = l->ready[i];
        c_idx += fit;

        /* Keep the unfinished tail of this byte for the next iteration */
        res     |= l->leftover >> res_bits;
        res_bits = (res_bits + l->leftover_bits) & (int32_t)(RSIZE_BITS - 1);

        /* 3: only a sign bit is missing, 2: nothing pending,
         * 0/1: parity of the number of pending bits */
        table = l->need_s ? 3 : !res_bits ? 2 : res_bits & 1;
    }

    return c_idx;
}
|
||||||
|
|
||||||
|
/* Searches for golomb codes in a residue */
|
||||||
|
static inline void search_for_golomb(DiracGolombLUT *l, residual r, int bits)
|
||||||
|
{
|
||||||
|
int r_count = RSIZE_BITS - 1;
|
||||||
|
int bits_start, bits_tot = bits, need_sign = 0;
|
||||||
|
|
||||||
|
#define READ_BIT(N) (((N) >> (N ## _count--)) & 1)
|
||||||
|
|
||||||
|
while (1) {
|
||||||
|
int32_t coef = 1;
|
||||||
|
bits_start = (RSIZE_BITS - 1) - r_count;
|
||||||
|
|
||||||
|
while (1) {
|
||||||
|
if (!bits--)
|
||||||
|
goto leftover;
|
||||||
|
if (READ_BIT(r))
|
||||||
|
break;
|
||||||
|
|
||||||
|
coef <<= 1;
|
||||||
|
|
||||||
|
if (!bits--)
|
||||||
|
goto leftover;
|
||||||
|
coef |= READ_BIT(r);
|
||||||
|
}
|
||||||
|
|
||||||
|
l->ready[l->ready_num] = coef - 1;
|
||||||
|
if (l->ready[l->ready_num]) {
|
||||||
|
if (!bits--) {
|
||||||
|
need_sign = 1;
|
||||||
|
goto leftover;
|
||||||
|
}
|
||||||
|
l->ready[l->ready_num] *= READ_BIT(r) ? -1 : +1;
|
||||||
|
}
|
||||||
|
l->ready_num++;
|
||||||
|
|
||||||
|
if (!bits)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
leftover:
|
||||||
|
l->leftover = r << bits_start;
|
||||||
|
l->leftover_bits = bits_tot - bits_start;
|
||||||
|
l->need_s = need_sign;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Parity LUTs - even and odd bit end positions */
|
||||||
|
static void generate_parity_lut(DiracGolombLUT *lut, int even)
|
||||||
|
{
|
||||||
|
for (int idx = 0; idx < LUT_SIZE; idx++) {
|
||||||
|
DiracGolombLUT *l = &lut[idx];
|
||||||
|
int symbol_end_loc = -1;
|
||||||
|
uint32_t code;
|
||||||
|
|
||||||
|
INIT_RESIDUE(res, idx, LUT_BITS);
|
||||||
|
|
||||||
|
for (int i = 0; i < LUT_BITS; i++) {
|
||||||
|
const int cond = even ? (i & 1) : !(i & 1);
|
||||||
|
if (((res >> (RSIZE_BITS - i - 1)) & 1) && cond) {
|
||||||
|
symbol_end_loc = i + 2;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (symbol_end_loc < 0 || symbol_end_loc > LUT_BITS) {
|
||||||
|
l->preamble = 0;
|
||||||
|
l->preamble_bits = 0;
|
||||||
|
l->leftover_bits = LUT_BITS;
|
||||||
|
l->leftover = CONVERT_TO_RESIDUE(idx, l->leftover_bits);
|
||||||
|
if (even)
|
||||||
|
l->need_s = idx & 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Gets bits 0 through to (symbol_end_loc - 1) inclusive */
|
||||||
|
code = idx >> ((LUT_BITS - 1) - (symbol_end_loc - 1));
|
||||||
|
code &= ((1 << LUT_BITS) - 1) >> (LUT_BITS - symbol_end_loc);
|
||||||
|
l->preamble_bits = symbol_end_loc;
|
||||||
|
l->preamble = CONVERT_TO_RESIDUE(code, l->preamble_bits);
|
||||||
|
l->sign = ((l->preamble >> (RSIZE_BITS - l->preamble_bits)) & 1) ? -1 : +1;
|
||||||
|
|
||||||
|
search_for_golomb(l, res << symbol_end_loc, LUT_BITS - symbol_end_loc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Reset (off == 0) and needs-one-more-bit (off == 1) LUTs */
|
||||||
|
static void generate_offset_lut(DiracGolombLUT *lut, int off)
|
||||||
|
{
|
||||||
|
for (int idx = 0; idx < LUT_SIZE; idx++) {
|
||||||
|
DiracGolombLUT *l = &lut[idx];
|
||||||
|
|
||||||
|
INIT_RESIDUE(res, idx, LUT_BITS);
|
||||||
|
|
||||||
|
l->preamble = CONVERT_TO_RESIDUE(res >> (RSIZE_BITS - off), off);
|
||||||
|
l->preamble_bits = off;
|
||||||
|
l->sign = ((l->preamble >> (RSIZE_BITS - l->preamble_bits)) & 1) ? -1 : +1;
|
||||||
|
|
||||||
|
search_for_golomb(l, res << off, LUT_BITS - off);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Allocate and fill the four lookup tables used by the Golomb readers.
 *
 * The tables are stored back to back, LUT_SIZE entries each:
 *   0, 1 - parity tables, 2 - reset table, 3 - needs-one-more-bit table.
 *
 * On success *lut_ctx receives the zero-initialised-then-filled array and 0
 * is returned; on allocation failure AVERROR(ENOMEM) is returned and
 * *lut_ctx is left untouched.
 */
av_cold int ff_dirac_golomb_reader_init(DiracGolombLUT **lut_ctx)
{
    DiracGolombLUT *tables = av_calloc(4*LUT_SIZE, sizeof(DiracGolombLUT));

    if (!tables)
        return AVERROR(ENOMEM);

    generate_parity_lut(tables + 0*LUT_SIZE, 0);
    generate_parity_lut(tables + 1*LUT_SIZE, 1);
    generate_offset_lut(tables + 2*LUT_SIZE, 0);
    generate_offset_lut(tables + 3*LUT_SIZE, 1);

    *lut_ctx = tables;

    return 0;
}
|
||||||
|
|
||||||
|
/*
 * Release the tables created by ff_dirac_golomb_reader_init().
 * The pointer is passed by address and handed to av_freep(), which frees
 * the array and resets *lut_ctx.
 */
av_cold void ff_dirac_golomb_reader_end(DiracGolombLUT **lut_ctx)
{
    av_freep(lut_ctx);
}
|
51
libavcodec/dirac_vlc.h
Normal file
51
libavcodec/dirac_vlc.h
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2016 Open Broadcast Systems Ltd.
|
||||||
|
* Author 2016 Rostislav Pehlivanov <rpehlivanov@obe.tv>
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef AVCODEC_DIRAC_VLC_H
|
||||||
|
#define AVCODEC_DIRAC_VLC_H
|
||||||
|
|
||||||
|
#include <libavutil/avutil.h>
|
||||||
|
|
||||||
|
/* Accumulator for the bits of a coefficient that spans several input bytes.
 * Can be 32 bits wide for some performance gain on some machines, but it will
 * incorrectly decode very long coefficients (usually only 1 or 2 per frame) */
typedef uint64_t residual;

/* Number of input bits resolved by a single table lookup */
#define LUT_BITS 8

/* One lookup-table entry, describing how a single input byte decodes.
 * Exactly 64 bytes */
typedef struct DiracGolombLUT {
    /* Leading bits that complete a coefficient started in earlier bytes,
     * left-justified */
    residual preamble;
    /* Trailing bits of a coefficient that does not end in this byte,
     * left-justified */
    residual leftover;
    /* Coefficients fully contained in this byte; ready_num of them are valid */
    int32_t  ready[LUT_BITS];
    /* Number of valid bits in preamble */
    int32_t  preamble_bits;
    /* Number of valid bits in leftover */
    int32_t  leftover_bits;
    /* Number of valid entries in ready */
    int32_t  ready_num;
    /* Nonzero when a coefficient is complete except for its sign bit, which
     * is the first bit of the next byte */
    int8_t   need_s;
    /* -1 or +1: sign taken from the last preamble bit; stays 0 in entries
     * for which no preamble is generated */
    int8_t   sign;
} DiracGolombLUT;
|
||||||
|
|
||||||
|
av_cold int ff_dirac_golomb_reader_init(DiracGolombLUT **lut_ctx);
|
||||||
|
|
||||||
|
int ff_dirac_golomb_read_32bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
|
||||||
|
int bytes, uint8_t *dst, int coeffs);
|
||||||
|
|
||||||
|
int ff_dirac_golomb_read_16bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
|
||||||
|
int bytes, uint8_t *_dst, int coeffs);
|
||||||
|
|
||||||
|
av_cold void ff_dirac_golomb_reader_end(DiracGolombLUT **lut_ctx);
|
||||||
|
|
||||||
|
#endif /* AVCODEC_DIRAC_VLC_H */
|
@ -32,6 +32,7 @@
|
|||||||
#include "internal.h"
|
#include "internal.h"
|
||||||
#include "golomb.h"
|
#include "golomb.h"
|
||||||
#include "dirac_arith.h"
|
#include "dirac_arith.h"
|
||||||
|
#include "dirac_vlc.h"
|
||||||
#include "mpeg12data.h"
|
#include "mpeg12data.h"
|
||||||
#include "libavcodec/mpegvideo.h"
|
#include "libavcodec/mpegvideo.h"
|
||||||
#include "mpegvideoencdsp.h"
|
#include "mpegvideoencdsp.h"
|
||||||
@ -125,6 +126,7 @@ typedef struct DiracContext {
|
|||||||
MpegvideoEncDSPContext mpvencdsp;
|
MpegvideoEncDSPContext mpvencdsp;
|
||||||
VideoDSPContext vdsp;
|
VideoDSPContext vdsp;
|
||||||
DiracDSPContext diracdsp;
|
DiracDSPContext diracdsp;
|
||||||
|
DiracGolombLUT *reader_ctx;
|
||||||
DiracVersionInfo version;
|
DiracVersionInfo version;
|
||||||
GetBitContext gb;
|
GetBitContext gb;
|
||||||
AVDiracSeqHeader seq;
|
AVDiracSeqHeader seq;
|
||||||
@ -378,6 +380,7 @@ static av_cold int dirac_decode_init(AVCodecContext *avctx)
|
|||||||
s->threads_num_buf = -1;
|
s->threads_num_buf = -1;
|
||||||
s->thread_buf_size = -1;
|
s->thread_buf_size = -1;
|
||||||
|
|
||||||
|
ff_dirac_golomb_reader_init(&s->reader_ctx);
|
||||||
ff_diracdsp_init(&s->diracdsp);
|
ff_diracdsp_init(&s->diracdsp);
|
||||||
ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
|
ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
|
||||||
ff_videodsp_init(&s->vdsp, 8);
|
ff_videodsp_init(&s->vdsp, 8);
|
||||||
@ -407,6 +410,8 @@ static av_cold int dirac_decode_end(AVCodecContext *avctx)
|
|||||||
DiracContext *s = avctx->priv_data;
|
DiracContext *s = avctx->priv_data;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
ff_dirac_golomb_reader_end(&s->reader_ctx);
|
||||||
|
|
||||||
dirac_decode_flush(avctx);
|
dirac_decode_flush(avctx);
|
||||||
for (i = 0; i < MAX_FRAMES; i++)
|
for (i = 0; i < MAX_FRAMES; i++)
|
||||||
av_frame_free(&s->all_frames[i].avframe);
|
av_frame_free(&s->all_frames[i].avframe);
|
||||||
@ -825,10 +830,11 @@ static int decode_hq_slice(DiracContext *s, DiracSlice *slice, uint8_t *tmp_buf)
|
|||||||
|
|
||||||
/* Luma + 2 Chroma planes */
|
/* Luma + 2 Chroma planes */
|
||||||
for (i = 0; i < 3; i++) {
|
for (i = 0; i < 3; i++) {
|
||||||
int c, coef_num, coef_par, off = 0;
|
int coef_num, coef_par, off = 0;
|
||||||
int64_t length = s->highquality.size_scaler*get_bits(gb, 8);
|
int64_t length = s->highquality.size_scaler*get_bits(gb, 8);
|
||||||
int64_t start = get_bits_count(gb);
|
int64_t start = get_bits_count(gb);
|
||||||
int64_t bits_end = start + 8*length;
|
int64_t bits_end = start + 8*length;
|
||||||
|
const uint8_t *addr = align_get_bits(gb);
|
||||||
|
|
||||||
if (bits_end >= INT_MAX) {
|
if (bits_end >= INT_MAX) {
|
||||||
av_log(s->avctx, AV_LOG_ERROR, "end too far away\n");
|
av_log(s->avctx, AV_LOG_ERROR, "end too far away\n");
|
||||||
@ -837,17 +843,12 @@ static int decode_hq_slice(DiracContext *s, DiracSlice *slice, uint8_t *tmp_buf)
|
|||||||
|
|
||||||
coef_num = subband_coeffs(s, slice->slice_x, slice->slice_y, i, coeffs_num);
|
coef_num = subband_coeffs(s, slice->slice_x, slice->slice_y, i, coeffs_num);
|
||||||
|
|
||||||
if (s->pshift) {
|
if (s->pshift)
|
||||||
int32_t *dst = (int32_t *)tmp_buf;
|
coef_par = ff_dirac_golomb_read_32bit(s->reader_ctx, addr,
|
||||||
for (c = 0; c < coef_num; c++)
|
length, tmp_buf, coef_num);
|
||||||
dst[c] = dirac_get_se_golomb(gb);
|
else
|
||||||
coef_par = c;
|
coef_par = ff_dirac_golomb_read_16bit(s->reader_ctx, addr,
|
||||||
} else {
|
length, tmp_buf, coef_num);
|
||||||
int16_t *dst = (int16_t *)tmp_buf;
|
|
||||||
for (c = 0; c < coef_num; c++)
|
|
||||||
dst[c] = dirac_get_se_golomb(gb);
|
|
||||||
coef_par = c;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (coef_num > coef_par) {
|
if (coef_num > coef_par) {
|
||||||
const int start_b = coef_par * (4 >> s->pshift);
|
const int start_b = coef_par * (4 >> s->pshift);
|
||||||
|
Loading…
Reference in New Issue
Block a user