From 0eb0f93109aa2353c87dfaeaf899efec9215d1c1 Mon Sep 17 00:00:00 2001 From: Rostislav Pehlivanov Date: Thu, 23 Jun 2016 18:07:01 +0100 Subject: [PATCH] diracdec: implement a LUT-based Golomb code parser Still much left to optimize, but it provides a significant performance improvement - 10% for 300Mbps (1080p30), 25% for 1.5Gbps (4k 60fps) in comparison with the default implementation. Signed-off-by: Rostislav Pehlivanov --- libavcodec/Makefile | 3 +- libavcodec/dirac_vlc.c | 242 +++++++++++++++++++++++++++++++++++++++++ libavcodec/dirac_vlc.h | 51 +++++++++ libavcodec/diracdec.c | 25 +++-- 4 files changed, 308 insertions(+), 13 deletions(-) create mode 100644 libavcodec/dirac_vlc.c create mode 100644 libavcodec/dirac_vlc.h diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 78cd36c27a..abef19e18b 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -238,7 +238,8 @@ OBJS-$(CONFIG_DCA_DECODER) += dcadec.o dca.o dcadata.o dcahuff.o \ OBJS-$(CONFIG_DCA_ENCODER) += dcaenc.o dca.o dcadata.o OBJS-$(CONFIG_DDS_DECODER) += dds.o OBJS-$(CONFIG_DIRAC_DECODER) += diracdec.o dirac.o diracdsp.o diractab.o \ - dirac_arith.o mpeg12data.o dirac_dwt.o + dirac_arith.o mpeg12data.o dirac_dwt.o \ + dirac_vlc.o OBJS-$(CONFIG_DFA_DECODER) += dfa.o OBJS-$(CONFIG_DNXHD_DECODER) += dnxhddec.o dnxhddata.o OBJS-$(CONFIG_DNXHD_ENCODER) += dnxhdenc.o dnxhddata.o diff --git a/libavcodec/dirac_vlc.c b/libavcodec/dirac_vlc.c new file mode 100644 index 0000000000..bd0469a082 --- /dev/null +++ b/libavcodec/dirac_vlc.c @@ -0,0 +1,242 @@ +/* + * Copyright (C) 2016 Open Broadcast Systems Ltd. + * Author 2016 Rostislav Pehlivanov + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "dirac_vlc.h" + +#define LUT_SIZE (1 << LUT_BITS) +#define RSIZE_BITS (CHAR_BIT*sizeof(residual)) + +#define CONVERT_TO_RESIDUE(a, b) \ + (((residual)(a)) << (RSIZE_BITS - (b))) + +#define INIT_RESIDUE(N, I, B) \ + residual N = B ? CONVERT_TO_RESIDUE(I, B) : 0; \ + av_unused int32_t N ## _bits = B + +int ff_dirac_golomb_read_32bit(DiracGolombLUT *lut_ctx, const uint8_t *buf, + int bytes, uint8_t *_dst, int coeffs) +{ + int i, b, c_idx = 0; + int32_t *dst = (int32_t *)_dst; + DiracGolombLUT *future[4], *l = &lut_ctx[2*LUT_SIZE + buf[0]]; + INIT_RESIDUE(res, 0, 0); + +#define APPEND_RESIDUE(N, M) \ + N |= M >> (N ## _bits); \ + N ## _bits += (M ## _bits) + + for (b = 1; b <= bytes; b++) { + future[0] = &lut_ctx[buf[b]]; + future[1] = future[0] + 1*LUT_SIZE; + future[2] = future[0] + 2*LUT_SIZE; + future[3] = future[0] + 3*LUT_SIZE; + + if ((c_idx + 1) > coeffs) + return c_idx; + + /* res_bits is a hint for better branch prediction */ + if (res_bits && l->sign) { + int32_t coeff = 1; + APPEND_RESIDUE(res, l->preamble); + for (i = 0; i < (res_bits >> 1) - 1; i++) { + coeff <<= 1; + coeff |= (res >> (RSIZE_BITS - 2*i - 2)) & 1; + } + dst[c_idx++] = l->sign * (coeff - 1); + res_bits = res = 0; + } + + memcpy(&dst[c_idx], l->ready, LUT_BITS*sizeof(int32_t)); + c_idx += l->ready_num; + + APPEND_RESIDUE(res, l->leftover); + + l = future[l->need_s ? 3 : !res_bits ? 
2 : res_bits & 1]; + } + + return c_idx; +} + +int ff_dirac_golomb_read_16bit(DiracGolombLUT *lut_ctx, const uint8_t *buf, + int bytes, uint8_t *_dst, int coeffs) +{ + int i, b, c_idx = 0; + int16_t *dst = (int16_t *)_dst; + DiracGolombLUT *future[4], *l = &lut_ctx[2*LUT_SIZE + buf[0]]; + INIT_RESIDUE(res, 0, 0); + +#define APPEND_RESIDUE(N, M) \ + N |= M >> (N ## _bits); \ + N ## _bits += (M ## _bits) + + for (b = 1; b <= bytes; b++) { + future[0] = &lut_ctx[buf[b]]; + future[1] = future[0] + 1*LUT_SIZE; + future[2] = future[0] + 2*LUT_SIZE; + future[3] = future[0] + 3*LUT_SIZE; + + if ((c_idx + 1) > coeffs) + return c_idx; + + if (res_bits && l->sign) { + int32_t coeff = 1; + APPEND_RESIDUE(res, l->preamble); + for (i = 0; i < (res_bits >> 1) - 1; i++) { + coeff <<= 1; + coeff |= (res >> (RSIZE_BITS - 2*i - 2)) & 1; + } + dst[c_idx++] = l->sign * (coeff - 1); + res_bits = res = 0; + } + + for (i = 0; i < LUT_BITS; i++) + dst[c_idx + i] = l->ready[i]; + c_idx += l->ready_num; + + APPEND_RESIDUE(res, l->leftover); + + l = future[l->need_s ? 3 : !res_bits ? 2 : res_bits & 1]; + } + + return c_idx; +} + +/* Searches for golomb codes in a residue */ +static inline void search_for_golomb(DiracGolombLUT *l, residual r, int bits) +{ + int r_count = RSIZE_BITS - 1; + int bits_start, bits_tot = bits, need_sign = 0; + +#define READ_BIT(N) (((N) >> (N ## _count--)) & 1) + + while (1) { + int32_t coef = 1; + bits_start = (RSIZE_BITS - 1) - r_count; + + while (1) { + if (!bits--) + goto leftover; + if (READ_BIT(r)) + break; + + coef <<= 1; + + if (!bits--) + goto leftover; + coef |= READ_BIT(r); + } + + l->ready[l->ready_num] = coef - 1; + if (l->ready[l->ready_num]) { + if (!bits--) { + need_sign = 1; + goto leftover; + } + l->ready[l->ready_num] *= READ_BIT(r) ? 
-1 : +1; + } + l->ready_num++; + + if (!bits) + return; + } + + leftover: + l->leftover = r << bits_start; + l->leftover_bits = bits_tot - bits_start; + l->need_s = need_sign; +} + +/* Parity LUTs - even and odd bit end positions */ +static void generate_parity_lut(DiracGolombLUT *lut, int even) +{ + for (int idx = 0; idx < LUT_SIZE; idx++) { + DiracGolombLUT *l = &lut[idx]; + int symbol_end_loc = -1; + uint32_t code; + + INIT_RESIDUE(res, idx, LUT_BITS); + + for (int i = 0; i < LUT_BITS; i++) { + const int cond = even ? (i & 1) : !(i & 1); + if (((res >> (RSIZE_BITS - i - 1)) & 1) && cond) { + symbol_end_loc = i + 2; + break; + } + } + + if (symbol_end_loc < 0 || symbol_end_loc > LUT_BITS) { + l->preamble = 0; + l->preamble_bits = 0; + l->leftover_bits = LUT_BITS; + l->leftover = CONVERT_TO_RESIDUE(idx, l->leftover_bits); + if (even) + l->need_s = idx & 1; + continue; + } + + /* Gets bits 0 through to (symbol_end_loc - 1) inclusive */ + code = idx >> ((LUT_BITS - 1) - (symbol_end_loc - 1)); + code &= ((1 << LUT_BITS) - 1) >> (LUT_BITS - symbol_end_loc); + l->preamble_bits = symbol_end_loc; + l->preamble = CONVERT_TO_RESIDUE(code, l->preamble_bits); + l->sign = ((l->preamble >> (RSIZE_BITS - l->preamble_bits)) & 1) ? -1 : +1; + + search_for_golomb(l, res << symbol_end_loc, LUT_BITS - symbol_end_loc); + } +} + +/* Reset (off == 0) and needs-one-more-bit (off == 1) LUTs */ +static void generate_offset_lut(DiracGolombLUT *lut, int off) +{ + for (int idx = 0; idx < LUT_SIZE; idx++) { + DiracGolombLUT *l = &lut[idx]; + + INIT_RESIDUE(res, idx, LUT_BITS); + + l->preamble = CONVERT_TO_RESIDUE(res >> (RSIZE_BITS - off), off); + l->preamble_bits = off; + l->sign = ((l->preamble >> (RSIZE_BITS - l->preamble_bits)) & 1) ? 
-1 : +1; + + search_for_golomb(l, res << off, LUT_BITS - off); + } +} + +av_cold int ff_dirac_golomb_reader_init(DiracGolombLUT **lut_ctx) +{ + DiracGolombLUT *lut; + + if (!(lut = av_calloc(4*LUT_SIZE, sizeof(DiracGolombLUT)))) + return AVERROR(ENOMEM); + + generate_parity_lut(&lut[0*LUT_SIZE], 0); + generate_parity_lut(&lut[1*LUT_SIZE], 1); + generate_offset_lut(&lut[2*LUT_SIZE], 0); + generate_offset_lut(&lut[3*LUT_SIZE], 1); + + *lut_ctx = lut; + + return 0; +} + +av_cold void ff_dirac_golomb_reader_end(DiracGolombLUT **lut_ctx) +{ + av_freep(lut_ctx); +} diff --git a/libavcodec/dirac_vlc.h b/libavcodec/dirac_vlc.h new file mode 100644 index 0000000000..523e9ca813 --- /dev/null +++ b/libavcodec/dirac_vlc.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2016 Open Broadcast Systems Ltd. + * Author 2016 Rostislav Pehlivanov + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_DIRAC_VLC_H +#define AVCODEC_DIRAC_VLC_H + +#include <libavutil/avutil.h> + +/* Can be 32 bits wide for some performance gain on some machines, but it will + * incorrectly decode very long coefficients (usually only 1 or 2 per frame) */ +typedef uint64_t residual; + +#define LUT_BITS 8 + +/* Exactly 64 bytes */ +typedef struct DiracGolombLUT { + residual preamble, leftover; + int32_t ready[LUT_BITS]; + int32_t preamble_bits, leftover_bits, ready_num; + int8_t need_s, sign; +} DiracGolombLUT; + +av_cold int ff_dirac_golomb_reader_init(DiracGolombLUT **lut_ctx); + +int ff_dirac_golomb_read_32bit(DiracGolombLUT *lut_ctx, const uint8_t *buf, + int bytes, uint8_t *dst, int coeffs); + +int ff_dirac_golomb_read_16bit(DiracGolombLUT *lut_ctx, const uint8_t *buf, + int bytes, uint8_t *_dst, int coeffs); + +av_cold void ff_dirac_golomb_reader_end(DiracGolombLUT **lut_ctx); + +#endif /* AVCODEC_DIRAC_VLC_H */ diff --git a/libavcodec/diracdec.c b/libavcodec/diracdec.c index 7913656991..e95ce9e10e 100644 --- a/libavcodec/diracdec.c +++ b/libavcodec/diracdec.c @@ -32,6 +32,7 @@ #include "internal.h" #include "golomb.h" #include "dirac_arith.h" +#include "dirac_vlc.h" #include "mpeg12data.h" #include "libavcodec/mpegvideo.h" #include "mpegvideoencdsp.h" @@ -125,6 +126,7 @@ typedef struct DiracContext { MpegvideoEncDSPContext mpvencdsp; VideoDSPContext vdsp; DiracDSPContext diracdsp; + DiracGolombLUT *reader_ctx; DiracVersionInfo version; GetBitContext gb; AVDiracSeqHeader seq; @@ -378,6 +380,7 @@ static av_cold int dirac_decode_init(AVCodecContext *avctx) s->threads_num_buf = -1; s->thread_buf_size = -1; + ff_dirac_golomb_reader_init(&s->reader_ctx); ff_diracdsp_init(&s->diracdsp); ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx); ff_videodsp_init(&s->vdsp, 8); @@ 
-407,6 +410,8 @@ static av_cold int dirac_decode_end(AVCodecContext *avctx) DiracContext *s = avctx->priv_data; int i; + ff_dirac_golomb_reader_end(&s->reader_ctx); + dirac_decode_flush(avctx); for (i = 0; i < MAX_FRAMES; i++) av_frame_free(&s->all_frames[i].avframe); @@ -825,10 +830,11 @@ static int decode_hq_slice(DiracContext *s, DiracSlice *slice, uint8_t *tmp_buf) /* Luma + 2 Chroma planes */ for (i = 0; i < 3; i++) { - int c, coef_num, coef_par, off = 0; + int coef_num, coef_par, off = 0; int64_t length = s->highquality.size_scaler*get_bits(gb, 8); int64_t start = get_bits_count(gb); int64_t bits_end = start + 8*length; + const uint8_t *addr = align_get_bits(gb); if (bits_end >= INT_MAX) { av_log(s->avctx, AV_LOG_ERROR, "end too far away\n"); @@ -837,17 +843,12 @@ static int decode_hq_slice(DiracContext *s, DiracSlice *slice, uint8_t *tmp_buf) coef_num = subband_coeffs(s, slice->slice_x, slice->slice_y, i, coeffs_num); - if (s->pshift) { - int32_t *dst = (int32_t *)tmp_buf; - for (c = 0; c < coef_num; c++) - dst[c] = dirac_get_se_golomb(gb); - coef_par = c; - } else { - int16_t *dst = (int16_t *)tmp_buf; - for (c = 0; c < coef_num; c++) - dst[c] = dirac_get_se_golomb(gb); - coef_par = c; - } + if (s->pshift) + coef_par = ff_dirac_golomb_read_32bit(s->reader_ctx, addr, + length, tmp_buf, coef_num); + else + coef_par = ff_dirac_golomb_read_16bit(s->reader_ctx, addr, + length, tmp_buf, coef_num); if (coef_num > coef_par) { const int start_b = coef_par * (4 >> s->pshift);