From 0eb0f93109aa2353c87dfaeaf899efec9215d1c1 Mon Sep 17 00:00:00 2001
From: Rostislav Pehlivanov <rpehlivanov@ob-encoder.com>
Date: Thu, 23 Jun 2016 18:07:01 +0100
Subject: [PATCH] diracdec: implement a LUT-based Golomb code parser

Still much left to optimize, but it provides a significant performance
improvement - 10% for 300Mbps (1080p30), 25% for 1.5Gbps (4k 60fps) in
comparison with the default implementation.

Signed-off-by: Rostislav Pehlivanov <rpehlivanov@obe.tv>
---
 libavcodec/Makefile    |   3 +-
 libavcodec/dirac_vlc.c | 242 +++++++++++++++++++++++++++++++++++++++++
 libavcodec/dirac_vlc.h |  51 +++++++++
 libavcodec/diracdec.c  |  25 +++--
 4 files changed, 308 insertions(+), 13 deletions(-)
 create mode 100644 libavcodec/dirac_vlc.c
 create mode 100644 libavcodec/dirac_vlc.h

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 78cd36c27a..abef19e18b 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -238,7 +238,8 @@ OBJS-$(CONFIG_DCA_DECODER)             += dcadec.o dca.o dcadata.o dcahuff.o \
 OBJS-$(CONFIG_DCA_ENCODER)             += dcaenc.o dca.o dcadata.o
 OBJS-$(CONFIG_DDS_DECODER)             += dds.o
 OBJS-$(CONFIG_DIRAC_DECODER)           += diracdec.o dirac.o diracdsp.o diractab.o \
-                                          dirac_arith.o mpeg12data.o dirac_dwt.o
+                                          dirac_arith.o mpeg12data.o dirac_dwt.o \
+                                          dirac_vlc.o
 OBJS-$(CONFIG_DFA_DECODER)             += dfa.o
 OBJS-$(CONFIG_DNXHD_DECODER)           += dnxhddec.o dnxhddata.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += dnxhdenc.o dnxhddata.o
diff --git a/libavcodec/dirac_vlc.c b/libavcodec/dirac_vlc.c
new file mode 100644
index 0000000000..bd0469a082
--- /dev/null
+++ b/libavcodec/dirac_vlc.c
@@ -0,0 +1,242 @@
+/*
+ * Copyright (C) 2016 Open Broadcast Systems Ltd.
+ * Author        2016 Rostislav Pehlivanov <rpehlivanov@obe.tv>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dirac_vlc.h"
+
+#define LUT_SIZE   (1 << LUT_BITS) /* number of entries in one LUT */
+#define RSIZE_BITS (CHAR_BIT*sizeof(residual)) /* width of residual in bits */
+/* Moves the low b bits of a to the most significant end of a residual (b >= 1) */
+#define CONVERT_TO_RESIDUE(a, b)                                               \
+    (((residual)(a)) << (RSIZE_BITS - (b)))
+/* Declares residual N (I as a B-bit residue; 0 if B == 0) and int32_t N_bits */
+#define INIT_RESIDUE(N, I, B)                                                  \
+    residual N = B ? CONVERT_TO_RESIDUE(I, B) : 0;                             \
+    av_unused int32_t N ## _bits  = B
+
+/* Appends residue M after the N_bits already held in N. The bit count wraps at
+ * RSIZE_BITS, so an overlong code can never cause an undefined full-width shift. */
+#define APPEND_RESIDUE(N, M)                                                   \
+    N          |= M >> (N ## _bits);                                           \
+    N ## _bits  = (N ## _bits + (M ## _bits)) & (int)(RSIZE_BITS - 1)
+
+/* Decodes Golomb codes from buf into int32_t coefficients at _dst, one byte per
+ * iteration, until `bytes` bytes are consumed or `coeffs` outputs are reached.
+ * Returns the output index reached; whole LUT rows are stored, so it can pass
+ * coeffs and _dst needs LUT_BITS entries of slack. buf[bytes] is read too. */
+int ff_dirac_golomb_read_32bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
+                               int bytes, uint8_t *_dst, int coeffs)
+{
+    int i, b, c_idx = 0;
+    int32_t *dst = (int32_t *)_dst;
+    DiracGolombLUT *future[4], *l = &lut_ctx[2*LUT_SIZE + buf[0]];
+    INIT_RESIDUE(res, 0, 0);
+
+    for (b = 1; b <= bytes; b++) {
+        future[0] = &lut_ctx[buf[b]];
+        future[1] = future[0] + 1*LUT_SIZE;
+        future[2] = future[0] + 2*LUT_SIZE;
+        future[3] = future[0] + 3*LUT_SIZE;
+
+        if ((c_idx + 1) > coeffs)
+            return c_idx;
+
+        /* res_bits is a hint for better branch prediction */
+        if (res_bits && l->sign) {
+            int32_t coeff = 1;
+            APPEND_RESIDUE(res, l->preamble);
+            for (i = 0; i < (res_bits >> 1) - 1; i++) {
+                coeff <<= 1;
+                coeff |= (res >> (RSIZE_BITS - 2*i - 2)) & 1;
+            }
+            dst[c_idx++] = l->sign * (coeff - 1);
+            res_bits = res = 0;
+        }
+        /* Stores a full row; only the first ready_num entries are kept */
+        memcpy(&dst[c_idx], l->ready, LUT_BITS*sizeof(int32_t));
+        c_idx += l->ready_num;
+        APPEND_RESIDUE(res, l->leftover);
+        /* Next LUT: 3 if a sign bit is pending, 2 if no residue, else its parity */
+        l = future[l->need_s ? 3 : !res_bits ? 2 : res_bits & 1];
+    }
+    return c_idx;
+}
+
+/* Same as ff_dirac_golomb_read_32bit(), except that _dst is an int16_t array, so
+ * each LUT row is narrowed element by element instead of copied with memcpy(). */
+int ff_dirac_golomb_read_16bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
+                               int bytes, uint8_t *_dst, int coeffs)
+{
+    int i, b, c_idx = 0;
+    int16_t *dst = (int16_t *)_dst;
+    DiracGolombLUT *future[4], *l = &lut_ctx[2*LUT_SIZE + buf[0]];
+    INIT_RESIDUE(res, 0, 0);
+
+    for (b = 1; b <= bytes; b++) {
+        future[0] = &lut_ctx[buf[b]];
+        future[1] = future[0] + 1*LUT_SIZE;
+        future[2] = future[0] + 2*LUT_SIZE;
+        future[3] = future[0] + 3*LUT_SIZE;
+
+        if ((c_idx + 1) > coeffs)
+            return c_idx;
+
+        if (res_bits && l->sign) {
+            int32_t coeff = 1;
+            APPEND_RESIDUE(res, l->preamble);
+            for (i = 0; i < (res_bits >> 1) - 1; i++) {
+                coeff <<= 1;
+                coeff |= (res >> (RSIZE_BITS - 2*i - 2)) & 1;
+            }
+            dst[c_idx++] = l->sign * (coeff - 1);
+            res_bits = res = 0;
+        }
+        /* Stores a full row; only the first ready_num entries are kept */
+        for (i = 0; i < LUT_BITS; i++)
+            dst[c_idx + i] = l->ready[i];
+        c_idx += l->ready_num;
+        APPEND_RESIDUE(res, l->leftover);
+        /* Next LUT: 3 if a sign bit is pending, 2 if no residue, else its parity */
+        l = future[l->need_s ? 3 : !res_bits ? 2 : res_bits & 1];
+    }
+    return c_idx;
+}
+
+/* Decodes each complete code in the top `bits` bits of r into l (pre-zeroed) */
+static inline void search_for_golomb(DiracGolombLUT *l, residual r, int bits)
+{
+    int r_count = RSIZE_BITS - 1;
+    int bits_start, bits_tot = bits, need_sign = 0;
+    /* READ_BIT(N) yields the next bit of N, walking from the MSB down via N_count */
+#define READ_BIT(N) (((N) >> (N ## _count--)) & 1)
+    /* Each pass decodes one code; bits_start is the offset at which it began */
+    while (1) {
+        int32_t coef = 1;
+        bits_start = (RSIZE_BITS - 1) - r_count;
+        /* A 1 bit terminates the code; a 0 bit is followed by one data bit */
+        while (1) {
+            if (!bits--)
+                goto leftover;
+            if (READ_BIT(r))
+                break;
+
+            coef <<= 1;
+
+            if (!bits--)
+                goto leftover;
+            coef |= READ_BIT(r);
+        }
+        /* Nonzero values are followed by a sign bit (1 means negative) */
+        l->ready[l->ready_num] = coef - 1;
+        if (l->ready[l->ready_num]) {
+            if (!bits--) {
+                need_sign = 1; /* value is complete, only its sign bit is missing */
+                goto leftover;
+            }
+            l->ready[l->ready_num] *= READ_BIT(r) ? -1 : +1;
+        }
+        l->ready_num++;
+        /* Input ended exactly on a code boundary: no leftover to record */
+        if (!bits)
+            return;
+    }
+    /* Input ended inside a code: store it, MSB-aligned, from where it began */
+    leftover:
+        l->leftover      = r << bits_start;
+        l->leftover_bits = bits_tot - bits_start;
+        l->need_s        = need_sign;
+}
+
+/* Parity LUTs - even and odd bit end positions (fills a zero-initialised lut) */
+static void generate_parity_lut(DiracGolombLUT *lut, int even)
+{
+    for (int idx = 0; idx < LUT_SIZE; idx++) {
+        DiracGolombLUT *l = &lut[idx];
+        int symbol_end_loc = -1;
+        uint32_t code;
+        /* res holds the byte value idx aligned to the top of a residual */
+        INIT_RESIDUE(res, idx, LUT_BITS);
+        /* The first set bit at an index of the wanted parity ends the pending code */
+        for (int i = 0; i < LUT_BITS; i++) {
+            const int cond = even ? (i & 1) : !(i & 1);
+            if (((res >> (RSIZE_BITS - i - 1)) & 1) && cond) {
+                symbol_end_loc = i + 2; /* bit count including the bit after the end bit */
+                break;
+            }
+        }
+        /* No end bit, or the bit after it lies past this byte: keep the whole byte */
+        if (symbol_end_loc < 0 || symbol_end_loc > LUT_BITS) {
+            l->preamble      = 0;
+            l->preamble_bits = 0;
+            l->leftover_bits = LUT_BITS;
+            l->leftover      = CONVERT_TO_RESIDUE(idx, l->leftover_bits);
+            if (even)
+                l->need_s    = idx & 1; /* set when the end bit is the byte's last bit */
+            continue;
+        }
+
+        /* Gets bits 0 through to (symbol_end_loc - 1) inclusive */
+        code  = idx >> ((LUT_BITS - 1) - (symbol_end_loc - 1));
+        code &= ((1 << LUT_BITS) - 1) >> (LUT_BITS - symbol_end_loc);
+        l->preamble_bits = symbol_end_loc;
+        l->preamble      = CONVERT_TO_RESIDUE(code, l->preamble_bits);
+        l->sign = ((l->preamble >> (RSIZE_BITS - l->preamble_bits)) & 1) ? -1 : +1;
+        /* Whatever follows the preamble is scanned for complete codes */
+        search_for_golomb(l, res << symbol_end_loc, LUT_BITS - symbol_end_loc);
+    }
+}
+
+/* Builds the reset (off == 0) or needs-one-more-bit (off == 1) LUT in lut */
+static void generate_offset_lut(DiracGolombLUT *lut, int off)
+{
+    for (int idx = 0; idx < LUT_SIZE; idx++) {
+        DiracGolombLUT *l = &lut[idx];
+        INIT_RESIDUE(res, idx, LUT_BITS);
+        /* The first off bits form the preamble and its last bit picks the sign.
+         * off == 0 is handled apart: shifting by RSIZE_BITS would be undefined. */
+        l->preamble      = off ? CONVERT_TO_RESIDUE(res >> (RSIZE_BITS - off), off) : 0;
+        l->preamble_bits = off;
+        l->sign = (off && ((l->preamble >> (RSIZE_BITS - off)) & 1)) ? -1 : +1;
+        /* The remaining bits are scanned for complete codes */
+        search_for_golomb(l, res << off, LUT_BITS - off);
+    }
+}
+
+/* Allocates the four LUTs as one array, fills each one and stores it in *lut_ctx.
+ * Returns 0, or AVERROR(ENOMEM) (leaving *lut_ctx untouched) if allocation fails. */
+av_cold int ff_dirac_golomb_reader_init(DiracGolombLUT **lut_ctx)
+{
+    DiracGolombLUT *tables = av_calloc(4*LUT_SIZE, sizeof(*tables));
+
+    if (!tables)
+        return AVERROR(ENOMEM);
+    /* Layout: [0] parity even=0, [1] parity even=1, [2] off=0, [3] off=1 */
+    for (int t = 0; t < 2; t++) {
+        generate_parity_lut(&tables[(0 + t)*LUT_SIZE], t);
+        generate_offset_lut(&tables[(2 + t)*LUT_SIZE], t);
+    }
+    *lut_ctx = tables;
+    return 0;
+}
+
+av_cold void ff_dirac_golomb_reader_end(DiracGolombLUT **lut_ctx)
+{
+    av_freep(lut_ctx); /* frees the LUT array and sets *lut_ctx to NULL */
+}
diff --git a/libavcodec/dirac_vlc.h b/libavcodec/dirac_vlc.h
new file mode 100644
index 0000000000..523e9ca813
--- /dev/null
+++ b/libavcodec/dirac_vlc.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2016 Open Broadcast Systems Ltd.
+ * Author        2016 Rostislav Pehlivanov <rpehlivanov@obe.tv>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DIRAC_VLC_H
+#define AVCODEC_DIRAC_VLC_H
+
+#include <libavutil/avutil.h>
+
+/* Can be 32 bits wide for some performance gain on some machines, but it will
+ * incorrectly decode very long coefficients (usually only 1 or 2 per frame) */
+typedef uint64_t residual;
+/* Input bits consumed per lookup; each LUT has (1 << LUT_BITS) entries */
+#define LUT_BITS 8
+/* One LUT entry: what the reader needs to know about one input byte value. */
+/* Exactly 64 bytes */
+typedef struct DiracGolombLUT {
+    residual preamble, leftover; /* MSB-aligned bits ending / starting a code that spans bytes */
+    int32_t  ready[LUT_BITS]; /* coefficients lying entirely inside the byte */
+    int32_t  preamble_bits, leftover_bits, ready_num; /* bit counts; valid entries in ready[] */
+    int8_t   need_s, sign; /* need_s: leftover lacks only its sign bit; sign: -1/+1 from the preamble's last bit */
+} DiracGolombLUT;
+/* Allocates and fills the LUTs into *lut_ctx; returns 0 or AVERROR(ENOMEM) */
+av_cold int ff_dirac_golomb_reader_init(DiracGolombLUT **lut_ctx);
+/* Decodes Golomb codes from buf into int32_t values at dst; returns the output index reached */
+int ff_dirac_golomb_read_32bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
+                               int bytes, uint8_t *dst, int coeffs);
+/* Same as the 32-bit reader, but the output at _dst is an array of int16_t */
+int ff_dirac_golomb_read_16bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
+                               int bytes, uint8_t *_dst, int coeffs);
+/* Frees the LUT array through av_freep() */
+av_cold void ff_dirac_golomb_reader_end(DiracGolombLUT **lut_ctx);
+
+#endif /* AVCODEC_DIRAC_VLC_H */
diff --git a/libavcodec/diracdec.c b/libavcodec/diracdec.c
index 7913656991..e95ce9e10e 100644
--- a/libavcodec/diracdec.c
+++ b/libavcodec/diracdec.c
@@ -32,6 +32,7 @@
 #include "internal.h"
 #include "golomb.h"
 #include "dirac_arith.h"
+#include "dirac_vlc.h"
 #include "mpeg12data.h"
 #include "libavcodec/mpegvideo.h"
 #include "mpegvideoencdsp.h"
@@ -125,6 +126,7 @@ typedef struct DiracContext {
     MpegvideoEncDSPContext mpvencdsp;
     VideoDSPContext vdsp;
     DiracDSPContext diracdsp;
+    DiracGolombLUT *reader_ctx;
     DiracVersionInfo version;
     GetBitContext gb;
     AVDiracSeqHeader seq;
@@ -378,6 +380,7 @@ static av_cold int dirac_decode_init(AVCodecContext *avctx)
     s->threads_num_buf = -1;
     s->thread_buf_size = -1;
 
+    ff_dirac_golomb_reader_init(&s->reader_ctx);
     ff_diracdsp_init(&s->diracdsp);
     ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
     ff_videodsp_init(&s->vdsp, 8);
@@ -407,6 +410,8 @@ static av_cold int dirac_decode_end(AVCodecContext *avctx)
     DiracContext *s = avctx->priv_data;
     int i;
 
+    ff_dirac_golomb_reader_end(&s->reader_ctx);
+
     dirac_decode_flush(avctx);
     for (i = 0; i < MAX_FRAMES; i++)
         av_frame_free(&s->all_frames[i].avframe);
@@ -825,10 +830,11 @@ static int decode_hq_slice(DiracContext *s, DiracSlice *slice, uint8_t *tmp_buf)
 
     /* Luma + 2 Chroma planes */
     for (i = 0; i < 3; i++) {
-        int c, coef_num, coef_par, off = 0;
+        int coef_num, coef_par, off = 0;
         int64_t length = s->highquality.size_scaler*get_bits(gb, 8);
         int64_t start = get_bits_count(gb);
         int64_t bits_end = start + 8*length;
+        const uint8_t *addr = align_get_bits(gb);
 
         if (bits_end >= INT_MAX) {
             av_log(s->avctx, AV_LOG_ERROR, "end too far away\n");
@@ -837,17 +843,12 @@ static int decode_hq_slice(DiracContext *s, DiracSlice *slice, uint8_t *tmp_buf)
 
         coef_num = subband_coeffs(s, slice->slice_x, slice->slice_y, i, coeffs_num);
 
-        if (s->pshift) {
-            int32_t *dst = (int32_t *)tmp_buf;
-            for (c = 0; c < coef_num; c++)
-                dst[c] = dirac_get_se_golomb(gb);
-            coef_par = c;
-        } else {
-            int16_t *dst = (int16_t *)tmp_buf;
-            for (c = 0; c < coef_num; c++)
-                dst[c] = dirac_get_se_golomb(gb);
-            coef_par = c;
-        }
+        if (s->pshift)
+            coef_par = ff_dirac_golomb_read_32bit(s->reader_ctx, addr,
+                                                  length, tmp_buf, coef_num);
+        else
+            coef_par = ff_dirac_golomb_read_16bit(s->reader_ctx, addr,
+                                                  length, tmp_buf, coef_num);
 
         if (coef_num > coef_par) {
             const int start_b = coef_par * (4 >> s->pshift);