From 6b8d53f72801dd17eb5804aa3ba1134884475309 Mon Sep 17 00:00:00 2001
From: Paul B Mahol <onemda@gmail.com>
Date: Fri, 2 Jun 2023 21:51:54 +0200
Subject: [PATCH] avcodec/magicyuvenc: add slice encoding support

---
 libavcodec/magicyuvenc.c | 210 ++++++++++++++++++++++-----------------
 1 file changed, 118 insertions(+), 92 deletions(-)

diff --git a/libavcodec/magicyuvenc.c b/libavcodec/magicyuvenc.c
index 9e41c1b0fe..a12ef5a33d 100644
--- a/libavcodec/magicyuvenc.c
+++ b/libavcodec/magicyuvenc.c
@@ -22,6 +22,7 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "libavutil/cpu.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/qsort.h"
@@ -63,8 +64,8 @@ typedef struct MagicYUVContext {
     int                  correlate;
     int                  hshift[4];
     int                  vshift[4];
-    uint8_t             *slices[4];
-    unsigned             slice_pos[4];
+    uint8_t            **slices;
+    unsigned            *slice_pos;
     unsigned             tables_size;
     uint8_t             *decorrelate_buf[2];
     HuffEntry            he[4][256];
@@ -150,7 +151,6 @@ static av_cold int magy_encode_init(AVCodecContext *avctx)
 {
     MagicYUVContext *s = avctx->priv_data;
     PutByteContext pb;
-    int i;
 
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_GBRP:
@@ -201,14 +201,23 @@ static av_cold int magy_encode_init(AVCodecContext *avctx)
 
     s->planes = av_pix_fmt_count_planes(avctx->pix_fmt);
 
-    s->nb_slices = 1;
+    s->nb_slices = (avctx->slices <= 0) ? av_cpu_count() : avctx->slices; /* no user request: one slice per CPU */
+    s->nb_slices = FFMAX(1, FFMIN(s->nb_slices, avctx->height >> s->vshift[1]));
+    s->slice_height = FFALIGN((avctx->height + s->nb_slices - 1) / s->nb_slices, 1 << s->vshift[1]);
+    s->nb_slices = (avctx->height + s->slice_height - 1) / s->slice_height; /* drop slices the round-up left empty */
+    s->slice_pos = av_calloc(s->nb_slices * s->planes, sizeof(*s->slice_pos));
+    s->slices = av_calloc(s->nb_slices * s->planes, sizeof(*s->slices));
+    if (!s->slices || !s->slice_pos)
+        return AVERROR(ENOMEM);
 
-    for (i = 0; i < s->planes; i++) {
-        s->slices[i] = av_malloc(avctx->width * (avctx->height + 2) +
-                                 AV_INPUT_BUFFER_PADDING_SIZE);
-        if (!s->slices[i]) {
-            av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer.\n");
-            return AVERROR(ENOMEM);
+    for (int n = 0; n < s->nb_slices; n++) {
+        for (int i = 0; i < s->planes; i++) {
+            s->slices[n * s->planes + i] = av_malloc(avctx->width * (s->slice_height + 2) +
+                                        AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!s->slices[n * s->planes + i]) {
+                av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer.\n");
+                return AVERROR(ENOMEM);
+            }
         }
     }
 
@@ -263,15 +272,12 @@ static void calculate_codes(HuffEntry *he, uint16_t codes_count[33])
     }
 }
 
-static void count_usage(uint8_t *src, int width,
+static void count_usage(const uint8_t *src, int width, /* adds the byte histogram of src to counts[].prob */
                         int height, PTable *counts)
 {
-    int i, j;
-
-    for (j = 0; j < height; j++) {
-        for (i = 0; i < width; i++) {
+    for (int j = 0; j < height; j++) { /* one row per pass; rows are packed width bytes apart */
+        for (int i = 0; i < width; i++) /* tally every byte value seen in this row */
             counts[src[i]].prob++;
-        }
         src += width;
     }
 }
@@ -352,26 +358,30 @@ static void magy_huffman_compute_bits(PTable *prob_table, HuffEntry *distincts,
     }
 }
 
-static int encode_table(AVCodecContext *avctx, uint8_t *dst,
-                        int width, int height,
-                        PutBitContext *pb, HuffEntry *he)
+static int encode_table(AVCodecContext *avctx,
+                        PutBitContext *pb, HuffEntry *he, int plane)
 {
+    MagicYUVContext *s = avctx->priv_data;
     PTable counts[256] = { {0} };
     uint16_t codes_counts[33] = { 0 };
-    int i;
 
-    count_usage(dst, width, height, counts);
+    for (int n = 0; n < s->nb_slices; n++) { /* one histogram per plane, summed over its slices */
+        const uint8_t *dst = s->slices[n * s->planes + plane];
 
-    for (i = 0; i < 256; i++) {
-        counts[i].prob++;
-        counts[i].value = i;
+        count_usage(dst, AV_CEIL_RSHIFT(avctx->width, s->hshift[plane]), /* last slice may hold fewer rows */
+                    AV_CEIL_RSHIFT(FFMIN(s->slice_height, avctx->height - n * s->slice_height), s->vshift[plane]), counts);
+    }
+
+    for (int i = 0; i < 256; i++) { /* once per table, not once per slice */
+        counts[i].prob++;
+        counts[i].value = i;
     }
 
     magy_huffman_compute_bits(counts, he, codes_counts, 256, 12);
 
     calculate_codes(he, codes_counts);
 
-    for (i = 0; i < 256; i++) {
+    for (int i = 0; i < 256; i++) {
         put_bits(pb, 1, 0);
         put_bits(pb, 7, he[i].len);
     }
@@ -410,13 +420,66 @@ static int encode_slice(uint8_t *src, uint8_t *dst, int dst_size,
     return put_bytes_output(&pb);
 }
 
+/* Per-slice job run through avctx->execute2(): decorrelates (if s->correlate) and predicts slice n into s->slices. */
+static int predict_slice(AVCodecContext *avctx, void *tdata,
+                         int n, int threadnr)
+{
+    const int aligned_width = FFALIGN(avctx->width, 16);
+    MagicYUVContext *s = avctx->priv_data;
+    const int slice_height = s->slice_height;
+    const int last_height = FFMIN(slice_height, avctx->height - n * slice_height);
+    const int height = (n < (s->nb_slices - 1)) ? slice_height : last_height; /* rows present in slice n */
+    const int width = avctx->width;
+    AVFrame *frame = tdata;
+
+    if (s->correlate) {
+        uint8_t *decorrelated[2] = { s->decorrelate_buf[0] + n * slice_height * aligned_width,
+                                     s->decorrelate_buf[1] + n * slice_height * aligned_width };
+        const int decorrelate_linesize = aligned_width;
+        const uint8_t *const data[4] = { decorrelated[0], frame->data[0] + n * slice_height * frame->linesize[0],
+                                         decorrelated[1], frame->data[3] + n * slice_height * frame->linesize[3] };
+        const int linesize[4]  = { decorrelate_linesize, frame->linesize[0],
+                                   decorrelate_linesize, frame->linesize[3] };
+        const uint8_t *g = frame->data[0] + n * slice_height * frame->linesize[0];
+        const uint8_t *b = frame->data[1] + n * slice_height * frame->linesize[1];
+        const uint8_t *r = frame->data[2] + n * slice_height * frame->linesize[2];
+
+        /* Use height, not slice_height: a short last slice has no rows beyond height to read or write. */
+        for (int i = 0; i < height; i++) {
+            s->llvidencdsp.diff_bytes(decorrelated[0], b, g, width);
+            s->llvidencdsp.diff_bytes(decorrelated[1], r, g, width);
+            g += frame->linesize[0];
+            b += frame->linesize[1];
+            r += frame->linesize[2];
+            decorrelated[0] += decorrelate_linesize;
+            decorrelated[1] += decorrelate_linesize;
+        }
+
+        for (int i = 0; i < s->planes; i++) {
+            s->predict(s, data[i], s->slices[n * s->planes + i], linesize[i],
+                       frame->width, height);
+        }
+    } else {
+        for (int i = 0; i < s->planes; i++) {
+            s->predict(s, frame->data[i] + n * (slice_height >> s->vshift[i]) * frame->linesize[i],
+                       s->slices[n * s->planes + i],
+                       frame->linesize[i],
+                       AV_CEIL_RSHIFT(frame->width, s->hshift[i]),
+                       AV_CEIL_RSHIFT(height, s->vshift[i]));
+        }
+    }
+
+    return 0;
+}
+
 static int magy_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                              const AVFrame *frame, int *got_packet)
 {
     MagicYUVContext *s = avctx->priv_data;
     PutByteContext pb;
     const int width = avctx->width, height = avctx->height;
-    int pos, slice, i, j, ret = 0;
+    const int slice_height = s->slice_height;
+    int pos, ret = 0;
 
     ret = ff_alloc_packet(avctx, pkt, (256 + 4 * s->nb_slices + width * height) *
                           s->planes + 256);
@@ -439,92 +502,53 @@ static int magy_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     bytestream2_put_le32(&pb, avctx->width);
     bytestream2_put_le32(&pb, avctx->height);
     bytestream2_put_le32(&pb, avctx->width);
-    bytestream2_put_le32(&pb, avctx->height);
+    bytestream2_put_le32(&pb, slice_height);
     bytestream2_put_le32(&pb, 0);
 
-    for (i = 0; i < s->planes; i++) {
+    for (int i = 0; i < s->planes; i++) {
         bytestream2_put_le32(&pb, 0);
-        for (j = 1; j < s->nb_slices; j++) {
+        for (int j = 1; j < s->nb_slices; j++)
             bytestream2_put_le32(&pb, 0);
-        }
     }
 
     bytestream2_put_byte(&pb, s->planes);
 
-    for (i = 0; i < s->planes; i++) {
-        for (slice = 0; slice < s->nb_slices; slice++) {
-            bytestream2_put_byte(&pb, i);
-        }
+    for (int i = 0; i < s->planes; i++) {
+        for (int n = 0; n < s->nb_slices; n++)
+            bytestream2_put_byte(&pb, n * s->planes + i);
     }
 
-    if (s->correlate) {
-        uint8_t *decorrelated[2] = { s->decorrelate_buf[0],
-                                                 s->decorrelate_buf[1] };
-        const int decorrelate_linesize = FFALIGN(width, 16);
-        const uint8_t *const data[4] = { decorrelated[0], frame->data[0],
-                                         decorrelated[1], frame->data[3] };
-        const uint8_t *r, *g, *b;
-        const int linesize[4]  = { decorrelate_linesize, frame->linesize[0],
-                                   decorrelate_linesize, frame->linesize[3] };
-
-        g = frame->data[0];
-        b = frame->data[1];
-        r = frame->data[2];
-
-        for (i = 0; i < height; i++) {
-            s->llvidencdsp.diff_bytes(decorrelated[0], b, g, width);
-            s->llvidencdsp.diff_bytes(decorrelated[1], r, g, width);
-            g += frame->linesize[0];
-            b += frame->linesize[1];
-            r += frame->linesize[2];
-            decorrelated[0] += decorrelate_linesize;
-            decorrelated[1] += decorrelate_linesize;
-        }
-
-        for (i = 0; i < s->planes; i++) {
-            for (slice = 0; slice < s->nb_slices; slice++) {
-                s->predict(s, data[i], s->slices[i], linesize[i],
-                           frame->width, frame->height);
-            }
-        }
-    } else {
-        for (i = 0; i < s->planes; i++) {
-            for (slice = 0; slice < s->nb_slices; slice++) {
-                s->predict(s, frame->data[i], s->slices[i], frame->linesize[i],
-                           AV_CEIL_RSHIFT(frame->width, s->hshift[i]),
-                           AV_CEIL_RSHIFT(frame->height, s->vshift[i]));
-            }
-        }
-    }
+    avctx->execute2(avctx, predict_slice, (void *)frame, NULL, s->nb_slices);
 
     init_put_bits(&s->pb, pkt->data + bytestream2_tell_p(&pb), bytestream2_get_bytes_left_p(&pb));
 
-    for (i = 0; i < s->planes; i++) {
-        encode_table(avctx, s->slices[i],
-                     AV_CEIL_RSHIFT(frame->width,  s->hshift[i]),
-                     AV_CEIL_RSHIFT(frame->height, s->vshift[i]),
-                     &s->pb, s->he[i]);
-    }
+    for (int i = 0; i < s->planes; i++)
+        encode_table(avctx, &s->pb, s->he[i], i);
+
     s->tables_size = put_bytes_count(&s->pb, 1);
     bytestream2_skip_p(&pb, s->tables_size);
 
-    for (i = 0; i < s->planes; i++) {
-        unsigned slice_size;
+    for (int n = 0; n < s->nb_slices; n++) { /* the last slice may cover fewer than slice_height rows */
+        for (int i = 0; i < s->planes; i++) {
+            unsigned slice_size;
 
-        s->slice_pos[i] = bytestream2_tell_p(&pb);
-        slice_size = encode_slice(s->slices[i], pkt->data + bytestream2_tell_p(&pb),
-                                  bytestream2_get_bytes_left_p(&pb),
-                                  AV_CEIL_RSHIFT(frame->width,  s->hshift[i]),
-                                  AV_CEIL_RSHIFT(frame->height, s->vshift[i]),
-                                  s->he[i], s->frame_pred);
-        bytestream2_skip_p(&pb, slice_size);
+            s->slice_pos[n * s->planes + i] = bytestream2_tell_p(&pb); /* remembered for the offset table */
+            slice_size = encode_slice(s->slices[n * s->planes + i],
+                                      pkt->data + bytestream2_tell_p(&pb),
+                                      bytestream2_get_bytes_left_p(&pb),
+                                      AV_CEIL_RSHIFT(frame->width, s->hshift[i]),
+                                      AV_CEIL_RSHIFT(FFMIN(slice_height, height - n * slice_height), s->vshift[i]),
+                                      s->he[i], s->frame_pred);
+            bytestream2_skip_p(&pb, slice_size);
+        }
     }
 
     pos = bytestream2_tell_p(&pb);
     bytestream2_seek_p(&pb, 32, SEEK_SET);
     bytestream2_put_le32(&pb, s->slice_pos[0] - 32);
-    for (i = 0; i < s->planes; i++) {
-        bytestream2_put_le32(&pb, s->slice_pos[i] - 32);
+    for (int i = 0; i < s->planes; i++) {
+        for (int n = 0; n < s->nb_slices; n++)
+           bytestream2_put_le32(&pb, s->slice_pos[n * s->planes + i] - 32);
     }
     bytestream2_seek_p(&pb, pos, SEEK_SET);
 
@@ -538,10 +562,11 @@ static int magy_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 static av_cold int magy_encode_close(AVCodecContext *avctx)
 {
     MagicYUVContext *s = avctx->priv_data;
-    int i;
 
-    for (i = 0; i < s->planes; i++)
+    av_freep(&s->slice_pos);
+    for (int i = 0; s->slices && i < s->planes * s->nb_slices; i++) /* init allocates one buffer per (slice, plane) */
         av_freep(&s->slices[i]);
+    av_freep(&s->slices);
     av_freep(&s->decorrelate_buf);
 
     return 0;
@@ -570,6 +595,7 @@ const FFCodec ff_magicyuv_encoder = {
     .p.type           = AVMEDIA_TYPE_VIDEO,
     .p.id             = AV_CODEC_ID_MAGICYUV,
     .p.capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
+                        AV_CODEC_CAP_SLICE_THREADS |
                         AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE,
     .priv_data_size   = sizeof(MagicYUVContext),
     .p.priv_class     = &magicyuv_class,