avcodec/magicyuvenc: put some slice work under parallel execution

Speeds up slice threaded encoding.
2025-10-30 23:18:11 +02:00 · 2023-06-03 10:49:08 +02:00
parent ba32f28498
commit 2342c05e43
1 changed files with 74 additions and 22 deletions
--- a/libavcodec/magicyuvenc.c
+++ b/libavcodec/magicyuvenc.c
@@ -65,8 +65,12 @@ typedef struct MagicYUVContext {
    int                  hshift[4];
    int                  vshift[4];
    uint8_t            **slices;
+    uint8_t            **bitslices;
+    unsigned             bitslice_size;
    unsigned            *slice_pos;
+    unsigned            *slice_size;
    unsigned             tables_size;
+    PTable              *counts;
    uint8_t             *decorrelate_buf[2];
    HuffEntry            he[4][256];
    LLVidEncDSPContext   llvidencdsp;
@@ -206,15 +210,20 @@ static av_cold int magy_encode_init(AVCodecContext *avctx)
    s->nb_slices = FFMAX(1, s->nb_slices);
    s->slice_height = FFALIGN((avctx->height + s->nb_slices - 1) / s->nb_slices, 1 << s->vshift[1]);
    s->slice_pos = av_calloc(s->nb_slices * s->planes, sizeof(*s->slice_pos));
+    s->slice_size = av_calloc(s->nb_slices * s->planes, sizeof(*s->slice_size));
    s->slices = av_calloc(s->nb_slices * s->planes, sizeof(*s->slices));
-    if (!s->slices || !s->slice_pos)
+    s->bitslices = av_calloc(s->nb_slices * s->planes, sizeof(*s->bitslices));
+    s->counts = av_calloc(s->nb_slices * s->planes * 256, sizeof(*s->counts));
+    if (!s->slices || !s->slice_pos || !s->counts || !s->slice_size)
        return AVERROR(ENOMEM);

+    s->bitslice_size = avctx->width * (s->slice_height + 2) + AV_INPUT_BUFFER_PADDING_SIZE;
    for (int n = 0; n < s->nb_slices; n++) {
        for (int i = 0; i < s->planes; i++) {
+            s->bitslices[n * s->planes + i] = av_malloc(s->bitslice_size);
            s->slices[n * s->planes + i] = av_malloc(avctx->width * (s->slice_height + 2) +
-                                        AV_INPUT_BUFFER_PADDING_SIZE);
-            if (!s->slices[n * s->planes + i]) {
+                                                     AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!s->slices[n * s->planes + i] || !s->bitslices[n * s->planes + i]) {
                av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer.\n");
                return AVERROR(ENOMEM);
            }
@@ -358,6 +367,20 @@ static void magy_huffman_compute_bits(PTable *prob_table, HuffEntry *distincts,
    }
 }

+static int count_plane_slice(AVCodecContext *avctx, int n, int plane)
+{
+    MagicYUVContext *s = avctx->priv_data;
+    const uint8_t *dst = s->slices[n * s->planes + plane];
+    PTable *counts = s->counts + 256 * (n * s->planes + plane);
+
+    memset(counts, 0, sizeof(*counts) * 256);
+
+    count_usage(dst, AV_CEIL_RSHIFT(avctx->width, s->hshift[plane]),
+                AV_CEIL_RSHIFT(s->slice_height, s->vshift[plane]), counts);
+
+    return 0;
+}
+
 static int encode_table(AVCodecContext *avctx,
                        PutBitContext *pb, HuffEntry *he, int plane)
 {
@@ -366,15 +389,15 @@ static int encode_table(AVCodecContext *avctx,
    uint16_t codes_counts[33] = { 0 };

    for (int n = 0; n < s->nb_slices; n++) {
-        const uint8_t *dst = s->slices[n * s->planes + plane];
+        PTable *slice_counts = s->counts + 256 * (n * s->planes + plane);

-        count_usage(dst, AV_CEIL_RSHIFT(avctx->width, s->hshift[plane]),
-                    AV_CEIL_RSHIFT(s->slice_height, s->vshift[plane]), counts);
+        for (int i = 0; i < 256; i++)
+            counts[i].prob = slice_counts[i].prob;
+    }

-        for (int i = 0; i < 256; i++) {
-            counts[i].prob++;
-            counts[i].value = i;
-        }
+    for (int i = 0; i < 256; i++) {
+        counts[i].prob++;
+        counts[i].value = i;
    }

    magy_huffman_compute_bits(counts, he, codes_counts, 256, 12);
@@ -389,8 +412,8 @@ static int encode_table(AVCodecContext *avctx,
    return 0;
 }

-static int encode_slice(uint8_t *src, uint8_t *dst, int dst_size,
-                        int width, int height, HuffEntry *he, int prediction)
+static int encode_plane_slice(uint8_t *src, uint8_t *dst, int dst_size,
+                              int width, int height, HuffEntry *he, int prediction)
 {
    PutBitContext pb;
    int i, j;
@@ -420,6 +443,31 @@ static int encode_slice(uint8_t *src, uint8_t *dst, int dst_size,
    return put_bytes_output(&pb);
 }

+static int encode_slice(AVCodecContext *avctx, void *tdata,
+                        int n, int threadnr)
+{
+    MagicYUVContext *s = avctx->priv_data;
+    const int slice_height = s->slice_height;
+    const int last_height = FFMIN(slice_height, avctx->height - n * slice_height);
+    const int height = (n < (s->nb_slices - 1)) ? slice_height : last_height;
+    PutByteContext pb;
+
+    for (int i = 0; i < s->planes; i++) {
+        bytestream2_init_writer(&pb, s->bitslices[n + s->planes + i],
+                                s->bitslice_size);
+
+        s->slice_size[n * s->planes + i] =
+            encode_plane_slice(s->slices[n * s->planes + i],
+                               s->bitslices[n * s->planes + i],
+                               bytestream2_get_bytes_left_p(&pb),
+                               AV_CEIL_RSHIFT(avctx->width, s->hshift[i]),
+                               AV_CEIL_RSHIFT(height, s->vshift[i]),
+                               s->he[i], s->frame_pred);
+    }
+
+    return 0;
+}
+
 static int predict_slice(AVCodecContext *avctx, void *tdata,
                         int n, int threadnr)
 {
@@ -469,6 +517,9 @@ static int predict_slice(AVCodecContext *avctx, void *tdata,
        }
    }

+    for (int p = 0; p < s->planes; p++)
+        count_plane_slice(avctx, n, p);
+
    return 0;
 }

@@ -528,18 +579,14 @@ static int magy_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
    s->tables_size = put_bytes_count(&s->pb, 1);
    bytestream2_skip_p(&pb, s->tables_size);

+    avctx->execute2(avctx, encode_slice, NULL, NULL, s->nb_slices);
+
    for (int n = 0; n < s->nb_slices; n++) {
        for (int i = 0; i < s->planes; i++) {
-            unsigned slice_size;
-
            s->slice_pos[n * s->planes + i] = bytestream2_tell_p(&pb);
-            slice_size = encode_slice(s->slices[n * s->planes + i],
-                                      pkt->data + bytestream2_tell_p(&pb),
-                                      bytestream2_get_bytes_left_p(&pb),
-                                      AV_CEIL_RSHIFT(frame->width, s->hshift[i]),
-                                      AV_CEIL_RSHIFT(slice_height, s->vshift[i]),
-                                      s->he[i], s->frame_pred);
-            bytestream2_skip_p(&pb, slice_size);
+
+            bytestream2_put_buffer(&pb, s->bitslices[n * s->planes + i],
+                                   s->slice_size[n * s->planes + i]);
        }
    }

@@ -564,9 +611,14 @@ static av_cold int magy_encode_close(AVCodecContext *avctx)
    MagicYUVContext *s = avctx->priv_data;

    av_freep(&s->slice_pos);
-    for (int i = 0; i < s->planes && s->slices; i++)
+    av_freep(&s->slice_size);
+    for (int i = 0; i < s->planes * s->nb_slices && s->slices; i++)
        av_freep(&s->slices[i]);
+    for (int i = 0; i < s->planes * s->nb_slices && s->bitslices; i++)
+        av_freep(&s->bitslices[i]);
+    av_freep(&s->counts);
    av_freep(&s->slices);
+    av_freep(&s->bitslices);
    av_freep(&s->decorrelate_buf);

    return 0;