ffv1: add a Vulkan-based decoder

This patch adds a fully-featured level 3 and 4 decoder for FFv1, supporting Golomb and all Range coding variants, all pixel formats, and all features, except for the newly added floating-point formats. On a 6000 Ada, for 3840x2160 bgr0 content at 50Mbps (standard desktop recording), it is able to do 400fps. An Alder Lake with 24 threads can barely do 100fps.
2025-08-04 22:03:09 +02:00 · 2025-03-10 03:04:39 +00:00
parent caff29dbb1
commit 6bad55eb17
15 changed files with 2165 additions and 8 deletions
--- a/2
+++ b/2
@ -3195,6 +3195,8 @@ av1_videotoolbox_hwaccel_deps="videotoolbox"
 av1_videotoolbox_hwaccel_select="av1_decoder"
 av1_vulkan_hwaccel_deps="vulkan"
 av1_vulkan_hwaccel_select="av1_decoder"
+ffv1_vulkan_hwaccel_deps="vulkan spirv_compiler"
+ffv1_vulkan_hwaccel_select="ffv1_decoder"
 h263_vaapi_hwaccel_deps="vaapi"
 h263_vaapi_hwaccel_select="h263_decoder"
 h263_videotoolbox_hwaccel_deps="videotoolbox"
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@ -1017,6 +1017,7 @@ OBJS-$(CONFIG_AV1_VAAPI_HWACCEL)          += vaapi_av1.o
 OBJS-$(CONFIG_AV1_VDPAU_HWACCEL)          += vdpau_av1.o
 OBJS-$(CONFIG_AV1_VIDEOTOOLBOX_HWACCEL)   += videotoolbox_av1.o
 OBJS-$(CONFIG_AV1_VULKAN_HWACCEL)         += vulkan_decode.o vulkan_av1.o
+OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL)        += vulkan_decode.o ffv1_vulkan.o vulkan_ffv1.o
 OBJS-$(CONFIG_H263_VAAPI_HWACCEL)         += vaapi_mpeg4.o
 OBJS-$(CONFIG_H263_VIDEOTOOLBOX_HWACCEL)  += videotoolbox.o
 OBJS-$(CONFIG_H264_D3D11VA_HWACCEL)       += dxva2_h264.o
--- a/libavcodec/ffv1dec.c
+++ b/libavcodec/ffv1dec.c
@ -387,6 +387,9 @@ static int decode_slice(AVCodecContext *c, void *arg)
 static enum AVPixelFormat get_pixel_format(FFV1Context *f)
 {
    enum AVPixelFormat pix_fmts[] = {
+#if CONFIG_FFV1_VULKAN_HWACCEL
+        AV_PIX_FMT_VULKAN,
+#endif
        f->pix_fmt,
        AV_PIX_FMT_NONE,
    };
@ -889,6 +892,9 @@ const FFCodec ff_ffv1_decoder = {
                      FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM |
                      FF_CODEC_CAP_USES_PROGRESSFRAMES,
    .hw_configs     = (const AVCodecHWConfigInternal *const []) {
+#if CONFIG_FFV1_VULKAN_HWACCEL
+        HWACCEL_VULKAN(ffv1),
+#endif
        NULL
    },
 };
--- a/libavcodec/hwaccels.h
+++ b/libavcodec/hwaccels.h
@ -28,6 +28,7 @@ extern const struct FFHWAccel ff_av1_vaapi_hwaccel;
 extern const struct FFHWAccel ff_av1_vdpau_hwaccel;
 extern const struct FFHWAccel ff_av1_videotoolbox_hwaccel;
 extern const struct FFHWAccel ff_av1_vulkan_hwaccel;
+extern const struct FFHWAccel ff_ffv1_vulkan_hwaccel;
 extern const struct FFHWAccel ff_h263_vaapi_hwaccel;
 extern const struct FFHWAccel ff_h263_videotoolbox_hwaccel;
 extern const struct FFHWAccel ff_h264_d3d11va_hwaccel;
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@ -11,6 +11,12 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER)  +=  vulkan/common.o \
 					vulkan/ffv1_enc_vlc.o vulkan/ffv1_enc_ac.o \
 					vulkan/ffv1_enc.o vulkan/ffv1_enc_rgb.o

+OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL)  +=  vulkan/common.o \
+					vulkan/rangecoder.o vulkan/ffv1_vlc.o \
+					vulkan/ffv1_common.o vulkan/ffv1_reset.o \
+					vulkan/ffv1_dec_setup.o vulkan/ffv1_dec.o \
+					vulkan/ffv1_dec_rct.o
+
 VULKAN = $(subst $(SRC_PATH)/,,$(wildcard $(SRC_PATH)/libavcodec/vulkan/*.comp))
 .SECONDARY: $(VULKAN:.comp=.c)
 libavcodec/vulkan/%.c: TAG = VULKAN
--- a/libavcodec/vulkan/common.comp
+++ b/libavcodec/vulkan/common.comp
@ -26,6 +26,10 @@ layout(buffer_reference, buffer_reference_align = 1) buffer u8vec2buf {
    u8vec2 v;
 };

+layout(buffer_reference, buffer_reference_align = 1) buffer u8vec4buf {
+    u8vec4 v;
+};
+
 layout(buffer_reference, buffer_reference_align = 2) buffer u16buf {
    uint16_t v;
 };
@ -182,3 +186,94 @@ uint32_t put_bytes_count(in PutBitContext pb)
    uint64_t num_bytes = (pb.buf - pb.buf_start) + ((BUF_BITS - pb.bit_left) >> 3);
    return uint32_t(num_bytes);
 }
+
+/* Bit reader state: a 64-bit cache (`bits`) that is consumed from its most
+ * significant bit and refilled from a byte stream addressed by raw 64-bit
+ * buffer addresses. */
+struct GetBitContext {
+    uint64_t buf_start; /* address of the first byte of the stream */
+    uint64_t buf;       /* address of the next byte to be loaded into the cache */
+    uint64_t buf_end;   /* buf_start + stream length in bytes */
+
+    uint64_t bits;      /* cache; the next unread bit is the MSB */
+    uint bits_valid;    /* number of unread bits currently held in `bits` */
+    uint size_in_bits;  /* stream length in bits */
+};
+
+/* Replaces the whole cache with the next 8 bytes of the stream and advances
+ * the read address by 8. The component swizzle (.wzyx) puts the first byte
+ * read into the most significant position of each 32-bit word.
+ * Requires a GetBitContext named `gb` in scope. No comparison against
+ * gb.buf_end is performed here. */
+#define LOAD64()                                       \
+    {                                                  \
+        u8vec4buf ptr = u8vec4buf(gb.buf);             \
+        uint32_t rf1 = pack32((ptr[0].v).wzyx);        \
+        uint32_t rf2 = pack32((ptr[1].v).wzyx);        \
+        gb.buf += 8;                                   \
+        gb.bits = uint64_t(rf1) << 32 | uint64_t(rf2); \
+        gb.bits_valid = 64;                            \
+    }
+
+/* Appends the next 4 bytes of the stream directly below the bits that are
+ * still unread in the cache, and advances the read address by 4.
+ * The shift amount is (32 - gb.bits_valid), so this is only meaningful while
+ * gb.bits_valid <= 32. Requires a GetBitContext named `gb` in scope.
+ * No comparison against gb.buf_end is performed here. */
+#define RELOAD32()                                                \
+    {                                                             \
+        u8vec4buf ptr = u8vec4buf(gb.buf);                        \
+        uint32_t rf = pack32((ptr[0].v).wzyx);                    \
+        gb.buf += 4;                                              \
+        gb.bits = uint64_t(rf) << (32 - gb.bits_valid) | gb.bits; \
+        gb.bits_valid += 32;                                      \
+    }
+
+/* Points the bit reader at `len` bytes starting at `data` and immediately
+ * loads the first 64 bits of the stream into the cache. */
+void init_get_bits(inout GetBitContext gb, u8buf data, uint64_t len)
+{
+    const uint64_t base = uint64_t(data);
+
+    gb.buf_start    = base;
+    gb.buf          = base;
+    gb.buf_end      = base + len;
+    gb.size_in_bits = uint(len) * 8;
+
+    /* Fill the cache right away */
+    LOAD64()
+}
+
+/* Reads and consumes a single bit. The cache is fully reloaded (64 bits)
+ * only when it is completely empty. */
+bool get_bit(inout GetBitContext gb)
+{
+    if (gb.bits_valid == 0)
+        LOAD64()
+
+    /* The next unread bit is always the MSB of the cache */
+    bool val = bool(gb.bits >> (64 - 1));
+    gb.bits <<= 1;
+    gb.bits_valid--;
+    return val;
+}
+
+/* Reads and consumes `n` bits, returned right-aligned. Returns 0 for n == 0.
+ * A single 32-bit reload is done when the cache holds fewer than `n` bits,
+ * so `n` is expected to be at most 32. */
+uint get_bits(inout GetBitContext gb, uint n)
+{
+    /* Avoids a shift by 64 below */
+    if (n == 0)
+        return 0;
+
+    if (n > gb.bits_valid)
+        RELOAD32()
+
+    uint val = uint(gb.bits >> (64 - n));
+    gb.bits <<= n;
+    gb.bits_valid -= n;
+    return val;
+}
+
+/* Returns the next `n` bits right-aligned without consuming them (the cache
+ * may still be refilled, hence the inout context). Returns 0 for n == 0,
+ * matching get_bits(); without the guard the expression below would shift a
+ * 64-bit value by 64, which has an undefined result in GLSL.
+ * `n` is expected to be at most 32. */
+uint show_bits(inout GetBitContext gb, uint n)
+{
+    if (n == 0)
+        return 0;
+
+    if (n > gb.bits_valid)
+        RELOAD32()
+
+    return uint(gb.bits >> (64 - n));
+}
+
+/* Discards the next `n` bits. A single 32-bit reload is done when the cache
+ * holds fewer than `n` bits, so `n` is expected to be at most 32. */
+void skip_bits(inout GetBitContext gb, uint n)
+{
+    if (n > gb.bits_valid)
+        RELOAD32()
+
+    gb.bits <<= n;
+    gb.bits_valid -= n;
+}
+
+/* Number of bits consumed so far: everything loaded from the stream minus
+ * what is still waiting in the cache. */
+uint tell_bits(in GetBitContext gb)
+{
+    const uint loaded_bits = uint(gb.buf - gb.buf_start) * 8;
+    return loaded_bits - gb.bits_valid;
+}
+
+/* Number of bits not yet consumed: the bits not yet loaded from the stream
+ * plus the bits still waiting in the cache. */
+uint left_bits(in GetBitContext gb)
+{
+    const uint loaded_bits = uint(gb.buf - gb.buf_start) * 8;
+    return gb.size_in_bits - loaded_bits + gb.bits_valid;
+}
--- a/libavcodec/vulkan/ffv1_common.comp
+++ b/libavcodec/vulkan/ffv1_common.comp
@ -22,7 +22,12 @@

 struct SliceContext {
    RangeCoder c;
+
+#if !defined(DECODE)
    PutBitContext pb; /* 8*8 bytes */
+#else
+    GetBitContext gb;
+#endif

    ivec2 slice_dim;
    ivec2 slice_pos;
--- a/libavcodec/vulkan/ffv1_dec.comp
+++ b/libavcodec/vulkan/ffv1_dec.comp
@ -0,0 +1,276 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne <dev@lynne.ee>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Computes the coding context and the predicted value for one sample.
+ *
+ *   pos  absolute position of the sample in image plane dst[p]
+ *   off  position of the sample relative to the slice origin
+ *   p    plane index into dst[]
+ *   sw   width of the slice in samples for this plane
+ *   quant_table_idx  which quantization table set to use
+ *
+ * Neighbours that fall outside the slice are either taken as 0 or redirected
+ * to a sample inside the slice (see below).
+ * Returns ivec2(context, prediction); the context is signed. */
+ivec2 get_pred(ivec2 pos, ivec2 off, int p, int sw, uint8_t quant_table_idx)
+{
+    /* In the first column there is no left neighbour: adding (1, -1) turns
+     * the "left" load into the sample directly above, and the "top-left"
+     * load into the sample two rows above. */
+    const ivec2 yoff_border1 = off.x == 0 ? ivec2(1, -1) : ivec2(0, 0);
+
+    /* top[0] = top-left, top[1] = top, top[2] = top-right */
+    VTYPE3 top  = VTYPE3(TYPE(0),
+                         TYPE(0),
+                         TYPE(0));
+    if (off.y > 0 && off != ivec2(0, 1))
+        top[0] = TYPE(imageLoad(dst[p], pos + ivec2(-1, -1) + yoff_border1)[0]);
+    if (off.y > 0) {
+        top[1] = TYPE(imageLoad(dst[p], pos + ivec2(0, -1))[0]);
+        /* In the last column the top-right load is clamped to the top sample */
+        top[2] = TYPE(imageLoad(dst[p], pos + ivec2(min(1, sw - off.x - 1), -1))[0]);
+    }
+
+    /* Left neighbour (0 for the very first sample of the slice) */
+    TYPE cur = TYPE(0);
+    if (off != ivec2(0, 0))
+        cur = TYPE(imageLoad(dst[p], pos + ivec2(-1,  0) + yoff_border1)[0]);
+
+    int base = quant_table[quant_table_idx][0][(cur - top[0]) & MAX_QUANT_TABLE_MASK] +
+               quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] +
+               quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK];
+
+    /* The two extra neighbours (two to the left, two above) only contribute
+     * when tables 3 or 4 have a non-zero entry at index 127 */
+    if ((quant_table[quant_table_idx][3][127] != 0) ||
+        (quant_table[quant_table_idx][4][127] != 0)) {
+        if (off.x > 0 && off != ivec2(1, 0)) {
+            const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0);
+            TYPE cur2 = TYPE(imageLoad(dst[p], pos + ivec2(-2,  0) + yoff_border2)[0]);
+            base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK];
+        }
+        if (off.y > 1) {
+            TYPE top2 = TYPE(imageLoad(dst[p], pos + ivec2(0, -2))[0]);
+            base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK];
+        }
+    }
+
+    /* context, prediction */
+    return ivec2(base, predict(cur, VTYPE2(top)));
+}
+
+#ifndef GOLOMB
+/* Decodes one signed integer with the adaptive range coder.
+ *
+ * `state` is the address of this context's adaptive state bytes; the offsets
+ * quoted in the comments below are relative to that address.
+ * Layout: zero flag (0), exponent in unary (1..10), sign (11..21),
+ * mantissa bits (22..31).
+ * If the exponent exceeds 31 the global `corrupt` flag is set and 0 is
+ * returned. */
+int get_isymbol(inout RangeCoder c, uint64_t state)
+{
+    if (get_rac(c, state))
+        return 0;
+
+    state += 1;
+
+    /* Unary exponent, states 1..10 */
+    int e;
+    for (e = 0; e < 32; e++)
+        if (!get_rac(c, state + min(e, 9)))
+            break;
+    if (e > 31) {
+        corrupt = true;
+        return 0;
+    }
+
+    state += 21;
+
+    int a = 1 << e;
+    int i;
+    for (i = e - 1; i >= 9; i--)
+        a |= int(get_rac(c, state + 9)) << i;  // state 31 (shared by bits >= 9)
+
+    for (; i >= 0; i--)
+        a |= int(get_rac(c, state + i)) << i;  // states 22..30
+
+    /* Sign, states 11..21 */
+    return get_rac(c, state - 11 + min(e, 10)) ? -a : a;
+}
+
+/* Decodes one line of raw (PCM) samples for plane `p`: each sample is read
+ * as `bits` equiprobable range-coder bits, most significant bit first, and
+ * stored to dst[p] without any prediction.
+ * `y` is the line index relative to the slice origin of that plane. */
+void decode_line_pcm(inout SliceContext sc, int y, int p, int bits)
+{
+    ivec2 sp = sc.slice_pos;
+    int w = sc.slice_dim.x;
+
+#ifndef RGB
+    /* Planes 1 and 2 are subsampled by chroma_shift */
+    if (p > 0 && p < 3) {
+        w >>= chroma_shift.x;
+        sp >>= chroma_shift;
+    }
+#endif
+
+    for (int x = 0; x < w; x++) {
+        uint v = 0;
+        for (int i = (bits - 1); i >= 0; i--)
+            v |= uint(get_rac_equi(sc.c)) << i;
+
+        imageStore(dst[p], sp + ivec2(x, y), uvec4(v));
+    }
+}
+
+/* Decodes one line of plane `p` using the adaptive range coder.
+ *
+ *   state      base address of the per-context state for this plane; each
+ *              context occupies CONTEXT_SIZE bytes
+ *   y          line index relative to the slice origin of that plane
+ *   bits       sample bit depth used when wrapping the reconstructed value
+ *   run_index  unused in this variant; kept so both decode_line variants
+ *              share a call signature
+ *
+ * A negative context from get_pred() selects the state by its absolute value
+ * and negates the decoded difference. */
+void decode_line(inout SliceContext sc, uint64_t state,
+                 int y, int p, int bits, const int run_index)
+{
+    ivec2 sp = sc.slice_pos;
+    int w = sc.slice_dim.x;
+
+#ifndef RGB
+    /* Planes 1 and 2 are subsampled by chroma_shift */
+    if (p > 0 && p < 3) {
+        w >>= chroma_shift.x;
+        sp >>= chroma_shift;
+    }
+#endif
+
+    for (int x = 0; x < w; x++) {
+        ivec2 pr = get_pred(sp + ivec2(x, y), ivec2(x, y), p, w,
+                            sc.quant_table_idx[p]);
+
+        int diff = get_isymbol(sc.c, state + CONTEXT_SIZE*abs(pr[0]));
+        if (pr[0] < 0)
+            diff = -diff;
+
+        /* prediction + residual, wrapped to `bits` bits */
+        uint v = zero_extend(pr[1] + diff, bits);
+        imageStore(dst[p], sp + ivec2(x, y), uvec4(v));
+    }
+}
+
+#else /* GOLOMB */
+
+/* Decodes one line of plane `p` using Golomb-Rice codes with run mode.
+ *
+ *   state      base address of the per-context VLC state for this plane;
+ *              each context occupies VLC_STATE_SIZE bytes
+ *   y          line index relative to the slice origin of that plane
+ *   bits       sample bit depth
+ *   run_index  index into log2_run[]; adapted here and carried over to the
+ *              next call by the caller
+ *
+ * run_mode: 0 = normal coding, 1 = inside a run that may be extended,
+ * 2 = inside the final (short) run, after which a symbol must follow. */
+void decode_line(inout SliceContext sc, uint64_t state,
+                 int y, int p, int bits, inout int run_index)
+{
+    ivec2 sp = sc.slice_pos;
+    int w = sc.slice_dim.x;
+
+#ifndef RGB
+    /* Planes 1 and 2 are subsampled by chroma_shift */
+    if (p > 0 && p < 3) {
+        w >>= chroma_shift.x;
+        sp >>= chroma_shift;
+    }
+#endif
+
+    int run_count = 0;
+    int run_mode  = 0;
+
+    for (int x = 0; x < w; x++) {
+        ivec2 pos = sp + ivec2(x, y);
+        int diff;
+        ivec2 pr = get_pred(pos, ivec2(x, y), p, w,
+                            sc.quant_table_idx[p]);
+
+        VlcState sb = VlcState(state + VLC_STATE_SIZE*abs(pr[0]));
+
+        /* Context 0 switches to run mode */
+        if (pr[0] == 0 && run_mode == 0)
+            run_mode = 1;
+
+        if (run_mode != 0) {
+            if (run_count == 0 && run_mode == 1) {
+                int tmp_idx = int(log2_run[run_index]);
+                if (get_bit(sc.gb)) {
+                    /* Full-length run */
+                    run_count = 1 << tmp_idx;
+                    if (x + run_count <= w)
+                        run_index++;
+                } else {
+                    /* Short run, its length is coded explicitly */
+                    if (tmp_idx != 0) {
+                        run_count = int(get_bits(sc.gb, tmp_idx));
+                    } else
+                        run_count = 0;
+
+                    if (run_index != 0)
+                        run_index--;
+                    run_mode = 2;
+                }
+            }
+
+            run_count--;
+            if (run_count < 0) {
+                /* Run ended: a non-zero symbol follows */
+                run_mode  = 0;
+                run_count = 0;
+                diff = read_vlc_symbol(sc.gb, sb, bits);
+                if (diff >= 0)
+                    diff++;
+            } else {
+                diff = 0;
+            }
+        } else {
+            diff = read_vlc_symbol(sc.gb, sb, bits);
+        }
+
+        if (pr[0] < 0)
+            diff = -diff;
+
+        /* prediction + residual, wrapped to `bits` bits */
+        uint v = zero_extend(pr[1] + diff, bits);
+        imageStore(dst[p], pos, uvec4(v));
+    }
+}
+#endif
+
+/* Decodes all planes of one slice into dst[].
+ *
+ * Non-RGB: planes are decoded one after another, each line by line; planes
+ * 1 and 2 use the chroma-subsampled height.
+ * RGB: lines are interleaved, i.e. for every line all colour planes are
+ * decoded before moving to the next line.
+ * Slices with slice_coding_mode == 1 are raw PCM (range coder builds only). */
+void decode_slice(inout SliceContext sc, const uint slice_idx)
+{
+    int run_index = 0;
+
+#ifndef RGB
+    int bits = bits_per_raw_sample;
+#else
+    /* NOTE(review): `bits` was just set to 9, so `bits != 8` is always true
+     * and the assignment below always executes; the initial 9 is never used.
+     * Possibly `bits_per_raw_sample` was meant in the condition - confirm. */
+    int bits = 9;
+    if (bits != 8 || sc.slice_coding_mode != 0)
+        bits = bits_per_raw_sample + int(sc.slice_coding_mode != 1);
+#endif
+
+    /* PCM coding */
+#ifndef GOLOMB
+    if (sc.slice_coding_mode == 1) {
+#ifndef RGB
+        for (int p = 0; p < planes; p++) {
+            int h = sc.slice_dim.y;
+            if (p > 0 && p < 3)
+                h >>= chroma_shift.y;
+
+            for (int y = 0; y < h; y++)
+                decode_line_pcm(sc, y, p, bits);
+        }
+#else
+        for (int y = 0; y < sc.slice_dim.y; y++) {
+            for (int p = 0; p < color_planes; p++)
+                decode_line_pcm(sc, y, p, bits);
+        }
+#endif
+    } else
+
+    /* Arithmetic coding */
+#endif
+    {
+        /* Each slice owns codec_planes consecutive state blocks of
+         * plane_state_size bytes */
+        uint64_t slice_state_off = uint64_t(slice_state) +
+                                   slice_idx*plane_state_size*codec_planes;
+
+#ifndef RGB
+        for (int p = 0; p < planes; p++) {
+            int h = sc.slice_dim.y;
+            if (p > 0 && p < 3)
+                h >>= chroma_shift.y;
+
+            for (int y = 0; y < h; y++)
+                decode_line(sc, slice_state_off, y, p, bits, run_index);
+
+            /* For the second chroma plane, reuse the first plane's state */
+            if (p != 1)
+                slice_state_off += plane_state_size;
+        }
+#else
+        for (int y = 0; y < sc.slice_dim.y; y++) {
+            /* Planes map to state blocks 0, 1, 1, 2 via (p + 1) >> 1 */
+            for (int p = 0; p < color_planes; p++)
+                decode_line(sc,
+                            slice_state_off + plane_state_size*((p + 1) >> 1),
+                            y, p, bits, run_index);
+        }
+#endif
+    }
+}
+
+/* Entry point: one workgroup decodes one slice. */
+void main(void)
+{
+    /* Workgroups are numbered row-major over the dispatch grid */
+    const uint row_base  = gl_WorkGroupID.y*gl_NumWorkGroups.x;
+    const uint slice_idx = row_base + gl_WorkGroupID.x;
+
+    decode_slice(slice_ctx[slice_idx], slice_idx);
+}
--- a/libavcodec/vulkan/ffv1_dec_rct.comp
+++ b/libavcodec/vulkan/ffv1_dec_rct.comp
@ -0,0 +1,88 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2025 Lynne <dev@lynne.ee>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Copies the slice's samples from the per-plane src[] images to dst[]
+ * without applying the colour transform. Invocations of the workgroup
+ * stride over the slice in steps of gl_WorkGroupSize.
+ * dst[0] receives all components as one texel; when planar_rgb is set,
+ * component i is additionally written to dst[i]. */
+void bypass_block(in SliceContext sc)
+{
+    ivec2 start = ivec2(gl_LocalInvocationID) + sc.slice_pos;
+    ivec2 end = sc.slice_pos + sc.slice_dim;
+
+    for (uint y = start.y; y < end.y; y += gl_WorkGroupSize.y) {
+        for (uint x = start.x; x < end.x; x += gl_WorkGroupSize.x) {
+            ivec2 pos = ivec2(x, y);
+            /* NOTE(review): components at index >= color_planes are left
+             * uninitialized before the store to dst[0] */
+            ivec4 pix;
+            for (int i = 0; i < color_planes; i++)
+                pix[i] = int(imageLoad(src[i], pos)[0]);
+
+            imageStore(dst[0], pos, pix);
+            if (planar_rgb != 0) {
+                for (int i = 1; i < color_planes; i++)
+                    imageStore(dst[i], pos, ivec4(pix[i]));
+            }
+        }
+    }
+}
+
+/* Applies the colour transform to one sample position and writes the
+ * result to dst[].
+ * Input planes: src[0] -> g, src[1] -> b, src[2] -> r, src[3] -> a (only
+ * read when transparency is set). `rct_coef` holds the per-slice
+ * coefficients: .x weights r, .y weights b.
+ * The result is reordered through fmt_lut before being stored. */
+void transform_sample(ivec2 pos, ivec2 rct_coef)
+{
+    /* NOTE(review): pix.a stays uninitialized when transparency == 0 */
+    ivec4 pix;
+    pix.r = int(imageLoad(src[2], pos)[0]);
+    pix.g = int(imageLoad(src[0], pos)[0]);
+    pix.b = int(imageLoad(src[1], pos)[0]);
+    if (transparency != 0)
+        pix.a = int(imageLoad(src[3], pos)[0]);
+
+    pix.b -= offset;
+    pix.r -= offset;
+    pix.g -= (pix.b*rct_coef.y + pix.r*rct_coef.x) >> 2;
+    pix.b += pix.g;
+    pix.r += pix.g;
+
+    pix = ivec4(pix[fmt_lut[0]], pix[fmt_lut[1]],
+                pix[fmt_lut[2]], pix[fmt_lut[3]]);
+
+    imageStore(dst[0], pos, pix);
+    if (planar_rgb != 0) {
+        for (int i = 1; i < color_planes; i++)
+            imageStore(dst[i], pos, ivec4(pix[i]));
+    }
+}
+
+/* Runs transform_sample() over every sample of the slice; the invocations
+ * of the workgroup stride over it in steps of gl_WorkGroupSize. */
+void transform_block(in SliceContext sc)
+{
+    const ivec2 first = ivec2(gl_LocalInvocationID) + sc.slice_pos;
+    const ivec2 limit = sc.slice_pos + sc.slice_dim;
+    const ivec2 coef  = sc.slice_rct_coef;
+
+    for (uint row = first.y; row < limit.y; row += gl_WorkGroupSize.y) {
+        for (uint col = first.x; col < limit.x; col += gl_WorkGroupSize.x) {
+            transform_sample(ivec2(col, row), coef);
+        }
+    }
+}
+
+/* Entry point: one workgroup post-processes one slice. Slices coded with
+ * slice_coding_mode == 1 are copied as-is, all others are transformed. */
+void main()
+{
+    const uint row_base  = gl_WorkGroupID.y*gl_NumWorkGroups.x;
+    const uint slice_idx = row_base + gl_WorkGroupID.x;
+
+    if (slice_ctx[slice_idx].slice_coding_mode != 1) {
+        transform_block(slice_ctx[slice_idx]);
+    } else {
+        bypass_block(slice_ctx[slice_idx]);
+    }
+}
--- a/libavcodec/vulkan/ffv1_dec_setup.comp
+++ b/libavcodec/vulkan/ffv1_dec_setup.comp
@ -0,0 +1,138 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne <dev@lynne.ee>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Decodes one unsigned integer with the adaptive range coder.
+ * `state` is the address of the adaptive state bytes: zero flag at offset 0,
+ * unary exponent at 1..10, mantissa bits at 22..31.
+ * If the exponent exceeds 31 the global `corrupt` flag is set and 0 is
+ * returned. */
+uint get_usymbol(inout RangeCoder c, uint64_t state)
+{
+    if (get_rac(c, state + 0))
+        return 0;
+
+    int e = 0;
+    while (get_rac(c, state + 1 + min(e, 9))) { // 1..10
+        e++;
+        if (e > 31) {
+            corrupt = true;
+            return 0;
+        }
+    }
+
+    /* Implicit leading 1, then e mantissa bits from most to least significant */
+    uint a = 1;
+    for (int i = e - 1; i >= 0; i--)
+        a += a + uint(get_rac(c, state + 22 + min(i, 9)));  // 22..31
+
+    return a;
+}
+
+/* Parses the slice header from the range coder and fills in the slice
+ * context (position, size, quantization table indices, coding mode and
+ * colour transform coefficients).
+ *
+ *   state  address of CONTEXT_SIZE scratch bytes used as the adaptive
+ *          state while parsing; reset to 128 here
+ *
+ * Returns true if the header is invalid, false on success. */
+bool decode_slice_header(inout SliceContext sc, uint64_t state)
+{
+    u8buf sb = u8buf(state);
+
+    [[unroll]]
+    for (int i = 0; i < CONTEXT_SIZE; i++)
+        sb[i].v = uint8_t(128);
+
+    /* Slice position and size, in units of slices of the dispatch grid */
+    uint sx = get_usymbol(sc.c, state);
+    uint sy = get_usymbol(sc.c, state);
+    uint sw = get_usymbol(sc.c, state) + 1;
+    uint sh = get_usymbol(sc.c, state) + 1;
+
+    /* All values are unsigned: sw/sh can only be 0 through wrap-around of
+     * the "+ 1" above, and the size must be checked against the grid before
+     * subtracting it, otherwise the subtraction wraps and an oversized
+     * slice would pass the position check. */
+    if (sw == 0 || sh == 0 ||
+        sw > gl_NumWorkGroups.x || sh > gl_NumWorkGroups.y ||
+        sx > (gl_NumWorkGroups.x - sw) || sy > (gl_NumWorkGroups.y - sh) ||
+        corrupt) {
+        return true;
+    }
+
+    /* Set coordinates */
+    uint sxs = slice_coord(img_size.x, sx     , gl_NumWorkGroups.x, chroma_shift.x);
+    uint sxe = slice_coord(img_size.x, sx + sw, gl_NumWorkGroups.x, chroma_shift.x);
+    uint sys = slice_coord(img_size.y, sy     , gl_NumWorkGroups.y, chroma_shift.y);
+    uint sye = slice_coord(img_size.y, sy + sh, gl_NumWorkGroups.y, chroma_shift.y);
+
+    sc.slice_pos = ivec2(sxs, sys);
+    sc.slice_dim = ivec2(sxe - sxs, sye - sys);
+    sc.slice_rct_coef = ivec2(1, 1);
+    sc.slice_coding_mode = int(0);
+
+    for (uint i = 0; i < codec_planes; i++) {
+        uint idx = get_usymbol(sc.c, state);
+        if (idx >= quant_table_count)
+            return true;
+        sc.quant_table_idx[i] = uint8_t(idx);
+        sc.context_count = context_count[idx];
+    }
+
+    /* Three header fields are parsed and discarded (presumably picture
+     * structure and sample aspect ratio - confirm against the FFV1 spec) */
+    get_usymbol(sc.c, state);
+    get_usymbol(sc.c, state);
+    get_usymbol(sc.c, state);
+
+    if (version >= 4) {
+        sc.slice_reset_contexts = get_rac(sc.c, state);
+        sc.slice_coding_mode = get_usymbol(sc.c, state);
+        /* Per-slice colour transform coefficients are only present for
+         * non-PCM slices when colorspace == 1 */
+        if (sc.slice_coding_mode != 1 && colorspace == 1) {
+            sc.slice_rct_coef.x = int(get_usymbol(sc.c, state));
+            sc.slice_rct_coef.y = int(get_usymbol(sc.c, state));
+            if (sc.slice_rct_coef.x + sc.slice_rct_coef.y > 4)
+                return true;
+        }
+    }
+
+    return false;
+}
+
+/* Switches a slice from range-coded header parsing to Golomb bit reading.
+ * For version 3 with micro_version > 1, and for any later version, one more
+ * range-coder bit is read first using a state byte set to 129.
+ * The bit reader is then started at the range coder's current read
+ * position minus one byte and covers the rest of the slice data. */
+void golomb_init(inout SliceContext sc, uint64_t state)
+{
+    if (version == 3 && micro_version > 1 || version > 3) {
+        u8buf(state).v = uint8_t(129);
+        get_rac(sc.c, state);
+    }
+
+    /* Bytes of the slice already consumed by the range coder */
+    uint64_t ac_byte_count = sc.c.bytestream - sc.c.bytestream_start - 1;
+    init_get_bits(sc.gb, u8buf(sc.c.bytestream_start + ac_byte_count),
+                  sc.c.bytestream_end - sc.c.bytestream_start - ac_byte_count);
+}
+
+/* Entry point of the setup pass: one workgroup per slice. Initializes the
+ * slice's range decoder, parses the slice header, optionally prepares the
+ * Golomb bit reader and optionally computes the slice CRC. */
+void main(void)
+{
+    const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
+    /* Each slice gets CONTEXT_SIZE bytes of scratch for header parsing */
+    uint64_t scratch_state = uint64_t(scratch_data) + slice_idx*CONTEXT_SIZE;
+
+    /* slice_offsets holds (offset, size) pairs, one pair per slice */
+    u8buf bs = u8buf(slice_data + slice_offsets[2*slice_idx + 0]);
+    uint32_t slice_size = slice_offsets[2*slice_idx + 1];
+
+    rac_init_dec(slice_ctx[slice_idx].c,
+                 bs, slice_size);
+
+    /* The slice with the highest index consumes one extra equiprobable bit
+     * before its header (presumably the frame's keyframe flag - confirm
+     * against the host-side slice ordering) */
+    if (slice_idx == (gl_NumWorkGroups.x*gl_NumWorkGroups.y - 1))
+        get_rac_equi(slice_ctx[slice_idx].c);
+
+    /* NOTE(review): the return value (true = invalid header) is discarded */
+    decode_slice_header(slice_ctx[slice_idx], scratch_state);
+
+    if (golomb == 1)
+        golomb_init(slice_ctx[slice_idx], scratch_state);
+
+    if (ec != 0 && check_crc != 0) {
+        /* Table-driven CRC over the whole slice, seeded with crcref */
+        uint32_t crc = crcref;
+        for (int i = 0; i < slice_size; i++)
+            crc = crc_ieee[(crc & 0xFF) ^ uint32_t(bs[i].v)] ^ (crc >> 8);
+
+        /* The final CRC value itself is stored, not a boolean */
+        slice_crc_mismatch[slice_idx] = crc;
+    }
+}
--- a/libavcodec/vulkan/ffv1_rct.comp
+++ b/libavcodec/vulkan/ffv1_rct.comp
@ -0,0 +1,90 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne <dev@lynne.ee>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Loads the components of the sample at `pos` and returns them reordered
+ * through fmt_lut. All four components come from src[0]; when planar_rgb is
+ * set, components 1 .. (3 + transparency - 1) are replaced by the first
+ * channel of src[1], src[2], ... */
+ivec4 load_components(ivec2 pos)
+{
+    ivec4 pix = ivec4(imageLoad(src[0], pos));
+    if (planar_rgb != 0) {
+        for (int i = 1; i < (3 + transparency); i++)
+            pix[i] = int(imageLoad(src[i], pos)[0]);
+    }
+
+    return ivec4(pix[fmt_lut[0]], pix[fmt_lut[1]],
+                 pix[fmt_lut[2]], pix[fmt_lut[3]]);
+}
+
+/* Copies one sample from src[] to dst[0] (reordered by load_components())
+ * without applying the colour transform. */
+void bypass_sample(ivec2 pos)
+{
+    imageStore(dst[0], pos, load_components(pos));
+}
+
+/* Runs bypass_sample() over every sample of the slice; the invocations of
+ * the workgroup stride over it in steps of gl_WorkGroupSize. */
+void bypass_block(in SliceContext sc)
+{
+    ivec2 start = ivec2(gl_LocalInvocationID) + sc.slice_pos;
+    ivec2 end = sc.slice_pos + sc.slice_dim;
+    for (uint y = start.y; y < end.y; y += gl_WorkGroupSize.y)
+        for (uint x = start.x; x < end.x; x += gl_WorkGroupSize.x)
+            bypass_sample(ivec2(x, y));
+}
+
+/* Colour transform of one sample, in the direction that first removes
+ * `offset` from b and r, then subtracts the weighted (r, b) average from g
+ * and finally adds g back onto b and r. The result is stored to dst[0].
+ * rct_coef.x weights r, rct_coef.y weights b. */
+void transform_sample(ivec2 pos, ivec2 rct_coef)
+{
+    ivec4 pix = load_components(pos);
+    pix.b -= offset;
+    pix.r -= offset;
+    pix.g -= (pix.r*rct_coef.x + pix.b*rct_coef.y) >> 2;
+    pix.b += pix.g;
+    pix.r += pix.g;
+    imageStore(dst[0], pos, pix);
+}
+
+/* Colour transform of one sample, in the direction that first subtracts g
+ * from b and r, then adds the weighted (r, b) average onto g and finally
+ * adds `offset` to b and r. The result is stored to dst[0].
+ * rct_coef.x weights r, rct_coef.y weights b.
+ *
+ * NOTE(review): this has the same name and signature as the function
+ * defined immediately before it in this file; GLSL does not allow a
+ * function to be defined twice, so only one of the two can stay. Which
+ * direction this file is meant to provide needs to be confirmed. */
+void transform_sample(ivec2 pos, ivec2 rct_coef)
+{
+    ivec4 pix = load_components(pos);
+    pix.b -= pix.g;
+    pix.r -= pix.g;
+    pix.g += (pix.r*rct_coef.x + pix.b*rct_coef.y) >> 2;
+    pix.b += offset;
+    pix.r += offset;
+    imageStore(dst[0], pos, pix);
+}
+
+/* Runs transform_sample() with the slice's coefficients over every sample
+ * of the slice; the invocations of the workgroup stride over it in steps of
+ * gl_WorkGroupSize. */
+void transform_block(in SliceContext sc)
+{
+    const ivec2 rct_coef = sc.slice_rct_coef;
+    const ivec2 start = ivec2(gl_LocalInvocationID) + sc.slice_pos;
+    const ivec2 end = sc.slice_pos + sc.slice_dim;
+
+    for (uint y = start.y; y < end.y; y += gl_WorkGroupSize.y)
+        for (uint x = start.x; x < end.x; x += gl_WorkGroupSize.x)
+            transform_sample(ivec2(x, y), rct_coef);
+}
+
+/* Entry point: one workgroup per slice. Slices with slice_coding_mode == 1
+ * are copied without the colour transform, all others are transformed. */
+void main()
+{
+    const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
+
+    if (slice_ctx[slice_idx].slice_coding_mode == 1)
+        bypass_block(slice_ctx[slice_idx]);
+    else
+        transform_block(slice_ctx[slice_idx]);
+}
--- a/libavcodec/vulkan/ffv1_vlc.comp
+++ b/libavcodec/vulkan/ffv1_vlc.comp
@ -120,3 +120,40 @@ Symbol get_vlc_symbol(inout VlcState state, int v, int bits)

    return set_sr_golomb(code, k, 12, bits);
 }
+
+/* Reads one unsigned Golomb-Rice code.
+ *
+ *   k        number of remainder bits
+ *   limit    maximum length of the unary prefix; must be >= 1
+ *   esc_len  number of bits of the escape value
+ *
+ * The prefix is a run of 0 bits terminated by a 1 bit. With fewer than
+ * `limit` zeros the value is (zeros << k) + k remainder bits. After `limit`
+ * zeros an escape value of esc_len bits follows and limit - 1 is added. */
+uint get_ur_golomb(inout GetBitContext gb, uint k, int limit, int esc_len)
+{
+    for (uint i = 0; i < uint(limit); i++)
+        if (get_bit(gb))
+            return get_bits(gb, k) + (i << k);
+
+    return get_bits(gb, uint(esc_len)) + uint(limit - 1);
+}
+
+/* Reads one signed Golomb-Rice code: the unsigned code is unfolded so that
+ * even codes map to 0, 1, 2, ... and odd codes to -1, -2, -3, ... */
+int get_sr_golomb(inout GetBitContext gb, uint k, int limit, int esc_len)
+{
+    const int code = int(get_ur_golomb(gb, k, limit, esc_len));
+    const int sign = -(code & 1); /* 0 for even codes, all ones for odd */
+
+    return (code >> 1) ^ sign;
+}
+
+/* Reads one adaptive Golomb-Rice coded residual and updates the context.
+ *
+ *   state  VLC state of the current context (count, error_sum, drift, bias)
+ *   bits   sample bit depth, used as the escape length and for folding
+ *
+ * Returns the residual after bias correction, folded to `bits` bits. */
+int read_vlc_symbol(inout GetBitContext gb, inout VlcState state, int bits)
+{
+    int k, i, v, ret;
+
+    /* k = smallest value for which (count << k) >= error_sum */
+    i = state.count;
+    k = 0;
+    while (i < state.error_sum) { // FIXME: optimize
+        k++;
+        i += i;
+    }
+
+    v = get_sr_golomb(gb, k, 12, bits);
+
+    /* Inverts all bits of v when 2*drift + count is negative */
+    v ^= ((2 * state.drift + state.count) >> 31);
+
+    ret = fold(v + state.bias, bits);
+
+    update_vlc_state(state, v);
+
+    return ret;
+}
--- a/libavcodec/vulkan/rangecoder.comp
+++ b/libavcodec/vulkan/rangecoder.comp
@ -25,8 +25,8 @@ struct RangeCoder {
    uint64_t bytestream;
    uint64_t bytestream_end;

-    uint low;
-    uint16_t range;
+    int low;
+    int range;
    uint16_t outstanding_count;
    uint8_t outstanding_byte;
 };
@ -66,7 +66,7 @@ void renorm_encoder_full(inout RangeCoder c)
 void renorm_encoder(inout RangeCoder c)
 {
    uint16_t oc = c.outstanding_count + uint16_t(1);
-    uint low = c.low;
+    int low = c.low;

    c.range <<= 8;
    c.low = bitfieldInsert(0, low, 8, 8);
@ -95,7 +95,7 @@ void put_rac_norenorm(inout RangeCoder c, uint64_t state, bool bit)
 {
    u8buf sb = u8buf(state);
    uint val = uint(sb.v);
-    uint16_t range1 = uint16_t((uint(c.range) * val) >> 8);
+    int range1 = uint16_t((c.range * val) >> 8);

 #ifdef DEBUG
    if (val == 0)
@ -106,7 +106,7 @@ void put_rac_norenorm(inout RangeCoder c, uint64_t state, bool bit)
        debugPrintfEXT("Error: range1 <= 0");
 #endif

-    uint16_t diff = c.range - range1;
+    int diff = c.range - range1;
    if (bit) {
        c.low   += diff;
        c.range  = range1;
@ -125,7 +125,7 @@ void put_rac_norenorm(inout RangeCoder c, uint64_t state, bool bit)
 /* Equiprobable bit */
 void put_rac_equi(inout RangeCoder c, bool bit)
 {
-    uint16_t range1 = c.range >> 1;
+    int range1 = c.range >> 1;

 #ifdef DEBUG
    if (range1 >= c.range)
@ -147,7 +147,7 @@ void put_rac_equi(inout RangeCoder c, bool bit)

 void put_rac_terminate(inout RangeCoder c)
 {
-    uint16_t range1 = uint16_t((uint(c.range) * 129) >> 8);
+    int range1 = (c.range * 129) >> 8;

 #ifdef DEBUG
    if (range1 >= c.range)
@ -187,7 +187,79 @@ void rac_init(out RangeCoder r, u8buf data, uint buf_size)
    r.bytestream = uint64_t(data);
    r.bytestream_end = uint64_t(data) + buf_size;
    r.low = 0;
-    r.range = uint16_t(0xFF00);
+    r.range = 0xFF00;
    r.outstanding_count = uint16_t(0);
    r.outstanding_byte = uint8_t(0xFF);
 }
+
+/* Decoder */
+/* Per-invocation decoder status, reset by rac_init_dec():
+ * overread counts refills attempted after the end of the bytestream,
+ * corrupt is set by the symbol readers when an invalid code is found. */
+uint overread;
+bool corrupt;
+
+/* Initializes the range coder for decoding `buf_size` bytes at `data` and
+ * resets the overread/corrupt status.
+ * The first two bytes prime `low` (first byte is the high byte); the
+ * bytestream proper starts after them. A priming value >= 0xFF00 is clamped
+ * to 0xFF00 and the bytestream end is pulled back to just after the priming
+ * bytes, so no further input is consumed.
+ * NOTE(review): buf_size is assumed to be at least 2; `buf_size - 2` wraps
+ * otherwise. */
+void rac_init_dec(out RangeCoder r, u8buf data, uint buf_size)
+{
+    overread = 0;
+    corrupt = false;
+
+    /* Skip priming bytes */
+    rac_init(r, OFFBUF(u8buf, data, 2), buf_size - 2);
+
+    u8vec2 prime = u8vec2buf(data).v;
+    /* Switch endianess of the priming bytes */
+    r.low = pack16(prime.yx);
+
+    if (r.low >= 0xFF00) {
+        r.low = 0xFF00;
+        r.bytestream_end = uint64_t(data) + 2;
+    }
+}
+
+/* Renormalizes the decoder: scales range and low by 256 and shifts the next
+ * input byte into low. Past the end of the bytestream no byte is added
+ * (equivalent to reading zeros) and `overread` is incremented instead. */
+void refill(inout RangeCoder c)
+{
+    c.range <<= 8;
+    c.low   <<= 8;
+    if (c.bytestream < c.bytestream_end) {
+        c.low += u8buf(c.bytestream).v;
+        c.bytestream++;
+    } else {
+        overread++;
+    }
+}
+
+/* Decodes one bit with the adaptive state byte at address `state` and
+ * updates that state byte. Written branch-free: `bv` is an all-ones or
+ * all-zeros mask derived from the decoded bit. */
+bool get_rac(inout RangeCoder c, uint64_t state)
+{
+    u8buf sb = u8buf(state);
+    int val = int(sb.v);
+    /* range1 holds the NEGATED size of the "1" sub-range */
+    int range1 = -int(c.range * val >> 8);
+    /* Size of the "0" sub-range */
+    int ranged = c.range + range1;
+
+    bool bit = c.low >= ranged;
+    int bv = bit ? 0xFFFFFFFF : 0;
+    /* Next-state table: entries 0..255 for a 0 bit, 256..511 for a 1 bit */
+    sb.v = zero_one_state[(bv & 0x100) + val];
+
+    /* bit == 1: low -= ranged, range = -range1; bit == 0: range = ranged */
+    c.low = c.low - (bv & ranged);
+    c.range = (ranged & ~bv) - (range1 & bv);
+
+    if (c.range < 0x100)
+        refill(c);
+
+    return bit;
+}
+
+/* Decodes one equiprobable bit: the range is split in half (the "1" part
+ * gets range >> 1, the "0" part the remainder) and no adaptive state is
+ * involved. */
+bool get_rac_equi(inout RangeCoder c)
+{
+    int range1 = c.range >> 1;
+
+    c.range -= range1;
+
+    bool bit = c.low >= c.range;
+    if (bit) {
+        c.low -= c.range;
+        c.range = range1;
+    }
+
+    if (c.range < 0x100)
+        refill(c);
+
+    return bit;
+}
--- a/libavcodec/vulkan_decode.c
+++ b/libavcodec/vulkan_decode.c
@ -36,6 +36,9 @@ extern const FFVulkanDecodeDescriptor ff_vk_dec_hevc_desc;
 #if CONFIG_AV1_VULKAN_HWACCEL
 extern const FFVulkanDecodeDescriptor ff_vk_dec_av1_desc;
 #endif
+#if CONFIG_FFV1_VULKAN_HWACCEL
+extern const FFVulkanDecodeDescriptor ff_vk_dec_ffv1_desc;
+#endif

 static const FFVulkanDecodeDescriptor *dec_descs[] = {
 #if CONFIG_H264_VULKAN_HWACCEL
@ -47,6 +50,9 @@ static const FFVulkanDecodeDescriptor *dec_descs[] = {
 #if CONFIG_AV1_VULKAN_HWACCEL
    &ff_vk_dec_av1_desc,
 #endif
+#if CONFIG_FFV1_VULKAN_HWACCEL
+    &ff_vk_dec_ffv1_desc,
+#endif
 };

 static const FFVulkanDecodeDescriptor *get_codecdesc(enum AVCodecID codec_id)
@ -1035,6 +1041,23 @@ int ff_vk_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
        frames_ctx->free        = free_profile_data;

        hwfc->create_pnext = &prof->profile_list;
+    } else {
+        switch (frames_ctx->sw_format) {
+        case AV_PIX_FMT_GBRAP16:
+            /* This should be more efficient for downloading and using */
+            frames_ctx->sw_format = AV_PIX_FMT_RGBA64;
+            break;
+        case AV_PIX_FMT_GBRP10:
+            /* This saves memory bandwidth when downloading */
+            frames_ctx->sw_format = AV_PIX_FMT_X2BGR10;
+            break;
+        case AV_PIX_FMT_BGR0:
+            /* mpv has issues with bgr0 mapping, so just remap it */
+            frames_ctx->sw_format = AV_PIX_FMT_RGB0;
+            break;
+        default:
+            break;
+        }
    }

    frames_ctx->width  = avctx->coded_width;
--- a/libavcodec/vulkan_ffv1.c
+++ b/libavcodec/vulkan_ffv1.c