diff --git a/libswscale/Makefile b/libswscale/Makefile index 08036634b7..c4e45d494e 100644 --- a/libswscale/Makefile +++ b/libswscale/Makefile @@ -6,6 +6,7 @@ HEADERS = swscale.h \ version_major.h \ OBJS = alphablend.o \ + cms.o \ csputils.o \ hscale.o \ hscale_fast_bilinear.o \ diff --git a/libswscale/cms.c b/libswscale/cms.c new file mode 100644 index 0000000000..7793a19b87 --- /dev/null +++ b/libswscale/cms.c @@ -0,0 +1,766 @@ +/* + * Copyright (C) 2024 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <math.h>
+#include <stdbool.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/csp.h"
+#include "libavutil/slicethread.h"
+
+#include "cms.h"
+#include "csputils.h"
+#include "libswscale/swscale.h"
+#include "utils.h"
+
+bool sws_color_map_noop(const SwsColorMap *map)
+{
+    /* If the encoding space is different, we must go through a conversion */
+    if (map->src.prim != map->dst.prim || map->src.trc != map->dst.trc)
+        return false;
+
+    /* If the black point changes, we have to perform black point compensation */
+    if (av_cmp_q(map->src.min_luma, map->dst.min_luma))
+        return false;
+
+    switch (map->intent) {
+    case SWS_INTENT_ABSOLUTE_COLORIMETRIC:
+    case SWS_INTENT_RELATIVE_COLORIMETRIC:
+        return ff_prim_superset(&map->dst.gamut, &map->src.gamut) &&
+               av_cmp_q(map->src.max_luma, map->dst.max_luma) <= 0;
+    case SWS_INTENT_PERCEPTUAL:
+    case SWS_INTENT_SATURATION:
+        return ff_prim_equal(&map->dst.gamut, &map->src.gamut) &&
+               !av_cmp_q(map->src.max_luma, map->dst.max_luma);
+    default:
+        av_assert0(!"Invalid gamut mapping intent?");
+        return true;
+    }
+}
+
+/* Approximation of gamut hull at a given intensity level */
+static const float hull(float I)
+{
+    return ((I - 6.0f) * I + 9.0f) * I;
+}
+
+/* For some minimal type safety, and code cleanliness */
+typedef struct RGB {
+    float R, G, B; /* nits */
+} RGB;
+
+typedef struct IPT {
+    float I, P, T;
+} IPT;
+
+typedef struct ICh {
+    float I, C, h;
+} ICh;
+
+static av_always_inline ICh ipt2ich(IPT c)
+{
+    return (ICh) {
+        .I = c.I,
+        .C = sqrtf(c.P * c.P + c.T * c.T),
+        .h = atan2f(c.T, c.P),
+    };
+}
+
+static av_always_inline IPT ich2ipt(ICh c)
+{
+    return (IPT) {
+        .I = c.I,
+        .P = c.C * cosf(c.h),
+        .T = c.C * sinf(c.h),
+    };
+}
+
+/* Helper struct containing
pre-computed cached values describing a gamut */ +typedef struct Gamut { + SwsMatrix3x3 encoding2lms; + SwsMatrix3x3 lms2encoding; + SwsMatrix3x3 lms2content; + SwsMatrix3x3 content2lms; + av_csp_eotf_function eotf; + av_csp_eotf_function eotf_inv; + float Iavg_frame; + float Imax_frame; + float Imin, Imax; + float Lb, Lw; + AVCIExy wp; + ICh peak; /* updated as needed in loop body when hue changes */ +} Gamut; + +static Gamut gamut_from_colorspace(SwsColor fmt) +{ + const AVColorPrimariesDesc *encoding = av_csp_primaries_desc_from_id(fmt.prim); + const AVColorPrimariesDesc content = { + .prim = fmt.gamut, + .wp = encoding->wp, + }; + + const float Lw = av_q2d(fmt.max_luma), Lb = av_q2d(fmt.min_luma); + const float Imax = pq_oetf(Lw); + + return (Gamut) { + .encoding2lms = ff_sws_ipt_rgb2lms(encoding), + .lms2encoding = ff_sws_ipt_lms2rgb(encoding), + .lms2content = ff_sws_ipt_lms2rgb(&content), + .content2lms = ff_sws_ipt_rgb2lms(&content), + .eotf = av_csp_itu_eotf(fmt.trc), + .eotf_inv = av_csp_itu_eotf_inv(fmt.trc), + .wp = encoding->wp, + .Imin = pq_oetf(Lb), + .Imax = Imax, + .Imax_frame = fmt.frame_peak.den ? pq_oetf(av_q2d(fmt.frame_peak)) : Imax, + .Iavg_frame = fmt.frame_avg.den ? 
pq_oetf(av_q2d(fmt.frame_avg)) : 0.0f, + .Lb = Lb, + .Lw = Lw, + }; +} + +static av_always_inline IPT rgb2ipt(RGB c, const SwsMatrix3x3 rgb2lms) +{ + const float L = rgb2lms.m[0][0] * c.R + + rgb2lms.m[0][1] * c.G + + rgb2lms.m[0][2] * c.B; + const float M = rgb2lms.m[1][0] * c.R + + rgb2lms.m[1][1] * c.G + + rgb2lms.m[1][2] * c.B; + const float S = rgb2lms.m[2][0] * c.R + + rgb2lms.m[2][1] * c.G + + rgb2lms.m[2][2] * c.B; + const float Lp = pq_oetf(L); + const float Mp = pq_oetf(M); + const float Sp = pq_oetf(S); + return (IPT) { + .I = 0.4000f * Lp + 0.4000f * Mp + 0.2000f * Sp, + .P = 4.4550f * Lp - 4.8510f * Mp + 0.3960f * Sp, + .T = 0.8056f * Lp + 0.3572f * Mp - 1.1628f * Sp, + }; +} + +static av_always_inline RGB ipt2rgb(IPT c, const SwsMatrix3x3 lms2rgb) +{ + const float Lp = c.I + 0.0975689f * c.P + 0.205226f * c.T; + const float Mp = c.I - 0.1138760f * c.P + 0.133217f * c.T; + const float Sp = c.I + 0.0326151f * c.P - 0.676887f * c.T; + const float L = pq_eotf(Lp); + const float M = pq_eotf(Mp); + const float S = pq_eotf(Sp); + return (RGB) { + .R = lms2rgb.m[0][0] * L + + lms2rgb.m[0][1] * M + + lms2rgb.m[0][2] * S, + .G = lms2rgb.m[1][0] * L + + lms2rgb.m[1][1] * M + + lms2rgb.m[1][2] * S, + .B = lms2rgb.m[2][0] * L + + lms2rgb.m[2][1] * M + + lms2rgb.m[2][2] * S, + }; +} + +static inline bool ingamut(IPT c, Gamut gamut) +{ + const float min_rgb = gamut.Lb - 1e-4f; + const float max_rgb = gamut.Lw + 1e-2f; + const float Lp = c.I + 0.0975689f * c.P + 0.205226f * c.T; + const float Mp = c.I - 0.1138760f * c.P + 0.133217f * c.T; + const float Sp = c.I + 0.0326151f * c.P - 0.676887f * c.T; + if (Lp < gamut.Imin || Lp > gamut.Imax || + Mp < gamut.Imin || Mp > gamut.Imax || + Sp < gamut.Imin || Sp > gamut.Imax) + { + /* Values outside legal LMS range */ + return false; + } else { + const float L = pq_eotf(Lp); + const float M = pq_eotf(Mp); + const float S = pq_eotf(Sp); + RGB rgb = { + .R = gamut.lms2content.m[0][0] * L + + gamut.lms2content.m[0][1] * M + + 
gamut.lms2content.m[0][2] * S, + .G = gamut.lms2content.m[1][0] * L + + gamut.lms2content.m[1][1] * M + + gamut.lms2content.m[1][2] * S, + .B = gamut.lms2content.m[2][0] * L + + gamut.lms2content.m[2][1] * M + + gamut.lms2content.m[2][2] * S, + }; + return rgb.R >= min_rgb && rgb.R <= max_rgb && + rgb.G >= min_rgb && rgb.G <= max_rgb && + rgb.B >= min_rgb && rgb.B <= max_rgb; + } +} + +static const float maxDelta = 5e-5f; + +// Find gamut intersection using specified bounds +static inline ICh +desat_bounded(float I, float h, float Cmin, float Cmax, Gamut gamut) +{ + if (I <= gamut.Imin) + return (ICh) { .I = gamut.Imin, .C = 0, .h = h }; + else if (I >= gamut.Imax) + return (ICh) { .I = gamut.Imax, .C = 0, .h = h }; + else { + const float maxDI = I * maxDelta; + ICh res = { .I = I, .C = (Cmin + Cmax) / 2, .h = h }; + do { + if (ingamut(ich2ipt(res), gamut)) { + Cmin = res.C; + } else { + Cmax = res.C; + } + res.C = (Cmin + Cmax) / 2; + } while (Cmax - Cmin > maxDI); + + return res; + } +} + +// Finds maximally saturated in-gamut color (for given hue) +static inline ICh saturate(float hue, Gamut gamut) +{ + static const float invphi = 0.6180339887498948f; + static const float invphi2 = 0.38196601125010515f; + + ICh lo = { .I = gamut.Imin, .h = hue }; + ICh hi = { .I = gamut.Imax, .h = hue }; + float de = hi.I - lo.I; + ICh a = { .I = lo.I + invphi2 * de }; + ICh b = { .I = lo.I + invphi * de }; + a = desat_bounded(a.I, hue, 0.0f, 0.5f, gamut); + b = desat_bounded(b.I, hue, 0.0f, 0.5f, gamut); + + while (de > maxDelta) { + de *= invphi; + if (a.C > b.C) { + hi = b; + b = a; + a.I = lo.I + invphi2 * de; + a = desat_bounded(a.I, hue, lo.C - maxDelta, 0.5f, gamut); + } else { + lo = a; + a = b; + b.I = lo.I + invphi * de; + b = desat_bounded(b.I, hue, hi.C - maxDelta, 0.5f, gamut); + } + } + + return a.C > b.C ? 
a : b; +} + +static float softclip(float value, float source, float target) +{ + const float j = SOFTCLIP_KNEE; + float peak, x, a, b, scale; + if (!target) + return 0.0f; + + peak = source / target; + x = fminf(value / target, peak); + if (x <= j || peak <= 1.0) + return value; + + /* Apply simple mobius function */ + a = -j*j * (peak - 1.0f) / (j*j - 2.0f * j + peak); + b = (j*j - 2.0f * j * peak + peak) / fmaxf(1e-6f, peak - 1.0f); + scale = (b*b + 2.0f * b*j + j*j) / (b - a); + + return scale * (x + a) / (x + b) * target; +} + +/** + * Something like fmixf(base, c, x) but follows an exponential curve, note + * that this can be used to extend 'c' outwards for x > 1 + */ +static inline ICh mix_exp(ICh c, float x, float gamma, float base) +{ + return (ICh) { + .I = base + (c.I - base) * powf(x, gamma), + .C = c.C * x, + .h = c.h, + }; +} + +/** + * Drop gamma for colors approaching black and achromatic to avoid numerical + * instabilities, and excessive brightness boosting of grain, while also + * strongly boosting gamma for values exceeding the target peak + */ +static inline float scale_gamma(float gamma, ICh ich, Gamut gamut) +{ + const float Imin = gamut.Imin; + const float Irel = fmaxf((ich.I - Imin) / (gamut.peak.I - Imin), 0.0f); + return gamma * powf(Irel, 3) * fminf(ich.C / gamut.peak.C, 1.0f); +} + +/* Clip a color along the exponential curve given by `gamma` */ +static inline IPT clip_gamma(IPT ipt, float gamma, Gamut gamut) +{ + float lo = 0.0f, hi = 1.0f, x = 0.5f; + const float maxDI = fmaxf(ipt.I * maxDelta, 1e-7f); + ICh ich; + + if (ipt.I <= gamut.Imin) + return (IPT) { .I = gamut.Imin }; + if (ingamut(ipt, gamut)) + return ipt; + + ich = ipt2ich(ipt); + if (!gamma) + return ich2ipt(desat_bounded(ich.I, ich.h, 0.0f, ich.C, gamut)); + + gamma = scale_gamma(gamma, ich, gamut); + do { + ICh test = mix_exp(ich, x, gamma, gamut.peak.I); + if (ingamut(ich2ipt(test), gamut)) { + lo = x; + } else { + hi = x; + } + x = (lo + hi) / 2.0f; + } while (hi - lo 
> maxDI); + + return ich2ipt(mix_exp(ich, x, gamma, gamut.peak.I)); +} + +typedef struct CmsCtx CmsCtx; +struct CmsCtx { + /* Tone mapping parameters */ + float Qa, Qb, Qc, Pa, Pb, src_knee, dst_knee; /* perceptual */ + float I_scale, I_offset; /* linear methods */ + + /* Colorspace parameters */ + Gamut src; + Gamut tmp; /* after tone mapping */ + Gamut dst; + SwsMatrix3x3 adaptation; /* for absolute intent */ + + /* Invocation parameters */ + SwsColorMap map; + float (*tone_map)(const CmsCtx *ctx, float I); + IPT (*adapt_colors)(const CmsCtx *ctx, IPT ipt); + v3u16_t *input; + v3u16_t *output; + + /* Threading parameters */ + int slice_size; + int size_input; + int size_output_I; + int size_output_PT; +}; + +/** + * Helper function to pick a knee point based on the * HDR10+ brightness + * metadata and scene brightness average matching. + * + * Inspired by SMPTE ST2094-10, with some modifications + */ +static void st2094_pick_knee(float src_max, float src_min, float src_avg, + float dst_max, float dst_min, + float *out_src_knee, float *out_dst_knee) +{ + const float min_knee = PERCEPTUAL_KNEE_MIN; + const float max_knee = PERCEPTUAL_KNEE_MAX; + const float def_knee = PERCEPTUAL_KNEE_DEF; + const float src_knee_min = fmixf(src_min, src_max, min_knee); + const float src_knee_max = fmixf(src_min, src_max, max_knee); + const float dst_knee_min = fmixf(dst_min, dst_max, min_knee); + const float dst_knee_max = fmixf(dst_min, dst_max, max_knee); + float src_knee, target, adapted, tuning, adaptation, dst_knee; + + /* Choose source knee based on dynamic source scene brightness */ + src_knee = src_avg ? 
src_avg : fmixf(src_min, src_max, def_knee);
+    src_knee = av_clipf(src_knee, src_knee_min, src_knee_max);
+
+    /* Choose target adaptation point based on linearly re-scaling source knee */
+    target = (src_knee - src_min) / (src_max - src_min);
+    adapted = fmixf(dst_min, dst_max, target);
+
+    /**
+     * Choose the destination knee by picking the perceptual adaptation point
+     * between the source knee and the desired target. This moves the knee
+     * point, on the vertical axis, closer to the 1:1 (neutral) line.
+     *
+     * Adjust the adaptation strength towards 1 based on how close the knee
+     * point is to its extreme values (min/max knee)
+     */
+    tuning = smoothstepf(max_knee, def_knee, target) *
+             smoothstepf(min_knee, def_knee, target);
+    adaptation = fmixf(1.0f, PERCEPTUAL_ADAPTATION, tuning);
+    dst_knee = fmixf(src_knee, adapted, adaptation);
+    dst_knee = av_clipf(dst_knee, dst_knee_min, dst_knee_max);
+
+    *out_src_knee = src_knee;
+    *out_dst_knee = dst_knee;
+}
+
+static void tone_map_setup(CmsCtx *ctx, bool dynamic)
+{
+    const float dst_min = ctx->dst.Imin;
+    const float dst_max = ctx->dst.Imax;
+    const float src_min = ctx->src.Imin;
+    const float src_max = dynamic ? ctx->src.Imax_frame : ctx->src.Imax;
+    const float src_avg = dynamic ? ctx->src.Iavg_frame : 0.0f;
+    float slope, ratio, in_min, in_max, out_min, out_max, t;
+
+    switch (ctx->map.intent) {
+    case SWS_INTENT_PERCEPTUAL:
+        st2094_pick_knee(src_max, src_min, src_avg, dst_max, dst_min,
+                         &ctx->src_knee, &ctx->dst_knee);
+
+        /* Solve for linear knee (Pa = 0) */
+        slope = (ctx->dst_knee - dst_min) / (ctx->src_knee - src_min);
+
+        /**
+         * Tune the slope at the knee point slightly: raise it to a user-provided
+         * gamma exponent, multiplied by an extra tuning coefficient designed to
+         * make the slope closer to 1.0 when the difference in peaks is low, and
+         * closer to linear when the difference between peaks is high.
+ */ + ratio = src_max / dst_max - 1.0f; + ratio = av_clipf(SLOPE_TUNING * ratio, SLOPE_OFFSET, 1.0f + SLOPE_OFFSET); + slope = powf(slope, (1.0f - PERCEPTUAL_CONTRAST) * ratio); + + /* Normalize everything the pivot to make the math easier */ + in_min = src_min - ctx->src_knee; + in_max = src_max - ctx->src_knee; + out_min = dst_min - ctx->dst_knee; + out_max = dst_max - ctx->dst_knee; + + /** + * Solve P of order 2 for: + * P(in_min) = out_min + * P'(0.0) = slope + * P(0.0) = 0.0 + */ + ctx->Pa = (out_min - slope * in_min) / (in_min * in_min); + ctx->Pb = slope; + + /** + * Solve Q of order 3 for: + * Q(in_max) = out_max + * Q''(in_max) = 0.0 + * Q(0.0) = 0.0 + * Q'(0.0) = slope + */ + t = 2 * in_max * in_max; + ctx->Qa = (slope * in_max - out_max) / (in_max * t); + ctx->Qb = -3 * (slope * in_max - out_max) / t; + ctx->Qc = slope; + break; + case SWS_INTENT_SATURATION: + /* Linear stretch */ + ctx->I_scale = (dst_max - dst_min) / (src_max - src_min); + ctx->I_offset = dst_min - src_min * ctx->I_scale; + break; + case SWS_INTENT_RELATIVE_COLORIMETRIC: + /* Pure black point adaptation */ + ctx->I_scale = src_max / (src_max - src_min) / + (dst_max / (dst_max - dst_min)); + ctx->I_offset = dst_min - src_min * ctx->I_scale; + break; + case SWS_INTENT_ABSOLUTE_COLORIMETRIC: + /* Hard clip */ + ctx->I_scale = 1.0f; + ctx->I_offset = 0.0f; + break; + } +} + +static av_always_inline IPT tone_map_apply(const CmsCtx *ctx, IPT ipt) +{ + float I = ipt.I, desat; + + if (ctx->map.intent == SWS_INTENT_PERCEPTUAL) { + const float Pa = ctx->Pa, Pb = ctx->Pb; + const float Qa = ctx->Qa, Qb = ctx->Qb, Qc = ctx->Qc; + I -= ctx->src_knee; + I = I > 0 ? ((Qa * I + Qb) * I + Qc) * I : (Pa * I + Pb) * I; + I += ctx->dst_knee; + } else { + I = ctx->I_scale * I + ctx->I_offset; + } + + /** + * Avoids raising saturation excessively when raising brightness, and + * also desaturates when reducing brightness greatly to account for the + * reduction in gamut volume. 
+ */ + desat = fminf(ipt.I / I, hull(I) / hull(ipt.I)); + return (IPT) { + .I = I, + .P = ipt.P * desat, + .T = ipt.T * desat, + }; +} + +static IPT perceptual(const CmsCtx *ctx, IPT ipt) +{ + ICh ich = ipt2ich(ipt); + IPT mapped = rgb2ipt(ipt2rgb(ipt, ctx->tmp.lms2content), ctx->dst.content2lms); + RGB rgb; + float maxRGB; + + /* Protect in gamut region */ + const float maxC = fmaxf(ctx->tmp.peak.C, ctx->dst.peak.C); + float k = smoothstepf(PERCEPTUAL_DEADZONE, 1.0f, ich.C / maxC); + k *= PERCEPTUAL_STRENGTH; + ipt.I = fmixf(ipt.I, mapped.I, k); + ipt.P = fmixf(ipt.P, mapped.P, k); + ipt.T = fmixf(ipt.T, mapped.T, k); + + rgb = ipt2rgb(ipt, ctx->dst.lms2content); + maxRGB = fmaxf(rgb.R, fmaxf(rgb.G, rgb.B)); + rgb.R = fmaxf(softclip(rgb.R, maxRGB, ctx->dst.Lw), ctx->dst.Lb); + rgb.G = fmaxf(softclip(rgb.G, maxRGB, ctx->dst.Lw), ctx->dst.Lb); + rgb.B = fmaxf(softclip(rgb.B, maxRGB, ctx->dst.Lw), ctx->dst.Lb); + + return rgb2ipt(rgb, ctx->dst.content2lms); +} + +static IPT relative(const CmsCtx *ctx, IPT ipt) +{ + return clip_gamma(ipt, COLORIMETRIC_GAMMA, ctx->dst); +} + +static IPT absolute(const CmsCtx *ctx, IPT ipt) +{ + RGB rgb = ipt2rgb(ipt, ctx->dst.lms2encoding); + float c[3] = { rgb.R, rgb.G, rgb.B }; + ff_sws_matrix3x3_apply(&ctx->adaptation, c); + ipt = rgb2ipt((RGB) { c[0], c[1], c[2] }, ctx->dst.encoding2lms); + + return clip_gamma(ipt, COLORIMETRIC_GAMMA, ctx->dst); +} + +static IPT saturation(const CmsCtx * ctx, IPT ipt) +{ + RGB rgb = ipt2rgb(ipt, ctx->tmp.lms2content); + return rgb2ipt(rgb, ctx->dst.content2lms); +} + +static av_always_inline av_const uint16_t av_round16f(float x) +{ + return av_clip_uint16(x * (UINT16_MAX - 1) + 0.5f); +} + +/* Call this whenever the hue changes inside the loop body */ +static av_always_inline void update_hue_peaks(CmsCtx *ctx, float P, float T) +{ + const float hue = atan2f(T, P); + switch (ctx->map.intent) { + case SWS_INTENT_PERCEPTUAL: + ctx->tmp.peak = saturate(hue, ctx->tmp); + /* fall through */ + case 
SWS_INTENT_RELATIVE_COLORIMETRIC: + case SWS_INTENT_ABSOLUTE_COLORIMETRIC: + ctx->dst.peak = saturate(hue, ctx->dst); + return; + default: + return; + } +} + +static void generate_slice(void *priv, int jobnr, int threadnr, int nb_jobs, + int nb_threads) +{ + CmsCtx ctx = *(const CmsCtx *) priv; + + const int slice_start = jobnr * ctx.slice_size; + const int slice_stride = ctx.size_input * ctx.size_input; + const int slice_end = FFMIN((jobnr + 1) * ctx.slice_size, ctx.size_input); + v3u16_t *input = &ctx.input[slice_start * slice_stride]; + + const int output_slice_h = (ctx.size_output_PT + nb_jobs - 1) / nb_jobs; + const int output_start = jobnr * output_slice_h; + const int output_stride = ctx.size_output_PT * ctx.size_output_I; + const int output_end = FFMIN((jobnr + 1) * output_slice_h, ctx.size_output_PT); + v3u16_t *output = ctx.output ? &ctx.output[output_start * output_stride] : NULL; + + const float I_scale = 1.0f / (ctx.src.Imax - ctx.src.Imin); + const float I_offset = -ctx.src.Imin * I_scale; + const float PT_offset = (float) (1 << 15) / (UINT16_MAX - 1); + + const float input_scale = 1.0f / (ctx.size_input - 1); + const float output_scale_PT = 1.0f / (ctx.size_output_PT - 1); + const float output_scale_I = (ctx.tmp.Imax - ctx.tmp.Imin) / + (ctx.size_output_I - 1); + + for (int Bx = slice_start; Bx < slice_end; Bx++) { + const float B = input_scale * Bx; + for (int Gx = 0; Gx < ctx.size_input; Gx++) { + const float G = input_scale * Gx; + for (int Rx = 0; Rx < ctx.size_input; Rx++) { + double c[3] = { input_scale * Rx, G, B }; + RGB rgb; + IPT ipt; + + ctx.src.eotf(ctx.src.Lw, ctx.src.Lb, c); + rgb = (RGB) { c[0], c[1], c[2] }; + ipt = rgb2ipt(rgb, ctx.src.encoding2lms); + + if (output) { + /* Save intermediate value to 3DLUT */ + *input++ = (v3u16_t) { + av_round16f(I_scale * ipt.I + I_offset), + av_round16f(ipt.P + PT_offset), + av_round16f(ipt.T + PT_offset), + }; + } else { + update_hue_peaks(&ctx, ipt.P, ipt.T); + + ipt = tone_map_apply(&ctx, ipt); 
+ ipt = ctx.adapt_colors(&ctx, ipt); + rgb = ipt2rgb(ipt, ctx.dst.lms2encoding); + + c[0] = rgb.R; + c[1] = rgb.G; + c[2] = rgb.B; + ctx.dst.eotf_inv(ctx.dst.Lw, ctx.dst.Lb, c); + *input++ = (v3u16_t) { + av_round16f(c[0]), + av_round16f(c[1]), + av_round16f(c[2]), + }; + } + } + } + } + + if (!output) + return; + + /* Generate split gamut mapping LUT */ + for (int Tx = output_start; Tx < output_end; Tx++) { + const float T = output_scale_PT * Tx - PT_offset; + for (int Px = 0; Px < ctx.size_output_PT; Px++) { + const float P = output_scale_PT * Px - PT_offset; + update_hue_peaks(&ctx, P, T); + + for (int Ix = 0; Ix < ctx.size_output_I; Ix++) { + const float I = output_scale_I * Ix + ctx.tmp.Imin; + IPT ipt = ctx.adapt_colors(&ctx, (IPT) { I, P, T }); + RGB rgb = ipt2rgb(ipt, ctx.dst.lms2encoding); + double c[3] = { rgb.R, rgb.G, rgb.B }; + ctx.dst.eotf_inv(ctx.dst.Lw, ctx.dst.Lb, c); + *output++ = (v3u16_t) { + av_round16f(c[0]), + av_round16f(c[1]), + av_round16f(c[2]), + }; + } + } + } +} + +int sws_color_map_generate_static(v3u16_t *lut, int size, const SwsColorMap *map) +{ + return sws_color_map_generate_dynamic(lut, NULL, size, 1, 1, map); +} + +int sws_color_map_generate_dynamic(v3u16_t *input, v3u16_t *output, + int size_input, int size_I, int size_PT, + const SwsColorMap *map) +{ + AVSliceThread *slicethread; + int ret, num_slices; + + CmsCtx ctx = { + .map = *map, + .input = input, + .output = output, + .size_input = size_input, + .size_output_I = size_I, + .size_output_PT = size_PT, + .src = gamut_from_colorspace(map->src), + .dst = gamut_from_colorspace(map->dst), + }; + + switch (ctx.map.intent) { + case SWS_INTENT_PERCEPTUAL: ctx.adapt_colors = perceptual; break; + case SWS_INTENT_RELATIVE_COLORIMETRIC: ctx.adapt_colors = relative; break; + case SWS_INTENT_SATURATION: ctx.adapt_colors = saturation; break; + case SWS_INTENT_ABSOLUTE_COLORIMETRIC: ctx.adapt_colors = absolute; break; + default: return AVERROR(EINVAL); + } + + if (!output) { + /* Tone 
mapping is handled in a separate step when using dynamic TM */ + tone_map_setup(&ctx, false); + } + + /* Intermediate color space after tone mapping */ + ctx.tmp = ctx.src; + ctx.tmp.Lb = ctx.dst.Lb; + ctx.tmp.Lw = ctx.dst.Lw; + ctx.tmp.Imin = ctx.dst.Imin; + ctx.tmp.Imax = ctx.dst.Imax; + + if (ctx.map.intent == SWS_INTENT_ABSOLUTE_COLORIMETRIC) { + /** + * The IPT transform already implies an explicit white point adaptation + * from src to dst, so to get absolute colorimetric semantics we have + * to explicitly undo this adaptation with a * corresponding inverse. + */ + ctx.adaptation = ff_sws_get_adaptation(&ctx.map.dst.gamut, + ctx.dst.wp, ctx.src.wp); + } + + ret = avpriv_slicethread_create(&slicethread, &ctx, generate_slice, NULL, 0); + if (ret < 0) + return ret; + + ctx.slice_size = (ctx.size_input + ret - 1) / ret; + num_slices = (ctx.size_input + ctx.slice_size - 1) / ctx.slice_size; + avpriv_slicethread_execute(slicethread, num_slices, 0); + avpriv_slicethread_free(&slicethread); + return 0; +} + +void sws_tone_map_generate(v2u16_t *lut, int size, const SwsColorMap *map) +{ + CmsCtx ctx = { + .map = *map, + .src = gamut_from_colorspace(map->src), + .dst = gamut_from_colorspace(map->dst), + }; + + const float src_scale = (ctx.src.Imax - ctx.src.Imin) / (size - 1); + const float src_offset = ctx.src.Imin; + const float dst_scale = 1.0f / (ctx.dst.Imax - ctx.dst.Imin); + const float dst_offset = -ctx.dst.Imin * dst_scale; + + tone_map_setup(&ctx, true); + + for (int i = 0; i < size; i++) { + const float I = src_scale * i + src_offset; + IPT ipt = tone_map_apply(&ctx, (IPT) { I, 1.0f }); + lut[i] = (v2u16_t) { + av_round16f(dst_scale * ipt.I + dst_offset), + av_clip_uint16(ipt.P * (1 << 15) + 0.5f), + }; + } +} diff --git a/libswscale/cms.h b/libswscale/cms.h new file mode 100644 index 0000000000..542b25e92e --- /dev/null +++ b/libswscale/cms.h @@ -0,0 +1,105 @@ + /* + * Copyright (C) 2024 Niklas Haas + * + * This file is part of FFmpeg. 
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_CMS_H
+#define SWSCALE_CMS_H
+
+#include <stdbool.h>
+
+#include "libavutil/csp.h"
+
+#include "csputils.h"
+#include "swscale.h"
+#include "utils.h"
+
+/* Minimum, maximum, and default knee point for perceptual tone mapping [0,1] */
+#define PERCEPTUAL_KNEE_MIN 0.10f
+#define PERCEPTUAL_KNEE_MAX 0.80f
+#define PERCEPTUAL_KNEE_DEF 0.40f
+
+/* Ratio between source average and target average. */
+#define PERCEPTUAL_ADAPTATION 0.40f
+
+/* (Relative) chromaticity protection zone for perceptual mapping [0,1] */
+#define PERCEPTUAL_DEADZONE 0.30f
+
+/* Contrast setting for perceptual tone mapping.
[0,1.5] */
+#define PERCEPTUAL_CONTRAST 0.50f
+
+/* Tuning constants for overriding the contrast near extremes */
+#define SLOPE_TUNING 1.50f /* [0,10] */
+#define SLOPE_OFFSET 0.20f /* [0,1] */
+
+/* Strength of the perceptual saturation mapping component [0,1] */
+#define PERCEPTUAL_STRENGTH 0.80f
+
+/* Knee point to use for perceptual soft clipping [0,1] */
+#define SOFTCLIP_KNEE 0.70f
+
+/* I vs C curve gamma to use for colorimetric clipping [0,10] */
+#define COLORIMETRIC_GAMMA 1.80f
+
+/* Struct describing a color mapping operation */
+typedef struct SwsColorMap {
+    SwsColor src;
+    SwsColor dst;
+    SwsIntent intent;
+} SwsColorMap;
+
+/**
+ * Returns true if the given color map is a semantic no-op - that is,
+ * the overall RGB end to end transform would be an identity mapping.
+ */
+bool sws_color_map_noop(const SwsColorMap *map);
+
+/**
+ * Generates a single end-to-end color mapping 3DLUT embedding a static tone
+ * mapping curve.
+ *
+ * Returns 0 on success, or a negative error code on failure.
+ */
+int sws_color_map_generate_static(v3u16_t *lut, int size, const SwsColorMap *map);
+
+/**
+ * Generates a split pair of 3DLUTs, going to IPT and back, allowing an
+ * arbitrary dynamic EETF to be nestled in between these two operations.
+ *
+ * See sws_tone_map_generate().
+ *
+ * Returns 0 on success, or a negative error code on failure.
+ */
+int sws_color_map_generate_dynamic(v3u16_t *input, v3u16_t *output,
+                                   int size_input, int size_I, int size_PT,
+                                   const SwsColorMap *map);
+
+/**
+ * Generate a 1D LUT of size `size` adapting intensity (I) levels from the
+ * source to the destination color space. The LUT is normalized to the
+ * relevant intensity range directly. The second channel of each entry returns
+ * the corresponding 15-bit scaling factor for the P/T channels. The scaling
+ * factor k may be applied as `(1 << 15) - k + (PT * k >> 15)`.
+ *
+ * This is designed to be used with sws_color_map_generate_dynamic().
+ *
+ * This function cannot fail.
+ */
+void sws_tone_map_generate(v2u16_t *lut, int size, const SwsColorMap *map);
+
+#endif // SWSCALE_CMS_H