FFmpeg/libavcodec/vulkan/prores_idct.comp

/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * Read the first channel of one texel from the image dst[tex_idx].
 *
 * When INTERLACED is defined, pos.y is remapped to row
 * (pos.y << 1) + bottom_field before the load, so consecutive values of
 * pos.y address every second row of the image.
 * NOTE(review): bottom_field is declared elsewhere; presumably 0 selects the
 * top field and 1 the bottom field - confirm against its declaration.
 */
uint get_px(uint tex_idx, ivec2 pos)
{
#ifdef INTERLACED
    ivec2 coord = ivec2(pos.x, (pos.y << 1) + bottom_field);
#else
    ivec2 coord = pos;
#endif
    return imageLoad(dst[tex_idx], coord).x;
}

/**
 * Store v, replicated to all four channels, into one texel of dst[tex_idx].
 *
 * When INTERLACED is defined, pos.y is remapped to row
 * (pos.y << 1) + bottom_field before the store, mirroring the addressing
 * used when reading.
 * NOTE(review): bottom_field is declared elsewhere; presumably 0 selects the
 * top field and 1 the bottom field - confirm against its declaration.
 */
void put_px(uint tex_idx, ivec2 pos, uint v)
{
#ifdef INTERLACED
    ivec2 coord = ivec2(pos.x, (pos.y << 1) + bottom_field);
#else
    ivec2 coord = pos;
#endif
    imageStore(dst[tex_idx], coord, uvec4(v));
}

/**
 * Inverse-quantize and inverse-transform 8x8 blocks of DCT coefficients in
 * place.
 *
 * Each invocation handles one column of one 8x8 block: it reads that column's
 * eight coefficients from dst[comp], dequantizes them into the shared `blocks`
 * array, takes part in a column pass and a row pass of idct8(), and finally
 * writes eight samples back to the very same texel positions.
 *
 * gl_GlobalInvocationID.x is used as the pixel column, .y as the row of
 * 8x8 blocks (multiplied by 8 to get a pixel row) and .z as the component.
 *
 * NOTE(review): blocks, idct8(), idct_scale, sign_extend() and the parameters
 * mb_width, log2_chroma_w, depth, qmat_luma, qmat_chroma and quant_idx are
 * declared outside this view; comments about them below describe how they
 * are used here, not how they are defined.
 */
void main(void)
{
    uvec3 gid = gl_GlobalInvocationID, lid = gl_LocalInvocationID;
    /*
     * block: first index into the shared `blocks` array for this invocation,
     * idx:   column (0..7) inside that 8x8 block.
     * NOTE(review): the (lid.y << 2) term suggests four blocks per workgroup
     * row, i.e. a local size of 32 in x - confirm against the layout.
     */
    uint comp = gid.z, block = (lid.y << 2) | (lid.x >> 3), idx = lid.x & 0x7;
    /* Only components other than 0 are narrowed horizontally by log2_chroma_w */
    uint chroma_shift = comp != 0 ? log2_chroma_w : 0;
    /*
     * True when gid.x lies inside the component's width, computed as
     * mb_width << 4 for component 0 (presumably 16 pixels per macroblock)
     * and reduced by chroma_shift otherwise. Inactive invocations skip the
     * loads and stores but still execute idct8() and both barriers below.
     */
    bool act = gid.x < mb_width << (4 - chroma_shift);

    /**
     * Normalize coefficients to [-1, 1] for increased precision during the iDCT.
     * DCT coeffs have the range of a 12-bit signed integer (7.4 Inverse Transform).
     */
    const float norm = 1.0f / (1 << 11);

    /* Coalesced load of DCT coeffs in shared memory, inverse quantization */
    if (act) {
        /**
         * According to the VK spec indexing an array in push constant memory with
         * a non-dynamically uniform value is illegal ($15.9.1 in v1.4.326),
         * so copy the whole matrix locally.
         */
        uint8_t[64] qmat = comp == 0 ? qmat_luma : qmat_chroma;

        /* Table 15 */
        /*
         * One quantization index per macroblock: gid.y >> 1 is the macroblock
         * row (two rows of 8x8 blocks each), gid.x >> (4 - chroma_shift) the
         * macroblock column. Indices above 128 are expanded to
         * (qidx - 96) * 4, smaller ones are used as the scale directly.
         */
        uint8_t qidx = quant_idx[(gid.y >> 1) * mb_width + (gid.x >> (4 - chroma_shift))];
        int qscale = qidx > 128 ? (qidx - 96) << 2 : qidx;

        [[unroll]] for (uint i = 0; i < 8; ++i) {
            /* Position of this coefficient in the 64-entry, row-major tables */
            uint cidx = (i << 3) + idx;
            /* Presumably the texel holds a 16-bit two's complement value - confirm sign_extend() */
            int   c = sign_extend(int(get_px(comp, ivec2(gid.x, (gid.y << 3) + i))), 16);
            float v = float(c * qscale * int(qmat[cidx])) * norm;
            /*
             * Rows are stored 9 floats apart although only 8 are used.
             * NOTE(review): the extra element looks like padding against
             * shared-memory bank conflicts - confirm.
             */
            blocks[block][i * 9 + idx] = v * idct_scale[cidx];
        }
    }

    /* Column-wise iDCT */
    /*
     * Called with start offset idx and step 9, i.e. the same elements this
     * invocation wrote in the loop above.
     * NOTE(review): presumably that is why no barrier precedes this call -
     * confirm that idct8() touches only those eight elements.
     */
    idct8(block, idx, 9);
    /* The code below accesses elements written by other invocations of the block */
    barrier();

    /* Remap [-1, 1] to [0, 2] to remove a per-element addition in the output loop */
    /* Each invocation bumps the first element of row idx only */
    blocks[block][idx * 9] += 1.0f;

    /* Row-wise iDCT */
    idct8(block, idx * 9, 1);
    /* The output loop reads column idx again, which spans all eight rows */
    barrier();

    /* Scale by half the sample range (2^(depth - 1)); maxv is the largest depth-bit value */
    float fact = 1 << (depth - 1);
    int maxv = (1 << depth) - 1;

    /* 7.5.1 Color Component Samples. Rescale, clamp and write back to global memory */
    if (act) {
        [[unroll]] for (uint i = 0; i < 8; ++i) {
            float v = round(blocks[block][i * 9 + idx] * fact);
            /* Overwrites the coefficient that was read from this position */
            put_px(comp, ivec2(gid.x, (gid.y << 3) + i), clamp(int(v), 0, maxv));
        }
    }
}