You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-11-23 21:54:53 +02:00
This commit adds a ProRes RAW hardware implementation written in Vulkan. Both version 0 and version 1 streams are supported. The implementation is highly parallelized: 512 invocations are dispatched for every tile, and a 5.8k stream generally has around 4k tiles. Thanks to unlord for the 8-point iDCT. Benchmark for a generic 5.8k RAW HQ file: 6900XT: 63fps, 7900XTX: 84fps, 6000 Ada: 120fps, Intel: 9fps.
348 lines
9.9 KiB
Plaintext
348 lines
9.9 KiB
Plaintext
/*
|
|
* ProRes RAW decoder
|
|
*
|
|
* Copyright (c) 2025 Lynne <dev@lynne.ee>
|
|
*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
/* Shorthand for constructing a 16-bit signed integer from x. */
#define I16(x) (int16_t(x))
|
|
|
|
/* Local invocation Z index; used below as the first index into btemp/block
 * and to pick the per-component bitstream offset, size and pixel offset. */
#define COMP_ID (gl_LocalInvocationID.z)
|
|
/* Local invocation Y index; used below as the 8x8 block index within a
 * component (second index into btemp/block). */
#define BLOCK_ID (gl_LocalInvocationID.y)
|
|
/* Local invocation X index; passed as the row index to the iDCT passes when
 * PARALLEL_ROWS is defined, and used as the start of the strided loops. */
#define ROW_ID (gl_LocalInvocationID.x)
|
|
|
|
/* Bit reader state; set up in main() via init_get_bits() and consumed by
 * get_value(), read_dc_vals() and read_ac_vals(). */
GetBitContext gb;
|
|
/* Scratch coefficients: written by the entropy decoder, read during
 * dequantization, then reused as the intermediate between the two iDCT passes.
 * NOTE(review): the empty initializer on a shared variable presumably relies
 * on a null-initializer extension enabled elsewhere to zero the array - confirm. */
shared float btemp[gl_WorkGroupSize.z][16][64] = { };
|
|
/* Dequantized coefficients (iDCT input) and, after idct8_vert(), the final
 * samples that main() stores to the destination image. */
shared float block[gl_WorkGroupSize.z][16][64];
|
|
|
|
/*
 * First 1-D pass of the separable 8x8 inverse DCT.
 *
 * Reads the eight values block[COMP_ID][BLOCK_ID][8*row_id + 0..7], runs the
 * scaled 8-point inverse transform on them, and writes the results transposed
 * into btemp[COMP_ID][BLOCK_ID][k*8 + row_id] for k = 0..7.
 *
 * row_id: index (0..7) of the 8-element line to transform.
 */
void idct8_horiz(const uint row_id)
{
    /* sqrt(2), 2*cos(pi/8), 2*(cos(pi/8) - sin(pi/8)), 2*(cos(pi/8) + sin(pi/8)) */
    const float k_sqrt2 = 1.4142135623730950488016887242097f;
    const float k_rot   = 1.8477590650225735122563663787936f;
    const float k_rot_a = 1.0823922002923939687994464107328f;
    const float k_rot_b = 2.6131259297527530557132863468544f;

    const uint base = 8*row_id;

    /* Input */
    const float x0 = block[COMP_ID][BLOCK_ID][base + 0];
    const float x1 = block[COMP_ID][BLOCK_ID][base + 1];
    const float x2 = block[COMP_ID][BLOCK_ID][base + 2];
    const float x3 = block[COMP_ID][BLOCK_ID][base + 3];
    const float x4 = block[COMP_ID][BLOCK_ID][base + 4];
    const float x5 = block[COMP_ID][BLOCK_ID][base + 5];
    const float x6 = block[COMP_ID][BLOCK_ID][base + 6];
    const float x7 = block[COMP_ID][BLOCK_ID][base + 7];

    /* Embedded scaled inverse 4-point Type-II DCT (even-indexed inputs) */
    const float sum04 = x0 + x4;
    const float dif04 = x0 - x4;
    const float sum26 = x2 + x6;
    const float rot26 = (x2 - x6)*(k_sqrt2) - sum26;

    const float even0 = sum04 + sum26;
    const float even3 = sum04 - sum26;
    const float even1 = dif04 + rot26;
    const float even2 = dif04 - rot26;

    /* Embedded scaled inverse 4-point Type-IV DST (odd-indexed inputs) */
    const float sum53 = x5 + x3;
    const float dif53 = x5 - x3;
    const float sum17 = x1 + x7;
    const float dif17 = x1 - x7;

    const float mix_s = (sum17 - sum53)*(k_sqrt2);
    const float mix_r = (dif17 + dif53)*(k_rot);
    const float mix_a = mix_r - dif17*(k_rot_a);
    const float mix_b = mix_r - dif53*(k_rot_b);

    /* Each odd term is built from the previous one */
    const float odd7 = sum17 + sum53;
    const float odd6 = odd7 - mix_b;
    const float odd5 = odd6 + mix_s;
    const float odd4 = odd5 - mix_a;

    /* Butterflies + transposed output */
    btemp[COMP_ID][BLOCK_ID][0*8 + row_id] = even0 + odd7;
    btemp[COMP_ID][BLOCK_ID][1*8 + row_id] = even1 - odd6;
    btemp[COMP_ID][BLOCK_ID][2*8 + row_id] = even2 + odd5;
    btemp[COMP_ID][BLOCK_ID][3*8 + row_id] = even3 - odd4;
    btemp[COMP_ID][BLOCK_ID][4*8 + row_id] = even3 + odd4;
    btemp[COMP_ID][BLOCK_ID][5*8 + row_id] = even2 - odd5;
    btemp[COMP_ID][BLOCK_ID][6*8 + row_id] = even1 + odd6;
    btemp[COMP_ID][BLOCK_ID][7*8 + row_id] = even0 - odd7;
}
|
|
|
|
/*
 * Second 1-D pass of the separable 8x8 inverse DCT.
 *
 * Reads the eight values btemp[COMP_ID][BLOCK_ID][8*row_id + 0..7] (the
 * transposed output of idct8_horiz), runs the same scaled 8-point inverse
 * transform, and writes the results transposed back into
 * block[COMP_ID][BLOCK_ID][k*8 + row_id] for k = 0..7.
 *
 * row_id: index (0..7) of the 8-element line to transform.
 */
void idct8_vert(const uint row_id)
{
    /* sqrt(2), 2*cos(pi/8), 2*(cos(pi/8) - sin(pi/8)), 2*(cos(pi/8) + sin(pi/8)) */
    const float k_sqrt2 = 1.4142135623730950488016887242097f;
    const float k_rot   = 1.8477590650225735122563663787936f;
    const float k_rot_a = 1.0823922002923939687994464107328f;
    const float k_rot_b = 2.6131259297527530557132863468544f;

    const uint base = 8*row_id;

    /* Input. The first value enters all eight outputs with unit weight, so
     * the 0.5 added here offsets every output sample of this pass by 0.5.
     * NOTE(review): presumably a rounding/level bias - confirm. */
    const float x0 = btemp[COMP_ID][BLOCK_ID][base + 0] + 0.5f;
    const float x1 = btemp[COMP_ID][BLOCK_ID][base + 1];
    const float x2 = btemp[COMP_ID][BLOCK_ID][base + 2];
    const float x3 = btemp[COMP_ID][BLOCK_ID][base + 3];
    const float x4 = btemp[COMP_ID][BLOCK_ID][base + 4];
    const float x5 = btemp[COMP_ID][BLOCK_ID][base + 5];
    const float x6 = btemp[COMP_ID][BLOCK_ID][base + 6];
    const float x7 = btemp[COMP_ID][BLOCK_ID][base + 7];

    /* Embedded scaled inverse 4-point Type-II DCT (even-indexed inputs) */
    const float sum04 = x0 + x4;
    const float dif04 = x0 - x4;
    const float sum26 = x2 + x6;
    const float rot26 = (x2 - x6)*(k_sqrt2) - sum26;

    const float even0 = sum04 + sum26;
    const float even3 = sum04 - sum26;
    const float even1 = dif04 + rot26;
    const float even2 = dif04 - rot26;

    /* Embedded scaled inverse 4-point Type-IV DST (odd-indexed inputs) */
    const float sum53 = x5 + x3;
    const float dif53 = x5 - x3;
    const float sum17 = x1 + x7;
    const float dif17 = x1 - x7;

    const float mix_s = (sum17 - sum53)*(k_sqrt2);
    const float mix_r = (dif17 + dif53)*(k_rot);
    const float mix_a = mix_r - dif17*(k_rot_a);
    const float mix_b = mix_r - dif53*(k_rot_b);

    /* Each odd term is built from the previous one */
    const float odd7 = sum17 + sum53;
    const float odd6 = odd7 - mix_b;
    const float odd5 = odd6 + mix_s;
    const float odd4 = odd5 - mix_a;

    /* Butterflies + transposed output */
    block[COMP_ID][BLOCK_ID][0*8 + row_id] = even0 + odd7;
    block[COMP_ID][BLOCK_ID][1*8 + row_id] = even1 - odd6;
    block[COMP_ID][BLOCK_ID][2*8 + row_id] = even2 + odd5;
    block[COMP_ID][BLOCK_ID][3*8 + row_id] = even3 - odd4;
    block[COMP_ID][BLOCK_ID][4*8 + row_id] = even3 + odd4;
    block[COMP_ID][BLOCK_ID][5*8 + row_id] = even2 - odd5;
    block[COMP_ID][BLOCK_ID][6*8 + row_id] = even1 + odd6;
    block[COMP_ID][BLOCK_ID][7*8 + row_id] = even0 - odd7;
}
|
|
|
|
/*
 * Decodes one variable-length code from the global bit reader `gb`.
 *
 * codebook packs three parameters:
 *   bits 0..3  - Rice order (number of suffix bits in the Rice forms)
 *   bits 4..7  - exponent order used by the long (escape) form
 *   bits 8..   - switch threshold: the largest leading-zero count that is
 *                still coded in Rice form
 *
 * Returns the decoded value; returns 0 without consuming any bits when the
 * next 32 bits are all zero. show_bits() is assumed to peek without
 * advancing, since skip_bits() is called separately afterwards.
 */
int16_t get_value(int16_t codebook)
{
    /* Unpack the codebook descriptor */
    const int16_t rice   = codebook & I16(0xf);
    const int16_t expo   = (codebook >> 4) & I16(0xf);
    const int16_t thresh = codebook >> 8;

    const uint32_t window = show_bits(gb, 32);
    if (expectEXT(window == 0, false))
        return I16(0);

    /* Leading bit set: a single '1' followed by `rice` suffix bits */
    if ((window & 0x80000000) != 0) {
        skip_bits(gb, 1 + rice);
        return I16((window & 0x7FFFFFFF) >> (31 - rice));
    }

    /* Number of zero bits in front of the first set bit */
    const int16_t zeros = I16(31) - I16(findMSB(window));

    /* Long form: the code occupies `len` bits in total, taken from the top
     * of the window and rebased past the range covered by the Rice forms */
    if (zeros > thresh) {
        const int16_t len = expo + (zeros << 1) - thresh;
        skip_bits(gb, len);
        return I16((window >> (32 - len)) +
                   ((thresh + 1) << rice) -
                   (1 << expo));
    }

    /* Rice form: `zeros` zero bits, a '1' marker, then `rice` suffix bits */
    skip_bits(gb, zeros + rice + 1);
    return I16((zeros << rice) +
               (((window << (zeros + 1)) >> 1) >> (31 - rice)));
}
|
|
|
|
/* Maps a decoded DC code x to (x + 1) >> 1. The argument is parenthesized so
 * that compound expressions (e.g. `a | b`) are evaluated before the +1. */
#define TODCCODEBOOK(x) (((x) + 1) >> 1)
|
|
|
|
/*
 * Decodes the DC coefficient of every block of the current component and
 * stores it in btemp[COMP_ID][n][0].
 *
 * The first DC is coded on its own (codebook 700) and mapped from an unsigned
 * code to a signed value (even -> code/2, odd -> -(code+1)/2). Every later
 * block codes a difference that is accumulated onto the previous DC.
 * Decoding stops early once the bit reader reports no bits left.
 *
 * nb_blocks: number of blocks to decode DCs for.
 * NOTE(review): btemp has 16 block slots; this assumes nb_blocks <= 16 - confirm
 * against the caller/tile geometry.
 */
void read_dc_vals(const uint nb_blocks)
{
    /* Special handling for first block */
    int16_t code    = get_value(I16(700));
    int16_t running = (code >> 1) ^ -(code & I16(1));
    int16_t neg     = I16(0);

    btemp[COMP_ID][0][0] = running;

    for (uint blk = 1; blk < nb_blocks; blk++) {
        if (expectEXT(left_bits(gb) <= 0, false))
            break;

        /* Blocks with (index & 15) == 1 use a fixed codebook; the rest adapt
         * to the previous code, with the table index clamped to 12 */
        uint8_t cb;
        if ((blk & 15) != 1)
            cb = dc_cb[min(TODCCODEBOOK(code), 13 - 1)];
        else
            cb = uint8_t(100);

        code = get_value(cb);

        /* The low bit of the code toggles the sign relative to the sign of
         * the previous difference; the magnitude is (code + 1) >> 1 */
        neg = neg ^ (code & int16_t(1));
        const int16_t delta = (-neg ^ I16(TODCCODEBOOK(code))) + neg;
        neg = I16(delta < 0);
        running += delta;

        btemp[COMP_ID][blk][0] = running;
    }
}
|
|
|
|
/*
 * Decodes the AC coefficients of the current component into btemp.
 *
 * Coefficients are addressed by a single running position `pos`, starting at
 * nb_blocks (just past the DC terms) and bounded by nb_blocks*64. Position
 * `pos` maps to block (pos & block_mask) and coefficient slot
 * (pos >> log2_nb_blocks), i.e. the stream interleaves the blocks.
 *
 * Each outer iteration reads: a count of consecutive coded levels, those
 * levels (each followed by a sign bit), a run of skipped positions, and then
 * one more level + sign. Skipped positions are not written here. The three
 * codebooks adapt to the most recently decoded values.
 *
 * nb_blocks: number of blocks in the component.
 * NOTE(review): the mask/shift mapping looks like it assumes nb_blocks is a
 * power of two - confirm.
 */
void read_ac_vals(const uint nb_blocks)
{
    const uint log2_nb_blocks = findMSB(nb_blocks);
    const uint block_mask     = (1 << log2_nb_blocks) - 1;
    const uint nb_codes       = nb_blocks << 6;

    /* Initial codebooks for levels, zero runs and level-run lengths */
    int16_t level_cb = I16(49);
    int16_t run_cb   = I16( 0);
    int16_t len_cb   = I16(66);

    int16_t level, neg, coeff;

    uint pos = nb_blocks;
    while (pos <= nb_codes) {
        if (expectEXT(left_bits(gb) <= 0, false))
            break;

        /* Run of consecutive coded levels */
        const int16_t nb_levels = get_value(len_cb);
        for (uint i = 0; i < nb_levels; i++) {
            if (expectEXT(left_bits(gb) <= 0, false))
                break;

            if (expectEXT(pos >= nb_codes, false))
                break;

            level = get_value(level_cb);
            neg   = -int16_t(get_bit(gb));
            level_cb = ac_cb[min(level, 95 - 1)];

            /* Magnitude is level + 1; neg is 0 or -1 and applies the sign */
            coeff = ((level + I16(1)) ^ neg) - neg;
            btemp[COMP_ID][pos & block_mask][pos >> log2_nb_blocks] = coeff;

            pos++;
        }

        if (expectEXT(pos >= nb_codes, false))
            break;

        /* Skipped positions: advance past rn + 1 slots */
        const int16_t skipped = get_value(run_cb);
        run_cb = rn_cb[min(skipped, 28 - 1)];

        pos += skipped + 1;
        if (expectEXT(pos >= nb_codes, false))
            break;

        if (expectEXT(left_bits(gb) <= 0, false))
            break;

        /* The level that terminates the skipped run */
        level = get_value(level_cb);
        neg   = -int16_t(get_bit(gb));

        coeff = ((level + I16(1)) ^ neg) - neg;
        btemp[COMP_ID][pos & block_mask][pos >> log2_nb_blocks] = coeff;

        /* Both the level and the run-length codebooks adapt to this level */
        level_cb = ac_cb[min(level, 95 - 1)];
        len_cb   = ln_cb[min(level, 15 - 1)];

        pos++;
    }
}
|
|
|
|
/*
 * Shader entry point: decodes one tile per workgroup.
 *
 * Steps:
 *  1. Locate the tile (tile_data[], indexed row-major by workgroup ID) and
 *     read its 8-byte header: four 16-bit fields, each read with its two
 *     bytes swapped (.yx) - the quantizer scale and three component sizes.
 *     The remaining component's size is whatever is left of the tile.
 *  2. For each component (COMP_ID), the single invocation with BLOCK_ID == 0
 *     and ROW_ID == 0 entropy-decodes all DC and AC coefficients into btemp.
 *  3. All invocations dequantize (scan order, qmat, iDCT pre-scale, qscale)
 *     into block.
 *  4. Two 1-D iDCT passes, separated by barriers.
 *  5. Samples are stored to `dst`, with the four components interleaved on a
 *     2x2 grid: component c lands at offset (c & 1, c >> 1) with stride 2.
 *
 * Both early returns depend only on per-tile data, so a whole workgroup
 * leaves together before any barrier is reached.
 */
void main(void)
{
    const uint tile_idx = gl_WorkGroupID.x + gl_WorkGroupID.y*gl_NumWorkGroups.x;
    TileData td = tile_data[tile_idx];

    /* Tile starts outside of the frame: nothing to do */
    if (expectEXT(td.pos.x >= frame_size.x, false))
        return;

    uint64_t tile_addr = uint64_t(pkt_data) + td.offset;
    u8vec2buf tile_hdr = u8vec2buf(tile_addr);

    float qscale = float(pack16(tile_hdr[0].v.yx)) / 2.0f;

    /* .x starts as the total tile size and becomes the size of the last
     * component once the header (8 bytes) and the other three are removed */
    ivec4 size = ivec4(td.size,
                       pack16(tile_hdr[2].v.yx),
                       pack16(tile_hdr[1].v.yx),
                       pack16(tile_hdr[3].v.yx));
    size.x = size.x - size.y - size.z - size.w - 8;
    if (expectEXT(size.x < 0, false))
        return;

    const ivec2 offs = td.pos + ivec2(COMP_ID & 1, COMP_ID >> 1);

    /* Each component covers half of the (frame-clipped) tile width */
    const uint w = min(tile_size.x, frame_size.x - td.pos.x) / 2;
    const uint nb_blocks = w / 8;

    /* Byte offset of each component's data after the header; in stream order
     * the components are 2, 1, 3, 0 */
    const ivec4 comp_offset = ivec4(size.z + size.y + size.w,
                                    size.z,
                                    0,
                                    size.z + size.y);

    /* Entropy decoding is serial: one invocation per component */
    if (BLOCK_ID == 0 && ROW_ID == 0) {
        init_get_bits(gb, u8buf(tile_addr + 8 + comp_offset[COMP_ID]),
                      size[COMP_ID]);
        read_dc_vals(nb_blocks);
        read_ac_vals(nb_blocks);
    }

    barrier();

    /* Dequantize: btemp (read through scan[]) -> block */
    [[unroll]]
    for (uint k = gl_LocalInvocationID.x; k < 64; k += gl_WorkGroupSize.x) {
        const float coeff  = btemp[COMP_ID][BLOCK_ID][scan[k]] / 16384.0;
        const float qentry = float(qmat[k]) / 295.0;
        block[COMP_ID][BLOCK_ID][k] = coeff * qentry * idct_8x8_scales[k] * qscale;
    }

    barrier();

#ifdef PARALLEL_ROWS
    /* One invocation per row/column of the block */
    idct8_horiz(ROW_ID);

    barrier();

    idct8_vert(ROW_ID);
#else
    /* Each invocation walks all eight rows/columns itself */
    for (uint r = 0; r < 8; r++)
        idct8_horiz(r);

    barrier();

    for (uint r = 0; r < 8; r++)
        idct8_vert(r);
#endif

    barrier();

    /* Write out: blocks sit side by side horizontally, 8 samples each */
    [[unroll]]
    for (uint k = gl_LocalInvocationID.x; k < 64; k += gl_WorkGroupSize.x) {
        const ivec2 sample_pos = ivec2(BLOCK_ID*8 + (k & 7), k >> 3);
        imageStore(dst,
                   offs + 2*sample_pos,
                   vec4(block[COMP_ID][BLOCK_ID][k]));
    }
}
|