You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	lavc: add a ProRes Vulkan hwaccel
Add a shader-based Apple ProRes decoder. It supports all codec features for profiles up to the 4444 XQ profile, i.e.: - 4:2:2 and 4:4:4 chroma subsampling - 10- and 12-bit component depth - Interlacing - Alpha The implementation consists of two shaders: the VLD kernel does entropy decoding for color/alpha, and the IDCT kernel performs the inverse transform on color components. Benchmarks for a 4k yuv422p10 sample: - AMD Radeon 6700XT: 178 fps - Intel i7 Tiger Lake: 37 fps - NVidia Orin Nano: 70 fps
This commit is contained in:
		
							
								
								
									
										2
									
								
								configure
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								configure
									
									
									
									
										vendored
									
									
								
							| @@ -3343,6 +3343,8 @@ prores_videotoolbox_hwaccel_deps="videotoolbox" | ||||
| prores_videotoolbox_hwaccel_select="prores_decoder" | ||||
| prores_raw_vulkan_hwaccel_deps="vulkan spirv_compiler" | ||||
| prores_raw_vulkan_hwaccel_select="prores_raw_decoder" | ||||
| prores_vulkan_hwaccel_deps="vulkan spirv_compiler" | ||||
| prores_vulkan_hwaccel_select="prores_decoder" | ||||
| vc1_d3d11va_hwaccel_deps="d3d11va" | ||||
| vc1_d3d11va_hwaccel_select="vc1_decoder" | ||||
| vc1_d3d11va2_hwaccel_deps="d3d11va" | ||||
|   | ||||
| @@ -1106,6 +1106,7 @@ OBJS-$(CONFIG_VP9_VULKAN_HWACCEL)         += vulkan_decode.o vulkan_vp9.o | ||||
| OBJS-$(CONFIG_VP8_QSV_HWACCEL)            += qsvdec.o | ||||
| OBJS-$(CONFIG_VVC_VAAPI_HWACCEL)          += vaapi_vvc.o | ||||
| OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL)  += vulkan_decode.o vulkan_prores_raw.o | ||||
| OBJS-$(CONFIG_PRORES_VULKAN_HWACCEL)      += vulkan_decode.o vulkan_prores.o | ||||
|  | ||||
| # Objects duplicated from other libraries for shared builds | ||||
| SHLIBOBJS                              += log2_tab.o reverse.o | ||||
| @@ -1350,7 +1351,7 @@ SKIPHEADERS-$(CONFIG_QSVENC)           += qsvenc.h | ||||
| SKIPHEADERS-$(CONFIG_VAAPI)            += vaapi_decode.h vaapi_hevc.h vaapi_encode.h | ||||
| SKIPHEADERS-$(CONFIG_VDPAU)            += vdpau.h vdpau_internal.h | ||||
| SKIPHEADERS-$(CONFIG_VIDEOTOOLBOX)     += videotoolbox.h vt_internal.h | ||||
| SKIPHEADERS-$(CONFIG_VULKAN)           += ffv1_vulkan.h vulkan_video.h \ | ||||
| SKIPHEADERS-$(CONFIG_VULKAN)           += ffv1_vulkan.h prores_vulkan.h vulkan_video.h \ | ||||
|                                           vulkan_encode.h vulkan_decode.h | ||||
| SKIPHEADERS-$(CONFIG_V4L2_M2M)         += v4l2_buffers.h v4l2_context.h v4l2_m2m.h | ||||
| SKIPHEADERS-$(CONFIG_ZLIB)             += zlib_wrapper.h | ||||
|   | ||||
| @@ -68,6 +68,7 @@ extern const struct FFHWAccel ff_mpeg4_vdpau_hwaccel; | ||||
| extern const struct FFHWAccel ff_mpeg4_videotoolbox_hwaccel; | ||||
| extern const struct FFHWAccel ff_prores_videotoolbox_hwaccel; | ||||
| extern const struct FFHWAccel ff_prores_raw_vulkan_hwaccel; | ||||
| extern const struct FFHWAccel ff_prores_vulkan_hwaccel; | ||||
| extern const struct FFHWAccel ff_vc1_d3d11va_hwaccel; | ||||
| extern const struct FFHWAccel ff_vc1_d3d11va2_hwaccel; | ||||
| extern const struct FFHWAccel ff_vc1_d3d12va_hwaccel; | ||||
|   | ||||
| @@ -251,7 +251,7 @@ static int decode_frame_header(ProresContext *ctx, const uint8_t *buf, | ||||
|     } | ||||
|  | ||||
|     if (pix_fmt != ctx->pix_fmt) { | ||||
| #define HWACCEL_MAX (CONFIG_PRORES_VIDEOTOOLBOX_HWACCEL) | ||||
| #define HWACCEL_MAX (CONFIG_PRORES_VIDEOTOOLBOX_HWACCEL + CONFIG_PRORES_VULKAN_HWACCEL) | ||||
| #if HWACCEL_MAX | ||||
|         enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts; | ||||
|         int ret; | ||||
| @@ -260,6 +260,9 @@ static int decode_frame_header(ProresContext *ctx, const uint8_t *buf, | ||||
|  | ||||
| #if CONFIG_PRORES_VIDEOTOOLBOX_HWACCEL | ||||
|         *fmtp++ = AV_PIX_FMT_VIDEOTOOLBOX; | ||||
| #endif | ||||
| #if CONFIG_PRORES_VULKAN_HWACCEL | ||||
|         *fmtp++ = AV_PIX_FMT_VULKAN; | ||||
| #endif | ||||
|         *fmtp++ = ctx->pix_fmt; | ||||
|         *fmtp = AV_PIX_FMT_NONE; | ||||
| @@ -872,6 +875,9 @@ const FFCodec ff_prores_decoder = { | ||||
|     .hw_configs     = (const AVCodecHWConfigInternal *const []) { | ||||
| #if CONFIG_PRORES_VIDEOTOOLBOX_HWACCEL | ||||
|         HWACCEL_VIDEOTOOLBOX(prores), | ||||
| #endif | ||||
| #if CONFIG_PRORES_VULKAN_HWACCEL | ||||
|         HWACCEL_VULKAN(prores), | ||||
| #endif | ||||
|         NULL | ||||
|     }, | ||||
|   | ||||
| @@ -17,6 +17,11 @@ OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL)  +=  vulkan/common.o \ | ||||
| OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan/common.o \ | ||||
|                                             vulkan/prores_raw.o | ||||
|  | ||||
| OBJS-$(CONFIG_PRORES_VULKAN_HWACCEL) += vulkan/common.o \ | ||||
|                                         vulkan/prores_reset.o \ | ||||
|                                         vulkan/prores_vld.o \ | ||||
|                                         vulkan/prores_idct.o | ||||
|  | ||||
| VULKAN = $(subst $(SRC_PATH)/,,$(wildcard $(SRC_PATH)/libavcodec/vulkan/*.comp)) | ||||
| .SECONDARY: $(VULKAN:.comp=.c) | ||||
| libavcodec/vulkan/%.c: TAG = VULKAN | ||||
|   | ||||
							
								
								
									
										123
									
								
								libavcodec/vulkan/prores_idct.comp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										123
									
								
								libavcodec/vulkan/prores_idct.comp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,123 @@ | ||||
| /* | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| /* | ||||
|  * Workgroup-shared coefficient storage for two macroblocks: 4*2 = 8 blocks | ||||
|  * of 8x8 values. Each 8-entry row is stored with a stride of 8+1 = 9 floats; | ||||
|  * the extra entry is padding to avoid bank conflicts. | ||||
|  */ | ||||
| shared float blocks[4*2][8*(8+1)]; | ||||
| /** | ||||
|  * Read the x component of the texel at pos from image dst[tex_idx]. | ||||
|  * When INTERLACED is defined, pos.y is treated as a row inside the current | ||||
|  * field and is mapped to image row 2 * pos.y + bottom_field. | ||||
|  */ | ||||
| uint get_px(uint tex_idx, ivec2 pos) | ||||
| { | ||||
| #ifndef INTERLACED | ||||
|     return imageLoad(dst[tex_idx], pos).x; | ||||
| #else | ||||
|     return imageLoad(dst[tex_idx], ivec2(pos.x, (pos.y << 1) + bottom_field)).x; | ||||
| #endif | ||||
| } | ||||
| /** | ||||
|  * Store v (replicated into a uvec4) at pos in image dst[tex_idx]. | ||||
|  * When INTERLACED is defined, pos.y is treated as a row inside the current | ||||
|  * field and is mapped to image row 2 * pos.y + bottom_field. | ||||
|  */ | ||||
| void put_px(uint tex_idx, ivec2 pos, uint v) | ||||
| { | ||||
| #ifndef INTERLACED | ||||
|     imageStore(dst[tex_idx], pos, uvec4(v)); | ||||
| #else | ||||
|     imageStore(dst[tex_idx], ivec2(pos.x, (pos.y << 1) + bottom_field), uvec4(v)); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * 7.4 Inverse Transform | ||||
|  * | ||||
|  * In-place, floating-point 8-point inverse transform of the eight entries | ||||
|  * blocks[block][k*stride + offset], k = 0..7. The caller picks a row or a | ||||
|  * column of a block through offset and stride. | ||||
|  */ | ||||
| void idct(uint block, uint offset, uint stride) | ||||
| { | ||||
|     float c0 = blocks[block][0*stride + offset]; | ||||
|     float c1 = blocks[block][1*stride + offset]; | ||||
|     float c2 = blocks[block][2*stride + offset]; | ||||
|     float c3 = blocks[block][3*stride + offset]; | ||||
|     float c4 = blocks[block][4*stride + offset]; | ||||
|     float c5 = blocks[block][5*stride + offset]; | ||||
|     float c6 = blocks[block][6*stride + offset]; | ||||
|     float c7 = blocks[block][7*stride + offset]; | ||||
|     /* Even-indexed inputs (c0, c2, c4, c6) are combined into a1..a4 */ | ||||
|     float tmp1 = c6 * 1.4142134189605712891 + (c2 - c6); | ||||
|     float tmp2 = c6 * 1.4142134189605712891 - (c2 - c6); | ||||
|  | ||||
|     float a1 = (c0 + c4) * 0.35355341434478759766 + tmp1 * 0.46193981170654296875; | ||||
|     float a4 = (c0 + c4) * 0.35355341434478759766 - tmp1 * 0.46193981170654296875; | ||||
|  | ||||
|     float a3 = (c0 - c4) * 0.35355341434478759766 + tmp2 * 0.19134169816970825195; | ||||
|     float a2 = (c0 - c4) * 0.35355341434478759766 - tmp2 * 0.19134169816970825195; | ||||
|     /* Odd-indexed inputs (c1, c3, c5, c7) are combined into m1..m4 */ | ||||
|     float tmp3 = (c3 - c5) * 0.70710682868957519531 + c7; | ||||
|     float tmp4 = (c3 - c5) * 0.70710682868957519531 - c7; | ||||
|  | ||||
|     float tmp5 = (c5 - c7) *  1.4142134189605712891 + (c5 - c7) + (c1 - c3); | ||||
|     float tmp6 = (c5 - c7) * -1.4142134189605712891 + (c5 - c7) + (c1 - c3); | ||||
|  | ||||
|     float m1 = tmp3 *  2.6131260395050048828 + tmp5; | ||||
|     float m4 = tmp3 * -2.6131260395050048828 + tmp5; | ||||
|  | ||||
|     float m2 = tmp4 *  1.0823919773101806641 + tmp6; | ||||
|     float m3 = tmp4 * -1.0823919773101806641 + tmp6; | ||||
|     /* Final butterflies: outputs k and 7-k are a +/- (scaled m), written back in place */ | ||||
|     blocks[block][0*stride + offset] = m1 *  0.49039259552955627441  + a1; | ||||
|     blocks[block][7*stride + offset] = m1 * -0.49039259552955627441  + a1; | ||||
|     blocks[block][1*stride + offset] = m2 *  0.41573479771614074707  + a2; | ||||
|     blocks[block][6*stride + offset] = m2 * -0.41573479771614074707  + a2; | ||||
|     blocks[block][2*stride + offset] = m3 *  0.27778509259223937988  + a3; | ||||
|     blocks[block][5*stride + offset] = m3 * -0.27778509259223937988  + a3; | ||||
|     blocks[block][3*stride + offset] = m4 *  0.097545139491558074951 + a4; | ||||
|     blocks[block][4*stride + offset] = m4 * -0.097545139491558074951 + a4; | ||||
| } | ||||
| /** | ||||
|  * Entry point of the IDCT shader. Each invocation handles one 8-sample | ||||
|  * column of one 8x8 block of component gid.z: it loads the coefficients | ||||
|  * from dst[comp], multiplies them by the quantization matrix, takes part | ||||
|  * in the row and column transform passes through shared memory, then | ||||
|  * writes the rescaled and clamped samples back to the same positions. | ||||
|  */ | ||||
| void main(void) | ||||
| { | ||||
|     uvec3 gid = gl_GlobalInvocationID, lid = gl_LocalInvocationID; | ||||
|     uint comp = gid.z, block = (lid.y << 2) | (lid.x >> 3), idx = lid.x & 0x7; /* block: slot in blocks[], idx: column inside that block */ | ||||
|     uint chroma_shift = comp != 0 ? log2_chroma_w : 0; | ||||
|     bool act = gid.x < mb_width << (4 - chroma_shift); /* false for invocations beyond the component width */ | ||||
|  | ||||
|     /* Coalesced load of DCT coeffs in shared memory, second part of inverse quantization */ | ||||
|     if (act) { | ||||
|         /** | ||||
|          * According to the spec, indexing an array in push constant memory | ||||
|          * with a non-dynamically uniform value is illegal (section 15.9.1 | ||||
|          * in v1.4.326), so copy the whole matrix locally. | ||||
|          */ | ||||
|         uint8_t[64] qmat = comp == 0 ? qmat_luma : qmat_chroma; | ||||
|         [[unroll]] for (uint i = 0; i < 8; ++i) { | ||||
|             int v = sign_extend(int(get_px(comp, ivec2(gid.x, (gid.y << 3) | i))), 16); | ||||
|             blocks[block][i * 9 + idx] = float(v * int(qmat[(i << 3) + idx])); /* row i, column idx, row stride 9 */ | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /* Row-wise iDCT: this invocation transforms row idx of its block. | ||||
|      * The barriers sit outside the act test, so every invocation reaches them. */ | ||||
|     barrier(); | ||||
|     idct(block, idx * 9, 1); | ||||
|  | ||||
|     /* Column-wise iDCT: this invocation transforms column idx of its block */ | ||||
|     barrier(); | ||||
|     idct(block, idx, 9); | ||||
|  | ||||
|     float fact = 1.0f / (1 << (12 - depth)), off = 1 << (depth - 1); /* scale by 2^(depth - 12), then add half of the output range */ | ||||
|     int maxv = (1 << depth) - 1; | ||||
|  | ||||
|     /* 7.5.1 Color Component Samples. Rescale, clamp and write back to global memory */ | ||||
|     barrier(); | ||||
|     if (act) { | ||||
|         [[unroll]] for (uint i = 0; i < 8; ++i) { | ||||
|             float v = blocks[block][i * 9 + idx] * fact + off; | ||||
|             put_px(comp, ivec2(gid.x, (gid.y << 3) | i), clamp(int(v), 0, maxv)); | ||||
|         } | ||||
|     } | ||||
| } | ||||
							
								
								
									
										38
									
								
								libavcodec/vulkan/prores_reset.comp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								libavcodec/vulkan/prores_reset.comp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,38 @@ | ||||
| /* | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
| /** | ||||
|  * Entry point of the reset shader: writes zero at this invocation's | ||||
|  * position in the luma plane and, when gid.x lies inside the chroma width | ||||
|  * (mb_width << (4 - log2_chroma_w)), in both chroma planes. | ||||
|  * When INTERLACED is defined, only rows of the current field are touched. | ||||
|  */ | ||||
| void main(void) | ||||
| { | ||||
|     uvec3 gid = gl_GlobalInvocationID; | ||||
| #ifndef INTERLACED | ||||
|     ivec2 pos = ivec2(gid); | ||||
| #else | ||||
|     ivec2 pos = ivec2(gid.x, (gid.y << 1) + bottom_field); /* row 2 * gid.y + bottom_field of the frame */ | ||||
| #endif | ||||
|  | ||||
|     /* Clear luma plane */ | ||||
|     imageStore(dst[0], pos, uvec4(0)); | ||||
|  | ||||
|     /* Clear chroma planes (dst[1] and dst[2]) */ | ||||
|     if (gid.x < mb_width << (4 - log2_chroma_w)) { | ||||
|         imageStore(dst[1], pos, uvec4(0)); | ||||
|         imageStore(dst[2], pos, uvec4(0)); | ||||
|     } | ||||
|  | ||||
|     /* Alpha plane doesn't need a clear because it is not sparsely encoded */ | ||||
| } | ||||
							
								
								
									
										317
									
								
								libavcodec/vulkan/prores_vld.comp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										317
									
								
								libavcodec/vulkan/prores_vld.comp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,317 @@ | ||||
| /* | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
| /* Shorthand constructors for 8-bit and 16-bit unsigned values, used for the constant tables in this shader */ | ||||
| #define U8(x)  (uint8_t (x)) | ||||
| #define U16(x) (uint16_t(x)) | ||||
| /** | ||||
|  * Store v (replicated into a uvec4) at pos in image dst[tex_idx]. | ||||
|  * When INTERLACED is defined, pos.y is treated as a row inside the current | ||||
|  * field and is mapped to image row 2 * pos.y + bottom_field. | ||||
|  */ | ||||
| void put_px(uint tex_idx, ivec2 pos, uint v) | ||||
| { | ||||
| #ifndef INTERLACED | ||||
|     imageStore(dst[tex_idx], pos, uvec4(v)); | ||||
| #else | ||||
|     imageStore(dst[tex_idx], ivec2(pos.x, (pos.y << 1) + bottom_field), uvec4(v)); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * 7.5.3 Pixel Arrangement | ||||
|  * | ||||
|  * Map the index of a block inside a slice to the pixel offset of its | ||||
|  * top-left corner (block coordinates << 3). luma is expected to be 0 or 1: | ||||
|  *   luma == 1: x = ((pos & ~2) + 1) >> 1, y = (pos >> 1) & 1 | ||||
|  *   luma == 0: x = pos >> 1,              y = pos & 1 | ||||
|  */ | ||||
| ivec2 pos_to_block(uint pos, uint luma) | ||||
| { | ||||
|     return ivec2((pos & -luma - 2) + luma >> 1, pos >> luma & 1) << 3; | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * 7.1.1.2 Signed Golomb Combination Codes | ||||
|  * | ||||
|  * Map an unsigned code to a signed value kept in a uint (two's complement): | ||||
|  * even x gives x / 2, odd x gives -(x + 1) / 2. | ||||
|  */ | ||||
| uint to_signed(uint x) | ||||
| { | ||||
|     return (x >> 1) ^ -(x & 1); | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * 7.1.1.1 Golomb Combination Codes | ||||
|  * | ||||
|  * Read one codeword from gb and return its unsigned value. | ||||
|  * codebook packs three 4-bit fields: bits 0-3 last_rice_q, bits 4-7 krice, | ||||
|  * bits 8-11 kexp. | ||||
|  */ | ||||
| uint decode_codeword(inout GetBitContext gb, int codebook) | ||||
| { | ||||
|     int last_rice_q = bitfieldExtract(codebook, 0, 4), | ||||
|         krice       = bitfieldExtract(codebook, 4, 4), | ||||
|         kexp        = bitfieldExtract(codebook, 8, 4); | ||||
|  | ||||
|     int q = 31 - findMSB(show_bits(gb, 32)); /* number of leading zero bits in the next 32 bits */ | ||||
|     if (q <= last_rice_q) { | ||||
|         /* Golomb-Rice encoding: q zeros, a 1 bit (masked off below), then krice remainder bits */ | ||||
|         return (get_bits(gb, krice + q + 1) & ~(1 << krice)) + (q << krice); | ||||
|     } else { | ||||
|         /* exp-Golomb encoding, offset so its values start after the last Rice value */ | ||||
|         return get_bits(gb, (q << 1) + kexp - last_rice_q) - (1 << kexp) + ((last_rice_q + 1) << krice); | ||||
|     } | ||||
| } | ||||
|  | ||||
| /** | ||||
|  * Entropy-decode the DC and AC coefficients of one color component | ||||
|  * (gl_GlobalInvocationID.z) of a slice. Every decoded coefficient is | ||||
|  * multiplied by qscale, truncated to 16 bits and stored at its inverse | ||||
|  * scanned position in dst[gid.z]. | ||||
|  * | ||||
|  * gb        bit reader positioned at the start of the component data | ||||
|  * mb_pos    position of the first macroblock of the slice, in macroblocks | ||||
|  * mb_count  number of macroblocks in the slice; must be a power of two, | ||||
|  *           because the block index is extracted with a mask and a shift | ||||
|  * qscale    quantization scale of the slice | ||||
|  */ | ||||
| void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count, uint qscale) | ||||
| { | ||||
|     uvec3 gid = gl_GlobalInvocationID; | ||||
|     uint is_luma = uint(gid.z == 0); | ||||
|     uint chroma_shift = bool(is_luma) ? 0 : log2_chroma_w; | ||||
|  | ||||
|     uint num_blocks = mb_count << (2 - chroma_shift); | ||||
|     ivec2 base_pos = ivec2(mb_pos.x << (4 - chroma_shift), mb_pos.y << 4); | ||||
|  | ||||
|     /* 7.1.1.3 DC Coefficients */ | ||||
|     { | ||||
|         /* First coeff */ | ||||
|         uint c = to_signed(decode_codeword(gb, 0x650)); | ||||
|         put_px(gid.z, base_pos, c * qscale & 0xffff); | ||||
|  | ||||
|         /** | ||||
|          * Table 9, encoded as (last_rice_q << 0) | (krice or kexp << 4) | ((kexp or kexp + 1) << 8) | ||||
|          * According to the SMPTE document, abs(prev_dc_diff) should be used | ||||
|          * to index the table, duplicating the entries removes the abs operation. | ||||
|          */ | ||||
|         const uint16_t dc_codebook[] = { U16(0x100), | ||||
|                                          U16(0x210), U16(0x210), | ||||
|                                          U16(0x321), U16(0x321), | ||||
|                                          U16(0x430), U16(0x430), }; | ||||
|  | ||||
|         uint cw = 5, prev_dc_diff = 0; | ||||
|         for (int i = 1; i < num_blocks; ++i) { | ||||
|             cw = decode_codeword(gb, dc_codebook[min(cw, 6)]); | ||||
|  | ||||
|             /* The sign of the previous difference flips the new one */ | ||||
|             int s = int(prev_dc_diff) >> 31; | ||||
|             c += prev_dc_diff = (to_signed(cw) ^ s) - s; | ||||
|  | ||||
|             put_px(gid.z, base_pos + pos_to_block(i, is_luma), c * qscale & 0xffff); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /* 7.1.1.4 AC Coefficients */ | ||||
|     { | ||||
|         /* Table 10 */ | ||||
|         const uint16_t ac_run_codebook  [] = { U16(0x102), U16(0x102), U16(0x101), U16(0x101), | ||||
|                                                U16(0x100), U16(0x211), U16(0x211), U16(0x211), | ||||
|                                                U16(0x211), U16(0x210), U16(0x210), U16(0x210), | ||||
|                                                U16(0x210), U16(0x210), U16(0x210), U16(0x320), }; | ||||
|  | ||||
|         /* Table 11 */ | ||||
|         const uint16_t ac_level_codebook[] = { U16(0x202), U16(0x101), U16(0x102), U16(0x100), | ||||
|                                                U16(0x210), U16(0x210), U16(0x210), U16(0x210), | ||||
|                                                U16(0x320) }; | ||||
|  | ||||
| #ifndef INTERLACED | ||||
|         /* Figure 4, encoded as (x << 0) | (y << 4) */ | ||||
|         const uint8_t scan_tbl[] = { | ||||
|             U8(0x00), U8(0x01), U8(0x10), U8(0x11), U8(0x02), U8(0x03), U8(0x12), U8(0x13), | ||||
|             U8(0x20), U8(0x21), U8(0x30), U8(0x31), U8(0x22), U8(0x23), U8(0x32), U8(0x33), | ||||
|             U8(0x04), U8(0x05), U8(0x14), U8(0x24), U8(0x15), U8(0x06), U8(0x07), U8(0x16), | ||||
|             U8(0x25), U8(0x34), U8(0x35), U8(0x26), U8(0x17), U8(0x27), U8(0x36), U8(0x37), | ||||
|             U8(0x40), U8(0x41), U8(0x50), U8(0x60), U8(0x51), U8(0x42), U8(0x43), U8(0x52), | ||||
|             U8(0x61), U8(0x70), U8(0x71), U8(0x62), U8(0x53), U8(0x44), U8(0x45), U8(0x54), | ||||
|             U8(0x63), U8(0x72), U8(0x73), U8(0x64), U8(0x55), U8(0x46), U8(0x47), U8(0x56), | ||||
|             U8(0x65), U8(0x74), U8(0x75), U8(0x66), U8(0x57), U8(0x67), U8(0x76), U8(0x77), | ||||
|         }; | ||||
| #else | ||||
|         /* Figure 5 */ | ||||
|         const uint8_t scan_tbl[] = { | ||||
|             U8(0x00), U8(0x10), U8(0x01), U8(0x11), U8(0x20), U8(0x30), U8(0x21), U8(0x31), | ||||
|             U8(0x02), U8(0x12), U8(0x03), U8(0x13), U8(0x22), U8(0x32), U8(0x23), U8(0x33), | ||||
|             U8(0x40), U8(0x50), U8(0x41), U8(0x42), U8(0x51), U8(0x60), U8(0x70), U8(0x61), | ||||
|             U8(0x52), U8(0x43), U8(0x53), U8(0x62), U8(0x71), U8(0x72), U8(0x63), U8(0x73), | ||||
|             U8(0x04), U8(0x14), U8(0x05), U8(0x06), U8(0x15), U8(0x24), U8(0x34), U8(0x25), | ||||
|             U8(0x16), U8(0x07), U8(0x17), U8(0x26), U8(0x35), U8(0x44), U8(0x54), U8(0x45), | ||||
|             U8(0x36), U8(0x27), U8(0x37), U8(0x46), U8(0x55), U8(0x64), U8(0x74), U8(0x65), | ||||
|             U8(0x56), U8(0x47), U8(0x57), U8(0x66), U8(0x75), U8(0x76), U8(0x67), U8(0x77), | ||||
|         }; | ||||
| #endif | ||||
|  | ||||
|         /* Coefficients are interleaved across blocks: the low bits of pos select the block, the high bits the scan index */ | ||||
|         uint block_mask  = num_blocks - 1; | ||||
|         uint block_shift = findLSB(num_blocks); | ||||
|  | ||||
|         uint pos = num_blocks - 1, run = 4, level = 1, s; | ||||
|         while (pos < num_blocks << 6) { | ||||
|             /* Stop when the data is exhausted or only zero padding remains */ | ||||
|             int left = left_bits(gb); | ||||
|             if (left <= 0 || (left < 32 && show_bits(gb, left) == 0)) | ||||
|                 break; | ||||
|  | ||||
|             run   = decode_codeword(gb, ac_run_codebook  [min(run,   15)]); | ||||
|             level = decode_codeword(gb, ac_level_codebook[min(level, 8 )]); | ||||
|             s     = get_bits(gb, 1); | ||||
|  | ||||
|             pos += run + 1; | ||||
|  | ||||
|             /** | ||||
|              * A corrupt run can move pos past the last coefficient of the | ||||
|              * slice. Stop here instead of reading scan_tbl out of bounds | ||||
|              * and storing outside of the slice. | ||||
|              */ | ||||
|             if (pos >= num_blocks << 6) | ||||
|                 break; | ||||
|  | ||||
|             uint bidx  = pos & block_mask, scan = scan_tbl[pos >> block_shift]; | ||||
|             ivec2 spos = pos_to_block(bidx, is_luma); | ||||
|             ivec2 bpos = ivec2(scan & 0xf, scan >> 4); | ||||
|  | ||||
|             /* The coefficient magnitude is level + 1, negated when the sign bit s is set */ | ||||
|             uint c = ((level + 1) ^ -s) + s; | ||||
|             put_px(gid.z, base_pos + spos + bpos, c * qscale & 0xffff); | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * 7.1.2 Scanned Alpha | ||||
|  * | ||||
|  * Decode the run-length coded alpha values of one slice and store them, | ||||
|  * rescaled to the color bit depth, into dst[3]. Values are written in | ||||
|  * raster order over the slice (width mb_count << 4 pixels). | ||||
|  * | ||||
|  * gb        bit reader positioned at the start of the alpha data | ||||
|  * mb_pos    position of the first macroblock of the slice, in macroblocks | ||||
|  * mb_count  number of macroblocks in the slice (used as a power of two) | ||||
|  */ | ||||
| void decode_alpha(in GetBitContext gb, uvec2 mb_pos, uint mb_count) | ||||
| { | ||||
|     uvec3 gid = gl_GlobalInvocationID; | ||||
|  | ||||
|     ivec2 base_pos = ivec2(mb_pos) << 4; | ||||
|     uint block_shift = findMSB(mb_count) + 4, block_mask = (1 << block_shift) - 1; /* log2 of the slice width in pixels, and the matching mask */ | ||||
|  | ||||
|     uint mask = (1 << (4 << alpha_info)) - 1; /* 8-bit mask when alpha_info == 1, 16-bit mask when alpha_info == 2 */ | ||||
|     uint num_values = (mb_count << 4) * min(height - (gid.y << 4), 16); /* slice width times the number of rows, at most 16 */ | ||||
|  | ||||
|     int num_cw_bits  = alpha_info == 1 ? 5 : 8, | ||||
|         num_flc_bits = alpha_info == 1 ? 9 : 17; | ||||
|  | ||||
|     uint alpha_rescale_lshift = alpha_info == 1 ? depth - 8 : 16, | ||||
|          alpha_rescale_rshift = 16 - depth; | ||||
|  | ||||
|     uint alpha = -1; | ||||
|     for (uint pos = 0; pos < num_values;) { | ||||
|         uint diff, run; | ||||
|  | ||||
|         /* Decode the alpha difference and apply it to the running value */ | ||||
|         { | ||||
|             uint bits = show_bits(gb, num_cw_bits), q = num_cw_bits - 1 - findMSB(bits); | ||||
|  | ||||
|             /* Tables 13/14 */ | ||||
|             if (q != 0) { | ||||
|                 uint m = (bits >> 1) + 1, s = bits & 1; /* short code: magnitude m, sign in the lowest bit */ | ||||
|                 diff = (m ^ -s) + s; | ||||
|                 skip_bits(gb, num_cw_bits); | ||||
|             } else { | ||||
|                 diff = get_bits(gb, num_flc_bits); /* leading 1 bit: fixed-length code, the extra top bit is removed by mask below */ | ||||
|             } | ||||
|  | ||||
|             alpha = alpha + diff & mask; | ||||
|         } | ||||
|  | ||||
|         /* Decode run length */ | ||||
|         { | ||||
|             uint bits = show_bits(gb, 5), q = 4 - findMSB(bits); | ||||
|  | ||||
|             /* Table 12 */ | ||||
|             if (q == 0) { | ||||
|                 run = 1; | ||||
|                 skip_bits(gb, 1); | ||||
|             } else if (q <= 4) { | ||||
|                 run = bits + 1; | ||||
|                 skip_bits(gb, 5); | ||||
|             } else { | ||||
|                 run = get_bits(gb, 16) + 1; /* the 5 bits shown were all zero */ | ||||
|             } | ||||
|  | ||||
|             run = min(run, num_values - pos); /* never write past the end of the slice */ | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * FFmpeg doesn't support color and alpha with different precision, | ||||
|          * so we need to rescale to the color range. | ||||
|          */ | ||||
|         uint val = (alpha << alpha_rescale_lshift) | (alpha >> alpha_rescale_rshift); | ||||
|         for (uint end = pos + run; pos < end; ++pos) | ||||
|             put_px(3, base_pos + ivec2(pos & block_mask, pos >> block_shift), val & 0xffff); | ||||
|     } | ||||
| } | ||||
| /** | ||||
|  * Entry point of the VLD shader. One invocation handles one component | ||||
|  * (gid.z: 0..2 color, 3 alpha) of the slice at grid position | ||||
|  * (gid.x, gid.y): it parses the slice header, sets up a bit reader on that | ||||
|  * component's data, computes the slice geometry and runs the matching | ||||
|  * entropy decoder. | ||||
|  */ | ||||
| void main(void) | ||||
| { | ||||
|     uvec3 gid = gl_GlobalInvocationID; | ||||
|     if (gid.x >= slice_width || gid.y >= slice_height) | ||||
|         return; | ||||
|  | ||||
|     uint slice_idx = gid.y * slice_width + gid.x; | ||||
|     uint slice_off  = slice_offsets[slice_idx], | ||||
|          slice_size = slice_offsets[slice_idx + 1] - slice_off; /* size is the distance to the next slice's offset */ | ||||
|  | ||||
|     u8buf bs = u8buf(slice_data + slice_off); | ||||
|  | ||||
|     /* Decode slice header */ | ||||
|     uint hdr_size, y_size, u_size, v_size, a_size; | ||||
|     hdr_size = bs[0].v >> 3; | ||||
|  | ||||
|     /* Table 15 */ | ||||
|     uint qidx   = clamp(bs[1].v, 1, 224), | ||||
|          qscale = qidx > 128 ? (qidx - 96) << 2 : qidx; | ||||
|  | ||||
|     y_size = (uint(bs[2].v) << 8) | bs[3].v; /* 16-bit big-endian sizes */ | ||||
|     u_size = (uint(bs[4].v) << 8) | bs[5].v; | ||||
|  | ||||
|     /** | ||||
|      * The alpha_info field can be 0 even when an alpha plane is present, | ||||
|      * if skip_alpha is enabled, so use the header size instead. | ||||
|      */ | ||||
|     if (hdr_size > 6) | ||||
|         v_size = (uint(bs[6].v) << 8) | bs[7].v; | ||||
|     else | ||||
|         v_size = slice_size - hdr_size - y_size - u_size; | ||||
|  | ||||
|     a_size = slice_size - hdr_size - y_size - u_size - v_size; /* whatever remains after the color data */ | ||||
|  | ||||
|     GetBitContext gb; | ||||
|     switch (gid.z) { | ||||
|         case 0: | ||||
|             init_get_bits(gb, u8buf(bs + hdr_size),                            int(y_size)); | ||||
|             break; | ||||
|         case 1: | ||||
|             init_get_bits(gb, u8buf(bs + hdr_size + y_size),                   int(u_size)); | ||||
|             break; | ||||
|         case 2: | ||||
|             init_get_bits(gb, u8buf(bs + hdr_size + y_size + u_size),          int(v_size)); | ||||
|             break; | ||||
|         case 3: | ||||
|             init_get_bits(gb, u8buf(bs + hdr_size + y_size + u_size + v_size), int(a_size)); | ||||
|             break; | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Support for the grayscale "extension" in the prores_aw encoder. | ||||
|      * According to the spec, entropy coded data should never be empty, | ||||
|      * and instead contain at least the DC coefficients. | ||||
|      * This avoids undefined behavior. | ||||
|      */ | ||||
|     if (left_bits(gb) == 0) | ||||
|         return; | ||||
|  | ||||
|     /** | ||||
|      * 4 ProRes Frame Structure | ||||
|      * ProRes tiles pictures into a grid of slices, whose size is determined | ||||
|      * by the log2_slice_width parameter (height is always 1 MB). | ||||
|      * Each slice has a width of (1 << log2_slice_width) MBs, until the picture | ||||
|      * cannot accommodate a full one. At this point, the remaining space | ||||
|      * is recursively completed using the first smaller power of two that fits | ||||
|      * (see Figure 1). | ||||
|      * The maximum number of extra slices is 3, when log2_slice_width is 3, | ||||
|      * with sizes 4, 2 and 1 MBs. | ||||
|      * The mb_width parameter therefore also represents the number of full slices, | ||||
|      * when interpreted as a fixed-point number with log2_slice_width fractional bits. | ||||
|      */ | ||||
|     uint frac      = bitfieldExtract(uint(mb_width), 0, log2_slice_width), | ||||
|          num_extra = bitCount(frac); /* one extra slice per set bit of the fractional part */ | ||||
|  | ||||
|     uint diff = slice_width - gid.x - 1, | ||||
|          off  = max(int(diff - num_extra + 1) << 2, 0); | ||||
|  | ||||
|     uint log2_width = min(findLSB(frac - diff >> diff) + diff + off, log2_slice_width); | ||||
|  | ||||
|     uint mb_x = (min(gid.x, slice_width - num_extra) << log2_slice_width) + | ||||
|                 (frac & (0xf << log2_width + 1)), | ||||
|          mb_y = gid.y; | ||||
|     uint mb_count = 1 << log2_width; /* always a power of two */ | ||||
|  | ||||
|     if (gid.z < 3) { | ||||
|         /* Color entropy decoding, inverse scanning, first part of inverse quantization */ | ||||
|         decode_comp(gb, uvec2(mb_x, mb_y), mb_count, qscale); | ||||
|     } else { | ||||
|         /* Alpha entropy decoding */ | ||||
|         decode_alpha(gb, uvec2(mb_x, mb_y), mb_count); | ||||
|     } | ||||
| } | ||||
| @@ -26,7 +26,8 @@ | ||||
|  | ||||
| #define DECODER_IS_SDR(codec_id) \ | ||||
|     (((codec_id) == AV_CODEC_ID_FFV1) || \ | ||||
|      ((codec_id) == AV_CODEC_ID_PRORES_RAW)) | ||||
|      ((codec_id) == AV_CODEC_ID_PRORES_RAW) || \ | ||||
|      ((codec_id) == AV_CODEC_ID_PRORES)) | ||||
|  | ||||
| #if CONFIG_H264_VULKAN_HWACCEL | ||||
| extern const FFVulkanDecodeDescriptor ff_vk_dec_h264_desc; | ||||
| @@ -46,6 +47,9 @@ extern const FFVulkanDecodeDescriptor ff_vk_dec_ffv1_desc; | ||||
| #if CONFIG_PRORES_RAW_VULKAN_HWACCEL | ||||
| extern const FFVulkanDecodeDescriptor ff_vk_dec_prores_raw_desc; | ||||
| #endif | ||||
| #if CONFIG_PRORES_VULKAN_HWACCEL | ||||
| extern const FFVulkanDecodeDescriptor ff_vk_dec_prores_desc; | ||||
| #endif | ||||
|  | ||||
| static const FFVulkanDecodeDescriptor *dec_descs[] = { | ||||
| #if CONFIG_H264_VULKAN_HWACCEL | ||||
| @@ -66,6 +70,9 @@ static const FFVulkanDecodeDescriptor *dec_descs[] = { | ||||
| #if CONFIG_PRORES_RAW_VULKAN_HWACCEL | ||||
|     &ff_vk_dec_prores_raw_desc, | ||||
| #endif | ||||
| #if CONFIG_PRORES_VULKAN_HWACCEL | ||||
|     &ff_vk_dec_prores_desc, | ||||
| #endif | ||||
| }; | ||||
|  | ||||
| typedef struct FFVulkanDecodeProfileData { | ||||
|   | ||||
							
								
								
									
										541
									
								
								libavcodec/vulkan_prores.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										541
									
								
								libavcodec/vulkan_prores.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,541 @@ | ||||
| /* | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| #include "proresdec.h" | ||||
| #include "vulkan_decode.h" | ||||
| #include "hwaccel_internal.h" | ||||
| #include "libavutil/mem.h" | ||||
| #include "libavutil/vulkan.h" | ||||
| #include "libavutil/vulkan_loader.h" | ||||
| #include "libavutil/vulkan_spirv.h" | ||||
| /* Shader source strings defined in other translation units. | ||||
|  * NOTE(review): presumably generated from libavcodec/vulkan/*.comp by the build - confirm. */ | ||||
| extern const char *ff_source_common_comp; | ||||
| extern const char *ff_source_prores_reset_comp; | ||||
| extern const char *ff_source_prores_vld_comp; | ||||
| extern const char *ff_source_prores_idct_comp; | ||||
|  | ||||
/**
 * Vulkan decode descriptor for ProRes: ties AV_CODEC_ID_PRORES to a queue
 * with compute capability (the decoder in this file is made of compute
 * shaders).
 */
| const FFVulkanDecodeDescriptor ff_vk_dec_prores_desc = { | ||||
|     .codec_id    = AV_CODEC_ID_PRORES, | ||||
|     .queue_flags = VK_QUEUE_COMPUTE_BIT, | ||||
| }; | ||||
|  | ||||
/**
 * Per-picture private data of the ProRes Vulkan hwaccel.
 */
| typedef struct ProresVulkanDecodePicture { | ||||
/* Generic Vulkan decode picture state (slices buffer, views, semaphore) */
|     FFVulkanDecodePicture vp; | ||||
|  | ||||
/* Host-visible buffer of uint32_t offsets, one per slice plus a final end
 * offset; filled in vk_prores_decode_slice() */
|     AVBufferRef *slice_offset_buf; | ||||
/* Number of slices added so far for this picture */
|     uint32_t slice_num; | ||||
|  | ||||
/* Reset to 0 in vk_prores_start_frame(); not read anywhere else in this file */
|     uint32_t bitstream_start; | ||||
/* Running end offset of the slice data; also passed to the shaders */
|     uint32_t bitstream_size; | ||||
| } ProresVulkanDecodePicture; | ||||
|  | ||||
/**
 * Codec-specific state stored in FFVulkanDecodeShared.sd_ctx.
 */
| typedef struct ProresVulkanDecodeContext { | ||||
/* One set of the three compute shaders per variant; index 1 is built with
 * INTERLACED defined */
|     struct ProresVulkanShaderVariants { | ||||
|         FFVulkanShader reset; | ||||
|         FFVulkanShader vld; | ||||
|         FFVulkanShader idct; | ||||
|     } shaders[2]; /* Progressive/interlaced */ | ||||
|  | ||||
/* Pool the per-picture slice offset buffers are taken from */
|     AVBufferPool *slice_offset_pool; | ||||
| } ProresVulkanDecodeContext; | ||||
|  | ||||
/**
 * Push constants shared by the reset, vld and idct shaders.
 *
 * The member order and types mirror the GLSL push-constant block written by
 * add_push_data(); both must be changed together.
 */
| typedef struct ProresVkParameters { | ||||
/* Device address of the buffer holding the slice data */
|     VkDeviceAddress slice_data; | ||||
|     uint32_t bitstream_size; | ||||
|  | ||||
|     uint16_t width; | ||||
|     uint16_t height; | ||||
|     uint16_t mb_width; | ||||
|     uint16_t mb_height; | ||||
/* Set to slice_count / mb_height, i.e. slices per row */
|     uint16_t slice_width; | ||||
/* Set to mb_height */
|     uint16_t slice_height; | ||||
|     uint8_t  log2_slice_width; | ||||
|     uint8_t  log2_chroma_w; | ||||
|     uint8_t  depth; | ||||
|     uint8_t  alpha_info; | ||||
|     uint8_t  bottom_field; | ||||
|  | ||||
/* Quantisation matrices copied from the ProRes context */
|     uint8_t  qmat_luma  [64]; | ||||
|     uint8_t  qmat_chroma[64]; | ||||
| } ProresVkParameters; | ||||
|  | ||||
| static int vk_prores_start_frame(AVCodecContext          *avctx, | ||||
|                                  const AVBufferRef       *buffer_ref, | ||||
|                                  av_unused const uint8_t *buffer, | ||||
|                                  av_unused uint32_t       size) | ||||
| { | ||||
|     ProresContext             *pr = avctx->priv_data; | ||||
|     FFVulkanDecodeContext    *dec = avctx->internal->hwaccel_priv_data; | ||||
|     FFVulkanDecodeShared     *ctx = dec->shared_ctx; | ||||
|     ProresVulkanDecodeContext *pv = ctx->sd_ctx; | ||||
|     ProresVulkanDecodePicture *pp = pr->hwaccel_picture_private; | ||||
|     FFVulkanDecodePicture     *vp = &pp->vp; | ||||
|  | ||||
|     int err; | ||||
|  | ||||
|     /* Host map the input slices data if supported */ | ||||
|     if (!vp->slices_buf && ctx->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY) | ||||
|         RET(ff_vk_host_map_buffer(&ctx->s, &vp->slices_buf, buffer_ref->data, | ||||
|                                   buffer_ref, | ||||
|                                   VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | | ||||
|                                   VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT)); | ||||
|  | ||||
|     /* Allocate slice offsets buffer */ | ||||
|     RET(ff_vk_get_pooled_buffer(&ctx->s, &pv->slice_offset_pool, | ||||
|                                 &pp->slice_offset_buf, | ||||
|                                 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | | ||||
|                                 VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, | ||||
|                                 NULL, (pr->slice_count + 1) * sizeof(uint32_t), | ||||
|                                 VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | | ||||
|                                 VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)); | ||||
|  | ||||
|     /* Prepare frame to be used */ | ||||
|     RET(ff_vk_decode_prepare_frame_sdr(dec, pr->frame, vp, 1, | ||||
|                                        FF_VK_REP_NATIVE, 0)); | ||||
|  | ||||
|     pp->slice_num = 0; | ||||
|     pp->bitstream_start = pp->bitstream_size = 0; | ||||
|  | ||||
| fail: | ||||
|     return err; | ||||
| } | ||||
|  | ||||
| static int vk_prores_decode_slice(AVCodecContext *avctx, | ||||
|                                   const uint8_t  *data, | ||||
|                                   uint32_t        size) | ||||
| { | ||||
|     ProresContext             *pr = avctx->priv_data; | ||||
|     ProresVulkanDecodePicture *pp = pr->hwaccel_picture_private; | ||||
|     FFVulkanDecodePicture     *vp = &pp->vp; | ||||
|  | ||||
|     FFVkBuffer *slice_offset = (FFVkBuffer *)pp->slice_offset_buf->data; | ||||
|     FFVkBuffer *slices_buf   = vp->slices_buf ? (FFVkBuffer *)vp->slices_buf->data : NULL; | ||||
|  | ||||
|     /* Skip picture header */ | ||||
|     if (slices_buf && slices_buf->host_ref && !pp->slice_num) | ||||
|         pp->bitstream_size = data - slices_buf->mapped_mem; | ||||
|  | ||||
|     AV_WN32(slice_offset->mapped_mem + (pp->slice_num + 0) * sizeof(uint32_t), | ||||
|             pp->bitstream_size); | ||||
|     AV_WN32(slice_offset->mapped_mem + (pp->slice_num + 1) * sizeof(uint32_t), | ||||
|             pp->bitstream_size += size); | ||||
|  | ||||
|     if (!slices_buf || !slices_buf->host_ref) { | ||||
|         int err = ff_vk_decode_add_slice(avctx, vp, data, size, 0, | ||||
|                                          &pp->slice_num, NULL); | ||||
|         if (err < 0) | ||||
|             return err; | ||||
|     } else { | ||||
|         pp->slice_num++; | ||||
|     } | ||||
|  | ||||
|     return 0; | ||||
| } | ||||
|  | ||||
/**
 * hwaccel end_frame callback: record and submit the GPU work for the slices
 * collected for this picture.
 *
 * Fills the push constants from the ProRes context, then records three
 * compute dispatches (reset, vld, idct), each preceded by an image barrier
 * on the output frame, and submits the execution context.
 *
 * Returns 0 without recording anything when no slice was added,
 * AVERROR(EINVAL) when the software pixel format has no descriptor, and
 * otherwise the result of the last helper call checked with RET().
 */
| static int vk_prores_end_frame(AVCodecContext *avctx) | ||||
| { | ||||
|     ProresContext             *pr = avctx->priv_data; | ||||
|     FFVulkanDecodeContext    *dec = avctx->internal->hwaccel_priv_data; | ||||
|     FFVulkanDecodeShared     *ctx = dec->shared_ctx; | ||||
|     FFVulkanFunctions         *vk = &ctx->s.vkfn; | ||||
|     ProresVulkanDecodeContext *pv = ctx->sd_ctx; | ||||
|     ProresVulkanDecodePicture *pp = pr->hwaccel_picture_private; | ||||
|     FFVulkanDecodePicture     *vp = &pp->vp; | ||||
|  | ||||
|     ProresVkParameters pd; | ||||
|     FFVkBuffer *slice_data, *slice_offsets; | ||||
|     struct ProresVulkanShaderVariants *shaders; | ||||
|     VkImageMemoryBarrier2 img_bar[AV_NUM_DATA_POINTERS]; | ||||
/* nb_buf_bar is never incremented in this function, so buf_bar is always
 * passed to the barrier calls with a count of 0 */
|     VkBufferMemoryBarrier2 buf_bar[2]; | ||||
|     int nb_img_bar = 0, nb_buf_bar = 0, err; | ||||
|     const AVPixFmtDescriptor *pix_desc; | ||||
|  | ||||
/* Nothing to decode if no slice was submitted */
|     if (!pp->slice_num) | ||||
|         return 0; | ||||
|  | ||||
|     pix_desc = av_pix_fmt_desc_get(avctx->sw_pix_fmt); | ||||
|     if (!pix_desc) | ||||
|         return AVERROR(EINVAL); | ||||
|  | ||||
/* Keep raw pointers: the AVBufferRefs are handed to the exec context and
 * cleared further down, but the buffers are still referenced afterwards */
|     slice_data    = (FFVkBuffer *)vp->slices_buf->data; | ||||
|     slice_offsets = (FFVkBuffer *)pp->slice_offset_buf->data; | ||||
|  | ||||
/* Variant 0 for frame_type 0, variant 1 (built with INTERLACED) otherwise */
|     shaders = &pv->shaders[pr->frame_type != 0]; | ||||
|  | ||||
|     pd = (ProresVkParameters) { | ||||
|         .slice_data       = slice_data->address, | ||||
|         .bitstream_size   = pp->bitstream_size, | ||||
|  | ||||
|         .width            = avctx->width, | ||||
|         .height           = avctx->height, | ||||
|         .mb_width         = pr->mb_width, | ||||
|         .mb_height        = pr->mb_height, | ||||
|         .slice_width      = pr->slice_count / pr->mb_height, | ||||
|         .slice_height     = pr->mb_height, | ||||
|         .log2_slice_width = av_log2(pr->slice_mb_width), | ||||
|         .log2_chroma_w    = pix_desc->log2_chroma_w, | ||||
|         .depth            = avctx->bits_per_raw_sample, | ||||
|         .alpha_info       = pr->alpha_info, | ||||
|         .bottom_field     = pr->first_field ^ (pr->frame_type == 1), | ||||
|     }; | ||||
|  | ||||
|     memcpy(pd.qmat_luma,   pr->qmat_luma,   sizeof(pd.qmat_luma  )); | ||||
|     memcpy(pd.qmat_chroma, pr->qmat_chroma, sizeof(pd.qmat_chroma)); | ||||
|  | ||||
|     FFVkExecContext *exec = ff_vk_exec_get(&ctx->s, &ctx->exec_pool); | ||||
|     RET(ff_vk_exec_start(&ctx->s, exec)); | ||||
|  | ||||
|     /* Prepare deps */ | ||||
|     RET(ff_vk_exec_add_dep_frame(&ctx->s, exec, pr->frame, | ||||
|                                  VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, | ||||
|                                  VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); | ||||
|  | ||||
|     RET(ff_vk_exec_mirror_sem_value(&ctx->s, exec, &vp->sem, &vp->sem_value, | ||||
|                                     pr->frame)); | ||||
|  | ||||
|     RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, | ||||
|                                (AVBufferRef *[]){ vp->slices_buf, pp->slice_offset_buf }, | ||||
|                                2, 0)); | ||||
|  | ||||
|     /* Transfer ownership to the exec context */ | ||||
|     vp->slices_buf = pp->slice_offset_buf = NULL; | ||||
|  | ||||
|     /* Input frame barrier */ | ||||
|     ff_vk_frame_barrier(&ctx->s, exec, pr->frame, img_bar, &nb_img_bar, | ||||
|                         VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, | ||||
|                         VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, | ||||
|                         VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, | ||||
|                         VK_IMAGE_LAYOUT_GENERAL, | ||||
|                         VK_QUEUE_FAMILY_IGNORED); | ||||
|  | ||||
|     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { | ||||
|         .sType                    = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, | ||||
|         .pBufferMemoryBarriers    = buf_bar, | ||||
|         .bufferMemoryBarrierCount = nb_buf_bar, | ||||
|         .pImageMemoryBarriers     = img_bar, | ||||
|         .imageMemoryBarrierCount  = nb_img_bar, | ||||
|     }); | ||||
|     nb_img_bar = nb_buf_bar = 0; | ||||
|  | ||||
|     /* Reset */ | ||||
|     ff_vk_shader_update_img_array(&ctx->s, exec, &shaders->reset, | ||||
|                                   pr->frame, vp->view.out, | ||||
|                                   0, 0, | ||||
|                                   VK_IMAGE_LAYOUT_GENERAL, | ||||
|                                   VK_NULL_HANDLE); | ||||
|  | ||||
|     ff_vk_shader_update_push_const(&ctx->s, exec, &shaders->reset, | ||||
|                                    VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|                                    0, sizeof(pd), &pd); | ||||
|  | ||||
|     ff_vk_exec_bind_shader(&ctx->s, exec, &shaders->reset); | ||||
|  | ||||
/* The reset shader is created with an 8x8x1 local size, so this launches
 * 16x16 invocations per macroblock */
|     vk->CmdDispatch(exec->buf, pr->mb_width << 1, pr->mb_height << 1, 1); | ||||
|  | ||||
|     /* Input frame barrier after reset */ | ||||
|     ff_vk_frame_barrier(&ctx->s, exec, pr->frame, img_bar, &nb_img_bar, | ||||
|                         VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, | ||||
|                         VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, | ||||
|                         VK_ACCESS_SHADER_WRITE_BIT, | ||||
|                         VK_IMAGE_LAYOUT_GENERAL, | ||||
|                         VK_QUEUE_FAMILY_IGNORED); | ||||
|  | ||||
|     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { | ||||
|         .sType                    = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, | ||||
|         .pBufferMemoryBarriers    = buf_bar, | ||||
|         .bufferMemoryBarrierCount = nb_buf_bar, | ||||
|         .pImageMemoryBarriers     = img_bar, | ||||
|         .imageMemoryBarrierCount  = nb_img_bar, | ||||
|     }); | ||||
|     nb_img_bar = nb_buf_bar = 0; | ||||
|  | ||||
|     /* Entropy decode */ | ||||
|     ff_vk_shader_update_desc_buffer(&ctx->s, exec, &shaders->vld, | ||||
|                                     0, 0, 0, | ||||
|                                     slice_offsets, | ||||
|                                     0, (pp->slice_num + 1) * sizeof(uint32_t), | ||||
|                                     VK_FORMAT_UNDEFINED); | ||||
|     ff_vk_shader_update_img_array(&ctx->s, exec, &shaders->vld, | ||||
|                                   pr->frame, vp->view.out, | ||||
|                                   0, 1, | ||||
|                                   VK_IMAGE_LAYOUT_GENERAL, | ||||
|                                   VK_NULL_HANDLE); | ||||
|  | ||||
|     ff_vk_shader_update_push_const(&ctx->s, exec, &shaders->vld, | ||||
|                                    VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|                                    0, sizeof(pd), &pd); | ||||
|  | ||||
|     ff_vk_exec_bind_shader(&ctx->s, exec, &shaders->vld); | ||||
|  | ||||
/* The vld shader is created with an 8x8x1 local size; the Z dimension is 3,
 * plus one more when alpha_info is non-zero */
|     vk->CmdDispatch(exec->buf, AV_CEIL_RSHIFT(pr->slice_count / pr->mb_height, 3), AV_CEIL_RSHIFT(pr->mb_height, 3), | ||||
|                     3 + !!pr->alpha_info); | ||||
|  | ||||
|     /* Synchronize vld and idct shaders */ | ||||
|     nb_img_bar = 0; | ||||
|     ff_vk_frame_barrier(&ctx->s, exec, pr->frame, img_bar, &nb_img_bar, | ||||
|                         VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, | ||||
|                         VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, | ||||
|                         VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, | ||||
|                         VK_IMAGE_LAYOUT_GENERAL, | ||||
|                         VK_QUEUE_FAMILY_IGNORED); | ||||
|  | ||||
|     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { | ||||
|         .sType                    = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, | ||||
|         .pBufferMemoryBarriers    = buf_bar, | ||||
|         .bufferMemoryBarrierCount = nb_buf_bar, | ||||
|         .pImageMemoryBarriers     = img_bar, | ||||
|         .imageMemoryBarrierCount  = nb_img_bar, | ||||
|     }); | ||||
|     nb_img_bar = nb_buf_bar = 0; | ||||
|  | ||||
|     /* Inverse transform */ | ||||
|     ff_vk_shader_update_img_array(&ctx->s, exec, &shaders->idct, | ||||
|                                   pr->frame, vp->view.out, | ||||
|                                   0, 0, | ||||
|                                   VK_IMAGE_LAYOUT_GENERAL, | ||||
|                                   VK_NULL_HANDLE); | ||||
|  | ||||
|     ff_vk_exec_bind_shader(&ctx->s, exec, &shaders->idct); | ||||
|  | ||||
|     ff_vk_shader_update_push_const(&ctx->s, exec, &shaders->idct, | ||||
|                                    VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|                                    0, sizeof(pd), &pd); | ||||
|  | ||||
/* Z dimension is fixed at 3 here, unlike the vld dispatch which adds one
 * for alpha */
|     vk->CmdDispatch(exec->buf, AV_CEIL_RSHIFT(pr->mb_width, 1), pr->mb_height, 3); | ||||
|  | ||||
|     RET(ff_vk_exec_submit(&ctx->s, exec)); | ||||
|  | ||||
| fail: | ||||
|     return err; | ||||
| } | ||||
|  | ||||
/**
 * Emit the GLSL push-constant block into the shader source and register the
 * matching push-constant range.
 *
 * The members written here correspond one-to-one, in order, to the fields
 * of ProresVkParameters, whose size is used for the range.
 *
 * @param shd shader being built
 * @return the result of ff_vk_shader_add_push_const()
 */
| static int add_push_data(FFVulkanShader *shd) | ||||
| { | ||||
|     GLSLC(0, layout(push_constant, scalar) uniform pushConstants { ); | ||||
|     GLSLC(1,    u8buf    slice_data;                               ); | ||||
|     GLSLC(1,    uint     bitstream_size;                           ); | ||||
|     GLSLC(0,                                                       ); | ||||
|     GLSLC(1,    uint16_t width;                                    ); | ||||
|     GLSLC(1,    uint16_t height;                                   ); | ||||
|     GLSLC(1,    uint16_t mb_width;                                 ); | ||||
|     GLSLC(1,    uint16_t mb_height;                                ); | ||||
|     GLSLC(1,    uint16_t slice_width;                              ); | ||||
|     GLSLC(1,    uint16_t slice_height;                             ); | ||||
|     GLSLC(1,    uint8_t  log2_slice_width;                         ); | ||||
|     GLSLC(1,    uint8_t  log2_chroma_w;                            ); | ||||
|     GLSLC(1,    uint8_t  depth;                                    ); | ||||
|     GLSLC(1,    uint8_t  alpha_info;                               ); | ||||
|     GLSLC(1,    uint8_t  bottom_field;                             ); | ||||
|     GLSLC(0,                                                       ); | ||||
|     GLSLC(1,    uint8_t  qmat_luma  [8*8];                         ); | ||||
|     GLSLC(1,    uint8_t  qmat_chroma[8*8];                         ); | ||||
|     GLSLC(0, };                                                    ); | ||||
|  | ||||
|     return ff_vk_shader_add_push_const(shd, 0, sizeof(ProresVkParameters), | ||||
|                                        VK_SHADER_STAGE_COMPUTE_BIT); | ||||
| } | ||||
|  | ||||
| static int init_shader(AVCodecContext *avctx, FFVulkanContext *s, | ||||
|                        FFVkExecPool *pool, FFVkSPIRVCompiler *spv, | ||||
|                        FFVulkanShader *shd, const char *name, const char *entrypoint, | ||||
|                        FFVulkanDescriptorSetBinding *descs, int num_descs, | ||||
|                        const char *source, int local_size, int interlaced) | ||||
| { | ||||
|     uint8_t *spv_data; | ||||
|     size_t spv_len; | ||||
|     void *spv_opaque = NULL; | ||||
|     int err; | ||||
|  | ||||
|     RET(ff_vk_shader_init(s, shd, name, | ||||
|                           VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|                           (const char *[]) { "GL_EXT_buffer_reference", | ||||
|                                              "GL_EXT_buffer_reference2" }, 2, | ||||
|                           local_size >> 16 & 0xff, local_size >> 8 & 0xff, local_size >> 0 & 0xff, | ||||
|                           0)); | ||||
|  | ||||
|     /* Common code */ | ||||
|     GLSLD(ff_source_common_comp); | ||||
|  | ||||
|     /* Push constants layout */ | ||||
|     RET(add_push_data(shd)); | ||||
|  | ||||
|     RET(ff_vk_shader_add_descriptor_set(s, shd, descs, num_descs, 0, 0)); | ||||
|  | ||||
|     if (interlaced) | ||||
|         av_bprintf(&shd->src, "#define INTERLACED\n"); | ||||
|  | ||||
|     /* Main code */ | ||||
|     GLSLD(source); | ||||
|  | ||||
|     RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, entrypoint, | ||||
|                             &spv_opaque)); | ||||
|     RET(ff_vk_shader_link(s, shd, spv_data, spv_len, entrypoint)); | ||||
|  | ||||
|     RET(ff_vk_shader_register_exec(s, pool, shd)); | ||||
|  | ||||
| fail: | ||||
|     if (spv_opaque) | ||||
|         spv->free_shader(spv, &spv_opaque); | ||||
|  | ||||
|     return 0; | ||||
| } | ||||
|  | ||||
| static void vk_decode_prores_uninit(FFVulkanDecodeShared *ctx) | ||||
| { | ||||
|     ProresVulkanDecodeContext *pv = ctx->sd_ctx; | ||||
|     int i; | ||||
|  | ||||
|     for (i = 0; i < FF_ARRAY_ELEMS(pv->shaders); ++i) { | ||||
|         ff_vk_shader_free(&ctx->s, &pv->shaders[i].reset); | ||||
|         ff_vk_shader_free(&ctx->s, &pv->shaders[i].vld); | ||||
|         ff_vk_shader_free(&ctx->s, &pv->shaders[i].idct); | ||||
|     } | ||||
|  | ||||
|     av_buffer_pool_uninit(&pv->slice_offset_pool); | ||||
|  | ||||
|     av_freep(&pv); | ||||
| } | ||||
|  | ||||
| static int vk_decode_prores_init(AVCodecContext *avctx) | ||||
| { | ||||
|     FFVulkanDecodeContext        *dec = avctx->internal->hwaccel_priv_data; | ||||
|     FFVulkanDecodeShared         *ctx = NULL; | ||||
|  | ||||
|     AVHWFramesContext *out_frames_ctx; | ||||
|     ProresVulkanDecodeContext *pv; | ||||
|     FFVkSPIRVCompiler *spv; | ||||
|     FFVulkanDescriptorSetBinding *desc_set; | ||||
|     int max_num_slices, i, err; | ||||
|  | ||||
|     max_num_slices = (avctx->coded_width >> 4) * (avctx->coded_height >> 4); | ||||
|  | ||||
|     spv = ff_vk_spirv_init(); | ||||
|     if (!spv) { | ||||
|         av_log(avctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n"); | ||||
|         return AVERROR_EXTERNAL; | ||||
|     } | ||||
|  | ||||
|     err = ff_vk_decode_init(avctx); | ||||
|     if (err < 0) | ||||
|         return err; | ||||
|     ctx = dec->shared_ctx; | ||||
|  | ||||
|     pv = ctx->sd_ctx = av_mallocz(sizeof(*pv)); | ||||
|     if (!pv) { | ||||
|         err = AVERROR(ENOMEM); | ||||
|         goto fail; | ||||
|     } | ||||
|  | ||||
|     out_frames_ctx = (AVHWFramesContext *)avctx->hw_frames_ctx->data; | ||||
|  | ||||
|     ctx->sd_ctx_free = vk_decode_prores_uninit; | ||||
|  | ||||
|     for (i = 0; i < FF_ARRAY_ELEMS(pv->shaders); ++i) { /* Progressive/interlaced */ | ||||
|         struct ProresVulkanShaderVariants *shaders = &pv->shaders[i]; | ||||
|  | ||||
|         desc_set = (FFVulkanDescriptorSetBinding []) { | ||||
|             { | ||||
|                 .name       = "dst", | ||||
|                 .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, | ||||
|                 .dimensions = 2, | ||||
|                 .mem_layout = ff_vk_shader_rep_fmt(out_frames_ctx->sw_format, | ||||
|                                                    FF_VK_REP_NATIVE), | ||||
|                 .mem_quali  = "writeonly", | ||||
|                 .elems      = av_pix_fmt_count_planes(out_frames_ctx->sw_format), | ||||
|                 .stages     = VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|             }, | ||||
|         }; | ||||
|         RET(init_shader(avctx, &ctx->s, &ctx->exec_pool, spv, &shaders->reset, | ||||
|                         "prores_dec_reset", "main", desc_set, 1, | ||||
|                         ff_source_prores_reset_comp, 0x080801, i)); | ||||
|  | ||||
|         desc_set = (FFVulkanDescriptorSetBinding []) { | ||||
|             { | ||||
|                 .name        = "slice_offsets_buf", | ||||
|                 .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||||
|                 .stages      = VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|                 .mem_quali   = "readonly", | ||||
|                 .buf_content = "uint32_t slice_offsets", | ||||
|                 .buf_elems   = max_num_slices + 1, | ||||
|             }, | ||||
|             { | ||||
|                 .name       = "dst", | ||||
|                 .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, | ||||
|                 .dimensions = 2, | ||||
|                 .mem_layout = ff_vk_shader_rep_fmt(out_frames_ctx->sw_format, | ||||
|                                                    FF_VK_REP_NATIVE), | ||||
|                 .mem_quali  = "writeonly", | ||||
|                 .elems      = av_pix_fmt_count_planes(out_frames_ctx->sw_format), | ||||
|                 .stages     = VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|             }, | ||||
|         }; | ||||
|         RET(init_shader(avctx, &ctx->s, &ctx->exec_pool, spv, &shaders->vld, | ||||
|                         "prores_dec_vld", "main", desc_set, 2, | ||||
|                         ff_source_prores_vld_comp, 0x080801, i)); | ||||
|  | ||||
|         desc_set = (FFVulkanDescriptorSetBinding []) { | ||||
|             { | ||||
|                 .name       = "dst", | ||||
|                 .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, | ||||
|                 .dimensions = 2, | ||||
|                 .mem_layout = ff_vk_shader_rep_fmt(out_frames_ctx->sw_format, | ||||
|                                                    FF_VK_REP_NATIVE), | ||||
|                 .elems      = av_pix_fmt_count_planes(out_frames_ctx->sw_format), | ||||
|                 .stages     = VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|             }, | ||||
|         }; | ||||
|         RET(init_shader(avctx, &ctx->s, &ctx->exec_pool, spv, &shaders->idct, | ||||
|                         "prores_dec_idct", "main", desc_set, 1, | ||||
|                         ff_source_prores_idct_comp, 0x200201, i)); | ||||
|     } | ||||
|  | ||||
|     err = 0; | ||||
|  | ||||
| fail: | ||||
|     spv->uninit(&spv); | ||||
|  | ||||
|     return err; | ||||
| } | ||||
|  | ||||
| static void vk_prores_free_frame_priv(AVRefStructOpaque _hwctx, void *data) | ||||
| { | ||||
|     AVHWDeviceContext    *dev_ctx = _hwctx.nc; | ||||
|     ProresVulkanDecodePicture *pp = data; | ||||
|  | ||||
|     ff_vk_decode_free_frame(dev_ctx, &pp->vp); | ||||
| } | ||||
|  | ||||
/**
 * hwaccel definition for ProRes decoding into AV_PIX_FMT_VULKAN frames.
 *
 * start_frame, decode_slice, end_frame, free_frame_priv and init are
 * implemented in this file; the remaining ff_vk_* callbacks are not defined
 * in this part of the file.
 */
| const FFHWAccel ff_prores_vulkan_hwaccel = { | ||||
|     .p.name                = "prores_vulkan", | ||||
|     .p.type                = AVMEDIA_TYPE_VIDEO, | ||||
|     .p.id                  = AV_CODEC_ID_PRORES, | ||||
|     .p.pix_fmt             = AV_PIX_FMT_VULKAN, | ||||
|     .start_frame           = &vk_prores_start_frame, | ||||
|     .decode_slice          = &vk_prores_decode_slice, | ||||
|     .end_frame             = &vk_prores_end_frame, | ||||
|     .free_frame_priv       = &vk_prores_free_frame_priv, | ||||
|     .frame_priv_data_size  = sizeof(ProresVulkanDecodePicture), | ||||
|     .init                  = &vk_decode_prores_init, | ||||
|     .update_thread_context = &ff_vk_update_thread_context, | ||||
|     .decode_params         = &ff_vk_params_invalidate, | ||||
|     .flush                 = &ff_vk_decode_flush, | ||||
|     .uninit                = &ff_vk_decode_uninit, | ||||
|     .frame_params          = &ff_vk_frame_params, | ||||
|     .priv_data_size        = sizeof(FFVulkanDecodeContext), | ||||
|     .caps_internal         = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_THREAD_SAFE, | ||||
| }; | ||||
		Reference in New Issue
	
	Block a user