You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-08-04 22:03:09 +02:00
vulkan: add support for expect/assume
This commit adds support for compiler hints. While these are not used/needed on AMD, Nvidia benefits from them, giving a sizeable 10% speedup on 4k.
This commit is contained in:
@ -31,7 +31,7 @@
|
|||||||
#ifdef RGB
|
#ifdef RGB
|
||||||
ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx)
|
ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx)
|
||||||
{
|
{
|
||||||
const ivec2 yoff_border1 = off.x == 0 ? ivec2(1, -1) : ivec2(0, 0);
|
const ivec2 yoff_border1 = expectEXT(off.x == 0, false) ? ivec2(1, -1) : ivec2(0, 0);
|
||||||
|
|
||||||
/* Thanks to the same coincidence as below, we can skip checking if off == 0, 1 */
|
/* Thanks to the same coincidence as below, we can skip checking if off == 0, 1 */
|
||||||
VTYPE3 top = VTYPE3(TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(-1, -1) + yoff_border1))[0]),
|
VTYPE3 top = VTYPE3(TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(-1, -1) + yoff_border1))[0]),
|
||||||
@ -47,10 +47,10 @@ ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx)
|
|||||||
quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] +
|
quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] +
|
||||||
quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK];
|
quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK];
|
||||||
|
|
||||||
if (extend_lookup[quant_table_idx] > 0) {
|
if (expectEXT(extend_lookup[quant_table_idx] > 0, false)) {
|
||||||
TYPE cur2 = TYPE(0);
|
TYPE cur2 = TYPE(0);
|
||||||
if (off.x > 0) {
|
if (expectEXT(off.x > 0, true)) {
|
||||||
const ivec2 yoff_border2 = off.x == 1 ? ivec2(-1, -1) : ivec2(-2, 0);
|
const ivec2 yoff_border2 = expectEXT(off.x == 1, false) ? ivec2(-1, -1) : ivec2(-2, 0);
|
||||||
cur2 = TYPE(imageLoad(dec[p], sp + LADDR(off + yoff_border2))[0]);
|
cur2 = TYPE(imageLoad(dec[p], sp + LADDR(off + yoff_border2))[0]);
|
||||||
}
|
}
|
||||||
base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK];
|
base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK];
|
||||||
@ -120,9 +120,9 @@ int get_isymbol(inout RangeCoder c, uint64_t state)
|
|||||||
if (!get_rac(c, state + min(e, 9)))
|
if (!get_rac(c, state + min(e, 9)))
|
||||||
break;
|
break;
|
||||||
|
|
||||||
if (e == 0) {
|
if (expectEXT(e == 0, false)) {
|
||||||
return get_rac(c, state + 10) ? -1 : 1;
|
return get_rac(c, state + 10) ? -1 : 1;
|
||||||
} else if (e > 31) {
|
} else if (expectEXT(e > 31, false)) {
|
||||||
corrupt = true;
|
corrupt = true;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -274,7 +274,7 @@ void writeout_rgb(in SliceContext sc, ivec2 sp, int w, int y, bool apply_rct)
|
|||||||
if (transparency != 0)
|
if (transparency != 0)
|
||||||
pix.a = int(imageLoad(dec[3], lpos)[0]);
|
pix.a = int(imageLoad(dec[3], lpos)[0]);
|
||||||
|
|
||||||
if (apply_rct)
|
if (expectEXT(apply_rct, true))
|
||||||
pix = transform_sample(pix, sc.slice_rct_coef);
|
pix = transform_sample(pix, sc.slice_rct_coef);
|
||||||
|
|
||||||
imageStore(dst[0], pos, pix);
|
imageStore(dst[0], pos, pix);
|
||||||
|
@ -141,7 +141,7 @@ void put_rac_equi(inout RangeCoder c, bool bit)
|
|||||||
c.range -= range1;
|
c.range -= range1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (c.range < 0x100)
|
if (expectEXT(c.range < 0x100, false))
|
||||||
renorm_encoder(c);
|
renorm_encoder(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -157,7 +157,7 @@ void put_rac_terminate(inout RangeCoder c)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
c.range -= range1;
|
c.range -= range1;
|
||||||
if (c.range < 0x100)
|
if (expectEXT(c.range < 0x100, false))
|
||||||
renorm_encoder(c);
|
renorm_encoder(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -218,7 +218,7 @@ void refill(inout RangeCoder c)
|
|||||||
{
|
{
|
||||||
c.range <<= 8;
|
c.range <<= 8;
|
||||||
c.low <<= 8;
|
c.low <<= 8;
|
||||||
if (c.bytestream < c.bytestream_end) {
|
if (expectEXT(c.bytestream < c.bytestream_end, false)) {
|
||||||
c.low |= u8buf(c.bytestream).v;
|
c.low |= u8buf(c.bytestream).v;
|
||||||
c.bytestream++;
|
c.bytestream++;
|
||||||
} else {
|
} else {
|
||||||
@ -235,7 +235,7 @@ bool get_rac_direct(inout RangeCoder c, inout uint8_t state)
|
|||||||
c.low -= bit ? ranged : 0;
|
c.low -= bit ? ranged : 0;
|
||||||
c.range = (bit ? 0 : ranged) + (bit ? range1 : 0);
|
c.range = (bit ? 0 : ranged) + (bit ? range1 : 0);
|
||||||
|
|
||||||
if (c.range < 0x100)
|
if (expectEXT(c.range < 0x100, false))
|
||||||
refill(c);
|
refill(c);
|
||||||
|
|
||||||
state = zero_one_state[state + (bit ? 256 : 0)];
|
state = zero_one_state[state + (bit ? 256 : 0)];
|
||||||
@ -263,7 +263,7 @@ bool get_rac_equi(inout RangeCoder c)
|
|||||||
c.range = range1;
|
c.range = range1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (c.range < 0x100)
|
if (expectEXT(c.range < 0x100, false))
|
||||||
refill(c);
|
refill(c);
|
||||||
|
|
||||||
return bit;
|
return bit;
|
||||||
|
@ -79,6 +79,7 @@ typedef struct VulkanDeviceFeatures {
|
|||||||
VkPhysicalDeviceVulkan12Features vulkan_1_2;
|
VkPhysicalDeviceVulkan12Features vulkan_1_2;
|
||||||
VkPhysicalDeviceVulkan13Features vulkan_1_3;
|
VkPhysicalDeviceVulkan13Features vulkan_1_3;
|
||||||
VkPhysicalDeviceTimelineSemaphoreFeatures timeline_semaphore;
|
VkPhysicalDeviceTimelineSemaphoreFeatures timeline_semaphore;
|
||||||
|
VkPhysicalDeviceShaderExpectAssumeFeatures expect_assume;
|
||||||
|
|
||||||
VkPhysicalDeviceVideoMaintenance1FeaturesKHR video_maintenance_1;
|
VkPhysicalDeviceVideoMaintenance1FeaturesKHR video_maintenance_1;
|
||||||
#ifdef VK_KHR_video_maintenance2
|
#ifdef VK_KHR_video_maintenance2
|
||||||
@ -210,6 +211,9 @@ static void device_features_init(AVHWDeviceContext *ctx, VulkanDeviceFeatures *f
|
|||||||
OPT_CHAIN(&feats->timeline_semaphore, FF_VK_EXT_PORTABILITY_SUBSET,
|
OPT_CHAIN(&feats->timeline_semaphore, FF_VK_EXT_PORTABILITY_SUBSET,
|
||||||
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES);
|
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES);
|
||||||
|
|
||||||
|
OPT_CHAIN(&feats->expect_assume, FF_VK_EXT_EXPECT_ASSUME,
|
||||||
|
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_EXPECT_ASSUME_FEATURES_KHR);
|
||||||
|
|
||||||
OPT_CHAIN(&feats->video_maintenance_1, FF_VK_EXT_VIDEO_MAINTENANCE_1,
|
OPT_CHAIN(&feats->video_maintenance_1, FF_VK_EXT_VIDEO_MAINTENANCE_1,
|
||||||
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VIDEO_MAINTENANCE_1_FEATURES_KHR);
|
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VIDEO_MAINTENANCE_1_FEATURES_KHR);
|
||||||
#ifdef VK_KHR_video_maintenance2
|
#ifdef VK_KHR_video_maintenance2
|
||||||
@ -302,6 +306,8 @@ static void device_features_copy_needed(VulkanDeviceFeatures *dst, VulkanDeviceF
|
|||||||
COPY_VAL(relaxed_extended_instruction.shaderRelaxedExtendedInstruction);
|
COPY_VAL(relaxed_extended_instruction.shaderRelaxedExtendedInstruction);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
COPY_VAL(expect_assume.shaderExpectAssume);
|
||||||
|
|
||||||
COPY_VAL(optical_flow.opticalFlow);
|
COPY_VAL(optical_flow.opticalFlow);
|
||||||
#undef COPY_VAL
|
#undef COPY_VAL
|
||||||
}
|
}
|
||||||
@ -616,6 +622,7 @@ static const VulkanOptExtension optional_device_exts[] = {
|
|||||||
{ VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME, FF_VK_EXT_COOP_MATRIX },
|
{ VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME, FF_VK_EXT_COOP_MATRIX },
|
||||||
{ VK_NV_OPTICAL_FLOW_EXTENSION_NAME, FF_VK_EXT_OPTICAL_FLOW },
|
{ VK_NV_OPTICAL_FLOW_EXTENSION_NAME, FF_VK_EXT_OPTICAL_FLOW },
|
||||||
{ VK_EXT_SHADER_OBJECT_EXTENSION_NAME, FF_VK_EXT_SHADER_OBJECT },
|
{ VK_EXT_SHADER_OBJECT_EXTENSION_NAME, FF_VK_EXT_SHADER_OBJECT },
|
||||||
|
{ VK_KHR_SHADER_EXPECT_ASSUME_EXTENSION_NAME, FF_VK_EXT_EXPECT_ASSUME },
|
||||||
{ VK_KHR_VIDEO_MAINTENANCE_1_EXTENSION_NAME, FF_VK_EXT_VIDEO_MAINTENANCE_1 },
|
{ VK_KHR_VIDEO_MAINTENANCE_1_EXTENSION_NAME, FF_VK_EXT_VIDEO_MAINTENANCE_1 },
|
||||||
#ifdef VK_KHR_video_maintenance2
|
#ifdef VK_KHR_video_maintenance2
|
||||||
{ VK_KHR_VIDEO_MAINTENANCE_2_EXTENSION_NAME, FF_VK_EXT_VIDEO_MAINTENANCE_2 },
|
{ VK_KHR_VIDEO_MAINTENANCE_2_EXTENSION_NAME, FF_VK_EXT_VIDEO_MAINTENANCE_2 },
|
||||||
|
@ -2046,6 +2046,12 @@ int ff_vk_shader_init(FFVulkanContext *s, FFVulkanShader *shd, const char *name,
|
|||||||
GLSLC(0, #extension GL_EXT_scalar_block_layout : require );
|
GLSLC(0, #extension GL_EXT_scalar_block_layout : require );
|
||||||
GLSLC(0, #extension GL_EXT_shader_explicit_arithmetic_types : require );
|
GLSLC(0, #extension GL_EXT_shader_explicit_arithmetic_types : require );
|
||||||
GLSLC(0, #extension GL_EXT_control_flow_attributes : require );
|
GLSLC(0, #extension GL_EXT_control_flow_attributes : require );
|
||||||
|
if (s->extensions & FF_VK_EXT_EXPECT_ASSUME) {
|
||||||
|
GLSLC(0, #extension GL_EXT_expect_assume : require );
|
||||||
|
} else {
|
||||||
|
GLSLC(0, #define assumeEXT(x) (x) );
|
||||||
|
GLSLC(0, #define expectEXT(x, c) (x) );
|
||||||
|
}
|
||||||
if ((s->extensions & FF_VK_EXT_DEBUG_UTILS) &&
|
if ((s->extensions & FF_VK_EXT_DEBUG_UTILS) &&
|
||||||
(s->extensions & FF_VK_EXT_RELAXED_EXTENDED_INSTR)) {
|
(s->extensions & FF_VK_EXT_RELAXED_EXTENDED_INSTR)) {
|
||||||
GLSLC(0, #extension GL_EXT_debug_printf : require );
|
GLSLC(0, #extension GL_EXT_debug_printf : require );
|
||||||
|
@ -47,6 +47,7 @@ typedef uint64_t FFVulkanExtensions;
|
|||||||
#define FF_VK_EXT_SHADER_OBJECT (1ULL << 13) /* VK_EXT_shader_object */
|
#define FF_VK_EXT_SHADER_OBJECT (1ULL << 13) /* VK_EXT_shader_object */
|
||||||
#define FF_VK_EXT_PUSH_DESCRIPTOR (1ULL << 14) /* VK_KHR_push_descriptor */
|
#define FF_VK_EXT_PUSH_DESCRIPTOR (1ULL << 14) /* VK_KHR_push_descriptor */
|
||||||
#define FF_VK_EXT_RELAXED_EXTENDED_INSTR (1ULL << 15) /* VK_KHR_shader_relaxed_extended_instruction */
|
#define FF_VK_EXT_RELAXED_EXTENDED_INSTR (1ULL << 15) /* VK_KHR_shader_relaxed_extended_instruction */
|
||||||
|
#define FF_VK_EXT_EXPECT_ASSUME (1ULL << 16) /* VK_KHR_shader_expect_assume */
|
||||||
|
|
||||||
/* Video extensions */
|
/* Video extensions */
|
||||||
#define FF_VK_EXT_VIDEO_QUEUE (1ULL << 36) /* VK_KHR_video_queue */
|
#define FF_VK_EXT_VIDEO_QUEUE (1ULL << 36) /* VK_KHR_video_queue */
|
||||||
|
@ -76,6 +76,7 @@ static inline uint64_t ff_vk_extensions_to_mask(const char * const *extensions,
|
|||||||
{ VK_KHR_VIDEO_DECODE_H265_EXTENSION_NAME, FF_VK_EXT_VIDEO_DECODE_H265 },
|
{ VK_KHR_VIDEO_DECODE_H265_EXTENSION_NAME, FF_VK_EXT_VIDEO_DECODE_H265 },
|
||||||
{ VK_KHR_VIDEO_DECODE_AV1_EXTENSION_NAME, FF_VK_EXT_VIDEO_DECODE_AV1 },
|
{ VK_KHR_VIDEO_DECODE_AV1_EXTENSION_NAME, FF_VK_EXT_VIDEO_DECODE_AV1 },
|
||||||
{ VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME, FF_VK_EXT_PUSH_DESCRIPTOR },
|
{ VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME, FF_VK_EXT_PUSH_DESCRIPTOR },
|
||||||
|
{ VK_KHR_SHADER_EXPECT_ASSUME_EXTENSION_NAME, FF_VK_EXT_EXPECT_ASSUME },
|
||||||
};
|
};
|
||||||
|
|
||||||
FFVulkanExtensions mask = 0x0;
|
FFVulkanExtensions mask = 0x0;
|
||||||
|
Reference in New Issue
Block a user