	nlmeans_vulkan: parallelize workgroup invocations
		| @@ -395,8 +395,7 @@ OBJS-$(CONFIG_MULTIPLY_FILTER)               += vf_multiply.o | ||||
| OBJS-$(CONFIG_NEGATE_FILTER)                 += vf_negate.o | ||||
| OBJS-$(CONFIG_NLMEANS_FILTER)                += vf_nlmeans.o | ||||
| OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER)         += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o | ||||
| OBJS-$(CONFIG_NLMEANS_VULKAN_FILTER)         += vf_nlmeans_vulkan.o vulkan.o vulkan_filter.o \ | ||||
|                                                 vulkan/prefix_sum.o | ||||
| OBJS-$(CONFIG_NLMEANS_VULKAN_FILTER)         += vf_nlmeans_vulkan.o vulkan.o vulkan_filter.o | ||||
| OBJS-$(CONFIG_NNEDI_FILTER)                  += vf_nnedi.o | ||||
| OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o | ||||
| OBJS-$(CONFIG_NOISE_FILTER)                  += vf_noise.o | ||||
|   | ||||
| @@ -38,9 +38,10 @@ typedef struct NLMeansVulkanContext { | ||||
|     VkSampler sampler; | ||||
|  | ||||
|     AVBufferPool *integral_buf_pool; | ||||
|     AVBufferPool *state_buf_pool; | ||||
|     AVBufferPool *ws_buf_pool; | ||||
|  | ||||
|     FFVkBuffer xyoffsets_buf; | ||||
|  | ||||
|     int pl_weights_rows; | ||||
|     FFVulkanPipeline pl_weights; | ||||
|     FFVkSPIRVShader shd_weights; | ||||
| @@ -66,107 +67,97 @@ typedef struct NLMeansVulkanContext { | ||||
|  | ||||
| extern const char *ff_source_prefix_sum_comp; | ||||
|  | ||||
| static void insert_first(FFVkSPIRVShader *shd, int r, int horiz, int plane, int comp) | ||||
| static void insert_first(FFVkSPIRVShader *shd, int r, const char *off, int horiz, int plane, int comp) | ||||
| { | ||||
|     GLSLF(2,     s1    = texture(input_img[%i], ivec2(x + %i, y + %i))[%i]; | ||||
|           ,plane, horiz ? r : 0, !horiz ? r : 0, comp); | ||||
|     GLSLF(4, s1    = texture(input_img[%i], pos + ivec2(%i + %s, %i + %s))[%i]; | ||||
|           ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp); | ||||
|  | ||||
|     if (TYPE_ELEMS == 4) { | ||||
|         GLSLF(2, s2[0] = texture(input_img[%i], ivec2(x + %i + xoffs[0], y + %i + yoffs[0]))[%i]; | ||||
|               ,plane, horiz ? r : 0, !horiz ? r : 0, comp); | ||||
|         GLSLF(2, s2[1] = texture(input_img[%i], ivec2(x + %i + xoffs[1], y + %i + yoffs[1]))[%i]; | ||||
|               ,plane, horiz ? r : 0, !horiz ? r : 0, comp); | ||||
|         GLSLF(2, s2[2] = texture(input_img[%i], ivec2(x + %i + xoffs[2], y + %i + yoffs[2]))[%i]; | ||||
|               ,plane, horiz ? r : 0, !horiz ? r : 0, comp); | ||||
|         GLSLF(2, s2[3] = texture(input_img[%i], ivec2(x + %i + xoffs[3], y + %i + yoffs[3]))[%i]; | ||||
|               ,plane, horiz ? r : 0, !horiz ? r : 0, comp); | ||||
|     } else { | ||||
|         for (int i = 0; i < 16; i++) { | ||||
|             GLSLF(2, s2[%i][%i] = texture(input_img[%i], ivec2(x + %i + xoffs[%i], y + %i + yoffs[%i]))[%i]; | ||||
|                   ,i / 4, i % 4, plane, horiz ? r : 0, i, !horiz ? r : 0, i, comp); | ||||
|         } | ||||
|     } | ||||
|     GLSLF(4, s2[0] = texture(input_img[%i], pos + offs[0] + ivec2(%i + %s, %i + %s))[%i]; | ||||
|           ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp); | ||||
|     GLSLF(4, s2[1] = texture(input_img[%i], pos + offs[1] + ivec2(%i + %s, %i + %s))[%i]; | ||||
|           ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp); | ||||
|     GLSLF(4, s2[2] = texture(input_img[%i], pos + offs[2] + ivec2(%i + %s, %i + %s))[%i]; | ||||
|           ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp); | ||||
|     GLSLF(4, s2[3] = texture(input_img[%i], pos + offs[3] + ivec2(%i + %s, %i + %s))[%i]; | ||||
|           ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp); | ||||
|  | ||||
|     GLSLC(2, s2 = (s1 - s2) * (s1 - s2);                                       ); | ||||
|     GLSLC(4, s2 = (s1 - s2) * (s1 - s2);                                                    ); | ||||
| } | ||||
|  | ||||
| static void insert_horizontal_pass(FFVkSPIRVShader *shd, int nb_rows, int first, int plane, int comp) | ||||
| { | ||||
|     GLSLF(1, x = int(gl_GlobalInvocationID.x) * %i;                   ,nb_rows); | ||||
|     if (!first) { | ||||
|         GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup, | ||||
|                                 gl_StorageSemanticsBuffer, | ||||
|                                 gl_SemanticsAcquireRelease | | ||||
|                                 gl_SemanticsMakeAvailable | | ||||
|                                 gl_SemanticsMakeVisible);                     ); | ||||
|     } | ||||
|     GLSLF(1, for (y = 0; y < height[%i]; y++) {                               ,plane); | ||||
|     GLSLC(2,     offset = uint64_t(int_stride)*y*T_ALIGN;                     ); | ||||
|     GLSLC(2,     dst = DataBuffer(uint64_t(integral_data) + offset);          ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     if (first) { | ||||
|         for (int r = 0; r < nb_rows; r++) { | ||||
|             insert_first(shd, r, 1, plane, comp); | ||||
|             GLSLF(2, dst.v[x + %i] = s2;                                    ,r); | ||||
|             GLSLC(0,                                                          ); | ||||
|         } | ||||
|     } | ||||
|     GLSLC(2,     barrier();                                                   ); | ||||
|     GLSLC(2,     prefix_sum(dst, 1, dst, 1);                                  ); | ||||
|     GLSLC(1, }                                                                ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLF(1, pos.y = int(gl_GlobalInvocationID.x) * %i;                           ,nb_rows); | ||||
|     if (!first) | ||||
|         GLSLC(1, barrier();                                                       ); | ||||
|     GLSLC(0,                                                                      ); | ||||
|     GLSLF(1, if (pos.y < height[%i]) {                                            ,plane); | ||||
|     GLSLC(2,     #pragma unroll(1)                                                ); | ||||
|     GLSLF(2,     for (r = 0; r < %i; r++) {                                       ,nb_rows); | ||||
|     GLSLC(3,         prefix_sum = DTYPE(0);                                       ); | ||||
|     GLSLC(3,         offset = uint64_t(int_stride)*(pos.y + r)*T_ALIGN;           ); | ||||
|     GLSLC(3,         dst = DataBuffer(uint64_t(integral_data) + offset);          ); | ||||
|     GLSLC(0,                                                                      ); | ||||
|     GLSLF(3,         for (pos.x = 0; pos.x < width[%i]; pos.x++) {                ,plane); | ||||
|     if (first) | ||||
|         insert_first(shd, 0, "r", 0, plane, comp); | ||||
|     else | ||||
|         GLSLC(4,         s2 = dst.v[pos.x];                                       ); | ||||
|     GLSLC(4,             dst.v[pos.x] = s2 + prefix_sum;                          ); | ||||
|     GLSLC(4,             prefix_sum += s2;                                        ); | ||||
|     GLSLC(3,         }                                                            ); | ||||
|     GLSLC(2,     }                                                                ); | ||||
|     GLSLC(1, }                                                                    ); | ||||
|     GLSLC(0,                                                                      ); | ||||
| } | ||||
|  | ||||
| static void insert_vertical_pass(FFVkSPIRVShader *shd, int nb_rows, int first, int plane, int comp) | ||||
| { | ||||
|     GLSLF(1, y = int(gl_GlobalInvocationID.x) * %i;                   ,nb_rows); | ||||
|     if (!first) { | ||||
|         GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup, | ||||
|                                 gl_StorageSemanticsBuffer, | ||||
|                                 gl_SemanticsAcquireRelease | | ||||
|                                 gl_SemanticsMakeAvailable | | ||||
|                                 gl_SemanticsMakeVisible);                     ); | ||||
|     } | ||||
|     GLSLF(1, for (x = 0; x < width[%i]; x++) {                                ,plane); | ||||
|     GLSLC(2,     dst = DataBuffer(uint64_t(integral_data) + x*T_ALIGN);       ); | ||||
|  | ||||
|     for (int r = 0; r < nb_rows; r++) { | ||||
|         if (first) { | ||||
|             insert_first(shd, r, 0, plane, comp); | ||||
|             GLSLF(2, integral_data.v[(y + %i)*int_stride + x] = s2;         ,r); | ||||
|             GLSLC(0,                                                          ); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     GLSLC(2,     barrier();                                                   ); | ||||
|     GLSLC(2,     prefix_sum(dst, int_stride, dst, int_stride);                ); | ||||
|     GLSLC(1, }                                                                ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLF(1, pos.x = int(gl_GlobalInvocationID.x) * %i;                           ,nb_rows); | ||||
|     GLSLC(1, #pragma unroll(1)                                                    ); | ||||
|     GLSLF(1, for (r = 0; r < %i; r++)                                             ,nb_rows); | ||||
|     GLSLC(2,     psum[r] = DTYPE(0);                                              ); | ||||
|     GLSLC(0,                                                                      ); | ||||
|     if (!first) | ||||
|         GLSLC(1, barrier();                                                       ); | ||||
|     GLSLC(0,                                                                      ); | ||||
|     GLSLF(1, if (pos.x < width[%i]) {                                             ,plane); | ||||
|     GLSLF(2,     for (pos.y = 0; pos.y < height[%i]; pos.y++) {                   ,plane); | ||||
|     GLSLC(3,         offset = uint64_t(int_stride)*pos.y*T_ALIGN;                 ); | ||||
|     GLSLC(3,         dst = DataBuffer(uint64_t(integral_data) + offset);          ); | ||||
|     GLSLC(0,                                                                      ); | ||||
|     GLSLC(3,         #pragma unroll(1)                                            ); | ||||
|     GLSLF(3,         for (r = 0; r < %i; r++) {                                   ,nb_rows); | ||||
|     if (first) | ||||
|         insert_first(shd, 0, "r", 1, plane, comp); | ||||
|     else | ||||
|         GLSLC(4,         s2 = dst.v[pos.x + r];                                   ); | ||||
|     GLSLC(4,             dst.v[pos.x + r] = s2 + psum[r];                         ); | ||||
|     GLSLC(4,             psum[r] += s2;                                           ); | ||||
|     GLSLC(3,         }                                                            ); | ||||
|     GLSLC(2,     }                                                                ); | ||||
|     GLSLC(1, }                                                                    ); | ||||
|     GLSLC(0,                                                                      ); | ||||
| } | ||||
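For reference, the generated GLSL above no longer calls the workgroup-wide prefix_sum() helper; each invocation now builds its rows (or columns) of the integral image with a plain running sum. A minimal C sketch of that recurrence, with illustrative names rather than the filter's own:

    #include <stddef.h>

    /* Inclusive running sum over one integral-image row, mirroring the
     * "dst.v[pos.x] = s2 + prefix_sum; prefix_sum += s2;" lines emitted above. */
    static void row_running_sum(float *row, size_t width)
    {
        float prefix_sum = 0.0f;
        for (size_t x = 0; x < width; x++) {
            float s2 = row[x];          /* squared patch difference at this column */
            row[x]   = s2 + prefix_sum; /* inclusive sum up to and including x */
            prefix_sum += s2;
        }
    }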
|  | ||||
| static void insert_weights_pass(FFVkSPIRVShader *shd, int nb_rows, int vert, | ||||
|                                 int t, int dst_comp, int plane, int comp) | ||||
| { | ||||
|     GLSLF(1, p = patch_size[%i];                                     ,dst_comp); | ||||
|     GLSLF(1, p = patch_size[%i];                                              ,dst_comp); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup, | ||||
|                             gl_StorageSemanticsBuffer, | ||||
|                             gl_SemanticsAcquireRelease | | ||||
|                             gl_SemanticsMakeAvailable | | ||||
|                             gl_SemanticsMakeVisible);                         ); | ||||
|     GLSLC(1, barrier();                                                       ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     if (!vert) { | ||||
|         GLSLF(1, for (y = 0; y < height[%i]; y++) {                           ,plane); | ||||
|         GLSLF(1, for (pos.y = 0; pos.y < height[%i]; pos.y++) {               ,plane); | ||||
|         GLSLF(2,     if (gl_GlobalInvocationID.x*%i >= width[%i])             ,nb_rows, plane); | ||||
|         GLSLC(3,         break;                                               ); | ||||
|         GLSLF(2,     for (r = 0; r < %i; r++) {                       ,nb_rows); | ||||
|         GLSLF(3,         x = int(gl_GlobalInvocationID.x) * %i + r;   ,nb_rows); | ||||
|         GLSLF(2,     for (r = 0; r < %i; r++) {                               ,nb_rows); | ||||
|         GLSLF(3,         pos.x = int(gl_GlobalInvocationID.x) * %i + r;       ,nb_rows); | ||||
|     } else { | ||||
|         GLSLF(1, for (x = 0; x < width[%i]; x++) {                            ,plane); | ||||
|         GLSLF(1, for (pos.x = 0; pos.x < width[%i]; pos.x++) {                ,plane); | ||||
|         GLSLF(2,     if (gl_GlobalInvocationID.x*%i >= height[%i])            ,nb_rows, plane); | ||||
|         GLSLC(3,         break;                                               ); | ||||
|         GLSLF(2,     for (r = 0; r < %i; r++) {                       ,nb_rows); | ||||
|         GLSLF(3,         y = int(gl_GlobalInvocationID.x) * %i + r;   ,nb_rows); | ||||
|         GLSLF(2,     for (r = 0; r < %i; r++) {                               ,nb_rows); | ||||
|         GLSLF(3,         pos.y = int(gl_GlobalInvocationID.x) * %i + r;       ,nb_rows); | ||||
|     } | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLC(3,         a = DTYPE(0);                                            ); | ||||
| @@ -174,25 +165,25 @@ static void insert_weights_pass(FFVkSPIRVShader *shd, int nb_rows, int vert, | ||||
|     GLSLC(3,         c = DTYPE(0);                                            ); | ||||
|     GLSLC(3,         d = DTYPE(0);                                            ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLC(3,         lt = ((x - p) < 0) || ((y - p) < 0);                     ); | ||||
|     GLSLC(3,         lt = ((pos.x - p) < 0) || ((pos.y - p) < 0);             ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     if (TYPE_ELEMS == 4) { | ||||
|         GLSLF(3,         src[0] = texture(input_img[%i], ivec2(x + xoffs[0], y + yoffs[0]))[%i];   ,plane, comp); | ||||
|         GLSLF(3,         src[1] = texture(input_img[%i], ivec2(x + xoffs[1], y + yoffs[1]))[%i];   ,plane, comp); | ||||
|         GLSLF(3,         src[2] = texture(input_img[%i], ivec2(x + xoffs[2], y + yoffs[2]))[%i];   ,plane, comp); | ||||
|         GLSLF(3,         src[3] = texture(input_img[%i], ivec2(x + xoffs[3], y + yoffs[3]))[%i];   ,plane, comp); | ||||
|         GLSLF(3,         src[0] = texture(input_img[%i], pos + offs[0])[%i];   ,plane, comp); | ||||
|         GLSLF(3,         src[1] = texture(input_img[%i], pos + offs[1])[%i];   ,plane, comp); | ||||
|         GLSLF(3,         src[2] = texture(input_img[%i], pos + offs[2])[%i];   ,plane, comp); | ||||
|         GLSLF(3,         src[3] = texture(input_img[%i], pos + offs[3])[%i];   ,plane, comp); | ||||
|     } else { | ||||
|         for (int i = 0; i < 16; i++) | ||||
|             GLSLF(3, src[%i][%i] = texture(input_img[%i], ivec2(x + xoffs[%i], y + yoffs[%i]))[%i]; | ||||
|                   ,i / 4, i % 4, plane, i, i, comp); | ||||
|             GLSLF(3, src[%i][%i] = texture(input_img[%i], pos + offs[%i])[%i]; | ||||
|                   ,i / 4, i % 4, plane, i, comp); | ||||
|  | ||||
|     } | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLC(3,         if (lt == false) {                                       ); | ||||
|     GLSLC(4,             a = integral_data.v[(y - p)*int_stride + x - p];     ); | ||||
|     GLSLC(4,             c = integral_data.v[(y - p)*int_stride + x + p];     ); | ||||
|     GLSLC(4,             b = integral_data.v[(y + p)*int_stride + x - p];     ); | ||||
|     GLSLC(4,             d = integral_data.v[(y + p)*int_stride + x + p];     ); | ||||
|     GLSLC(4,             a = integral_data.v[(pos.y - p)*int_stride + pos.x - p];     ); | ||||
|     GLSLC(4,             c = integral_data.v[(pos.y - p)*int_stride + pos.x + p];     ); | ||||
|     GLSLC(4,             b = integral_data.v[(pos.y + p)*int_stride + pos.x - p];     ); | ||||
|     GLSLC(4,             d = integral_data.v[(pos.y + p)*int_stride + pos.x + p];     ); | ||||
|     GLSLC(3,         }                                                        ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLC(3,         patch_diff = d + a - b - c;                              ); | ||||
| @@ -212,27 +203,26 @@ static void insert_weights_pass(FFVkSPIRVShader *shd, int nb_rows, int vert, | ||||
|     } | ||||
|     GLSLC(0,                                                                  ); | ||||
|     if (t > 1) { | ||||
|         GLSLF(3,         atomicAdd(weights_%i[y*ws_stride[%i] + x], w_sum);   ,dst_comp, dst_comp); | ||||
|         GLSLF(3,         atomicAdd(sums_%i[y*ws_stride[%i] + x], sum);        ,dst_comp, dst_comp); | ||||
|         GLSLF(3,         atomicAdd(weights_%i[pos.y*ws_stride[%i] + pos.x], w_sum);   ,dst_comp, dst_comp); | ||||
|         GLSLF(3,         atomicAdd(sums_%i[pos.y*ws_stride[%i] + pos.x], sum);        ,dst_comp, dst_comp); | ||||
|     } else { | ||||
|         GLSLF(3,         weights_%i[y*ws_stride[%i] + x] += w_sum;            ,dst_comp, dst_comp); | ||||
|         GLSLF(3,         sums_%i[y*ws_stride[%i] + x] += sum;                 ,dst_comp, dst_comp); | ||||
|         GLSLF(3,         weights_%i[pos.y*ws_stride[%i] + pos.x] += w_sum;            ,dst_comp, dst_comp); | ||||
|         GLSLF(3,         sums_%i[pos.y*ws_stride[%i] + pos.x] += sum;                 ,dst_comp, dst_comp); | ||||
|     } | ||||
|     GLSLC(2,     }                                                            ); | ||||
|     GLSLC(1, }                                                                ); | ||||
| } | ||||
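The four integral-image reads above use the usual summed-area-table identity: with an inclusive integral image, the sum of the values in the box delimited by the four corner samples is d + a - b - c. A small stand-alone C sketch; the stride and bounds handling here are assumptions (the shader instead skips the lookup when lt is set):

    /* Summed-area-table box sum, matching "patch_diff = d + a - b - c" above.
     * I is an inclusive integral image with the given row stride; the caller
     * must keep all four corner indices in range. */
    static float sat_box_sum(const float *I, int stride, int x, int y, int p)
    {
        float a = I[(y - p) * stride + (x - p)];
        float c = I[(y - p) * stride + (x + p)];
        float b = I[(y + p) * stride + (x - p)];
        float d = I[(y + p) * stride + (x + p)];
        return d + a - b - c;
    }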
|  | ||||
| typedef struct HorizontalPushData { | ||||
|     VkDeviceAddress integral_data; | ||||
|     VkDeviceAddress state_data; | ||||
|     int32_t  xoffs[TYPE_ELEMS]; | ||||
|     int32_t  yoffs[TYPE_ELEMS]; | ||||
|     uint32_t width[4]; | ||||
|     uint32_t height[4]; | ||||
|     uint32_t ws_stride[4]; | ||||
|     int32_t  patch_size[4]; | ||||
|     float    strength[4]; | ||||
|     VkDeviceAddress integral_base; | ||||
|     uint32_t integral_size; | ||||
|     uint32_t int_stride; | ||||
|     uint32_t xyoffs_start; | ||||
| } HorizontalPushData; | ||||
|  | ||||
| static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec, | ||||
| @@ -249,26 +239,18 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e | ||||
|     FFVulkanDescriptorSetBinding *desc_set; | ||||
|     int max_dim = FFMAX(width, height); | ||||
|     uint32_t max_wg = vkctx->props.properties.limits.maxComputeWorkGroupSize[0]; | ||||
|     int max_shm = vkctx->props.properties.limits.maxComputeSharedMemorySize; | ||||
|     int wg_size, wg_rows; | ||||
|  | ||||
|     /* Round the max workgroup size to the previous power of two */ | ||||
|     max_wg = 1 << (31 - ff_clz(max_wg)); | ||||
|     wg_size = max_wg; | ||||
|     wg_rows = 1; | ||||
|  | ||||
|     if (max_wg > max_dim) { | ||||
|         wg_size = max_wg / (max_wg / max_dim); | ||||
|         wg_size = max_dim; | ||||
|     } else if (max_wg < max_dim) { | ||||
|         /* First, make it fit */ | ||||
|         /* Make it fit */ | ||||
|         while (wg_size*wg_rows < max_dim) | ||||
|             wg_rows++; | ||||
|  | ||||
|         /* Second, make sure there's enough shared memory */ | ||||
|         while ((wg_size * TYPE_SIZE + TYPE_SIZE + 2*4) > max_shm) { | ||||
|             wg_size >>= 1; | ||||
|             wg_rows++; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     RET(ff_vk_shader_init(pl, shd, "nlmeans_weights", VK_SHADER_STAGE_COMPUTE_BIT, 0)); | ||||
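A stand-alone sketch of the workgroup sizing logic above, assuming max_wg has already been read from the device limits: round it down to a power of two, then either shrink it to the largest plane dimension or raise the per-invocation row count until the whole dimension is covered. Illustrative only; the filter itself does the rounding with ff_clz() on the Vulkan limits.

    static void pick_wg_layout(unsigned max_wg, unsigned max_dim,
                               unsigned *wg_size, unsigned *wg_rows)
    {
        while (max_wg & (max_wg - 1))   /* round down to the previous power of two */
            max_wg &= max_wg - 1;

        *wg_size = max_wg;
        *wg_rows = 1;
        if (max_wg > max_dim) {
            *wg_size = max_dim;
        } else if (max_wg < max_dim) {
            while (*wg_size * *wg_rows < max_dim)   /* make it fit */
                (*wg_rows)++;
        }
    }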
| @@ -278,33 +260,24 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e | ||||
|     if (t > 1) | ||||
|         GLSLC(0, #extension GL_EXT_shader_atomic_float : require              ); | ||||
|     GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require                     ); | ||||
|     GLSLC(0, #pragma use_vulkan_memory_model                                  ); | ||||
|     GLSLC(0, #extension GL_KHR_memory_scope_semantics : enable                ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLF(0, #define N_ROWS %i                                       ,*nb_rows); | ||||
|     GLSLC(0, #define WG_SIZE (gl_WorkGroupSize.x)                             ); | ||||
|     GLSLF(0, #define LG_WG_SIZE %i                ,ff_log2(shd->local_size[0])); | ||||
|     GLSLC(0, #define PARTITION_SIZE (N_ROWS*WG_SIZE)                          ); | ||||
|     GLSLF(0, #define DTYPE %s                                       ,TYPE_NAME); | ||||
|     GLSLF(0, #define T_ALIGN %i                                     ,TYPE_SIZE); | ||||
|     GLSLF(0, #define DTYPE %s                                                 ,TYPE_NAME); | ||||
|     GLSLF(0, #define T_ALIGN %i                                               ,TYPE_SIZE); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) coherent buffer DataBuffer {  ); | ||||
|     GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer {  ); | ||||
|     GLSLC(1,     DTYPE v[];                                                   ); | ||||
|     GLSLC(0, };                                                               ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLC(0, layout(buffer_reference) buffer StateData;                       ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLC(0, layout(push_constant, std430) uniform pushConstants {            ); | ||||
|     GLSLC(1,     coherent DataBuffer integral_data;                           ); | ||||
|     GLSLC(1,     StateData  state;                                            ); | ||||
|     GLSLF(1,     uint xoffs[%i];                                   ,TYPE_ELEMS); | ||||
|     GLSLF(1,     uint yoffs[%i];                                   ,TYPE_ELEMS); | ||||
|     GLSLC(1,     uvec4 width;                                                 ); | ||||
|     GLSLC(1,     uvec4 height;                                                ); | ||||
|     GLSLC(1,     uvec4 ws_stride;                                             ); | ||||
|     GLSLC(1,     ivec4 patch_size;                                            ); | ||||
|     GLSLC(1,     vec4 strength;                                               ); | ||||
|     GLSLC(1,     DataBuffer integral_base;                                    ); | ||||
|     GLSLC(1,     uint integral_size;                                          ); | ||||
|     GLSLC(1,     uint int_stride;                                             ); | ||||
|     GLSLC(1,     uint xyoffs_start;                                           ); | ||||
|     GLSLC(0, };                                                               ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|  | ||||
| @@ -370,42 +343,65 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e | ||||
|     }; | ||||
|     RET(ff_vk_pipeline_descriptor_set_add(vkctx, pl, shd, desc_set, 1 + 2*desc->nb_components, 0, 0)); | ||||
|  | ||||
|     GLSLD(   ff_source_prefix_sum_comp                                        ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLC(0, void main()                                                      ); | ||||
|     GLSLC(0, {                                                                ); | ||||
|     GLSLC(1,     uint64_t offset;                                             ); | ||||
|     GLSLC(1,     DataBuffer dst;                                              ); | ||||
|     GLSLC(1,     float s1;                                                    ); | ||||
|     GLSLC(1,     DTYPE s2;                                                    ); | ||||
|     GLSLC(1,     int r;                                                       ); | ||||
|     GLSLC(1,     int x;                                                       ); | ||||
|     GLSLC(1,     int y;                                                       ); | ||||
|     GLSLC(1,     int p;                                                       ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLC(1,     DTYPE a;                                                     ); | ||||
|     GLSLC(1,     DTYPE b;                                                     ); | ||||
|     GLSLC(1,     DTYPE c;                                                     ); | ||||
|     GLSLC(1,     DTYPE d;                                                     ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLC(1,     DTYPE patch_diff;                                            ); | ||||
|     desc_set = (FFVulkanDescriptorSetBinding []) { | ||||
|         { | ||||
|             .name        = "xyoffsets_buffer", | ||||
|             .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||||
|             .mem_quali   = "readonly", | ||||
|             .stages      = VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|             .buf_content = "ivec2 xyoffsets[];", | ||||
|         }, | ||||
|     }; | ||||
|     RET(ff_vk_pipeline_descriptor_set_add(vkctx, pl, shd, desc_set, 1, 1, 0)); | ||||
|  | ||||
|     GLSLC(0,                                                                     ); | ||||
|     GLSLC(0, void main()                                                         ); | ||||
|     GLSLC(0, {                                                                   ); | ||||
|     GLSLC(1,     uint64_t offset;                                                ); | ||||
|     GLSLC(1,     DataBuffer dst;                                                 ); | ||||
|     GLSLC(1,     float s1;                                                       ); | ||||
|     GLSLC(1,     DTYPE s2;                                                       ); | ||||
|     GLSLC(1,     DTYPE prefix_sum;                                               ); | ||||
|     GLSLF(1,     DTYPE psum[%i];                                                 ,*nb_rows); | ||||
|     GLSLC(1,     int r;                                                          ); | ||||
|     GLSLC(1,     ivec2 pos;                                                      ); | ||||
|     GLSLC(1,     int p;                                                          ); | ||||
|     GLSLC(0,                                                                     ); | ||||
|     GLSLC(1,     DataBuffer integral_data;                                       ); | ||||
|     GLSLF(1,     ivec2 offs[%i];                                                 ,TYPE_ELEMS); | ||||
|     GLSLC(0,                                                                     ); | ||||
|     GLSLC(1,     int invoc_idx = int(gl_WorkGroupID.z);                          ); | ||||
|  | ||||
|     GLSLC(1,     offset = uint64_t(integral_size)*invoc_idx;                     ); | ||||
|     GLSLC(1,     dst = DataBuffer(uint64_t(integral_data) + offset);             ); | ||||
|  | ||||
|     GLSLC(1,     integral_data = DataBuffer(uint64_t(integral_base) + offset);   ); | ||||
|     for (int i = 0; i < TYPE_ELEMS*2; i += 2) | ||||
|         GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + 2*%i*invoc_idx + %i];       ,i/2,TYPE_ELEMS,i); | ||||
|     GLSLC(0,                                                                     ); | ||||
|     GLSLC(1,     DTYPE a;                                                        ); | ||||
|     GLSLC(1,     DTYPE b;                                                        ); | ||||
|     GLSLC(1,     DTYPE c;                                                        ); | ||||
|     GLSLC(1,     DTYPE d;                                                        ); | ||||
|     GLSLC(0,                                                                     ); | ||||
|     GLSLC(1,     DTYPE patch_diff;                                               ); | ||||
|     if (TYPE_ELEMS == 4) { | ||||
|         GLSLC(1, vec4 src;                                                    ); | ||||
|         GLSLC(1, vec4 w;                                                      ); | ||||
|         GLSLC(1, vec4 src;                                                       ); | ||||
|         GLSLC(1, vec4 w;                                                         ); | ||||
|     } else { | ||||
|         GLSLC(1, vec4 src[4];                                                 ); | ||||
|         GLSLC(1, vec4 w[4];                                                   ); | ||||
|         GLSLC(1, vec4 src[4];                                                    ); | ||||
|         GLSLC(1, vec4 w[4];                                                      ); | ||||
|     } | ||||
|     GLSLC(1,     float w_sum;                                                 ); | ||||
|     GLSLC(1,     float sum;                                                   ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLC(1,     bool lt;                                                     ); | ||||
|     GLSLC(1,     bool gt;                                                     ); | ||||
|     GLSLC(0,                                                                  ); | ||||
|     GLSLC(1,     float w_sum;                                                    ); | ||||
|     GLSLC(1,     float sum;                                                      ); | ||||
|     GLSLC(0,                                                                     ); | ||||
|     GLSLC(1,     bool lt;                                                        ); | ||||
|     GLSLC(1,     bool gt;                                                        ); | ||||
|     GLSLC(0,                                                                     ); | ||||
|  | ||||
|     for (int i = 0; i < desc->nb_components; i++) { | ||||
|         int off = desc->comp[i].offset / (FFALIGN(desc->comp[i].depth, 8)/8); | ||||
|         if (width > height) { | ||||
|         if (width >= height) { | ||||
|             insert_horizontal_pass(shd, *nb_rows, 1, desc->comp[i].plane, off); | ||||
|             insert_vertical_pass(shd, *nb_rows, 0, desc->comp[i].plane, off); | ||||
|             insert_weights_pass(shd, *nb_rows, 0, t, i, desc->comp[i].plane, off); | ||||
| @@ -416,7 +412,7 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     GLSLC(0, }                                                                ); | ||||
|     GLSLC(0, }                                                                   ); | ||||
|  | ||||
|     RET(spv->compile_shader(spv, vkctx, shd, &spv_data, &spv_len, "main", &spv_opaque)); | ||||
|     RET(ff_vk_shader_create(vkctx, shd, spv_data, spv_len, "main")); | ||||
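This is where the parallelization in the commit title happens: the generated main() derives invoc_idx from gl_WorkGroupID.z, gives each Z workgroup its own slice of the integral buffer (integral_size bytes apart) and its own group of TYPE_ELEMS offsets from the xyoffsets descriptor buffer. A rough C model of that slicing follows; the exact index arithmetic into xyoffsets differs in the shader, so treat the offset lookup here as conceptual and the names as illustrative.

    #include <stddef.h>
    #include <stdint.h>

    typedef struct { int32_t x, y; } XYOffset;

    /* Conceptual per-workgroup setup: one integral-image slice and one group of
     * offsets per Z workgroup, so a single dispatch covers several offset groups. */
    static void setup_invocation(uint8_t *integral_base, size_t integral_size,
                                 const XYOffset *xyoffsets, int first_offset_group,
                                 int type_elems, int invoc_idx,
                                 uint8_t **integral_data, XYOffset *offs)
    {
        *integral_data = integral_base + integral_size * (size_t)invoc_idx;
        for (int i = 0; i < type_elems; i++)
            offs[i] = xyoffsets[(first_offset_group + invoc_idx) * type_elems + i];
    }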
| @@ -584,6 +580,8 @@ static av_cold int init_filter(AVFilterContext *ctx) | ||||
|     FFVulkanContext *vkctx = &s->vkctx; | ||||
|     const int planes = av_pix_fmt_count_planes(s->vkctx.output_format); | ||||
|     FFVkSPIRVCompiler *spv; | ||||
|     int *offsets_buf; | ||||
|     int offsets_dispatched = 0, nb_dispatches = 0; | ||||
|  | ||||
|     const AVPixFmtDescriptor *desc; | ||||
|     desc = av_pix_fmt_desc_get(vkctx->output_format); | ||||
| @@ -634,6 +632,20 @@ static av_cold int init_filter(AVFilterContext *ctx) | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     RET(ff_vk_create_buf(&s->vkctx, &s->xyoffsets_buf, 2*s->nb_offsets*sizeof(int32_t), NULL, NULL, | ||||
|                          VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT | | ||||
|                          VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, | ||||
|                          VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | | ||||
|                          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)); | ||||
|     RET(ff_vk_map_buffer(&s->vkctx, &s->xyoffsets_buf, (uint8_t **)&offsets_buf, 0)); | ||||
|  | ||||
|     for (int i = 0; i < 2*s->nb_offsets; i += 2) { | ||||
|         offsets_buf[i + 0] = s->xoffsets[i >> 1]; | ||||
|         offsets_buf[i + 1] = s->yoffsets[i >> 1]; | ||||
|     } | ||||
|  | ||||
|     RET(ff_vk_unmap_buffer(&s->vkctx, &s->xyoffsets_buf, 1)); | ||||
|  | ||||
|     s->opts.t = FFMIN(s->opts.t, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / TYPE_ELEMS)); | ||||
|     if (!vkctx->atomic_float_feats.shaderBufferFloat32AtomicAdd) { | ||||
|         av_log(ctx, AV_LOG_WARNING, "Device doesn't support atomic float adds, " | ||||
| @@ -641,11 +653,6 @@ static av_cold int init_filter(AVFilterContext *ctx) | ||||
|         s->opts.t = 1; | ||||
|     } | ||||
|  | ||||
|     if (!vkctx->feats_12.vulkanMemoryModel) { | ||||
|         av_log(ctx, AV_LOG_ERROR, "Device doesn't support the Vulkan memory model!"); | ||||
|         return AVERROR(EINVAL);; | ||||
|     } | ||||
|  | ||||
|     spv = ff_vk_spirv_init(); | ||||
|     if (!spv) { | ||||
|         av_log(ctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n"); | ||||
| @@ -663,8 +670,19 @@ static av_cold int init_filter(AVFilterContext *ctx) | ||||
|     RET(init_denoise_pipeline(vkctx, &s->e, &s->pl_denoise, &s->shd_denoise, s->sampler, | ||||
|                               spv, desc, planes)); | ||||
|  | ||||
|     av_log(ctx, AV_LOG_VERBOSE, "Filter initialized, %i x/y offsets, %i dispatches, %i parallel\n", | ||||
|            s->nb_offsets, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / TYPE_ELEMS) + 1, s->opts.t); | ||||
|     RET(ff_vk_set_descriptor_buffer(&s->vkctx, &s->pl_weights, NULL, 1, 0, 0, | ||||
|                                     s->xyoffsets_buf.address, s->xyoffsets_buf.size, | ||||
|                                     VK_FORMAT_UNDEFINED)); | ||||
|  | ||||
|     do { | ||||
|         int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t); | ||||
|         wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]); | ||||
|         offsets_dispatched += wg_invoc * TYPE_ELEMS; | ||||
|         nb_dispatches++; | ||||
|     } while (offsets_dispatched < s->nb_offsets); | ||||
|  | ||||
|     av_log(ctx, AV_LOG_VERBOSE, "Filter initialized, %i x/y offsets, %i dispatches\n", | ||||
|            s->nb_offsets, nb_dispatches); | ||||
|  | ||||
|     s->initialized = 1; | ||||
|  | ||||
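The do/while above only counts how many dispatches the weights pass will need, so the number can be logged: offsets are consumed TYPE_ELEMS at a time, with up to opts.t offset groups folded into one dispatch (one Z workgroup each), further capped by maxComputeWorkGroupCount[2]. A self-contained sketch of the same arithmetic, assuming nb_offsets is a multiple of type_elems:

    static int count_weight_dispatches(int nb_offsets, int type_elems,
                                       int t, int max_wg_count_z)
    {
        int dispatched = 0, dispatches = 0;
        do {
            int wg_invoc = (nb_offsets - dispatched) / type_elems;
            if (wg_invoc > t)
                wg_invoc = t;
            if (wg_invoc > max_wg_count_z)
                wg_invoc = max_wg_count_z;
            dispatched += wg_invoc * type_elems;   /* one offset group per Z workgroup */
            dispatches++;
        } while (dispatched < nb_offsets);
        return dispatches;
    }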
| @@ -736,18 +754,16 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) | ||||
|     int plane_widths[4]; | ||||
|     int plane_heights[4]; | ||||
|  | ||||
|     int offsets_dispatched = 0; | ||||
|  | ||||
|     /* Integral */ | ||||
|     AVBufferRef *state_buf; | ||||
|     FFVkBuffer *state_vk; | ||||
|     AVBufferRef *integral_buf; | ||||
|     AVBufferRef *integral_buf = NULL; | ||||
|     FFVkBuffer *integral_vk; | ||||
|     uint32_t int_stride; | ||||
|     size_t int_size; | ||||
|     size_t state_size; | ||||
|     int t_offset = 0; | ||||
|  | ||||
|     /* Weights/sums */ | ||||
|     AVBufferRef *ws_buf; | ||||
|     AVBufferRef *ws_buf = NULL; | ||||
|     FFVkBuffer *ws_vk; | ||||
|     VkDeviceAddress weights_addr[4]; | ||||
|     VkDeviceAddress sums_addr[4]; | ||||
| @@ -773,7 +789,6 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) | ||||
|     /* Integral image */ | ||||
|     int_stride = s->pl_weights.wg_size[0]*s->pl_weights_rows; | ||||
|     int_size = int_stride * int_stride * TYPE_SIZE; | ||||
|     state_size = int_stride * 3 *TYPE_SIZE; | ||||
|  | ||||
|     /* Plane dimensions */ | ||||
|     for (int i = 0; i < desc->nb_components; i++) { | ||||
| @@ -798,16 +813,6 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) | ||||
|         return err; | ||||
|     integral_vk = (FFVkBuffer *)integral_buf->data; | ||||
|  | ||||
|     err = ff_vk_get_pooled_buffer(&s->vkctx, &s->state_buf_pool, &state_buf, | ||||
|                                   VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | | ||||
|                                   VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, | ||||
|                                   NULL, | ||||
|                                   s->opts.t * state_size, | ||||
|                                   VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); | ||||
|     if (err < 0) | ||||
|         return err; | ||||
|     state_vk = (FFVkBuffer *)state_buf->data; | ||||
|  | ||||
|     err = ff_vk_get_pooled_buffer(&s->vkctx, &s->ws_buf_pool, &ws_buf, | ||||
|                                   VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | | ||||
|                                   VK_BUFFER_USAGE_TRANSFER_DST_BIT | | ||||
| @@ -844,9 +849,12 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) | ||||
|     RET(ff_vk_exec_add_dep_frame(vkctx, exec, out, | ||||
|                                  VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, | ||||
|                                  VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); | ||||
|  | ||||
|     RET(ff_vk_exec_add_dep_buf(vkctx, exec, &integral_buf, 1, 0)); | ||||
|     RET(ff_vk_exec_add_dep_buf(vkctx, exec, &state_buf,    1, 0)); | ||||
|     integral_buf = NULL; | ||||
|  | ||||
|     RET(ff_vk_exec_add_dep_buf(vkctx, exec, &ws_buf,       1, 0)); | ||||
|     ws_buf = NULL; | ||||
|  | ||||
|     /* Input frame prep */ | ||||
|     RET(ff_vk_create_imageviews(vkctx, exec, in_views, in)); | ||||
| @@ -869,6 +877,7 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) | ||||
|                         VK_IMAGE_LAYOUT_GENERAL, | ||||
|                         VK_QUEUE_FAMILY_IGNORED); | ||||
|  | ||||
|     nb_buf_bar = 0; | ||||
|     buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { | ||||
|         .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, | ||||
|         .srcStageMask = ws_vk->stage, | ||||
| @@ -881,6 +890,19 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) | ||||
|         .size = ws_vk->size, | ||||
|         .offset = 0, | ||||
|     }; | ||||
|     buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { | ||||
|         .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, | ||||
|         .srcStageMask = integral_vk->stage, | ||||
|         .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, | ||||
|         .srcAccessMask = integral_vk->access, | ||||
|         .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | | ||||
|                          VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, | ||||
|         .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||||
|         .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||||
|         .buffer = integral_vk->buf, | ||||
|         .size = integral_vk->size, | ||||
|         .offset = 0, | ||||
|     }; | ||||
|  | ||||
|     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { | ||||
|             .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, | ||||
| @@ -891,10 +913,13 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) | ||||
|         }); | ||||
|     ws_vk->stage = buf_bar[0].dstStageMask; | ||||
|     ws_vk->access = buf_bar[0].dstAccessMask; | ||||
|     integral_vk->stage = buf_bar[1].dstStageMask; | ||||
|     integral_vk->access = buf_bar[1].dstAccessMask; | ||||
|  | ||||
|     /* Weights/sums buffer zeroing */ | ||||
|     /* Buffer zeroing */ | ||||
|     vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0); | ||||
|  | ||||
|     nb_buf_bar = 0; | ||||
|     buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { | ||||
|         .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, | ||||
|         .srcStageMask = ws_vk->stage, | ||||
| @@ -948,29 +973,22 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) | ||||
|     /* Weights pipeline */ | ||||
|     ff_vk_exec_bind_pipeline(vkctx, exec, &s->pl_weights); | ||||
|  | ||||
|     for (int i = 0; i < s->nb_offsets; i += TYPE_ELEMS) { | ||||
|         int *xoffs = s->xoffsets + i; | ||||
|         int *yoffs = s->yoffsets + i; | ||||
|     do { | ||||
|         int wg_invoc; | ||||
|         HorizontalPushData pd = { | ||||
|             integral_vk->address + t_offset*int_size, | ||||
|             state_vk->address + t_offset*state_size, | ||||
|             { 0 }, | ||||
|             { 0 }, | ||||
|             { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] }, | ||||
|             { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] }, | ||||
|             { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] }, | ||||
|             { s->patch[0], s->patch[1], s->patch[2], s->patch[3] }, | ||||
|             { s->strength[0], s->strength[1], s->strength[2], s->strength[2], }, | ||||
|             integral_vk->address, | ||||
|             int_size, | ||||
|             int_stride, | ||||
|             offsets_dispatched * 2, | ||||
|         }; | ||||
|  | ||||
|         memcpy(pd.xoffs, xoffs, sizeof(pd.xoffs)); | ||||
|         memcpy(pd.yoffs, yoffs, sizeof(pd.yoffs)); | ||||
|  | ||||
|         /* Put a barrier once we run out of parallelism buffers */ | ||||
|         if (!t_offset) { | ||||
|         if (offsets_dispatched) { | ||||
|             nb_buf_bar = 0; | ||||
|             /* Buffer prep/sync */ | ||||
|             buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { | ||||
|                 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, | ||||
|                 .srcStageMask = integral_vk->stage, | ||||
| @@ -984,39 +1002,28 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) | ||||
|                 .size = integral_vk->size, | ||||
|                 .offset = 0, | ||||
|             }; | ||||
|             buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { | ||||
|                 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, | ||||
|                 .srcStageMask = state_vk->stage, | ||||
|                 .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, | ||||
|                 .srcAccessMask = state_vk->access, | ||||
|                 .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | | ||||
|                                  VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, | ||||
|                 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||||
|                 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||||
|                 .buffer = state_vk->buf, | ||||
|                 .size = state_vk->size, | ||||
|                 .offset = 0, | ||||
|             }; | ||||
|  | ||||
|             vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { | ||||
|                     .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, | ||||
|                     .pBufferMemoryBarriers = buf_bar, | ||||
|                     .bufferMemoryBarrierCount = nb_buf_bar, | ||||
|                 }); | ||||
|             integral_vk->stage = buf_bar[0].dstStageMask; | ||||
|             integral_vk->access = buf_bar[0].dstAccessMask; | ||||
|             state_vk->stage = buf_bar[1].dstStageMask; | ||||
|             state_vk->access = buf_bar[1].dstAccessMask; | ||||
|             integral_vk->stage = buf_bar[1].dstStageMask; | ||||
|             integral_vk->access = buf_bar[1].dstAccessMask; | ||||
|         } | ||||
|         t_offset = (t_offset + 1) % s->opts.t; | ||||
|  | ||||
|         /* Push data */ | ||||
|         ff_vk_update_push_exec(vkctx, exec, &s->pl_weights, VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|                                0, sizeof(pd), &pd); | ||||
|  | ||||
|         wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t); | ||||
|         wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]); | ||||
|  | ||||
|         /* End of horizontal pass */ | ||||
|         vk->CmdDispatch(exec->buf, 1, 1, 1); | ||||
|     } | ||||
|         vk->CmdDispatch(exec->buf, 1, 1, wg_invoc); | ||||
|  | ||||
|         offsets_dispatched += wg_invoc * TYPE_ELEMS; | ||||
|     } while (offsets_dispatched < s->nb_offsets); | ||||
|  | ||||
|     RET(denoise_pass(s, exec, ws_vk, ws_stride)); | ||||
|  | ||||
| @@ -1033,6 +1040,8 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) | ||||
|     return ff_filter_frame(outlink, out); | ||||
|  | ||||
| fail: | ||||
|     av_buffer_unref(&integral_buf); | ||||
|     av_buffer_unref(&ws_buf); | ||||
|     av_frame_free(&in); | ||||
|     av_frame_free(&out); | ||||
|     return err; | ||||
| @@ -1051,7 +1060,6 @@ static void nlmeans_vulkan_uninit(AVFilterContext *avctx) | ||||
|     ff_vk_shader_free(vkctx, &s->shd_denoise); | ||||
|  | ||||
|     av_buffer_pool_uninit(&s->integral_buf_pool); | ||||
|     av_buffer_pool_uninit(&s->state_buf_pool); | ||||
|     av_buffer_pool_uninit(&s->ws_buf_pool); | ||||
|  | ||||
|     if (s->sampler) | ||||
|   | ||||
| @@ -1,151 +0,0 @@ | ||||
| #extension GL_EXT_buffer_reference : require | ||||
| #extension GL_EXT_buffer_reference2 : require | ||||
|  | ||||
| #define ACQUIRE gl_StorageSemanticsBuffer, gl_SemanticsAcquire | ||||
| #define RELEASE gl_StorageSemanticsBuffer, gl_SemanticsRelease | ||||
|  | ||||
| // These correspond to X, A, P respectively in the prefix sum paper. | ||||
| #define FLAG_NOT_READY       0u | ||||
| #define FLAG_AGGREGATE_READY 1u | ||||
| #define FLAG_PREFIX_READY    2u | ||||
|  | ||||
| layout(buffer_reference, buffer_reference_align = T_ALIGN) nonprivate buffer StateData { | ||||
|     DTYPE aggregate; | ||||
|     DTYPE prefix; | ||||
|     uint flag; | ||||
| }; | ||||
|  | ||||
| shared DTYPE sh_scratch[WG_SIZE]; | ||||
| shared DTYPE sh_prefix; | ||||
| shared uint  sh_part_ix; | ||||
| shared uint  sh_flag; | ||||
|  | ||||
| void prefix_sum(DataBuffer dst, uint dst_stride, DataBuffer src, uint src_stride) | ||||
| { | ||||
|     DTYPE local[N_ROWS]; | ||||
|     // Determine partition to process by atomic counter (described in Section 4.4 of prefix sum paper). | ||||
|     if (gl_GlobalInvocationID.x == 0) | ||||
|           sh_part_ix = gl_WorkGroupID.x; | ||||
| //        sh_part_ix = atomicAdd(part_counter, 1); | ||||
|  | ||||
|     barrier(); | ||||
|     uint part_ix = sh_part_ix; | ||||
|  | ||||
|     uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS; | ||||
|  | ||||
|     // TODO: gate buffer read? (evaluate whether shader check or CPU-side padding is better) | ||||
|     local[0] = src.v[ix*src_stride]; | ||||
|     for (uint i = 1; i < N_ROWS; i++) | ||||
|         local[i] = local[i - 1] + src.v[(ix + i)*src_stride]; | ||||
|  | ||||
|     DTYPE agg = local[N_ROWS - 1]; | ||||
|     sh_scratch[gl_LocalInvocationID.x] = agg; | ||||
|     for (uint i = 0; i < LG_WG_SIZE; i++) { | ||||
|         barrier(); | ||||
|         if (gl_LocalInvocationID.x >= (1u << i)) | ||||
|             agg += sh_scratch[gl_LocalInvocationID.x - (1u << i)]; | ||||
|         barrier(); | ||||
|  | ||||
|         sh_scratch[gl_LocalInvocationID.x] = agg; | ||||
|     } | ||||
|  | ||||
|     // Publish aggregate for this partition | ||||
|     if (gl_LocalInvocationID.x == WG_SIZE - 1) { | ||||
|         state[part_ix].aggregate = agg; | ||||
|         if (part_ix == 0) | ||||
|             state[0].prefix = agg; | ||||
|     } | ||||
|  | ||||
|     // Write flag with release semantics | ||||
|     if (gl_LocalInvocationID.x == WG_SIZE - 1) { | ||||
|         uint flag = part_ix == 0 ? FLAG_PREFIX_READY : FLAG_AGGREGATE_READY; | ||||
|         atomicStore(state[part_ix].flag, flag, gl_ScopeDevice, RELEASE); | ||||
|     } | ||||
|  | ||||
|     DTYPE exclusive = DTYPE(0); | ||||
|     if (part_ix != 0) { | ||||
|         // step 4 of paper: decoupled lookback | ||||
|         uint look_back_ix = part_ix - 1; | ||||
|  | ||||
|         DTYPE their_agg; | ||||
|         uint their_ix = 0; | ||||
|         while (true) { | ||||
|             // Read flag with acquire semantics. | ||||
|             if (gl_LocalInvocationID.x == WG_SIZE - 1) | ||||
|                 sh_flag = atomicLoad(state[look_back_ix].flag, gl_ScopeDevice, ACQUIRE); | ||||
|  | ||||
|             // The flag load is done only in the last thread. However, because the | ||||
|             // translation of memoryBarrierBuffer to Metal requires uniform control | ||||
|             // flow, we broadcast it to all threads. | ||||
|             barrier(); | ||||
|  | ||||
|             uint flag = sh_flag; | ||||
|             barrier(); | ||||
|  | ||||
|             if (flag == FLAG_PREFIX_READY) { | ||||
|                 if (gl_LocalInvocationID.x == WG_SIZE - 1) { | ||||
|                     DTYPE their_prefix = state[look_back_ix].prefix; | ||||
|                     exclusive = their_prefix + exclusive; | ||||
|                 } | ||||
|                 break; | ||||
|             } else if (flag == FLAG_AGGREGATE_READY) { | ||||
|                 if (gl_LocalInvocationID.x == WG_SIZE - 1) { | ||||
|                     their_agg = state[look_back_ix].aggregate; | ||||
|                     exclusive = their_agg + exclusive; | ||||
|                 } | ||||
|                 look_back_ix--; | ||||
|                 their_ix = 0; | ||||
|                 continue; | ||||
|             } // else spins | ||||
|  | ||||
|             if (gl_LocalInvocationID.x == WG_SIZE - 1) { | ||||
|                 // Unfortunately there's no guarantee of forward progress of other | ||||
|                 // workgroups, so compute a bit of the aggregate before trying again. | ||||
|                 // In the worst case, spinning stops when the aggregate is complete. | ||||
|                 DTYPE m = src.v[(look_back_ix * PARTITION_SIZE + their_ix)*src_stride]; | ||||
|                 if (their_ix == 0) | ||||
|                     their_agg = m; | ||||
|                 else | ||||
|                     their_agg += m; | ||||
|  | ||||
|                 their_ix++; | ||||
|                 if (their_ix == PARTITION_SIZE) { | ||||
|                     exclusive = their_agg + exclusive; | ||||
|                     if (look_back_ix == 0) { | ||||
|                         sh_flag = FLAG_PREFIX_READY; | ||||
|                     } else { | ||||
|                         look_back_ix--; | ||||
|                         their_ix = 0; | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|             barrier(); | ||||
|             flag = sh_flag; | ||||
|             barrier(); | ||||
|             if (flag == FLAG_PREFIX_READY) | ||||
|                 break; | ||||
|         } | ||||
|  | ||||
|         // step 5 of paper: compute inclusive prefix | ||||
|         if (gl_LocalInvocationID.x == WG_SIZE - 1) { | ||||
|             DTYPE inclusive_prefix = exclusive + agg; | ||||
|             sh_prefix = exclusive; | ||||
|             state[part_ix].prefix = inclusive_prefix; | ||||
|         } | ||||
|  | ||||
|         if (gl_LocalInvocationID.x == WG_SIZE - 1) | ||||
|             atomicStore(state[part_ix].flag, FLAG_PREFIX_READY, gl_ScopeDevice, RELEASE); | ||||
|     } | ||||
|  | ||||
|     barrier(); | ||||
|     if (part_ix != 0) | ||||
|         exclusive = sh_prefix; | ||||
|  | ||||
|     DTYPE row = exclusive; | ||||
|     if (gl_LocalInvocationID.x > 0) | ||||
|         row += sh_scratch[gl_LocalInvocationID.x - 1]; | ||||
|  | ||||
|     // note - may overwrite | ||||
|     for (uint i = 0; i < N_ROWS; i++) | ||||
|         dst.v[(ix + i)*dst_stride] = row + local[i]; | ||||
| } | ||||
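The deleted prefix_sum.comp above implemented a single-pass, partitioned prefix sum with decoupled look-back: each workgroup published its partition's aggregate and inclusive prefix through the StateData flags so that later partitions could resolve their exclusive prefix without a second dispatch. The commit drops it in favour of the simple running sums now generated inline. For orientation, a serial C model of what the removed shader computed (the real shader processed the partitions concurrently):

    #include <stddef.h>

    static void partitioned_prefix_sum(float *data, size_t n, size_t partition_size)
    {
        float exclusive = 0.0f;                   /* sum of all earlier partitions */
        for (size_t base = 0; base < n; base += partition_size) {
            size_t end = base + partition_size < n ? base + partition_size : n;
            float running = exclusive;
            for (size_t i = base; i < end; i++) { /* inclusive sum within the partition */
                running += data[i];
                data[i]  = running;
            }
            exclusive = running;                  /* next partition's exclusive prefix */
        }
    }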