You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	avfilter/vf_bwdif_cuda: CUDA accelerated bwdif deinterlacer
I've been sitting on this for 3 1/2 years now(!), and I finally got around to fixing the loose ends and convincing myself that it was correct. It follows the same basic structure as yadif_cuda, including leaving out the edge handling, to avoid expensive branching.
This commit is contained in:
		| @@ -19,6 +19,7 @@ version <next>: | ||||
| - vMix video decoder | ||||
| - Essential Video Coding parser, muxer and demuxer | ||||
| - Essential Video Coding frame merge bsf | ||||
| - bwdif_cuda filter | ||||
|  | ||||
| version 6.0: | ||||
| - Radiance HDR image support | ||||
|   | ||||
							
								
								
									
										2
									
								
								configure
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								configure
									
									
									
									
										vendored
									
									
								
							| @@ -3697,6 +3697,8 @@ blend_vulkan_filter_deps="vulkan spirv_compiler" | ||||
| boxblur_filter_deps="gpl" | ||||
| boxblur_opencl_filter_deps="opencl gpl" | ||||
| bs2b_filter_deps="libbs2b" | ||||
| bwdif_cuda_filter_deps="ffnvcodec" | ||||
| bwdif_cuda_filter_deps_any="cuda_nvcc cuda_llvm" | ||||
| bwdif_vulkan_filter_deps="vulkan spirv_compiler" | ||||
| chromaber_vulkan_filter_deps="vulkan spirv_compiler" | ||||
| color_vulkan_filter_deps="vulkan spirv_compiler" | ||||
|   | ||||
| @@ -9145,6 +9145,58 @@ Only deinterlace frames marked as interlaced. | ||||
| The default value is @code{all}. | ||||
| @end table | ||||
|  | ||||
| @section bwdif_cuda | ||||
|  | ||||
| Deinterlace the input video using the @ref{bwdif} algorithm, but implemented | ||||
| in CUDA so that it can work as part of a GPU accelerated pipeline with nvdec | ||||
| and/or nvenc. | ||||
|  | ||||
| It accepts the following parameters: | ||||
|  | ||||
| @table @option | ||||
| @item mode | ||||
| The interlacing mode to adopt. It accepts one of the following values: | ||||
|  | ||||
| @table @option | ||||
| @item 0, send_frame | ||||
| Output one frame for each frame. | ||||
| @item 1, send_field | ||||
| Output one frame for each field. | ||||
| @end table | ||||
|  | ||||
| The default value is @code{send_field}. | ||||
|  | ||||
| @item parity | ||||
| The picture field parity assumed for the input interlaced video. It accepts one | ||||
| of the following values: | ||||
|  | ||||
| @table @option | ||||
| @item 0, tff | ||||
| Assume the top field is first. | ||||
| @item 1, bff | ||||
| Assume the bottom field is first. | ||||
| @item -1, auto | ||||
| Enable automatic detection of field parity. | ||||
| @end table | ||||
|  | ||||
| The default value is @code{auto}. | ||||
| If the interlacing is unknown or the decoder does not export this information, | ||||
| top field first will be assumed. | ||||
|  | ||||
| @item deint | ||||
| Specify which frames to deinterlace. Accepts one of the following | ||||
| values: | ||||
|  | ||||
| @table @option | ||||
| @item 0, all | ||||
| Deinterlace all frames. | ||||
| @item 1, interlaced | ||||
| Only deinterlace frames marked as interlaced. | ||||
| @end table | ||||
|  | ||||
| The default value is @code{all}. | ||||
| @end table | ||||
|  | ||||
| @section ccrepack | ||||
|  | ||||
| Repack CEA-708 closed captioning side data | ||||
|   | ||||
| @@ -29,7 +29,7 @@ | ||||
|  | ||||
| #include "version_major.h" | ||||
|  | ||||
| #define LIBAVCODEC_VERSION_MINOR  19 | ||||
| #define LIBAVCODEC_VERSION_MINOR  20 | ||||
| #define LIBAVCODEC_VERSION_MICRO 100 | ||||
|  | ||||
| #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \ | ||||
|   | ||||
| @@ -213,6 +213,8 @@ OBJS-$(CONFIG_BOXBLUR_FILTER)                += vf_boxblur.o boxblur.o | ||||
| OBJS-$(CONFIG_BOXBLUR_OPENCL_FILTER)         += vf_avgblur_opencl.o opencl.o \ | ||||
|                                                 opencl/avgblur.o boxblur.o | ||||
| OBJS-$(CONFIG_BWDIF_FILTER)                  += vf_bwdif.o yadif_common.o | ||||
| OBJS-$(CONFIG_BWDIF_CUDA_FILTER)             += vf_bwdif_cuda.o vf_bwdif_cuda.ptx.o \ | ||||
|                                                 yadif_common.o | ||||
| OBJS-$(CONFIG_BWDIF_VULKAN_FILTER)           += vf_bwdif_vulkan.o yadif_common.o vulkan.o vulkan_filter.o | ||||
| OBJS-$(CONFIG_CAS_FILTER)                    += vf_cas.o | ||||
| OBJS-$(CONFIG_CCREPACK_FILTER)               += vf_ccrepack.o | ||||
|   | ||||
| @@ -197,6 +197,7 @@ extern const AVFilter ff_vf_bm3d; | ||||
| extern const AVFilter ff_vf_boxblur; | ||||
| extern const AVFilter ff_vf_boxblur_opencl; | ||||
| extern const AVFilter ff_vf_bwdif; | ||||
| extern const AVFilter ff_vf_bwdif_cuda; | ||||
| extern const AVFilter ff_vf_bwdif_vulkan; | ||||
| extern const AVFilter ff_vf_cas; | ||||
| extern const AVFilter ff_vf_ccrepack; | ||||
|   | ||||
							
								
								
									
										375
									
								
								libavfilter/vf_bwdif_cuda.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										375
									
								
								libavfilter/vf_bwdif_cuda.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,375 @@ | ||||
| /* | ||||
|  * Copyright (C) 2019 Philip Langdale <philipl@overt.org> | ||||
|  * | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| #include "libavutil/avassert.h" | ||||
| #include "libavutil/hwcontext.h" | ||||
| #include "libavutil/hwcontext_cuda_internal.h" | ||||
| #include "libavutil/cuda_check.h" | ||||
| #include "internal.h" | ||||
| #include "yadif.h" | ||||
|  | ||||
| #include "cuda/load_helper.h" | ||||
|  | ||||
| extern const unsigned char ff_vf_bwdif_cuda_ptx_data[]; | ||||
| extern const unsigned int ff_vf_bwdif_cuda_ptx_len; | ||||
|  | ||||
| typedef struct DeintCUDAContext { | ||||
|     YADIFContext yadif; | ||||
|  | ||||
|     AVCUDADeviceContext *hwctx; | ||||
|     AVBufferRef         *device_ref; | ||||
|     AVBufferRef         *input_frames_ref; | ||||
|     AVHWFramesContext   *input_frames; | ||||
|  | ||||
|     CUmodule    cu_module; | ||||
|     CUfunction  cu_func_uchar; | ||||
|     CUfunction  cu_func_uchar2; | ||||
|     CUfunction  cu_func_ushort; | ||||
|     CUfunction  cu_func_ushort2; | ||||
| } DeintCUDAContext; | ||||
|  | ||||
| #define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) | ||||
| #define ALIGN_UP(a, b) (((a) + (b) - 1) & ~((b) - 1)) | ||||
| #define BLOCKX 32 | ||||
| #define BLOCKY 16 | ||||
|  | ||||
| #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) | ||||
|  | ||||
| static CUresult call_kernel(AVFilterContext *ctx, CUfunction func, | ||||
|                             CUdeviceptr prev, CUdeviceptr cur, CUdeviceptr next, | ||||
|                             CUarray_format format, int channels, | ||||
|                             int src_width,  // Width is pixels per channel | ||||
|                             int src_height, // Height is pixels per channel | ||||
|                             int src_pitch,  // Pitch is bytes | ||||
|                             CUdeviceptr dst, | ||||
|                             int dst_width,  // Width is pixels per channel | ||||
|                             int dst_height, // Height is pixels per channel | ||||
|                             int dst_pitch,  // Pitch is pixels per channel | ||||
|                             int parity, int tff, int clip_max) | ||||
| { | ||||
|     DeintCUDAContext *s = ctx->priv; | ||||
|     YADIFContext *y = &s->yadif; | ||||
|     CudaFunctions *cu = s->hwctx->internal->cuda_dl; | ||||
|     CUtexObject tex_prev = 0, tex_cur = 0, tex_next = 0; | ||||
|     int is_field_end = y->current_field == YADIF_FIELD_END; | ||||
|     int ret; | ||||
|  | ||||
|     void *args[] = { &dst, &tex_prev, &tex_cur, &tex_next, | ||||
|                      &dst_width, &dst_height, &dst_pitch, | ||||
|                      &src_width, &src_height, &parity, &tff, | ||||
|                      &is_field_end, &clip_max }; | ||||
|  | ||||
|     CUDA_TEXTURE_DESC tex_desc = { | ||||
|         .filterMode = CU_TR_FILTER_MODE_POINT, | ||||
|         .flags = CU_TRSF_READ_AS_INTEGER, | ||||
|     }; | ||||
|  | ||||
|     CUDA_RESOURCE_DESC res_desc = { | ||||
|         .resType = CU_RESOURCE_TYPE_PITCH2D, | ||||
|         .res.pitch2D.format = format, | ||||
|         .res.pitch2D.numChannels = channels, | ||||
|         .res.pitch2D.width = src_width, | ||||
|         .res.pitch2D.height = src_height, | ||||
|         .res.pitch2D.pitchInBytes = src_pitch, | ||||
|     }; | ||||
|  | ||||
|     res_desc.res.pitch2D.devPtr = (CUdeviceptr)prev; | ||||
|     ret = CHECK_CU(cu->cuTexObjectCreate(&tex_prev, &res_desc, &tex_desc, NULL)); | ||||
|     if (ret < 0) | ||||
|         goto exit; | ||||
|  | ||||
|     res_desc.res.pitch2D.devPtr = (CUdeviceptr)cur; | ||||
|     ret = CHECK_CU(cu->cuTexObjectCreate(&tex_cur, &res_desc, &tex_desc, NULL)); | ||||
|     if (ret < 0) | ||||
|         goto exit; | ||||
|  | ||||
|     res_desc.res.pitch2D.devPtr = (CUdeviceptr)next; | ||||
|     ret = CHECK_CU(cu->cuTexObjectCreate(&tex_next, &res_desc, &tex_desc, NULL)); | ||||
|     if (ret < 0) | ||||
|         goto exit; | ||||
|  | ||||
|     ret = CHECK_CU(cu->cuLaunchKernel(func, | ||||
|                                       DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1, | ||||
|                                       BLOCKX, BLOCKY, 1, | ||||
|                                       0, s->hwctx->stream, args, NULL)); | ||||
|  | ||||
| exit: | ||||
|     if (tex_prev) | ||||
|         CHECK_CU(cu->cuTexObjectDestroy(tex_prev)); | ||||
|     if (tex_cur) | ||||
|         CHECK_CU(cu->cuTexObjectDestroy(tex_cur)); | ||||
|     if (tex_next) | ||||
|         CHECK_CU(cu->cuTexObjectDestroy(tex_next)); | ||||
|  | ||||
|     return ret; | ||||
| } | ||||
|  | ||||
| static void filter(AVFilterContext *ctx, AVFrame *dst, | ||||
|                    int parity, int tff) | ||||
| { | ||||
|     DeintCUDAContext *s = ctx->priv; | ||||
|     YADIFContext *y = &s->yadif; | ||||
|     CudaFunctions *cu = s->hwctx->internal->cuda_dl; | ||||
|     CUcontext dummy; | ||||
|     int i, ret; | ||||
|  | ||||
|     ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); | ||||
|     if (ret < 0) | ||||
|         return; | ||||
|  | ||||
|     for (i = 0; i < y->csp->nb_components; i++) { | ||||
|         CUfunction func; | ||||
|         CUarray_format format; | ||||
|         int pixel_size, channels, clip_max; | ||||
|         const AVComponentDescriptor *comp = &y->csp->comp[i]; | ||||
|  | ||||
|         if (comp->plane < i) { | ||||
|             // We process planes as a whole, so don't reprocess | ||||
|             // them for additional components | ||||
|             continue; | ||||
|         } | ||||
|  | ||||
|         pixel_size = (comp->depth + comp->shift) / 8; | ||||
|         channels = comp->step / pixel_size; | ||||
|         if (pixel_size > 2 || channels > 2) { | ||||
|             av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n", y->csp->name); | ||||
|             goto exit; | ||||
|         } | ||||
|         switch (pixel_size) { | ||||
|         case 1: | ||||
|             func = channels == 1 ? s->cu_func_uchar : s->cu_func_uchar2; | ||||
|             format = CU_AD_FORMAT_UNSIGNED_INT8; | ||||
|             break; | ||||
|         case 2: | ||||
|             func = channels == 1 ? s->cu_func_ushort : s->cu_func_ushort2; | ||||
|             format = CU_AD_FORMAT_UNSIGNED_INT16; | ||||
|             break; | ||||
|         default: | ||||
|             av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n", y->csp->name); | ||||
|             goto exit; | ||||
|         } | ||||
|  | ||||
|         clip_max = (1 << (comp->depth + comp->shift)) - 1; | ||||
|  | ||||
|         av_log(ctx, AV_LOG_TRACE, | ||||
|                "Deinterlacing plane %d: pixel_size: %d channels: %d\n", | ||||
|                comp->plane, pixel_size, channels); | ||||
|         call_kernel(ctx, func, | ||||
|                     (CUdeviceptr)y->prev->data[i], | ||||
|                     (CUdeviceptr)y->cur->data[i], | ||||
|                     (CUdeviceptr)y->next->data[i], | ||||
|                     format, channels, | ||||
|                     AV_CEIL_RSHIFT(y->cur->width, i ? y->csp->log2_chroma_w : 0), | ||||
|                     AV_CEIL_RSHIFT(y->cur->height, i ? y->csp->log2_chroma_h : 0), | ||||
|                     y->cur->linesize[i], | ||||
|                     (CUdeviceptr)dst->data[i], | ||||
|                     AV_CEIL_RSHIFT(dst->width, i ? y->csp->log2_chroma_w : 0), | ||||
|                     AV_CEIL_RSHIFT(dst->height, i ? y->csp->log2_chroma_h : 0), | ||||
|                     dst->linesize[i] / comp->step, | ||||
|                     parity, tff, clip_max); | ||||
|     } | ||||
|  | ||||
|     if (y->current_field == YADIF_FIELD_END) { | ||||
|         y->current_field = YADIF_FIELD_NORMAL; | ||||
|     } | ||||
|  | ||||
| exit: | ||||
|     CHECK_CU(cu->cuCtxPopCurrent(&dummy)); | ||||
|     return; | ||||
| } | ||||
|  | ||||
| static av_cold void deint_cuda_uninit(AVFilterContext *ctx) | ||||
| { | ||||
|     CUcontext dummy; | ||||
|     DeintCUDAContext *s = ctx->priv; | ||||
|     YADIFContext *y = &s->yadif; | ||||
|  | ||||
|     if (s->hwctx && s->cu_module) { | ||||
|         CudaFunctions *cu = s->hwctx->internal->cuda_dl; | ||||
|         CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); | ||||
|         CHECK_CU(cu->cuModuleUnload(s->cu_module)); | ||||
|         CHECK_CU(cu->cuCtxPopCurrent(&dummy)); | ||||
|     } | ||||
|  | ||||
|     av_frame_free(&y->prev); | ||||
|     av_frame_free(&y->cur); | ||||
|     av_frame_free(&y->next); | ||||
|  | ||||
|     av_buffer_unref(&s->device_ref); | ||||
|     s->hwctx = NULL; | ||||
|     av_buffer_unref(&s->input_frames_ref); | ||||
|     s->input_frames = NULL; | ||||
| } | ||||
|  | ||||
| static int config_input(AVFilterLink *inlink) | ||||
| { | ||||
|     AVFilterContext *ctx = inlink->dst; | ||||
|     DeintCUDAContext *s  = ctx->priv; | ||||
|  | ||||
|     if (!inlink->hw_frames_ctx) { | ||||
|         av_log(ctx, AV_LOG_ERROR, "A hardware frames reference is " | ||||
|                "required to associate the processing device.\n"); | ||||
|         return AVERROR(EINVAL); | ||||
|     } | ||||
|  | ||||
|     s->input_frames_ref = av_buffer_ref(inlink->hw_frames_ctx); | ||||
|     if (!s->input_frames_ref) { | ||||
|         av_log(ctx, AV_LOG_ERROR, "A input frames reference create " | ||||
|                "failed.\n"); | ||||
|         return AVERROR(ENOMEM); | ||||
|     } | ||||
|     s->input_frames = (AVHWFramesContext*)s->input_frames_ref->data; | ||||
|  | ||||
|     return 0; | ||||
| } | ||||
|  | ||||
| static int config_output(AVFilterLink *link) | ||||
| { | ||||
|     AVHWFramesContext *output_frames; | ||||
|     AVFilterContext *ctx = link->src; | ||||
|     DeintCUDAContext *s = ctx->priv; | ||||
|     YADIFContext *y = &s->yadif; | ||||
|     CudaFunctions *cu; | ||||
|     int ret = 0; | ||||
|     CUcontext dummy; | ||||
|  | ||||
|     av_assert0(s->input_frames); | ||||
|     s->device_ref = av_buffer_ref(s->input_frames->device_ref); | ||||
|     if (!s->device_ref) { | ||||
|         av_log(ctx, AV_LOG_ERROR, "A device reference create " | ||||
|                "failed.\n"); | ||||
|         return AVERROR(ENOMEM); | ||||
|     } | ||||
|     s->hwctx = ((AVHWDeviceContext*)s->device_ref->data)->hwctx; | ||||
|     cu = s->hwctx->internal->cuda_dl; | ||||
|  | ||||
|     link->hw_frames_ctx = av_hwframe_ctx_alloc(s->device_ref); | ||||
|     if (!link->hw_frames_ctx) { | ||||
|         av_log(ctx, AV_LOG_ERROR, "Failed to create HW frame context " | ||||
|                "for output.\n"); | ||||
|         ret = AVERROR(ENOMEM); | ||||
|         goto exit; | ||||
|     } | ||||
|  | ||||
|     output_frames = (AVHWFramesContext*)link->hw_frames_ctx->data; | ||||
|  | ||||
|     output_frames->format    = AV_PIX_FMT_CUDA; | ||||
|     output_frames->sw_format = s->input_frames->sw_format; | ||||
|     output_frames->width     = ctx->inputs[0]->w; | ||||
|     output_frames->height    = ctx->inputs[0]->h; | ||||
|  | ||||
|     output_frames->initial_pool_size = 4; | ||||
|  | ||||
|     ret = ff_filter_init_hw_frames(ctx, link, 10); | ||||
|     if (ret < 0) | ||||
|         goto exit; | ||||
|  | ||||
|     ret = av_hwframe_ctx_init(link->hw_frames_ctx); | ||||
|     if (ret < 0) { | ||||
|         av_log(ctx, AV_LOG_ERROR, "Failed to initialise CUDA frame " | ||||
|                "context for output: %d\n", ret); | ||||
|         goto exit; | ||||
|     } | ||||
|  | ||||
|     link->time_base = av_mul_q(ctx->inputs[0]->time_base, (AVRational){1, 2}); | ||||
|     link->w         = ctx->inputs[0]->w; | ||||
|     link->h         = ctx->inputs[0]->h; | ||||
|  | ||||
|     if(y->mode & 1) | ||||
|         link->frame_rate = av_mul_q(ctx->inputs[0]->frame_rate, | ||||
|                                     (AVRational){2, 1}); | ||||
|  | ||||
|     if (link->w < 3 || link->h < 3) { | ||||
|         av_log(ctx, AV_LOG_ERROR, "Video of less than 3 columns or lines is not supported\n"); | ||||
|         ret = AVERROR(EINVAL); | ||||
|         goto exit; | ||||
|     } | ||||
|  | ||||
|     y->csp = av_pix_fmt_desc_get(output_frames->sw_format); | ||||
|     y->filter = filter; | ||||
|  | ||||
|     ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); | ||||
|     if (ret < 0) | ||||
|         goto exit; | ||||
|  | ||||
|     ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, ff_vf_bwdif_cuda_ptx_data, ff_vf_bwdif_cuda_ptx_len); | ||||
|     if (ret < 0) | ||||
|         goto exit; | ||||
|  | ||||
|     ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "bwdif_uchar")); | ||||
|     if (ret < 0) | ||||
|         goto exit; | ||||
|  | ||||
|     ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "bwdif_uchar2")); | ||||
|     if (ret < 0) | ||||
|         goto exit; | ||||
|  | ||||
|     ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "bwdif_ushort")); | ||||
|     if (ret < 0) | ||||
|         goto exit; | ||||
|  | ||||
|     ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "bwdif_ushort2")); | ||||
|     if (ret < 0) | ||||
|         goto exit; | ||||
|  | ||||
| exit: | ||||
|     CHECK_CU(cu->cuCtxPopCurrent(&dummy)); | ||||
|  | ||||
|     return ret; | ||||
| } | ||||
|  | ||||
| static const AVClass bwdif_cuda_class = { | ||||
|     .class_name = "bwdif_cuda", | ||||
|     .item_name  = av_default_item_name, | ||||
|     .option     = ff_yadif_options, | ||||
|     .version    = LIBAVUTIL_VERSION_INT, | ||||
|     .category   = AV_CLASS_CATEGORY_FILTER, | ||||
| }; | ||||
|  | ||||
| static const AVFilterPad deint_cuda_inputs[] = { | ||||
|     { | ||||
|         .name          = "default", | ||||
|         .type          = AVMEDIA_TYPE_VIDEO, | ||||
|         .filter_frame  = ff_yadif_filter_frame, | ||||
|         .config_props  = config_input, | ||||
|     }, | ||||
| }; | ||||
|  | ||||
| static const AVFilterPad deint_cuda_outputs[] = { | ||||
|     { | ||||
|         .name          = "default", | ||||
|         .type          = AVMEDIA_TYPE_VIDEO, | ||||
|         .request_frame = ff_yadif_request_frame, | ||||
|         .config_props  = config_output, | ||||
|     }, | ||||
| }; | ||||
|  | ||||
| const AVFilter ff_vf_bwdif_cuda = { | ||||
|     .name           = "bwdif_cuda", | ||||
|     .description    = NULL_IF_CONFIG_SMALL("Deinterlace CUDA frames"), | ||||
|     .priv_size      = sizeof(DeintCUDAContext), | ||||
|     .priv_class     = &bwdif_cuda_class, | ||||
|     .uninit         = deint_cuda_uninit, | ||||
|     FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA), | ||||
|     FILTER_INPUTS(deint_cuda_inputs), | ||||
|     FILTER_OUTPUTS(deint_cuda_outputs), | ||||
|     .flags          = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL, | ||||
|     .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, | ||||
| }; | ||||
							
								
								
									
										309
									
								
								libavfilter/vf_bwdif_cuda.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										309
									
								
								libavfilter/vf_bwdif_cuda.cu
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,309 @@ | ||||
| /* | ||||
|  * Copyright (C) 2019 Philip Langdale <philipl@overt.org> | ||||
|  * | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| __device__ static const int coef_lf[2] = { 4309, 213 }; | ||||
| __device__ static const int coef_hf[3] = { 5570, 3801, 1016 }; | ||||
| __device__ static const int coef_sp[2] = { 5077, 981 }; | ||||
|  | ||||
| template<typename T> | ||||
| __inline__ __device__ T max3(T a, T b, T c) | ||||
| { | ||||
|     T x = max(a, b); | ||||
|     return max(x, c); | ||||
| } | ||||
|  | ||||
| template<typename T> | ||||
| __inline__ __device__ T min3(T a, T b, T c) | ||||
| { | ||||
|     T x = min(a, b); | ||||
|     return min(x, c); | ||||
| } | ||||
|  | ||||
| template<typename T> | ||||
| __inline__ __device__ T clip(T a, T min, T max) | ||||
| { | ||||
|     if (a < min) { | ||||
|         return min; | ||||
|     } else if (a > max) { | ||||
|         return max; | ||||
|     } else { | ||||
|         return a; | ||||
|     } | ||||
| } | ||||
|  | ||||
| template<typename T> | ||||
| __inline__ __device__ T filter_intra(T cur_prefs3, T cur_prefs, | ||||
|                                      T cur_mrefs, T cur_mrefs3, | ||||
|                                      int clip_max) | ||||
| { | ||||
|     int final = (coef_sp[0] * (cur_mrefs + cur_prefs) - | ||||
|                  coef_sp[1] * (cur_mrefs3 + cur_prefs3)) >> 13; | ||||
|     return clip(final, 0, clip_max); | ||||
| } | ||||
|  | ||||
| template<typename T> | ||||
| __inline__ __device__ T filter(T cur_prefs3, T cur_prefs, T cur_mrefs, T cur_mrefs3, | ||||
|                                T prev2_prefs4, T prev2_prefs2, T prev2_0, T prev2_mrefs2, T prev2_mrefs4, | ||||
|                                T prev_prefs, T prev_mrefs, T next_prefs, T next_mrefs, | ||||
|                                T next2_prefs4, T next2_prefs2, T next2_0, T next2_mrefs2, T next2_mrefs4, | ||||
|                                int clip_max) | ||||
| { | ||||
|     T final; | ||||
|  | ||||
|     int c = cur_mrefs; | ||||
|     int d = (prev2_0 + next2_0) >> 1; | ||||
|     int e = cur_prefs; | ||||
|  | ||||
|     int temporal_diff0 = abs(prev2_0 - next2_0); | ||||
|     int temporal_diff1 = (abs(prev_mrefs - c) + abs(prev_prefs - e)) >> 1; | ||||
|     int temporal_diff2 = (abs(next_mrefs - c) + abs(next_prefs - e)) >> 1; | ||||
|     int diff = max3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); | ||||
|  | ||||
|     if (!diff) { | ||||
|         final = d; | ||||
|     } else { | ||||
|         int b = ((prev2_mrefs2 + next2_mrefs2) >> 1) - c; | ||||
|         int f = ((prev2_prefs2 + next2_prefs2) >> 1) - e; | ||||
|         int dc = d - c; | ||||
|         int de = d - e; | ||||
|         int mmax = max3(de, dc, min(b, f)); | ||||
|         int mmin = min3(de, dc, max(b, f)); | ||||
|         diff = max3(diff, mmin, -mmax); | ||||
|  | ||||
|         int interpol; | ||||
|         if (abs(c - e) > temporal_diff0) { | ||||
|             interpol = (((coef_hf[0] * (prev2_0 + next2_0) | ||||
|                 - coef_hf[1] * (prev2_mrefs2 + next2_mrefs2 + prev2_prefs2 + next2_prefs2) | ||||
|                 + coef_hf[2] * (prev2_mrefs4 + next2_mrefs4 + prev2_prefs4 + next2_mrefs4)) >> 2) | ||||
|                 + coef_lf[0] * (c + e) - coef_lf[1] * (cur_mrefs3 + cur_prefs3)) >> 13; | ||||
|         } else { | ||||
|             interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (cur_mrefs3 + cur_prefs3)) >> 13; | ||||
|         } | ||||
|  | ||||
|         if (interpol > d + diff) { | ||||
|             interpol = d + diff; | ||||
|         } else if (interpol < d - diff) { | ||||
|             interpol = d - diff; | ||||
|         } | ||||
|         final = clip(interpol, 0, clip_max); | ||||
|     } | ||||
|  | ||||
|     return final; | ||||
| } | ||||
|  | ||||
| template<typename T> | ||||
| __inline__ __device__ void bwdif_single(T *dst, | ||||
|                                         cudaTextureObject_t prev, | ||||
|                                         cudaTextureObject_t cur, | ||||
|                                         cudaTextureObject_t next, | ||||
|                                         int dst_width, int dst_height, int dst_pitch, | ||||
|                                         int src_width, int src_height, | ||||
|                                         int parity, int tff, | ||||
|                                         int is_field_end, int clip_max) | ||||
| { | ||||
|     // Identify location | ||||
|     int xo = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|     int yo = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|     if (xo >= dst_width || yo >= dst_height) { | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     // Don't modify the primary field | ||||
|     if (yo % 2 == parity) { | ||||
|       dst[yo*dst_pitch+xo] = tex2D<T>(cur, xo, yo); | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|     T cur_prefs3 = tex2D<T>(cur, xo, yo + 3); | ||||
|     T cur_prefs = tex2D<T>(cur, xo, yo + 1); | ||||
|     T cur_mrefs = tex2D<T>(cur, xo, yo - 1); | ||||
|     T cur_mrefs3 = tex2D<T>(cur, xo, yo - 3); | ||||
|  | ||||
|     if (is_field_end) { | ||||
|         dst[yo*dst_pitch+xo] = | ||||
|             filter_intra(cur_prefs3, cur_prefs, cur_mrefs, cur_mrefs3, clip_max); | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     // Calculate temporal prediction | ||||
|     int is_second_field = !(parity ^ tff); | ||||
|  | ||||
|     cudaTextureObject_t prev2 = prev; | ||||
|     cudaTextureObject_t prev1 = is_second_field ? cur : prev; | ||||
|     cudaTextureObject_t next1 = is_second_field ? next : cur; | ||||
|     cudaTextureObject_t next2 = next; | ||||
|  | ||||
|     T prev2_prefs4 = tex2D<T>(prev2, xo,  yo + 4); | ||||
|     T prev2_prefs2 = tex2D<T>(prev2, xo,  yo + 2); | ||||
|     T prev2_0 = tex2D<T>(prev2, xo,  yo + 0); | ||||
|     T prev2_mrefs2 = tex2D<T>(prev2, xo,  yo - 2); | ||||
|     T prev2_mrefs4 = tex2D<T>(prev2, xo,  yo - 4); | ||||
|     T prev_prefs = tex2D<T>(prev1, xo,  yo + 1); | ||||
|     T prev_mrefs = tex2D<T>(prev1, xo,  yo - 1); | ||||
|     T next_prefs = tex2D<T>(next1, xo,  yo + 1); | ||||
|     T next_mrefs = tex2D<T>(next1, xo,  yo - 1); | ||||
|     T next2_prefs4 = tex2D<T>(next2, xo,  yo + 4); | ||||
|     T next2_prefs2 = tex2D<T>(next2, xo,  yo + 2); | ||||
|     T next2_0 = tex2D<T>(next2, xo,  yo + 0); | ||||
|     T next2_mrefs2 = tex2D<T>(next2, xo,  yo - 2); | ||||
|     T next2_mrefs4 = tex2D<T>(next2, xo,  yo - 4); | ||||
|  | ||||
|     dst[yo*dst_pitch+xo] = filter(cur_prefs3, cur_prefs, cur_mrefs, cur_mrefs3, | ||||
|                                   prev2_prefs4, prev2_prefs2, prev2_0, prev2_mrefs2, prev2_mrefs4, | ||||
|                                   prev_prefs, prev_mrefs, next_prefs, next_mrefs, | ||||
|                                   next2_prefs4, next2_prefs2, next2_0, next2_mrefs2, next2_mrefs4, | ||||
|                                   clip_max); | ||||
| } | ||||
|  | ||||
| template <typename T> | ||||
| __inline__ __device__ void bwdif_double(T *dst, | ||||
|                                         cudaTextureObject_t prev, | ||||
|                                         cudaTextureObject_t cur, | ||||
|                                         cudaTextureObject_t next, | ||||
|                                         int dst_width, int dst_height, int dst_pitch, | ||||
|                                         int src_width, int src_height, | ||||
|                                         int parity, int tff, | ||||
|                                         int is_field_end, int clip_max) | ||||
| { | ||||
|     int xo = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|     int yo = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|     if (xo >= dst_width || yo >= dst_height) { | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     if (yo % 2 == parity) { | ||||
|       // Don't modify the primary field | ||||
|       dst[yo*dst_pitch+xo] = tex2D<T>(cur, xo, yo); | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|     T cur_prefs3 = tex2D<T>(cur, xo, yo + 3); | ||||
|     T cur_prefs = tex2D<T>(cur, xo, yo + 1); | ||||
|     T cur_mrefs = tex2D<T>(cur, xo, yo - 1); | ||||
|     T cur_mrefs3 = tex2D<T>(cur, xo, yo - 3); | ||||
|  | ||||
|     if (is_field_end) { | ||||
|         T final; | ||||
|         final.x = filter_intra(cur_prefs3.x, cur_prefs.x, cur_mrefs.x, cur_mrefs3.x, | ||||
|                                clip_max); | ||||
|         final.y = filter_intra(cur_prefs3.y, cur_prefs.y, cur_mrefs.y, cur_mrefs3.y, | ||||
|                                clip_max); | ||||
|         dst[yo*dst_pitch+xo] = final; | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     int is_second_field = !(parity ^ tff); | ||||
|  | ||||
|     cudaTextureObject_t prev2 = prev; | ||||
|     cudaTextureObject_t prev1 = is_second_field ? cur : prev; | ||||
|     cudaTextureObject_t next1 = is_second_field ? next : cur; | ||||
|     cudaTextureObject_t next2 = next; | ||||
|  | ||||
|     T prev2_prefs4 = tex2D<T>(prev2, xo,  yo + 4); | ||||
|     T prev2_prefs2 = tex2D<T>(prev2, xo,  yo + 2); | ||||
|     T prev2_0 = tex2D<T>(prev2, xo,  yo + 0); | ||||
|     T prev2_mrefs2 = tex2D<T>(prev2, xo,  yo - 2); | ||||
|     T prev2_mrefs4 = tex2D<T>(prev2, xo,  yo - 4); | ||||
|     T prev_prefs = tex2D<T>(prev1, xo,  yo + 1); | ||||
|     T prev_mrefs = tex2D<T>(prev1, xo,  yo - 1); | ||||
|     T next_prefs = tex2D<T>(next1, xo,  yo + 1); | ||||
|     T next_mrefs = tex2D<T>(next1, xo,  yo - 1); | ||||
|     T next2_prefs4 = tex2D<T>(next2, xo,  yo + 4); | ||||
|     T next2_prefs2 = tex2D<T>(next2, xo,  yo + 2); | ||||
|     T next2_0 = tex2D<T>(next2, xo,  yo + 0); | ||||
|     T next2_mrefs2 = tex2D<T>(next2, xo,  yo - 2); | ||||
|     T next2_mrefs4 = tex2D<T>(next2, xo,  yo - 4); | ||||
|  | ||||
|     T final; | ||||
|     final.x = filter(cur_prefs3.x, cur_prefs.x, cur_mrefs.x, cur_mrefs3.x, | ||||
|                      prev2_prefs4.x, prev2_prefs2.x, prev2_0.x, prev2_mrefs2.x, prev2_mrefs4.x, | ||||
|                      prev_prefs.x, prev_mrefs.x, next_prefs.x, next_mrefs.x, | ||||
|                      next2_prefs4.x, next2_prefs2.x, next2_0.x, next2_mrefs2.x, next2_mrefs4.x, | ||||
|                      clip_max); | ||||
|     final.y = filter(cur_prefs3.y, cur_prefs.y, cur_mrefs.y, cur_mrefs3.y, | ||||
|                      prev2_prefs4.y, prev2_prefs2.y, prev2_0.y, prev2_mrefs2.y, prev2_mrefs4.y, | ||||
|                      prev_prefs.y, prev_mrefs.y, next_prefs.y, next_mrefs.y, | ||||
|                      next2_prefs4.y, next2_prefs2.y, next2_0.y, next2_mrefs2.y, next2_mrefs4.y, | ||||
|                      clip_max); | ||||
|  | ||||
|     dst[yo*dst_pitch+xo] = final; | ||||
| } | ||||
|  | ||||
| extern "C" { | ||||
|  | ||||
| __global__ void bwdif_uchar(unsigned char *dst, | ||||
|                             cudaTextureObject_t prev, | ||||
|                             cudaTextureObject_t cur, | ||||
|                             cudaTextureObject_t next, | ||||
|                             int dst_width, int dst_height, int dst_pitch, | ||||
|                             int src_width, int src_height, | ||||
|                             int parity, int tff, int is_field_end, int clip_max) | ||||
| { | ||||
|     bwdif_single(dst, prev, cur, next, | ||||
|                  dst_width, dst_height, dst_pitch, | ||||
|                  src_width, src_height, | ||||
|                  parity, tff, is_field_end, clip_max); | ||||
| } | ||||
|  | ||||
| __global__ void bwdif_ushort(unsigned short *dst, | ||||
|                             cudaTextureObject_t prev, | ||||
|                             cudaTextureObject_t cur, | ||||
|                             cudaTextureObject_t next, | ||||
|                             int dst_width, int dst_height, int dst_pitch, | ||||
|                             int src_width, int src_height, | ||||
|                             int parity, int tff, int is_field_end, int clip_max) | ||||
| { | ||||
|     bwdif_single(dst, prev, cur, next, | ||||
|                  dst_width, dst_height, dst_pitch, | ||||
|                  src_width, src_height, | ||||
|                  parity, tff, is_field_end, clip_max); | ||||
| } | ||||
|  | ||||
| __global__ void bwdif_uchar2(uchar2 *dst, | ||||
|                             cudaTextureObject_t prev, | ||||
|                             cudaTextureObject_t cur, | ||||
|                             cudaTextureObject_t next, | ||||
|                             int dst_width, int dst_height, int dst_pitch, | ||||
|                             int src_width, int src_height, | ||||
|                             int parity, int tff, int is_field_end, int clip_max) | ||||
| { | ||||
|     bwdif_double(dst, prev, cur, next, | ||||
|                  dst_width, dst_height, dst_pitch, | ||||
|                  src_width, src_height, | ||||
|                  parity, tff, is_field_end, clip_max); | ||||
| } | ||||
|  | ||||
| __global__ void bwdif_ushort2(ushort2 *dst, | ||||
|                             cudaTextureObject_t prev, | ||||
|                             cudaTextureObject_t cur, | ||||
|                             cudaTextureObject_t next, | ||||
|                             int dst_width, int dst_height, int dst_pitch, | ||||
|                             int src_width, int src_height, | ||||
|                             int parity, int tff, int is_field_end, int clip_max) | ||||
| { | ||||
|     bwdif_double(dst, prev, cur, next, | ||||
|                  dst_width, dst_height, dst_pitch, | ||||
|                  src_width, src_height, | ||||
|                  parity, tff, is_field_end, clip_max); | ||||
| } | ||||
|  | ||||
| } /* extern "C" */ | ||||
		Reference in New Issue
	
	Block a user