You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	avfilter: add ocr filter
Signed-off-by: Paul B Mahol <onemda@gmail.com>
This commit is contained in:
		| @@ -4,6 +4,7 @@ releases are sorted from youngest to oldest. | ||||
| version <next>: | ||||
| - DXV decoding | ||||
| - extrastereo filter | ||||
| - ocr filter | ||||
|  | ||||
|  | ||||
| version 2.8: | ||||
|   | ||||
							
								
								
									
										4
									
								
								configure
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								configure
									
									
									
									
										vendored
									
									
								
							| @@ -246,6 +246,7 @@ External library support: | ||||
|   --enable-libspeex        enable Speex de/encoding via libspeex [no] | ||||
|   --enable-libssh          enable SFTP protocol via libssh [no] | ||||
|   --enable-libstagefright-h264  enable H.264 decoding via libstagefright [no] | ||||
|   --enable-libtesseract    enable Tesseract, needed for ocr filter [no] | ||||
|   --enable-libtheora       enable Theora encoding via libtheora [no] | ||||
|   --enable-libtwolame      enable MP2 encoding via libtwolame [no] | ||||
|   --enable-libutvideo      enable Ut Video encoding and decoding via libutvideo [no] | ||||
| @@ -1409,6 +1410,7 @@ EXTERNAL_LIBRARY_LIST=" | ||||
|     libspeex | ||||
|     libssh | ||||
|     libstagefright_h264 | ||||
|     libtesseract | ||||
|     libtheora | ||||
|     libtwolame | ||||
|     libutvideo | ||||
| @@ -2774,6 +2776,7 @@ mptestsrc_filter_deps="gpl" | ||||
| negate_filter_deps="lut_filter" | ||||
| perspective_filter_deps="gpl" | ||||
| pp7_filter_deps="gpl" | ||||
| ocr_filter_deps="libtesseract" | ||||
| ocv_filter_deps="libopencv" | ||||
| owdenoise_filter_deps="gpl" | ||||
| pan_filter_deps="swresample" | ||||
| @@ -5275,6 +5278,7 @@ enabled libspeex          && require_pkg_config speex speex/speex.h speex_decode | ||||
| enabled libstagefright_h264 && require_cpp libstagefright_h264 "binder/ProcessState.h media/stagefright/MetaData.h | ||||
|     media/stagefright/MediaBufferGroup.h media/stagefright/MediaDebug.h media/stagefright/MediaDefs.h | ||||
|     media/stagefright/OMXClient.h media/stagefright/OMXCodec.h" android::OMXClient -lstagefright -lmedia -lutils -lbinder -lgnustl_static | ||||
| enabled libtesseract      && require_pkg_config tesseract tesseract/capi.h TessBaseAPICreate | ||||
| enabled libtheora         && require libtheora theora/theoraenc.h th_info_init -ltheoraenc -ltheoradec -logg | ||||
| enabled libtwolame        && require libtwolame twolame.h twolame_init -ltwolame && | ||||
|                              { check_lib twolame.h twolame_encode_buffer_float32_interleaved -ltwolame || | ||||
|   | ||||
| @@ -7485,6 +7485,30 @@ noise=alls=20:allf=t+u | ||||
|  | ||||
| Pass the video source unchanged to the output. | ||||
|  | ||||
| @section ocr | ||||
| Optical Character Recognition | ||||
|  | ||||
| This filter uses Tesseract for optical character recognition. | ||||
|  | ||||
| It accepts the following options: | ||||
|  | ||||
| @table @option | ||||
| @item datapath | ||||
| Set the path to the Tesseract data. By default, the path that was | ||||
| set when Tesseract was installed is used. | ||||
|  | ||||
| @item language | ||||
| Set language, default is "eng". | ||||
|  | ||||
| @item whitelist | ||||
| Set character whitelist. | ||||
|  | ||||
| @item blacklist | ||||
| Set character blacklist. | ||||
| @end table | ||||
|  | ||||
| The filter exports recognized text as the frame metadata @code{lavfi.ocr.text}. | ||||
|  | ||||
| @section ocv | ||||
|  | ||||
| Apply a video transform using libopencv. | ||||
|   | ||||
| @@ -169,6 +169,7 @@ OBJS-$(CONFIG_NEGATE_FILTER)                 += vf_lut.o | ||||
| OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o | ||||
| OBJS-$(CONFIG_NOISE_FILTER)                  += vf_noise.o | ||||
| OBJS-$(CONFIG_NULL_FILTER)                   += vf_null.o | ||||
| OBJS-$(CONFIG_OCR_FILTER)                    += vf_ocr.o | ||||
| OBJS-$(CONFIG_OCV_FILTER)                    += vf_libopencv.o | ||||
| OBJS-$(CONFIG_OPENCL)                        += deshake_opencl.o unsharp_opencl.o | ||||
| OBJS-$(CONFIG_OVERLAY_FILTER)                += vf_overlay.o dualinput.o framesync.o | ||||
|   | ||||
| @@ -191,6 +191,7 @@ void avfilter_register_all(void) | ||||
|     REGISTER_FILTER(NOFORMAT,       noformat,       vf); | ||||
|     REGISTER_FILTER(NOISE,          noise,          vf); | ||||
|     REGISTER_FILTER(NULL,           null,           vf); | ||||
|     REGISTER_FILTER(OCR,            ocr,            vf); | ||||
|     REGISTER_FILTER(OCV,            ocv,            vf); | ||||
|     REGISTER_FILTER(OVERLAY,        overlay,        vf); | ||||
|     REGISTER_FILTER(OWDENOISE,      owdenoise,      vf); | ||||
|   | ||||
| @@ -30,7 +30,7 @@ | ||||
| #include "libavutil/version.h" | ||||
|  | ||||
| #define LIBAVFILTER_VERSION_MAJOR   6 | ||||
| #define LIBAVFILTER_VERSION_MINOR   1 | ||||
| #define LIBAVFILTER_VERSION_MINOR   2 | ||||
| #define LIBAVFILTER_VERSION_MICRO 100 | ||||
|  | ||||
| #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \ | ||||
|   | ||||
							
								
								
									
										151
									
								
								libavfilter/vf_ocr.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										151
									
								
								libavfilter/vf_ocr.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,151 @@ | ||||
| /* | ||||
|  * Copyright (c) 2015 Paul B Mahol | ||||
|  * | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| #include <tesseract/capi.h> | ||||
|  | ||||
| #include "libavutil/opt.h" | ||||
| #include "avfilter.h" | ||||
| #include "formats.h" | ||||
| #include "internal.h" | ||||
| #include "video.h" | ||||
|  | ||||
| typedef struct OCRContext { | ||||
|     const AVClass *class; | ||||
|  | ||||
|     char *datapath; | ||||
|     char *language; | ||||
|     char *whitelist; | ||||
|     char *blacklist; | ||||
|  | ||||
|     TessBaseAPI *tess; | ||||
| } OCRContext; | ||||
|  | ||||
| #define OFFSET(x) offsetof(OCRContext, x) | ||||
| #define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM | ||||
|  | ||||
| static const AVOption ocr_options[] = { | ||||
|     { "datapath",  "set datapath",            OFFSET(datapath),  AV_OPT_TYPE_STRING, {.str=NULL},  0, 0, FLAGS }, | ||||
|     { "language",  "set language",            OFFSET(language),  AV_OPT_TYPE_STRING, {.str="eng"}, 0, 0, FLAGS }, | ||||
|     { "whitelist", "set character whitelist", OFFSET(whitelist), AV_OPT_TYPE_STRING, {.str="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.:;,-+_!?\"'[]{}()<>|/\\=*&%$#@!~"}, 0, 0, FLAGS }, | ||||
|     { "blacklist", "set character blacklist", OFFSET(blacklist), AV_OPT_TYPE_STRING, {.str=""},    0, 0, FLAGS }, | ||||
|     { NULL } | ||||
| }; | ||||
|  | ||||
| static av_cold int init(AVFilterContext *ctx) | ||||
| { | ||||
|     OCRContext *s = ctx->priv; | ||||
|  | ||||
|     s->tess = TessBaseAPICreate(); | ||||
|     if (TessBaseAPIInit3(s->tess, s->datapath, s->language) == -1) { | ||||
|         av_log(ctx, AV_LOG_ERROR, "failed to init tesseract\n"); | ||||
|         return AVERROR(EINVAL); | ||||
|     } | ||||
|  | ||||
|     if (!TessBaseAPISetVariable(s->tess, "tessedit_char_whitelist", s->whitelist)) { | ||||
|         av_log(ctx, AV_LOG_ERROR, "failed to set whitelist\n"); | ||||
|         return AVERROR(EINVAL); | ||||
|     } | ||||
|  | ||||
|     if (!TessBaseAPISetVariable(s->tess, "tessedit_char_blacklist", s->blacklist)) { | ||||
|         av_log(ctx, AV_LOG_ERROR, "failed to set blacklist\n"); | ||||
|         return AVERROR(EINVAL); | ||||
|     } | ||||
|  | ||||
|     av_log(ctx, AV_LOG_DEBUG, "Tesseract version: %s\n", TessVersion()); | ||||
|  | ||||
|     return 0; | ||||
| } | ||||
|  | ||||
| static int query_formats(AVFilterContext *ctx) | ||||
| { | ||||
|     static const enum AVPixelFormat pix_fmts[] = { | ||||
|         AV_PIX_FMT_GRAY8, | ||||
|         AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P, | ||||
|         AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, | ||||
|         AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P, | ||||
|         AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, | ||||
|         AV_PIX_FMT_YUVJ440P, AV_PIX_FMT_YUVJ444P, | ||||
|         AV_PIX_FMT_YUVJ411P, | ||||
|         AV_PIX_FMT_YUVA444P, AV_PIX_FMT_YUVA422P, AV_PIX_FMT_YUVA420P, | ||||
|         AV_PIX_FMT_NONE | ||||
|     }; | ||||
|  | ||||
|     AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts); | ||||
|     if (!fmts_list) | ||||
|         return AVERROR(ENOMEM); | ||||
|     ff_set_common_formats(ctx, fmts_list); | ||||
|  | ||||
|     return 0; | ||||
| } | ||||
|  | ||||
| static int filter_frame(AVFilterLink *inlink, AVFrame *in) | ||||
| { | ||||
|     AVDictionary **metadata = avpriv_frame_get_metadatap(in); | ||||
|     AVFilterContext *ctx = inlink->dst; | ||||
|     AVFilterLink *outlink = ctx->outputs[0]; | ||||
|     OCRContext *s = ctx->priv; | ||||
|     char *result; | ||||
|  | ||||
|     result = TessBaseAPIRect(s->tess, in->data[0], 1, | ||||
|                              in->linesize[0], 0, 0, in->width, in->height); | ||||
|     av_dict_set(metadata, "lavfi.ocr.text", result, 0); | ||||
|     TessDeleteText(result); | ||||
|  | ||||
|     return ff_filter_frame(outlink, in); | ||||
| } | ||||
|  | ||||
| static av_cold void uninit(AVFilterContext *ctx) | ||||
| { | ||||
|     OCRContext *s = ctx->priv; | ||||
|  | ||||
|     TessBaseAPIEnd(s->tess); | ||||
|     TessBaseAPIDelete(s->tess); | ||||
| } | ||||
|  | ||||
| AVFILTER_DEFINE_CLASS(ocr); | ||||
|  | ||||
| static const AVFilterPad ocr_inputs[] = { | ||||
|     { | ||||
|         .name         = "default", | ||||
|         .type         = AVMEDIA_TYPE_VIDEO, | ||||
|         .filter_frame = filter_frame, | ||||
|     }, | ||||
|     { NULL } | ||||
| }; | ||||
|  | ||||
| static const AVFilterPad ocr_outputs[] = { | ||||
|     { | ||||
|         .name         = "default", | ||||
|         .type         = AVMEDIA_TYPE_VIDEO, | ||||
|     }, | ||||
|     { NULL } | ||||
| }; | ||||
|  | ||||
| AVFilter ff_vf_ocr = { | ||||
|     .name          = "ocr", | ||||
|     .description   = NULL_IF_CONFIG_SMALL("Optical Character Recognition."), | ||||
|     .priv_size     = sizeof(OCRContext), | ||||
|     .priv_class    = &ocr_class, | ||||
|     .query_formats = query_formats, | ||||
|     .init          = init, | ||||
|     .uninit        = uninit, | ||||
|     .inputs        = ocr_inputs, | ||||
|     .outputs       = ocr_outputs, | ||||
| }; | ||||
		Reference in New Issue
	
	Block a user