Message ID | MN2PR04MB59816F420228FD9D9A6657A4BAA19@MN2PR04MB5981.namprd04.prod.outlook.com |
---|---|
State | Superseded, archived |
Headers | show |
Series | [FFmpeg-devel,v8,01/13] global: Prepare AVFrame for subtitle handling | expand |
Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
andriy/make_ppc | success | Make finished |
andriy/make_fate_ppc | success | Make fate finished |
Soft Works: > Signed-off-by: softworkz <softworkz@hotmail.com> > --- > configure | 1 + > doc/filters.texi | 55 ++++++ > libavfilter/Makefile | 1 + > libavfilter/allfilters.c | 1 + > libavfilter/sf_graphicsub2text.c | 326 +++++++++++++++++++++++++++++++ > 5 files changed, 384 insertions(+) > create mode 100644 libavfilter/sf_graphicsub2text.c > > diff --git a/configure b/configure > index 37fc4c20e7..2682e51435 100755 > --- a/configure > +++ b/configure > @@ -3601,6 +3601,7 @@ frei0r_deps_any="libdl LoadLibrary" > frei0r_filter_deps="frei0r" > frei0r_src_filter_deps="frei0r" > fspp_filter_deps="gpl" > +graphicsub2text_filter_deps="libtesseract" > histeq_filter_deps="gpl" > hqdn3d_filter_deps="gpl" > interlace_filter_deps="gpl" > diff --git a/doc/filters.texi b/doc/filters.texi > index da463e2cc1..2b6dfbe1d4 100644 > --- a/doc/filters.texi > +++ b/doc/filters.texi > @@ -25248,6 +25248,61 @@ ffmpeg -i "https://streams.videolan.org/ffmpeg/mkv_subtitles.mkv" -filter_comple > @end example > @end itemize > > +@section graphicsub2text > + > +Converts graphic subtitles to text subtitles by performing OCR. > + > +For this filter to be available, ffmpeg needs to be compiled with libtesseract (see https://github.com/tesseract-ocr/tesseract). > +Language models need to be downloaded from https://github.com/tesseract-ocr/tessdata and put into as subfolder named 'tessdata' or into a folder specified via the environment variable 'TESSDATA_PREFIX'. > +The path can also be specified via filter option (see below). > + > +Note: These models are including the data for both OCR modes. > + > +Inputs: > +- 0: Subtitles [bitmap] > + > +Outputs: > +- 0: Subtitles [text] > + > +It accepts the following parameters: > + > +@table @option > +@item ocr_mode > +The character recognition mode to use. > + > +Supported OCR modes are: > + > +@table @var > +@item 0, tesseract > +This is the classic libtesseract operation mode. It is fast but less accurate than LSTM. 
> +@item 1, lstm > +Newer OCR implementation based on ML models. Usually provides better results, but requires more processing resources. > +@item 2, both > +Use a combination of both modes. > +@end table > + > +@item tessdata_path > +The path to a folder containing the language models to be used. > + > +@item language > +The recognition language. It needs to match the first three characters of a language model file in the tessdata path. > + > +@end table > + > + > +@subsection Examples > + > +@itemize > +@item > +Convert DVB graphic subtitles to ASS (text) subtitles > + > +Note: For this to work, you need to have the data file 'eng.traineddata' in a 'tessdata' subfolder (see above). > +@example > +ffmpeg -loglevel verbose -i "https://streams.videolan.org/streams/ts/video_subs_ttxt%2Bdvbsub.ts" -filter_complex "[0:13]graphicsub2text=ocr_mode=both" -c:s ass -y output.mkv > +@end example > +@end itemize > + > + > @section graphicsub2video > > Renders graphic subtitles as video frames. > diff --git a/libavfilter/Makefile b/libavfilter/Makefile > index 39abf6d2a6..312b67982c 100644 > --- a/libavfilter/Makefile > +++ b/libavfilter/Makefile > @@ -290,6 +290,7 @@ OBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o qp_table.o > OBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o > OBJS-$(CONFIG_GEQ_FILTER) += vf_geq.o > OBJS-$(CONFIG_GRADFUN_FILTER) += vf_gradfun.o > +OBJS-$(CONFIG_GRAPHICSUB2TEXT_FILTER) += sf_graphicsub2text.o > OBJS-$(CONFIG_GRAPHICSUB2VIDEO_FILTER) += vf_overlay_graphicsubs.o framesync.o > OBJS-$(CONFIG_GRAPHMONITOR_FILTER) += f_graphmonitor.o > OBJS-$(CONFIG_GRAYWORLD_FILTER) += vf_grayworld.o > diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c > index 77c6379302..ee5638dc3d 100644 > --- a/libavfilter/allfilters.c > +++ b/libavfilter/allfilters.c > @@ -527,6 +527,7 @@ extern const AVFilter ff_avf_showwaves; > extern const AVFilter ff_avf_showwavespic; > extern const AVFilter ff_vaf_spectrumsynth; > extern const AVFilter ff_sf_censor; > +extern const 
AVFilter ff_sf_graphicsub2text; > extern const AVFilter ff_sf_show_speaker; > extern const AVFilter ff_sf_split_cc; > extern const AVFilter ff_sf_stripstyles; > diff --git a/libavfilter/sf_graphicsub2text.c b/libavfilter/sf_graphicsub2text.c > new file mode 100644 > index 0000000000..157b76408e > --- /dev/null > +++ b/libavfilter/sf_graphicsub2text.c > @@ -0,0 +1,326 @@ > +/* > + * Copyright (c) 2021 softworkz > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > +/** > + * @file > + * subtitle filter to convert graphical subs to text subs via OCR > + */ > + > +#include <tesseract/capi.h> > +#include <libavcodec/ass.h> > + > +#include "libavutil/avassert.h" > +#include "libavutil/opt.h" > +#include "avfilter.h" > +#include "internal.h" > +#include "subtitles.h" > +#include "libavcodec/avcodec.h" > +#include "libavutil/file.h" > + > +typedef struct SubOcrContext { > + const AVClass *class; > + int w, h; > + > + TessBaseAPI *tapi; > + TessOcrEngineMode ocr_mode; > + char *tessdata_path; > + char *language; > + > + int readorder_counter; > + > + AVFrame *pending_frame; > +} SubOcrContext; > + > + > +static int init(AVFilterContext *ctx) > +{ > + SubOcrContext *s = ctx->priv; > + const char* tver = TessVersion(); > + 
int ret; > + > + s->tapi = TessBaseAPICreate(); > + > + if (!s->tapi || !tver || !strlen(tver)) { > + av_log(ctx, AV_LOG_ERROR, "Failed to access libtesseract\n"); > + return AVERROR(ENOSYS); > + } > + > + av_log(ctx, AV_LOG_VERBOSE, "Initializing libtesseract, version: %s\n", tver); > + > + ret = TessBaseAPIInit4(s->tapi, s->tessdata_path, s->language, s->ocr_mode, NULL, 0, NULL, NULL, 0, 1); > + if (ret < 0 ) { > + av_log(ctx, AV_LOG_ERROR, "Failed to initialize libtesseract. Error: %d\n", ret); > + return AVERROR(ENOSYS); > + } > + > + ret = TessBaseAPISetVariable(s->tapi, "tessedit_char_blacklist", "|"); > + if (ret < 0 ) { > + av_log(ctx, AV_LOG_ERROR, "Failed to set 'tessedit_char_blacklist'. Error: %d\n", ret); > + return AVERROR(EINVAL); > + } > + > + return 0; > +} > + > +static void uninit(AVFilterContext *ctx) > +{ > + SubOcrContext *s = ctx->priv; > + > + TessBaseAPIEnd(s->tapi); > + TessBaseAPIDelete(s->tapi); Beware: uninit is also called on init failure, so it might be that s->tapi is NULL or that TessBaseAPIInit4 did not succeed. 
> +} > + > +static int query_formats(AVFilterContext *ctx) > +{ > + AVFilterFormats *formats, *formats2; > + AVFilterLink *inlink = ctx->inputs[0]; > + AVFilterLink *outlink = ctx->outputs[0]; > + static const enum AVSubtitleType in_fmts[] = { AV_SUBTITLE_FMT_BITMAP, AV_SUBTITLE_FMT_NONE }; > + static const enum AVSubtitleType out_fmts[] = { AV_SUBTITLE_FMT_ASS, AV_SUBTITLE_FMT_NONE }; > + int ret; > + > + /* set input format */ > + formats = ff_make_format_list(in_fmts); > + if ((ret = ff_formats_ref(formats, &inlink->outcfg.formats)) < 0) > + return ret; > + > + /* set output format */ > + formats2 = ff_make_format_list(out_fmts); > + if ((ret = ff_formats_ref(formats2, &outlink->incfg.formats)) < 0) > + return ret; > + > + return 0; > +} > + > +static int config_input(AVFilterLink *inlink) > +{ > + AVFilterContext *ctx = inlink->dst; > + SubOcrContext *s = ctx->priv; > + > + if (s->w <= 0 || s->h <= 0) { > + s->w = inlink->w; > + s->h = inlink->h; > + } > + return 0; > +} > + > +static int config_output(AVFilterLink *outlink) > +{ > + const AVFilterContext *ctx = outlink->src; > + SubOcrContext *s = ctx->priv; > + > + outlink->format = AV_SUBTITLE_FMT_ASS; > + outlink->w = s->w; > + outlink->h = s->h; > + > + return 0; > +} > + > +static uint8_t* create_grayscale_image(AVFilterContext *ctx, AVSubtitleArea *area) > +{ > + uint8_t gray_pal[256]; > + const size_t img_size = area->buf[0]->size; > + const uint8_t* img = area->buf[0]->data; > + uint8_t* gs_img = av_malloc(img_size); > + > + if (!gs_img) > + return NULL; > + > + for (unsigned i = 0; i < 256; i++) { > + const uint8_t *col = (uint8_t*)&area->pal[i]; > + const int val = (int)col[3] * FFMAX3(col[0], col[1], col[2]); > + gray_pal[i] = (uint8_t)(val >> 8); > + } > + > + for (unsigned i = 0; i < img_size; i++) > + gs_img[i] = 255 - gray_pal[img[i]]; > + > + return gs_img; > +} > + > +static int convert_area(AVFilterContext *ctx, AVSubtitleArea *area) > +{ > + SubOcrContext *s = ctx->priv; > + char *ocr_text = 
NULL; > + int ret; > + uint8_t *gs_img = create_grayscale_image(ctx, area); > + > + if (!gs_img) > + return AVERROR(ENOMEM); > + > + TessBaseAPISetImage(s->tapi, gs_img, area->w, area->h, 1, area->linesize[0]); > + TessBaseAPISetSourceResolution(s->tapi, 70); > + > + ret = TessBaseAPIRecognize(s->tapi, NULL); > + if (ret == 0) > + ocr_text = TessBaseAPIGetUTF8Text(s->tapi); > + > + if (!ocr_text) { > + av_log(ctx, AV_LOG_WARNING, "OCR didn't return a text. ret=%d\n", ret); > + area->ass = av_strdup(""); > + } > + else { > + size_t len = strlen(ocr_text); > + > + if (len > 0 && ocr_text[len - 1] == '\n') > + ocr_text[len - 1] = 0; > + > + av_log(ctx, AV_LOG_VERBOSE, "OCR Result: %s\n", ocr_text); > + > + area->ass = av_strdup(ocr_text); > + > + TessDeleteText(ocr_text); > + } > + > + av_freep(&gs_img); > + av_buffer_unref(&area->buf[0]); > + area->type = AV_SUBTITLE_FMT_ASS; > + > + return 0; > +} > + > +static int filter_frame(AVFilterLink *inlink, AVFrame *frame) > +{ > + AVFilterContext *ctx = inlink->dst; > + SubOcrContext *s = ctx->priv; > + AVFilterLink *outlink = inlink->dst->outputs[0]; > + int ret, frame_sent = 0; > + > + if (s->pending_frame) { > + const uint64_t pts_diff = frame->subtitle_pts - s->pending_frame->subtitle_pts; > + s->pending_frame->subtitle_end_time = (uint32_t)(pts_diff / 1000); > + > + ret = ff_filter_frame(outlink, s->pending_frame); > + s->pending_frame = NULL; > + if (ret < 0) > + return ret; > + > + frame_sent = 1; > + > + if (frame->num_subtitle_areas == 0) { > + // No need to forward this empty frame > + av_frame_unref(frame); Leak. > + return 0; > + } > + } > + > + av_frame_make_writable(frame); > + > + if (!frame) > + return AVERROR(ENOMEM); Wrong check; and leak. 
> + > + frame->format = AV_SUBTITLE_FMT_ASS; > + > + av_log(ctx, AV_LOG_DEBUG, "filter_frame sub_pts: %"PRIu64", start_time: %d, end_time: %d, num_areas: %d\n", > + frame->subtitle_pts, frame->subtitle_start_time, frame->subtitle_end_time, frame->num_subtitle_areas); > + > + if (frame->num_subtitle_areas > 1 && > + frame->subtitle_areas[0]->y > frame->subtitle_areas[frame->num_subtitle_areas - 1]->y) { > + > + for (unsigned i = 0; i < frame->num_subtitle_areas / 2; i++) > + FFSWAP(AVSubtitleArea*, frame->subtitle_areas[i], frame->subtitle_areas[frame->num_subtitle_areas - i - 1]); > + } > + > + for (unsigned i = 0; i < frame->num_subtitle_areas; i++) { > + char *tmp; > + AVSubtitleArea *area = frame->subtitle_areas[i]; > + > + ret = convert_area(ctx, area); > + if (ret < 0) > + return ret; > + > + if (strlen(area->ass)) { > + tmp = area->ass; > + > + if (i == 0) > + area->ass = ff_ass_get_dialog(s->readorder_counter++, 0, "Default", NULL, tmp); > + else > + area->ass = av_asprintf("\\N%s", tmp); > + > + av_free(tmp); > + } > + } > + > + if (frame->num_subtitle_areas > 0 && frame->subtitle_end_time >= 30000) { Where does this number come from? 
> + // Can't send it without end time, wait for the next frame to determine the end_display time > + s->pending_frame = frame; > + > + if (frame_sent) > + return 0; > + > + // To keep all going, send an empty frame instead > + frame = ff_get_subtitles_buffer(outlink, AV_SUBTITLE_FMT_ASS); > + if (!frame) > + return AVERROR(ENOMEM); > + > + av_frame_copy_props(frame, s->pending_frame); > + frame->subtitle_end_time = 1; > + } > + > + return ff_filter_frame(outlink, frame); > +} > + > +#define OFFSET(x) offsetof(SubOcrContext, x) > +#define FLAGS (AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_FILTERING_PARAM) > + > +static const AVOption graphicsub2text_options[] = { > + { "ocr_mode", "set ocr mode", OFFSET(ocr_mode), AV_OPT_TYPE_INT, {.i64=OEM_TESSERACT_ONLY}, OEM_TESSERACT_ONLY, 2, FLAGS, "ocr_mode" }, > + { "tesseract", "classic tesseract ocr", 0, AV_OPT_TYPE_CONST, {.i64=OEM_TESSERACT_ONLY}, 0, 0, FLAGS, "ocr_mode" }, > + { "lstm", "lstm (ML based)", 0, AV_OPT_TYPE_CONST, {.i64=OEM_LSTM_ONLY}, 0, 0, FLAGS, "ocr_mode" }, > + { "both", "use both models combined", 0, AV_OPT_TYPE_CONST, {.i64=OEM_TESSERACT_LSTM_COMBINED}, 0, 0, FLAGS, "ocr_mode" }, > + { "tessdata_path", "path to tesseract data", OFFSET(tessdata_path), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS, NULL }, > + { "language", "ocr language", OFFSET(language), AV_OPT_TYPE_STRING, {.str = "eng"}, 0, 0, FLAGS, NULL }, > + { NULL }, > +}; > + > +AVFILTER_DEFINE_CLASS(graphicsub2text); > + > +static const AVFilterPad inputs[] = { > + { > + .name = "default", > + .type = AVMEDIA_TYPE_SUBTITLE, > + .filter_frame = filter_frame, > + .config_props = config_input, > + }, > +}; > + > +static const AVFilterPad outputs[] = { > + { > + .name = "default", > + .type = AVMEDIA_TYPE_SUBTITLE, > + .config_props = config_output, > + }, > +}; > + > +/* > + * Example: > + * ffmpeg -loglevel verbose -i "https://streams.videolan.org/streams/ts/video_subs_ttxt%2Bdvbsub.ts" -filter_complex "[0:13]graphicsub2text=ocr_mode=both" -c:s 
ass -y output.mkv > + */ > +const AVFilter ff_sf_graphicsub2text = { > + .name = "graphicsub2text", > + .description = NULL_IF_CONFIG_SMALL("Convert graphical subtitles to text subtitles via OCR"), > + .init = init, > + .uninit = uninit, > + .query_formats = query_formats, > + .priv_size = sizeof(SubOcrContext), > + .priv_class = &graphicsub2text_class, > + FILTER_INPUTS(inputs), > + FILTER_OUTPUTS(outputs), > +}; >
> -----Original Message----- > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Andreas > Rheinhardt > Sent: Wednesday, 22 September 2021 04:05 > To: ffmpeg-devel@ffmpeg.org > Subject: Re: [FFmpeg-devel] [PATCH v8 13/13] avfilter/graphicsub2text: Add > new graphicsub2text filter (OCR) > > > + > > +static void uninit(AVFilterContext *ctx) > > +{ > > + SubOcrContext *s = ctx->priv; > > + > > + TessBaseAPIEnd(s->tapi); > > + TessBaseAPIDelete(s->tapi); > > Beware: uninit is also called on init failure, so it might be that > s->tapi is NULL or that TessBaseAPIInit4 did not succeed. vf_ocr does it in the same way, so I assumed it to be safe. Unfortunately there's no proper API documentation for tesseract. [..] Will apply the stripped notes - thank you! > > + if (frame->num_subtitle_areas > 0 && frame->subtitle_end_time >= > 30000) { > > Where does this number come from? dvbsubdec uses this as value when it can't determine the end display time. Others are setting it to UINT64_MAX. This is the indication to know that the subtitles do not have a display duration and we need to wait for the next one instead. softworkz
Soft Works: > > >> -----Original Message----- >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Andreas >> Rheinhardt >> Sent: Wednesday, 22 September 2021 04:05 >> To: ffmpeg-devel@ffmpeg.org >> Subject: Re: [FFmpeg-devel] [PATCH v8 13/13] avfilter/graphicsub2text: Add >> new graphicsub2text filter (OCR) >> >>> + >>> +static void uninit(AVFilterContext *ctx) >>> +{ >>> + SubOcrContext *s = ctx->priv; >>> + >>> + TessBaseAPIEnd(s->tapi); >>> + TessBaseAPIDelete(s->tapi); >> >> Beware: uninit is also called on init failure, so it might be that >> s->tapi is NULL or that TessBaseAPIInit4 did not succeed. > > vf_ocr does it in the same way, so I assumed it to be safe. > I wish it were that easy. Our own APIs are (usually) safe in the sense that an object that has been memset to zero (as all those contexts are initially) can be passed to the free function even when it has never been initialized. But this need not be true for external APIs. And unfortunately error paths are often untested. For a bad example, just take a look at vf_libvmaf.c: Its uninit function locks a mutex, regardless of whether said mutex has been initialized at all. That's UB. > Unfortunately there's no proper API documentation for tesseract. >
> -----Original Message----- > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Andreas > Rheinhardt > Sent: Wednesday, 22 September 2021 04:26 > To: ffmpeg-devel@ffmpeg.org > Subject: Re: [FFmpeg-devel] [PATCH v8 13/13] avfilter/graphicsub2text: Add > new graphicsub2text filter (OCR) > > Soft Works: > > > > > >> -----Original Message----- > >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Andreas > >> Rheinhardt > >> Sent: Wednesday, 22 September 2021 04:05 > >> To: ffmpeg-devel@ffmpeg.org > >> Subject: Re: [FFmpeg-devel] [PATCH v8 13/13] avfilter/graphicsub2text: Add > >> new graphicsub2text filter (OCR) > >> > >>> + > >>> +static void uninit(AVFilterContext *ctx) > >>> +{ > >>> + SubOcrContext *s = ctx->priv; > >>> + > >>> + TessBaseAPIEnd(s->tapi); > >>> + TessBaseAPIDelete(s->tapi); > >> > >> Beware: uninit is also called on init failure, so it might be that > >> s->tapi is NULL or that TessBaseAPIInit4 did not succeed. > > > > vf_ocr does it in the same way, so I assumed it to be safe. > > > > I wish it were that easy. Our own APIs are (usually) safe in the sense > that an object that has been memset to zero (as all those contexts are > initially) can be passed to the free function even when it has never > been initialized. But this is need not be true for external APIs. And > unfortunately error paths are often untested. That's true. Though, I looked at more than a handful of other code using tesseract and I've never seen any additional checks in the uninit code. But now that you said it - I'll try it out to be sure :-) Thanks, softworkz
diff --git a/configure b/configure index 37fc4c20e7..2682e51435 100755 --- a/configure +++ b/configure @@ -3601,6 +3601,7 @@ frei0r_deps_any="libdl LoadLibrary" frei0r_filter_deps="frei0r" frei0r_src_filter_deps="frei0r" fspp_filter_deps="gpl" +graphicsub2text_filter_deps="libtesseract" histeq_filter_deps="gpl" hqdn3d_filter_deps="gpl" interlace_filter_deps="gpl" diff --git a/doc/filters.texi b/doc/filters.texi index da463e2cc1..2b6dfbe1d4 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -25248,6 +25248,61 @@ ffmpeg -i "https://streams.videolan.org/ffmpeg/mkv_subtitles.mkv" -filter_comple @end example @end itemize +@section graphicsub2text + +Converts graphic subtitles to text subtitles by performing OCR. + +For this filter to be available, ffmpeg needs to be compiled with libtesseract (see https://github.com/tesseract-ocr/tesseract). +Language models need to be downloaded from https://github.com/tesseract-ocr/tessdata and put into a subfolder named 'tessdata' or into a folder specified via the environment variable 'TESSDATA_PREFIX'. +The path can also be specified via filter option (see below). + +Note: These models include the data for both OCR modes. + +Inputs: +- 0: Subtitles [bitmap] + +Outputs: +- 0: Subtitles [text] + +It accepts the following parameters: + +@table @option +@item ocr_mode +The character recognition mode to use. + +Supported OCR modes are: + +@table @var +@item 0, tesseract +This is the classic libtesseract operation mode. It is fast but less accurate than LSTM. +@item 1, lstm +Newer OCR implementation based on ML models. Usually provides better results, but requires more processing resources. +@item 2, both +Use a combination of both modes. +@end table + +@item tessdata_path +The path to a folder containing the language models to be used. + +@item language +The recognition language. It needs to match the first three characters of a language model file in the tessdata path. 
+ +@end table + + +@subsection Examples + +@itemize +@item +Convert DVB graphic subtitles to ASS (text) subtitles + +Note: For this to work, you need to have the data file 'eng.traineddata' in a 'tessdata' subfolder (see above). +@example +ffmpeg ffmpeg -loglevel verbose -i "https://streams.videolan.org/streams/ts/video_subs_ttxt%2Bdvbsub.ts" -filter_complex "[0:13]graphicsub2text=ocr_mode=both" -c:s ass -y output.mkv +@end example +@end itemize + + @section graphicsub2video Renders graphic subtitles as video frames. diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 39abf6d2a6..312b67982c 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -290,6 +290,7 @@ OBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o qp_table.o OBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o OBJS-$(CONFIG_GEQ_FILTER) += vf_geq.o OBJS-$(CONFIG_GRADFUN_FILTER) += vf_gradfun.o +OBJS-$(CONFIG_GRAPHICSUB2TEXT_FILTER) += sf_graphicsub2text.o OBJS-$(CONFIG_GRAPHICSUB2VIDEO_FILTER) += vf_overlay_graphicsubs.o framesync.o OBJS-$(CONFIG_GRAPHMONITOR_FILTER) += f_graphmonitor.o OBJS-$(CONFIG_GRAYWORLD_FILTER) += vf_grayworld.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 77c6379302..ee5638dc3d 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -527,6 +527,7 @@ extern const AVFilter ff_avf_showwaves; extern const AVFilter ff_avf_showwavespic; extern const AVFilter ff_vaf_spectrumsynth; extern const AVFilter ff_sf_censor; +extern const AVFilter ff_sf_graphicsub2text; extern const AVFilter ff_sf_show_speaker; extern const AVFilter ff_sf_split_cc; extern const AVFilter ff_sf_stripstyles; diff --git a/libavfilter/sf_graphicsub2text.c b/libavfilter/sf_graphicsub2text.c new file mode 100644 index 0000000000..157b76408e --- /dev/null +++ b/libavfilter/sf_graphicsub2text.c @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2021 softworkz + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * subtitle filter to convert graphical subs to text subs via OCR + */ + +#include <tesseract/capi.h> +#include <libavcodec/ass.h> + +#include "libavutil/avassert.h" +#include "libavutil/opt.h" +#include "avfilter.h" +#include "internal.h" +#include "subtitles.h" +#include "libavcodec/avcodec.h" +#include "libavutil/file.h" + +typedef struct SubOcrContext { + const AVClass *class; + int w, h; + + TessBaseAPI *tapi; + TessOcrEngineMode ocr_mode; + char *tessdata_path; + char *language; + + int readorder_counter; + + AVFrame *pending_frame; +} SubOcrContext; + + +static int init(AVFilterContext *ctx) +{ + SubOcrContext *s = ctx->priv; + const char* tver = TessVersion(); + int ret; + + s->tapi = TessBaseAPICreate(); + + if (!s->tapi || !tver || !strlen(tver)) { + av_log(ctx, AV_LOG_ERROR, "Failed to access libtesseract\n"); + return AVERROR(ENOSYS); + } + + av_log(ctx, AV_LOG_VERBOSE, "Initializing libtesseract, version: %s\n", tver); + + ret = TessBaseAPIInit4(s->tapi, s->tessdata_path, s->language, s->ocr_mode, NULL, 0, NULL, NULL, 0, 1); + if (ret < 0 ) { + av_log(ctx, AV_LOG_ERROR, "Failed to initialize libtesseract. 
Error: %d\n", ret); + return AVERROR(ENOSYS); + } + + ret = TessBaseAPISetVariable(s->tapi, "tessedit_char_blacklist", "|"); + if (ret < 0 ) { + av_log(ctx, AV_LOG_ERROR, "Failed to set 'tessedit_char_blacklist'. Error: %d\n", ret); + return AVERROR(EINVAL); + } + + return 0; +} + +static void uninit(AVFilterContext *ctx) +{ + SubOcrContext *s = ctx->priv; + + TessBaseAPIEnd(s->tapi); + TessBaseAPIDelete(s->tapi); +} + +static int query_formats(AVFilterContext *ctx) +{ + AVFilterFormats *formats, *formats2; + AVFilterLink *inlink = ctx->inputs[0]; + AVFilterLink *outlink = ctx->outputs[0]; + static const enum AVSubtitleType in_fmts[] = { AV_SUBTITLE_FMT_BITMAP, AV_SUBTITLE_FMT_NONE }; + static const enum AVSubtitleType out_fmts[] = { AV_SUBTITLE_FMT_ASS, AV_SUBTITLE_FMT_NONE }; + int ret; + + /* set input format */ + formats = ff_make_format_list(in_fmts); + if ((ret = ff_formats_ref(formats, &inlink->outcfg.formats)) < 0) + return ret; + + /* set output format */ + formats2 = ff_make_format_list(out_fmts); + if ((ret = ff_formats_ref(formats2, &outlink->incfg.formats)) < 0) + return ret; + + return 0; +} + +static int config_input(AVFilterLink *inlink) +{ + AVFilterContext *ctx = inlink->dst; + SubOcrContext *s = ctx->priv; + + if (s->w <= 0 || s->h <= 0) { + s->w = inlink->w; + s->h = inlink->h; + } + return 0; +} + +static int config_output(AVFilterLink *outlink) +{ + const AVFilterContext *ctx = outlink->src; + SubOcrContext *s = ctx->priv; + + outlink->format = AV_SUBTITLE_FMT_ASS; + outlink->w = s->w; + outlink->h = s->h; + + return 0; +} + +static uint8_t* create_grayscale_image(AVFilterContext *ctx, AVSubtitleArea *area) +{ + uint8_t gray_pal[256]; + const size_t img_size = area->buf[0]->size; + const uint8_t* img = area->buf[0]->data; + uint8_t* gs_img = av_malloc(img_size); + + if (!gs_img) + return NULL; + + for (unsigned i = 0; i < 256; i++) { + const uint8_t *col = (uint8_t*)&area->pal[i]; + const int val = (int)col[3] * FFMAX3(col[0], col[1], col[2]); 
+ gray_pal[i] = (uint8_t)(val >> 8); + } + + for (unsigned i = 0; i < img_size; i++) + gs_img[i] = 255 - gray_pal[img[i]]; + + return gs_img; +} + +static int convert_area(AVFilterContext *ctx, AVSubtitleArea *area) +{ + SubOcrContext *s = ctx->priv; + char *ocr_text = NULL; + int ret; + uint8_t *gs_img = create_grayscale_image(ctx, area); + + if (!gs_img) + return AVERROR(ENOMEM); + + TessBaseAPISetImage(s->tapi, gs_img, area->w, area->h, 1, area->linesize[0]); + TessBaseAPISetSourceResolution(s->tapi, 70); + + ret = TessBaseAPIRecognize(s->tapi, NULL); + if (ret == 0) + ocr_text = TessBaseAPIGetUTF8Text(s->tapi); + + if (!ocr_text) { + av_log(ctx, AV_LOG_WARNING, "OCR didn't return a text. ret=%d\n", ret); + area->ass = av_strdup(""); + } + else { + size_t len = strlen(ocr_text); + + if (len > 0 && ocr_text[len - 1] == '\n') + ocr_text[len - 1] = 0; + + av_log(ctx, AV_LOG_VERBOSE, "OCR Result: %s\n", ocr_text); + + area->ass = av_strdup(ocr_text); + + TessDeleteText(ocr_text); + } + + av_freep(&gs_img); + av_buffer_unref(&area->buf[0]); + area->type = AV_SUBTITLE_FMT_ASS; + + return 0; +} + +static int filter_frame(AVFilterLink *inlink, AVFrame *frame) +{ + AVFilterContext *ctx = inlink->dst; + SubOcrContext *s = ctx->priv; + AVFilterLink *outlink = inlink->dst->outputs[0]; + int ret, frame_sent = 0; + + if (s->pending_frame) { + const uint64_t pts_diff = frame->subtitle_pts - s->pending_frame->subtitle_pts; + s->pending_frame->subtitle_end_time = (uint32_t)(pts_diff / 1000); + + ret = ff_filter_frame(outlink, s->pending_frame); + s->pending_frame = NULL; + if (ret < 0) + return ret; + + frame_sent = 1; + + if (frame->num_subtitle_areas == 0) { + // No need to forward this empty frame + av_frame_unref(frame); + return 0; + } + } + + av_frame_make_writable(frame); + + if (!frame) + return AVERROR(ENOMEM); + + frame->format = AV_SUBTITLE_FMT_ASS; + + av_log(ctx, AV_LOG_DEBUG, "filter_frame sub_pts: %"PRIu64", start_time: %d, end_time: %d, num_areas: %d\n", + 
frame->subtitle_pts, frame->subtitle_start_time, frame->subtitle_end_time, frame->num_subtitle_areas); + + if (frame->num_subtitle_areas > 1 && + frame->subtitle_areas[0]->y > frame->subtitle_areas[frame->num_subtitle_areas - 1]->y) { + + for (unsigned i = 0; i < frame->num_subtitle_areas / 2; i++) + FFSWAP(AVSubtitleArea*, frame->subtitle_areas[i], frame->subtitle_areas[frame->num_subtitle_areas - i - 1]); + } + + for (unsigned i = 0; i < frame->num_subtitle_areas; i++) { + char *tmp; + AVSubtitleArea *area = frame->subtitle_areas[i]; + + ret = convert_area(ctx, area); + if (ret < 0) + return ret; + + if (strlen(area->ass)) { + tmp = area->ass; + + if (i == 0) + area->ass = ff_ass_get_dialog(s->readorder_counter++, 0, "Default", NULL, tmp); + else + area->ass = av_asprintf("\\N%s", tmp); + + av_free(tmp); + } + } + + if (frame->num_subtitle_areas > 0 && frame->subtitle_end_time >= 30000) { + // Can't send it without end time, wait for the next frame to determine the end_display time + s->pending_frame = frame; + + if (frame_sent) + return 0; + + // To keep all going, send an empty frame instead + frame = ff_get_subtitles_buffer(outlink, AV_SUBTITLE_FMT_ASS); + if (!frame) + return AVERROR(ENOMEM); + + av_frame_copy_props(frame, s->pending_frame); + frame->subtitle_end_time = 1; + } + + return ff_filter_frame(outlink, frame); +} + +#define OFFSET(x) offsetof(SubOcrContext, x) +#define FLAGS (AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_FILTERING_PARAM) + +static const AVOption graphicsub2text_options[] = { + { "ocr_mode", "set ocr mode", OFFSET(ocr_mode), AV_OPT_TYPE_INT, {.i64=OEM_TESSERACT_ONLY}, OEM_TESSERACT_ONLY, 2, FLAGS, "ocr_mode" }, + { "tesseract", "classic tesseract ocr", 0, AV_OPT_TYPE_CONST, {.i64=OEM_TESSERACT_ONLY}, 0, 0, FLAGS, "ocr_mode" }, + { "lstm", "lstm (ML based)", 0, AV_OPT_TYPE_CONST, {.i64=OEM_LSTM_ONLY}, 0, 0, FLAGS, "ocr_mode" }, + { "both", "use both models combined", 0, AV_OPT_TYPE_CONST, {.i64=OEM_TESSERACT_LSTM_COMBINED}, 0, 0, FLAGS, 
"ocr_mode" }, + { "tessdata_path", "path to tesseract data", OFFSET(tessdata_path), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS, NULL }, + { "language", "ocr language", OFFSET(language), AV_OPT_TYPE_STRING, {.str = "eng"}, 0, 0, FLAGS, NULL }, + { NULL }, +}; + +AVFILTER_DEFINE_CLASS(graphicsub2text); + +static const AVFilterPad inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_SUBTITLE, + .filter_frame = filter_frame, + .config_props = config_input, + }, +}; + +static const AVFilterPad outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_SUBTITLE, + .config_props = config_output, + }, +}; + +/* + * Example: + * ffmpeg -loglevel verbose -i "https://streams.videolan.org/streams/ts/video_subs_ttxt%2Bdvbsub.ts" -filter_complex "[0:13]graphicsub2text=ocr_mode=both" -c:s ass -y output.mkv + */ +const AVFilter ff_sf_graphicsub2text = { + .name = "graphicsub2text", + .description = NULL_IF_CONFIG_SMALL("Convert graphical subtitles to text subtitles via OCR"), + .init = init, + .uninit = uninit, + .query_formats = query_formats, + .priv_size = sizeof(SubOcrContext), + .priv_class = &graphicsub2text_class, + FILTER_INPUTS(inputs), + FILTER_OUTPUTS(outputs), +};
Signed-off-by: softworkz <softworkz@hotmail.com> --- configure | 1 + doc/filters.texi | 55 ++++++ libavfilter/Makefile | 1 + libavfilter/allfilters.c | 1 + libavfilter/sf_graphicsub2text.c | 326 +++++++++++++++++++++++++++++++ 5 files changed, 384 insertions(+) create mode 100644 libavfilter/sf_graphicsub2text.c