diff mbox series

[FFmpeg-devel,v8,13/13] avfilter/graphicsub2text: Add new graphicsub2text filter (OCR)

Message ID MN2PR04MB59816F420228FD9D9A6657A4BAA19@MN2PR04MB5981.namprd04.prod.outlook.com
State Superseded, archived
Headers show
Series [FFmpeg-devel,v8,01/13] global: Prepare AVFrame for subtitle handling | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Soft Works Sept. 21, 2021, 11:55 p.m. UTC
Signed-off-by: softworkz <softworkz@hotmail.com>
---
 configure                        |   1 +
 doc/filters.texi                 |  55 ++++++
 libavfilter/Makefile             |   1 +
 libavfilter/allfilters.c         |   1 +
 libavfilter/sf_graphicsub2text.c | 326 +++++++++++++++++++++++++++++++
 5 files changed, 384 insertions(+)
 create mode 100644 libavfilter/sf_graphicsub2text.c

Comments

Andreas Rheinhardt Sept. 22, 2021, 2:04 a.m. UTC | #1
Soft Works:
> Signed-off-by: softworkz <softworkz@hotmail.com>
> ---
>  configure                        |   1 +
>  doc/filters.texi                 |  55 ++++++
>  libavfilter/Makefile             |   1 +
>  libavfilter/allfilters.c         |   1 +
>  libavfilter/sf_graphicsub2text.c | 326 +++++++++++++++++++++++++++++++
>  5 files changed, 384 insertions(+)
>  create mode 100644 libavfilter/sf_graphicsub2text.c
> 
> diff --git a/configure b/configure
> index 37fc4c20e7..2682e51435 100755
> --- a/configure
> +++ b/configure
> @@ -3601,6 +3601,7 @@ frei0r_deps_any="libdl LoadLibrary"
>  frei0r_filter_deps="frei0r"
>  frei0r_src_filter_deps="frei0r"
>  fspp_filter_deps="gpl"
> +graphicsub2text_filter_deps="libtesseract"
>  histeq_filter_deps="gpl"
>  hqdn3d_filter_deps="gpl"
>  interlace_filter_deps="gpl"
> diff --git a/doc/filters.texi b/doc/filters.texi
> index da463e2cc1..2b6dfbe1d4 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -25248,6 +25248,61 @@ ffmpeg -i "https://streams.videolan.org/ffmpeg/mkv_subtitles.mkv" -filter_comple
>  @end example
>  @end itemize
>  
> +@section graphicsub2text
> +
> +Converts graphic subtitles to text subtitles by performing OCR.
> +
> +For this filter to be available, ffmpeg needs to be compiled with libtesseract (see https://github.com/tesseract-ocr/tesseract).
> +Language models need to be downloaded from https://github.com/tesseract-ocr/tessdata and put into a subfolder named 'tessdata' or into a folder specified via the environment variable 'TESSDATA_PREFIX'.
> +The path can also be specified via filter option (see below).
> +
> +Note: These models include the data for both OCR modes.
> +
> +Inputs:
> +- 0: Subtitles [bitmap]
> +
> +Outputs:
> +- 0: Subtitles [text]
> +
> +It accepts the following parameters:
> +
> +@table @option
> +@item ocr_mode
> +The character recognition mode to use.
> +
> +Supported OCR modes are:
> +
> +@table @var
> +@item 0, tesseract
> +This is the classic libtesseract operation mode. It is fast but less accurate than LSTM.
> +@item 1, lstm
> +Newer OCR implementation based on ML models. It usually provides better results but requires more processing resources.
> +@item 2, both
> +Use a combination of both modes.
> +@end table
> +
> +@item tessdata_path
> +The path to a folder containing the language models to be used.
> +
> +@item language
> +The recognition language. It needs to match the first three characters of a language model file in the tessdata path.
> +
> +@end table
> +
> +
> +@subsection Examples
> +
> +@itemize
> +@item
> +Convert DVB graphic subtitles to ASS (text) subtitles
> +
> +Note: For this to work, you need to have the data file 'eng.traineddata' in a 'tessdata' subfolder (see above).
> +@example
> +ffmpeg -loglevel verbose -i "https://streams.videolan.org/streams/ts/video_subs_ttxt%2Bdvbsub.ts" -filter_complex "[0:13]graphicsub2text=ocr_mode=both" -c:s ass -y output.mkv
> +@end example
> +@end itemize
> +
> +
>  @section graphicsub2video
>  
>  Renders graphic subtitles as video frames. 
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index 39abf6d2a6..312b67982c 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -290,6 +290,7 @@ OBJS-$(CONFIG_FSPP_FILTER)                   += vf_fspp.o qp_table.o
>  OBJS-$(CONFIG_GBLUR_FILTER)                  += vf_gblur.o
>  OBJS-$(CONFIG_GEQ_FILTER)                    += vf_geq.o
>  OBJS-$(CONFIG_GRADFUN_FILTER)                += vf_gradfun.o
> +OBJS-$(CONFIG_GRAPHICSUB2TEXT_FILTER)        += sf_graphicsub2text.o
>  OBJS-$(CONFIG_GRAPHICSUB2VIDEO_FILTER)       += vf_overlay_graphicsubs.o framesync.o
>  OBJS-$(CONFIG_GRAPHMONITOR_FILTER)           += f_graphmonitor.o
>  OBJS-$(CONFIG_GRAYWORLD_FILTER)              += vf_grayworld.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index 77c6379302..ee5638dc3d 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -527,6 +527,7 @@ extern const AVFilter ff_avf_showwaves;
>  extern const AVFilter ff_avf_showwavespic;
>  extern const AVFilter ff_vaf_spectrumsynth;
>  extern const AVFilter ff_sf_censor;
> +extern const AVFilter ff_sf_graphicsub2text;
>  extern const AVFilter ff_sf_show_speaker;
>  extern const AVFilter ff_sf_split_cc;
>  extern const AVFilter ff_sf_stripstyles;
> diff --git a/libavfilter/sf_graphicsub2text.c b/libavfilter/sf_graphicsub2text.c
> new file mode 100644
> index 0000000000..157b76408e
> --- /dev/null
> +++ b/libavfilter/sf_graphicsub2text.c
> @@ -0,0 +1,326 @@
> +/*
> + * Copyright (c) 2021 softworkz
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * subtitle filter to convert graphical subs to text subs via OCR
> + */
> +
> +#include <tesseract/capi.h>
> +#include <libavcodec/ass.h>
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/opt.h"
> +#include "avfilter.h"
> +#include "internal.h"
> +#include "subtitles.h"
> +#include "libavcodec/avcodec.h"
> +#include "libavutil/file.h"
> +
> +typedef struct SubOcrContext {
> +    const AVClass *class;
> +    int w, h;
> +
> +    TessBaseAPI *tapi;
> +    TessOcrEngineMode ocr_mode;
> +    char *tessdata_path;
> +    char *language;
> +
> +    int readorder_counter;
> +
> +    AVFrame *pending_frame;
> +} SubOcrContext;
> +
> +
> +static int init(AVFilterContext *ctx)
> +{
> +    SubOcrContext *s = ctx->priv;
> +    const char* tver = TessVersion();
> +    int ret;
> +
> +    s->tapi = TessBaseAPICreate();
> +
> +    if (!s->tapi || !tver || !strlen(tver)) {
> +        av_log(ctx, AV_LOG_ERROR, "Failed to access libtesseract\n");
> +        return AVERROR(ENOSYS);
> +    }
> +
> +    av_log(ctx, AV_LOG_VERBOSE, "Initializing libtesseract, version: %s\n", tver);
> +
> +    ret = TessBaseAPIInit4(s->tapi, s->tessdata_path, s->language, s->ocr_mode, NULL, 0, NULL, NULL, 0, 1);
> +    if (ret < 0 ) {
> +        av_log(ctx, AV_LOG_ERROR, "Failed to initialize libtesseract. Error: %d\n", ret);
> +        return AVERROR(ENOSYS);
> +    }
> +
> +    ret = TessBaseAPISetVariable(s->tapi, "tessedit_char_blacklist", "|");
> +    if (ret < 0 ) {
> +        av_log(ctx, AV_LOG_ERROR, "Failed to set 'tessedit_char_blacklist'. Error: %d\n", ret);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    return 0;
> +}
> +
> +static void uninit(AVFilterContext *ctx)
> +{
> +    SubOcrContext *s = ctx->priv;
> +
> +    TessBaseAPIEnd(s->tapi);
> +    TessBaseAPIDelete(s->tapi);

Beware: uninit is also called on init failure, so it might be that
s->tapi is NULL or that TessBaseAPIInit4 did not succeed.

> +}
> +
> +static int query_formats(AVFilterContext *ctx)
> +{
> +    AVFilterFormats *formats, *formats2;
> +    AVFilterLink *inlink = ctx->inputs[0];
> +    AVFilterLink *outlink = ctx->outputs[0];
> +    static const enum AVSubtitleType in_fmts[] = { AV_SUBTITLE_FMT_BITMAP, AV_SUBTITLE_FMT_NONE };
> +    static const enum AVSubtitleType out_fmts[] = { AV_SUBTITLE_FMT_ASS, AV_SUBTITLE_FMT_NONE };
> +    int ret;
> +
> +    /* set input format */
> +    formats = ff_make_format_list(in_fmts);
> +    if ((ret = ff_formats_ref(formats, &inlink->outcfg.formats)) < 0)
> +        return ret;
> +
> +    /* set output format */
> +    formats2 = ff_make_format_list(out_fmts);
> +    if ((ret = ff_formats_ref(formats2, &outlink->incfg.formats)) < 0)
> +        return ret;
> +
> +    return 0;
> +}
> +
> +static int config_input(AVFilterLink *inlink)
> +{
> +    AVFilterContext *ctx = inlink->dst;
> +    SubOcrContext *s = ctx->priv;
> +
> +    if (s->w <= 0 || s->h <= 0) {
> +        s->w = inlink->w;
> +        s->h = inlink->h;
> +    }
> +    return 0;
> +}
> +
> +static int config_output(AVFilterLink *outlink)
> +{
> +    const AVFilterContext *ctx  = outlink->src;
> +    SubOcrContext *s = ctx->priv;
> +
> +    outlink->format = AV_SUBTITLE_FMT_ASS;
> +    outlink->w = s->w;
> +    outlink->h = s->h;
> +
> +    return 0;
> +}
> +
> +static uint8_t* create_grayscale_image(AVFilterContext *ctx, AVSubtitleArea *area)
> +{
> +    uint8_t gray_pal[256];
> +    const size_t img_size = area->buf[0]->size;
> +    const uint8_t* img    = area->buf[0]->data;
> +    uint8_t* gs_img       = av_malloc(img_size);
> +
> +    if (!gs_img)
> +        return NULL;
> +
> +    for (unsigned i = 0; i < 256; i++) {
> +        const uint8_t *col = (uint8_t*)&area->pal[i];
> +        const int val      = (int)col[3] * FFMAX3(col[0], col[1], col[2]);
> +        gray_pal[i]        = (uint8_t)(val >> 8);
> +    }
> +
> +    for (unsigned i = 0; i < img_size; i++)
> +        gs_img[i] = 255 - gray_pal[img[i]];
> +
> +    return gs_img;
> +}
> +
> +static int convert_area(AVFilterContext *ctx, AVSubtitleArea *area)
> +{
> +    SubOcrContext *s = ctx->priv;
> +    char *ocr_text = NULL;
> +    int ret;
> +    uint8_t *gs_img = create_grayscale_image(ctx, area);
> +
> +    if (!gs_img)
> +        return AVERROR(ENOMEM);
> +
> +    TessBaseAPISetImage(s->tapi, gs_img, area->w, area->h, 1, area->linesize[0]);
> +    TessBaseAPISetSourceResolution(s->tapi, 70);
> +
> +    ret = TessBaseAPIRecognize(s->tapi, NULL);
> +    if (ret == 0)
> +        ocr_text = TessBaseAPIGetUTF8Text(s->tapi);
> +
> +    if (!ocr_text) {
> +        av_log(ctx, AV_LOG_WARNING, "OCR didn't return a text. ret=%d\n", ret);
> +        area->ass = av_strdup("");
> +    }
> +    else {
> +        size_t len = strlen(ocr_text);
> +
> +        if (len > 0 && ocr_text[len - 1] == '\n')
> +            ocr_text[len - 1] = 0;
> +
> +        av_log(ctx, AV_LOG_VERBOSE, "OCR Result: %s\n", ocr_text);
> +
> +        area->ass = av_strdup(ocr_text);
> +
> +        TessDeleteText(ocr_text);
> +    }
> +
> +    av_freep(&gs_img);
> +    av_buffer_unref(&area->buf[0]);
> +    area->type = AV_SUBTITLE_FMT_ASS;
> +
> +    return 0;
> +}
> +
> +static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
> +{
> +    AVFilterContext *ctx = inlink->dst;
> +    SubOcrContext *s = ctx->priv;
> +    AVFilterLink *outlink = inlink->dst->outputs[0];
> +    int ret, frame_sent = 0;
> +
> +    if (s->pending_frame) {
> +        const uint64_t pts_diff = frame->subtitle_pts - s->pending_frame->subtitle_pts;
> +        s->pending_frame->subtitle_end_time = (uint32_t)(pts_diff / 1000);
> +
> +        ret = ff_filter_frame(outlink, s->pending_frame);
> +        s->pending_frame = NULL;
> +        if (ret < 0)
> +            return  ret;
> +
> +        frame_sent = 1;
> +
> +        if (frame->num_subtitle_areas == 0) {
> +            // No need to forward this empty frame
> +            av_frame_unref(frame);

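Leak: av_frame_unref() only releases the data the frame references; the AVFrame itself has to be freed with av_frame_free().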

> +            return 0;
> +        }
> +    }
> +
> +    av_frame_make_writable(frame);
> +
> +    if (!frame)
> +        return AVERROR(ENOMEM);

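Wrong check: it is the return value of av_frame_make_writable() that needs to be checked, not the frame pointer (which cannot become NULL here); and the frame is not freed when returning the error.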

> +
> +    frame->format = AV_SUBTITLE_FMT_ASS;
> +
> +    av_log(ctx, AV_LOG_DEBUG, "filter_frame sub_pts: %"PRIu64", start_time: %d, end_time: %d, num_areas: %d\n", 
> +        frame->subtitle_pts, frame->subtitle_start_time, frame->subtitle_end_time, frame->num_subtitle_areas);
> +
> +    if (frame->num_subtitle_areas > 1 &&
> +        frame->subtitle_areas[0]->y > frame->subtitle_areas[frame->num_subtitle_areas - 1]->y) {
> +
> +        for (unsigned i = 0; i < frame->num_subtitle_areas / 2; i++)
> +            FFSWAP(AVSubtitleArea*, frame->subtitle_areas[i], frame->subtitle_areas[frame->num_subtitle_areas - i - 1]);
> +    }
> +    
> +    for (unsigned i = 0; i < frame->num_subtitle_areas; i++) {
> +        char *tmp;
> +        AVSubtitleArea *area = frame->subtitle_areas[i];
> +
> +        ret = convert_area(ctx, area);
> +        if (ret < 0)
> +            return ret;
> +
> +        if (strlen(area->ass)) {
> +            tmp = area->ass;
> +
> +            if (i == 0)
> +                area->ass = ff_ass_get_dialog(s->readorder_counter++, 0, "Default", NULL, tmp);
> +            else
> +                area->ass = av_asprintf("\\N%s", tmp);
> +
> +            av_free(tmp);
> +        }
> +    }
> +
> +    if (frame->num_subtitle_areas > 0 && frame->subtitle_end_time >= 30000) {

Where does this number come from?

> +        // Can't send it without end time, wait for the next frame to determine the end_display time
> +        s->pending_frame = frame;
> +
> +        if (frame_sent)
> +            return 0;
> +
> +        // To keep all going, send an empty frame instead
> +        frame = ff_get_subtitles_buffer(outlink, AV_SUBTITLE_FMT_ASS);
> +        if (!frame)
> +            return AVERROR(ENOMEM);
> +
> +        av_frame_copy_props(frame, s->pending_frame);
> +        frame->subtitle_end_time = 1;
> +    }
> +
> +    return ff_filter_frame(outlink, frame);
> +}
> +
> +#define OFFSET(x) offsetof(SubOcrContext, x)
> +#define FLAGS (AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_FILTERING_PARAM)
> +
> +static const AVOption graphicsub2text_options[] = {
> +    { "ocr_mode",       "set ocr mode",                  OFFSET(ocr_mode),      AV_OPT_TYPE_INT,    {.i64=OEM_TESSERACT_ONLY},          OEM_TESSERACT_ONLY, 2, FLAGS, "ocr_mode" },
> +    {   "tesseract",    "classic tesseract ocr",         0,                     AV_OPT_TYPE_CONST,  {.i64=OEM_TESSERACT_ONLY},          0,                  0, FLAGS, "ocr_mode" },
> +    {   "lstm",         "lstm (ML based)",               0,                     AV_OPT_TYPE_CONST,  {.i64=OEM_LSTM_ONLY},               0,                  0, FLAGS, "ocr_mode" },
> +    {   "both",         "use both models combined",      0,                     AV_OPT_TYPE_CONST,  {.i64=OEM_TESSERACT_LSTM_COMBINED}, 0,                  0, FLAGS, "ocr_mode" },
> +    { "tessdata_path",  "path to tesseract data",        OFFSET(tessdata_path), AV_OPT_TYPE_STRING, {.str = NULL},                      0,                  0, FLAGS, NULL   },
> +    { "language",       "ocr language",                  OFFSET(language),      AV_OPT_TYPE_STRING, {.str = "eng"},                     0,                  0, FLAGS, NULL   },
> +    { NULL },
> +};
> +
> +AVFILTER_DEFINE_CLASS(graphicsub2text);
> +
> +static const AVFilterPad inputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_SUBTITLE,
> +        .filter_frame = filter_frame,
> +        .config_props = config_input,
> +    },
> +};
> +
> +static const AVFilterPad outputs[] = {
> +    {
> +        .name          = "default",
> +        .type          = AVMEDIA_TYPE_SUBTITLE,
> +        .config_props  = config_output,
> +    },
> +};
> +
> +/*
> + * Example:
> + * ffmpeg -loglevel verbose -i "https://streams.videolan.org/streams/ts/video_subs_ttxt%2Bdvbsub.ts" -filter_complex "[0:13]graphicsub2text=ocr_mode=both" -c:s ass -y output.mkv
> + */
> +const AVFilter ff_sf_graphicsub2text = {
> +    .name          = "graphicsub2text",
> +    .description   = NULL_IF_CONFIG_SMALL("Convert graphical subtitles to text subtitles via OCR"),
> +    .init          = init,
> +    .uninit        = uninit,
> +    .query_formats = query_formats,
> +    .priv_size     = sizeof(SubOcrContext),
> +    .priv_class    = &graphicsub2text_class,
> +    FILTER_INPUTS(inputs),
> +    FILTER_OUTPUTS(outputs),
> +};
>
Soft Works Sept. 22, 2021, 2:17 a.m. UTC | #2
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Andreas
> Rheinhardt
> Sent: Wednesday, 22 September 2021 04:05
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH v8 13/13] avfilter/graphicsub2text: Add
> new graphicsub2text filter (OCR)
> 
> > +
> > +static void uninit(AVFilterContext *ctx)
> > +{
> > +    SubOcrContext *s = ctx->priv;
> > +
> > +    TessBaseAPIEnd(s->tapi);
> > +    TessBaseAPIDelete(s->tapi);
> 
> Beware: uninit is also called on init failure, so it might be that
> s->tapi is NULL or that TessBaseAPIInit4 did not succeed.

vf_ocr does it in the same way, so I assumed it to be safe.

Unfortunately there's no proper API documentation for tesseract.

[..] Will apply the stripped notes - thank you!

> > +    if (frame->num_subtitle_areas > 0 && frame->subtitle_end_time >=
> 30000) {
> 
> Where does this number come from?

dvbsubdec uses this value when it can't determine the end display
time. Others set it to UINT64_MAX.

It indicates that the subtitles do not have a display duration and
that we need to wait for the next one instead.


softworkz
Andreas Rheinhardt Sept. 22, 2021, 2:25 a.m. UTC | #3
Soft Works:
> 
> 
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Andreas
>> Rheinhardt
>> Sent: Wednesday, 22 September 2021 04:05
>> To: ffmpeg-devel@ffmpeg.org
>> Subject: Re: [FFmpeg-devel] [PATCH v8 13/13] avfilter/graphicsub2text: Add
>> new graphicsub2text filter (OCR)
>>
>>> +
>>> +static void uninit(AVFilterContext *ctx)
>>> +{
>>> +    SubOcrContext *s = ctx->priv;
>>> +
>>> +    TessBaseAPIEnd(s->tapi);
>>> +    TessBaseAPIDelete(s->tapi);
>>
>> Beware: uninit is also called on init failure, so it might be that
>> s->tapi is NULL or that TessBaseAPIInit4 did not succeed.
> 
> vf_ocr does it in the same way, so I assumed it to be safe.
> 

I wish it were that easy. Our own APIs are (usually) safe in the sense
that an object that has been memset to zero (as all those contexts are
initially) can be passed to the free function even when it has never
been initialized. But this need not be true for external APIs. And
unfortunately error paths are often untested.
For a bad example, just take a look at vf_libvmaf.c: Its uninit
function locks a mutex, regardless of whether said mutex has been
initialized at all. That's UB.

> Unfortunately there's no proper API documentation for tesseract.
>
Soft Works Sept. 22, 2021, 2:35 a.m. UTC | #4
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Andreas
> Rheinhardt
> Sent: Wednesday, 22 September 2021 04:26
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH v8 13/13] avfilter/graphicsub2text: Add
> new graphicsub2text filter (OCR)
> 
> Soft Works:
> >
> >
> >> -----Original Message-----
> >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Andreas
> >> Rheinhardt
> >> Sent: Wednesday, 22 September 2021 04:05
> >> To: ffmpeg-devel@ffmpeg.org
> >> Subject: Re: [FFmpeg-devel] [PATCH v8 13/13] avfilter/graphicsub2text: Add
> >> new graphicsub2text filter (OCR)
> >>
> >>> +
> >>> +static void uninit(AVFilterContext *ctx)
> >>> +{
> >>> +    SubOcrContext *s = ctx->priv;
> >>> +
> >>> +    TessBaseAPIEnd(s->tapi);
> >>> +    TessBaseAPIDelete(s->tapi);
> >>
> >> Beware: uninit is also called on init failure, so it might be that
> >> s->tapi is NULL or that TessBaseAPIInit4 did not succeed.
> >
> > vf_ocr does it in the same way, so I assumed it to be safe.
> >
> 
> I wish it were that easy. Our own APIs are (usually) safe in the sense
> that an object that has been memset to zero (as all those contexts are
> initially) can be passed to the free function even when it has never
> been initialized. But this need not be true for external APIs. And
> unfortunately error paths are often untested.

That's true. Though, I looked at more than a handful of other code
bases using tesseract and I've never seen any additional checks in
the uninit code. But now that you said it - I'll try it out to be sure :-)

Thanks,
softworkz
diff mbox series

Patch

diff --git a/configure b/configure
index 37fc4c20e7..2682e51435 100755
--- a/configure
+++ b/configure
@@ -3601,6 +3601,7 @@  frei0r_deps_any="libdl LoadLibrary"
 frei0r_filter_deps="frei0r"
 frei0r_src_filter_deps="frei0r"
 fspp_filter_deps="gpl"
+graphicsub2text_filter_deps="libtesseract"
 histeq_filter_deps="gpl"
 hqdn3d_filter_deps="gpl"
 interlace_filter_deps="gpl"
diff --git a/doc/filters.texi b/doc/filters.texi
index da463e2cc1..2b6dfbe1d4 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -25248,6 +25248,61 @@  ffmpeg -i "https://streams.videolan.org/ffmpeg/mkv_subtitles.mkv" -filter_comple
 @end example
 @end itemize
 
+@section graphicsub2text
+
+Converts graphic subtitles to text subtitles by performing OCR.
+
+For this filter to be available, ffmpeg needs to be compiled with libtesseract (see https://github.com/tesseract-ocr/tesseract).
+Language models need to be downloaded from https://github.com/tesseract-ocr/tessdata and put into a subfolder named 'tessdata' or into a folder specified via the environment variable 'TESSDATA_PREFIX'.
+The path can also be specified via filter option (see below).
+
+Note: These models include the data for both OCR modes.
+
+Inputs:
+- 0: Subtitles [bitmap]
+
+Outputs:
+- 0: Subtitles [text]
+
+It accepts the following parameters:
+
+@table @option
+@item ocr_mode
+The character recognition mode to use.
+
+Supported OCR modes are:
+
+@table @var
+@item 0, tesseract
+This is the classic libtesseract operation mode. It is fast but less accurate than LSTM.
+@item 1, lstm
+Newer OCR implementation based on ML models. It usually provides better results but requires more processing resources.
+@item 2, both
+Use a combination of both modes.
+@end table
+
+@item tessdata_path
+The path to a folder containing the language models to be used.
+
+@item language
+The recognition language. It needs to match the first three characters of a language model file in the tessdata path.
+
+@end table
+
+
+@subsection Examples
+
+@itemize
+@item
+Convert DVB graphic subtitles to ASS (text) subtitles
+
+Note: For this to work, you need to have the data file 'eng.traineddata' in a 'tessdata' subfolder (see above).
+@example
+ffmpeg -loglevel verbose -i "https://streams.videolan.org/streams/ts/video_subs_ttxt%2Bdvbsub.ts" -filter_complex "[0:13]graphicsub2text=ocr_mode=both" -c:s ass -y output.mkv
+@end example
+@end itemize
+
+
 @section graphicsub2video
 
 Renders graphic subtitles as video frames. 
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 39abf6d2a6..312b67982c 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -290,6 +290,7 @@  OBJS-$(CONFIG_FSPP_FILTER)                   += vf_fspp.o qp_table.o
 OBJS-$(CONFIG_GBLUR_FILTER)                  += vf_gblur.o
 OBJS-$(CONFIG_GEQ_FILTER)                    += vf_geq.o
 OBJS-$(CONFIG_GRADFUN_FILTER)                += vf_gradfun.o
+OBJS-$(CONFIG_GRAPHICSUB2TEXT_FILTER)        += sf_graphicsub2text.o
 OBJS-$(CONFIG_GRAPHICSUB2VIDEO_FILTER)       += vf_overlay_graphicsubs.o framesync.o
 OBJS-$(CONFIG_GRAPHMONITOR_FILTER)           += f_graphmonitor.o
 OBJS-$(CONFIG_GRAYWORLD_FILTER)              += vf_grayworld.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 77c6379302..ee5638dc3d 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -527,6 +527,7 @@  extern const AVFilter ff_avf_showwaves;
 extern const AVFilter ff_avf_showwavespic;
 extern const AVFilter ff_vaf_spectrumsynth;
 extern const AVFilter ff_sf_censor;
+extern const AVFilter ff_sf_graphicsub2text;
 extern const AVFilter ff_sf_show_speaker;
 extern const AVFilter ff_sf_split_cc;
 extern const AVFilter ff_sf_stripstyles;
diff --git a/libavfilter/sf_graphicsub2text.c b/libavfilter/sf_graphicsub2text.c
new file mode 100644
index 0000000000..157b76408e
--- /dev/null
+++ b/libavfilter/sf_graphicsub2text.c
@@ -0,0 +1,326 @@ 
+/*
+ * Copyright (c) 2021 softworkz
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * subtitle filter to convert graphical subs to text subs via OCR
+ */
+
+#include <tesseract/capi.h>
+#include <libavcodec/ass.h>
+
+#include "libavutil/avassert.h"
+#include "libavutil/opt.h"
+#include "avfilter.h"
+#include "internal.h"
+#include "subtitles.h"
+#include "libavcodec/avcodec.h"
+#include "libavutil/file.h"
+
+typedef struct SubOcrContext {
+    const AVClass *class;
+    int w, h;
+
+    TessBaseAPI *tapi;
+    TessOcrEngineMode ocr_mode;
+    char *tessdata_path;
+    char *language;
+
+    int readorder_counter;
+
+    AVFrame *pending_frame;
+} SubOcrContext;
+
+
+static int init(AVFilterContext *ctx)
+{
+    SubOcrContext *s = ctx->priv;
+    const char* tver = TessVersion();
+    int ret;
+
+    s->tapi = TessBaseAPICreate();
+
+    if (!s->tapi || !tver || !strlen(tver)) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to access libtesseract\n");
+        return AVERROR(ENOSYS);
+    }
+
+    av_log(ctx, AV_LOG_VERBOSE, "Initializing libtesseract, version: %s\n", tver);
+
+    ret = TessBaseAPIInit4(s->tapi, s->tessdata_path, s->language, s->ocr_mode, NULL, 0, NULL, NULL, 0, 1);
+    if (ret < 0 ) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to initialize libtesseract. Error: %d\n", ret);
+        return AVERROR(ENOSYS);
+    }
+
+    ret = TessBaseAPISetVariable(s->tapi, "tessedit_char_blacklist", "|");
+    if (ret < 0 ) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to set 'tessedit_char_blacklist'. Error: %d\n", ret);
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
+static void uninit(AVFilterContext *ctx)
+{
+    SubOcrContext *s = ctx->priv;
+
+    TessBaseAPIEnd(s->tapi);
+    TessBaseAPIDelete(s->tapi);
+}
+
+static int query_formats(AVFilterContext *ctx)
+{
+    AVFilterFormats *formats, *formats2;
+    AVFilterLink *inlink = ctx->inputs[0];
+    AVFilterLink *outlink = ctx->outputs[0];
+    static const enum AVSubtitleType in_fmts[] = { AV_SUBTITLE_FMT_BITMAP, AV_SUBTITLE_FMT_NONE };
+    static const enum AVSubtitleType out_fmts[] = { AV_SUBTITLE_FMT_ASS, AV_SUBTITLE_FMT_NONE };
+    int ret;
+
+    /* set input format */
+    formats = ff_make_format_list(in_fmts);
+    if ((ret = ff_formats_ref(formats, &inlink->outcfg.formats)) < 0)
+        return ret;
+
+    /* set output format */
+    formats2 = ff_make_format_list(out_fmts);
+    if ((ret = ff_formats_ref(formats2, &outlink->incfg.formats)) < 0)
+        return ret;
+
+    return 0;
+}
+
+static int config_input(AVFilterLink *inlink)
+{
+    AVFilterContext *ctx = inlink->dst;
+    SubOcrContext *s = ctx->priv;
+
+    if (s->w <= 0 || s->h <= 0) {
+        s->w = inlink->w;
+        s->h = inlink->h;
+    }
+    return 0;
+}
+
+static int config_output(AVFilterLink *outlink)
+{
+    const AVFilterContext *ctx  = outlink->src;
+    SubOcrContext *s = ctx->priv;
+
+    outlink->format = AV_SUBTITLE_FMT_ASS;
+    outlink->w = s->w;
+    outlink->h = s->h;
+
+    return 0;
+}
+
+static uint8_t* create_grayscale_image(AVFilterContext *ctx, AVSubtitleArea *area)
+{
+    uint8_t gray_pal[256];
+    const size_t img_size = area->buf[0]->size;
+    const uint8_t* img    = area->buf[0]->data;
+    uint8_t* gs_img       = av_malloc(img_size);
+
+    if (!gs_img)
+        return NULL;
+
+    for (unsigned i = 0; i < 256; i++) {
+        const uint8_t *col = (uint8_t*)&area->pal[i];
+        const int val      = (int)col[3] * FFMAX3(col[0], col[1], col[2]);
+        gray_pal[i]        = (uint8_t)(val >> 8);
+    }
+
+    for (unsigned i = 0; i < img_size; i++)
+        gs_img[i] = 255 - gray_pal[img[i]];
+
+    return gs_img;
+}
+
+static int convert_area(AVFilterContext *ctx, AVSubtitleArea *area)
+{
+    SubOcrContext *s = ctx->priv;
+    char *ocr_text = NULL;
+    int ret;
+    uint8_t *gs_img = create_grayscale_image(ctx, area);
+
+    if (!gs_img)
+        return AVERROR(ENOMEM);
+
+    TessBaseAPISetImage(s->tapi, gs_img, area->w, area->h, 1, area->linesize[0]);
+    TessBaseAPISetSourceResolution(s->tapi, 70);
+
+    ret = TessBaseAPIRecognize(s->tapi, NULL);
+    if (ret == 0)
+        ocr_text = TessBaseAPIGetUTF8Text(s->tapi);
+
+    if (!ocr_text) {
+        av_log(ctx, AV_LOG_WARNING, "OCR didn't return a text. ret=%d\n", ret);
+        area->ass = av_strdup("");
+    }
+    else {
+        size_t len = strlen(ocr_text);
+
+        if (len > 0 && ocr_text[len - 1] == '\n')
+            ocr_text[len - 1] = 0;
+
+        av_log(ctx, AV_LOG_VERBOSE, "OCR Result: %s\n", ocr_text);
+
+        area->ass = av_strdup(ocr_text);
+
+        TessDeleteText(ocr_text);
+    }
+
+    av_freep(&gs_img);
+    av_buffer_unref(&area->buf[0]);
+    area->type = AV_SUBTITLE_FMT_ASS;
+
+    return 0;
+}
+
+static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
+{
+    AVFilterContext *ctx = inlink->dst;
+    SubOcrContext *s = ctx->priv;
+    AVFilterLink *outlink = inlink->dst->outputs[0];
+    int ret, frame_sent = 0;
+
+    if (s->pending_frame) {
+        const uint64_t pts_diff = frame->subtitle_pts - s->pending_frame->subtitle_pts;
+        s->pending_frame->subtitle_end_time = (uint32_t)(pts_diff / 1000);
+
+        ret = ff_filter_frame(outlink, s->pending_frame);
+        s->pending_frame = NULL;
+        if (ret < 0)
+            return  ret;
+
+        frame_sent = 1;
+
+        if (frame->num_subtitle_areas == 0) {
+            // No need to forward this empty frame
+            av_frame_unref(frame);
+            return 0;
+        }
+    }
+
+    av_frame_make_writable(frame);
+
+    if (!frame)
+        return AVERROR(ENOMEM);
+
+    frame->format = AV_SUBTITLE_FMT_ASS;
+
+    av_log(ctx, AV_LOG_DEBUG, "filter_frame sub_pts: %"PRIu64", start_time: %d, end_time: %d, num_areas: %d\n", 
+        frame->subtitle_pts, frame->subtitle_start_time, frame->subtitle_end_time, frame->num_subtitle_areas);
+
+    if (frame->num_subtitle_areas > 1 &&
+        frame->subtitle_areas[0]->y > frame->subtitle_areas[frame->num_subtitle_areas - 1]->y) {
+
+        for (unsigned i = 0; i < frame->num_subtitle_areas / 2; i++)
+            FFSWAP(AVSubtitleArea*, frame->subtitle_areas[i], frame->subtitle_areas[frame->num_subtitle_areas - i - 1]);
+    }
+    
+    for (unsigned i = 0; i < frame->num_subtitle_areas; i++) {
+        char *tmp;
+        AVSubtitleArea *area = frame->subtitle_areas[i];
+
+        ret = convert_area(ctx, area);
+        if (ret < 0)
+            return ret;
+
+        if (strlen(area->ass)) {
+            tmp = area->ass;
+
+            if (i == 0)
+                area->ass = ff_ass_get_dialog(s->readorder_counter++, 0, "Default", NULL, tmp);
+            else
+                area->ass = av_asprintf("\\N%s", tmp);
+
+            av_free(tmp);
+        }
+    }
+
+    if (frame->num_subtitle_areas > 0 && frame->subtitle_end_time >= 30000) {
+        // Can't send it without end time, wait for the next frame to determine the end_display time
+        s->pending_frame = frame;
+
+        if (frame_sent)
+            return 0;
+
+        // To keep all going, send an empty frame instead
+        frame = ff_get_subtitles_buffer(outlink, AV_SUBTITLE_FMT_ASS);
+        if (!frame)
+            return AVERROR(ENOMEM);
+
+        av_frame_copy_props(frame, s->pending_frame);
+        frame->subtitle_end_time = 1;
+    }
+
+    return ff_filter_frame(outlink, frame);
+}
+
+#define OFFSET(x) offsetof(SubOcrContext, x)
+#define FLAGS (AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_FILTERING_PARAM)
+
+static const AVOption graphicsub2text_options[] = {
+    { "ocr_mode",       "set ocr mode",                  OFFSET(ocr_mode),      AV_OPT_TYPE_INT,    {.i64=OEM_TESSERACT_ONLY},          OEM_TESSERACT_ONLY, 2, FLAGS, "ocr_mode" },
+    {   "tesseract",    "classic tesseract ocr",         0,                     AV_OPT_TYPE_CONST,  {.i64=OEM_TESSERACT_ONLY},          0,                  0, FLAGS, "ocr_mode" },
+    {   "lstm",         "lstm (ML based)",               0,                     AV_OPT_TYPE_CONST,  {.i64=OEM_LSTM_ONLY},               0,                  0, FLAGS, "ocr_mode" },
+    {   "both",         "use both models combined",      0,                     AV_OPT_TYPE_CONST,  {.i64=OEM_TESSERACT_LSTM_COMBINED}, 0,                  0, FLAGS, "ocr_mode" },
+    { "tessdata_path",  "path to tesseract data",        OFFSET(tessdata_path), AV_OPT_TYPE_STRING, {.str = NULL},                      0,                  0, FLAGS, NULL   },
+    { "language",       "ocr language",                  OFFSET(language),      AV_OPT_TYPE_STRING, {.str = "eng"},                     0,                  0, FLAGS, NULL   },
+    { NULL },
+};
+
+AVFILTER_DEFINE_CLASS(graphicsub2text);
+
+static const AVFilterPad inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_SUBTITLE,
+        .filter_frame = filter_frame,
+        .config_props = config_input,
+    },
+};
+
+static const AVFilterPad outputs[] = {
+    {
+        .name          = "default",
+        .type          = AVMEDIA_TYPE_SUBTITLE,
+        .config_props  = config_output,
+    },
+};
+
+/*
+ * Example:
+ * ffmpeg -loglevel verbose -i "https://streams.videolan.org/streams/ts/video_subs_ttxt%2Bdvbsub.ts" -filter_complex "[0:13]graphicsub2text=ocr_mode=both" -c:s ass -y output.mkv
+ */
+const AVFilter ff_sf_graphicsub2text = {
+    .name          = "graphicsub2text",
+    .description   = NULL_IF_CONFIG_SMALL("Convert graphical subtitles to text subtitles via OCR"),
+    .init          = init,
+    .uninit        = uninit,
+    .query_formats = query_formats,
+    .priv_size     = sizeof(SubOcrContext),
+    .priv_class    = &graphicsub2text_class,
+    FILTER_INPUTS(inputs),
+    FILTER_OUTPUTS(outputs),
+};