diff mbox series

[FFmpeg-devel] lavf/vf_ocr: add subregion support

Message ID tencent_71C3C4E3D7723078068E5BC46F20F8BEA809@qq.com
State Superseded
Headers show
Series [FFmpeg-devel] lavf/vf_ocr: add subregion support | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished

Commit Message

Lingjiang Fang June 17, 2021, 6:01 a.m. UTC
---
 doc/filters.texi     |  7 +++++++
 libavfilter/vf_ocr.c | 30 +++++++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 1 deletion(-)

Comments

Gyan Doshi June 17, 2021, 6:27 a.m. UTC | #1
On 2021-06-17 11:31, Lingjiang Fang wrote:
> ---
>   doc/filters.texi     |  7 +++++++
>   libavfilter/vf_ocr.c | 30 +++++++++++++++++++++++++++++-
>   2 files changed, 36 insertions(+), 1 deletion(-)
>
> diff --git a/doc/filters.texi b/doc/filters.texi
> index da8f7d7726..9c650a2a5a 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -15451,6 +15451,13 @@ Set character whitelist.
>   
>   @item blacklist
>   Set character blacklist.
> +
> +@item x, y
> +Set top point position of subregion, not support expression now

--> Set position of top-left corner, in pixels.

> +
> +@item w, h
> +Set Width and height of subregion

s/Width/width


> +
>   @end table
>   
>   The filter exports recognized text as the frame metadata @code{lavfi.ocr.text}.
> diff --git a/libavfilter/vf_ocr.c b/libavfilter/vf_ocr.c
> index 6de474025a..7beb101679 100644
> --- a/libavfilter/vf_ocr.c
> +++ b/libavfilter/vf_ocr.c
> @@ -33,6 +33,8 @@ typedef struct OCRContext {
>       char *language;
>       char *whitelist;
>       char *blacklist;
> +    int x, y;
> +    int w, h;
>   
>       TessBaseAPI *tess;
>   } OCRContext;
> @@ -45,6 +47,10 @@ static const AVOption ocr_options[] = {
>       { "language",  "set language",            OFFSET(language),  AV_OPT_TYPE_STRING, {.str="eng"}, 0, 0, FLAGS },
>       { "whitelist", "set character whitelist", OFFSET(whitelist), AV_OPT_TYPE_STRING, {.str="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.:;,-+_!?\"'[]{}()<>|/\\=*&%$#@!~ "}, 0, 0, FLAGS },
>       { "blacklist", "set character blacklist", OFFSET(blacklist), AV_OPT_TYPE_STRING, {.str=""},    0, 0, FLAGS },
> +    { "x",         "top x of sub region",     OFFSET(x),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
> +    { "y",         "top y of sub region",     OFFSET(y),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
> +    { "w",         "width of sub region",     OFFSET(w),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
> +    { "h",         "height of sub region",    OFFSET(h),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
>       { NULL }
>   };
>   
> @@ -73,6 +79,19 @@ static av_cold int init(AVFilterContext *ctx)
>       return 0;
>   }
>   
> +static int config_input(AVFilterLink *inlink)
> +{
> +    OCRContext  *s = inlink->dst->priv;
> +
> +    // may call many times, we don't check w/h here
> +    if (s->x < 0 || s->y < 0) {
> +        s->x = 0;
> +        s->y = 0;

These are AV_OPT_TYPE_INT with range set as 0 to INT_MAX, so the opt 
parser should disallow negative values.

Regards,
Gyan

> +    }
> +
> +    return 0;
> +}
> +
>   static int query_formats(AVFilterContext *ctx)
>   {
>       static const enum AVPixelFormat pix_fmts[] = {
> @@ -101,9 +120,17 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
>       OCRContext *s = ctx->priv;
>       char *result;
>       int *confs;
> +    int w = s->w;
> +    int h = s->h;
> +
> +    if (w <= 0 || h <= 0) {
> +        w = in->width;
> +        h = in->height;
> +    }
>   
> +    av_log(s, AV_LOG_ERROR, "x=%d, y=%d, w=%d, h=%d\n", s->x, s->y, w, h);
>       result = TessBaseAPIRect(s->tess, in->data[0], 1,
> -                             in->linesize[0], 0, 0, in->width, in->height);
> +                             in->linesize[0], s->x, s->y, w, h);
>       confs = TessBaseAPIAllWordConfidences(s->tess);
>       av_dict_set(metadata, "lavfi.ocr.text", result, 0);
>       for (int i = 0; confs[i] != -1; i++) {
> @@ -134,6 +161,7 @@ static const AVFilterPad ocr_inputs[] = {
>           .name         = "default",
>           .type         = AVMEDIA_TYPE_VIDEO,
>           .filter_frame = filter_frame,
> +        .config_props = config_input,
>       },
>       { NULL }
>   };
Lingjiang Fang June 17, 2021, 3:02 p.m. UTC | #2
On Thu, 17 Jun 2021 11:57:00 +0530
Gyan Doshi <ffmpeg@gyani.pro> wrote:

> On 2021-06-17 11:31, Lingjiang Fang wrote:
> > ---
> >   doc/filters.texi     |  7 +++++++
> >   libavfilter/vf_ocr.c | 30 +++++++++++++++++++++++++++++-
> >   2 files changed, 36 insertions(+), 1 deletion(-)
> >
> > diff --git a/doc/filters.texi b/doc/filters.texi
> > index da8f7d7726..9c650a2a5a 100644
> > --- a/doc/filters.texi
> > +++ b/doc/filters.texi
> > @@ -15451,6 +15451,13 @@ Set character whitelist.
> >   
> >   @item blacklist
> >   Set character blacklist.
> > +
> > +@item x, y
> > +Set top point position of subregion, not support expression now  
> 
> --> Set position of top-left corner, in pixels.  
> 
> > +
> > +@item w, h
> > +Set Width and height of subregion  
> 
> s/Width/width
Fixed in version 2, thanks.

> 
> 
> > +
> >   @end table
> >   
> >   The filter exports recognized text as the frame metadata
> > @code{lavfi.ocr.text}. diff --git a/libavfilter/vf_ocr.c
> > b/libavfilter/vf_ocr.c index 6de474025a..7beb101679 100644
> > --- a/libavfilter/vf_ocr.c
> > +++ b/libavfilter/vf_ocr.c
> > @@ -33,6 +33,8 @@ typedef struct OCRContext {
> >       char *language;
> >       char *whitelist;
> >       char *blacklist;
> > +    int x, y;
> > +    int w, h;
> >   
> >       TessBaseAPI *tess;
> >   } OCRContext;
> > @@ -45,6 +47,10 @@ static const AVOption ocr_options[] = {
> >       { "language",  "set language",            OFFSET(language),
> > AV_OPT_TYPE_STRING, {.str="eng"}, 0, 0, FLAGS }, { "whitelist",
> > "set character whitelist", OFFSET(whitelist), AV_OPT_TYPE_STRING,
> > {.str="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.:;,-+_!?\"'[]{}()<>|/\\=*&%$#@!~
> > "}, 0, 0, FLAGS }, { "blacklist", "set character blacklist",
> > OFFSET(blacklist), AV_OPT_TYPE_STRING, {.str=""},    0, 0, FLAGS },
> > +    { "x",         "top x of sub region",     OFFSET(x),
> > AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
> > +    { "y",         "top y of sub region",     OFFSET(y),
> > AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
> > +    { "w",         "width of sub region",     OFFSET(w),
> > AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
> > +    { "h",         "height of sub region",    OFFSET(h),
> > AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS }, { NULL }
> >   };
> >   
> > @@ -73,6 +79,19 @@ static av_cold int init(AVFilterContext *ctx)
> >       return 0;
> >   }
> >   
> > +static int config_input(AVFilterLink *inlink)
> > +{
> > +    OCRContext  *s = inlink->dst->priv;
> > +
> > +    // may call many times, we don't check w/h here
> > +    if (s->x < 0 || s->y < 0) {
> > +        s->x = 0;
> > +        s->y = 0;  
> 
> These are AV_OPT_TYPE_INT with range set as 0 to INT_MAX, so the opt 
> parser should disallow negative values.
Removed the redundant check in version 2, thanks.


> 
> Regards,
> Gyan
> 
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> >   static int query_formats(AVFilterContext *ctx)
> >   {
> >       static const enum AVPixelFormat pix_fmts[] = {
> > @@ -101,9 +120,17 @@ static int filter_frame(AVFilterLink *inlink,
> > AVFrame *in) OCRContext *s = ctx->priv;
> >       char *result;
> >       int *confs;
> > +    int w = s->w;
> > +    int h = s->h;
> > +
> > +    if (w <= 0 || h <= 0) {
> > +        w = in->width;
> > +        h = in->height;
> > +    }
> >   
> > +    av_log(s, AV_LOG_ERROR, "x=%d, y=%d, w=%d, h=%d\n", s->x,
> > s->y, w, h); result = TessBaseAPIRect(s->tess, in->data[0], 1,
> > -                             in->linesize[0], 0, 0, in->width,
> > in->height);
> > +                             in->linesize[0], s->x, s->y, w, h);
> >       confs = TessBaseAPIAllWordConfidences(s->tess);
> >       av_dict_set(metadata, "lavfi.ocr.text", result, 0);
> >       for (int i = 0; confs[i] != -1; i++) {
> > @@ -134,6 +161,7 @@ static const AVFilterPad ocr_inputs[] = {
> >           .name         = "default",
> >           .type         = AVMEDIA_TYPE_VIDEO,
> >           .filter_frame = filter_frame,
> > +        .config_props = config_input,
> >       },
> >       { NULL }
> >   };  
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".



Regards,
Lingjiang Fang
diff mbox series

Patch

diff --git a/doc/filters.texi b/doc/filters.texi
index da8f7d7726..9c650a2a5a 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -15451,6 +15451,13 @@  Set character whitelist.
 
 @item blacklist
 Set character blacklist.
+
+@item x, y
+Set the position of the top-left corner of the subregion, in pixels. Expressions are not currently supported.
+
+@item w, h
+Set the width and height of the subregion.
+
 @end table
 
 The filter exports recognized text as the frame metadata @code{lavfi.ocr.text}.
diff --git a/libavfilter/vf_ocr.c b/libavfilter/vf_ocr.c
index 6de474025a..7beb101679 100644
--- a/libavfilter/vf_ocr.c
+++ b/libavfilter/vf_ocr.c
@@ -33,6 +33,8 @@  typedef struct OCRContext {
     char *language;
     char *whitelist;
     char *blacklist;
+    int x, y;
+    int w, h;
 
     TessBaseAPI *tess;
 } OCRContext;
@@ -45,6 +47,10 @@  static const AVOption ocr_options[] = {
     { "language",  "set language",            OFFSET(language),  AV_OPT_TYPE_STRING, {.str="eng"}, 0, 0, FLAGS },
     { "whitelist", "set character whitelist", OFFSET(whitelist), AV_OPT_TYPE_STRING, {.str="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.:;,-+_!?\"'[]{}()<>|/\\=*&%$#@!~ "}, 0, 0, FLAGS },
     { "blacklist", "set character blacklist", OFFSET(blacklist), AV_OPT_TYPE_STRING, {.str=""},    0, 0, FLAGS },
+    { "x",         "top x of sub region",     OFFSET(x),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
+    { "y",         "top y of sub region",     OFFSET(y),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
+    { "w",         "width of sub region",     OFFSET(w),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
+    { "h",         "height of sub region",    OFFSET(h),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
     { NULL }
 };
 
@@ -73,6 +79,19 @@  static av_cold int init(AVFilterContext *ctx)
     return 0;
 }
 
+static int config_input(AVFilterLink *inlink)
+{
+    OCRContext  *s = inlink->dst->priv;
+
+    // may call many times, we don't check w/h here
+    if (s->x < 0 || s->y < 0) {
+        s->x = 0;
+        s->y = 0;
+    }
+
+    return 0;
+}
+
 static int query_formats(AVFilterContext *ctx)
 {
     static const enum AVPixelFormat pix_fmts[] = {
@@ -101,9 +120,17 @@  static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     OCRContext *s = ctx->priv;
     char *result;
     int *confs;
+    int w = s->w;
+    int h = s->h;
+
+    if (w <= 0 || h <= 0) {
+        w = in->width;
+        h = in->height;
+    }
 
+    av_log(s, AV_LOG_ERROR, "x=%d, y=%d, w=%d, h=%d\n", s->x, s->y, w, h);
     result = TessBaseAPIRect(s->tess, in->data[0], 1,
-                             in->linesize[0], 0, 0, in->width, in->height);
+                             in->linesize[0], s->x, s->y, w, h);
     confs = TessBaseAPIAllWordConfidences(s->tess);
     av_dict_set(metadata, "lavfi.ocr.text", result, 0);
     for (int i = 0; confs[i] != -1; i++) {
@@ -134,6 +161,7 @@  static const AVFilterPad ocr_inputs[] = {
         .name         = "default",
         .type         = AVMEDIA_TYPE_VIDEO,
         .filter_frame = filter_frame,
+        .config_props = config_input,
     },
     { NULL }
 };