diff mbox series

[FFmpeg-devel,V4] lavf/vf_ocr: add subregion support

Message ID tencent_12E2816BA4189603142D1237697CC4D46B08@qq.com
State Superseded
Headers show
Series [FFmpeg-devel,V4] lavf/vf_ocr: add subregion support | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Lingjiang Fang July 12, 2021, 5:14 a.m. UTC
follow comments from Steven Liu
---
 doc/filters.texi     |  8 ++++++++
 libavfilter/vf_ocr.c | 45 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 52 insertions(+), 1 deletion(-)

Comments

Lingjiang Fang July 20, 2021, 11:54 a.m. UTC | #1
On Mon, 12 Jul 2021 13:14:01 +0800
Lingjiang Fang <vacingfang@foxmail.com> wrote:

ping for review, thanks

> follow comments from Steven Liu
> ---
>  doc/filters.texi     |  8 ++++++++
>  libavfilter/vf_ocr.c | 45
> +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 52
> insertions(+), 1 deletion(-)
> 
> diff --git a/doc/filters.texi b/doc/filters.texi
> index d991c06628..f41ba0ce46 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -15457,6 +15457,14 @@ Set character whitelist.
>  
>  @item blacklist
>  Set character blacklist.
> +
> +@item x, y
> +Set top-left corner of the subregion, in pixels, default is (0,0).
> +
> +@item w, h
> +Set width and height of the subregion, in pixels,
> +default is the bottom-right part from given top-left corner.
> +
>  @end table
>  
>  The filter exports recognized text as the frame metadata
> @code{lavfi.ocr.text}. diff --git a/libavfilter/vf_ocr.c
> b/libavfilter/vf_ocr.c index 6de474025a..55f04b6592 100644
> --- a/libavfilter/vf_ocr.c
> +++ b/libavfilter/vf_ocr.c
> @@ -33,6 +33,8 @@ typedef struct OCRContext {
>      char *language;
>      char *whitelist;
>      char *blacklist;
> +    int x, y, x_in, y_in;
> +    int w, h, w_in, h_in;
>  
>      TessBaseAPI *tess;
>  } OCRContext;
> @@ -45,6 +47,10 @@ static const AVOption ocr_options[] = {
>      { "language",  "set language",            OFFSET(language),
> AV_OPT_TYPE_STRING, {.str="eng"}, 0, 0, FLAGS }, { "whitelist", "set
> character whitelist", OFFSET(whitelist), AV_OPT_TYPE_STRING,
> {.str="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.:;,-+_!?\"'[]{}()<>|/\\=*&%$#@!~
> "}, 0, 0, FLAGS }, { "blacklist", "set character blacklist",
> OFFSET(blacklist), AV_OPT_TYPE_STRING, {.str=""},    0, 0, FLAGS },
> +    { "x",         "top x of sub region",     OFFSET(x),
> AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
> +    { "y",         "top y of sub region",     OFFSET(y),
> AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
> +    { "w",         "width of sub region",     OFFSET(w),
> AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
> +    { "h",         "height of sub region",    OFFSET(h),
> AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS }, { NULL }
>  };
>  
> @@ -93,6 +99,41 @@ static int query_formats(AVFilterContext *ctx)
>      return ff_set_common_formats(ctx, fmts_list);
>  }
>  
> +static void check_fix(int *x, int *y, int *w, int *h, int pic_w, int
> pic_h) +{
> +    // 0 <= x < pic_w
> +    if (*x >= pic_w)
> +        *x = 0;
> +    // 0 <= y < pic_h
> +    if (*y >= pic_h)
> +        *y = 0;
> +
> +    if (*w == 0 || *w + *x > pic_w)
> +        *w = pic_w - *x;
> +    if (*h == 0 || *h + *y > pic_h)
> +        *h = pic_h - *y;
> +}
> +
> +static int config_input(AVFilterLink *inlink)
> +{
> +    AVFilterContext *ctx = inlink->dst;
> +    OCRContext *s = ctx->priv;
> +
> +    s->x_in = s->x;
> +    s->y_in = s->y;
> +    s->w_in = s->w;
> +    s->h_in = s->h;
> +    check_fix(&s->x_in, &s->y_in, &s->w_in, &s->h_in, inlink->w,
> inlink->h);
> +    if ( s->x_in != s->x || s->y_in != s->y  ||
> +        (s->w != 0 && s->w_in != s->w) || (s->h != 0 && s->h_in !=
> s->h)) {
> +        av_log(s, AV_LOG_WARNING, "config error, subregion changed
> to "
> +                                  "x=%d, y=%d, w=%d, h=%d\n",
> +                                  s->x_in, s->y_in, s->w_in,
> s->h_in);
> +    }
> +
> +    return 0;
> +}
> +
>  static int filter_frame(AVFilterLink *inlink, AVFrame *in)
>  {
>      AVDictionary **metadata = &in->metadata;
> @@ -102,8 +143,9 @@ static int filter_frame(AVFilterLink *inlink,
> AVFrame *in) char *result;
>      int *confs;
>  
> +    // TODO(vacing): support expression
>      result = TessBaseAPIRect(s->tess, in->data[0], 1,
> -                             in->linesize[0], 0, 0, in->width,
> in->height);
> +                             in->linesize[0], s->x_in, s->y_in,
> s->w_in, s->h_in); confs = TessBaseAPIAllWordConfidences(s->tess);
>      av_dict_set(metadata, "lavfi.ocr.text", result, 0);
>      for (int i = 0; confs[i] != -1; i++) {
> @@ -134,6 +176,7 @@ static const AVFilterPad ocr_inputs[] = {
>          .name         = "default",
>          .type         = AVMEDIA_TYPE_VIDEO,
>          .filter_frame = filter_frame,
> +        .config_props = config_input,
>      },
>      { NULL }
>  };



Regards,
Lingjiang Fang
Liu Steven July 21, 2021, 1:38 a.m. UTC | #2
> 2021年7月12日 下午1:14,Lingjiang Fang <vacingfang@foxmail.com> 写道:
> 
> follow comments from Steven Liu
> ---
> doc/filters.texi     |  8 ++++++++
> libavfilter/vf_ocr.c | 45 +++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 52 insertions(+), 1 deletion(-)
> 
> diff --git a/doc/filters.texi b/doc/filters.texi
> index d991c06628..f41ba0ce46 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -15457,6 +15457,14 @@ Set character whitelist.
> 
> @item blacklist
> Set character blacklist.
> +
> +@item x, y
> +Set top-left corner of the subregion, in pixels, default is (0,0).
> +
> +@item w, h
> +Set width and height of the subregion, in pixels,
> +default is the bottom-right part from given top-left corner.
> +
> @end table
> 
> The filter exports recognized text as the frame metadata @code{lavfi.ocr.text}.
> diff --git a/libavfilter/vf_ocr.c b/libavfilter/vf_ocr.c
> index 6de474025a..55f04b6592 100644
> --- a/libavfilter/vf_ocr.c
> +++ b/libavfilter/vf_ocr.c
> @@ -33,6 +33,8 @@ typedef struct OCRContext {
>     char *language;
>     char *whitelist;
>     char *blacklist;
> +    int x, y, x_in, y_in;
> +    int w, h, w_in, h_in;
> 
>     TessBaseAPI *tess;
> } OCRContext;
> @@ -45,6 +47,10 @@ static const AVOption ocr_options[] = {
>     { "language",  "set language",            OFFSET(language),  AV_OPT_TYPE_STRING, {.str="eng"}, 0, 0, FLAGS },
>     { "whitelist", "set character whitelist", OFFSET(whitelist), AV_OPT_TYPE_STRING, {.str="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.:;,-+_!?\"'[]{}()<>|/\\=*&%$#@!~ "}, 0, 0, FLAGS },
>     { "blacklist", "set character blacklist", OFFSET(blacklist), AV_OPT_TYPE_STRING, {.str=""},    0, 0, FLAGS },
> +    { "x",         "top x of sub region",     OFFSET(x),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
> +    { "y",         "top y of sub region",     OFFSET(y),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
> +    { "w",         "width of sub region",     OFFSET(w),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
> +    { "h",         "height of sub region",    OFFSET(h),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
>     { NULL }
> };
> 
> @@ -93,6 +99,41 @@ static int query_formats(AVFilterContext *ctx)
>     return ff_set_common_formats(ctx, fmts_list);
> }
> 
> +static void check_fix(int *x, int *y, int *w, int *h, int pic_w, int pic_h)
> +{
> +    // 0 <= x < pic_w
> +    if (*x >= pic_w)
> +        *x = 0;
> +    // 0 <= y < pic_h
> +    if (*y >= pic_h)
> +        *y = 0;
> +
> +    if (*w == 0 || *w + *x > pic_w)
> +        *w = pic_w - *x;
> +    if (*h == 0 || *h + *y > pic_h)
> +        *h = pic_h - *y;
> +}
> +
> +static int config_input(AVFilterLink *inlink)
> +{
> +    AVFilterContext *ctx = inlink->dst;
> +    OCRContext *s = ctx->priv;
> +
> +    s->x_in = s->x;
> +    s->y_in = s->y;
> +    s->w_in = s->w;
> +    s->h_in = s->h;
> +    check_fix(&s->x_in, &s->y_in, &s->w_in, &s->h_in, inlink->w, inlink->h);
> +    if ( s->x_in != s->x || s->y_in != s->y  ||
> +        (s->w != 0 && s->w_in != s->w) || (s->h != 0 && s->h_in != s->h)) {
> +        av_log(s, AV_LOG_WARNING, "config error, subregion changed to "
> +                                  "x=%d, y=%d, w=%d, h=%d\n",
> +                                  s->x_in, s->y_in, s->w_in, s->h_in);
> +    }
> +
> +    return 0;
> +}
> +
> static int filter_frame(AVFilterLink *inlink, AVFrame *in)
> {
>     AVDictionary **metadata = &in->metadata;
> @@ -102,8 +143,9 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
>     char *result;
>     int *confs;
> 
> +    // TODO(vacing): support expression
What’s vacing? 
>     result = TessBaseAPIRect(s->tess, in->data[0], 1,
> -                             in->linesize[0], 0, 0, in->width, in->height);
> +                             in->linesize[0], s->x_in, s->y_in, s->w_in, s->h_in);
>     confs = TessBaseAPIAllWordConfidences(s->tess);
>     av_dict_set(metadata, "lavfi.ocr.text", result, 0);
>     for (int i = 0; confs[i] != -1; i++) {
> @@ -134,6 +176,7 @@ static const AVFilterPad ocr_inputs[] = {
>         .name         = "default",
>         .type         = AVMEDIA_TYPE_VIDEO,
>         .filter_frame = filter_frame,
> +        .config_props = config_input,
>     },
>     { NULL }
> };
> -- 
> 2.29.2
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> 

Thanks

Steven Liu
Lingjiang Fang July 22, 2021, 2:37 a.m. UTC | #3
On Wed, 21 Jul 2021 09:38:51 +0800
Steven Liu <lq@chinaffmpeg.org> wrote:

> 
> 
> > 2021年7月12日 下午1:14,Lingjiang Fang <vacingfang@foxmail.com>
> > 写道:
> > 
> > follow comments from Steven Liu
> > ---
> > doc/filters.texi     |  8 ++++++++
> > libavfilter/vf_ocr.c | 45
> > +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 52
> > insertions(+), 1 deletion(-)
> > 
> > diff --git a/doc/filters.texi b/doc/filters.texi
> > index d991c06628..f41ba0ce46 100644
> > --- a/doc/filters.texi
> > +++ b/doc/filters.texi
> > @@ -15457,6 +15457,14 @@ Set character whitelist.
> > 
> > @item blacklist
> > Set character blacklist.
> > +
> > +@item x, y
> > +Set top-left corner of the subregion, in pixels, default is (0,0).
> > +
> > +@item w, h
> > +Set width and height of the subregion, in pixels,
> > +default is the bottom-right part from given top-left corner.
> > +
> > @end table
> > 
> > The filter exports recognized text as the frame metadata
> > @code{lavfi.ocr.text}. diff --git a/libavfilter/vf_ocr.c
> > b/libavfilter/vf_ocr.c index 6de474025a..55f04b6592 100644
> > --- a/libavfilter/vf_ocr.c
> > +++ b/libavfilter/vf_ocr.c
> > @@ -33,6 +33,8 @@ typedef struct OCRContext {
> >     char *language;
> >     char *whitelist;
> >     char *blacklist;
> > +    int x, y, x_in, y_in;
> > +    int w, h, w_in, h_in;
> > 
> >     TessBaseAPI *tess;
> > } OCRContext;
> > @@ -45,6 +47,10 @@ static const AVOption ocr_options[] = {
> >     { "language",  "set language",            OFFSET(language),
> > AV_OPT_TYPE_STRING, {.str="eng"}, 0, 0, FLAGS }, { "whitelist",
> > "set character whitelist", OFFSET(whitelist), AV_OPT_TYPE_STRING,
> > {.str="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.:;,-+_!?\"'[]{}()<>|/\\=*&%$#@!~
> > "}, 0, 0, FLAGS }, { "blacklist", "set character blacklist",
> > OFFSET(blacklist), AV_OPT_TYPE_STRING, {.str=""},    0, 0, FLAGS },
> > +    { "x",         "top x of sub region",     OFFSET(x),
> > AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
> > +    { "y",         "top y of sub region",     OFFSET(y),
> > AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
> > +    { "w",         "width of sub region",     OFFSET(w),
> > AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
> > +    { "h",         "height of sub region",    OFFSET(h),
> > AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS }, { NULL }
> > };
> > 
> > @@ -93,6 +99,41 @@ static int query_formats(AVFilterContext *ctx)
> >     return ff_set_common_formats(ctx, fmts_list);
> > }
> > 
> > +static void check_fix(int *x, int *y, int *w, int *h, int pic_w,
> > int pic_h) +{
> > +    // 0 <= x < pic_w
> > +    if (*x >= pic_w)
> > +        *x = 0;
> > +    // 0 <= y < pic_h
> > +    if (*y >= pic_h)
> > +        *y = 0;
> > +
> > +    if (*w == 0 || *w + *x > pic_w)
> > +        *w = pic_w - *x;
> > +    if (*h == 0 || *h + *y > pic_h)
> > +        *h = pic_h - *y;
> > +}
> > +
> > +static int config_input(AVFilterLink *inlink)
> > +{
> > +    AVFilterContext *ctx = inlink->dst;
> > +    OCRContext *s = ctx->priv;
> > +
> > +    s->x_in = s->x;
> > +    s->y_in = s->y;
> > +    s->w_in = s->w;
> > +    s->h_in = s->h;
> > +    check_fix(&s->x_in, &s->y_in, &s->w_in, &s->h_in, inlink->w,
> > inlink->h);
> > +    if ( s->x_in != s->x || s->y_in != s->y  ||
> > +        (s->w != 0 && s->w_in != s->w) || (s->h != 0 && s->h_in !=
> > s->h)) {
> > +        av_log(s, AV_LOG_WARNING, "config error, subregion changed
> > to "
> > +                                  "x=%d, y=%d, w=%d, h=%d\n",
> > +                                  s->x_in, s->y_in, s->w_in,
> > s->h_in);
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > static int filter_frame(AVFilterLink *inlink, AVFrame *in)
> > {
> >     AVDictionary **metadata = &in->metadata;
> > @@ -102,8 +143,9 @@ static int filter_frame(AVFilterLink *inlink,
> > AVFrame *in) char *result;
> >     int *confs;
> > 
> > +    // TODO(vacing): support expression
> What’s vacing? 
"vacing" is my nickname, sorry to make misunderstanding
maybe I should put my email address here, should I?

> >     result = TessBaseAPIRect(s->tess, in->data[0], 1,
> > -                             in->linesize[0], 0, 0, in->width,
> > in->height);
> > +                             in->linesize[0], s->x_in, s->y_in,
> > s->w_in, s->h_in); confs = TessBaseAPIAllWordConfidences(s->tess);
> >     av_dict_set(metadata, "lavfi.ocr.text", result, 0);
> >     for (int i = 0; confs[i] != -1; i++) {
> > @@ -134,6 +176,7 @@ static const AVFilterPad ocr_inputs[] = {
> >         .name         = "default",
> >         .type         = AVMEDIA_TYPE_VIDEO,
> >         .filter_frame = filter_frame,
> > +        .config_props = config_input,
> >     },
> >     { NULL }
> > };
> > -- 
> > 2.29.2
> > 
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> > 
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> > 
> 
> Thanks
> 
> Steven Liu
> 
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".



Regards,
Lingjiang Fang
diff mbox series

Patch

diff --git a/doc/filters.texi b/doc/filters.texi
index d991c06628..f41ba0ce46 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -15457,6 +15457,14 @@  Set character whitelist.
 
 @item blacklist
 Set character blacklist.
+
+@item x, y
+Set the top-left corner of the subregion, in pixels. Default is (0,0).
+
+@item w, h
+Set the width and height of the subregion, in pixels. By default the
+subregion extends from the top-left corner to the bottom-right of the frame.
+
 @end table
 
 The filter exports recognized text as the frame metadata @code{lavfi.ocr.text}.
diff --git a/libavfilter/vf_ocr.c b/libavfilter/vf_ocr.c
index 6de474025a..55f04b6592 100644
--- a/libavfilter/vf_ocr.c
+++ b/libavfilter/vf_ocr.c
@@ -33,6 +33,8 @@  typedef struct OCRContext {
     char *language;
     char *whitelist;
     char *blacklist;
+    int x, y, x_in, y_in;
+    int w, h, w_in, h_in;
 
     TessBaseAPI *tess;
 } OCRContext;
@@ -45,6 +47,10 @@  static const AVOption ocr_options[] = {
     { "language",  "set language",            OFFSET(language),  AV_OPT_TYPE_STRING, {.str="eng"}, 0, 0, FLAGS },
     { "whitelist", "set character whitelist", OFFSET(whitelist), AV_OPT_TYPE_STRING, {.str="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.:;,-+_!?\"'[]{}()<>|/\\=*&%$#@!~ "}, 0, 0, FLAGS },
     { "blacklist", "set character blacklist", OFFSET(blacklist), AV_OPT_TYPE_STRING, {.str=""},    0, 0, FLAGS },
+    { "x",         "top x of sub region",     OFFSET(x),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
+    { "y",         "top y of sub region",     OFFSET(y),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
+    { "w",         "width of sub region",     OFFSET(w),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
+    { "h",         "height of sub region",    OFFSET(h),         AV_OPT_TYPE_INT,    {.i64=0},     0, INT_MAX, FLAGS },
     { NULL }
 };
 
@@ -93,6 +99,41 @@  static int query_formats(AVFilterContext *ctx)
     return ff_set_common_formats(ctx, fmts_list);
 }
 
+static void check_fix(int *x, int *y, int *w, int *h, int pic_w, int pic_h)
+{
+    // Fit the subregion into the picture; an x at or past pic_w resets to 0.
+    if (*x >= pic_w)
+        *x = 0;
+    // Likewise keep 0 <= y < pic_h.
+    if (*y >= pic_h)
+        *y = 0;
+    // 0 means "up to the edge"; compare by subtraction so w + x cannot overflow.
+    if (*w == 0 || *w > pic_w - *x)
+        *w = pic_w - *x;
+    if (*h == 0 || *h > pic_h - *y)
+        *h = pic_h - *y;
+}
+
+static int config_input(AVFilterLink *inlink)
+{
+    AVFilterContext *ctx = inlink->dst;
+    OCRContext *s = ctx->priv;
+    // Fit working copies (*_in) of the x/y/w/h options to the input link size.
+    s->x_in = s->x;
+    s->y_in = s->y;
+    s->w_in = s->w;
+    s->h_in = s->h;
+    check_fix(&s->x_in, &s->y_in, &s->w_in, &s->h_in, inlink->w, inlink->h);
+    if ( s->x_in != s->x || s->y_in != s->y  || // w/h of 0 never warn
+        (s->w != 0 && s->w_in != s->w) || (s->h != 0 && s->h_in != s->h)) {
+        av_log(s, AV_LOG_WARNING, "config error, subregion changed to "
+                                  "x=%d, y=%d, w=%d, h=%d\n",
+                                  s->x_in, s->y_in, s->w_in, s->h_in);
+    }
+
+    return 0;
+}
+
 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 {
     AVDictionary **metadata = &in->metadata;
@@ -102,8 +143,9 @@  static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     char *result;
     int *confs;
 
+    // TODO(vacing): support expression
     result = TessBaseAPIRect(s->tess, in->data[0], 1,
-                             in->linesize[0], 0, 0, in->width, in->height);
+                             in->linesize[0], s->x_in, s->y_in, s->w_in, s->h_in);
     confs = TessBaseAPIAllWordConfidences(s->tess);
     av_dict_set(metadata, "lavfi.ocr.text", result, 0);
     for (int i = 0; confs[i] != -1; i++) {
@@ -134,6 +176,7 @@  static const AVFilterPad ocr_inputs[] = {
         .name         = "default",
         .type         = AVMEDIA_TYPE_VIDEO,
         .filter_frame = filter_frame,
+        .config_props = config_input,
     },
     { NULL }
 };