diff mbox series

[FFmpeg-devel,3/3] libavfilter: vf_drawtext filter support draw text with detection bounding boxes in side_data

Message ID 20210514084702.21273-3-ting.fu@intel.com
State Accepted
Commit 7a879cce3793a1314ce50cef4b75671f5e0c0219
Headers show
Series [FFmpeg-devel,1/3] lavfi/drawbox: refine code | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Fu, Ting May 14, 2021, 8:47 a.m. UTC
This feature can be used with dnn detection by setting vf_drawtext's option
text_source=side_data_detection_bboxes, for example:
./ffmpeg -i face.jpeg -vf dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:\
input=data:output=detection_out:labels=face-detection-adas-0001.label,drawbox=box_source=\
side_data_detection_bboxes,drawtext=text_source=side_data_detection_bboxes:fontcolor=green:\
fontsize=40 -y face_detect.jpeg
Please note, the default fontsize of vf_drawtext is 12, which may be too
small to be seen clearly.

Signed-off-by: Ting Fu <ting.fu@intel.com>
---
 doc/filters.texi          |  8 ++++
 libavfilter/vf_drawtext.c | 77 ++++++++++++++++++++++++++++++++++++---
 2 files changed, 79 insertions(+), 6 deletions(-)

Comments

Guo, Yejun May 20, 2021, 3:04 a.m. UTC | #1
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Ting
> Fu
> Sent: 2021年5月14日 16:47
> To: ffmpeg-devel@ffmpeg.org
> Subject: [FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter support
> draw text with detection bounding boxes in side_data
> 
> This feature can be used with dnn detection by setting vf_drawtext's option
> text_source=side_data_detection_bboxes, for example:
> ./ffmpeg -i face.jpeg -vf
> dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:\
> input=data:output=detection_out:labels=face-detection-adas-0001.label,dra
> wbox=box_source=
> side_data_detection_bboxes,drawtext=text_source=side_data_detection_bbo
> xes:fontcolor=green:\
> fontsize=40, -y face_detect.jpeg
> Please note, the default fontsize of vf_drawtext is 12, which may be too
> small to be seen clearly.
> 
> Signed-off-by: Ting Fu <ting.fu@intel.com>
> ---
>  doc/filters.texi          |  8 ++++
>  libavfilter/vf_drawtext.c | 77
> ++++++++++++++++++++++++++++++++++++---
>  2 files changed, 79 insertions(+), 6 deletions(-)
> 
> diff --git a/doc/filters.texi b/doc/filters.texi
> index f2ac8c4cc8..d10e6de03d 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -10788,6 +10788,14 @@ parameter @var{text}.
> 
>  If both @var{text} and @var{textfile} are specified, an error is thrown.
> 
> +@item text_source
> +Text source should be set as side_data_detection_bboxes if you want to use
> text data in
> +detection bboxes of side data.
> +
> +If text source is set, @var{text} and @var{textfile} will be ignored and still
> use
> +text data in detection bboxes of side data. So please do not use this
> parameter
> +if you are not sure about the text source.
> +
>  @item reload
>  If set to 1, the @var{textfile} will be reloaded before each frame.
>  Be sure to update it atomically, or it may be read partially, or even fail.
> diff --git a/libavfilter/vf_drawtext.c b/libavfilter/vf_drawtext.c
> index 7ea057b812..382d589e26 100644
> --- a/libavfilter/vf_drawtext.c
> +++ b/libavfilter/vf_drawtext.c
> @@ -55,6 +55,7 @@
>  #include "libavutil/time_internal.h"
>  #include "libavutil/tree.h"
>  #include "libavutil/lfg.h"
> +#include "libavutil/detection_bbox.h"
>  #include "avfilter.h"
>  #include "drawutils.h"
>  #include "formats.h"
> @@ -199,6 +200,8 @@ typedef struct DrawTextContext {
>      int tc24hmax;                   ///< 1 if timecode is wrapped to 24
> hours, 0 otherwise
>      int reload;                     ///< reload text file for each frame
>      int start_number;               ///< starting frame number for
> n/frame_num var
> +    char *text_source_string;       ///< the string to specify text data
> source
> +    enum AVFrameSideDataType text_source;
>  #if CONFIG_LIBFRIBIDI
>      int text_shaping;               ///< 1 to shape the text before
> drawing it
>  #endif
> @@ -246,6 +249,7 @@ static const AVOption drawtext_options[]= {
>      { "alpha",       "apply alpha while rendering", OFFSET(a_expr),
> AV_OPT_TYPE_STRING, { .str = "1"     },          .flags = FLAGS },
>      {"fix_bounds", "check and fix text coords to avoid clipping",
> OFFSET(fix_bounds), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS},
>      {"start_number", "start frame number for n/frame_num variable",
> OFFSET(start_number), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS},
> +    {"text_source", "the source of text", OFFSET(text_source_string),
> AV_OPT_TYPE_STRING, {.str=NULL}, 0, 1, FLAGS },
> 
>  #if CONFIG_LIBFRIBIDI
>      {"text_shaping", "attempt to shape text before drawing",
> OFFSET(text_shaping), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, FLAGS},
> @@ -690,6 +694,16 @@ out:
>  }
>  #endif
> 
> +static enum AVFrameSideDataType text_source_string_parse(const char
> *text_source_string)
> +{
> +    av_assert0(text_source_string);
> +    if (!strcmp(text_source_string, "side_data_detection_bboxes")) {
> +        return AV_FRAME_DATA_DETECTION_BBOXES;
> +    } else {
> +        return AVERROR(EINVAL);
> +    }
> +}
> +
>  static av_cold int init(AVFilterContext *ctx)
>  {
>      int err;
> @@ -731,9 +745,28 @@ static av_cold int init(AVFilterContext *ctx)
>              s->text = av_strdup("");
>      }
> 
> +    if (s->text_source_string) {
> +        s->text_source = text_source_string_parse(s->text_source_string);
> +        if ((int)s->text_source < 0) {
> +            av_log(ctx, AV_LOG_ERROR, "Error text source: %s\n",
> s->text_source_string);
> +            return AVERROR(EINVAL);
> +        }
> +    }
> +
> +    if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
> +        if (s->text) {
> +            av_log(ctx, AV_LOG_WARNING, "Multiple texts provided, will
> use text_source only\n");
> +            av_free(s->text);
> +        }
> +        s->text =
> av_mallocz(AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE *
> +                             (AV_NUM_DETECTION_BBOX_CLASSIFY +
> 1));
> +        if (!s->text)
> +            return AVERROR(ENOMEM);
> +    }
> +
>      if (!s->text) {
>          av_log(ctx, AV_LOG_ERROR,
> -               "Either text, a valid file or a timecode must be
> provided\n");
> +               "Either text, a valid file, a timecode or text source must be
> provided\n");
>          return AVERROR(EINVAL);
>      }
> 
> @@ -1440,10 +1473,15 @@ continue_on_invalid2:
> 
>      s->var_values[VAR_LINE_H] = s->var_values[VAR_LH] = s->max_glyph_h;
> 
> -    s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values,
> &s->prng);
> -    s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values,
> &s->prng);
> -    /* It is necessary if x is expressed from y  */
> -    s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values,
> &s->prng);
> +    if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
> +        s->var_values[VAR_X] = s->x;
> +        s->var_values[VAR_Y] = s->y;
> +    } else {
> +        s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr,
> s->var_values, &s->prng);
> +        s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr,
> s->var_values, &s->prng);
> +        /* It is necessary if x is expressed from y  */
> +        s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr,
> s->var_values, &s->prng);
> +    }
> 
>      update_alpha(s);
>      update_color_with_alpha(s, &fontcolor  , s->fontcolor  );
> @@ -1511,6 +1549,21 @@ static int filter_frame(AVFilterLink *inlink,
> AVFrame *frame)
>      AVFilterLink *outlink = ctx->outputs[0];
>      DrawTextContext *s = ctx->priv;
>      int ret;
> +    const AVDetectionBBoxHeader *header = NULL;
> +    const AVDetectionBBox *bbox;
> +    AVFrameSideData *sd;
> +    int loop = 1;
> +
> +    if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES && sd) {
> +        sd = av_frame_get_side_data(frame,
> AV_FRAME_DATA_DETECTION_BBOXES);
> +        if (sd) {
> +            header = (AVDetectionBBoxHeader *)sd->data;
> +            loop = header->nb_bboxes;
> +        } else {
> +            av_log(s, AV_LOG_WARNING, "No detection bboxes.\n");
> +            return ff_filter_frame(outlink, frame);
> +        }
> +    }
> 
>      if (s->reload) {
>          if ((ret = load_textfile(ctx)) < 0) {
> @@ -1536,7 +1589,19 @@ static int filter_frame(AVFilterLink *inlink,
> AVFrame *frame)
>      s->var_values[VAR_PKT_SIZE] = frame->pkt_size;
>      s->metadata = frame->metadata;
> 
> -    draw_text(ctx, frame, frame->width, frame->height);
> +    for (int i = 0; i < loop; i++) {
> +        if (header) {
> +            bbox = av_get_detection_bbox(header, i);
> +            strcpy(s->text, bbox->detect_label);
> +            for (int j = 0; j < bbox->classify_count; j++) {
> +                strcat(s->text, ", ");
> +                strcat(s->text, bbox->classify_labels[j]);
> +            }
> +            s->x = bbox->x;
> +            s->y = bbox->y - s->fontsize;
> +        }
> +        draw_text(ctx, frame, frame->width, frame->height);
> +    }
> 
>      av_log(ctx, AV_LOG_DEBUG, "n:%d t:%f text_w:%d text_h:%d x:%d
> y:%d\n",
>             (int)s->var_values[VAR_N], s->var_values[VAR_T],
> --

any comment? thanks.

A new option is added into vf_drawbox and vf_drawtext to visualize the
data from detection bounding boxes in the side data of AVFrame.
Guo, Yejun May 25, 2021, 1:08 a.m. UTC | #2
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Guo,
> Yejun
> Sent: 2021年5月20日 11:04
> To: FFmpeg development discussions and patches
> <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter
> support draw text with detection bounding boxes in side_data
> 
> 
> 
> > -----Original Message-----
> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Ting
> > Fu
> > Sent: 2021年5月14日 16:47
> > To: ffmpeg-devel@ffmpeg.org
> > Subject: [FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter support
> > draw text with detection bounding boxes in side_data
> >
> > This feature can be used with dnn detection by setting vf_drawtext's option
> > text_source=side_data_detection_bboxes, for example:
> > ./ffmpeg -i face.jpeg -vf
> >
> dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:\
> >
> input=data:output=detection_out:labels=face-detection-adas-0001.label,dra
> > wbox=box_source=
> >
> side_data_detection_bboxes,drawtext=text_source=side_data_detection_bbo
> > xes:fontcolor=green:\
> > fontsize=40, -y face_detect.jpeg
> > Please note, the default fontsize of vf_drawtext is 12, which may be too
> > small to be seen clearly.
> >
> > Signed-off-by: Ting Fu <ting.fu@intel.com>
> > ---
> >  doc/filters.texi          |  8 ++++
> >  libavfilter/vf_drawtext.c | 77
> > ++++++++++++++++++++++++++++++++++++---
> >  2 files changed, 79 insertions(+), 6 deletions(-)
> >
> > diff --git a/doc/filters.texi b/doc/filters.texi
> > index f2ac8c4cc8..d10e6de03d 100644
> > --- a/doc/filters.texi
> > +++ b/doc/filters.texi
> > @@ -10788,6 +10788,14 @@ parameter @var{text}.
> >
> >  If both @var{text} and @var{textfile} are specified, an error is thrown.
> >
> > +@item text_source
> > +Text source should be set as side_data_detection_bboxes if you want to
> use
> > text data in
> > +detection bboxes of side data.
> > +
> > +If text source is set, @var{text} and @var{textfile} will be ignored and still
> > use
> > +text data in detection bboxes of side data. So please do not use this
> > parameter
> > +if you are not sure about the text source.
> > +
> >  @item reload
> >  If set to 1, the @var{textfile} will be reloaded before each frame.
> >  Be sure to update it atomically, or it may be read partially, or even fail.
> > diff --git a/libavfilter/vf_drawtext.c b/libavfilter/vf_drawtext.c
> > index 7ea057b812..382d589e26 100644
> > --- a/libavfilter/vf_drawtext.c
> > +++ b/libavfilter/vf_drawtext.c
> > @@ -55,6 +55,7 @@
> >  #include "libavutil/time_internal.h"
> >  #include "libavutil/tree.h"
> >  #include "libavutil/lfg.h"
> > +#include "libavutil/detection_bbox.h"
> >  #include "avfilter.h"
> >  #include "drawutils.h"
> >  #include "formats.h"
> > @@ -199,6 +200,8 @@ typedef struct DrawTextContext {
> >      int tc24hmax;                   ///< 1 if timecode is wrapped to
> 24
> > hours, 0 otherwise
> >      int reload;                     ///< reload text file for each frame
> >      int start_number;               ///< starting frame number for
> > n/frame_num var
> > +    char *text_source_string;       ///< the string to specify text data
> > source
> > +    enum AVFrameSideDataType text_source;
> >  #if CONFIG_LIBFRIBIDI
> >      int text_shaping;               ///< 1 to shape the text before
> > drawing it
> >  #endif
> > @@ -246,6 +249,7 @@ static const AVOption drawtext_options[]= {
> >      { "alpha",       "apply alpha while rendering", OFFSET(a_expr),
> > AV_OPT_TYPE_STRING, { .str = "1"     },          .flags = FLAGS },
> >      {"fix_bounds", "check and fix text coords to avoid clipping",
> > OFFSET(fix_bounds), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS},
> >      {"start_number", "start frame number for n/frame_num variable",
> > OFFSET(start_number), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS},
> > +    {"text_source", "the source of text", OFFSET(text_source_string),
> > AV_OPT_TYPE_STRING, {.str=NULL}, 0, 1, FLAGS },
> >
> >  #if CONFIG_LIBFRIBIDI
> >      {"text_shaping", "attempt to shape text before drawing",
> > OFFSET(text_shaping), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, FLAGS},
> > @@ -690,6 +694,16 @@ out:
> >  }
> >  #endif
> >
> > +static enum AVFrameSideDataType text_source_string_parse(const char
> > *text_source_string)
> > +{
> > +    av_assert0(text_source_string);
> > +    if (!strcmp(text_source_string, "side_data_detection_bboxes")) {
> > +        return AV_FRAME_DATA_DETECTION_BBOXES;
> > +    } else {
> > +        return AVERROR(EINVAL);
> > +    }
> > +}
> > +
> >  static av_cold int init(AVFilterContext *ctx)
> >  {
> >      int err;
> > @@ -731,9 +745,28 @@ static av_cold int init(AVFilterContext *ctx)
> >              s->text = av_strdup("");
> >      }
> >
> > +    if (s->text_source_string) {
> > +        s->text_source = text_source_string_parse(s->text_source_string);
> > +        if ((int)s->text_source < 0) {
> > +            av_log(ctx, AV_LOG_ERROR, "Error text source: %s\n",
> > s->text_source_string);
> > +            return AVERROR(EINVAL);
> > +        }
> > +    }
> > +
> > +    if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
> > +        if (s->text) {
> > +            av_log(ctx, AV_LOG_WARNING, "Multiple texts provided, will
> > use text_source only\n");
> > +            av_free(s->text);
> > +        }
> > +        s->text =
> > av_mallocz(AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE *
> > +                             (AV_NUM_DETECTION_BBOX_CLASSIFY +
> > 1));
> > +        if (!s->text)
> > +            return AVERROR(ENOMEM);
> > +    }
> > +
> >      if (!s->text) {
> >          av_log(ctx, AV_LOG_ERROR,
> > -               "Either text, a valid file or a timecode must be
> > provided\n");
> > +               "Either text, a valid file, a timecode or text source must
> be
> > provided\n");
> >          return AVERROR(EINVAL);
> >      }
> >
> > @@ -1440,10 +1473,15 @@ continue_on_invalid2:
> >
> >      s->var_values[VAR_LINE_H] = s->var_values[VAR_LH] =
> s->max_glyph_h;
> >
> > -    s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values,
> > &s->prng);
> > -    s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values,
> > &s->prng);
> > -    /* It is necessary if x is expressed from y  */
> > -    s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values,
> > &s->prng);
> > +    if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
> > +        s->var_values[VAR_X] = s->x;
> > +        s->var_values[VAR_Y] = s->y;
> > +    } else {
> > +        s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr,
> > s->var_values, &s->prng);
> > +        s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr,
> > s->var_values, &s->prng);
> > +        /* It is necessary if x is expressed from y  */
> > +        s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr,
> > s->var_values, &s->prng);
> > +    }
> >
> >      update_alpha(s);
> >      update_color_with_alpha(s, &fontcolor  , s->fontcolor  );
> > @@ -1511,6 +1549,21 @@ static int filter_frame(AVFilterLink *inlink,
> > AVFrame *frame)
> >      AVFilterLink *outlink = ctx->outputs[0];
> >      DrawTextContext *s = ctx->priv;
> >      int ret;
> > +    const AVDetectionBBoxHeader *header = NULL;
> > +    const AVDetectionBBox *bbox;
> > +    AVFrameSideData *sd;
> > +    int loop = 1;
> > +
> > +    if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES && sd) {
> > +        sd = av_frame_get_side_data(frame,
> > AV_FRAME_DATA_DETECTION_BBOXES);
> > +        if (sd) {
> > +            header = (AVDetectionBBoxHeader *)sd->data;
> > +            loop = header->nb_bboxes;
> > +        } else {
> > +            av_log(s, AV_LOG_WARNING, "No detection bboxes.\n");
> > +            return ff_filter_frame(outlink, frame);
> > +        }
> > +    }
> >
> >      if (s->reload) {
> >          if ((ret = load_textfile(ctx)) < 0) {
> > @@ -1536,7 +1589,19 @@ static int filter_frame(AVFilterLink *inlink,
> > AVFrame *frame)
> >      s->var_values[VAR_PKT_SIZE] = frame->pkt_size;
> >      s->metadata = frame->metadata;
> >
> > -    draw_text(ctx, frame, frame->width, frame->height);
> > +    for (int i = 0; i < loop; i++) {
> > +        if (header) {
> > +            bbox = av_get_detection_bbox(header, i);
> > +            strcpy(s->text, bbox->detect_label);
> > +            for (int j = 0; j < bbox->classify_count; j++) {
> > +                strcat(s->text, ", ");
> > +                strcat(s->text, bbox->classify_labels[j]);
> > +            }
> > +            s->x = bbox->x;
> > +            s->y = bbox->y - s->fontsize;
> > +        }
> > +        draw_text(ctx, frame, frame->width, frame->height);
> > +    }
> >
> >      av_log(ctx, AV_LOG_DEBUG, "n:%d t:%f text_w:%d text_h:%d x:%d
> > y:%d\n",
> >             (int)s->var_values[VAR_N], s->var_values[VAR_T],
> > --
> 
> any comment? thanks.
> 
> A new option is added into vf_drawbox and vf_drawtext to visualize the
> data from detection bounding boxes in the side data of AVFrame.
> 

will push tomorrow if there's no objection.
Guo, Yejun May 26, 2021, 1:14 a.m. UTC | #3
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Guo,
> Yejun
> Sent: 2021年5月25日 9:08
> To: FFmpeg development discussions and patches
> <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter
> support draw text with detection bounding boxes in side_data
> 
> 
> 
> > -----Original Message-----
> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Guo,
> > Yejun
> > Sent: 2021年5月20日 11:04
> > To: FFmpeg development discussions and patches
> > <ffmpeg-devel@ffmpeg.org>
> > Subject: Re: [FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter
> > support draw text with detection bounding boxes in side_data
> >
> >
> >
> > > -----Original Message-----
> > > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Ting
> > > Fu
> > > Sent: 2021年5月14日 16:47
> > > To: ffmpeg-devel@ffmpeg.org
> > > Subject: [FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter support
> > > draw text with detection bounding boxes in side_data
> > >
> > > This feature can be used with dnn detection by setting vf_drawtext's
> option
> > > text_source=side_data_detection_bboxes, for example:
> > > ./ffmpeg -i face.jpeg -vf
> > >
> >
> dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:\
> > >
> >
> input=data:output=detection_out:labels=face-detection-adas-0001.label,dra
> > > wbox=box_source=
> > >
> >
> side_data_detection_bboxes,drawtext=text_source=side_data_detection_bbo
> > > xes:fontcolor=green:\
> > > fontsize=40, -y face_detect.jpeg
> > > Please note, the default fontsize of vf_drawtext is 12, which may be too
> > > small to be seen clearly.
> > >
> > > Signed-off-by: Ting Fu <ting.fu@intel.com>
> > > ---
> > >  doc/filters.texi          |  8 ++++
> > >  libavfilter/vf_drawtext.c | 77
> > > ++++++++++++++++++++++++++++++++++++---
> > >  2 files changed, 79 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/doc/filters.texi b/doc/filters.texi
> > > index f2ac8c4cc8..d10e6de03d 100644
> > > --- a/doc/filters.texi
> > > +++ b/doc/filters.texi
> > > @@ -10788,6 +10788,14 @@ parameter @var{text}.
> > >
> > >  If both @var{text} and @var{textfile} are specified, an error is thrown.
> > >
> > > +@item text_source
> > > +Text source should be set as side_data_detection_bboxes if you want to
> > use
> > > text data in
> > > +detection bboxes of side data.
> > > +
> > > +If text source is set, @var{text} and @var{textfile} will be ignored and still
> > > use
> > > +text data in detection bboxes of side data. So please do not use this
> > > parameter
> > > +if you are not sure about the text source.
> > > +
> > >  @item reload
> > >  If set to 1, the @var{textfile} will be reloaded before each frame.
> > >  Be sure to update it atomically, or it may be read partially, or even fail.
> > > diff --git a/libavfilter/vf_drawtext.c b/libavfilter/vf_drawtext.c
> > > index 7ea057b812..382d589e26 100644
> > > --- a/libavfilter/vf_drawtext.c
> > > +++ b/libavfilter/vf_drawtext.c
> > > @@ -55,6 +55,7 @@
> > >  #include "libavutil/time_internal.h"
> > >  #include "libavutil/tree.h"
> > >  #include "libavutil/lfg.h"
> > > +#include "libavutil/detection_bbox.h"
> > >  #include "avfilter.h"
> > >  #include "drawutils.h"
> > >  #include "formats.h"
> > > @@ -199,6 +200,8 @@ typedef struct DrawTextContext {
> > >      int tc24hmax;                   ///< 1 if timecode is wrapped
> to
> > 24
> > > hours, 0 otherwise
> > >      int reload;                     ///< reload text file for each
> frame
> > >      int start_number;               ///< starting frame number for
> > > n/frame_num var
> > > +    char *text_source_string;       ///< the string to specify text data
> > > source
> > > +    enum AVFrameSideDataType text_source;
> > >  #if CONFIG_LIBFRIBIDI
> > >      int text_shaping;               ///< 1 to shape the text before
> > > drawing it
> > >  #endif
> > > @@ -246,6 +249,7 @@ static const AVOption drawtext_options[]= {
> > >      { "alpha",       "apply alpha while rendering", OFFSET(a_expr),
> > > AV_OPT_TYPE_STRING, { .str = "1"     },          .flags = FLAGS },
> > >      {"fix_bounds", "check and fix text coords to avoid clipping",
> > > OFFSET(fix_bounds), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS},
> > >      {"start_number", "start frame number for n/frame_num variable",
> > > OFFSET(start_number), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS},
> > > +    {"text_source", "the source of text", OFFSET(text_source_string),
> > > AV_OPT_TYPE_STRING, {.str=NULL}, 0, 1, FLAGS },
> > >
> > >  #if CONFIG_LIBFRIBIDI
> > >      {"text_shaping", "attempt to shape text before drawing",
> > > OFFSET(text_shaping), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, FLAGS},
> > > @@ -690,6 +694,16 @@ out:
> > >  }
> > >  #endif
> > >
> > > +static enum AVFrameSideDataType text_source_string_parse(const char
> > > *text_source_string)
> > > +{
> > > +    av_assert0(text_source_string);
> > > +    if (!strcmp(text_source_string, "side_data_detection_bboxes")) {
> > > +        return AV_FRAME_DATA_DETECTION_BBOXES;
> > > +    } else {
> > > +        return AVERROR(EINVAL);
> > > +    }
> > > +}
> > > +
> > >  static av_cold int init(AVFilterContext *ctx)
> > >  {
> > >      int err;
> > > @@ -731,9 +745,28 @@ static av_cold int init(AVFilterContext *ctx)
> > >              s->text = av_strdup("");
> > >      }
> > >
> > > +    if (s->text_source_string) {
> > > +        s->text_source =
> text_source_string_parse(s->text_source_string);
> > > +        if ((int)s->text_source < 0) {
> > > +            av_log(ctx, AV_LOG_ERROR, "Error text source: %s\n",
> > > s->text_source_string);
> > > +            return AVERROR(EINVAL);
> > > +        }
> > > +    }
> > > +
> > > +    if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
> > > +        if (s->text) {
> > > +            av_log(ctx, AV_LOG_WARNING, "Multiple texts provided,
> will
> > > use text_source only\n");
> > > +            av_free(s->text);
> > > +        }
> > > +        s->text =
> > > av_mallocz(AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE *
> > > +                             (AV_NUM_DETECTION_BBOX_CLASSIFY
> +
> > > 1));
> > > +        if (!s->text)
> > > +            return AVERROR(ENOMEM);
> > > +    }
> > > +
> > >      if (!s->text) {
> > >          av_log(ctx, AV_LOG_ERROR,
> > > -               "Either text, a valid file or a timecode must be
> > > provided\n");
> > > +               "Either text, a valid file, a timecode or text source must
> > be
> > > provided\n");
> > >          return AVERROR(EINVAL);
> > >      }
> > >
> > > @@ -1440,10 +1473,15 @@ continue_on_invalid2:
> > >
> > >      s->var_values[VAR_LINE_H] = s->var_values[VAR_LH] =
> > s->max_glyph_h;
> > >
> > > -    s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values,
> > > &s->prng);
> > > -    s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values,
> > > &s->prng);
> > > -    /* It is necessary if x is expressed from y  */
> > > -    s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values,
> > > &s->prng);
> > > +    if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
> > > +        s->var_values[VAR_X] = s->x;
> > > +        s->var_values[VAR_Y] = s->y;
> > > +    } else {
> > > +        s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr,
> > > s->var_values, &s->prng);
> > > +        s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr,
> > > s->var_values, &s->prng);
> > > +        /* It is necessary if x is expressed from y  */
> > > +        s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr,
> > > s->var_values, &s->prng);
> > > +    }
> > >
> > >      update_alpha(s);
> > >      update_color_with_alpha(s, &fontcolor  , s->fontcolor  );
> > > @@ -1511,6 +1549,21 @@ static int filter_frame(AVFilterLink *inlink,
> > > AVFrame *frame)
> > >      AVFilterLink *outlink = ctx->outputs[0];
> > >      DrawTextContext *s = ctx->priv;
> > >      int ret;
> > > +    const AVDetectionBBoxHeader *header = NULL;
> > > +    const AVDetectionBBox *bbox;
> > > +    AVFrameSideData *sd;
> > > +    int loop = 1;
> > > +
> > > +    if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES && sd) {
> > > +        sd = av_frame_get_side_data(frame,
> > > AV_FRAME_DATA_DETECTION_BBOXES);
> > > +        if (sd) {
> > > +            header = (AVDetectionBBoxHeader *)sd->data;
> > > +            loop = header->nb_bboxes;
> > > +        } else {
> > > +            av_log(s, AV_LOG_WARNING, "No detection bboxes.\n");
> > > +            return ff_filter_frame(outlink, frame);
> > > +        }
> > > +    }
> > >
> > >      if (s->reload) {
> > >          if ((ret = load_textfile(ctx)) < 0) {
> > > @@ -1536,7 +1589,19 @@ static int filter_frame(AVFilterLink *inlink,
> > > AVFrame *frame)
> > >      s->var_values[VAR_PKT_SIZE] = frame->pkt_size;
> > >      s->metadata = frame->metadata;
> > >
> > > -    draw_text(ctx, frame, frame->width, frame->height);
> > > +    for (int i = 0; i < loop; i++) {
> > > +        if (header) {
> > > +            bbox = av_get_detection_bbox(header, i);
> > > +            strcpy(s->text, bbox->detect_label);
> > > +            for (int j = 0; j < bbox->classify_count; j++) {
> > > +                strcat(s->text, ", ");
> > > +                strcat(s->text, bbox->classify_labels[j]);
> > > +            }
> > > +            s->x = bbox->x;
> > > +            s->y = bbox->y - s->fontsize;
> > > +        }
> > > +        draw_text(ctx, frame, frame->width, frame->height);
> > > +    }
> > >
> > >      av_log(ctx, AV_LOG_DEBUG, "n:%d t:%f text_w:%d text_h:%d x:%d
> > > y:%d\n",
> > >             (int)s->var_values[VAR_N], s->var_values[VAR_T],
> > > --
> >
> > any comment? thanks.
> >
> > A new option is added into vf_drawbox and vf_drawtext to visualize the
> > data from detection bounding boxes in the side data of AVFrame.
> >
> 
> will push tomorrow if there's no objection.

will push soon
diff mbox series

Patch

diff --git a/doc/filters.texi b/doc/filters.texi
index f2ac8c4cc8..d10e6de03d 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -10788,6 +10788,14 @@  parameter @var{text}.
 
 If both @var{text} and @var{textfile} are specified, an error is thrown.
 
+@item text_source
+Set this option to side_data_detection_bboxes to draw the label text stored in
+the detection bounding boxes of the frame side data.
+
+If text source is set, @var{text} and @var{textfile} are ignored and the text
+data in the detection bboxes of the side data is used instead. Do not set this
+parameter unless you are sure about the text source.
+
 @item reload
 If set to 1, the @var{textfile} will be reloaded before each frame.
 Be sure to update it atomically, or it may be read partially, or even fail.
diff --git a/libavfilter/vf_drawtext.c b/libavfilter/vf_drawtext.c
index 7ea057b812..382d589e26 100644
--- a/libavfilter/vf_drawtext.c
+++ b/libavfilter/vf_drawtext.c
@@ -55,6 +55,7 @@ 
 #include "libavutil/time_internal.h"
 #include "libavutil/tree.h"
 #include "libavutil/lfg.h"
+#include "libavutil/detection_bbox.h"
 #include "avfilter.h"
 #include "drawutils.h"
 #include "formats.h"
@@ -199,6 +200,8 @@  typedef struct DrawTextContext {
     int tc24hmax;                   ///< 1 if timecode is wrapped to 24 hours, 0 otherwise
     int reload;                     ///< reload text file for each frame
     int start_number;               ///< starting frame number for n/frame_num var
+    char *text_source_string;       ///< the string to specify text data source
+    enum AVFrameSideDataType text_source;
 #if CONFIG_LIBFRIBIDI
     int text_shaping;               ///< 1 to shape the text before drawing it
 #endif
@@ -246,6 +249,7 @@  static const AVOption drawtext_options[]= {
     { "alpha",       "apply alpha while rendering", OFFSET(a_expr),      AV_OPT_TYPE_STRING, { .str = "1"     },          .flags = FLAGS },
     {"fix_bounds", "check and fix text coords to avoid clipping", OFFSET(fix_bounds), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS},
     {"start_number", "start frame number for n/frame_num variable", OFFSET(start_number), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS},
+    {"text_source", "the source of text", OFFSET(text_source_string), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 1, FLAGS },
 
 #if CONFIG_LIBFRIBIDI
     {"text_shaping", "attempt to shape text before drawing", OFFSET(text_shaping), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, FLAGS},
@@ -690,6 +694,16 @@  out:
 }
 #endif
 
+static enum AVFrameSideDataType text_source_string_parse(const char *text_source_string)
+{
+    av_assert0(text_source_string);
+    if (!strcmp(text_source_string, "side_data_detection_bboxes")) {
+        return AV_FRAME_DATA_DETECTION_BBOXES;
+    } else {
+        return AVERROR(EINVAL);
+    }
+}
+
 static av_cold int init(AVFilterContext *ctx)
 {
     int err;
@@ -731,9 +745,28 @@  static av_cold int init(AVFilterContext *ctx)
             s->text = av_strdup("");
     }
 
+    if (s->text_source_string) {
+        s->text_source = text_source_string_parse(s->text_source_string);
+        if ((int)s->text_source < 0) {
+            av_log(ctx, AV_LOG_ERROR, "Error text source: %s\n", s->text_source_string);
+            return AVERROR(EINVAL);
+        }
+    }
+
+    if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
+        if (s->text) {
+            av_log(ctx, AV_LOG_WARNING, "Multiple texts provided, will use text_source only\n");
+            av_free(s->text);
+        }
+        s->text = av_mallocz(AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE *
+                             (AV_NUM_DETECTION_BBOX_CLASSIFY + 1));
+        if (!s->text)
+            return AVERROR(ENOMEM);
+    }
+
     if (!s->text) {
         av_log(ctx, AV_LOG_ERROR,
-               "Either text, a valid file or a timecode must be provided\n");
+               "Either text, a valid file, a timecode or text source must be provided\n");
         return AVERROR(EINVAL);
     }
 
@@ -1440,10 +1473,15 @@  continue_on_invalid2:
 
     s->var_values[VAR_LINE_H] = s->var_values[VAR_LH] = s->max_glyph_h;
 
-    s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng);
-    s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values, &s->prng);
-    /* It is necessary if x is expressed from y  */
-    s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng);
+    if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
+        s->var_values[VAR_X] = s->x;
+        s->var_values[VAR_Y] = s->y;
+    } else {
+        s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng);
+        s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values, &s->prng);
+        /* It is necessary if x is expressed from y  */
+        s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng);
+    }
 
     update_alpha(s);
     update_color_with_alpha(s, &fontcolor  , s->fontcolor  );
@@ -1511,6 +1549,21 @@  static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
     AVFilterLink *outlink = ctx->outputs[0];
     DrawTextContext *s = ctx->priv;
     int ret;
+    const AVDetectionBBoxHeader *header = NULL; /* stays NULL unless bboxes are the text source */
+    const AVDetectionBBox *bbox;
+    int loop = 1;                               /* number of draw_text() passes for this frame */
+
+    /* Text comes from detection bboxes: fetch them, or pass the frame through if there are none. */
+    if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
+        AVFrameSideData *sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
+        if (sd) {
+            header = (AVDetectionBBoxHeader *)sd->data;
+            loop = header->nb_bboxes;
+        } else {
+            av_log(s, AV_LOG_WARNING, "No detection bboxes.\n");
+            return ff_filter_frame(outlink, frame);
+        }
+    }
 
     if (s->reload) {
         if ((ret = load_textfile(ctx)) < 0) {
@@ -1536,7 +1589,19 @@  static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
     s->var_values[VAR_PKT_SIZE] = frame->pkt_size;
     s->metadata = frame->metadata;
 
-    draw_text(ctx, frame, frame->width, frame->height);
+    for (int i = 0; i < loop; i++) {
+        if (header) {
+            bbox = av_get_detection_bbox(header, i);
+            strcpy(s->text, bbox->detect_label);
+            for (int j = 0; j < bbox->classify_count; j++) {
+                strcat(s->text, ", ");
+                strcat(s->text, bbox->classify_labels[j]);
+            }
+            s->x = bbox->x;
+            s->y = bbox->y - s->fontsize;
+        }
+        draw_text(ctx, frame, frame->width, frame->height);
+    }
 
     av_log(ctx, AV_LOG_DEBUG, "n:%d t:%f text_w:%d text_h:%d x:%d y:%d\n",
            (int)s->var_values[VAR_N], s->var_values[VAR_T],