Message ID | 20210514084702.21273-3-ting.fu@intel.com |
---|---|
State | Accepted |
Commit | 7a879cce3793a1314ce50cef4b75671f5e0c0219 |
Headers | show |
Series | [FFmpeg-devel,1/3] lavfi/drawbox: refine code | expand |
Context | Check | Description |
---|---|---|
andriy/x86_make | success | Make finished |
andriy/x86_make_fate | success | Make fate finished |
andriy/PPC64_make | success | Make finished |
andriy/PPC64_make_fate | success | Make fate finished |
> -----Original Message----- > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Ting > Fu > Sent: 2021年5月14日 16:47 > To: ffmpeg-devel@ffmpeg.org > Subject: [FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter support > draw text with detection bounding boxes in side_data > > This feature can be used with dnn detection by setting vf_drawtext's option > text_source=side_data_detection_bboxes, for example: > ./ffmpeg -i face.jpeg -vf > dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:\ > input=data:output=detection_out:labels=face-detection-adas-0001.label,dra > wbox=box_source= > side_data_detection_bboxes,drawtext=text_source=side_data_detection_bbo > xes:fontcolor=green:\ > fontsize=40, -y face_detect.jpeg > Please note, the default fontsize of vf_drawtext is 12, which may be too > small to be seen clearly. > > Signed-off-by: Ting Fu <ting.fu@intel.com> > --- > doc/filters.texi | 8 ++++ > libavfilter/vf_drawtext.c | 77 > ++++++++++++++++++++++++++++++++++++--- > 2 files changed, 79 insertions(+), 6 deletions(-) > > diff --git a/doc/filters.texi b/doc/filters.texi > index f2ac8c4cc8..d10e6de03d 100644 > --- a/doc/filters.texi > +++ b/doc/filters.texi > @@ -10788,6 +10788,14 @@ parameter @var{text}. > > If both @var{text} and @var{textfile} are specified, an error is thrown. > > +@item text_source > +Text source should be set as side_data_detection_bboxes if you want to use > text data in > +detection bboxes of side data. > + > +If text source is set, @var{text} and @var{textfile} will be ignored and still > use > +text data in detection bboxes of side data. So please do not use this > parameter > +if you are not sure about the text source. > + > @item reload > If set to 1, the @var{textfile} will be reloaded before each frame. > Be sure to update it atomically, or it may be read partially, or even fail. 
> diff --git a/libavfilter/vf_drawtext.c b/libavfilter/vf_drawtext.c > index 7ea057b812..382d589e26 100644 > --- a/libavfilter/vf_drawtext.c > +++ b/libavfilter/vf_drawtext.c > @@ -55,6 +55,7 @@ > #include "libavutil/time_internal.h" > #include "libavutil/tree.h" > #include "libavutil/lfg.h" > +#include "libavutil/detection_bbox.h" > #include "avfilter.h" > #include "drawutils.h" > #include "formats.h" > @@ -199,6 +200,8 @@ typedef struct DrawTextContext { > int tc24hmax; ///< 1 if timecode is wrapped to 24 > hours, 0 otherwise > int reload; ///< reload text file for each frame > int start_number; ///< starting frame number for > n/frame_num var > + char *text_source_string; ///< the string to specify text data > source > + enum AVFrameSideDataType text_source; > #if CONFIG_LIBFRIBIDI > int text_shaping; ///< 1 to shape the text before > drawing it > #endif > @@ -246,6 +249,7 @@ static const AVOption drawtext_options[]= { > { "alpha", "apply alpha while rendering", OFFSET(a_expr), > AV_OPT_TYPE_STRING, { .str = "1" }, .flags = FLAGS }, > {"fix_bounds", "check and fix text coords to avoid clipping", > OFFSET(fix_bounds), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS}, > {"start_number", "start frame number for n/frame_num variable", > OFFSET(start_number), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS}, > + {"text_source", "the source of text", OFFSET(text_source_string), > AV_OPT_TYPE_STRING, {.str=NULL}, 0, 1, FLAGS }, > > #if CONFIG_LIBFRIBIDI > {"text_shaping", "attempt to shape text before drawing", > OFFSET(text_shaping), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, FLAGS}, > @@ -690,6 +694,16 @@ out: > } > #endif > > +static enum AVFrameSideDataType text_source_string_parse(const char > *text_source_string) > +{ > + av_assert0(text_source_string); > + if (!strcmp(text_source_string, "side_data_detection_bboxes")) { > + return AV_FRAME_DATA_DETECTION_BBOXES; > + } else { > + return AVERROR(EINVAL); > + } > +} > + > static av_cold int init(AVFilterContext *ctx) > { > int err; > 
@@ -731,9 +745,28 @@ static av_cold int init(AVFilterContext *ctx) > s->text = av_strdup(""); > } > > + if (s->text_source_string) { > + s->text_source = text_source_string_parse(s->text_source_string); > + if ((int)s->text_source < 0) { > + av_log(ctx, AV_LOG_ERROR, "Error text source: %s\n", > s->text_source_string); > + return AVERROR(EINVAL); > + } > + } > + > + if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) { > + if (s->text) { > + av_log(ctx, AV_LOG_WARNING, "Multiple texts provided, will > use text_source only\n"); > + av_free(s->text); > + } > + s->text = > av_mallocz(AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE * > + (AV_NUM_DETECTION_BBOX_CLASSIFY + > 1)); > + if (!s->text) > + return AVERROR(ENOMEM); > + } > + > if (!s->text) { > av_log(ctx, AV_LOG_ERROR, > - "Either text, a valid file or a timecode must be > provided\n"); > + "Either text, a valid file, a timecode or text source must be > provided\n"); > return AVERROR(EINVAL); > } > > @@ -1440,10 +1473,15 @@ continue_on_invalid2: > > s->var_values[VAR_LINE_H] = s->var_values[VAR_LH] = s->max_glyph_h; > > - s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, > &s->prng); > - s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values, > &s->prng); > - /* It is necessary if x is expressed from y */ > - s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, > &s->prng); > + if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) { > + s->var_values[VAR_X] = s->x; > + s->var_values[VAR_Y] = s->y; > + } else { > + s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, > s->var_values, &s->prng); > + s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, > s->var_values, &s->prng); > + /* It is necessary if x is expressed from y */ > + s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, > s->var_values, &s->prng); > + } > > update_alpha(s); > update_color_with_alpha(s, &fontcolor , s->fontcolor ); > @@ -1511,6 +1549,21 @@ static int filter_frame(AVFilterLink 
*inlink, > AVFrame *frame) > AVFilterLink *outlink = ctx->outputs[0]; > DrawTextContext *s = ctx->priv; > int ret; > + const AVDetectionBBoxHeader *header = NULL; > + const AVDetectionBBox *bbox; > + AVFrameSideData *sd; > + int loop = 1; > + > + if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES && sd) { > + sd = av_frame_get_side_data(frame, > AV_FRAME_DATA_DETECTION_BBOXES); > + if (sd) { > + header = (AVDetectionBBoxHeader *)sd->data; > + loop = header->nb_bboxes; > + } else { > + av_log(s, AV_LOG_WARNING, "No detection bboxes.\n"); > + return ff_filter_frame(outlink, frame); > + } > + } > > if (s->reload) { > if ((ret = load_textfile(ctx)) < 0) { > @@ -1536,7 +1589,19 @@ static int filter_frame(AVFilterLink *inlink, > AVFrame *frame) > s->var_values[VAR_PKT_SIZE] = frame->pkt_size; > s->metadata = frame->metadata; > > - draw_text(ctx, frame, frame->width, frame->height); > + for (int i = 0; i < loop; i++) { > + if (header) { > + bbox = av_get_detection_bbox(header, i); > + strcpy(s->text, bbox->detect_label); > + for (int j = 0; j < bbox->classify_count; j++) { > + strcat(s->text, ", "); > + strcat(s->text, bbox->classify_labels[j]); > + } > + s->x = bbox->x; > + s->y = bbox->y - s->fontsize; > + } > + draw_text(ctx, frame, frame->width, frame->height); > + } > > av_log(ctx, AV_LOG_DEBUG, "n:%d t:%f text_w:%d text_h:%d x:%d > y:%d\n", > (int)s->var_values[VAR_N], s->var_values[VAR_T], > -- any comment? thanks. A new option is added into vf_drawbox and vf_drawtext to visualize the data from detection bounding boxes in the side data of AVFrame.
> -----Original Message----- > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Guo, > Yejun > Sent: 2021年5月20日 11:04 > To: FFmpeg development discussions and patches > <ffmpeg-devel@ffmpeg.org> > Subject: Re: [FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter > support draw text with detection bounding boxes in side_data > > > > > -----Original Message----- > > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Ting > > Fu > > Sent: 2021年5月14日 16:47 > > To: ffmpeg-devel@ffmpeg.org > > Subject: [FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter support > > draw text with detection bounding boxes in side_data > > > > This feature can be used with dnn detection by setting vf_drawtext's option > > text_source=side_data_detection_bboxes, for example: > > ./ffmpeg -i face.jpeg -vf > > > dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:\ > > > input=data:output=detection_out:labels=face-detection-adas-0001.label,dra > > wbox=box_source= > > > side_data_detection_bboxes,drawtext=text_source=side_data_detection_bbo > > xes:fontcolor=green:\ > > fontsize=40, -y face_detect.jpeg > > Please note, the default fontsize of vf_drawtext is 12, which may be too > > small to be seen clearly. > > > > Signed-off-by: Ting Fu <ting.fu@intel.com> > > --- > > doc/filters.texi | 8 ++++ > > libavfilter/vf_drawtext.c | 77 > > ++++++++++++++++++++++++++++++++++++--- > > 2 files changed, 79 insertions(+), 6 deletions(-) > > > > diff --git a/doc/filters.texi b/doc/filters.texi > > index f2ac8c4cc8..d10e6de03d 100644 > > --- a/doc/filters.texi > > +++ b/doc/filters.texi > > @@ -10788,6 +10788,14 @@ parameter @var{text}. > > > > If both @var{text} and @var{textfile} are specified, an error is thrown. > > > > +@item text_source > > +Text source should be set as side_data_detection_bboxes if you want to > use > > text data in > > +detection bboxes of side data. 
> > + > > +If text source is set, @var{text} and @var{textfile} will be ignored and still > > use > > +text data in detection bboxes of side data. So please do not use this > > parameter > > +if you are not sure about the text source. > > + > > @item reload > > If set to 1, the @var{textfile} will be reloaded before each frame. > > Be sure to update it atomically, or it may be read partially, or even fail. > > diff --git a/libavfilter/vf_drawtext.c b/libavfilter/vf_drawtext.c > > index 7ea057b812..382d589e26 100644 > > --- a/libavfilter/vf_drawtext.c > > +++ b/libavfilter/vf_drawtext.c > > @@ -55,6 +55,7 @@ > > #include "libavutil/time_internal.h" > > #include "libavutil/tree.h" > > #include "libavutil/lfg.h" > > +#include "libavutil/detection_bbox.h" > > #include "avfilter.h" > > #include "drawutils.h" > > #include "formats.h" > > @@ -199,6 +200,8 @@ typedef struct DrawTextContext { > > int tc24hmax; ///< 1 if timecode is wrapped to > 24 > > hours, 0 otherwise > > int reload; ///< reload text file for each frame > > int start_number; ///< starting frame number for > > n/frame_num var > > + char *text_source_string; ///< the string to specify text data > > source > > + enum AVFrameSideDataType text_source; > > #if CONFIG_LIBFRIBIDI > > int text_shaping; ///< 1 to shape the text before > > drawing it > > #endif > > @@ -246,6 +249,7 @@ static const AVOption drawtext_options[]= { > > { "alpha", "apply alpha while rendering", OFFSET(a_expr), > > AV_OPT_TYPE_STRING, { .str = "1" }, .flags = FLAGS }, > > {"fix_bounds", "check and fix text coords to avoid clipping", > > OFFSET(fix_bounds), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS}, > > {"start_number", "start frame number for n/frame_num variable", > > OFFSET(start_number), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS}, > > + {"text_source", "the source of text", OFFSET(text_source_string), > > AV_OPT_TYPE_STRING, {.str=NULL}, 0, 1, FLAGS }, > > > > #if CONFIG_LIBFRIBIDI > > {"text_shaping", "attempt to shape text before 
drawing", > > OFFSET(text_shaping), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, FLAGS}, > > @@ -690,6 +694,16 @@ out: > > } > > #endif > > > > +static enum AVFrameSideDataType text_source_string_parse(const char > > *text_source_string) > > +{ > > + av_assert0(text_source_string); > > + if (!strcmp(text_source_string, "side_data_detection_bboxes")) { > > + return AV_FRAME_DATA_DETECTION_BBOXES; > > + } else { > > + return AVERROR(EINVAL); > > + } > > +} > > + > > static av_cold int init(AVFilterContext *ctx) > > { > > int err; > > @@ -731,9 +745,28 @@ static av_cold int init(AVFilterContext *ctx) > > s->text = av_strdup(""); > > } > > > > + if (s->text_source_string) { > > + s->text_source = text_source_string_parse(s->text_source_string); > > + if ((int)s->text_source < 0) { > > + av_log(ctx, AV_LOG_ERROR, "Error text source: %s\n", > > s->text_source_string); > > + return AVERROR(EINVAL); > > + } > > + } > > + > > + if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) { > > + if (s->text) { > > + av_log(ctx, AV_LOG_WARNING, "Multiple texts provided, will > > use text_source only\n"); > > + av_free(s->text); > > + } > > + s->text = > > av_mallocz(AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE * > > + (AV_NUM_DETECTION_BBOX_CLASSIFY + > > 1)); > > + if (!s->text) > > + return AVERROR(ENOMEM); > > + } > > + > > if (!s->text) { > > av_log(ctx, AV_LOG_ERROR, > > - "Either text, a valid file or a timecode must be > > provided\n"); > > + "Either text, a valid file, a timecode or text source must > be > > provided\n"); > > return AVERROR(EINVAL); > > } > > > > @@ -1440,10 +1473,15 @@ continue_on_invalid2: > > > > s->var_values[VAR_LINE_H] = s->var_values[VAR_LH] = > s->max_glyph_h; > > > > - s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, > > &s->prng); > > - s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values, > > &s->prng); > > - /* It is necessary if x is expressed from y */ > > - s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, 
s->var_values, > > &s->prng); > > + if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) { > > + s->var_values[VAR_X] = s->x; > > + s->var_values[VAR_Y] = s->y; > > + } else { > > + s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, > > s->var_values, &s->prng); > > + s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, > > s->var_values, &s->prng); > > + /* It is necessary if x is expressed from y */ > > + s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, > > s->var_values, &s->prng); > > + } > > > > update_alpha(s); > > update_color_with_alpha(s, &fontcolor , s->fontcolor ); > > @@ -1511,6 +1549,21 @@ static int filter_frame(AVFilterLink *inlink, > > AVFrame *frame) > > AVFilterLink *outlink = ctx->outputs[0]; > > DrawTextContext *s = ctx->priv; > > int ret; > > + const AVDetectionBBoxHeader *header = NULL; > > + const AVDetectionBBox *bbox; > > + AVFrameSideData *sd; > > + int loop = 1; > > + > > + if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES && sd) { > > + sd = av_frame_get_side_data(frame, > > AV_FRAME_DATA_DETECTION_BBOXES); > > + if (sd) { > > + header = (AVDetectionBBoxHeader *)sd->data; > > + loop = header->nb_bboxes; > > + } else { > > + av_log(s, AV_LOG_WARNING, "No detection bboxes.\n"); > > + return ff_filter_frame(outlink, frame); > > + } > > + } > > > > if (s->reload) { > > if ((ret = load_textfile(ctx)) < 0) { > > @@ -1536,7 +1589,19 @@ static int filter_frame(AVFilterLink *inlink, > > AVFrame *frame) > > s->var_values[VAR_PKT_SIZE] = frame->pkt_size; > > s->metadata = frame->metadata; > > > > - draw_text(ctx, frame, frame->width, frame->height); > > + for (int i = 0; i < loop; i++) { > > + if (header) { > > + bbox = av_get_detection_bbox(header, i); > > + strcpy(s->text, bbox->detect_label); > > + for (int j = 0; j < bbox->classify_count; j++) { > > + strcat(s->text, ", "); > > + strcat(s->text, bbox->classify_labels[j]); > > + } > > + s->x = bbox->x; > > + s->y = bbox->y - s->fontsize; > > + } > > + draw_text(ctx, frame, 
frame->width, frame->height); > > + } > > > > av_log(ctx, AV_LOG_DEBUG, "n:%d t:%f text_w:%d text_h:%d x:%d > > y:%d\n", > > (int)s->var_values[VAR_N], s->var_values[VAR_T], > > -- > > any comment? thanks. > > A new option is added into vf_drawbox and vf_drawtext to visualize the > data from detection bounding boxes in the side data of AVFrame. > will push tomorrow if there's no objection.
> -----Original Message----- > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Guo, > Yejun > Sent: 2021年5月25日 9:08 > To: FFmpeg development discussions and patches > <ffmpeg-devel@ffmpeg.org> > Subject: Re: [FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter > support draw text with detection bounding boxes in side_data > > > > > -----Original Message----- > > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of > Guo, > > Yejun > > Sent: 2021年5月20日 11:04 > > To: FFmpeg development discussions and patches > > <ffmpeg-devel@ffmpeg.org> > > Subject: Re: [FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter > > support draw text with detection bounding boxes in side_data > > > > > > > > > -----Original Message----- > > > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of > Ting > > > Fu > > > Sent: 2021年5月14日 16:47 > > > To: ffmpeg-devel@ffmpeg.org > > > Subject: [FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter support > > > draw text with detection bounding boxes in side_data > > > > > > This feature can be used with dnn detection by setting vf_drawtext's > option > > > text_source=side_data_detection_bboxes, for example: > > > ./ffmpeg -i face.jpeg -vf > > > > > > dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:\ > > > > > > input=data:output=detection_out:labels=face-detection-adas-0001.label,dra > > > wbox=box_source= > > > > > > side_data_detection_bboxes,drawtext=text_source=side_data_detection_bbo > > > xes:fontcolor=green:\ > > > fontsize=40, -y face_detect.jpeg > > > Please note, the default fontsize of vf_drawtext is 12, which may be too > > > small to be seen clearly. 
> > > > > > Signed-off-by: Ting Fu <ting.fu@intel.com> > > > --- > > > doc/filters.texi | 8 ++++ > > > libavfilter/vf_drawtext.c | 77 > > > ++++++++++++++++++++++++++++++++++++--- > > > 2 files changed, 79 insertions(+), 6 deletions(-) > > > > > > diff --git a/doc/filters.texi b/doc/filters.texi > > > index f2ac8c4cc8..d10e6de03d 100644 > > > --- a/doc/filters.texi > > > +++ b/doc/filters.texi > > > @@ -10788,6 +10788,14 @@ parameter @var{text}. > > > > > > If both @var{text} and @var{textfile} are specified, an error is thrown. > > > > > > +@item text_source > > > +Text source should be set as side_data_detection_bboxes if you want to > > use > > > text data in > > > +detection bboxes of side data. > > > + > > > +If text source is set, @var{text} and @var{textfile} will be ignored and still > > > use > > > +text data in detection bboxes of side data. So please do not use this > > > parameter > > > +if you are not sure about the text source. > > > + > > > @item reload > > > If set to 1, the @var{textfile} will be reloaded before each frame. > > > Be sure to update it atomically, or it may be read partially, or even fail. 
> > > diff --git a/libavfilter/vf_drawtext.c b/libavfilter/vf_drawtext.c > > > index 7ea057b812..382d589e26 100644 > > > --- a/libavfilter/vf_drawtext.c > > > +++ b/libavfilter/vf_drawtext.c > > > @@ -55,6 +55,7 @@ > > > #include "libavutil/time_internal.h" > > > #include "libavutil/tree.h" > > > #include "libavutil/lfg.h" > > > +#include "libavutil/detection_bbox.h" > > > #include "avfilter.h" > > > #include "drawutils.h" > > > #include "formats.h" > > > @@ -199,6 +200,8 @@ typedef struct DrawTextContext { > > > int tc24hmax; ///< 1 if timecode is wrapped > to > > 24 > > > hours, 0 otherwise > > > int reload; ///< reload text file for each > frame > > > int start_number; ///< starting frame number for > > > n/frame_num var > > > + char *text_source_string; ///< the string to specify text data > > > source > > > + enum AVFrameSideDataType text_source; > > > #if CONFIG_LIBFRIBIDI > > > int text_shaping; ///< 1 to shape the text before > > > drawing it > > > #endif > > > @@ -246,6 +249,7 @@ static const AVOption drawtext_options[]= { > > > { "alpha", "apply alpha while rendering", OFFSET(a_expr), > > > AV_OPT_TYPE_STRING, { .str = "1" }, .flags = FLAGS }, > > > {"fix_bounds", "check and fix text coords to avoid clipping", > > > OFFSET(fix_bounds), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS}, > > > {"start_number", "start frame number for n/frame_num variable", > > > OFFSET(start_number), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS}, > > > + {"text_source", "the source of text", OFFSET(text_source_string), > > > AV_OPT_TYPE_STRING, {.str=NULL}, 0, 1, FLAGS }, > > > > > > #if CONFIG_LIBFRIBIDI > > > {"text_shaping", "attempt to shape text before drawing", > > > OFFSET(text_shaping), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, FLAGS}, > > > @@ -690,6 +694,16 @@ out: > > > } > > > #endif > > > > > > +static enum AVFrameSideDataType text_source_string_parse(const char > > > *text_source_string) > > > +{ > > > + av_assert0(text_source_string); > > > + if (!strcmp(text_source_string, 
"side_data_detection_bboxes")) { > > > + return AV_FRAME_DATA_DETECTION_BBOXES; > > > + } else { > > > + return AVERROR(EINVAL); > > > + } > > > +} > > > + > > > static av_cold int init(AVFilterContext *ctx) > > > { > > > int err; > > > @@ -731,9 +745,28 @@ static av_cold int init(AVFilterContext *ctx) > > > s->text = av_strdup(""); > > > } > > > > > > + if (s->text_source_string) { > > > + s->text_source = > text_source_string_parse(s->text_source_string); > > > + if ((int)s->text_source < 0) { > > > + av_log(ctx, AV_LOG_ERROR, "Error text source: %s\n", > > > s->text_source_string); > > > + return AVERROR(EINVAL); > > > + } > > > + } > > > + > > > + if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) { > > > + if (s->text) { > > > + av_log(ctx, AV_LOG_WARNING, "Multiple texts provided, > will > > > use text_source only\n"); > > > + av_free(s->text); > > > + } > > > + s->text = > > > av_mallocz(AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE * > > > + (AV_NUM_DETECTION_BBOX_CLASSIFY > + > > > 1)); > > > + if (!s->text) > > > + return AVERROR(ENOMEM); > > > + } > > > + > > > if (!s->text) { > > > av_log(ctx, AV_LOG_ERROR, > > > - "Either text, a valid file or a timecode must be > > > provided\n"); > > > + "Either text, a valid file, a timecode or text source must > > be > > > provided\n"); > > > return AVERROR(EINVAL); > > > } > > > > > > @@ -1440,10 +1473,15 @@ continue_on_invalid2: > > > > > > s->var_values[VAR_LINE_H] = s->var_values[VAR_LH] = > > s->max_glyph_h; > > > > > > - s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, > > > &s->prng); > > > - s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values, > > > &s->prng); > > > - /* It is necessary if x is expressed from y */ > > > - s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, > > > &s->prng); > > > + if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) { > > > + s->var_values[VAR_X] = s->x; > > > + s->var_values[VAR_Y] = s->y; > > > + } else { > > > + s->x = 
s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, > > > s->var_values, &s->prng); > > > + s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, > > > s->var_values, &s->prng); > > > + /* It is necessary if x is expressed from y */ > > > + s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, > > > s->var_values, &s->prng); > > > + } > > > > > > update_alpha(s); > > > update_color_with_alpha(s, &fontcolor , s->fontcolor ); > > > @@ -1511,6 +1549,21 @@ static int filter_frame(AVFilterLink *inlink, > > > AVFrame *frame) > > > AVFilterLink *outlink = ctx->outputs[0]; > > > DrawTextContext *s = ctx->priv; > > > int ret; > > > + const AVDetectionBBoxHeader *header = NULL; > > > + const AVDetectionBBox *bbox; > > > + AVFrameSideData *sd; > > > + int loop = 1; > > > + > > > + if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES && sd) { > > > + sd = av_frame_get_side_data(frame, > > > AV_FRAME_DATA_DETECTION_BBOXES); > > > + if (sd) { > > > + header = (AVDetectionBBoxHeader *)sd->data; > > > + loop = header->nb_bboxes; > > > + } else { > > > + av_log(s, AV_LOG_WARNING, "No detection bboxes.\n"); > > > + return ff_filter_frame(outlink, frame); > > > + } > > > + } > > > > > > if (s->reload) { > > > if ((ret = load_textfile(ctx)) < 0) { > > > @@ -1536,7 +1589,19 @@ static int filter_frame(AVFilterLink *inlink, > > > AVFrame *frame) > > > s->var_values[VAR_PKT_SIZE] = frame->pkt_size; > > > s->metadata = frame->metadata; > > > > > > - draw_text(ctx, frame, frame->width, frame->height); > > > + for (int i = 0; i < loop; i++) { > > > + if (header) { > > > + bbox = av_get_detection_bbox(header, i); > > > + strcpy(s->text, bbox->detect_label); > > > + for (int j = 0; j < bbox->classify_count; j++) { > > > + strcat(s->text, ", "); > > > + strcat(s->text, bbox->classify_labels[j]); > > > + } > > > + s->x = bbox->x; > > > + s->y = bbox->y - s->fontsize; > > > + } > > > + draw_text(ctx, frame, frame->width, frame->height); > > > + } > > > > > > av_log(ctx, AV_LOG_DEBUG, "n:%d t:%f 
text_w:%d text_h:%d x:%d > > > y:%d\n", > > > (int)s->var_values[VAR_N], s->var_values[VAR_T], > > > -- > > > > any comment? thanks. > > > > A new option is added into vf_drawbox and vf_drawtext to visualize the > > data from detection bounding boxes in the side data of AVFrame. > > > > will push tomorrow if there's no objection. will push soon
diff --git a/doc/filters.texi b/doc/filters.texi index f2ac8c4cc8..d10e6de03d 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -10788,6 +10788,14 @@ parameter @var{text}. If both @var{text} and @var{textfile} are specified, an error is thrown. +@item text_source +Text source should be set as side_data_detection_bboxes if you want to use text data in +detection bboxes of side data. + +If text source is set, @var{text} and @var{textfile} will be ignored and still use +text data in detection bboxes of side data. So please do not use this parameter +if you are not sure about the text source. + @item reload If set to 1, the @var{textfile} will be reloaded before each frame. Be sure to update it atomically, or it may be read partially, or even fail. diff --git a/libavfilter/vf_drawtext.c b/libavfilter/vf_drawtext.c index 7ea057b812..382d589e26 100644 --- a/libavfilter/vf_drawtext.c +++ b/libavfilter/vf_drawtext.c @@ -55,6 +55,7 @@ #include "libavutil/time_internal.h" #include "libavutil/tree.h" #include "libavutil/lfg.h" +#include "libavutil/detection_bbox.h" #include "avfilter.h" #include "drawutils.h" #include "formats.h" @@ -199,6 +200,8 @@ typedef struct DrawTextContext { int tc24hmax; ///< 1 if timecode is wrapped to 24 hours, 0 otherwise int reload; ///< reload text file for each frame int start_number; ///< starting frame number for n/frame_num var + char *text_source_string; ///< the string to specify text data source + enum AVFrameSideDataType text_source; #if CONFIG_LIBFRIBIDI int text_shaping; ///< 1 to shape the text before drawing it #endif @@ -246,6 +249,7 @@ static const AVOption drawtext_options[]= { { "alpha", "apply alpha while rendering", OFFSET(a_expr), AV_OPT_TYPE_STRING, { .str = "1" }, .flags = FLAGS }, {"fix_bounds", "check and fix text coords to avoid clipping", OFFSET(fix_bounds), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS}, {"start_number", "start frame number for n/frame_num variable", OFFSET(start_number), AV_OPT_TYPE_INT, {.i64=0}, 0, 
INT_MAX, FLAGS}, + {"text_source", "the source of text", OFFSET(text_source_string), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 1, FLAGS }, #if CONFIG_LIBFRIBIDI {"text_shaping", "attempt to shape text before drawing", OFFSET(text_shaping), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, FLAGS}, @@ -690,6 +694,16 @@ out: } #endif +static enum AVFrameSideDataType text_source_string_parse(const char *text_source_string) +{ + av_assert0(text_source_string); + if (!strcmp(text_source_string, "side_data_detection_bboxes")) { + return AV_FRAME_DATA_DETECTION_BBOXES; + } else { + return AVERROR(EINVAL); + } +} + static av_cold int init(AVFilterContext *ctx) { int err; @@ -731,9 +745,28 @@ static av_cold int init(AVFilterContext *ctx) s->text = av_strdup(""); } + if (s->text_source_string) { + s->text_source = text_source_string_parse(s->text_source_string); + if ((int)s->text_source < 0) { + av_log(ctx, AV_LOG_ERROR, "Error text source: %s\n", s->text_source_string); + return AVERROR(EINVAL); + } + } + + if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) { + if (s->text) { + av_log(ctx, AV_LOG_WARNING, "Multiple texts provided, will use text_source only\n"); + av_free(s->text); + } + s->text = av_mallocz(AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE * + (AV_NUM_DETECTION_BBOX_CLASSIFY + 1)); + if (!s->text) + return AVERROR(ENOMEM); + } + if (!s->text) { av_log(ctx, AV_LOG_ERROR, - "Either text, a valid file or a timecode must be provided\n"); + "Either text, a valid file, a timecode or text source must be provided\n"); return AVERROR(EINVAL); } @@ -1440,10 +1473,15 @@ continue_on_invalid2: s->var_values[VAR_LINE_H] = s->var_values[VAR_LH] = s->max_glyph_h; - s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng); - s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values, &s->prng); - /* It is necessary if x is expressed from y */ - s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng); + if (s->text_source == 
AV_FRAME_DATA_DETECTION_BBOXES) { + s->var_values[VAR_X] = s->x; + s->var_values[VAR_Y] = s->y; + } else { + s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng); + s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values, &s->prng); + /* It is necessary if x is expressed from y */ + s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng); + } update_alpha(s); update_color_with_alpha(s, &fontcolor , s->fontcolor ); @@ -1511,6 +1549,21 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame) AVFilterLink *outlink = ctx->outputs[0]; DrawTextContext *s = ctx->priv; int ret; + const AVDetectionBBoxHeader *header = NULL; + const AVDetectionBBox *bbox; + AVFrameSideData *sd; + int loop = 1; + + if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES && sd) { + sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES); + if (sd) { + header = (AVDetectionBBoxHeader *)sd->data; + loop = header->nb_bboxes; + } else { + av_log(s, AV_LOG_WARNING, "No detection bboxes.\n"); + return ff_filter_frame(outlink, frame); + } + } if (s->reload) { if ((ret = load_textfile(ctx)) < 0) { @@ -1536,7 +1589,19 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame) s->var_values[VAR_PKT_SIZE] = frame->pkt_size; s->metadata = frame->metadata; - draw_text(ctx, frame, frame->width, frame->height); + for (int i = 0; i < loop; i++) { + if (header) { + bbox = av_get_detection_bbox(header, i); + strcpy(s->text, bbox->detect_label); + for (int j = 0; j < bbox->classify_count; j++) { + strcat(s->text, ", "); + strcat(s->text, bbox->classify_labels[j]); + } + s->x = bbox->x; + s->y = bbox->y - s->fontsize; + } + draw_text(ctx, frame, frame->width, frame->height); + } av_log(ctx, AV_LOG_DEBUG, "n:%d t:%f text_w:%d text_h:%d x:%d y:%d\n", (int)s->var_values[VAR_N], s->var_values[VAR_T],
This feature can be used with DNN detection by setting vf_drawtext's option text_source=side_data_detection_bboxes, for example:

./ffmpeg -i face.jpeg -vf dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:\
input=data:output=detection_out:labels=face-detection-adas-0001.label,drawbox=box_source=side_data_detection_bboxes,drawtext=text_source=side_data_detection_bboxes:fontcolor=green:\
fontsize=40, -y face_detect.jpeg

Please note that the default fontsize of vf_drawtext is 12, which may be too small to be seen clearly.

Signed-off-by: Ting Fu <ting.fu@intel.com>
---
 doc/filters.texi          |  8 ++++
 libavfilter/vf_drawtext.c | 77 ++++++++++++++++++++++++++++++++++++---
 2 files changed, 79 insertions(+), 6 deletions(-)