diff mbox series

[FFmpeg-devel,V2,08/10] libavutil: add side data AVDnnBoundingBox for dnn based detect/classify filters

Message ID 20210210093432.9135-8-yejun.guo@intel.com
State New
Headers show
Series [FFmpeg-devel,V2,01/10] dnn_backend_openvino.c: fix mismatch between ffmpeg(NHWC) and openvino(NCHW)
Related show

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Guo, Yejun Feb. 10, 2021, 9:34 a.m. UTC
Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
---
 doc/APIchanges       |  2 ++
 libavutil/Makefile   |  1 +
 libavutil/dnn_bbox.h | 68 ++++++++++++++++++++++++++++++++++++++++++++
 libavutil/frame.c    |  1 +
 libavutil/frame.h    |  7 +++++
 libavutil/version.h  |  2 +-
 6 files changed, 80 insertions(+), 1 deletion(-)
 create mode 100644 libavutil/dnn_bbox.h

Comments

Mark Thompson Feb. 10, 2021, 10:19 p.m. UTC | #1
On 10/02/2021 09:34, Guo, Yejun wrote:
> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> ---
>   doc/APIchanges       |  2 ++
>   libavutil/Makefile   |  1 +
>   libavutil/dnn_bbox.h | 68 ++++++++++++++++++++++++++++++++++++++++++++
>   libavutil/frame.c    |  1 +
>   libavutil/frame.h    |  7 +++++
>   libavutil/version.h  |  2 +-
>   6 files changed, 80 insertions(+), 1 deletion(-)
>   create mode 100644 libavutil/dnn_bbox.h

What is the intended consumer of this box information?  (Is there some other filter which will read these are do something with them, or some sort of user program?)

If there is no use in ffmpeg outside libavfilter then the header should probably be in libavfilter.

How tied is this to the DNN implementation, and hence the DNN name?  If someone made a standalone filter doing object detection by some other method, would it make sense for them to reuse this structure?

> diff --git a/libavutil/dnn_bbox.h b/libavutil/dnn_bbox.h
> new file mode 100644
> index 0000000000..50899c4486
> --- /dev/null
> +++ b/libavutil/dnn_bbox.h
> @@ -0,0 +1,68 @@
> +/*
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVUTIL_DNN_BBOX_H
> +#define AVUTIL_DNN_BBOX_H
> +
> +#include "rational.h"
> +
> +typedef struct AVDnnBoundingBox {
> +    /**
> +     * Must be set to the size of this data structure (that is,
> +     * sizeof(AVDnnBoundingBox)).
> +     */
> +    uint32_t self_size;
> +
> +    /**
> +     * Object detection is usually applied to a smaller image that
> +     * is scaled down from the original frame.
> +     * width and height are attributes of the scaled image, in pixel.
> +     */
> +    int model_input_width;
> +    int model_input_height;

Other than to interpret the distances below, what will the user do with this information?  (Alternatively: why not map the distances back onto the original frame size?)

> +
> +    /**
> +     * Distance in pixels from the top edge of the scaled image to top
> +     * and bottom, and from the left edge of the scaled image to left and
> +     * right, defining the bounding box.
> +     */
> +    int top;
> +    int left;
> +    int bottom;
> +    int right;
> +
> +    /**
> +     * Detect result
> +     */
> +    int detect_label;

How does a user interpret this label?  Is it from some known enum?

> +    AVRational detect_conf;

"conf"... idence?  A longer name and a descriptive comment might help.

> +
> +    /**
> +     * At most 4 classifications based on the detected bounding box.
> +     * For example, we can get max 4 different attributes with 4 different
> +     * DNN models on one bounding box.
> +     * classify_count is zero if no classification.
> +     */
> +#define AV_NUM_BBOX_CLASSIFY 4
> +    uint32_t classify_count;
> +    int classify_labels[AV_NUM_BBOX_CLASSIFY];
> +    AVRational classify_confs[AV_NUM_BBOX_CLASSIFY];

Same comment on these.

> +} AVDnnBoundingBox;
> +
> +#endif
> diff --git a/libavutil/frame.c b/libavutil/frame.c
> index eab51b6a32..4308507827 100644
> --- a/libavutil/frame.c
> +++ b/libavutil/frame.c
> @@ -852,6 +852,7 @@ const char *av_frame_side_data_name(enum AVFrameSideDataType type)
>       case AV_FRAME_DATA_VIDEO_ENC_PARAMS:            return "Video encoding parameters";
>       case AV_FRAME_DATA_SEI_UNREGISTERED:            return "H.26[45] User Data Unregistered SEI message";
>       case AV_FRAME_DATA_FILM_GRAIN_PARAMS:           return "Film grain parameters";
> +    case AV_FRAME_DATA_DNN_BBOXES:                  return "DNN bounding boxes";
>       }
>       return NULL;
>   }
> diff --git a/libavutil/frame.h b/libavutil/frame.h
> index 1aeafef6de..a4dcfd27c9 100644
> --- a/libavutil/frame.h
> +++ b/libavutil/frame.h
> @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
>        * Must be present for every frame which should have film grain applied.
>        */
>       AV_FRAME_DATA_FILM_GRAIN_PARAMS,
> +
> +    /**
> +     * Bounding box generated by dnn based filters for object detection and classification,
> +     * the data is an array of AVDnnBoudingBox, the number of array element is implied by
> +     * AVFrameSideData.size / AVDnnBoudingBox.self_size.
> +     */
> +    AV_FRAME_DATA_DNN_BBOXES,
>   };
>   
>   enum AVActiveFormatDescription {
- Mark
Guo, Yejun Feb. 11, 2021, 8:15 a.m. UTC | #2
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Mark
> Thompson
> Sent: 2021年2月11日 6:19
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH V2 08/10] libavutil: add side data
> AVDnnBoundingBox for dnn based detect/classify filters
> 
> On 10/02/2021 09:34, Guo, Yejun wrote:
> > Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> > ---
> >   doc/APIchanges       |  2 ++
> >   libavutil/Makefile   |  1 +
> >   libavutil/dnn_bbox.h | 68
> ++++++++++++++++++++++++++++++++++++++++++++
> >   libavutil/frame.c    |  1 +
> >   libavutil/frame.h    |  7 +++++
> >   libavutil/version.h  |  2 +-
> >   6 files changed, 80 insertions(+), 1 deletion(-)
> >   create mode 100644 libavutil/dnn_bbox.h
> 
> What is the intended consumer of this box information?  (Is there some other
> filter which will read these are do something with them, or some sort of user
> program?)
> 
> If there is no use in ffmpeg outside libavfilter then the header should probably
> be in libavfilter.


Thanks for the feedback. 

For most case, other filters will use this box information,
for example, a classify filter will recognize the car number after the car plate is detected,
another filter can apply 'beauty' if a face is detected, and updated drawbox filter (in plan)
can draw the box for visualization, and a new filter such as bbox_to_roi can be added
to apply roi encoding for the detected result.

It is possible that some others will use it,
for example, the new codec is adding AI labels and so libavcodec might need it in the future,
and a user program might do something special such as:
1. use libavcodec to decode
2. use filter detect
3. write his own code to handle the detect result

As the first step, how about to put it in the libavfilter (so do not expose it
at API level and we are free to change it when needed)? And we can move
it to libavutil once it is required.

> 
> How tied is this to the DNN implementation, and hence the DNN name?  If
> someone made a standalone filter doing object detection by some other
> method, would it make sense for them to reuse this structure?

Yes, this structure is general, I add dnn prefix because of two reasons:
1. There's already bounding box in libavfilter/bbox.h, see below, it's simple and we could
not reuse it, so we need a new name.
typedef struct FFBoundingBox {
    int x1, x2, y1, y2;
} FFBoundingBox;

2. DNN is currently the dominate method for object detection.

How about to use 'struct BoudingBox' when it is under libavfilter (added into current file bbox.h),
and rename to 'AVBoudingBox' when it is needed to move to libavutil?

> 
> > diff --git a/libavutil/dnn_bbox.h b/libavutil/dnn_bbox.h new file mode
> > 100644 index 0000000000..50899c4486
> > --- /dev/null
> > +++ b/libavutil/dnn_bbox.h
> > +
> > +    /**
> > +     * Object detection is usually applied to a smaller image that
> > +     * is scaled down from the original frame.
> > +     * width and height are attributes of the scaled image, in pixel.
> > +     */
> > +    int model_input_width;
> > +    int model_input_height;
> 
> Other than to interpret the distances below, what will the user do with this
> information?  (Alternatively: why not map the distances back onto the
> original frame size?)

My idea was: if the original frame is scaled sometime later, the side data (bbox)
keeps correct without any modification.

If we use the distance on the original frame size, shall we say the behavior
is undefined when it is scaled sometime later?

> 
> > +
> > +    /**
> > +     * Distance in pixels from the top edge of the scaled image to top
> > +     * and bottom, and from the left edge of the scaled image to left and
> > +     * right, defining the bounding box.
> > +     */
> > +    int top;
> > +    int left;
> > +    int bottom;
> > +    int right;
> > +
> > +    /**
> > +     * Detect result
> > +     */
> > +    int detect_label;
> 
> How does a user interpret this label?  Is it from some known enum?

The mappings between label_id (int) and label_name (string) are not part of
the model file, it is usually another file provided together with the model file.
The mappings are specific with the model file.

My idea was to provide the mapping file only when it is needed, for example,
when draw the box and text with filter drawbox/drawtext to visualize the bounding box.

If we don't care the size in side_data, we can add label name into this struct,
for example.
#define BBOX_LABEL_NAME_MAX_LENGTH 32
int detect_label_id;
char detect_label_name[BBOX_LABEL_NAME_MAX_LENGTH+1];
int classify_label_ids[AV_NUM_BBOX_CLASSIFY];
char classify_label_names[AV_NUM_BBOX_CLASSIFY][ BBOX_LABEL_NAME_MAX_LENGTH+1]

> 
> > +    AVRational detect_conf;
> 
> "conf"... idence?  A longer name and a descriptive comment might help.

sure, will change to detect_confidence and classify_confidences[]

> 
> > +
> > +    /**
> > +     * At most 4 classifications based on the detected bounding box.
> > +     * For example, we can get max 4 different attributes with 4 different
> > +     * DNN models on one bounding box.
> > +     * classify_count is zero if no classification.
> > +     */
> > +#define AV_NUM_BBOX_CLASSIFY 4
> > +    uint32_t classify_count;
> > +    int classify_labels[AV_NUM_BBOX_CLASSIFY];
> > +    AVRational classify_confs[AV_NUM_BBOX_CLASSIFY];
> 
> Same comment on these.

and, Happy Chinese New Year!

Thanks
Yejun
Mark Thompson Feb. 15, 2021, 11:47 p.m. UTC | #3
On 11/02/2021 08:15, Guo, Yejun wrote:
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Mark
>> Thompson
>> Sent: 2021年2月11日 6:19
>> To: ffmpeg-devel@ffmpeg.org
>> Subject: Re: [FFmpeg-devel] [PATCH V2 08/10] libavutil: add side data
>> AVDnnBoundingBox for dnn based detect/classify filters
>>
>> On 10/02/2021 09:34, Guo, Yejun wrote:
>>> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
>>> ---
>>>    doc/APIchanges       |  2 ++
>>>    libavutil/Makefile   |  1 +
>>>    libavutil/dnn_bbox.h | 68
>> ++++++++++++++++++++++++++++++++++++++++++++
>>>    libavutil/frame.c    |  1 +
>>>    libavutil/frame.h    |  7 +++++
>>>    libavutil/version.h  |  2 +-
>>>    6 files changed, 80 insertions(+), 1 deletion(-)
>>>    create mode 100644 libavutil/dnn_bbox.h
>>
>> What is the intended consumer of this box information?  (Is there some other
>> filter which will read these are do something with them, or some sort of user
>> program?)
>>
>> If there is no use in ffmpeg outside libavfilter then the header should probably
>> be in libavfilter.
> 
> 
> Thanks for the feedback.
> 
> For most case, other filters will use this box information,
> for example, a classify filter will recognize the car number after the car plate is detected,
> another filter can apply 'beauty' if a face is detected, and updated drawbox filter (in plan)
> can draw the box for visualization, and a new filter such as bbox_to_roi can be added
> to apply roi encoding for the detected result.
> 
> It is possible that some others will use it,
> for example, the new codec is adding AI labels and so libavcodec might need it in the future,
> and a user program might do something special such as:
> 1. use libavcodec to decode
> 2. use filter detect
> 3. write his own code to handle the detect result
> 
> As the first step, how about to put it in the libavfilter (so do not expose it
> at API level and we are free to change it when needed)? And we can move
> it to libavutil once it is required.

Sure.

>> How tied is this to the DNN implementation, and hence the DNN name?  If
>> someone made a standalone filter doing object detection by some other
>> method, would it make sense for them to reuse this structure?
> 
> Yes, this structure is general, I add dnn prefix because of two reasons:
> 1. There's already bounding box in libavfilter/bbox.h, see below, it's simple and we could
> not reuse it, so we need a new name.
> typedef struct FFBoundingBox {
>      int x1, x2, y1, y2;
> } FFBoundingBox;

Right, really this is just the return type for the internal ff_calculate_bounding_box() function - if you want to reuse the name externally then it would be fine to rename the existing stuff to get it out of the way.

> 2. DNN is currently the dominate method for object detection.

Unless your ID values or something else about the output are DNN-specific then I'm not really seeing the attraction of associating them with the DNN name for external use.  If a user wants to detect some objects in an image and then do something with the result then maybe they know they are using DNN for first step, but they won't care about where the result came from after that.

> How about to use 'struct BoudingBox' when it is under libavfilter (added into current file bbox.h),
> and rename to 'AVBoudingBox' when it is needed to move to libavutil?

Hmm.  What are we actually representing here?  It's not just a bounding box because the structure is also attaching a specific meaning to it - that we think the bounding box corresponds to the location of some particular object in the image data.

Given that, maybe we would be better with something like "ObjectLocation"?  (Just an idea, feel free to ignore me and suggest something else.)

>>> diff --git a/libavutil/dnn_bbox.h b/libavutil/dnn_bbox.h new file mode
>>> 100644 index 0000000000..50899c4486
>>> --- /dev/null
>>> +++ b/libavutil/dnn_bbox.h
>>> +
>>> +    /**
>>> +     * Object detection is usually applied to a smaller image that
>>> +     * is scaled down from the original frame.
>>> +     * width and height are attributes of the scaled image, in pixel.
>>> +     */
>>> +    int model_input_width;
>>> +    int model_input_height;
>>
>> Other than to interpret the distances below, what will the user do with this
>> information?  (Alternatively: why not map the distances back onto the
>> original frame size?)
> 
> My idea was: if the original frame is scaled sometime later, the side data (bbox)
> keeps correct without any modification.
> 
> If we use the distance on the original frame size, shall we say the behavior
> is undefined when it is scaled sometime later?

This probably shouldn't be a specific concern - many other existing side-data types are invalidated by scaling (or other non-scaling filtering transformations, for that matter).

Unless there is an independent reason to keep the model size (which there might well be, I don't know) then I think it would be better to use the actual frame size.

>>> +
>>> +    /**
>>> +     * Distance in pixels from the top edge of the scaled image to top
>>> +     * and bottom, and from the left edge of the scaled image to left and
>>> +     * right, defining the bounding box.
>>> +     */
>>> +    int top;
>>> +    int left;
>>> +    int bottom;
>>> +    int right;
>>> +
>>> +    /**
>>> +     * Detect result
>>> +     */
>>> +    int detect_label;
>>
>> How does a user interpret this label?  Is it from some known enum?
> 
> The mappings between label_id (int) and label_name (string) are not part of
> the model file, it is usually another file provided together with the model file.
> The mappings are specific with the model file.
> 
> My idea was to provide the mapping file only when it is needed, for example,
> when draw the box and text with filter drawbox/drawtext to visualize the bounding box.

So every component involved would need to be separately supplied with the same mapping information?

> If we don't care the size in side_data, we can add label name into this struct,
> for example.
> #define BBOX_LABEL_NAME_MAX_LENGTH 32
> int detect_label_id;
> char detect_label_name[BBOX_LABEL_NAME_MAX_LENGTH+1];
> int classify_label_ids[AV_NUM_BBOX_CLASSIFY];
> char classify_label_names[AV_NUM_BBOX_CLASSIFY][ BBOX_LABEL_NAME_MAX_LENGTH+1]

Assuming the overhead isn't excessive and there isn't other useful metadata in the model-information file, supplying the names inline does seem likely to be less confusing for users so they don't have to be careful with the mapping information.

(On top of that - do the IDs do anything at all if you have the names?)

Thanks,

- Mark
Guo, Yejun Feb. 16, 2021, 10:37 a.m. UTC | #4
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Mark
> Thompson
> Sent: 2021年2月16日 7:48
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH V2 08/10] libavutil: add side data
> AVDnnBoundingBox for dnn based detect/classify filters
> 
> On 11/02/2021 08:15, Guo, Yejun wrote:
> >> -----Original Message-----
> >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> >> Mark Thompson
> >> Sent: 2021年2月11日 6:19
> >> To: ffmpeg-devel@ffmpeg.org
> >> Subject: Re: [FFmpeg-devel] [PATCH V2 08/10] libavutil: add side data
> >> AVDnnBoundingBox for dnn based detect/classify filters
> >>
> >> On 10/02/2021 09:34, Guo, Yejun wrote:
> >>> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> >>> ---
> >>>    doc/APIchanges       |  2 ++
> >>>    libavutil/Makefile   |  1 +
> >>>    libavutil/dnn_bbox.h | 68
> >> ++++++++++++++++++++++++++++++++++++++++++++
> >>>    libavutil/frame.c    |  1 +
> >>>    libavutil/frame.h    |  7 +++++
> >>>    libavutil/version.h  |  2 +-
> >>>    6 files changed, 80 insertions(+), 1 deletion(-)
> >>>    create mode 100644 libavutil/dnn_bbox.h
> >>
> >> What is the intended consumer of this box information?  (Is there
> >> some other filter which will read these are do something with them,
> >> or some sort of user
> >> program?)
> >>
> >> If there is no use in ffmpeg outside libavfilter then the header
> >> should probably be in libavfilter.
> >
> >
> > Thanks for the feedback.
> >
> > For most case, other filters will use this box information, for
> > example, a classify filter will recognize the car number after the car
> > plate is detected, another filter can apply 'beauty' if a face is
> > detected, and updated drawbox filter (in plan) can draw the box for
> > visualization, and a new filter such as bbox_to_roi can be added to apply roi
> encoding for the detected result.
> >
> > It is possible that some others will use it, for example, the new
> > codec is adding AI labels and so libavcodec might need it in the
> > future, and a user program might do something special such as:
> > 1. use libavcodec to decode
> > 2. use filter detect
> > 3. write his own code to handle the detect result
> >
> > As the first step, how about to put it in the libavfilter (so do not
> > expose it at API level and we are free to change it when needed)? And
> > we can move it to libavutil once it is required.
> 
> Sure.
> 
> >> How tied is this to the DNN implementation, and hence the DNN name?
> >> If someone made a standalone filter doing object detection by some
> >> other method, would it make sense for them to reuse this structure?
> >
> > Yes, this structure is general, I add dnn prefix because of two reasons:
> > 1. There's already bounding box in libavfilter/bbox.h, see below, it's
> > simple and we could not reuse it, so we need a new name.
> > typedef struct FFBoundingBox {
> >      int x1, x2, y1, y2;
> > } FFBoundingBox;
> 
> Right, really this is just the return type for the internal
> ff_calculate_bounding_box() function - if you want to reuse the name
> externally then it would be fine to rename the existing stuff to get it out of the
> way.

yeah, I'll consider to rename it after these patches are done, since they now
are not conflict from compiler's perspective. 

> 
> > 2. DNN is currently the dominate method for object detection.
> 
> Unless your ID values or something else about the output are DNN-specific
> then I'm not really seeing the attraction of associating them with the DNN
> name for external use.  If a user wants to detect some objects in an image and
> then do something with the result then maybe they know they are using DNN
> for first step, but they won't care about where the result came from after that.

It reminds me that we might need to save some information such as model name,
name of data set trained, name/parameters of other non-dnn implementations etc.,
and so the user knows better about the bbox. For example, we can add 'char source[128]'
as box header for all the bboxes.

I'll think about it and send new patches for the side data and detect filter.

> 
> > How about to use 'struct BoudingBox' when it is under libavfilter
> > (added into current file bbox.h), and rename to 'AVBoudingBox' when it is
> needed to move to libavutil?
> 
> Hmm.  What are we actually representing here?  It's not just a bounding box
> because the structure is also attaching a specific meaning to it - that we think
> the bounding box corresponds to the location of some particular object in the
> image data.
> 
> Given that, maybe we would be better with something like "ObjectLocation"?
> (Just an idea, feel free to ignore me and suggest something else.)

For the current papers of object objection, they all use the term bounding box, 
that' the reason that I think we'd better use bounding box.

> 
> >>> diff --git a/libavutil/dnn_bbox.h b/libavutil/dnn_bbox.h new file
> >>> mode
> >>> 100644 index 0000000000..50899c4486
> >>> --- /dev/null
> >>> +++ b/libavutil/dnn_bbox.h
> >>> +
> >>> +    /**
> >>> +     * Object detection is usually applied to a smaller image that
> >>> +     * is scaled down from the original frame.
> >>> +     * width and height are attributes of the scaled image, in pixel.
> >>> +     */
> >>> +    int model_input_width;
> >>> +    int model_input_height;
> >>
> >> Other than to interpret the distances below, what will the user do
> >> with this information?  (Alternatively: why not map the distances
> >> back onto the original frame size?)
> >
> > My idea was: if the original frame is scaled sometime later, the side
> > data (bbox) keeps correct without any modification.
> >
> > If we use the distance on the original frame size, shall we say the
> > behavior is undefined when it is scaled sometime later?
> 
> This probably shouldn't be a specific concern - many other existing side-data
> types are invalidated by scaling (or other non-scaling filtering transformations,
> for that matter).
> 
> Unless there is an independent reason to keep the model size (which there
> might well be, I don't know) then I think it would be better to use the actual
> frame size.

sure, i'll switch to the actual frame size.

> 
> >>> +
> >>> +    /**
> >>> +     * Distance in pixels from the top edge of the scaled image to top
> >>> +     * and bottom, and from the left edge of the scaled image to left
> and
> >>> +     * right, defining the bounding box.
> >>> +     */
> >>> +    int top;
> >>> +    int left;
> >>> +    int bottom;
> >>> +    int right;
> >>> +
> >>> +    /**
> >>> +     * Detect result
> >>> +     */
> >>> +    int detect_label;
> >>
> >> How does a user interpret this label?  Is it from some known enum?
> >
> > The mappings between label_id (int) and label_name (string) are not
> > part of the model file, it is usually another file provided together with the
> model file.
> > The mappings are specific with the model file.
> >
> > My idea was to provide the mapping file only when it is needed, for
> > example, when draw the box and text with filter drawbox/drawtext to
> visualize the bounding box.
> 
> So every component involved would need to be separately supplied with the
> same mapping information?
> 
> > If we don't care the size in side_data, we can add label name into
> > this struct, for example.
> > #define BBOX_LABEL_NAME_MAX_LENGTH 32
> > int detect_label_id;
> > char detect_label_name[BBOX_LABEL_NAME_MAX_LENGTH+1];
> > int classify_label_ids[AV_NUM_BBOX_CLASSIFY];
> > char classify_label_names[AV_NUM_BBOX_CLASSIFY][
> > BBOX_LABEL_NAME_MAX_LENGTH+1]
> 
> Assuming the overhead isn't excessive and there isn't other useful metadata in
> the model-information file, supplying the names inline does seem likely to be
> less confusing for users so they don't have to be careful with the mapping
> information.
> 
> (On top of that - do the IDs do anything at all if you have the names?)

yes, the label names are more straight forward and less confusing. I'll add them.

We can do int comparison if we have label_id (int), otherwise, we'll do strncmp
with label_name (string). Looks it is trivial, I'll remove the IDs.

Thanks
Yejun
Guo, Yejun Feb. 17, 2021, 1:46 a.m. UTC | #5
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Guo,
> Yejun
> Sent: 2021年2月16日 18:37
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V2 08/10] libavutil: add side data
> AVDnnBoundingBox for dnn based detect/classify filters
> 
> 
> 
> > -----Original Message-----
> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Mark
> > Thompson
> > Sent: 2021年2月16日 7:48
> > To: ffmpeg-devel@ffmpeg.org
> > Subject: Re: [FFmpeg-devel] [PATCH V2 08/10] libavutil: add side data
> > AVDnnBoundingBox for dnn based detect/classify filters
> >
> > On 11/02/2021 08:15, Guo, Yejun wrote:
> > >> -----Original Message-----
> > >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> > >> Mark Thompson
> > >> Sent: 2021年2月11日 6:19
> > >> To: ffmpeg-devel@ffmpeg.org
> > >> Subject: Re: [FFmpeg-devel] [PATCH V2 08/10] libavutil: add side
> > >> data AVDnnBoundingBox for dnn based detect/classify filters
> > >>
> > >> On 10/02/2021 09:34, Guo, Yejun wrote:
> > >>> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> > >>> ---
> > >>>    doc/APIchanges       |  2 ++
> > >>>    libavutil/Makefile   |  1 +
> > >>>    libavutil/dnn_bbox.h | 68
> > >> ++++++++++++++++++++++++++++++++++++++++++++
> > >>>    libavutil/frame.c    |  1 +
> > >>>    libavutil/frame.h    |  7 +++++
> > >>>    libavutil/version.h  |  2 +-
> > >>>    6 files changed, 80 insertions(+), 1 deletion(-)
> > >>>    create mode 100644 libavutil/dnn_bbox.h
> > >>
> > >> What is the intended consumer of this box information?  (Is there
> > >> some other filter which will read these are do something with them,
> > >> or some sort of user
> > >> program?)
> > >>
> > >> If there is no use in ffmpeg outside libavfilter then the header
> > >> should probably be in libavfilter.
> > >
> > >
> > > Thanks for the feedback.
> > >
> > > For most case, other filters will use this box information, for
> > > example, a classify filter will recognize the car number after the
> > > car plate is detected, another filter can apply 'beauty' if a face
> > > is detected, and updated drawbox filter (in plan) can draw the box
> > > for visualization, and a new filter such as bbox_to_roi can be added
> > > to apply roi
> > encoding for the detected result.
> > >
> > > It is possible that some others will use it, for example, the new
> > > codec is adding AI labels and so libavcodec might need it in the
> > > future, and a user program might do something special such as:
> > > 1. use libavcodec to decode
> > > 2. use filter detect
> > > 3. write his own code to handle the detect result
> > >
> > > As the first step, how about to put it in the libavfilter (so do not
> > > expose it at API level and we are free to change it when needed)?
> > > And we can move it to libavutil once it is required.
> >
> > Sure.
> >
> > >> How tied is this to the DNN implementation, and hence the DNN name?
> > >> If someone made a standalone filter doing object detection by some
> > >> other method, would it make sense for them to reuse this structure?
> > >
> > > Yes, this structure is general, I add dnn prefix because of two reasons:
> > > 1. There's already bounding box in libavfilter/bbox.h, see below,
> > > it's simple and we could not reuse it, so we need a new name.
> > > typedef struct FFBoundingBox {
> > >      int x1, x2, y1, y2;
> > > } FFBoundingBox;
> >
> > Right, really this is just the return type for the internal
> > ff_calculate_bounding_box() function - if you want to reuse the name
> > externally then it would be fine to rename the existing stuff to get
> > it out of the way.
> 
> yeah, I'll consider to rename it after these patches are done, since they now
> are not conflict from compiler's perspective.
> 
> >
> > > 2. DNN is currently the dominate method for object detection.
> >
> > Unless your ID values or something else about the output are
> > DNN-specific then I'm not really seeing the attraction of associating
> > them with the DNN name for external use.  If a user wants to detect
> > some objects in an image and then do something with the result then
> > maybe they know they are using DNN for first step, but they won't care
> about where the result came from after that.
> 
> It reminds me that we might need to save some information such as model
> name, name of data set trained, name/parameters of other non-dnn
> implementations etc., and so the user knows better about the bbox. For
> example, we can add 'char source[128]'
> as box header for all the bboxes.
> 
> I'll think about it and send new patches for the side data and detect filter.

hi, I'll push the first 7 patches of this patch set tomorrow if there's no other
comment for them, and then send new patch set for side data and detect filter,
thanks.
diff mbox series

Patch

diff --git a/doc/APIchanges b/doc/APIchanges
index 1332694820..7cbfa9fafa 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -14,6 +14,8 @@  libavutil:     2017-10-21
 
 
 API changes, most recent first:
+2021-02-08 - xxxxxxxxxx - lavu 56.65.100 - frame.h
+  Add AV_FRAME_DATA_DNN_BBOXES
 
 2021-01-26 - xxxxxxxxxx - lavu 56.64.100 - common.h
   Add FFABSU()
diff --git a/libavutil/Makefile b/libavutil/Makefile
index 27bafe9e12..b99cb7084f 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -23,6 +23,7 @@  HEADERS = adler32.h                                                     \
           des.h                                                         \
           dict.h                                                        \
           display.h                                                     \
+          dnn_bbox.h                                                    \
           dovi_meta.h                                                   \
           downmix_info.h                                                \
           encryption_info.h                                             \
diff --git a/libavutil/dnn_bbox.h b/libavutil/dnn_bbox.h
new file mode 100644
index 0000000000..50899c4486
--- /dev/null
+++ b/libavutil/dnn_bbox.h
@@ -0,0 +1,68 @@ 
+/*
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_DNN_BBOX_H
+#define AVUTIL_DNN_BBOX_H
+
+#include "rational.h"
+
+typedef struct AVDnnBoundingBox {
+    /**
+     * Must be set to the size of this data structure (that is,
+     * sizeof(AVDnnBoundingBox)).
+     */
+    uint32_t self_size;
+
+    /**
+     * Object detection is usually applied to a smaller image that
+     * is scaled down from the original frame.
+     * width and height are attributes of the scaled image, in pixel.
+     */
+    int model_input_width;
+    int model_input_height;
+
+    /**
+     * Distance in pixels from the top edge of the scaled image to top
+     * and bottom, and from the left edge of the scaled image to left and
+     * right, defining the bounding box.
+     */
+    int top;
+    int left;
+    int bottom;
+    int right;
+
+    /**
+     * Detect result
+     */
+    int detect_label;
+    AVRational detect_conf;
+
+    /**
+     * At most 4 classifications based on the detected bounding box.
+     * For example, we can get max 4 different attributes with 4 different
+     * DNN models on one bounding box.
+     * classify_count is zero if no classification.
+     */
+#define AV_NUM_BBOX_CLASSIFY 4
+    uint32_t classify_count;
+    int classify_labels[AV_NUM_BBOX_CLASSIFY];
+    AVRational classify_confs[AV_NUM_BBOX_CLASSIFY];
+} AVDnnBoundingBox;
+
+#endif
diff --git a/libavutil/frame.c b/libavutil/frame.c
index eab51b6a32..4308507827 100644
--- a/libavutil/frame.c
+++ b/libavutil/frame.c
@@ -852,6 +852,7 @@  const char *av_frame_side_data_name(enum AVFrameSideDataType type)
     case AV_FRAME_DATA_VIDEO_ENC_PARAMS:            return "Video encoding parameters";
     case AV_FRAME_DATA_SEI_UNREGISTERED:            return "H.26[45] User Data Unregistered SEI message";
     case AV_FRAME_DATA_FILM_GRAIN_PARAMS:           return "Film grain parameters";
+    case AV_FRAME_DATA_DNN_BBOXES:                  return "DNN bounding boxes";
     }
     return NULL;
 }
diff --git a/libavutil/frame.h b/libavutil/frame.h
index 1aeafef6de..a4dcfd27c9 100644
--- a/libavutil/frame.h
+++ b/libavutil/frame.h
@@ -198,6 +198,13 @@  enum AVFrameSideDataType {
      * Must be present for every frame which should have film grain applied.
      */
     AV_FRAME_DATA_FILM_GRAIN_PARAMS,
+
+    /**
+     * Bounding box generated by dnn based filters for object detection and classification,
+     * the data is an array of AVDnnBoudingBox, the number of array element is implied by
+     * AVFrameSideData.size / AVDnnBoudingBox.self_size.
+     */
+    AV_FRAME_DATA_DNN_BBOXES,
 };
 
 enum AVActiveFormatDescription {
diff --git a/libavutil/version.h b/libavutil/version.h
index 21136e6cb7..b2165754f9 100644
--- a/libavutil/version.h
+++ b/libavutil/version.h
@@ -79,7 +79,7 @@ 
  */
 
 #define LIBAVUTIL_VERSION_MAJOR  56
-#define LIBAVUTIL_VERSION_MINOR  64
+#define LIBAVUTIL_VERSION_MINOR  65
 #define LIBAVUTIL_VERSION_MICRO 100
 
 #define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \