diff mbox series

[FFmpeg-devel,V6,4/6] lavu: add side data AV_FRAME_DATA_BOUNDING_BOXES

Message ID 20210326080931.2952-4-yejun.guo@intel.com
State Superseded
Headers show
Series [FFmpeg-devel,V6,1/6] lavfi/dnn_backend_openvino.c: only allow DFT_PROCESS_FRAME to get output dim | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Guo, Yejun March 26, 2021, 8:09 a.m. UTC
Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
---
 doc/APIchanges          |  2 ++
 libavutil/Makefile      |  1 +
 libavutil/boundingbox.h | 79 +++++++++++++++++++++++++++++++++++++++++
 libavutil/frame.c       |  1 +
 libavutil/frame.h       |  7 ++++
 5 files changed, 90 insertions(+)
 create mode 100644 libavutil/boundingbox.h

Comments

Anton Khirnov April 1, 2021, 8:13 a.m. UTC | #1
Quoting Guo, Yejun (2021-03-26 09:09:29)
> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> ---
>  doc/APIchanges          |  2 ++
>  libavutil/Makefile      |  1 +
>  libavutil/boundingbox.h | 79 +++++++++++++++++++++++++++++++++++++++++
>  libavutil/frame.c       |  1 +
>  libavutil/frame.h       |  7 ++++
>  5 files changed, 90 insertions(+)
>  create mode 100644 libavutil/boundingbox.h
> 
> diff --git a/doc/APIchanges b/doc/APIchanges
> index b41dadee8d..160b020d9d 100644
> --- a/doc/APIchanges
> +++ b/doc/APIchanges
> @@ -14,6 +14,8 @@ libavutil:     2017-10-21
>  
>  
>  API changes, most recent first:
> +2021-03-xx - xxxxxxxxxx - lavu 56.xx.100 - frame.h boundingbox.h
> +  Add AV_FRAME_DATA_BOUNDING_BOXES
>  
>  2021-03-21 - xxxxxxxxxx - lavu 56.72.100 - frame.h
>    Deprecated av_get_colorspace_name().
> diff --git a/libavutil/Makefile b/libavutil/Makefile
> index 27bafe9e12..7e1dc713ac 100644
> --- a/libavutil/Makefile
> +++ b/libavutil/Makefile
> @@ -11,6 +11,7 @@ HEADERS = adler32.h                                                     \
>            avutil.h                                                      \
>            base64.h                                                      \
>            blowfish.h                                                    \
> +          boundingbox.h                                                 \
>            bprint.h                                                      \
>            bswap.h                                                       \
>            buffer.h                                                      \
> diff --git a/libavutil/boundingbox.h b/libavutil/boundingbox.h
> new file mode 100644
> index 0000000000..5e2fdd55e9
> --- /dev/null
> +++ b/libavutil/boundingbox.h
> @@ -0,0 +1,79 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVUTIL_BOUNDINGBOX_H
> +#define AVUTIL_BOUNDINGBOX_H
> +
> +#include "libavutil/rational.h"
> +
> +typedef struct AVBoundingBox {
> +    /**
> +     * Distance in pixels from the top edge of the frame to top
> +     * and bottom, and from the left edge of the frame to left and
> +     * right, defining the bounding box.
> +     */
> +    int top;
> +    int left;
> +    int bottom;
> +    int right;

Is this allowed to be negative? Can/should this be size_t?

> +
> +#define AV_BBOX_LABEL_NAME_MAX_SIZE 32
> +
> +    /**
> +     * Detect result with confidence
> +     */
> +    char detect_label[AV_BBOX_LABEL_NAME_MAX_SIZE];
> +    AVRational detect_confidence;
> +
> +    /**
> +     * At most 4 classifications based on the detected bounding box.
> +     * For example, we can get max 4 different attributes with 4 different
> +     * DNN models on one bounding box.
> +     * classify_count is zero if no classification.
> +     */
> +#define AV_NUM_BBOX_CLASSIFY 4
> +    uint32_t classify_count;

size_t?

> +    char classify_labels[AV_NUM_BBOX_CLASSIFY][AV_BBOX_LABEL_NAME_MAX_SIZE];
> +    AVRational classify_confidences[AV_NUM_BBOX_CLASSIFY];
> +} AVBoundingBox;
> +
> +typedef struct AVBoundingBoxHeader {

This structure is not extensible in an ABI-compatible way.

> +    /**
> +     * Information about how the bounding box is generated.
> +     * for example, the DNN model name.
> +     */
> +    char source[128];
> +
> +    /**
> +     * The size of frame when it is detected.
> +     */
> +    int frame_width;
> +    int frame_height;

Why?

> +
> +    /**
> +     * Number of bounding boxes.
> +     */
> +    uint32_t nb_bbox;

size_t?
Guo, Yejun April 1, 2021, 11:31 a.m. UTC | #2
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Anton
> Khirnov
> Sent: 2021年4月1日 16:13
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V6 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> Quoting Guo, Yejun (2021-03-26 09:09:29)
> > Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> > ---
> >  doc/APIchanges          |  2 ++
> >  libavutil/Makefile      |  1 +
> >  libavutil/boundingbox.h | 79
> +++++++++++++++++++++++++++++++++++++++++
> >  libavutil/frame.c       |  1 +
> >  libavutil/frame.h       |  7 ++++
> >  5 files changed, 90 insertions(+)
> > +
> > +#ifndef AVUTIL_BOUNDINGBOX_H
> > +#define AVUTIL_BOUNDINGBOX_H
> > +
> > +#include "libavutil/rational.h"
> > +
> > +typedef struct AVBoundingBox {
> > +    /**
> > +     * Distance in pixels from the top edge of the frame to top
> > +     * and bottom, and from the left edge of the frame to left and
> > +     * right, defining the bounding box.
> > +     */
> > +    int top;
> > +    int left;
> > +    int bottom;
> > +    int right;
> 
> Is this allowed to be negative? Can/should this be size_t?

There was a long discussion about size_t/int/uint32_t when I added
struct AVRegionOfInterest in frame.h, and finally size_t is not chosen.

> 
> > +
> > +#define AV_BBOX_LABEL_NAME_MAX_SIZE 32
> > +
> > +    /**
> > +     * Detect result with confidence
> > +     */
> > +    char detect_label[AV_BBOX_LABEL_NAME_MAX_SIZE];
> > +    AVRational detect_confidence;
> > +
> > +    /**
> > +     * At most 4 classifications based on the detected bounding box.
> > +     * For example, we can get max 4 different attributes with 4 different
> > +     * DNN models on one bounding box.
> > +     * classify_count is zero if no classification.
> > +     */
> > +#define AV_NUM_BBOX_CLASSIFY 4
> > +    uint32_t classify_count;
> 
> size_t?
> 
> > +    char
> classify_labels[AV_NUM_BBOX_CLASSIFY][AV_BBOX_LABEL_NAME_MAX_SIZE];
> > +    AVRational classify_confidences[AV_NUM_BBOX_CLASSIFY];
> > +} AVBoundingBox;
> > +
> > +typedef struct AVBoundingBoxHeader {
> 
> This structure is not extensible in an ABI-compatible way.

yes, we can add a version number (enum AVBBoxHeaderVersion, see below) at the
beginning of the struct, and the user needs to set it with AV_BBOX_HEADER_CURRENT.
It is not elegant, is this what we really want?

enum AVBBoxHeaderVersion {
  AV_BBOX_HEADER_NONE = 0,
  AV_BBOX_HEADER_V1,
  AV_BBOX_HEADER_CURRENT = AV_BBOX_HEADER_V1,
};

typedef struct AVBoundingBoxHeader {
  enum AVBBoxHeaderVersion version;
  union {
    AVBoundingBoxHeaderV1 v1;
  } header;
} AVBoundingBoxHeader;


We have carefully thought/discussed about the struct for months, and there's
little chance for later update, imho.


> 
> > +    /**
> > +     * Information about how the bounding box is generated.
> > +     * for example, the DNN model name.
> > +     */
> > +    char source[128];
> > +
> > +    /**
> > +     * The size of frame when it is detected.
> > +     */
> > +    int frame_width;
> > +    int frame_height;
> 
> Why?

so we know if the value of top/left/bottom/right in AVBoundingBox
can be used directly in other later filters.

for example, the filter chain is: dnn_detect -> scale -> dnn_classify (in plan)

In filter dnn_classify, it needs to check the frame size to know if the values
in bounding boxes (the detection result) are still valid.

> 
> > +
> > +    /**
> > +     * Number of bounding boxes.
> > +     */
> > +    uint32_t nb_bbox;
> 
> size_t?
> 
> --
> Anton Khirnov
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org
> with subject "unsubscribe".
Guo, Yejun April 2, 2021, 2:27 a.m. UTC | #3
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Guo,
> Yejun
> Sent: 2021年4月1日 19:32
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V6 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> 
> 
> > -----Original Message-----
> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Anton
> > Khirnov
> > Sent: 2021年4月1日 16:13
> > To: FFmpeg development discussions and patches
> <ffmpeg-devel@ffmpeg.org>
> > Subject: Re: [FFmpeg-devel] [PATCH V6 4/6] lavu: add side data
> > AV_FRAME_DATA_BOUNDING_BOXES
> >
> > Quoting Guo, Yejun (2021-03-26 09:09:29)
> > > Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> > > ---
> > >  doc/APIchanges          |  2 ++
> > >  libavutil/Makefile      |  1 +
> > >  libavutil/boundingbox.h | 79
> > +++++++++++++++++++++++++++++++++++++++++
> > >  libavutil/frame.c       |  1 +
> > >  libavutil/frame.h       |  7 ++++
> > >  5 files changed, 90 insertions(+)
> > > +
> > > +#ifndef AVUTIL_BOUNDINGBOX_H
> > > +#define AVUTIL_BOUNDINGBOX_H
> > > +
> > > +#include "libavutil/rational.h"
> > > +
> > > +typedef struct AVBoundingBox {
> > > +    /**
> > > +     * Distance in pixels from the top edge of the frame to top
> > > +     * and bottom, and from the left edge of the frame to left and
> > > +     * right, defining the bounding box.
> > > +     */
> > > +    int top;
> > > +    int left;
> > > +    int bottom;
> > > +    int right;
> >
> > Is this allowed to be negative? Can/should this be size_t?
> 
> There was a long discussion about size_t/int/uint32_t when I added
> struct AVRegionOfInterest in frame.h, and finally size_t is not chosen.
> 
> >
> > > +
> > > +#define AV_BBOX_LABEL_NAME_MAX_SIZE 32
> > > +
> > > +    /**
> > > +     * Detect result with confidence
> > > +     */
> > > +    char detect_label[AV_BBOX_LABEL_NAME_MAX_SIZE];
> > > +    AVRational detect_confidence;
> > > +
> > > +    /**
> > > +     * At most 4 classifications based on the detected bounding box.
> > > +     * For example, we can get max 4 different attributes with 4 different
> > > +     * DNN models on one bounding box.
> > > +     * classify_count is zero if no classification.
> > > +     */
> > > +#define AV_NUM_BBOX_CLASSIFY 4
> > > +    uint32_t classify_count;
> >
> > size_t?
> >
> > > +    char
> >
> classify_labels[AV_NUM_BBOX_CLASSIFY][AV_BBOX_LABEL_NAME_MAX_SIZE];
> > > +    AVRational classify_confidences[AV_NUM_BBOX_CLASSIFY];
> > > +} AVBoundingBox;
> > > +
> > > +typedef struct AVBoundingBoxHeader {
> >
> > This structure is not extensible in an ABI-compatible way.
> 
> yes, we can add a version number (enum AVBBoxHeaderVersion, see below) at
> the
> beginning of the struct, and the user needs to set it with
> AV_BBOX_HEADER_CURRENT.
> It is not elegant, is this what we really want?
> 
> enum AVBBoxHeaderVersion {
>   AV_BBOX_HEADER_NONE = 0,
>   AV_BBOX_HEADER_V1,
>   AV_BBOX_HEADER_CURRENT = AV_BBOX_HEADER_V1,
> };
> 
> typedef struct AVBoundingBoxHeader {
>   enum AVBBoxHeaderVersion version;
>   union {
>     AVBoundingBoxHeaderV1 v1;
>   } header;
> } AVBoundingBoxHeader;
> 
> 
> We have carefully thought/discussed about the struct for months, and there's
> little chance for later update, imho.

And my opinion is to still use the code in this patch. For the worst case, if we need to
change the struct some time later, we still have a timing before the next release 4.5.

> 
> 
> >
> > > +    /**
> > > +     * Information about how the bounding box is generated.
> > > +     * for example, the DNN model name.
> > > +     */
> > > +    char source[128];
> > > +
> > > +    /**
> > > +     * The size of frame when it is detected.
> > > +     */
> > > +    int frame_width;
> > > +    int frame_height;
> >
> > Why?
> 
> so we know if the value of top/left/bottom/right in AVBoundingBox
> can be used directly in other later filters.
> 
> for example, the filter chain is: dnn_detect -> scale -> dnn_classify (in plan)
> 
> In filter dnn_classify, it needs to check the frame size to know if the values
> in bounding boxes (the detection result) are still valid.
> 
> >
> > > +
> > > +    /**
> > > +     * Number of bounding boxes.
> > > +     */
> > > +    uint32_t nb_bbox;
> >
> > size_t?
> >
> > --
> > Anton Khirnov
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org
> > with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Nicolas George April 4, 2021, 10:01 a.m. UTC | #4
Guo, Yejun (12021-04-01):
> > Is this allowed to be negative? Can/should this be size_t?
> There was a long discussion about size_t/int/uint32_t when I added
> struct AVRegionOfInterest in frame.h, and finally size_t is not chosen.

Then at least unsigned.

> yes, we can add a version number (enum AVBBoxHeaderVersion, see below) at the
> beginning of the struct, and the user needs to set it with AV_BBOX_HEADER_CURRENT.
> It is not elegant, is this what we really want?

A version number does not make the structure compatible. At best,
applications check the version and detect they do not support this one
and report an error. At worse, they do not bother to check and it causes
strange bugs. The version number does not help with compatibility, it
just makes it detectable.

To make the structure compatible, we have to ask ourselves: What happens
if we update it? What breaks compatibility? Once we have an answer, we
find a workaround.

In this case, what happens is that the variable array must be at the
end, and therefore its offset changes. And we cannot have a second
variable array (like the name! you had to set a constant size), and we
cannot update the type of the array elements.

And the solution becomes obvious: let us store the offsets in the
structure.

So, that would be:

	typedef struct AVBoundingBoxHeader {
	...
	    /**
	     * Offset of the array of bounding boxes.
	     */
	    size_t bboxes_offset;

	    /**
	     * Offset increment in the array of bounding boxes.
	     */
	    size_t bboxes_step;
	};

Note that with that solution, we do not need the empty array, we can do
this:

AVBoundingBoxHeader *ff_bounding_box_alloc(size_t nb_bbox)
{
    struct {
	AVBoundingBoxHeader header;
	AVBoundingBox boxes[nb_bbox];
    } *ret;
    ret = av_mallocz(sizeof(*ret));
    /* add error checks */
    ret->header->bboxes_offset = (char *)&ret->boxes - (char *)ret->header;
    ret->header->bboxes_step = sizeof(*ret->boxes);
}

#define AV_BOUNDING_BOX_GET(header, idx) \
    ((AVBoundingBox *)((char *)(header) + (header)->bboxes_offset + (idx) * (header)->bboxes_step))

You can do the same to lift the 128 limit on the name.

Regards,
James Almer April 4, 2021, 1:27 p.m. UTC | #5
On 4/4/2021 7:01 AM, Nicolas George wrote:
> Guo, Yejun (12021-04-01):
>>> Is this allowed to be negative? Can/should this be size_t?
>> There was a long discussion about size_t/int/uint32_t when I added
>> struct AVRegionOfInterest in frame.h, and finally size_t is not chosen.
> 
> Then at least unsigned.
> 
>> yes, we can add a version number (enum AVBBoxHeaderVersion, see below) at the
>> beginning of the struct, and the user needs to set it with AV_BBOX_HEADER_CURRENT.
>> It is not elegant, is this what we really want?
> 
> A version number does not make the structure compatible. At best,
> applications check the version and detect they do not support this one
> and report an error. At worse, they do not bother to check and it causes
> strange bugs. The version number does not help with compatibility, it
> just makes it detectable.
> 
> To make the structure compatible, we have to ask ourselves: What happens
> if we update it? What breaks compatibility? Once we have an answer, we
> find a workaround.
> 
> In this case, what happens is that the variable array must be at the
> end, and therefore its offset changes. And we cannot have a second
> variable array (like the name! you had to set a constant size), and we
> cannot update the type of the array elements.
> 
> And the solution becomes obvious: let us store the offsets in the
> structure.
> 
> So, that would be:
> 
> 	typedef struct AVBoundingBoxHeader {
> 	...
> 	    /**
> 	     * Offset of the array of bounding boxes.
> 	     */
> 	    size_t bboxes_offset;
> 
> 	    /**
> 	     * Offset increment in the array of bounding boxes.
> 	     */
> 	    size_t bboxes_step;
> 	};
> 
> Note that with that solution, we do not need the empty array, we can do
> this:
> 
> AVBoundingBoxHeader *ff_bounding_box_alloc(size_t nb_bbox)
> {
>      struct {
> 	AVBoundingBoxHeader header;
> 	AVBoundingBox boxes[nb_bbox];
>      } *ret;
>      ret = av_mallocz(sizeof(*ret));
>      /* add error checks */
>      ret->header->bboxes_offset = (char *)&ret->boxes - (char *)ret->header;
>      ret->header->bboxes_step = sizeof(*ret->boxes);
> }
> 
> #define AV_BOUNDING_BOX_GET(header, idx) \
>      ((AVBoundingBox *)((char *)(header) + (header)->bboxes_offset + (idx) * (header)->bboxes_step))

This solution is what was used for video_enc_params.h, so i agree it 
should be used here too. What's missing is a check for idx < nb_bbox 
before accessing the offset in question, so an inline function instead 
of a #define with a check for the above would be needed.

And since both are used as frame side data, it would be ideal that the 
signature for the public helpers on both are the same (The standard 
alloc, and the alloc + wrapping into frame side data ones).

> 
> You can do the same to lift the 128 limit on the name.
> 
> Regards,
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Andreas Rheinhardt April 4, 2021, 1:36 p.m. UTC | #6
Nicolas George:
> Guo, Yejun (12021-04-01):
>>> Is this allowed to be negative? Can/should this be size_t?
>> There was a long discussion about size_t/int/uint32_t when I added
>> struct AVRegionOfInterest in frame.h, and finally size_t is not chosen.
> 
> Then at least unsigned.
> 
>> yes, we can add a version number (enum AVBBoxHeaderVersion, see below) at the
>> beginning of the struct, and the user needs to set it with AV_BBOX_HEADER_CURRENT.
>> It is not elegant, is this what we really want?
> 
> A version number does not make the structure compatible. At best,
> applications check the version and detect they do not support this one
> and report an error. At worse, they do not bother to check and it causes
> strange bugs. The version number does not help with compatibility, it
> just makes it detectable.
> 
> To make the structure compatible, we have to ask ourselves: What happens
> if we update it? What breaks compatibility? Once we have an answer, we
> find a workaround.
> 
> In this case, what happens is that the variable array must be at the
> end, and therefore its offset changes. And we cannot have a second
> variable array (like the name! you had to set a constant size), and we
> cannot update the type of the array elements.
> 
> And the solution becomes obvious: let us store the offsets in the
> structure.
> 
> So, that would be:
> 
> 	typedef struct AVBoundingBoxHeader {
> 	...
> 	    /**
> 	     * Offset of the array of bounding boxes.
> 	     */
> 	    size_t bboxes_offset;
> 
> 	    /**
> 	     * Offset increment in the array of bounding boxes.
> 	     */
> 	    size_t bboxes_step;
> 	};
> 
> Note that with that solution, we do not need the empty array, we can do
> this:
> 
> AVBoundingBoxHeader *ff_bounding_box_alloc(size_t nb_bbox)
> {
>     struct {
> 	AVBoundingBoxHeader header;
> 	AVBoundingBox boxes[nb_bbox];

That's a variable-length array. That's a C99 feature we do not require.

>     } *ret;
>     ret = av_mallocz(sizeof(*ret));
>     /* add error checks */
>     ret->header->bboxes_offset = (char *)&ret->boxes - (char *)ret->header;
>     ret->header->bboxes_step = sizeof(*ret->boxes);
> }
> 
> #define AV_BOUNDING_BOX_GET(header, idx) \
>     ((AVBoundingBox *)((char *)(header) + (header)->bboxes_offset + (idx) * (header)->bboxes_step))
> 
> You can do the same to lift the 128 limit on the name.
> 
> Regards,
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Andreas Rheinhardt April 4, 2021, 1:41 p.m. UTC | #7
James Almer:
> On 4/4/2021 7:01 AM, Nicolas George wrote:
>> Guo, Yejun (12021-04-01):
>>>> Is this allowed to be negative? Can/should this be size_t?
>>> There was a long discussion about size_t/int/uint32_t when I added
>>> struct AVRegionOfInterest in frame.h, and finally size_t is not chosen.
>>
>> Then at least unsigned.
>>
>>> yes, we can add a version number (enum AVBBoxHeaderVersion, see
>>> below) at the
>>> beginning of the struct, and the user needs to set it with
>>> AV_BBOX_HEADER_CURRENT.
>>> It is not elegant, is this what we really want?
>>
>> A version number does not make the structure compatible. At best,
>> applications check the version and detect they do not support this one
>> and report an error. At worse, they do not bother to check and it causes
>> strange bugs. The version number does not help with compatibility, it
>> just makes it detectable.
>>
>> To make the structure compatible, we have to ask ourselves: What happens
>> if we update it? What breaks compatibility? Once we have an answer, we
>> find a workaround.
>>
>> In this case, what happens is that the variable array must be at the
>> end, and therefore its offset changes. And we cannot have a second
>> variable array (like the name! you had to set a constant size), and we
>> cannot update the type of the array elements.
>>
>> And the solution becomes obvious: let us store the offsets in the
>> structure.
>>
>> So, that would be:
>>
>>     typedef struct AVBoundingBoxHeader {
>>     ...
>>         /**
>>          * Offset of the array of bounding boxes.
>>          */
>>         size_t bboxes_offset;
>>
>>         /**
>>          * Offset increment in the array of bounding boxes.
>>          */
>>         size_t bboxes_step;
>>     };
>>
>> Note that with that solution, we do not need the empty array, we can do
>> this:
>>
>> AVBoundingBoxHeader *ff_bounding_box_alloc(size_t nb_bbox)
>> {
>>      struct {
>>     AVBoundingBoxHeader header;
>>     AVBoundingBox boxes[nb_bbox];
>>      } *ret;
>>      ret = av_mallocz(sizeof(*ret));
>>      /* add error checks */
>>      ret->header->bboxes_offset = (char *)&ret->boxes - (char
>> *)ret->header;
>>      ret->header->bboxes_step = sizeof(*ret->boxes);
>> }
>>
>> #define AV_BOUNDING_BOX_GET(header, idx) \
>>      ((AVBoundingBox *)((char *)(header) + (header)->bboxes_offset +
>> (idx) * (header)->bboxes_step))
> 
> This solution is what was used for video_enc_params.h, so i agree it
> should be used here too. What's missing is a check for idx < nb_bbox

Actually nothing in video_enc_params.c guarantees proper alignment for
AVVideoBlockParams. If we added a type requiring 64bit alignment to
AVVideoBlockParams and are on a 32bit system, then the blocks will be
misaligned.

> before accessing the offset in question, so an inline function instead
> of a #define with a check for the above would be needed.
> 
> And since both are used as frame side data, it would be ideal that the
> signature for the public helpers on both are the same (The standard
> alloc, and the alloc + wrapping into frame side data ones).
> 
>>
>> You can do the same to lift the 128 limit on the name.
>>
>> Regards,
>>
>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Nicolas George April 4, 2021, 1:57 p.m. UTC | #8
James Almer (12021-04-04):
> This solution is what was used for video_enc_params.h, so i agree it should
> be used here too. What's missing is a check for idx < nb_bbox before
> accessing the offset in question, so an inline function instead of a #define
> with a check for the above would be needed.

That may be a good idea, but please notice that this check would not be
there with just an array or a pointer. FFmpeg is C, not Java. We do not
HAVE to add the check.

> And since both are used as frame side data, it would be ideal that the
> signature for the public helpers on both are the same (The standard alloc,
> and the alloc + wrapping into frame side data ones).

I agree with this.

Regards,
Nicolas George April 4, 2021, 2 p.m. UTC | #9
Andreas Rheinhardt (12021-04-04):
> That's a variable-length array. That's a C99 feature we do not require.

We have used variable-length arrays in the past. We have eliminated them
from our code base not because lack of support, IIRC, but because they
have a tendency to ruin compiler optimization.

This is not a variable-length array but a pointer to a variable-length
array, it does not cause the optimization problems. As it is, it is just
a trick to let the compiler compute offsets for us taking into account
alignment requirements.

Regards,
Guo, Yejun April 5, 2021, 9:56 a.m. UTC | #10
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Nicolas
> George
> Sent: 2021年4月4日 18:02
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V6 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> Guo, Yejun (12021-04-01):
> > > Is this allowed to be negative? Can/should this be size_t?
> > There was a long discussion about size_t/int/uint32_t when I added
> > struct AVRegionOfInterest in frame.h, and finally size_t is not chosen.
> 
> Then at least unsigned.

yes, 'unsigned int' is another option. but imho, it might be better to use
the same data type of AVFrame.width/height, they are 'int'.

> 
> > yes, we can add a version number (enum AVBBoxHeaderVersion, see below)
> at the
> > beginning of the struct, and the user needs to set it with
> AV_BBOX_HEADER_CURRENT.
> > It is not elegant, is this what we really want?
> 
> A version number does not make the structure compatible. At best,
> applications check the version and detect they do not support this one
> and report an error. At worse, they do not bother to check and it causes
> strange bugs. The version number does not help with compatibility, it
> just makes it detectable.

My idea was to use separate code path for AVBoundingBoxHeaderV1, 
AVBoundingBoxHeaderV2, ... (not union in my previous email) according to
the version number. And yes, it doesn't work if the user does not set or
check the version number correctly.

> 
> To make the structure compatible, we have to ask ourselves: What happens
> if we update it? What breaks compatibility? Once we have an answer, we
> find a workaround.
> 
> In this case, what happens is that the variable array must be at the
> end, and therefore its offset changes. And we cannot have a second
> variable array (like the name! you had to set a constant size), and we
> cannot update the type of the array elements.
> 
> And the solution becomes obvious: let us store the offsets in the
> structure.

Thanks a lot for your nice explanation! thank you.

> 
> So, that would be:
> 
> 	typedef struct AVBoundingBoxHeader {
> 	...
> 	    /**
> 	     * Offset of the array of bounding boxes.
> 	     */
> 	    size_t bboxes_offset;
> 
> 	    /**
> 	     * Offset increment in the array of bounding boxes.
> 	     */
> 	    size_t bboxes_step;
> 	};
> 
> Note that with that solution, we do not need the empty array, we can do
> this:
> 
> AVBoundingBoxHeader *ff_bounding_box_alloc(size_t nb_bbox)
> {
>     struct {
> 	AVBoundingBoxHeader header;
> 	AVBoundingBox boxes[nb_bbox];
>     } *ret;
>     ret = av_mallocz(sizeof(*ret));
>     /* add error checks */
>     ret->header->bboxes_offset = (char *)&ret->boxes - (char *)ret->header;
>     ret->header->bboxes_step = sizeof(*ret->boxes);
> }
> 
> #define AV_BOUNDING_BOX_GET(header, idx) \
>     ((AVBoundingBox *)((char *)(header) + (header)->bboxes_offset + (idx) *
> (header)->bboxes_step))

thanks, and I'll update the naming as what James mentioned.

> 
> You can do the same to lift the 128 limit on the name.

yes, we can now handle more variable arrays with this method.

For the special case 'char source[128]' in AVBoundingBoxHeader, it will be written
for several times, for example, vf_dnn_detect.c writes model name of detection
into source, and later, vf_dnn_classify.c (in plan) will append the model name of
classification into source. So, source will be at the end of the side data, and the side
data will be reallocated when needed.

> 
> Regards,
> 
> --
>   Nicolas George
Guo, Yejun April 6, 2021, 8:30 a.m. UTC | #11
> -----Original Message-----
> From: Guo, Yejun
> Sent: 2021年4月5日 17:56
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: RE: [FFmpeg-devel] [PATCH V6 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> 
> 
> > -----Original Message-----
> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Nicolas
> > George
> > Sent: 2021年4月4日 18:02
> > To: FFmpeg development discussions and patches
> <ffmpeg-devel@ffmpeg.org>
> > Subject: Re: [FFmpeg-devel] [PATCH V6 4/6] lavu: add side data
> > AV_FRAME_DATA_BOUNDING_BOXES
> >
> > Guo, Yejun (12021-04-01):
> > > > Is this allowed to be negative? Can/should this be size_t?
> > > There was a long discussion about size_t/int/uint32_t when I added
> > > struct AVRegionOfInterest in frame.h, and finally size_t is not chosen.
> >
> > Then at least unsigned.
> 
> yes, 'unsigned int' is another option. but imho, it might be better to use
> the same data type of AVFrame.width/height, they are 'int'.
> 
> >
> > > yes, we can add a version number (enum AVBBoxHeaderVersion, see below)
> > at the
> > > beginning of the struct, and the user needs to set it with
> > AV_BBOX_HEADER_CURRENT.
> > > It is not elegant, is this what we really want?
> >
> > A version number does not make the structure compatible. At best,
> > applications check the version and detect they do not support this one
> > and report an error. At worse, they do not bother to check and it causes
> > strange bugs. The version number does not help with compatibility, it
> > just makes it detectable.
> 
> My idea was to use separate code path for AVBoundingBoxHeaderV1,
> AVBoundingBoxHeaderV2, ... (not union in my previous email) according to
> the version number. And yes, it doesn't work if the user does not set or
> check the version number correctly.
> 
> >
> > To make the structure compatible, we have to ask ourselves: What happens
> > if we update it? What breaks compatibility? Once we have an answer, we
> > find a workaround.
> >
> > In this case, what happens is that the variable array must be at the
> > end, and therefore its offset changes. And we cannot have a second
> > variable array (like the name! you had to set a constant size), and we
> > cannot update the type of the array elements.
> >
> > And the solution becomes obvious: let us store the offsets in the
> > structure.
> 
> Thanks a lot for your nice explanation! thank you.
> 
> >
> > So, that would be:
> >
> > 	typedef struct AVBoundingBoxHeader {
> > 	...
> > 	    /**
> > 	     * Offset of the array of bounding boxes.
> > 	     */
> > 	    size_t bboxes_offset;
> >
> > 	    /**
> > 	     * Offset increment in the array of bounding boxes.
> > 	     */
> > 	    size_t bboxes_step;
> > 	};
> >
> > Note that with that solution, we do not need the empty array, we can do
> > this:
> >
> > AVBoundingBoxHeader *ff_bounding_box_alloc(size_t nb_bbox)
> > {
> >     struct {
> > 	AVBoundingBoxHeader header;
> > 	AVBoundingBox boxes[nb_bbox];
> >     } *ret;
> >     ret = av_mallocz(sizeof(*ret));
> >     /* add error checks */
> >     ret->header->bboxes_offset = (char *)&ret->boxes - (char *)ret->header;
> >     ret->header->bboxes_step = sizeof(*ret->boxes);
> > }
> >
> > #define AV_BOUNDING_BOX_GET(header, idx) \
> >     ((AVBoundingBox *)((char *)(header) + (header)->bboxes_offset + (idx) *
> > (header)->bboxes_step))
> 
> thanks, and I'll update the naming as what James mentioned.
> 
> >
> > You can do the same to lift the 128 limit on the name.
> 
> yes, we can now handle more variable arrays with this method.

Looks that we could not support more variable arrays without requiring special
compiler feature. See the sample code below, and I got compile error on my
ubuntu 18.04 with default compiler and default configure option.

sample code:
AVBoundingBoxHeader *
av_bbox_alloc(uint32_t nb_bboxes, size_t name_size, size_t *out_size)
{
    struct {
	    AVBoundingBoxHeader header;
	    AVBoundingBox boxes[nb_bboxes];
        char name[name_size];
    } *ret;
    ...
}

compile error on ubuntu 18.04 with default setting:
error: ISO C90 forbids variable length array ‘boxes’ [-Werror=vla]
      AVBoundingBox boxes[nb_bboxes];
...

So, I might still use 'char source[128]' in AVBoundingBoxHeader, and use the
following code to meet the alignment requirement.

    struct {
	    AVBoundingBoxHeader header;
	    AVBoundingBox boxes[];
    } *ret;
    ret = av_mallocz(sizeof(*ret) + sizeof(AVBoundingBox) * nb_bboxes);
Nicolas George April 6, 2021, 4:45 p.m. UTC | #12
Guo, Yejun (12021-04-06):
> compile error on ubuntu 18.04 with default setting:
> error: ISO C90 forbids variable length array ‘boxes’ [-Werror=vla]
>       AVBoundingBox boxes[nb_bboxes];

This warning is explicitly added by configure it, we do not have to keep
it if it hiders us.

If people agree with my analysis that pointers to VLA used to compute
sizes and offsets in an isolated function are not a problem, unlike
actual VLAs on the stack, then we can just remove it. I do not think we
are at risk of adding VLAs by mistake.

Regards,
Guo, Yejun April 7, 2021, 2:08 a.m. UTC | #13
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Nicolas
> George
> Sent: 2021年4月7日 0:46
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V6 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> Guo, Yejun (12021-04-06):
> > compile error on ubuntu 18.04 with default setting:
> > error: ISO C90 forbids variable length array ‘boxes’ [-Werror=vla]
> >       AVBoundingBox boxes[nb_bboxes];
> 
> This warning is explicitly added by configure it, we do not have to keep
> it if it hiders us.
> 
> If people agree with my analysis that pointers to VLA used to compute
> sizes and offsets in an isolated function are not a problem, unlike
> actual VLAs on the stack, then we can just remove it. I do not think we
> are at risk of adding VLAs by mistake.

I see, thanks. 

@Michael, any comment on this, thanks, '-Werror=vla' is enabled by 293e5423
in 2012. I can send a patch to remove this in configure for more comments. 

Then, I can update my detection patch set according to the result.
Hendrik Leppkes April 7, 2021, 9:01 a.m. UTC | #14
On Tue, Apr 6, 2021 at 6:45 PM Nicolas George <george@nsup.org> wrote:
>
> Guo, Yejun (12021-04-06):
> > compile error on ubuntu 18.04 with default setting:
> > error: ISO C90 forbids variable length array ‘boxes’ [-Werror=vla]
> >       AVBoundingBox boxes[nb_bboxes];
>
> This warning is explicitly added by configure it, we do not have to keep
> it if it hiders us.
>
> If people agree with my analysis that pointers to VLA used to compute
> sizes and offsets in an isolated function are not a problem, unlike
> actual VLAs on the stack, then we can just remove it. I do not think we
> are at risk of adding VLAs by mistake.
>

Unfortunately, this is not correct. I don't know about optimization
problems, but I do know that compilers we do support, which are C11
compliant (where VLAs are optional), do not implement them (just
confirmed this with a test, as well, to make sure).

- Hendrik
Nicolas George April 7, 2021, 9:32 a.m. UTC | #15
Hendrik Leppkes (12021-04-07):
> Unfortunately, this is not correct. I don't know about optimization
> problems, but I do know that compilers we do support, which are C11
> compliant (where VLAs are optional), do not implement them (just
> confirmed this with a test, as well, to make sure).

Too bad.

Can you document which compilers they are? This is an information that
should be centralized.

We will have have a backup implementation.

Regards,
Andreas Rheinhardt April 7, 2021, 9:59 a.m. UTC | #16
Nicolas George:
> Hendrik Leppkes (12021-04-07):
>> Unfortunately, this is not correct. I don't know about optimization
>> problems, but I do know that compilers we do support, which are C11
>> compliant (where VLAs are optional), do not implement them (just
>> confirmed this with a test, as well, to make sure).
> 
> Too bad.
> 
> Can you document which compilers they are? This is an information that
> should be centralized.
> 
> We will have have a backup implementation.
> 
Why a backup implementation? Why not just avoid VLAs altogether (I don't
like using features that are optional in modern standards)? All one
needs to calculate a suitable offset is the alignment of AVBoundingBox.
This can be done via _Alignof for C11 compilers and via compiler
extensions for lots of other compilers (e.g. GCC supports __alignof__
since at least 3.1). And a generic alignment of (say) 8 can be used as a
fallback.

- Andreas
Nicolas George April 7, 2021, 10:06 a.m. UTC | #17
Andreas Rheinhardt (12021-04-07):
> Why a backup implementation?

Because:

> And a generic alignment of (say) 8 can be used as a fallback.

I believe using a pointer to VLA is more elegant than reimplementing the
compiler's feature using _Alignof and clumsy arithmetic, but it is
eventually for whoever writes it to decide.

Also, assuming the alignment is the same as the size is rather a safe
assumption for elementary types.

Regards,
Guo, Yejun April 7, 2021, 12:49 p.m. UTC | #18
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Nicolas
> George
> Sent: 2021年4月7日 18:06
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V6 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> Andreas Rheinhardt (12021-04-07):
> > Why a backup implementation?
> 
> Because:
> 
> > And a generic alignment of (say) 8 can be used as a fallback.
> 
> I believe using a pointer to VLA is more elegant than reimplementing the
> compiler's feature using _Alignof and clumsy arithmetic, but it is
> eventually for whoever writes it to decide.
> 
> Also, assuming the alignment is the same as the size is rather a safe
> assumption for elementary types.
> 

Thanks all, I might choose a simple and straight-forward method, instead of
using align keyword.
diff mbox series

Patch

diff --git a/doc/APIchanges b/doc/APIchanges
index b41dadee8d..160b020d9d 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -14,6 +14,8 @@  libavutil:     2017-10-21
 
 
 API changes, most recent first:
+2021-03-xx - xxxxxxxxxx - lavu 56.xx.100 - frame.h boundingbox.h
+  Add AV_FRAME_DATA_BOUNDING_BOXES
 
 2021-03-21 - xxxxxxxxxx - lavu 56.72.100 - frame.h
   Deprecated av_get_colorspace_name().
diff --git a/libavutil/Makefile b/libavutil/Makefile
index 27bafe9e12..7e1dc713ac 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -11,6 +11,7 @@  HEADERS = adler32.h                                                     \
           avutil.h                                                      \
           base64.h                                                      \
           blowfish.h                                                    \
+          boundingbox.h                                                 \
           bprint.h                                                      \
           bswap.h                                                       \
           buffer.h                                                      \
diff --git a/libavutil/boundingbox.h b/libavutil/boundingbox.h
new file mode 100644
index 0000000000..5e2fdd55e9
--- /dev/null
+++ b/libavutil/boundingbox.h
@@ -0,0 +1,79 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_BOUNDINGBOX_H
+#define AVUTIL_BOUNDINGBOX_H
+
+#include "libavutil/rational.h"
+
+typedef struct AVBoundingBox {
+    /**
+     * Distance in pixels from the top edge of the frame to top
+     * and bottom, and from the left edge of the frame to left and
+     * right, defining the bounding box.
+     */
+    int top;
+    int left;
+    int bottom;
+    int right;
+
+#define AV_BBOX_LABEL_NAME_MAX_SIZE 32
+
+    /**
+     * Detect result with confidence
+     */
+    char detect_label[AV_BBOX_LABEL_NAME_MAX_SIZE];
+    AVRational detect_confidence;
+
+    /**
+     * At most 4 classifications based on the detected bounding box.
+     * For example, we can get max 4 different attributes with 4 different
+     * DNN models on one bounding box.
+     * classify_count is zero if no classification.
+     */
+#define AV_NUM_BBOX_CLASSIFY 4
+    uint32_t classify_count;
+    char classify_labels[AV_NUM_BBOX_CLASSIFY][AV_BBOX_LABEL_NAME_MAX_SIZE];
+    AVRational classify_confidences[AV_NUM_BBOX_CLASSIFY];
+} AVBoundingBox;
+
+typedef struct AVBoundingBoxHeader {
+    /**
+     * Information about how the bounding box is generated.
+     * for example, the DNN model name.
+     */
+    char source[128];
+
+    /**
+     * The size of frame when it is detected.
+     */
+    int frame_width;
+    int frame_height;
+
+    /**
+     * Number of bounding boxes.
+     */
+    uint32_t nb_bbox;
+
+    /**
+     * Pointer to the array of AVBoundingBox.
+     */
+    AVBoundingBox bboxes[];
+} AVBoundingBoxHeader;
+
+#endif
diff --git a/libavutil/frame.c b/libavutil/frame.c
index 31a2117b82..99080af777 100644
--- a/libavutil/frame.c
+++ b/libavutil/frame.c
@@ -853,6 +853,7 @@  const char *av_frame_side_data_name(enum AVFrameSideDataType type)
     case AV_FRAME_DATA_VIDEO_ENC_PARAMS:            return "Video encoding parameters";
     case AV_FRAME_DATA_SEI_UNREGISTERED:            return "H.26[45] User Data Unregistered SEI message";
     case AV_FRAME_DATA_FILM_GRAIN_PARAMS:           return "Film grain parameters";
+    case AV_FRAME_DATA_BOUNDING_BOXES:              return "Bounding boxes";
     }
     return NULL;
 }
diff --git a/libavutil/frame.h b/libavutil/frame.h
index a5ed91b20a..9b7145199c 100644
--- a/libavutil/frame.h
+++ b/libavutil/frame.h
@@ -198,6 +198,13 @@  enum AVFrameSideDataType {
      * Must be present for every frame which should have film grain applied.
      */
     AV_FRAME_DATA_FILM_GRAIN_PARAMS,
+
+    /**
+     * Bounding boxes for object detection and classification, the data is a BoundingBoxHeader
+     * followed with an array of BoudingBox, and BoundingBoxHeader.nb_bbox is the number of
+     * array element.
+     */
+    AV_FRAME_DATA_BOUNDING_BOXES,
 };
 
 enum AVActiveFormatDescription {