diff mbox series

[FFmpeg-devel,V7,4/6] lavu: add side data AV_FRAME_DATA_BOUNDING_BOXES

Message ID 20210407141723.11527-4-yejun.guo@intel.com
State New
Headers show
Series [FFmpeg-devel,V7,1/6] lavfi/dnn_backend_openvino.c: only allow DFT_PROCESS_FRAME to get output dim
Related show

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Guo, Yejun April 7, 2021, 2:17 p.m. UTC
Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
---
 doc/APIchanges          |   2 +
 libavutil/Makefile      |   2 +
 libavutil/boundingbox.c |  73 +++++++++++++++++++++++++
 libavutil/boundingbox.h | 114 ++++++++++++++++++++++++++++++++++++++++
 libavutil/frame.c       |   1 +
 libavutil/frame.h       |   7 +++
 6 files changed, 199 insertions(+)
 create mode 100644 libavutil/boundingbox.c
 create mode 100644 libavutil/boundingbox.h

Comments

Andreas Rheinhardt April 7, 2021, 2:44 p.m. UTC | #1
Guo, Yejun:
> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> ---
>  doc/APIchanges          |   2 +
>  libavutil/Makefile      |   2 +
>  libavutil/boundingbox.c |  73 +++++++++++++++++++++++++
>  libavutil/boundingbox.h | 114 ++++++++++++++++++++++++++++++++++++++++
>  libavutil/frame.c       |   1 +
>  libavutil/frame.h       |   7 +++
>  6 files changed, 199 insertions(+)
>  create mode 100644 libavutil/boundingbox.c
>  create mode 100644 libavutil/boundingbox.h
> 
> diff --git a/doc/APIchanges b/doc/APIchanges
> index 9dfcc97d5c..81d01aac1e 100644
> --- a/doc/APIchanges
> +++ b/doc/APIchanges
> @@ -14,6 +14,8 @@ libavutil:     2017-10-21
>  
>  
>  API changes, most recent first:
> +2021-04-xx - xxxxxxxxxx - lavu 56.xx.100 - frame.h boundingbox.h
> +  Add AV_FRAME_DATA_BOUNDING_BOXES
>  
>  2021-04-06 - xxxxxxxxxx - lavf 58.78.100 - avformat.h
>    Add avformat_index_get_entries_count(), avformat_index_get_entry(),
> diff --git a/libavutil/Makefile b/libavutil/Makefile
> index 27bafe9e12..f6d21bb5ce 100644
> --- a/libavutil/Makefile
> +++ b/libavutil/Makefile
> @@ -11,6 +11,7 @@ HEADERS = adler32.h                                                     \
>            avutil.h                                                      \
>            base64.h                                                      \
>            blowfish.h                                                    \
> +          boundingbox.h                                                 \
>            bprint.h                                                      \
>            bswap.h                                                       \
>            buffer.h                                                      \
> @@ -104,6 +105,7 @@ OBJS = adler32.o                                                        \
>         avsscanf.o                                                       \
>         base64.o                                                         \
>         blowfish.o                                                       \
> +       boundingbox.o                                                    \
>         bprint.o                                                         \
>         buffer.o                                                         \
>         cast5.o                                                          \
> diff --git a/libavutil/boundingbox.c b/libavutil/boundingbox.c
> new file mode 100644
> index 0000000000..881661ec18
> --- /dev/null
> +++ b/libavutil/boundingbox.c
> @@ -0,0 +1,73 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "boundingbox.h"
> +
> +AVBoundingBoxHeader *av_bbox_alloc(uint32_t nb_bboxes, size_t *out_size)
> +{
> +    size_t size;
> +    struct {
> +	    AVBoundingBoxHeader header;
> +	    AVBoundingBox boxes[];

Hopefully all compilers accept a flexible array member; if not, using
boxes[1] (or just boxes) would be sufficient.

> +    } *ret;
> +
> +    size = sizeof(*ret);
> +    if (nb_bboxes > (SIZE_MAX - size) / sizeof(*ret->boxes))
> +        return NULL;
> +    size += sizeof(*ret->boxes) * nb_bboxes;
> +
> +    ret = av_mallocz(size);
> +    if (!ret)
> +        return NULL;
> +
> +    ret->header.nb_bboxes = nb_bboxes;
> +    ret->header.bbox_size = sizeof(*ret->boxes);
> +    ret->header.bboxes_offset = (char *)&ret->boxes - (char *)&ret->header;

Using offsetof would be clearer (for this you have to declare a proper
type).

> +
> +    if (out_size)
> +        *out_size = sizeof(*ret);
> +
> +    return &ret->header;
> +}
> +
> +AVBoundingBoxHeader* av_bbox_create_side_data(AVFrame *frame, uint32_t nb_bboxes)
> +{
> +    AVBufferRef         *buf;
> +    AVBoundingBoxHeader *header;
> +    size_t size;
> +
> +    header = av_bbox_alloc(nb_bboxes, &size);
> +    if (!header)
> +        return NULL;
> +    if (size > INT_MAX) {
> +        av_freep(&header);
> +        return NULL;
> +    }

Will be obsolete soon.

> +    buf = av_buffer_create((uint8_t *)header, size, NULL, NULL, 0);
> +    if (!buf) {
> +        av_freep(&header);
> +        return NULL;
> +    }
> +
> +    if (!av_frame_new_side_data_from_buf(frame, AV_FRAME_DATA_BOUNDING_BOXES, buf)) {
> +        av_buffer_unref(&buf);
> +        return NULL;
> +    }
> +
> +    return header;
> +}

Overall, the code duplication with video_enc_params.c should be factored
out.

> diff --git a/libavutil/boundingbox.h b/libavutil/boundingbox.h
> new file mode 100644
> index 0000000000..2b77c6c0dc
> --- /dev/null
> +++ b/libavutil/boundingbox.h
> @@ -0,0 +1,114 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVUTIL_BOUNDINGBOX_H
> +#define AVUTIL_BOUNDINGBOX_H
> +
> +#include "rational.h"
> +#include "avassert.h"
> +#include "frame.h"
> +
> +typedef struct AVBoundingBox {
> +    /**
> +     * Distance in pixels from the top edge of the frame to top
> +     * and bottom, and from the left edge of the frame to left and
> +     * right, defining the bounding box.
> +     */
> +    int top;
> +    int left;
> +    int bottom;
> +    int right;
> +
> +#define AV_BBOX_LABEL_NAME_MAX_SIZE 32
> +
> +    /**
> +     * Detect result with confidence
> +     */
> +    char detect_label[AV_BBOX_LABEL_NAME_MAX_SIZE];
> +    AVRational detect_confidence;
> +
> +    /**
> +     * At most 4 classifications based on the detected bounding box.
> +     * For example, we can get max 4 different attributes with 4 different
> +     * DNN models on one bounding box.
> +     * classify_count is zero if no classification.
> +     */
> +#define AV_NUM_BBOX_CLASSIFY 4
> +    uint32_t classify_count;
> +    char classify_labels[AV_NUM_BBOX_CLASSIFY][AV_BBOX_LABEL_NAME_MAX_SIZE];
> +    AVRational classify_confidences[AV_NUM_BBOX_CLASSIFY];
> +} AVBoundingBox;
> +
> +typedef struct AVBoundingBoxHeader {
> +    /**
> +     * Information about how the bounding box is generated.
> +     * for example, the DNN model name.
> +     */
> +    char source[128];
> +
> +    /**
> +     * The size of frame when it is detected.
> +     */
> +    int frame_width;
> +    int frame_height;
> +
> +    /**
> +     * Number of bounding boxes in the array.
> +     */
> +    uint32_t nb_bboxes;
> +
> +    /**
> +     * Offset in bytes from the beginning of this structure at which
> +     * the array of bounding boxes starts.
> +     */
> +    size_t bboxes_offset;
> +
> +    /**
> +     * Size of each bounding box in bytes.
> +     */
> +    size_t bbox_size;
> +} AVBoundingBoxHeader;
> +
> +/*
> + * Get the bounding box at the specified {@code idx}. Must be between 0 and nb_bboxes.
> + */
> +static av_always_inline AVBoundingBox*
> +av_get_bounding_box(const AVBoundingBoxHeader *header, unsigned int idx)
> +{
> +    av_assert0(idx < header->nb_bboxes);
> +    return (AVBoundingBox *)((uint8_t *)header + header->bboxes_offset +
> +                             idx * header->bbox_size);
> +}
> +
> +/**
> + * Allocates memory for AVBoundingBoxHeader, plus an array of {@code nb_bboxes}
> + * AVBoundingBox, and initializes the variables.
> + * Can be freed with a normal av_free() call.
> + *
> + * @param out_size if non-NULL, the size in bytes of the resulting data array is
> + * written here.
> + */
> +AVBoundingBoxHeader *av_bbox_alloc(uint32_t nb_bboxes, size_t *out_size);
> +
> +/**
> + * Allocates memory for AVBoundingBoxHeader, plus an array of {@code nb_bboxes}
> + * AVBoundingBox, in the given AVFrame {@code frame} as AVFrameSideData of type
> + * AV_FRAME_DATA_BOUNDING_BOXES and initializes the variables.
> + */
> +AVBoundingBoxHeader* av_bbox_create_side_data(AVFrame *frame, uint32_t nb_bboxes);
> +#endif
> diff --git a/libavutil/frame.c b/libavutil/frame.c
> index fefb2b69c0..d8e86263af 100644
> --- a/libavutil/frame.c
> +++ b/libavutil/frame.c
> @@ -853,6 +853,7 @@ const char *av_frame_side_data_name(enum AVFrameSideDataType type)
>      case AV_FRAME_DATA_VIDEO_ENC_PARAMS:            return "Video encoding parameters";
>      case AV_FRAME_DATA_SEI_UNREGISTERED:            return "H.26[45] User Data Unregistered SEI message";
>      case AV_FRAME_DATA_FILM_GRAIN_PARAMS:           return "Film grain parameters";
> +    case AV_FRAME_DATA_BOUNDING_BOXES:              return "Bounding boxes";
>      }
>      return NULL;
>  }
> diff --git a/libavutil/frame.h b/libavutil/frame.h
> index a5ed91b20a..41e22de02a 100644
> --- a/libavutil/frame.h
> +++ b/libavutil/frame.h
> @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
>       * Must be present for every frame which should have film grain applied.
>       */
>      AV_FRAME_DATA_FILM_GRAIN_PARAMS,
> +
> +    /**
> +     * Bounding boxes for object detection and classification, the data is a AVBoundingBoxHeader
> +     * followed with an array of AVBoudingBox, and AVBoundingBoxHeader.nb_bboxes is the number
> +     * of array element.
> +     */
> +    AV_FRAME_DATA_BOUNDING_BOXES,
>  };
>  
>  enum AVActiveFormatDescription {
>
Nicolas George April 7, 2021, 2:47 p.m. UTC | #2
Andreas Rheinhardt (12021-04-07):
> > +    ret->header.bboxes_offset = (char *)&ret->boxes - (char *)&ret->header;
> 
> Using offsetof would be clearer (for this you have to declare a proper
> type).

I find this version rather clearer. offsetof is good we we do not have a
pointer to do the actual arithmetic on, but here we have, and we define
the offset by the exact difference between the pointer we want.

Regards,
Guo, Yejun April 7, 2021, 3:50 p.m. UTC | #3
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Andreas Rheinhardt
> Sent: 2021年4月7日 22:44
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> Guo, Yejun:
> > Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> > ---
> >  doc/APIchanges          |   2 +
> >  libavutil/Makefile      |   2 +
> >  libavutil/boundingbox.c |  73 +++++++++++++++++++++++++
> >  libavutil/boundingbox.h | 114
> ++++++++++++++++++++++++++++++++++++++++
> >  libavutil/frame.c       |   1 +
> >  libavutil/frame.h       |   7 +++
> >  6 files changed, 199 insertions(+)
> >  create mode 100644 libavutil/boundingbox.c
> >  create mode 100644 libavutil/boundingbox.h
> >
> > diff --git a/doc/APIchanges b/doc/APIchanges
> > index 9dfcc97d5c..81d01aac1e 100644
> > --- a/doc/APIchanges
> > +++ b/doc/APIchanges
> > @@ -14,6 +14,8 @@ libavutil:     2017-10-21
> >
> >
> >  API changes, most recent first:
> > +2021-04-xx - xxxxxxxxxx - lavu 56.xx.100 - frame.h boundingbox.h
> > +  Add AV_FRAME_DATA_BOUNDING_BOXES
> >
> >  2021-04-06 - xxxxxxxxxx - lavf 58.78.100 - avformat.h
> >    Add avformat_index_get_entries_count(), avformat_index_get_entry(),
> > diff --git a/libavutil/Makefile b/libavutil/Makefile
> > index 27bafe9e12..f6d21bb5ce 100644
> > --- a/libavutil/Makefile
> > +++ b/libavutil/Makefile
> > @@ -11,6 +11,7 @@ HEADERS = adler32.h
> \
> >            avutil.h
> \
> >            base64.h
> \
> >            blowfish.h
> \
> > +          boundingbox.h
> \
> >            bprint.h
> \
> >            bswap.h
> \
> >            buffer.h
> \
> > @@ -104,6 +105,7 @@ OBJS = adler32.o
> \
> >         avsscanf.o
> \
> >         base64.o
> \
> >         blowfish.o
> \
> > +       boundingbox.o
> \
> >         bprint.o
> \
> >         buffer.o
> \
> >         cast5.o
> \
> > diff --git a/libavutil/boundingbox.c b/libavutil/boundingbox.c
> > new file mode 100644
> > index 0000000000..881661ec18
> > --- /dev/null
> > +++ b/libavutil/boundingbox.c
> > @@ -0,0 +1,73 @@
> > +/*
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> > + */
> > +
> > +#include "boundingbox.h"
> > +
> > +AVBoundingBoxHeader *av_bbox_alloc(uint32_t nb_bboxes, size_t
> *out_size)
> > +{
> > +    size_t size;
> > +    struct {
> > +	    AVBoundingBoxHeader header;
> > +	    AVBoundingBox boxes[];
> 
> Hopefully all compilers accept a flexible array member; if not, using
> boxes[1] (or just boxes) would be sufficient.

patchwork is passed, but I'm not 100% sure all the compilers accept it,
I'll change to boxes[1].

> 
> > +    } *ret;
> > +
> > +    size = sizeof(*ret);
> > +    if (nb_bboxes > (SIZE_MAX - size) / sizeof(*ret->boxes))
> > +        return NULL;
> > +    size += sizeof(*ret->boxes) * nb_bboxes;
> > +
> > +    ret = av_mallocz(size);
> > +    if (!ret)
> > +        return NULL;
> > +
> > +    ret->header.nb_bboxes = nb_bboxes;
> > +    ret->header.bbox_size = sizeof(*ret->boxes);
> > +    ret->header.bboxes_offset = (char *)&ret->boxes - (char *)&ret->header;
> 
> Using offsetof would be clearer (for this you have to declare a proper
> type).

maybe both are ok.

> 
> > +
> > +    if (out_size)
> > +        *out_size = sizeof(*ret);
> > +
> > +    return &ret->header;
> > +}
> > +
> > +AVBoundingBoxHeader* av_bbox_create_side_data(AVFrame *frame,
> uint32_t nb_bboxes)
> > +{
> > +    AVBufferRef         *buf;
> > +    AVBoundingBoxHeader *header;
> > +    size_t size;
> > +
> > +    header = av_bbox_alloc(nb_bboxes, &size);
> > +    if (!header)
> > +        return NULL;
> > +    if (size > INT_MAX) {
> > +        av_freep(&header);
> > +        return NULL;
> > +    }
> 
> Will be obsolete soon.

will remove the check, thanks.

> 
> > +    buf = av_buffer_create((uint8_t *)header, size, NULL, NULL, 0);
> > +    if (!buf) {
> > +        av_freep(&header);
> > +        return NULL;
> > +    }
> > +
> > +    if (!av_frame_new_side_data_from_buf(frame,
> AV_FRAME_DATA_BOUNDING_BOXES, buf)) {
> > +        av_buffer_unref(&buf);
> > +        return NULL;
> > +    }
> > +
> > +    return header;
> > +}
> 
> Overall, the code duplication with video_enc_params.c should be factored
> out.

Yes, there's some code duplication. But some code are relative to different
struct (AVBoundingBoxHeader and AVVideoEncParams), and we could not
unify them without type template. The left common code is about av_buffer_create
and av_frame_new_side_data_from_buf, do we have necessarity to put them
together? My current answer is no, I'll continue to think about it tomorrow.


I'll push the first 3 patches in this patch set tomorrow if there's no objection.
Andreas Rheinhardt April 7, 2021, 4:07 p.m. UTC | #4
Guo, Yejun:
> 
> 
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>> Andreas Rheinhardt
>> Sent: 2021年4月7日 22:44
>> To: ffmpeg-devel@ffmpeg.org
>> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
>> AV_FRAME_DATA_BOUNDING_BOXES
>>
>> Guo, Yejun:
>>> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
>>> ---
>>>  doc/APIchanges          |   2 +
>>>  libavutil/Makefile      |   2 +
>>>  libavutil/boundingbox.c |  73 +++++++++++++++++++++++++
>>>  libavutil/boundingbox.h | 114
>> ++++++++++++++++++++++++++++++++++++++++
>>>  libavutil/frame.c       |   1 +
>>>  libavutil/frame.h       |   7 +++
>>>  6 files changed, 199 insertions(+)
>>>  create mode 100644 libavutil/boundingbox.c
>>>  create mode 100644 libavutil/boundingbox.h
>>>
>>> diff --git a/doc/APIchanges b/doc/APIchanges
>>> index 9dfcc97d5c..81d01aac1e 100644
>>> --- a/doc/APIchanges
>>> +++ b/doc/APIchanges
>>> @@ -14,6 +14,8 @@ libavutil:     2017-10-21
>>>
>>>
>>>  API changes, most recent first:
>>> +2021-04-xx - xxxxxxxxxx - lavu 56.xx.100 - frame.h boundingbox.h
>>> +  Add AV_FRAME_DATA_BOUNDING_BOXES
>>>
>>>  2021-04-06 - xxxxxxxxxx - lavf 58.78.100 - avformat.h
>>>    Add avformat_index_get_entries_count(), avformat_index_get_entry(),
>>> diff --git a/libavutil/Makefile b/libavutil/Makefile
>>> index 27bafe9e12..f6d21bb5ce 100644
>>> --- a/libavutil/Makefile
>>> +++ b/libavutil/Makefile
>>> @@ -11,6 +11,7 @@ HEADERS = adler32.h
>> \
>>>            avutil.h
>> \
>>>            base64.h
>> \
>>>            blowfish.h
>> \
>>> +          boundingbox.h
>> \
>>>            bprint.h
>> \
>>>            bswap.h
>> \
>>>            buffer.h
>> \
>>> @@ -104,6 +105,7 @@ OBJS = adler32.o
>> \
>>>         avsscanf.o
>> \
>>>         base64.o
>> \
>>>         blowfish.o
>> \
>>> +       boundingbox.o
>> \
>>>         bprint.o
>> \
>>>         buffer.o
>> \
>>>         cast5.o
>> \
>>> diff --git a/libavutil/boundingbox.c b/libavutil/boundingbox.c
>>> new file mode 100644
>>> index 0000000000..881661ec18
>>> --- /dev/null
>>> +++ b/libavutil/boundingbox.c
>>> @@ -0,0 +1,73 @@
>>> +/*
>>> + * This file is part of FFmpeg.
>>> + *
>>> + * FFmpeg is free software; you can redistribute it and/or
>>> + * modify it under the terms of the GNU Lesser General Public
>>> + * License as published by the Free Software Foundation; either
>>> + * version 2.1 of the License, or (at your option) any later version.
>>> + *
>>> + * FFmpeg is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> GNU
>>> + * Lesser General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU Lesser General Public
>>> + * License along with FFmpeg; if not, write to the Free Software
>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
>> USA
>>> + */
>>> +
>>> +#include "boundingbox.h"
>>> +
>>> +AVBoundingBoxHeader *av_bbox_alloc(uint32_t nb_bboxes, size_t
>> *out_size)
>>> +{
>>> +    size_t size;
>>> +    struct {
>>> +	    AVBoundingBoxHeader header;
>>> +	    AVBoundingBox boxes[];
>>
>> Hopefully all compilers accept a flexible array member; if not, using
>> boxes[1] (or just boxes) would be sufficient.
> 
> patchwork is passed, but I'm not 100% sure all the compilers accept it,
> I'll change to boxes[1].
> 
>>
>>> +    } *ret;
>>> +
>>> +    size = sizeof(*ret);
>>> +    if (nb_bboxes > (SIZE_MAX - size) / sizeof(*ret->boxes))
>>> +        return NULL;
>>> +    size += sizeof(*ret->boxes) * nb_bboxes;
>>> +
>>> +    ret = av_mallocz(size);
>>> +    if (!ret)
>>> +        return NULL;
>>> +
>>> +    ret->header.nb_bboxes = nb_bboxes;
>>> +    ret->header.bbox_size = sizeof(*ret->boxes);
>>> +    ret->header.bboxes_offset = (char *)&ret->boxes - (char *)&ret->header;
>>
>> Using offsetof would be clearer (for this you have to declare a proper
>> type).
> 
> maybe both are ok.
> 
>>
>>> +
>>> +    if (out_size)
>>> +        *out_size = sizeof(*ret);
>>> +
>>> +    return &ret->header;
>>> +}
>>> +
>>> +AVBoundingBoxHeader* av_bbox_create_side_data(AVFrame *frame,
>> uint32_t nb_bboxes)
>>> +{
>>> +    AVBufferRef         *buf;
>>> +    AVBoundingBoxHeader *header;
>>> +    size_t size;
>>> +
>>> +    header = av_bbox_alloc(nb_bboxes, &size);
>>> +    if (!header)
>>> +        return NULL;
>>> +    if (size > INT_MAX) {
>>> +        av_freep(&header);
>>> +        return NULL;
>>> +    }
>>
>> Will be obsolete soon.
> 
> will remove the check, thanks.
> 

My remark was to inform you that this check should be removed if this
patch is committed after the bump (when the check is obsolete (and
harmful)). You should not remove it right now.

>>
>>> +    buf = av_buffer_create((uint8_t *)header, size, NULL, NULL, 0);
>>> +    if (!buf) {
>>> +        av_freep(&header);
>>> +        return NULL;
>>> +    }
>>> +
>>> +    if (!av_frame_new_side_data_from_buf(frame,
>> AV_FRAME_DATA_BOUNDING_BOXES, buf)) {
>>> +        av_buffer_unref(&buf);
>>> +        return NULL;
>>> +    }
>>> +
>>> +    return header;
>>> +}
>>
>> Overall, the code duplication with video_enc_params.c should be factored
>> out.
> 
> Yes, there's some code duplication. But some code are relative to different
> struct (AVBoundingBoxHeader and AVVideoEncParams), and we could not
> unify them without type template. The left common code is about av_buffer_create
> and av_frame_new_side_data_from_buf, do we have necessarity to put them
> together? My current answer is no, I'll continue to think about it tomorrow.
> 

Well, as you wish.

> 
> I'll push the first 3 patches in this patch set tomorrow if there's no objection.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Lynne April 7, 2021, 9:04 p.m. UTC | #5
Apr 7, 2021, 16:17 by yejun.guo@intel.com:

> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> ---
>  doc/APIchanges          |   2 +
>  libavutil/Makefile      |   2 +
>  libavutil/boundingbox.c |  73 +++++++++++++++++++++++++
>  libavutil/boundingbox.h | 114 ++++++++++++++++++++++++++++++++++++++++
>  libavutil/frame.c       |   1 +
>  libavutil/frame.h       |   7 +++
>  6 files changed, 199 insertions(+)
>  create mode 100644 libavutil/boundingbox.c
>  create mode 100644 libavutil/boundingbox.h
>
> diff --git a/doc/APIchanges b/doc/APIchanges
> index 9dfcc97d5c..81d01aac1e 100644
> --- a/doc/APIchanges
> +++ b/doc/APIchanges
> @@ -14,6 +14,8 @@ libavutil:     2017-10-21
>  
>  
>  API changes, most recent first:
> +2021-04-xx - xxxxxxxxxx - lavu 56.xx.100 - frame.h boundingbox.h
> +  Add AV_FRAME_DATA_BOUNDING_BOXES
>  
>  2021-04-06 - xxxxxxxxxx - lavf 58.78.100 - avformat.h
>  Add avformat_index_get_entries_count(), avformat_index_get_entry(),
> diff --git a/libavutil/Makefile b/libavutil/Makefile
> index 27bafe9e12..f6d21bb5ce 100644
> --- a/libavutil/Makefile
> +++ b/libavutil/Makefile
> @@ -11,6 +11,7 @@ HEADERS = adler32.h                                                     \
>  avutil.h                                                      \
>  base64.h                                                      \
>  blowfish.h                                                    \
> +          boundingbox.h                                                 \
>  bprint.h                                                      \
>  bswap.h                                                       \
>  buffer.h                                                      \
> @@ -104,6 +105,7 @@ OBJS = adler32.o                                                        \
>  avsscanf.o                                                       \
>  base64.o                                                         \
>  blowfish.o                                                       \
> +       boundingbox.o                                                    \
>  bprint.o                                                         \
>  buffer.o                                                         \
>  cast5.o                                                          \
> diff --git a/libavutil/boundingbox.c b/libavutil/boundingbox.c
> new file mode 100644
> index 0000000000..881661ec18
> --- /dev/null
> +++ b/libavutil/boundingbox.c
> @@ -0,0 +1,73 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "boundingbox.h"
> +
> +AVBoundingBoxHeader *av_bbox_alloc(uint32_t nb_bboxes, size_t *out_size)
> +{
> +    size_t size;
> +    struct {
> +	    AVBoundingBoxHeader header;
> +	    AVBoundingBox boxes[];
> +    } *ret;
> +
> +    size = sizeof(*ret);
> +    if (nb_bboxes > (SIZE_MAX - size) / sizeof(*ret->boxes))
> +        return NULL;
> +    size += sizeof(*ret->boxes) * nb_bboxes;
> +
> +    ret = av_mallocz(size);
> +    if (!ret)
> +        return NULL;
> +
> +    ret->header.nb_bboxes = nb_bboxes;
> +    ret->header.bbox_size = sizeof(*ret->boxes);
> +    ret->header.bboxes_offset = (char *)&ret->boxes - (char *)&ret->header;
> +
> +    if (out_size)
> +        *out_size = sizeof(*ret);
> +
> +    return &ret->header;
> +}
> +
> +AVBoundingBoxHeader* av_bbox_create_side_data(AVFrame *frame, uint32_t nb_bboxes)
>

Fix this everywhere. We never put the * before the name. This is a misleading
coding style that makes absolutely no sense.


> +{
> +    AVBufferRef         *buf;
> +    AVBoundingBoxHeader *header;
> +    size_t size;
> +
> +    header = av_bbox_alloc(nb_bboxes, &size);
> +    if (!header)
> +        return NULL;
> +    if (size > INT_MAX) {
> +        av_freep(&header);
> +        return NULL;
> +    }
> +    buf = av_buffer_create((uint8_t *)header, size, NULL, NULL, 0);
> +    if (!buf) {
> +        av_freep(&header);
> +        return NULL;
> +    }
> +
> +    if (!av_frame_new_side_data_from_buf(frame, AV_FRAME_DATA_BOUNDING_BOXES, buf)) {
> +        av_buffer_unref(&buf);
> +        return NULL;
> +    }
> +
> +    return header;
> +}
> diff --git a/libavutil/boundingbox.h b/libavutil/boundingbox.h
> new file mode 100644
> index 0000000000..2b77c6c0dc
> --- /dev/null
> +++ b/libavutil/boundingbox.h
> @@ -0,0 +1,114 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVUTIL_BOUNDINGBOX_H
> +#define AVUTIL_BOUNDINGBOX_H
> +
> +#include "rational.h"
> +#include "avassert.h"
> +#include "frame.h"
> +
> +typedef struct AVBoundingBox {
> +    /**
> +     * Distance in pixels from the top edge of the frame to top
> +     * and bottom, and from the left edge of the frame to left and
> +     * right, defining the bounding box.
> +     */
> +    int top;
> +    int left;
> +    int bottom;
> +    int right;
>

Why not x, y, w, h?
It makes more sense, all of coordinates go like this.


> +
> +#define AV_BBOX_LABEL_NAME_MAX_SIZE 32
>

Better bump this to 64. It won't be enough, and will
require a major bump to increment.


> +
> +    /**
> +     * Detect result with confidence
> +     */
> +    char detect_label[AV_BBOX_LABEL_NAME_MAX_SIZE];
> +    AVRational detect_confidence;
> +
> +    /**
> +     * At most 4 classifications based on the detected bounding box.
> +     * For example, we can get max 4 different attributes with 4 different
> +     * DNN models on one bounding box.
> +     * classify_count is zero if no classification.
> +     */
> +#define AV_NUM_BBOX_CLASSIFY 4
> +    uint32_t classify_count;
> +    char classify_labels[AV_NUM_BBOX_CLASSIFY][AV_BBOX_LABEL_NAME_MAX_SIZE];
> +    AVRational classify_confidences[AV_NUM_BBOX_CLASSIFY];
> +} AVBoundingBox;
> +
> +typedef struct AVBoundingBoxHeader {
> +    /**
> +     * Information about how the bounding box is generated.
> +     * for example, the DNN model name.
> +     */
> +    char source[128];
> +
> +    /**
> +     * The size of frame when it is detected.
> +     */
> +    int frame_width;
> +    int frame_height;
>

Why? This side data is attached to AVFrames only, where we
already have width and height.


> +
> +    /**
> +     * Number of bounding boxes in the array.
> +     */
> +    uint32_t nb_bboxes;
> +
> +    /**
> +     * Offset in bytes from the beginning of this structure at which
> +     * the array of bounding boxes starts.
> +     */
> +    size_t bboxes_offset;
> +
> +    /**
> +     * Size of each bounding box in bytes.
> +     */
> +    size_t bbox_size;
> +} AVBoundingBoxHeader;
> +
> +/*
> + * Get the bounding box at the specified {@code idx}. Must be between 0 and nb_bboxes.
> + */
> +static av_always_inline AVBoundingBox*
> +av_get_bounding_box(const AVBoundingBoxHeader *header, unsigned int idx)
> +{
> +    av_assert0(idx < header->nb_bboxes);
> +    return (AVBoundingBox *)((uint8_t *)header + header->bboxes_offset +
> +                             idx * header->bbox_size);
> +}
> +
> +/**
> + * Allocates memory for AVBoundingBoxHeader, plus an array of {@code nb_bboxes}
> + * AVBoundingBox, and initializes the variables.
> + * Can be freed with a normal av_free() call.
> + *
> + * @param out_size if non-NULL, the size in bytes of the resulting data array is
> + * written here.
> + */
> +AVBoundingBoxHeader *av_bbox_alloc(uint32_t nb_bboxes, size_t *out_size);
> +
> +/**
> + * Allocates memory for AVBoundingBoxHeader, plus an array of {@code nb_bboxes}
> + * AVBoundingBox, in the given AVFrame {@code frame} as AVFrameSideData of type
> + * AV_FRAME_DATA_BOUNDING_BOXES and initializes the variables.
> + */
> +AVBoundingBoxHeader* av_bbox_create_side_data(AVFrame *frame, uint32_t nb_bboxes);
> +#endif
> diff --git a/libavutil/frame.c b/libavutil/frame.c
> index fefb2b69c0..d8e86263af 100644
> --- a/libavutil/frame.c
> +++ b/libavutil/frame.c
> @@ -853,6 +853,7 @@ const char *av_frame_side_data_name(enum AVFrameSideDataType type)
>  case AV_FRAME_DATA_VIDEO_ENC_PARAMS:            return "Video encoding parameters";
>  case AV_FRAME_DATA_SEI_UNREGISTERED:            return "H.26[45] User Data Unregistered SEI message";
>  case AV_FRAME_DATA_FILM_GRAIN_PARAMS:           return "Film grain parameters";
> +    case AV_FRAME_DATA_BOUNDING_BOXES:              return "Bounding boxes";
>  }
>  return NULL;
>  }
> diff --git a/libavutil/frame.h b/libavutil/frame.h
> index a5ed91b20a..41e22de02a 100644
> --- a/libavutil/frame.h
> +++ b/libavutil/frame.h
> @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
>  * Must be present for every frame which should have film grain applied.
>  */
>  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
> +
> +    /**
> +     * Bounding boxes for object detection and classification, the data is a AVBoundingBoxHeader
> +     * followed with an array of AVBoudingBox, and AVBoundingBoxHeader.nb_bboxes is the number
> +     * of array element.
> +     */
> +    AV_FRAME_DATA_BOUNDING_BOXES,
>  }; 
>

Finally, why call it a Bounding Box? It's not descriptive at all.
How about "Object Classification"? It makes much more sense, it's
exactly what this is. So AVObjectClassification, AVObjectClassification,
AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
Guo, Yejun April 8, 2021, 2:03 a.m. UTC | #6
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Andreas Rheinhardt
> Sent: 2021年4月8日 0:07
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> Guo, Yejun:
> >
> >
> >> -----Original Message-----
> >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> >> Andreas Rheinhardt
> >> Sent: 2021年4月7日 22:44
> >> To: ffmpeg-devel@ffmpeg.org
> >> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> >> AV_FRAME_DATA_BOUNDING_BOXES
> >>
> >>> --- /dev/null
> >>> +++ b/libavutil/boundingbox.c
> >>> @@ -0,0 +1,73 @@
> >>> +/*
> >>> + * This file is part of FFmpeg.
> >>> + *
> >>> + * FFmpeg is free software; you can redistribute it and/or
> >>> + * modify it under the terms of the GNU Lesser General Public
> >>> + * License as published by the Free Software Foundation; either
> >>> + * version 2.1 of the License, or (at your option) any later version.
> >>> + *
> >>> + * FFmpeg is distributed in the hope that it will be useful,
> >>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >> GNU
> >>> + * Lesser General Public License for more details.
> >>> + *
> >>> + * You should have received a copy of the GNU Lesser General Public
> >>> + * License along with FFmpeg; if not, write to the Free Software
> >>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> >> USA
> >>> + */
> >>> +
> >>> +#include "boundingbox.h"
> >>> +
> >>> +AVBoundingBoxHeader *av_bbox_alloc(uint32_t nb_bboxes, size_t
> >> *out_size)
> >>> +{
> >>> +    size_t size;
> >>> +    struct {
> >>> +	    AVBoundingBoxHeader header;
> >>> +	    AVBoundingBox boxes[];
> >>
> >> Hopefully all compilers accept a flexible array member; if not, using
> >> boxes[1] (or just boxes) would be sufficient.
> >
> > patchwork is passed, but I'm not 100% sure all the compilers accept it,
> > I'll change to boxes[1].

Looks that there will be possible padding between boxes[0] and boxes[1]
by using 'boxes[1] or just boxes'. Based on our design, it still works by
considering these padding bytes at the end of the array. I'll continue it.

> >
> >>
> >>> +    } *ret;
> >>> +
> >>> +    size = sizeof(*ret);
> >>> +    if (nb_bboxes > (SIZE_MAX - size) / sizeof(*ret->boxes))
> >>> +        return NULL;
> >>> +    size += sizeof(*ret->boxes) * nb_bboxes;
> >>> +
> >>> +    ret = av_mallocz(size);
> >>> +    if (!ret)
> >>> +        return NULL;
> >>> +
> >>> +    ret->header.nb_bboxes = nb_bboxes;
> >>> +    ret->header.bbox_size = sizeof(*ret->boxes);
> >>> +    ret->header.bboxes_offset = (char *)&ret->boxes - (char
> *)&ret->header;
> >>
> >> Using offsetof would be clearer (for this you have to declare a proper
> >> type).
> >
> > maybe both are ok.
> >
> >>
> >>> +
> >>> +    if (out_size)
> >>> +        *out_size = sizeof(*ret);
> >>> +
> >>> +    return &ret->header;
> >>> +}
> >>> +
> >>> +AVBoundingBoxHeader* av_bbox_create_side_data(AVFrame *frame,
> >> uint32_t nb_bboxes)
> >>> +{
> >>> +    AVBufferRef         *buf;
> >>> +    AVBoundingBoxHeader *header;
> >>> +    size_t size;
> >>> +
> >>> +    header = av_bbox_alloc(nb_bboxes, &size);
> >>> +    if (!header)
> >>> +        return NULL;
> >>> +    if (size > INT_MAX) {
> >>> +        av_freep(&header);
> >>> +        return NULL;
> >>> +    }
> >>
> >> Will be obsolete soon.
> >
> > will remove the check, thanks.
> >
> 
> My remark was to inform you that this check should be removed if this
> patch is committed after the bump (when the check is obsolete (and
> harmful)). You should not remove it right now.

got it, thanks.
Guo, Yejun April 8, 2021, 2:48 a.m. UTC | #7
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Lynne
> Sent: 2021年4月8日 5:04
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> Apr 7, 2021, 16:17 by yejun.guo@intel.com:
> 
> > Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> > ---
> >  doc/APIchanges          |   2 +
> >  libavutil/Makefile      |   2 +
> >  libavutil/boundingbox.c |  73 +++++++++++++++++++++++++
> >  libavutil/boundingbox.h | 114
> ++++++++++++++++++++++++++++++++++++++++
> >  libavutil/frame.c       |   1 +
> >  libavutil/frame.h       |   7 +++
> >  6 files changed, 199 insertions(+)
> >  create mode 100644 libavutil/boundingbox.c
> >  create mode 100644 libavutil/boundingbox.h
> >
> > diff --git a/doc/APIchanges b/doc/APIchanges
> > index 9dfcc97d5c..81d01aac1e 100644
> > --- a/doc/APIchanges
> > +++ b/doc/APIchanges
> > @@ -14,6 +14,8 @@ libavutil:     2017-10-21
> >
> >
> >  API changes, most recent first:
> > +2021-04-xx - xxxxxxxxxx - lavu 56.xx.100 - frame.h boundingbox.h
> > +  Add AV_FRAME_DATA_BOUNDING_BOXES
> >
> >  2021-04-06 - xxxxxxxxxx - lavf 58.78.100 - avformat.h
> >  Add avformat_index_get_entries_count(), avformat_index_get_entry(),
> > diff --git a/libavutil/Makefile b/libavutil/Makefile
> > index 27bafe9e12..f6d21bb5ce 100644
> > --- a/libavutil/Makefile
> > +++ b/libavutil/Makefile
> > @@ -11,6 +11,7 @@ HEADERS = adler32.h
> \
> >  avutil.h                                                      \
> >  base64.h                                                      \
> >  blowfish.h                                                    \
> > +          boundingbox.h
> \
> >  bprint.h                                                      \
> >  bswap.h                                                       \
> >  buffer.h                                                      \
> > @@ -104,6 +105,7 @@ OBJS = adler32.o
> \
> >  avsscanf.o
> \
> >  base64.o
> \
> >  blowfish.o
> \
> > +       boundingbox.o
> \
> >  bprint.o
> \
> >  buffer.o                                                         \
> >  cast5.o
> \
> > diff --git a/libavutil/boundingbox.c b/libavutil/boundingbox.c
> > new file mode 100644
> > index 0000000000..881661ec18
> > --- /dev/null
> > +++ b/libavutil/boundingbox.c
> > @@ -0,0 +1,73 @@
> > +/*
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> > + */
> > +
> > +#include "boundingbox.h"
> > +
> > +AVBoundingBoxHeader *av_bbox_alloc(uint32_t nb_bboxes, size_t
> *out_size)
> > +{
> > +    size_t size;
> > +    struct {
> > +	    AVBoundingBoxHeader header;
> > +	    AVBoundingBox boxes[];
> > +    } *ret;
> > +
> > +    size = sizeof(*ret);
> > +    if (nb_bboxes > (SIZE_MAX - size) / sizeof(*ret->boxes))
> > +        return NULL;
> > +    size += sizeof(*ret->boxes) * nb_bboxes;
> > +
> > +    ret = av_mallocz(size);
> > +    if (!ret)
> > +        return NULL;
> > +
> > +    ret->header.nb_bboxes = nb_bboxes;
> > +    ret->header.bbox_size = sizeof(*ret->boxes);
> > +    ret->header.bboxes_offset = (char *)&ret->boxes - (char *)&ret->header;
> > +
> > +    if (out_size)
> > +        *out_size = sizeof(*ret);
> > +
> > +    return &ret->header;
> > +}
> > +
> > +AVBoundingBoxHeader* av_bbox_create_side_data(AVFrame *frame,
> uint32_t nb_bboxes)
> >
> 
> Fix this everywhere. We never put the * before the name. This is a misleading
> coding style that makes absolutely no sense.

Do you mean change from:
AVBoundingBoxHeader* av_bbox_create_side_data
to:
AVBoundingBoxHeader *av_bbox_create_side_data

Will fix it, thanks.

btw, I think the coding style is to put '*' just before the name, for example,
AVFrame *frame;
const AVClass *class;

> 
> 
> > +{
> > +    AVBufferRef         *buf;
> > +    AVBoundingBoxHeader *header;
> > +    size_t size;
> > +
> > +    header = av_bbox_alloc(nb_bboxes, &size);
> > +    if (!header)
> > +        return NULL;
> > +    if (size > INT_MAX) {
> > +        av_freep(&header);
> > +        return NULL;
> > +    }
> > +    buf = av_buffer_create((uint8_t *)header, size, NULL, NULL, 0);
> > +    if (!buf) {
> > +        av_freep(&header);
> > +        return NULL;
> > +    }
> > +
> > +    if (!av_frame_new_side_data_from_buf(frame,
> AV_FRAME_DATA_BOUNDING_BOXES, buf)) {
> > +        av_buffer_unref(&buf);
> > +        return NULL;
> > +    }
> > +
> > +    return header;
> > +}
> > diff --git a/libavutil/boundingbox.h b/libavutil/boundingbox.h
> > new file mode 100644
> > index 0000000000..2b77c6c0dc
> > --- /dev/null
> > +++ b/libavutil/boundingbox.h
> > @@ -0,0 +1,114 @@
> > +/*
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> > + */
> > +
> > +#ifndef AVUTIL_BOUNDINGBOX_H
> > +#define AVUTIL_BOUNDINGBOX_H
> > +
> > +#include "rational.h"
> > +#include "avassert.h"
> > +#include "frame.h"
> > +
> > +typedef struct AVBoundingBox {
> > +    /**
> > +     * Distance in pixels from the top edge of the frame to top
> > +     * and bottom, and from the left edge of the frame to left and
> > +     * right, defining the bounding box.
> > +     */
> > +    int top;
> > +    int left;
> > +    int bottom;
> > +    int right;
> >
> 
> Why not x, y, w, h?
> It makes more sense, all of coordinates go like this.

We want to keep it consistent with 'struct AVRegionOfInterest',
we also have a plan to add a filter which converts from bounding box
to RegionOfInterest which impacts the encoder.

> 
> 
> > +
> > +#define AV_BBOX_LABEL_NAME_MAX_SIZE 32
> >
> 
> Better bump this to 64. It won't be enough, and will
> require a major bump to increment.

I usually don't see a very long label name, and you are right,
we'd better bump it for safe. Will change, thanks.

> 
> 
> > +
> > +    /**
> > +     * Detect result with confidence
> > +     */
> > +    char detect_label[AV_BBOX_LABEL_NAME_MAX_SIZE];
> > +    AVRational detect_confidence;
> > +
> > +    /**
> > +     * At most 4 classifications based on the detected bounding box.
> > +     * For example, we can get max 4 different attributes with 4 different
> > +     * DNN models on one bounding box.
> > +     * classify_count is zero if no classification.
> > +     */
> > +#define AV_NUM_BBOX_CLASSIFY 4
> > +    uint32_t classify_count;
> > +    char
> classify_labels[AV_NUM_BBOX_CLASSIFY][AV_BBOX_LABEL_NAME_MAX_SIZE];
> > +    AVRational classify_confidences[AV_NUM_BBOX_CLASSIFY];
> > +} AVBoundingBox;
> > +
> > +typedef struct AVBoundingBoxHeader {
> > +    /**
> > +     * Information about how the bounding box is generated.
> > +     * for example, the DNN model name.
> > +     */
> > +    char source[128];
> > +
> > +    /**
> > +     * The size of frame when it is detected.
> > +     */
> > +    int frame_width;
> > +    int frame_height;
> >
> 
> Why? This side data is attached to AVFrames only, where we
> already have width and height.

The detection result will be used by other filters, for example,
dnn_classify (see https://github.com/guoyejun/ffmpeg/tree/classify).

The filter dnn_detect detects all the objects (cat, dog, person ...) in a
frame, while dnn_classify classifies one detected object (for example, person)
for its attribute (for example, emotion, etc.)

The filter dnn_classify have to check if the frame size is changed after
it is detected, to handle the below filter chain:
dnn_detect -> scale -> dnn_classify

> 
> 
> > +
> > +    /**
> > +     * Number of bounding boxes in the array.
> > +     */
> > +    uint32_t nb_bboxes;
> > +
> > +    /**
> > +     * Offset in bytes from the beginning of this structure at which
> > +     * the array of bounding boxes starts.
> > +     */
> > +    size_t bboxes_offset;
> > +
> > +    /**
> > +     * Size of each bounding box in bytes.
> > +     */
> > +    size_t bbox_size;
> > +} AVBoundingBoxHeader;
> > +
> > +/*
> > + * Get the bounding box at the specified {@code idx}. Must be between 0
> and nb_bboxes.
> > + */
> > +static av_always_inline AVBoundingBox*
> > +av_get_bounding_box(const AVBoundingBoxHeader *header, unsigned int
> idx)
> > +{
> > +    av_assert0(idx < header->nb_bboxes);
> > +    return (AVBoundingBox *)((uint8_t *)header + header->bboxes_offset +
> > +                             idx * header->bbox_size);
> > +}
> > +
> > +/**
> > + * Allocates memory for AVBoundingBoxHeader, plus an array of {@code
> nb_bboxes}
> > + * AVBoundingBox, and initializes the variables.
> > + * Can be freed with a normal av_free() call.
> > + *
> > + * @param out_size if non-NULL, the size in bytes of the resulting data array
> is
> > + * written here.
> > + */
> > +AVBoundingBoxHeader *av_bbox_alloc(uint32_t nb_bboxes, size_t
> *out_size);
> > +
> > +/**
> > + * Allocates memory for AVBoundingBoxHeader, plus an array of {@code
> nb_bboxes}
> > + * AVBoundingBox, in the given AVFrame {@code frame} as
> AVFrameSideData of type
> > + * AV_FRAME_DATA_BOUNDING_BOXES and initializes the variables.
> > + */
> > +AVBoundingBoxHeader* av_bbox_create_side_data(AVFrame *frame,
> uint32_t nb_bboxes);
> > +#endif
> > diff --git a/libavutil/frame.c b/libavutil/frame.c
> > index fefb2b69c0..d8e86263af 100644
> > --- a/libavutil/frame.c
> > +++ b/libavutil/frame.c
> > @@ -853,6 +853,7 @@ const char *av_frame_side_data_name(enum
> AVFrameSideDataType type)
> >  case AV_FRAME_DATA_VIDEO_ENC_PARAMS:            return "Video
> encoding parameters";
> >  case AV_FRAME_DATA_SEI_UNREGISTERED:            return "H.26[45]
> User Data Unregistered SEI message";
> >  case AV_FRAME_DATA_FILM_GRAIN_PARAMS:           return "Film
> grain parameters";
> > +    case AV_FRAME_DATA_BOUNDING_BOXES:              return
> "Bounding boxes";
> >  }
> >  return NULL;
> >  }
> > diff --git a/libavutil/frame.h b/libavutil/frame.h
> > index a5ed91b20a..41e22de02a 100644
> > --- a/libavutil/frame.h
> > +++ b/libavutil/frame.h
> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
> >  * Must be present for every frame which should have film grain applied.
> >  */
> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
> > +
> > +    /**
> > +     * Bounding boxes for object detection and classification, the data is a
> AVBoundingBoxHeader
> > +     * followed with an array of AVBoudingBox, and
> AVBoundingBoxHeader.nb_bboxes is the number
> > +     * of array element.
> > +     */
> > +    AV_FRAME_DATA_BOUNDING_BOXES,
> >  };
> >
> 
> Finally, why call it a Bounding Box? It's not descriptive at all.
> How about "Object Classification"? It makes much more sense, it's
> exactly what this is. So AVObjectClassification, AVObjectClassification,
> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.

In object detection papers, bounding box is usually used.
We'd better use the same term, imho, thanks.
Lynne April 8, 2021, 4:13 a.m. UTC | #8
Apr 8, 2021, 04:48 by yejun.guo@intel.com:

>
>
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Lynne
>> Sent: 2021年4月8日 5:04
>> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
>> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
>> AV_FRAME_DATA_BOUNDING_BOXES
>>
>> Apr 7, 2021, 16:17 by yejun.guo@intel.com:
>>
>> > Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
>> > ---
>> >  doc/APIchanges          |   2 +
>> >  libavutil/Makefile      |   2 +
>> >  libavutil/boundingbox.c |  73 +++++++++++++++++++++++++
>> >  libavutil/boundingbox.h | 114
>> ++++++++++++++++++++++++++++++++++++++++
>> >  libavutil/frame.c       |   1 +
>> >  libavutil/frame.h       |   7 +++
>> >  6 files changed, 199 insertions(+)
>> >  create mode 100644 libavutil/boundingbox.c
>> >  create mode 100644 libavutil/boundingbox.h
>> >
>> > diff --git a/doc/APIchanges b/doc/APIchanges
>> > index 9dfcc97d5c..81d01aac1e 100644
>> > --- a/doc/APIchanges
>> > +++ b/doc/APIchanges
>> > @@ -14,6 +14,8 @@ libavutil:     2017-10-21
>> >
>> >
>> >  API changes, most recent first:
>> > +2021-04-xx - xxxxxxxxxx - lavu 56.xx.100 - frame.h boundingbox.h
>> > +  Add AV_FRAME_DATA_BOUNDING_BOXES
>> >
>> >  2021-04-06 - xxxxxxxxxx - lavf 58.78.100 - avformat.h
>> >  Add avformat_index_get_entries_count(), avformat_index_get_entry(),
>> > diff --git a/libavutil/Makefile b/libavutil/Makefile
>> > index 27bafe9e12..f6d21bb5ce 100644
>> > --- a/libavutil/Makefile
>> > +++ b/libavutil/Makefile
>> > @@ -11,6 +11,7 @@ HEADERS = adler32.h
>> \
>> >  avutil.h                                                      \
>> >  base64.h                                                      \
>> >  blowfish.h                                                    \
>> > +          boundingbox.h
>> \
>> >  bprint.h                                                      \
>> >  bswap.h                                                       \
>> >  buffer.h                                                      \
>> > @@ -104,6 +105,7 @@ OBJS = adler32.o
>> \
>> >  avsscanf.o
>> \
>> >  base64.o
>> \
>> >  blowfish.o
>> \
>> > +       boundingbox.o
>> \
>> >  bprint.o
>> \
>> >  buffer.o                                                         \
>> >  cast5.o
>> \
>> > diff --git a/libavutil/boundingbox.c b/libavutil/boundingbox.c
>> > new file mode 100644
>> > index 0000000000..881661ec18
>> > --- /dev/null
>> > +++ b/libavutil/boundingbox.c
>> > @@ -0,0 +1,73 @@
>> > +/*
>> > + * This file is part of FFmpeg.
>> > + *
>> > + * FFmpeg is free software; you can redistribute it and/or
>> > + * modify it under the terms of the GNU Lesser General Public
>> > + * License as published by the Free Software Foundation; either
>> > + * version 2.1 of the License, or (at your option) any later version.
>> > + *
>> > + * FFmpeg is distributed in the hope that it will be useful,
>> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> GNU
>> > + * Lesser General Public License for more details.
>> > + *
>> > + * You should have received a copy of the GNU Lesser General Public
>> > + * License along with FFmpeg; if not, write to the Free Software
>> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
>> USA
>> > + */
>> > +
>> > +#include "boundingbox.h"
>> > +
>> > +AVBoundingBoxHeader *av_bbox_alloc(uint32_t nb_bboxes, size_t
>> *out_size)
>> > +{
>> > +    size_t size;
>> > +    struct {
>> > +	    AVBoundingBoxHeader header;
>> > +	    AVBoundingBox boxes[];
>> > +    } *ret;
>> > +
>> > +    size = sizeof(*ret);
>> > +    if (nb_bboxes > (SIZE_MAX - size) / sizeof(*ret->boxes))
>> > +        return NULL;
>> > +    size += sizeof(*ret->boxes) * nb_bboxes;
>> > +
>> > +    ret = av_mallocz(size);
>> > +    if (!ret)
>> > +        return NULL;
>> > +
>> > +    ret->header.nb_bboxes = nb_bboxes;
>> > +    ret->header.bbox_size = sizeof(*ret->boxes);
>> > +    ret->header.bboxes_offset = (char *)&ret->boxes - (char *)&ret->header;
>> > +
>> > +    if (out_size)
>> > +        *out_size = sizeof(*ret);
>> > +
>> > +    return &ret->header;
>> > +}
>> > +
>> > +AVBoundingBoxHeader* av_bbox_create_side_data(AVFrame *frame,
>> uint32_t nb_bboxes)
>> >
>>
>> Fix this everywhere. We never put the * before the name. This is a misleading
>> coding style that makes absolutely no sense.
>>
>
> Do you mean change from:
> AVBoundingBoxHeader* av_bbox_create_side_data
> to:
> AVBoundingBoxHeader *av_bbox_create_side_data
>
> Will fix it, thanks.
>
> btw, I think the coding style is to put '*' just before the name, for example,
> AVFrame *frame;
> const AVClass *class;
>
>>
>>
>> > +{
>> > +    AVBufferRef         *buf;
>> > +    AVBoundingBoxHeader *header;
>> > +    size_t size;
>> > +
>> > +    header = av_bbox_alloc(nb_bboxes, &size);
>> > +    if (!header)
>> > +        return NULL;
>> > +    if (size > INT_MAX) {
>> > +        av_freep(&header);
>> > +        return NULL;
>> > +    }
>> > +    buf = av_buffer_create((uint8_t *)header, size, NULL, NULL, 0);
>> > +    if (!buf) {
>> > +        av_freep(&header);
>> > +        return NULL;
>> > +    }
>> > +
>> > +    if (!av_frame_new_side_data_from_buf(frame,
>> AV_FRAME_DATA_BOUNDING_BOXES, buf)) {
>> > +        av_buffer_unref(&buf);
>> > +        return NULL;
>> > +    }
>> > +
>> > +    return header;
>> > +}
>> > diff --git a/libavutil/boundingbox.h b/libavutil/boundingbox.h
>> > new file mode 100644
>> > index 0000000000..2b77c6c0dc
>> > --- /dev/null
>> > +++ b/libavutil/boundingbox.h
>> > @@ -0,0 +1,114 @@
>> > +/*
>> > + * This file is part of FFmpeg.
>> > + *
>> > + * FFmpeg is free software; you can redistribute it and/or
>> > + * modify it under the terms of the GNU Lesser General Public
>> > + * License as published by the Free Software Foundation; either
>> > + * version 2.1 of the License, or (at your option) any later version.
>> > + *
>> > + * FFmpeg is distributed in the hope that it will be useful,
>> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> GNU
>> > + * Lesser General Public License for more details.
>> > + *
>> > + * You should have received a copy of the GNU Lesser General Public
>> > + * License along with FFmpeg; if not, write to the Free Software
>> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
>> USA
>> > + */
>> > +
>> > +#ifndef AVUTIL_BOUNDINGBOX_H
>> > +#define AVUTIL_BOUNDINGBOX_H
>> > +
>> > +#include "rational.h"
>> > +#include "avassert.h"
>> > +#include "frame.h"
>> > +
>> > +typedef struct AVBoundingBox {
>> > +    /**
>> > +     * Distance in pixels from the top edge of the frame to top
>> > +     * and bottom, and from the left edge of the frame to left and
>> > +     * right, defining the bounding box.
>> > +     */
>> > +    int top;
>> > +    int left;
>> > +    int bottom;
>> > +    int right;
>> >
>>
>> Why not x, y, w, h?
>> It makes more sense, all of coordinates go like this.
>>
>
> We want to keep it consistent with 'struct AVRegionOfInterest',
> we also have a plan to add a filter which converts from bounding box
> to RegionOfInterest which impacts the encoder.
>

Not a good idea. Region of interest is only useful to indicate a
single region, while this API describes multiple regions
within the frame. The video_enc_params API follows the
x, y, w, h because it's the easiest to work with, so I'm sure
it's well worth going to such coordinates.


>>
>> > +
>> > +#define AV_BBOX_LABEL_NAME_MAX_SIZE 32
>> >
>>
>> Better bump this to 64. It won't be enough, and will
>> require a major bump to increment.
>>
>
> I usually don't see a very long label name, and you are right,
> we'd better bump it for safe. Will change, thanks.
>
>>
>>
>> > +
>> > +    /**
>> > +     * Detect result with confidence
>> > +     */
>> > +    char detect_label[AV_BBOX_LABEL_NAME_MAX_SIZE];
>> > +    AVRational detect_confidence;
>> > +
>> > +    /**
>> > +     * At most 4 classifications based on the detected bounding box.
>> > +     * For example, we can get max 4 different attributes with 4 different
>> > +     * DNN models on one bounding box.
>> > +     * classify_count is zero if no classification.
>> > +     */
>> > +#define AV_NUM_BBOX_CLASSIFY 4
>> > +    uint32_t classify_count;
>> > +    char
>> classify_labels[AV_NUM_BBOX_CLASSIFY][AV_BBOX_LABEL_NAME_MAX_SIZE];
>> > +    AVRational classify_confidences[AV_NUM_BBOX_CLASSIFY];
>> > +} AVBoundingBox;
>> > +
>> > +typedef struct AVBoundingBoxHeader {
>> > +    /**
>> > +     * Information about how the bounding box is generated.
>> > +     * for example, the DNN model name.
>> > +     */
>> > +    char source[128];
>> > +
>> > +    /**
>> > +     * The size of frame when it is detected.
>> > +     */
>> > +    int frame_width;
>> > +    int frame_height;
>> >
>>
>> Why? This side data is attached to AVFrames only, where we
>> already have width and height.
>>
>
> The detection result will be used by other filters, for example,
> dnn_classify (see https://github.com/guoyejun/ffmpeg/tree/classify).
>
> The filter dnn_detect detects all the objects (cat, dog, person ...) in a
> frame, while dnn_classify classifies one detected object (for example, person)
> for its attribute (for example, emotion, etc.)
>
> The filter dnn_classify have to check if the frame size is changed after
> it is detected, to handle the below filter chain:
> dnn_detect -> scale -> dnn_classify
>

This doesn't look good. Why is dnn_classify needing to know
the original frame size at all?


>> > +
>> > +    /**
>> > +     * Number of bounding boxes in the array.
>> > +     */
>> > +    uint32_t nb_bboxes;
>> > +
>> > +    /**
>> > +     * Offset in bytes from the beginning of this structure at which
>> > +     * the array of bounding boxes starts.
>> > +     */
>> > +    size_t bboxes_offset;
>> > +
>> > +    /**
>> > +     * Size of each bounding box in bytes.
>> > +     */
>> > +    size_t bbox_size;
>> > +} AVBoundingBoxHeader;
>> > +
>> > +/*
>> > + * Get the bounding box at the specified {@code idx}. Must be between 0
>> and nb_bboxes.
>> > + */
>> > +static av_always_inline AVBoundingBox*
>> > +av_get_bounding_box(const AVBoundingBoxHeader *header, unsigned int
>> idx)
>> > +{
>> > +    av_assert0(idx < header->nb_bboxes);
>> > +    return (AVBoundingBox *)((uint8_t *)header + header->bboxes_offset +
>> > +                             idx * header->bbox_size);
>> > +}
>> > +
>> > +/**
>> > + * Allocates memory for AVBoundingBoxHeader, plus an array of {@code
>> nb_bboxes}
>> > + * AVBoundingBox, and initializes the variables.
>> > + * Can be freed with a normal av_free() call.
>> > + *
>> > + * @param out_size if non-NULL, the size in bytes of the resulting data array
>> is
>> > + * written here.
>> > + */
>> > +AVBoundingBoxHeader *av_bbox_alloc(uint32_t nb_bboxes, size_t
>> *out_size);
>> > +
>> > +/**
>> > + * Allocates memory for AVBoundingBoxHeader, plus an array of {@code
>> nb_bboxes}
>> > + * AVBoundingBox, in the given AVFrame {@code frame} as
>> AVFrameSideData of type
>> > + * AV_FRAME_DATA_BOUNDING_BOXES and initializes the variables.
>> > + */
>> > +AVBoundingBoxHeader* av_bbox_create_side_data(AVFrame *frame,
>> uint32_t nb_bboxes);
>> > +#endif
>> > diff --git a/libavutil/frame.c b/libavutil/frame.c
>> > index fefb2b69c0..d8e86263af 100644
>> > --- a/libavutil/frame.c
>> > +++ b/libavutil/frame.c
>> > @@ -853,6 +853,7 @@ const char *av_frame_side_data_name(enum
>> AVFrameSideDataType type)
>> >  case AV_FRAME_DATA_VIDEO_ENC_PARAMS:            return "Video
>> encoding parameters";
>> >  case AV_FRAME_DATA_SEI_UNREGISTERED:            return "H.26[45]
>> User Data Unregistered SEI message";
>> >  case AV_FRAME_DATA_FILM_GRAIN_PARAMS:           return "Film
>> grain parameters";
>> > +    case AV_FRAME_DATA_BOUNDING_BOXES:              return
>> "Bounding boxes";
>> >  }
>> >  return NULL;
>> >  }
>> > diff --git a/libavutil/frame.h b/libavutil/frame.h
>> > index a5ed91b20a..41e22de02a 100644
>> > --- a/libavutil/frame.h
>> > +++ b/libavutil/frame.h
>> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
>> >  * Must be present for every frame which should have film grain applied.
>> >  */
>> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
>> > +
>> > +    /**
>> > +     * Bounding boxes for object detection and classification, the data is a
>> AVBoundingBoxHeader
>> > +     * followed with an array of AVBoudingBox, and
>> AVBoundingBoxHeader.nb_bboxes is the number
>> > +     * of array element.
>> > +     */
>> > +    AV_FRAME_DATA_BOUNDING_BOXES,
>> >  };
>> >
>>
>> Finally, why call it a Bounding Box? It's not descriptive at all.
>> How about "Object Classification"? It makes much more sense, it's
>> exactly what this is. So AVObjectClassification, AVObjectClassification,
>> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
>>
>
> In object detection papers, bounding box is usually used.
> We'd better use the same term, imho, thanks.
>

Not in this case, API users won't have any idea what this is or what
it's for. This is user-facing code after all.
Papers in fields can get away with overloading language, but we're
trying to make a concise API. Object classification makes sense
because this is exactly what this is.
Guo, Yejun April 8, 2021, 5:35 a.m. UTC | #9
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Lynne
> Sent: 2021年4月8日 12:14
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> Apr 8, 2021, 04:48 by yejun.guo@intel.com:
> 
> >> > +
> >> > +#ifndef AVUTIL_BOUNDINGBOX_H
> >> > +#define AVUTIL_BOUNDINGBOX_H
> >> > +
> >> > +#include "rational.h"
> >> > +#include "avassert.h"
> >> > +#include "frame.h"
> >> > +
> >> > +typedef struct AVBoundingBox {
> >> > +    /**
> >> > +     * Distance in pixels from the top edge of the frame to top
> >> > +     * and bottom, and from the left edge of the frame to left and
> >> > +     * right, defining the bounding box.
> >> > +     */
> >> > +    int top;
> >> > +    int left;
> >> > +    int bottom;
> >> > +    int right;
> >> >
> >>
> >> Why not x, y, w, h?
> >> It makes more sense, all of coordinates go like this.
> >>
> >
> > We want to keep it consistent with 'struct AVRegionOfInterest',
> > we also have a plan to add a filter which converts from bounding box
> > to RegionOfInterest which impacts the encoder.
> >
> 
> Not a good idea. Region of interest is only useful to indicate a
> single region, while this API describes multiple regions
> within the frame. The video_enc_params API follows the
> x, y, w, h because it's the easiest to work with, so I'm sure
> it's well worth going to such coordinates.
> 

RegionOfInterest is similar with boundingbox, it is used for multiple
regions in an array, see AV_FRAME_DATA_REGIONS_OF_INTEREST.

anyway, I'm ok to change to x,y,w,h.

btw, do we need to change to x,y,w,h for RegionOfInterest? Is such
change allowed after current major version change?

> >> > +
> >> > +typedef struct AVBoundingBoxHeader {
> >> > +    /**
> >> > +     * Information about how the bounding box is generated.
> >> > +     * for example, the DNN model name.
> >> > +     */
> >> > +    char source[128];
> >> > +
> >> > +    /**
> >> > +     * The size of frame when it is detected.
> >> > +     */
> >> > +    int frame_width;
> >> > +    int frame_height;
> >> >
> >>
> >> Why? This side data is attached to AVFrames only, where we
> >> already have width and height.
> >>
> >
> > The detection result will be used by other filters, for example,
> > dnn_classify (see https://github.com/guoyejun/ffmpeg/tree/classify).
> >
> > The filter dnn_detect detects all the objects (cat, dog, person ...) in a
> > frame, while dnn_classify classifies one detected object (for example, person)
> > for its attribute (for example, emotion, etc.)
> >
> > The filter dnn_classify have to check if the frame size is changed after
> > it is detected, to handle the below filter chain:
> > dnn_detect -> scale -> dnn_classify
> >
> 
> This doesn't look good. Why is dnn_classify needing to know
> the original frame size at all?

For example, the original size of the frame is 100*100, and dnn_detect
detects a face at place (10, 10) -> (30, 40), such data will be saved in
AVBoundingBox.top/left/right/bottom.

Then, the frame is scaled into 50*50.

Then, dnn_classify is used to analyze the emotion of the face, it needs to
know the frame size (100*100) when it is detected, otherwise, it does not
work with just (10,10), (30,40) and 50*50.

> >> > diff --git a/libavutil/frame.h b/libavutil/frame.h
> >> > index a5ed91b20a..41e22de02a 100644
> >> > --- a/libavutil/frame.h
> >> > +++ b/libavutil/frame.h
> >> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
> >> >  * Must be present for every frame which should have film grain applied.
> >> >  */
> >> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
> >> > +
> >> > +    /**
> >> > +     * Bounding boxes for object detection and classification, the data is
> a
> >> AVBoundingBoxHeader
> >> > +     * followed with an array of AVBoudingBox, and
> >> AVBoundingBoxHeader.nb_bboxes is the number
> >> > +     * of array element.
> >> > +     */
> >> > +    AV_FRAME_DATA_BOUNDING_BOXES,
> >> >  };
> >> >
> >>
> >> Finally, why call it a Bounding Box? It's not descriptive at all.
> >> How about "Object Classification"? It makes much more sense, it's
> >> exactly what this is. So AVObjectClassification, AVObjectClassification,
> >> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
> >>
> >
> > In object detection papers, bounding box is usually used.
> > We'd better use the same term, imho, thanks.
> >
> 
> Not in this case, API users won't have any idea what this is or what
> it's for. This is user-facing code after all.
> Papers in fields can get away with overloading language, but we're
> trying to make a concise API. Object classification makes sense
> because this is exactly what this is.

The term bounding box is dominating the domain, for example, even
HEVC spec uses this term, see page 317 of
https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-H.265-201911-I!!PDF-E&type=items

also copy some here for your convenient.
ar_bounding_box_top[ ar_object_idx[ i ] ] u(16)
ar_bounding_box_left[ ar_object_idx[ i ] ] u(16)
ar_bounding_box_width[ ar_object_idx[ i ] ] u(16)
ar_bounding_box_height[ ar_object_idx[ i ] ] u(16)

I would prefer to use bounding box.
Lynne April 8, 2021, 11:35 a.m. UTC | #10
Apr 8, 2021, 07:35 by yejun.guo@intel.com:

>
>
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Lynne
>> Sent: 2021年4月8日 12:14
>> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
>> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
>> AV_FRAME_DATA_BOUNDING_BOXES
>>
>> Apr 8, 2021, 04:48 by yejun.guo@intel.com:
>>
>> >> > +
>> >> > +#ifndef AVUTIL_BOUNDINGBOX_H
>> >> > +#define AVUTIL_BOUNDINGBOX_H
>> >> > +
>> >> > +#include "rational.h"
>> >> > +#include "avassert.h"
>> >> > +#include "frame.h"
>> >> > +
>> >> > +typedef struct AVBoundingBox {
>> >> > +    /**
>> >> > +     * Distance in pixels from the top edge of the frame to top
>> >> > +     * and bottom, and from the left edge of the frame to left and
>> >> > +     * right, defining the bounding box.
>> >> > +     */
>> >> > +    int top;
>> >> > +    int left;
>> >> > +    int bottom;
>> >> > +    int right;
>> >> >
>> >>
>> >> Why not x, y, w, h?
>> >> It makes more sense, all of coordinates go like this.
>> >>
>> >
>> > We want to keep it consistent with 'struct AVRegionOfInterest',
>> > we also have a plan to add a filter which converts from bounding box
>> > to RegionOfInterest which impacts the encoder.
>> >
>>
>> Not a good idea. Region of interest is only useful to indicate a
>> single region, while this API describes multiple regions
>> within the frame. The video_enc_params API follows the
>> x, y, w, h because it's the easiest to work with, so I'm sure
>> it's well worth going to such coordinates.
>>
>
> RegionOfInterest is similar with boundingbox, it is used for multiple
> regions in an array, see AV_FRAME_DATA_REGIONS_OF_INTEREST.
>
> anyway, I'm ok to change to x,y,w,h.
>
> btw, do we need to change to x,y,w,h for RegionOfInterest? Is such
> change allowed after current major version change?
>
>> >> > +
>> >> > +typedef struct AVBoundingBoxHeader {
>> >> > +    /**
>> >> > +     * Information about how the bounding box is generated.
>> >> > +     * for example, the DNN model name.
>> >> > +     */
>> >> > +    char source[128];
>> >> > +
>> >> > +    /**
>> >> > +     * The size of frame when it is detected.
>> >> > +     */
>> >> > +    int frame_width;
>> >> > +    int frame_height;
>> >> >
>> >>
>> >> Why? This side data is attached to AVFrames only, where we
>> >> already have width and height.
>> >>
>> >
>> > The detection result will be used by other filters, for example,
>> > dnn_classify (see https://github.com/guoyejun/ffmpeg/tree/classify).
>> >
>> > The filter dnn_detect detects all the objects (cat, dog, person ...) in a
>> > frame, while dnn_classify classifies one detected object (for example, person)
>> > for its attribute (for example, emotion, etc.)
>> >
>> > The filter dnn_classify have to check if the frame size is changed after
>> > it is detected, to handle the below filter chain:
>> > dnn_detect -> scale -> dnn_classify
>> >
>>
>> This doesn't look good. Why is dnn_classify needing to know
>> the original frame size at all?
>>
>
> For example, the original size of the frame is 100*100, and dnn_detect
> detects a face at place (10, 10) -> (30, 40), such data will be saved in
> AVBoundingBox.top/left/right/bottom.
>
> Then, the frame is scaled into 50*50.
>
> Then, dnn_classify is used to analyze the emotion of the face, it needs to
> know the frame size (100*100) when it is detected, otherwise, it does not
> work with just (10,10), (30,40) and 50*50.
>

Why can't the scale filter also rescale this side data as well?


>> >> > diff --git a/libavutil/frame.h b/libavutil/frame.h
>> >> > index a5ed91b20a..41e22de02a 100644
>> >> > --- a/libavutil/frame.h
>> >> > +++ b/libavutil/frame.h
>> >> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
>> >> >  * Must be present for every frame which should have film grain applied.
>> >> >  */
>> >> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
>> >> > +
>> >> > +    /**
>> >> > +     * Bounding boxes for object detection and classification, the data is
>> a
>> >> AVBoundingBoxHeader
>> >> > +     * followed with an array of AVBoudingBox, and
>> >> AVBoundingBoxHeader.nb_bboxes is the number
>> >> > +     * of array element.
>> >> > +     */
>> >> > +    AV_FRAME_DATA_BOUNDING_BOXES,
>> >> >  };
>> >> >
>> >>
>> >> Finally, why call it a Bounding Box? It's not descriptive at all.
>> >> How about "Object Classification"? It makes much more sense, it's
>> >> exactly what this is. So AVObjectClassification, AVObjectClassification,
>> >> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
>> >>
>> >
>> > In object detection papers, bounding box is usually used.
>> > We'd better use the same term, imho, thanks.
>> >
>>
>> Not in this case, API users won't have any idea what this is or what
>> it's for. This is user-facing code after all.
>> Papers in fields can get away with overloading language, but we're
>> trying to make a concise API. Object classification makes sense
>> because this is exactly what this is.
>>
>
> The term bounding box is dominating the domain, for example, even
> HEVC spec uses this term, see page 317 of
> https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-H.265-201911-I!!PDF-E&type=items
>
> also copy some here for your convenient.
> ar_bounding_box_top[ ar_object_idx[ i ] ] u(16)
> ar_bounding_box_left[ ar_object_idx[ i ] ] u(16)
> ar_bounding_box_width[ ar_object_idx[ i ] ] u(16)
> ar_bounding_box_height[ ar_object_idx[ i ] ] u(16)
>
> I would prefer to use bounding box.
>

It's for an entirely different thing, and like I said, it's just an overloaded
language because they can get away. We're trying to be generic.
This side data is for detecting _and_ classifying objects. In fact, most of
the structure is dedicated towards classifying. If you'd like users to actually
use this, give it a good name and don't leave them guessing what this
structure is by throwing vague jargon some other paper or spec has
because it's close enough.
Guo, Yejun April 8, 2021, 2:51 p.m. UTC | #11
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Lynne
> Sent: 2021年4月8日 19:35
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> Apr 8, 2021, 07:35 by yejun.guo@intel.com:
> 
> >
> >
> >> -----Original Message-----
> >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Lynne
> >> Sent: 2021年4月8日 12:14
> >> To: FFmpeg development discussions and patches
> <ffmpeg-devel@ffmpeg.org>
> >> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> >> AV_FRAME_DATA_BOUNDING_BOXES
> >>
> >> Apr 8, 2021, 04:48 by yejun.guo@intel.com:
> >>
> >> >> > +
> >> >> > +#ifndef AVUTIL_BOUNDINGBOX_H
> >> >> > +#define AVUTIL_BOUNDINGBOX_H
> >> >> > +
> >> >> > +#include "rational.h"
> >> >> > +#include "avassert.h"
> >> >> > +#include "frame.h"
> >> >> > +
> >> >> > +typedef struct AVBoundingBox {
> >> >> > +    /**
> >> >> > +     * Distance in pixels from the top edge of the frame to top
> >> >> > +     * and bottom, and from the left edge of the frame to left and
> >> >> > +     * right, defining the bounding box.
> >> >> > +     */
> >> >> > +    int top;
> >> >> > +    int left;
> >> >> > +    int bottom;
> >> >> > +    int right;
> >> >> >
> >> >>
> >> >> Why not x, y, w, h?
> >> >> It makes more sense, all of coordinates go like this.
> >> >>
> >> >
> >> > We want to keep it consistent with 'struct AVRegionOfInterest',
> >> > we also have a plan to add a filter which converts from bounding box
> >> > to RegionOfInterest which impacts the encoder.
> >> >
> >>
> >> Not a good idea. Region of interest is only useful to indicate a
> >> single region, while this API describes multiple regions
> >> within the frame. The video_enc_params API follows the
> >> x, y, w, h because it's the easiest to work with, so I'm sure
> >> it's well worth going to such coordinates.
> >>
> >
> > RegionOfInterest is similar with boundingbox, it is used for multiple
> > regions in an array, see AV_FRAME_DATA_REGIONS_OF_INTEREST.
> >
> > anyway, I'm ok to change to x,y,w,h.
> >
> > btw, do we need to change to x,y,w,h for RegionOfInterest? Is such
> > change allowed after current major version change?

any comment on this about RegionOfInterest? thanks.

> >
> >> >> > +
> >> >> > +typedef struct AVBoundingBoxHeader {
> >> >> > +    /**
> >> >> > +     * Information about how the bounding box is generated.
> >> >> > +     * for example, the DNN model name.
> >> >> > +     */
> >> >> > +    char source[128];
> >> >> > +
> >> >> > +    /**
> >> >> > +     * The size of frame when it is detected.
> >> >> > +     */
> >> >> > +    int frame_width;
> >> >> > +    int frame_height;
> >> >> >
> >> >>
> >> >> Why? This side data is attached to AVFrames only, where we
> >> >> already have width and height.
> >> >>
> >> >
> >> > The detection result will be used by other filters, for example,
> >> > dnn_classify (see https://github.com/guoyejun/ffmpeg/tree/classify).
> >> >
> >> > The filter dnn_detect detects all the objects (cat, dog, person ...) in a
> >> > frame, while dnn_classify classifies one detected object (for example,
> person)
> >> > for its attribute (for example, emotion, etc.)
> >> >
> >> > The filter dnn_classify have to check if the frame size is changed after
> >> > it is detected, to handle the below filter chain:
> >> > dnn_detect -> scale -> dnn_classify
> >> >
> >>
> >> This doesn't look good. Why is dnn_classify needing to know
> >> the original frame size at all?
> >>
> >
> > For example, the original size of the frame is 100*100, and dnn_detect
> > detects a face at place (10, 10) -> (30, 40), such data will be saved in
> > AVBoundingBox.top/left/right/bottom.
> >
> > Then, the frame is scaled into 50*50.
> >
> > Then, dnn_classify is used to analyze the emotion of the face, it needs to
> > know the frame size (100*100) when it is detected, otherwise, it does not
> > work with just (10,10), (30,40) and 50*50.
> >
> 
> Why can't the scale filter also rescale this side data as well?

I'm afraid that we could not make sure all such filters (including filters in the
future) to do the rescale. And in the previous discussion, I got to know that
'many other existing side-data types are invalidated by scaling'.

So, we need frame_width and frame_height here.

> 
> 
> >> >> > diff --git a/libavutil/frame.h b/libavutil/frame.h
> >> >> > index a5ed91b20a..41e22de02a 100644
> >> >> > --- a/libavutil/frame.h
> >> >> > +++ b/libavutil/frame.h
> >> >> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
> >> >> >  * Must be present for every frame which should have film grain
> applied.
> >> >> >  */
> >> >> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
> >> >> > +
> >> >> > +    /**
> >> >> > +     * Bounding boxes for object detection and classification, the
> data is
> >> a
> >> >> AVBoundingBoxHeader
> >> >> > +     * followed with an array of AVBoudingBox, and
> >> >> AVBoundingBoxHeader.nb_bboxes is the number
> >> >> > +     * of array element.
> >> >> > +     */
> >> >> > +    AV_FRAME_DATA_BOUNDING_BOXES,
> >> >> >  };
> >> >> >
> >> >>
> >> >> Finally, why call it a Bounding Box? It's not descriptive at all.
> >> >> How about "Object Classification"? It makes much more sense, it's
> >> >> exactly what this is. So AVObjectClassification, AVObjectClassification,
> >> >> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
> >> >>
> >> >
> >> > In object detection papers, bounding box is usually used.
> >> > We'd better use the same term, imho, thanks.
> >> >
> >>
> >> Not in this case, API users won't have any idea what this is or what
> >> it's for. This is user-facing code after all.
> >> Papers in fields can get away with overloading language, but we're
> >> trying to make a concise API. Object classification makes sense
> >> because this is exactly what this is.
> >>
> >
> > The term bounding box is dominating the domain, for example, even
> > HEVC spec uses this term, see page 317 of
> >
> https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-H.265-201911-I!!P
> DF-E&type=items
> >
> > also copy some here for your convenient.
> > ar_bounding_box_top[ ar_object_idx[ i ] ] u(16)
> > ar_bounding_box_left[ ar_object_idx[ i ] ] u(16)
> > ar_bounding_box_width[ ar_object_idx[ i ] ] u(16)
> > ar_bounding_box_height[ ar_object_idx[ i ] ] u(16)
> >
> > I would prefer to use bounding box.
> >
> 
> It's for an entirely different thing, and like I said, it's just an overloaded
> language because they can get away. We're trying to be generic.
> This side data is for detecting _and_ classifying objects. In fact, most of
> the structure is dedicated towards classifying. If you'd like users to actually
> use this, give it a good name and don't leave them guessing what this
> structure is by throwing vague jargon some other paper or spec has
> because it's close enough.

all the people in the domain accepts bounding box, they can understand this
struct name easily and clearly, they might be bothered if we use another name.

btw, AVObjectClassification confuses people who just want object detection.
Nicolas George April 8, 2021, 2:53 p.m. UTC | #12
Guo, Yejun (12021-04-08):
> I'm afraid that we could not make sure all such filters (including filters in the
> future) to do the rescale. And in the previous discussion, I got to know that
> 'many other existing side-data types are invalidated by scaling'.
> 
> So, we need frame_width and frame_height here.

But the crop or pad filters will also invalidate the information, and in
a different way. How should frame_width and frame_height be used to
solve all?

Regards,
Guo, Yejun April 8, 2021, 2:59 p.m. UTC | #13
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Nicolas
> George
> Sent: 2021年4月8日 22:54
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> Guo, Yejun (12021-04-08):
> > I'm afraid that we could not make sure all such filters (including filters in the
> > future) to do the rescale. And in the previous discussion, I got to know that
> > 'many other existing side-data types are invalidated by scaling'.
> >
> > So, we need frame_width and frame_height here.
> 
> But the crop or pad filters will also invalidate the information, and in
> a different way. How should frame_width and frame_height be used to
> solve all?

thanks for the new info, my idea was to rescale the bounding box for the frame
size change. Looks that we can just output a warning message and don't do the
classification if the size changes, otherwise, the filter will give wrong result.

> 
> Regards,
> 
> --
>   Nicolas George
Lynne April 8, 2021, 4:57 p.m. UTC | #14
Apr 8, 2021, 16:51 by yejun.guo@intel.com:

>
>
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Lynne
>> Sent: 2021年4月8日 19:35
>> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
>> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
>> AV_FRAME_DATA_BOUNDING_BOXES
>>
>> Apr 8, 2021, 07:35 by yejun.guo@intel.com:
>>
>> >
>> >
>> >> -----Original Message-----
>> >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>> Lynne
>> >> Sent: 2021年4月8日 12:14
>> >> To: FFmpeg development discussions and patches
>> <ffmpeg-devel@ffmpeg.org>
>> >> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
>> >> AV_FRAME_DATA_BOUNDING_BOXES
>> >>
>> >> Apr 8, 2021, 04:48 by yejun.guo@intel.com:
>> >>
>> >> >> > +
>> >> >> > +#ifndef AVUTIL_BOUNDINGBOX_H
>> >> >> > +#define AVUTIL_BOUNDINGBOX_H
>> >> >> > +
>> >> >> > +#include "rational.h"
>> >> >> > +#include "avassert.h"
>> >> >> > +#include "frame.h"
>> >> >> > +
>> >> >> > +typedef struct AVBoundingBox {
>> >> >> > +    /**
>> >> >> > +     * Distance in pixels from the top edge of the frame to top
>> >> >> > +     * and bottom, and from the left edge of the frame to left and
>> >> >> > +     * right, defining the bounding box.
>> >> >> > +     */
>> >> >> > +    int top;
>> >> >> > +    int left;
>> >> >> > +    int bottom;
>> >> >> > +    int right;
>> >> >> >
>> >> >>
>> >> >> Why not x, y, w, h?
>> >> >> It makes more sense, all of coordinates go like this.
>> >> >>
>> >> >
>> >> > We want to keep it consistent with 'struct AVRegionOfInterest',
>> >> > we also have a plan to add a filter which converts from bounding box
>> >> > to RegionOfInterest which impacts the encoder.
>> >> >
>> >>
>> >> Not a good idea. Region of interest is only useful to indicate a
>> >> single region, while this API describes multiple regions
>> >> within the frame. The video_enc_params API follows the
>> >> x, y, w, h because it's the easiest to work with, so I'm sure
>> >> it's well worth going to such coordinates.
>> >>
>> >
>> > RegionOfInterest is similar with boundingbox, it is used for multiple
>> > regions in an array, see AV_FRAME_DATA_REGIONS_OF_INTEREST.
>> >
>> > anyway, I'm ok to change to x,y,w,h.
>> >
>> > btw, do we need to change to x,y,w,h for RegionOfInterest? Is such
>> > change allowed after current major version change?
>>
>
> any comment on this about RegionOfInterest? thanks.
>
>> >
>> >> >> > +
>> >> >> > +typedef struct AVBoundingBoxHeader {
>> >> >> > +    /**
>> >> >> > +     * Information about how the bounding box is generated.
>> >> >> > +     * for example, the DNN model name.
>> >> >> > +     */
>> >> >> > +    char source[128];
>> >> >> > +
>> >> >> > +    /**
>> >> >> > +     * The size of frame when it is detected.
>> >> >> > +     */
>> >> >> > +    int frame_width;
>> >> >> > +    int frame_height;
>> >> >> >
>> >> >>
>> >> >> Why? This side data is attached to AVFrames only, where we
>> >> >> already have width and height.
>> >> >>
>> >> >
>> >> > The detection result will be used by other filters, for example,
>> >> > dnn_classify (see https://github.com/guoyejun/ffmpeg/tree/classify).
>> >> >
>> >> > The filter dnn_detect detects all the objects (cat, dog, person ...) in a
>> >> > frame, while dnn_classify classifies one detected object (for example,
>> person)
>> >> > for its attribute (for example, emotion, etc.)
>> >> >
>> >> > The filter dnn_classify have to check if the frame size is changed after
>> >> > it is detected, to handle the below filter chain:
>> >> > dnn_detect -> scale -> dnn_classify
>> >> >
>> >>
>> >> This doesn't look good. Why is dnn_classify needing to know
>> >> the original frame size at all?
>> >>
>> >
>> > For example, the original size of the frame is 100*100, and dnn_detect
>> > detects a face at place (10, 10) -> (30, 40), such data will be saved in
>> > AVBoundingBox.top/left/right/bottom.
>> >
>> > Then, the frame is scaled into 50*50.
>> >
>> > Then, dnn_classify is used to analyze the emotion of the face, it needs to
>> > know the frame size (100*100) when it is detected, otherwise, it does not
>> > work with just (10,10), (30,40) and 50*50.
>> >
>>
>> Why can't the scale filter also rescale this side data as well?
>>
>
> I'm afraid that we could not make sure all such filters (including filters in the
> future) to do the rescale. And in the previous discussion, I got to know that
> 'many other existing side-data types are invalidated by scaling'.
>
> So, we need frame_width and frame_height here.
>

No, you don't. You just need to make sure filters which change resolution
or do cropping also change the side data parameters.
It's called maintainership. As-is, this won't even work with cropping,
only with basic aspect ratio preserving scaling.
For the lack of a better term, this is a hack.

I would accept just specifying that if the frame dimensions are
altered in any way, the side-data is no longer valid and it's up
to users to figure that out by out of bound coordinates.
This is what we currently do with video_enc_params.


>> >> >> > diff --git a/libavutil/frame.h b/libavutil/frame.h
>> >> >> > index a5ed91b20a..41e22de02a 100644
>> >> >> > --- a/libavutil/frame.h
>> >> >> > +++ b/libavutil/frame.h
>> >> >> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
>> >> >> >  * Must be present for every frame which should have film grain
>> applied.
>> >> >> >  */
>> >> >> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
>> >> >> > +
>> >> >> > +    /**
>> >> >> > +     * Bounding boxes for object detection and classification, the
>> data is
>> >> a
>> >> >> AVBoundingBoxHeader
>> >> >> > +     * followed with an array of AVBoudingBox, and
>> >> >> AVBoundingBoxHeader.nb_bboxes is the number
>> >> >> > +     * of array element.
>> >> >> > +     */
>> >> >> > +    AV_FRAME_DATA_BOUNDING_BOXES,
>> >> >> >  };
>> >> >> >
>> >> >>
>> >> >> Finally, why call it a Bounding Box? It's not descriptive at all.
>> >> >> How about "Object Classification"? It makes much more sense, it's
>> >> >> exactly what this is. So AVObjectClassification, AVObjectClassification,
>> >> >> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
>> >> >>
>> >> >
>> >> > In object detection papers, bounding box is usually used.
>> >> > We'd better use the same term, imho, thanks.
>> >> >
>> >>
>> >> Not in this case, API users won't have any idea what this is or what
>> >> it's for. This is user-facing code after all.
>> >> Papers in fields can get away with overloading language, but we're
>> >> trying to make a concise API. Object classification makes sense
>> >> because this is exactly what this is.
>> >>
>> >
>> > The term bounding box is dominating the domain, for example, even
>> > HEVC spec uses this term, see page 317 of
>> >
>> https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-H.265-201911-I!!P
>> DF-E&type=items
>> >
>> > also copy some here for your convenient.
>> > ar_bounding_box_top[ ar_object_idx[ i ] ] u(16)
>> > ar_bounding_box_left[ ar_object_idx[ i ] ] u(16)
>> > ar_bounding_box_width[ ar_object_idx[ i ] ] u(16)
>> > ar_bounding_box_height[ ar_object_idx[ i ] ] u(16)
>> >
>> > I would prefer to use bounding box.
>> >
>>
>> It's for an entirely different thing, and like I said, it's just an overloaded
>> language because they can get away. We're trying to be generic.
>> This side data is for detecting _and_ classifying objects. In fact, most of
>> the structure is dedicated towards classifying. If you'd like users to actually
>> use this, give it a good name and don't leave them guessing what this
>> structure is by throwing vague jargon some other paper or spec has
>> because it's close enough.
>>
>
> all the people in the domain accepts bounding box, they can understand this
> struct name easily and clearly, they might be bothered if we use another name.
>
> btw, AVObjectClassification confuses people who just want object detection.
>

As I said, the name "bounding box" makes no sense once it gets overloaded
with object classification. Object classification is still the main use of the filters,
because the original proposal was to have all of this info be ffmpeg-private,
which would forbid simple object detection.
So I still maintain this should be called "Object classification". I'd accept
"Object detection" as well, but definitely not "bounding box".

Since the decision was made to make the side data public, we have to make
very sure it contains no hacks or is impossible to extend, since we don't want
to have an "AV_SIDE_DATA_OBJECT_CLASSIFICATION_VERSION_2_SORRY_WE_SCREWED_UP"
faster than you can say "Recursive cascade correlation artificial neural networks".
Guo, Yejun April 9, 2021, 4:12 a.m. UTC | #15
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Lynne
> Sent: 2021年4月9日 0:57
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 

First of all, thanks for the quick replies, I see, all the discussions/comments are to
make this patch better, thank you.

> >> >
> >> >> >> > +
> >> >> >> > +typedef struct AVBoundingBoxHeader {
> >> >> >> > +    /**
> >> >> >> > +     * Information about how the bounding box is generated.
> >> >> >> > +     * for example, the DNN model name.
> >> >> >> > +     */
> >> >> >> > +    char source[128];
> >> >> >> > +
> >> >> >> > +    /**
> >> >> >> > +     * The size of frame when it is detected.
> >> >> >> > +     */
> >> >> >> > +    int frame_width;
> >> >> >> > +    int frame_height;
> >> >> >> >
> >> >> >>
> >> >> >> Why? This side data is attached to AVFrames only, where we
> >> >> >> already have width and height.
> >> >> >>
> >> >> >
> >> >> > The detection result will be used by other filters, for example,
> >> >> > dnn_classify (see https://github.com/guoyejun/ffmpeg/tree/classify).
> >> >> >
> >> >> > The filter dnn_detect detects all the objects (cat, dog, person ...) in a
> >> >> > frame, while dnn_classify classifies one detected object (for example,
> >> person)
> >> >> > for its attribute (for example, emotion, etc.)
> >> >> >
> >> >> > The filter dnn_classify have to check if the frame size is changed after
> >> >> > it is detected, to handle the below filter chain:
> >> >> > dnn_detect -> scale -> dnn_classify
> >> >> >
> >> >>
> >> >> This doesn't look good. Why is dnn_classify needing to know
> >> >> the original frame size at all?
> >> >>
> >> >
> >> > For example, the original size of the frame is 100*100, and dnn_detect
> >> > detects a face at place (10, 10) -> (30, 40), such data will be saved in
> >> > AVBoundingBox.top/left/right/bottom.
> >> >
> >> > Then, the frame is scaled into 50*50.
> >> >
> >> > Then, dnn_classify is used to analyze the emotion of the face, it needs to
> >> > know the frame size (100*100) when it is detected, otherwise, it does not
> >> > work with just (10,10), (30,40) and 50*50.
> >> >
> >>
> >> Why can't the scale filter also rescale this side data as well?
> >>
> >
> > I'm afraid that we could not make sure all such filters (including filters in the
> > future) to do the rescale. And in the previous discussion, I got to know that
> > 'many other existing side-data types are invalidated by scaling'.
> >
> > So, we need frame_width and frame_height here.
> >
> 
> No, you don't. You just need to make sure filters which change resolution
> or do cropping also change the side data parameters.
> It's called maintainership. As-is, this won't even work with cropping,
> only with basic aspect ratio preserving scaling.
> For the lack of a better term, this is a hack.

As discussed in previous email, for the frame size change case, dnn_classify
(and other filters which use the detection result, for example drawbox) can
just output a warning message to tell user what happens, and don't do the
classification, otherwise, it will give a wrong/weird result which makes the
user confused.

> 
> I would accept just specifying that if the frame dimensions are
> altered in any way, the side-data is no longer valid and it's up
> to users to figure that out by out of bound coordinates.
> This is what we currently do with video_enc_params.

frame_width/frame_height is not perfect (for the cases such as: scale down
+ crop + scale up to the same size), but it provides more info than the checking
of 'out of bound coordinates'. There are many other possible issues when the
coordinates are within the frame.

If we think we'd better not let user get more info from the warning message,
I'm ok to remove them.

I'll remove them if there's another comment supporting the removal, and
there's no objection.

> 
> 
> >> >> >> > diff --git a/libavutil/frame.h b/libavutil/frame.h
> >> >> >> > index a5ed91b20a..41e22de02a 100644
> >> >> >> > --- a/libavutil/frame.h
> >> >> >> > +++ b/libavutil/frame.h
> >> >> >> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
> >> >> >> >  * Must be present for every frame which should have film grain
> >> applied.
> >> >> >> >  */
> >> >> >> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
> >> >> >> > +
> >> >> >> > +    /**
> >> >> >> > +     * Bounding boxes for object detection and classification, the
> >> data is
> >> >> a
> >> >> >> AVBoundingBoxHeader
> >> >> >> > +     * followed with an array of AVBoudingBox, and
> >> >> >> AVBoundingBoxHeader.nb_bboxes is the number
> >> >> >> > +     * of array element.
> >> >> >> > +     */
> >> >> >> > +    AV_FRAME_DATA_BOUNDING_BOXES,
> >> >> >> >  };
> >> >> >> >
> >> >> >>
> >> >> >> Finally, why call it a Bounding Box? It's not descriptive at all.
> >> >> >> How about "Object Classification"? It makes much more sense, it's
> >> >> >> exactly what this is. So AVObjectClassification, AVObjectClassification,
> >> >> >> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
> >> >> >>
> >> >> >
> >> >> > In object detection papers, bounding box is usually used.
> >> >> > We'd better use the same term, imho, thanks.
> >> >> >
> >> >>
> >> >> Not in this case, API users won't have any idea what this is or what
> >> >> it's for. This is user-facing code after all.
> >> >> Papers in fields can get away with overloading language, but we're
> >> >> trying to make a concise API. Object classification makes sense
> >> >> because this is exactly what this is.
> >> >>
> >> >
> >> > The term bounding box is dominating the domain, for example, even
> >> > HEVC spec uses this term, see page 317 of
> >> >
> >>
> https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-H.265-201911-I!!P
> >> DF-E&type=items
> >> >
> >> > also copy some here for your convenient.
> >> > ar_bounding_box_top[ ar_object_idx[ i ] ] u(16)
> >> > ar_bounding_box_left[ ar_object_idx[ i ] ] u(16)
> >> > ar_bounding_box_width[ ar_object_idx[ i ] ] u(16)
> >> > ar_bounding_box_height[ ar_object_idx[ i ] ] u(16)
> >> >
> >> > I would prefer to use bounding box.
> >> >
> >>
> >> It's for an entirely different thing, and like I said, it's just an overloaded
> >> language because they can get away. We're trying to be generic.
> >> This side data is for detecting _and_ classifying objects. In fact, most of
> >> the structure is dedicated towards classifying. If you'd like users to actually
> >> use this, give it a good name and don't leave them guessing what this
> >> structure is by throwing vague jargon some other paper or spec has
> >> because it's close enough.
> >>
> >
> > all the people in the domain accepts bounding box, they can understand this
> > struct name easily and clearly, they might be bothered if we use another
> name.
> >
> > btw, AVObjectClassification confuses people who just want object detection.
> >
> 
> As I said, the name "bounding box" makes no sense once it gets overloaded
> with object classification. 

dnn_detect creates an array of 'bounding box' for all detected objects, and
dnn_classify assigns attributes for a set of bounding boxes (with same object
id). 'bounding box' serves both detection and classification properly.


> Object classification is still the main use of the filters,
> because the original proposal was to have all of this info be ffmpeg-private,
> which would forbid simple object detection.

The original proposal is to add it as side data which is ffmpeg-public, and then,
we spent much time discussing/trying with ffmpeg-private as an temporary
method, and since it is not good to be temporary, we now switch back to
ffmpeg-public.

During the whole period, we don't have any intention to 
'forbid simple object detection', not quite understand your point here.


> So I still maintain this should be called "Object classification". I'd accept
> "Object detection" as well, but definitely not "bounding box".

imho, ' Object detection' and ' Object classification' are worse, they just
describe one aspect of the struct. The users might just use filter dnn_detect,
they might use filters dnn_detect + dnn_classify.


> 
> Since the decision was made to make the side data public, we have to make
> very sure it contains no hacks or is impossible to extend, since we don't want
> to have an
> "AV_SIDE_DATA_OBJECT_CLASSIFICATION_VERSION_2_SORRY_WE_SCREWED_
> UP"
> faster than you can say "Recursive cascade correlation artificial neural
> networks".

sorry, not quite understand your point here.

'bounding box' is designed for general purpose to contain the info for
detection/classification. It doesn't matter which DNN model is used, it doesn't
matter if a traditional algorithm (non-dnn) is used.

I'm open to use a better name. And bounding box is the best one for me till now.
Everyone in the domain knows the exact meaning of bounding box without
extra explanation. This word has been extended/evolved with such meaning in
this domain.
Lynne April 9, 2021, 10:03 a.m. UTC | #16
Apr 9, 2021, 06:12 by yejun.guo@intel.com:

>
>
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Lynne
>> Sent: 2021年4月9日 0:57
>> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
>> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
>> AV_FRAME_DATA_BOUNDING_BOXES
>>
>
> First of all, thanks for the quick replies, I see, all the discussions/comments are to
> make this patch better, thank you.
>
>> >> >
>> >> >> >> > +
>> >> >> >> > +typedef struct AVBoundingBoxHeader {
>> >> >> >> > +    /**
>> >> >> >> > +     * Information about how the bounding box is generated.
>> >> >> >> > +     * for example, the DNN model name.
>> >> >> >> > +     */
>> >> >> >> > +    char source[128];
>> >> >> >> > +
>> >> >> >> > +    /**
>> >> >> >> > +     * The size of frame when it is detected.
>> >> >> >> > +     */
>> >> >> >> > +    int frame_width;
>> >> >> >> > +    int frame_height;
>> >> >> >> >
>> >> >> >>
>> >> >> >> Why? This side data is attached to AVFrames only, where we
>> >> >> >> already have width and height.
>> >> >> >>
>> >> >> >
>> >> >> > The detection result will be used by other filters, for example,
>> >> >> > dnn_classify (see https://github.com/guoyejun/ffmpeg/tree/classify).
>> >> >> >
>> >> >> > The filter dnn_detect detects all the objects (cat, dog, person ...) in a
>> >> >> > frame, while dnn_classify classifies one detected object (for example,
>> >> person)
>> >> >> > for its attribute (for example, emotion, etc.)
>> >> >> >
>> >> >> > The filter dnn_classify have to check if the frame size is changed after
>> >> >> > it is detected, to handle the below filter chain:
>> >> >> > dnn_detect -> scale -> dnn_classify
>> >> >> >
>> >> >>
>> >> >> This doesn't look good. Why is dnn_classify needing to know
>> >> >> the original frame size at all?
>> >> >>
>> >> >
>> >> > For example, the original size of the frame is 100*100, and dnn_detect
>> >> > detects a face at place (10, 10) -> (30, 40), such data will be saved in
>> >> > AVBoundingBox.top/left/right/bottom.
>> >> >
>> >> > Then, the frame is scaled into 50*50.
>> >> >
>> >> > Then, dnn_classify is used to analyze the emotion of the face, it needs to
>> >> > know the frame size (100*100) when it is detected, otherwise, it does not
>> >> > work with just (10,10), (30,40) and 50*50.
>> >> >
>> >>
>> >> Why can't the scale filter also rescale this side data as well?
>> >>
>> >
>> > I'm afraid that we could not make sure all such filters (including filters in the
>> > future) to do the rescale. And in the previous discussion, I got to know that
>> > 'many other existing side-data types are invalidated by scaling'.
>> >
>> > So, we need frame_width and frame_height here.
>> >
>>
>> No, you don't. You just need to make sure filters which change resolution
>> or do cropping also change the side data parameters.
>> It's called maintainership. As-is, this won't even work with cropping,
>> only with basic aspect ratio preserving scaling.
>> For the lack of a better term, this is a hack.
>>
>
> As discussed in previous email, for the frame size change case, dnn_classify
> (and other filters which use the detection result, for example drawbox) can
> just output a warning message to tell user what happens, and don't do the
> classification, otherwise, it will give a wrong/weird result which makes the
> user confused.
>
>>
>> I would accept just specifying that if the frame dimensions are
>> altered in any way, the side-data is no longer valid and it's up
>> to users to figure that out by out of bound coordinates.
>> This is what we currently do with video_enc_params.
>>
>
> frame_width/frame_height is not perfect (for the cases such as: scale down
> + crop + scale up to the same size), but it provides more info than the checking
> of 'out of bound coordinates'. There are many other possible issues when the
> coordinates are within the frame.
>
> If we think we'd better not let user get more info from the warning message,
> I'm ok to remove them.
>
> I'll remove them if there's another comment supporting the removal, and
> there's no objection.
>

We definitely shouldn't include variables in public API structs
that only serve to print a warning if they don't match.
Especially one that's fragile and flawed like this one.
Users should know that scaling or altering a frame could break
this side data, and filters could detect obvious out of bounds
results and report them.

In the meantime, the main scaling and cropping filters could
receive separate patches to preserve metadata at a later data.
This is how the avframe cropping field work - they're just metadata,
and cropping/scaling filters update those.


>> >> >> >> > diff --git a/libavutil/frame.h b/libavutil/frame.h
>> >> >> >> > index a5ed91b20a..41e22de02a 100644
>> >> >> >> > --- a/libavutil/frame.h
>> >> >> >> > +++ b/libavutil/frame.h
>> >> >> >> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
>> >> >> >> >  * Must be present for every frame which should have film grain
>> >> applied.
>> >> >> >> >  */
>> >> >> >> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
>> >> >> >> > +
>> >> >> >> > +    /**
>> >> >> >> > +     * Bounding boxes for object detection and classification, the
>> >> data is
>> >> >> a
>> >> >> >> AVBoundingBoxHeader
>> >> >> >> > +     * followed with an array of AVBoudingBox, and
>> >> >> >> AVBoundingBoxHeader.nb_bboxes is the number
>> >> >> >> > +     * of array element.
>> >> >> >> > +     */
>> >> >> >> > +    AV_FRAME_DATA_BOUNDING_BOXES,
>> >> >> >> >  };
>> >> >> >> >
>> >> >> >>
>> >> >> >> Finally, why call it a Bounding Box? It's not descriptive at all.
>> >> >> >> How about "Object Classification"? It makes much more sense, it's
>> >> >> >> exactly what this is. So AVObjectClassification, AVObjectClassification,
>> >> >> >> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
>> >> >> >>
>> >> >> >
>> >> >> > In object detection papers, bounding box is usually used.
>> >> >> > We'd better use the same term, imho, thanks.
>> >> >> >
>> >> >>
>> >> >> Not in this case, API users won't have any idea what this is or what
>> >> >> it's for. This is user-facing code after all.
>> >> >> Papers in fields can get away with overloading language, but we're
>> >> >> trying to make a concise API. Object classification makes sense
>> >> >> because this is exactly what this is.
>> >> >>
>> >> >
>> >> > The term bounding box is dominating the domain, for example, even
>> >> > HEVC spec uses this term, see page 317 of
>> >> >
>> >>
>> https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-H.265-201911-I!!P
>> >> DF-E&type=items
>> >> >
>> >> > also copy some here for your convenient.
>> >> > ar_bounding_box_top[ ar_object_idx[ i ] ] u(16)
>> >> > ar_bounding_box_left[ ar_object_idx[ i ] ] u(16)
>> >> > ar_bounding_box_width[ ar_object_idx[ i ] ] u(16)
>> >> > ar_bounding_box_height[ ar_object_idx[ i ] ] u(16)
>> >> >
>> >> > I would prefer to use bounding box.
>> >> >
>> >>
>> >> It's for an entirely different thing, and like I said, it's just an overloaded
>> >> language because they can get away. We're trying to be generic.
>> >> This side data is for detecting _and_ classifying objects. In fact, most of
>> >> the structure is dedicated towards classifying. If you'd like users to actually
>> >> use this, give it a good name and don't leave them guessing what this
>> >> structure is by throwing vague jargon some other paper or spec has
>> >> because it's close enough.
>> >>
>> >
>> > all the people in the domain accepts bounding box, they can understand this
>> > struct name easily and clearly, they might be bothered if we use another
>> name.
>> >
>> > btw, AVObjectClassification confuses people who just want object detection.
>> >
>>
>> As I said, the name "bounding box" makes no sense once it gets overloaded
>> with object classification.
>>
>
> dnn_detect creates an array of 'bounding box' for all detected objects, and
> dnn_classify assigns attributes for a set of bounding boxes (with same object
> id). 'bounding box' serves both detection and classification properly.
>
>
>> Object classification is still the main use of the filters,
>> because the original proposal was to have all of this info be ffmpeg-private,
>> which would forbid simple object detection.
>>
>
> The original proposal is to add it as side data which is ffmpeg-public, and then,
> we spent much time discussing/trying with ffmpeg-private as an temporary
> method, and since it is not good to be temporary, we now switch back to
> ffmpeg-public.
>
> During the whole period, we don't have any intention to 
> 'forbid simple object detection', not quite understand your point here.
>
>
>> So I still maintain this should be called "Object classification". I'd accept
>> "Object detection" as well, but definitely not "bounding box".
>>
>
> imho, ' Object detection' and ' Object classification' are worse, they just
> describe one aspect of the struct. The users might just use filter dnn_detect,
> they might use filters dnn_detect + dnn_classify.
>

The whole reason why we have this side data is to both detect
_and_ classify. Keyword being _both_. Hence object detection
and object classification are much better names.
I am opposed to merging this without a name change.


>> Since the decision was made to make the side data public, we have to make
>> very sure it contains no hacks or is impossible to extend, since we don't want
>> to have an
>> "AV_SIDE_DATA_OBJECT_CLASSIFICATION_VERSION_2_SORRY_WE_SCREWED_
>> UP"
>> faster than you can say "Recursive cascade correlation artificial neural
>> networks".
>>
>
> sorry, not quite understand your point here.
>

The point of this paragraph was to highlight the need to have other
people look at the struct rather than just you. It's called peer review.
It wasn't anything about the name at all.
Guo, Yejun April 9, 2021, 12:57 p.m. UTC | #17
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Lynne
> Sent: 2021年4月9日 18:03
> To: FFmpeg development discussions and patches
> <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> Apr 9, 2021, 06:12 by yejun.guo@intel.com:
> 
> >
> >
> >> -----Original Message-----
> >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Lynne
> >> Sent: 2021年4月9日 0:57
> >> To: FFmpeg development discussions and patches
> <ffmpeg-devel@ffmpeg.org>
> >> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> >> AV_FRAME_DATA_BOUNDING_BOXES
> >>
> >
> > First of all, thanks for the quick replies, I see, all the
> discussions/comments are to
> > make this patch better, thank you.
> >
> >> >> >
> >> >> >> >> > +
> >> >> >> >> > +typedef struct AVBoundingBoxHeader {
> >> >> >> >> > +    /**
> >> >> >> >> > +     * Information about how the bounding box is
> generated.
> >> >> >> >> > +     * for example, the DNN model name.
> >> >> >> >> > +     */
> >> >> >> >> > +    char source[128];
> >> >> >> >> > +
> >> >> >> >> > +    /**
> >> >> >> >> > +     * The size of frame when it is detected.
> >> >> >> >> > +     */
> >> >> >> >> > +    int frame_width;
> >> >> >> >> > +    int frame_height;
> >> >> >> >> >
> >> >> >> >>
> >> >> >> >> Why? This side data is attached to AVFrames only, where we
> >> >> >> >> already have width and height.
> >> >> >> >>
> >> >> >> >
> >> >> >> > The detection result will be used by other filters, for example,
> >> >> >> > dnn_classify (see
> https://github.com/guoyejun/ffmpeg/tree/classify).
> >> >> >> >
> >> >> >> > The filter dnn_detect detects all the objects (cat, dog, person ...)
> in a
> >> >> >> > frame, while dnn_classify classifies one detected object (for
> example,
> >> >> person)
> >> >> >> > for its attribute (for example, emotion, etc.)
> >> >> >> >
> >> >> >> > The filter dnn_classify have to check if the frame size is changed
> after
> >> >> >> > it is detected, to handle the below filter chain:
> >> >> >> > dnn_detect -> scale -> dnn_classify
> >> >> >> >
> >> >> >>
> >> >> >> This doesn't look good. Why is dnn_classify needing to know
> >> >> >> the original frame size at all?
> >> >> >>
> >> >> >
> >> >> > For example, the original size of the frame is 100*100, and
> dnn_detect
> >> >> > detects a face at place (10, 10) -> (30, 40), such data will be saved in
> >> >> > AVBoundingBox.top/left/right/bottom.
> >> >> >
> >> >> > Then, the frame is scaled into 50*50.
> >> >> >
> >> >> > Then, dnn_classify is used to analyze the emotion of the face, it
> needs to
> >> >> > know the frame size (100*100) when it is detected, otherwise, it
> does not
> >> >> > work with just (10,10), (30,40) and 50*50.
> >> >> >
> >> >>
> >> >> Why can't the scale filter also rescale this side data as well?
> >> >>
> >> >
> >> > I'm afraid that we could not make sure all such filters (including filters
> in the
> >> > future) to do the rescale. And in the previous discussion, I got to
> know that
> >> > 'many other existing side-data types are invalidated by scaling'.
> >> >
> >> > So, we need frame_width and frame_height here.
> >> >
> >>
> >> No, you don't. You just need to make sure filters which change
> resolution
> >> or do cropping also change the side data parameters.
> >> It's called maintainership. As-is, this won't even work with cropping,
> >> only with basic aspect ratio preserving scaling.
> >> For the lack of a better term, this is a hack.
> >>
> >
> > As discussed in previous email, for the frame size change case,
> dnn_classify
> > (and other filters which use the detection result, for example drawbox)
> can
> > just output a warning message to tell user what happens, and don't do
> the
> > classification, otherwise, it will give a wrong/weird result which makes
> the
> > user confused.
> >
> >>
> >> I would accept just specifying that if the frame dimensions are
> >> altered in any way, the side-data is no longer valid and it's up
> >> to users to figure that out by out of bound coordinates.
> >> This is what we currently do with video_enc_params.
> >>
> >
> > frame_width/frame_height is not perfect (for the cases such as: scale
> down
> > + crop + scale up to the same size), but it provides more info than the
> checking
> > of 'out of bound coordinates'. There are many other possible issues
> when the
> > coordinates are within the frame.
> >
> > If we think we'd better not let user get more info from the warning
> message,
> > I'm ok to remove them.
> >
> > I'll remove them if there's another comment supporting the removal, and
> > there's no objection.
> >
> 
> We definitely shouldn't include variables in public API structs
> that only serve to print a warning if they don't match.

Not just 'print a warning', it also impacts the behavior of dnn_classify.

> Especially one that's fragile and flawed like this one.
> Users should know that scaling or altering a frame could break
> this side data, and filters could detect obvious out of bounds
> results and report them.

I'll remove them since it is user's responsibility.

> 
> In the meantime, the main scaling and cropping filters could
> receive separate patches to preserve metadata at a later data.
> This is how the avframe cropping field work - they're just metadata,
> and cropping/scaling filters update those.
> 
> 
> >> >> >> >> > diff --git a/libavutil/frame.h b/libavutil/frame.h
> >> >> >> >> > index a5ed91b20a..41e22de02a 100644
> >> >> >> >> > --- a/libavutil/frame.h
> >> >> >> >> > +++ b/libavutil/frame.h
> >> >> >> >> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
> >> >> >> >> >  * Must be present for every frame which should have film
> grain
> >> >> applied.
> >> >> >> >> >  */
> >> >> >> >> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
> >> >> >> >> > +
> >> >> >> >> > +    /**
> >> >> >> >> > +     * Bounding boxes for object detection and
> classification, the
> >> >> data is
> >> >> >> a
> >> >> >> >> AVBoundingBoxHeader
> >> >> >> >> > +     * followed with an array of AVBoudingBox, and
> >> >> >> >> AVBoundingBoxHeader.nb_bboxes is the number
> >> >> >> >> > +     * of array element.
> >> >> >> >> > +     */
> >> >> >> >> > +    AV_FRAME_DATA_BOUNDING_BOXES,
> >> >> >> >> >  };
> >> >> >> >> >
> >> >> >> >>
> >> >> >> >> Finally, why call it a Bounding Box? It's not descriptive at all.
> >> >> >> >> How about "Object Classification"? It makes much more sense,
> it's
> >> >> >> >> exactly what this is. So AVObjectClassification,
> AVObjectClassification,
> >> >> >> >> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
> >> >> >> >>
> >> >> >> >
> >> >> >> > In object detection papers, bounding box is usually used.
> >> >> >> > We'd better use the same term, imho, thanks.
> >> >> >> >
> >> >> >>
> >> >> >> Not in this case, API users won't have any idea what this is or
> what
> >> >> >> it's for. This is user-facing code after all.
> >> >> >> Papers in fields can get away with overloading language, but
> we're
> >> >> >> trying to make a concise API. Object classification makes sense
> >> >> >> because this is exactly what this is.
> >> >> >>
> >> >> >
> >> >> > The term bounding box is dominating the domain, for example,
> even
> >> >> > HEVC spec uses this term, see page 317 of
> >> >> >
> >> >>
> >>
> https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-H.265-201911-I
> !!P
> >> >> DF-E&type=items
> >> >> >
> >> >> > also copy some here for your convenient.
> >> >> > ar_bounding_box_top[ ar_object_idx[ i ] ] u(16)
> >> >> > ar_bounding_box_left[ ar_object_idx[ i ] ] u(16)
> >> >> > ar_bounding_box_width[ ar_object_idx[ i ] ] u(16)
> >> >> > ar_bounding_box_height[ ar_object_idx[ i ] ] u(16)
> >> >> >
> >> >> > I would prefer to use bounding box.
> >> >> >
> >> >>
> >> >> It's for an entirely different thing, and like I said, it's just an
> overloaded
> >> >> language because they can get away. We're trying to be generic.
> >> >> This side data is for detecting _and_ classifying objects. In fact, most
> of
> >> >> the structure is dedicated towards classifying. If you'd like users to
> actually
> >> >> use this, give it a good name and don't leave them guessing what
> this
> >> >> structure is by throwing vague jargon some other paper or spec has
> >> >> because it's close enough.
> >> >>
> >> >
> >> > all the people in the domain accepts bounding box, they can
> understand this
> >> > struct name easily and clearly, they might be bothered if we use
> another
> >> name.
> >> >
> >> > btw, AVObjectClassification confuses people who just want object
> detection.
> >> >
> >>
> >> As I said, the name "bounding box" makes no sense once it gets
> overloaded
> >> with object classification.
> >>
> >
> > dnn_detect creates an array of 'bounding box' for all detected objects,
> and
> > dnn_classify assigns attributes for a set of bounding boxes (with same
> object
> > id). 'bounding box' serves both detection and classification properly.
> >
> >
> >> Object classification is still the main use of the filters,
> >> because the original proposal was to have all of this info be
> ffmpeg-private,
> >> which would forbid simple object detection.
> >>
> >
> > The original proposal is to add it as side data which is ffmpeg-public, and
> then,
> > we spent much time discussing/trying with ffmpeg-private as an
> temporary
> > method, and since it is not good to be temporary, we now switch back to
> > ffmpeg-public.
> >
> > During the whole period, we don't have any intention to
> > 'forbid simple object detection', not quite understand your point here.
> >
> >
> >> So I still maintain this should be called "Object classification". I'd accept
> >> "Object detection" as well, but definitely not "bounding box".
> >>
> >
> > imho, ' Object detection' and ' Object classification' are worse, they just
> > describe one aspect of the struct. The users might just use filter
> dnn_detect,
> > they might use filters dnn_detect + dnn_classify.
> >
> 
> The whole reason why we have this side data is to both detect
> _and_ classify. Keyword being _both_. Hence object detection
> and object classification are much better names.
> I am opposed to merging this without a name change.

I understand it is not a good name with codec background, but it is a good
(possible best) name with object detection background.

To make things move forward, I'll change the name to 'object detection'
for both dnn_detect and dnn_classify, they would be:
AV_FRAME_DATA_OJBECT_DETECTION
AVObjectDetection
AVObjectDetectionHeader

> 
> 
> >> Since the decision was made to make the side data public, we have to
> make
> >> very sure it contains no hacks or is impossible to extend, since we don't
> want
> >> to have an
> >>
> "AV_SIDE_DATA_OBJECT_CLASSIFICATION_VERSION_2_SORRY_WE_SCREWE
> D_
> >> UP"
> >> faster than you can say "Recursive cascade correlation artificial neural
> >> networks".
> >>
> >
> > sorry, not quite understand your point here.
> >
> 
> The point of this paragraph was to highlight the need to have other
> people look at the struct rather than just you. It's called peer review.
> It wasn't anything about the name at all.

sure, comments are welcome. And the struct has got many comments and
improved a lot in the past two months. Thanks all again.


> > 'bounding box' is designed for general purpose to contain the info for
> > detection/classification. It doesn't matter which DNN model is used, it
> > doesn't
> > matter if a traditional algorithm (non-dnn) is used.
> >
> > I'm open to use a better name. And bounding box is the best one for me
> > till now.
> > Everyone in the domain knows the exact meaning of bounding box
> > without
> > extra explanation. This word has been extended/evolved with such
> > meaning in
> > this domain.
Pedro Arthur April 9, 2021, 2:35 p.m. UTC | #18
Em sex., 9 de abr. de 2021 às 01:13, Guo, Yejun <yejun.guo@intel.com> escreveu:
>
>
>
> > -----Original Message-----
> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Lynne
> > Sent: 2021年4月9日 0:57
> > To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> > Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> > AV_FRAME_DATA_BOUNDING_BOXES
> >
>
> First of all, thanks for the quick replies, I see, all the discussions/comments are to
> make this patch better, thank you.
>
> > >> >
> > >> >> >> > +
> > >> >> >> > +typedef struct AVBoundingBoxHeader {
> > >> >> >> > +    /**
> > >> >> >> > +     * Information about how the bounding box is generated.
> > >> >> >> > +     * for example, the DNN model name.
> > >> >> >> > +     */
> > >> >> >> > +    char source[128];
> > >> >> >> > +
> > >> >> >> > +    /**
> > >> >> >> > +     * The size of frame when it is detected.
> > >> >> >> > +     */
> > >> >> >> > +    int frame_width;
> > >> >> >> > +    int frame_height;
> > >> >> >> >
> > >> >> >>
> > >> >> >> Why? This side data is attached to AVFrames only, where we
> > >> >> >> already have width and height.
> > >> >> >>
> > >> >> >
> > >> >> > The detection result will be used by other filters, for example,
> > >> >> > dnn_classify (see https://github.com/guoyejun/ffmpeg/tree/classify).
> > >> >> >
> > >> >> > The filter dnn_detect detects all the objects (cat, dog, person ...) in a
> > >> >> > frame, while dnn_classify classifies one detected object (for example,
> > >> person)
> > >> >> > for its attribute (for example, emotion, etc.)
> > >> >> >
> > >> >> > The filter dnn_classify have to check if the frame size is changed after
> > >> >> > it is detected, to handle the below filter chain:
> > >> >> > dnn_detect -> scale -> dnn_classify
> > >> >> >
> > >> >>
> > >> >> This doesn't look good. Why is dnn_classify needing to know
> > >> >> the original frame size at all?
> > >> >>
> > >> >
> > >> > For example, the original size of the frame is 100*100, and dnn_detect
> > >> > detects a face at place (10, 10) -> (30, 40), such data will be saved in
> > >> > AVBoundingBox.top/left/right/bottom.
> > >> >
> > >> > Then, the frame is scaled into 50*50.
> > >> >
> > >> > Then, dnn_classify is used to analyze the emotion of the face, it needs to
> > >> > know the frame size (100*100) when it is detected, otherwise, it does not
> > >> > work with just (10,10), (30,40) and 50*50.
> > >> >
> > >>
> > >> Why can't the scale filter also rescale this side data as well?
> > >>
> > >
> > > I'm afraid that we could not make sure all such filters (including filters in the
> > > future) to do the rescale. And in the previous discussion, I got to know that
> > > 'many other existing side-data types are invalidated by scaling'.
> > >
> > > So, we need frame_width and frame_height here.
> > >
> >
> > No, you don't. You just need to make sure filters which change resolution
> > or do cropping also change the side data parameters.
> > It's called maintainership. As-is, this won't even work with cropping,
> > only with basic aspect ratio preserving scaling.
> > For the lack of a better term, this is a hack.
>
> As discussed in previous email, for the frame size change case, dnn_classify
> (and other filters which use the detection result, for example drawbox) can
> just output a warning message to tell user what happens, and don't do the
> classification, otherwise, it will give a wrong/weird result which makes the
> user confused.
>
> >
> > I would accept just specifying that if the frame dimensions are
> > altered in any way, the side-data is no longer valid and it's up
> > to users to figure that out by out of bound coordinates.
> > This is what we currently do with video_enc_params.
>
> frame_width/frame_height is not perfect (for the cases such as: scale down
> + crop + scale up to the same size), but it provides more info than the checking
> of 'out of bound coordinates'. There are many other possible issues when the
> coordinates are within the frame.
>
> If we think we'd better not let user get more info from the warning message,
> I'm ok to remove them.
>
> I'll remove them if there's another comment supporting the removal, and
> there's no objection.
>
> >
> >
> > >> >> >> > diff --git a/libavutil/frame.h b/libavutil/frame.h
> > >> >> >> > index a5ed91b20a..41e22de02a 100644
> > >> >> >> > --- a/libavutil/frame.h
> > >> >> >> > +++ b/libavutil/frame.h
> > >> >> >> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
> > >> >> >> >  * Must be present for every frame which should have film grain
> > >> applied.
> > >> >> >> >  */
> > >> >> >> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
> > >> >> >> > +
> > >> >> >> > +    /**
> > >> >> >> > +     * Bounding boxes for object detection and classification, the
> > >> data is
> > >> >> a
> > >> >> >> AVBoundingBoxHeader
> > >> >> >> > +     * followed with an array of AVBoudingBox, and
> > >> >> >> AVBoundingBoxHeader.nb_bboxes is the number
> > >> >> >> > +     * of array element.
> > >> >> >> > +     */
> > >> >> >> > +    AV_FRAME_DATA_BOUNDING_BOXES,
> > >> >> >> >  };
> > >> >> >> >
> > >> >> >>
> > >> >> >> Finally, why call it a Bounding Box? It's not descriptive at all.
> > >> >> >> How about "Object Classification"? It makes much more sense, it's
> > >> >> >> exactly what this is. So AVObjectClassification, AVObjectClassification,
> > >> >> >> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
> > >> >> >>
> > >> >> >
> > >> >> > In object detection papers, bounding box is usually used.
> > >> >> > We'd better use the same term, imho, thanks.
> > >> >> >
> > >> >>
> > >> >> Not in this case, API users won't have any idea what this is or what
> > >> >> it's for. This is user-facing code after all.
> > >> >> Papers in fields can get away with overloading language, but we're
> > >> >> trying to make a concise API. Object classification makes sense
> > >> >> because this is exactly what this is.
> > >> >>
> > >> >
> > >> > The term bounding box is dominating the domain, for example, even
> > >> > HEVC spec uses this term, see page 317 of
> > >> >
> > >>
> > https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-H.265-201911-I!!P
> > >> DF-E&type=items
> > >> >
> > >> > also copy some here for your convenient.
> > >> > ar_bounding_box_top[ ar_object_idx[ i ] ] u(16)
> > >> > ar_bounding_box_left[ ar_object_idx[ i ] ] u(16)
> > >> > ar_bounding_box_width[ ar_object_idx[ i ] ] u(16)
> > >> > ar_bounding_box_height[ ar_object_idx[ i ] ] u(16)
> > >> >
> > >> > I would prefer to use bounding box.
> > >> >
> > >>
> > >> It's for an entirely different thing, and like I said, it's just an overloaded
> > >> language because they can get away. We're trying to be generic.
> > >> This side data is for detecting _and_ classifying objects. In fact, most of
> > >> the structure is dedicated towards classifying. If you'd like users to actually
> > >> use this, give it a good name and don't leave them guessing what this
> > >> structure is by throwing vague jargon some other paper or spec has
> > >> because it's close enough.
> > >>
> > >
> > > all the people in the domain accepts bounding box, they can understand this
> > > struct name easily and clearly, they might be bothered if we use another
> > name.
> > >
> > > btw, AVObjectClassification confuses people who just want object detection.
> > >
> >
> > As I said, the name "bounding box" makes no sense once it gets overloaded
> > with object classification.
>
> dnn_detect creates an array of 'bounding box' for all detected objects, and
> dnn_classify assigns attributes for a set of bounding boxes (with same object
> id). 'bounding box' serves both detection and classification properly.
>
>
> > Object classification is still the main use of the filters,
> > because the original proposal was to have all of this info be ffmpeg-private,
> > which would forbid simple object detection.
>
> The original proposal is to add it as side data which is ffmpeg-public, and then,
> we spent much time discussing/trying with ffmpeg-private as an temporary
> method, and since it is not good to be temporary, we now switch back to
> ffmpeg-public.
>
> During the whole period, we don't have any intention to
> 'forbid simple object detection', not quite understand your point here.
>
>
> > So I still maintain this should be called "Object classification". I'd accept
> > "Object detection" as well, but definitely not "bounding box".
>
> imho, ' Object detection' and ' Object classification' are worse, they just
> describe one aspect of the struct. The users might just use filter dnn_detect,
> they might use filters dnn_detect + dnn_classify.
>
>
> >
> > Since the decision was made to make the side data public, we have to make
> > very sure it contains no hacks or is impossible to extend, since we don't want
> > to have an
> > "AV_SIDE_DATA_OBJECT_CLASSIFICATION_VERSION_2_SORRY_WE_SCREWED_
> > UP"
> > faster than you can say "Recursive cascade correlation artificial neural
> > networks".
>
> sorry, not quite understand your point here.
>
> 'bounding box' is designed for general purpose to contain the info for
> detection/classification. It doesn't matter which DNN model is used, it doesn't
> matter if a traditional algorithm (non-dnn) is used.
>
> I'm open to use a better name. And bounding box is the best one for me till now.
> Everyone in the domain knows the exact meaning of bounding box without
> extra explanation. This word has been extended/evolved with such meaning in
> this domain.
>
+1

I think it is wise to use the name which is widely used in the field.


> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Guo, Yejun April 9, 2021, 3:10 p.m. UTC | #19
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Guo, Yejun
> Sent: 2021年4月9日 20:57
> To: FFmpeg development discussions and patches
> <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> 
> 
> > -----Original Message-----
> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> > Lynne
> > Sent: 2021年4月9日 18:03
> > To: FFmpeg development discussions and patches
> > <ffmpeg-devel@ffmpeg.org>
> > Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> > AV_FRAME_DATA_BOUNDING_BOXES
> >
> > Apr 9, 2021, 06:12 by yejun.guo@intel.com:
> >
> > >
> > >
> > >> -----Original Message-----
> > >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf
> Of
> > Lynne
> > >> Sent: 2021年4月9日 0:57
> > >> To: FFmpeg development discussions and patches
> > <ffmpeg-devel@ffmpeg.org>
> > >> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> > >> AV_FRAME_DATA_BOUNDING_BOXES
> > >>
> > >
> > > First of all, thanks for the quick replies, I see, all the
> > discussions/comments are to
> > > make this patch better, thank you.
> > >
> > >> >> >
> > >> >> >> >> > +
> > >> >> >> >> > +typedef struct AVBoundingBoxHeader {
> > >> >> >> >> > +    /**
> > >> >> >> >> > +     * Information about how the bounding box is
> > generated.
> > >> >> >> >> > +     * for example, the DNN model name.
> > >> >> >> >> > +     */
> > >> >> >> >> > +    char source[128];
> > >> >> >> >> > +
> > >> >> >> >> > +    /**
> > >> >> >> >> > +     * The size of frame when it is detected.
> > >> >> >> >> > +     */
> > >> >> >> >> > +    int frame_width;
> > >> >> >> >> > +    int frame_height;
> > >> >> >> >> >
> > >> >> >> >>
> > >> >> >> >> Why? This side data is attached to AVFrames only, where we
> > >> >> >> >> already have width and height.
> > >> >> >> >>
> > >> >> >> >
> > >> >> >> > The detection result will be used by other filters, for example,
> > >> >> >> > dnn_classify (see
> > https://github.com/guoyejun/ffmpeg/tree/classify).
> > >> >> >> >
> > >> >> >> > The filter dnn_detect detects all the objects (cat, dog,
> person ...)
> > in a
> > >> >> >> > frame, while dnn_classify classifies one detected object (for
> > example,
> > >> >> person)
> > >> >> >> > for its attribute (for example, emotion, etc.)
> > >> >> >> >
> > >> >> >> > The filter dnn_classify have to check if the frame size is
> changed
> > after
> > >> >> >> > it is detected, to handle the below filter chain:
> > >> >> >> > dnn_detect -> scale -> dnn_classify
> > >> >> >> >
> > >> >> >>
> > >> >> >> This doesn't look good. Why is dnn_classify needing to know
> > >> >> >> the original frame size at all?
> > >> >> >>
> > >> >> >
> > >> >> > For example, the original size of the frame is 100*100, and
> > dnn_detect
> > >> >> > detects a face at place (10, 10) -> (30, 40), such data will be saved
> in
> > >> >> > AVBoundingBox.top/left/right/bottom.
> > >> >> >
> > >> >> > Then, the frame is scaled into 50*50.
> > >> >> >
> > >> >> > Then, dnn_classify is used to analyze the emotion of the face, it
> > needs to
> > >> >> > know the frame size (100*100) when it is detected, otherwise, it
> > does not
> > >> >> > work with just (10,10), (30,40) and 50*50.
> > >> >> >
> > >> >>
> > >> >> Why can't the scale filter also rescale this side data as well?
> > >> >>
> > >> >
> > >> > I'm afraid that we could not make sure all such filters (including
> filters
> > in the
> > >> > future) to do the rescale. And in the previous discussion, I got to
> > know that
> > >> > 'many other existing side-data types are invalidated by scaling'.
> > >> >
> > >> > So, we need frame_width and frame_height here.
> > >> >
> > >>
> > >> No, you don't. You just need to make sure filters which change
> > resolution
> > >> or do cropping also change the side data parameters.
> > >> It's called maintainership. As-is, this won't even work with cropping,
> > >> only with basic aspect ratio preserving scaling.
> > >> For the lack of a better term, this is a hack.
> > >>
> > >
> > > As discussed in previous email, for the frame size change case,
> > dnn_classify
> > > (and other filters which use the detection result, for example drawbox)
> > can
> > > just output a warning message to tell user what happens, and don't do
> > the
> > > classification, otherwise, it will give a wrong/weird result which makes
> > the
> > > user confused.
> > >
> > >>
> > >> I would accept just specifying that if the frame dimensions are
> > >> altered in any way, the side-data is no longer valid and it's up
> > >> to users to figure that out by out of bound coordinates.
> > >> This is what we currently do with video_enc_params.
> > >>
> > >
> > > frame_width/frame_height is not perfect (for the cases such as: scale
> > down
> > > + crop + scale up to the same size), but it provides more info than the
> > checking
> > > of 'out of bound coordinates'. There are many other possible issues
> > when the
> > > coordinates are within the frame.
> > >
> > > If we think we'd better not let user get more info from the warning
> > message,
> > > I'm ok to remove them.
> > >
> > > I'll remove them if there's another comment supporting the removal,
> and
> > > there's no objection.
> > >
> >
> > We definitely shouldn't include variables in public API structs
> > that only serve to print a warning if they don't match.
> 
> Not just 'print a warning', it also impacts the behavior of dnn_classify.
> 
> > Especially one that's fragile and flawed like this one.
> > Users should know that scaling or altering a frame could break
> > this side data, and filters could detect obvious out of bounds
> > results and report them.
> 
> I'll remove them since it is user's responsibility.
> 
> >
> > In the meantime, the main scaling and cropping filters could
> > receive separate patches to preserve metadata at a later data.
> > This is how the avframe cropping field work - they're just metadata,
> > and cropping/scaling filters update those.
> >
> >
> > >> >> >> >> > diff --git a/libavutil/frame.h b/libavutil/frame.h
> > >> >> >> >> > index a5ed91b20a..41e22de02a 100644
> > >> >> >> >> > --- a/libavutil/frame.h
> > >> >> >> >> > +++ b/libavutil/frame.h
> > >> >> >> >> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
> > >> >> >> >> >  * Must be present for every frame which should have
> film
> > grain
> > >> >> applied.
> > >> >> >> >> >  */
> > >> >> >> >> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
> > >> >> >> >> > +
> > >> >> >> >> > +    /**
> > >> >> >> >> > +     * Bounding boxes for object detection and
> > classification, the
> > >> >> data is
> > >> >> >> a
> > >> >> >> >> AVBoundingBoxHeader
> > >> >> >> >> > +     * followed with an array of AVBoudingBox, and
> > >> >> >> >> AVBoundingBoxHeader.nb_bboxes is the number
> > >> >> >> >> > +     * of array element.
> > >> >> >> >> > +     */
> > >> >> >> >> > +    AV_FRAME_DATA_BOUNDING_BOXES,
> > >> >> >> >> >  };
> > >> >> >> >> >
> > >> >> >> >>
> > >> >> >> >> Finally, why call it a Bounding Box? It's not descriptive at all.
> > >> >> >> >> How about "Object Classification"? It makes much more
> sense,
> > it's
> > >> >> >> >> exactly what this is. So AVObjectClassification,
> > AVObjectClassification,
> > >> >> >> >> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
> > >> >> >> >>
> > >> >> >> >
> > >> >> >> > In object detection papers, bounding box is usually used.
> > >> >> >> > We'd better use the same term, imho, thanks.
> > >> >> >> >
> > >> >> >>
> > >> >> >> Not in this case, API users won't have any idea what this is or
> > what
> > >> >> >> it's for. This is user-facing code after all.
> > >> >> >> Papers in fields can get away with overloading language, but
> > we're
> > >> >> >> trying to make a concise API. Object classification makes sense
> > >> >> >> because this is exactly what this is.
> > >> >> >>
> > >> >> >
> > >> >> > The term bounding box is dominating the domain, for example,
> > even
> > >> >> > HEVC spec uses this term, see page 317 of
> > >> >> >
> > >> >>
> > >>
> >
> https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-H.265-201911-I
> > !!P
> > >> >> DF-E&type=items
> > >> >> >
> > >> >> > also copy some here for your convenient.
> > >> >> > ar_bounding_box_top[ ar_object_idx[ i ] ] u(16)
> > >> >> > ar_bounding_box_left[ ar_object_idx[ i ] ] u(16)
> > >> >> > ar_bounding_box_width[ ar_object_idx[ i ] ] u(16)
> > >> >> > ar_bounding_box_height[ ar_object_idx[ i ] ] u(16)
> > >> >> >
> > >> >> > I would prefer to use bounding box.
> > >> >> >
> > >> >>
> > >> >> It's for an entirely different thing, and like I said, it's just an
> > overloaded
> > >> >> language because they can get away. We're trying to be generic.
> > >> >> This side data is for detecting _and_ classifying objects. In fact,
> most
> > of
> > >> >> the structure is dedicated towards classifying. If you'd like users to
> > actually
> > >> >> use this, give it a good name and don't leave them guessing what
> > this
> > >> >> structure is by throwing vague jargon some other paper or spec
> has
> > >> >> because it's close enough.
> > >> >>
> > >> >
> > >> > all the people in the domain accepts bounding box, they can
> > understand this
> > >> > struct name easily and clearly, they might be bothered if we use
> > another
> > >> name.
> > >> >
> > >> > btw, AVObjectClassification confuses people who just want object
> > detection.
> > >> >
> > >>
> > >> As I said, the name "bounding box" makes no sense once it gets
> > overloaded
> > >> with object classification.
> > >>
> > >
> > > dnn_detect creates an array of 'bounding box' for all detected objects,
> > and
> > > dnn_classify assigns attributes for a set of bounding boxes (with same
> > object
> > > id). 'bounding box' serves both detection and classification properly.
> > >
> > >
> > >> Object classification is still the main use of the filters,
> > >> because the original proposal was to have all of this info be
> > ffmpeg-private,
> > >> which would forbid simple object detection.
> > >>
> > >
> > > The original proposal is to add it as side data which is ffmpeg-public,
> and
> > then,
> > > we spent much time discussing/trying with ffmpeg-private as an
> > temporary
> > > method, and since it is not good to be temporary, we now switch back
> to
> > > ffmpeg-public.
> > >
> > > During the whole period, we don't have any intention to
> > > 'forbid simple object detection', not quite understand your point here.
> > >
> > >
> > >> So I still maintain this should be called "Object classification". I'd
> accept
> > >> "Object detection" as well, but definitely not "bounding box".
> > >>
> > >
> > > imho, ' Object detection' and ' Object classification' are worse, they just
> > > describe one aspect of the struct. The users might just use filter
> > dnn_detect,
> > > they might use filters dnn_detect + dnn_classify.
> > >
> >
> > The whole reason why we have this side data is to both detect
> > _and_ classify. Keyword being _both_. Hence object detection
> > and object classification are much better names.
> > I am opposed to merging this without a name change.
> 
> I understand it is not a good name with codec background, but it is a good
> (possible best) name with object detection background.
> 
> To make things move forward, I'll change the name to 'object detection'
> for both dnn_detect and dnn_classify, they would be:
> AV_FRAME_DATA_OJBECT_DETECTION
> AVObjectDetection
> AVObjectDetectionHeader

I tried to change the code with this new name, but feel like it is not
straight-forward during the coding.

Pedro, who initialized DNN module, just commented on the naming, maybe
I can just wait for more several days to get more comments on the naming,
thanks.
Lynne April 9, 2021, 3:15 p.m. UTC | #20
Apr 9, 2021, 16:35 by bygrandao@gmail.com:

> Em sex., 9 de abr. de 2021 às 01:13, Guo, Yejun <yejun.guo@intel.com> escreveu:
>
>>
>>
>>
>> > -----Original Message-----
>> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Lynne
>> > Sent: 2021年4月9日 0:57
>> > To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
>> > Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
>> > AV_FRAME_DATA_BOUNDING_BOXES
>> >
>>
>> First of all, thanks for the quick replies, I see, all the discussions/comments are to
>> make this patch better, thank you.
>>
>> > >> >
>> > >> >> >> > +
>> > >> >> >> > +typedef struct AVBoundingBoxHeader {
>> > >> >> >> > +    /**
>> > >> >> >> > +     * Information about how the bounding box is generated.
>> > >> >> >> > +     * for example, the DNN model name.
>> > >> >> >> > +     */
>> > >> >> >> > +    char source[128];
>> > >> >> >> > +
>> > >> >> >> > +    /**
>> > >> >> >> > +     * The size of frame when it is detected.
>> > >> >> >> > +     */
>> > >> >> >> > +    int frame_width;
>> > >> >> >> > +    int frame_height;
>> > >> >> >> >
>> > >> >> >>
>> > >> >> >> Why? This side data is attached to AVFrames only, where we
>> > >> >> >> already have width and height.
>> > >> >> >>
>> > >> >> >
>> > >> >> > The detection result will be used by other filters, for example,
>> > >> >> > dnn_classify (see https://github.com/guoyejun/ffmpeg/tree/classify).
>> > >> >> >
>> > >> >> > The filter dnn_detect detects all the objects (cat, dog, person ...) in a
>> > >> >> > frame, while dnn_classify classifies one detected object (for example,
>> > >> person)
>> > >> >> > for its attribute (for example, emotion, etc.)
>> > >> >> >
>> > >> >> > The filter dnn_classify have to check if the frame size is changed after
>> > >> >> > it is detected, to handle the below filter chain:
>> > >> >> > dnn_detect -> scale -> dnn_classify
>> > >> >> >
>> > >> >>
>> > >> >> This doesn't look good. Why is dnn_classify needing to know
>> > >> >> the original frame size at all?
>> > >> >>
>> > >> >
>> > >> > For example, the original size of the frame is 100*100, and dnn_detect
>> > >> > detects a face at place (10, 10) -> (30, 40), such data will be saved in
>> > >> > AVBoundingBox.top/left/right/bottom.
>> > >> >
>> > >> > Then, the frame is scaled into 50*50.
>> > >> >
>> > >> > Then, dnn_classify is used to analyze the emotion of the face, it needs to
>> > >> > know the frame size (100*100) when it is detected, otherwise, it does not
>> > >> > work with just (10,10), (30,40) and 50*50.
>> > >> >
>> > >>
>> > >> Why can't the scale filter also rescale this side data as well?
>> > >>
>> > >
>> > > I'm afraid that we could not make sure all such filters (including filters in the
>> > > future) to do the rescale. And in the previous discussion, I got to know that
>> > > 'many other existing side-data types are invalidated by scaling'.
>> > >
>> > > So, we need frame_width and frame_height here.
>> > >
>> >
>> > No, you don't. You just need to make sure filters which change resolution
>> > or do cropping also change the side data parameters.
>> > It's called maintainership. As-is, this won't even work with cropping,
>> > only with basic aspect ratio preserving scaling.
>> > For the lack of a better term, this is a hack.
>>
>> As discussed in previous email, for the frame size change case, dnn_classify
>> (and other filters which use the detection result, for example drawbox) can
>> just output a warning message to tell user what happens, and don't do the
>> classification, otherwise, it will give a wrong/weird result which makes the
>> user confused.
>>
>> >
>> > I would accept just specifying that if the frame dimensions are
>> > altered in any way, the side-data is no longer valid and it's up
>> > to users to figure that out by out of bound coordinates.
>> > This is what we currently do with video_enc_params.
>>
>> frame_width/frame_height is not perfect (for the cases such as: scale down
>> + crop + scale up to the same size), but it provides more info than the checking
>> of 'out of bound coordinates'. There are many other possible issues when the
>> coordinates are within the frame.
>>
>> If we think we'd better not let user get more info from the warning message,
>> I'm ok to remove them.
>>
>> I'll remove them if there's another comment supporting the removal, and
>> there's no objection.
>>
>> >
>> >
>> > >> >> >> > diff --git a/libavutil/frame.h b/libavutil/frame.h
>> > >> >> >> > index a5ed91b20a..41e22de02a 100644
>> > >> >> >> > --- a/libavutil/frame.h
>> > >> >> >> > +++ b/libavutil/frame.h
>> > >> >> >> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
>> > >> >> >> >  * Must be present for every frame which should have film grain
>> > >> applied.
>> > >> >> >> >  */
>> > >> >> >> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
>> > >> >> >> > +
>> > >> >> >> > +    /**
>> > >> >> >> > +     * Bounding boxes for object detection and classification, the
>> > >> data is
>> > >> >> a
>> > >> >> >> AVBoundingBoxHeader
>> > >> >> >> > +     * followed with an array of AVBoudingBox, and
>> > >> >> >> AVBoundingBoxHeader.nb_bboxes is the number
>> > >> >> >> > +     * of array element.
>> > >> >> >> > +     */
>> > >> >> >> > +    AV_FRAME_DATA_BOUNDING_BOXES,
>> > >> >> >> >  };
>> > >> >> >> >
>> > >> >> >>
>> > >> >> >> Finally, why call it a Bounding Box? It's not descriptive at all.
>> > >> >> >> How about "Object Classification"? It makes much more sense, it's
>> > >> >> >> exactly what this is. So AVObjectClassification, AVObjectClassification,
>> > >> >> >> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
>> > >> >> >>
>> > >> >> >
>> > >> >> > In object detection papers, bounding box is usually used.
>> > >> >> > We'd better use the same term, imho, thanks.
>> > >> >> >
>> > >> >>
>> > >> >> Not in this case, API users won't have any idea what this is or what
>> > >> >> it's for. This is user-facing code after all.
>> > >> >> Papers in fields can get away with overloading language, but we're
>> > >> >> trying to make a concise API. Object classification makes sense
>> > >> >> because this is exactly what this is.
>> > >> >>
>> > >> >
>> > >> > The term bounding box is dominating the domain, for example, even
>> > >> > HEVC spec uses this term, see page 317 of
>> > >> >
>> > >>
>> > https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-H.265-201911-I!!P
>> > >> DF-E&type=items
>> > >> >
>> > >> > also copy some here for your convenient.
>> > >> > ar_bounding_box_top[ ar_object_idx[ i ] ] u(16)
>> > >> > ar_bounding_box_left[ ar_object_idx[ i ] ] u(16)
>> > >> > ar_bounding_box_width[ ar_object_idx[ i ] ] u(16)
>> > >> > ar_bounding_box_height[ ar_object_idx[ i ] ] u(16)
>> > >> >
>> > >> > I would prefer to use bounding box.
>> > >> >
>> > >>
>> > >> It's for an entirely different thing, and like I said, it's just an overloaded
>> > >> language because they can get away. We're trying to be generic.
>> > >> This side data is for detecting _and_ classifying objects. In fact, most of
>> > >> the structure is dedicated towards classifying. If you'd like users to actually
>> > >> use this, give it a good name and don't leave them guessing what this
>> > >> structure is by throwing vague jargon some other paper or spec has
>> > >> because it's close enough.
>> > >>
>> > >
>> > > all the people in the domain accepts bounding box, they can understand this
>> > > struct name easily and clearly, they might be bothered if we use another
>> > name.
>> > >
>> > > btw, AVObjectClassification confuses people who just want object detection.
>> > >
>> >
>> > As I said, the name "bounding box" makes no sense once it gets overloaded
>> > with object classification.
>>
>> dnn_detect creates an array of 'bounding box' for all detected objects, and
>> dnn_classify assigns attributes for a set of bounding boxes (with same object
>> id). 'bounding box' serves both detection and classification properly.
>>
>>
>> > Object classification is still the main use of the filters,
>> > because the original proposal was to have all of this info be ffmpeg-private,
>> > which would forbid simple object detection.
>>
>> The original proposal is to add it as side data which is ffmpeg-public, and then,
>> we spent much time discussing/trying with ffmpeg-private as an temporary
>> method, and since it is not good to be temporary, we now switch back to
>> ffmpeg-public.
>>
>> During the whole period, we don't have any intention to
>> 'forbid simple object detection', not quite understand your point here.
>>
>>
>> > So I still maintain this should be called "Object classification". I'd accept
>> > "Object detection" as well, but definitely not "bounding box".
>>
>> imho, ' Object detection' and ' Object classification' are worse, they just
>> describe one aspect of the struct. The users might just use filter dnn_detect,
>> they might use filters dnn_detect + dnn_classify.
>>
>>
>> >
>> > Since the decision was made to make the side data public, we have to make
>> > very sure it contains no hacks or is impossible to extend, since we don't want
>> > to have an
>> > "AV_SIDE_DATA_OBJECT_CLASSIFICATION_VERSION_2_SORRY_WE_SCREWED_
>> > UP"
>> > faster than you can say "Recursive cascade correlation artificial neural
>> > networks".
>>
>> sorry, not quite understand your point here.
>>
>> 'bounding box' is designed for general purpose to contain the info for
>> detection/classification. It doesn't matter which DNN model is used, it doesn't
>> matter if a traditional algorithm (non-dnn) is used.
>>
>> I'm open to use a better name. And bounding box is the best one for me till now.
>> Everyone in the domain knows the exact meaning of bounding box without
>> extra explanation. This word has been extended/evolved with such meaning in
>> this domain.
>>
> +1
>
> I think it is wise to use the name which is widely used in the field.
>

Way to go, ignoring half the exchange to voice an opinion after we
came to an agreement.
Lynne April 9, 2021, 3:17 p.m. UTC | #21
Apr 9, 2021, 17:10 by yejun.guo@intel.com:

>
>
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>> Guo, Yejun
>> Sent: 2021年4月9日 20:57
>> To: FFmpeg development discussions and patches
>> <ffmpeg-devel@ffmpeg.org>
>> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
>> AV_FRAME_DATA_BOUNDING_BOXES
>>
>>
>>
>> > -----Original Message-----
>> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>> > Lynne
>> > Sent: 2021年4月9日 18:03
>> > To: FFmpeg development discussions and patches
>> > <ffmpeg-devel@ffmpeg.org>
>> > Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
>> > AV_FRAME_DATA_BOUNDING_BOXES
>> >
>> > Apr 9, 2021, 06:12 by yejun.guo@intel.com:
>> >
>> > >
>> > >
>> > >> -----Original Message-----
>> > >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf
>> Of
>> > Lynne
>> > >> Sent: 2021年4月9日 0:57
>> > >> To: FFmpeg development discussions and patches
>> > <ffmpeg-devel@ffmpeg.org>
>> > >> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
>> > >> AV_FRAME_DATA_BOUNDING_BOXES
>> > >>
>> > >
>> > > First of all, thanks for the quick replies, I see, all the
>> > discussions/comments are to
>> > > make this patch better, thank you.
>> > >
>> > >> >> >
>> > >> >> >> >> > +
>> > >> >> >> >> > +typedef struct AVBoundingBoxHeader {
>> > >> >> >> >> > +    /**
>> > >> >> >> >> > +     * Information about how the bounding box is
>> > generated.
>> > >> >> >> >> > +     * for example, the DNN model name.
>> > >> >> >> >> > +     */
>> > >> >> >> >> > +    char source[128];
>> > >> >> >> >> > +
>> > >> >> >> >> > +    /**
>> > >> >> >> >> > +     * The size of frame when it is detected.
>> > >> >> >> >> > +     */
>> > >> >> >> >> > +    int frame_width;
>> > >> >> >> >> > +    int frame_height;
>> > >> >> >> >> >
>> > >> >> >> >>
>> > >> >> >> >> Why? This side data is attached to AVFrames only, where we
>> > >> >> >> >> already have width and height.
>> > >> >> >> >>
>> > >> >> >> >
>> > >> >> >> > The detection result will be used by other filters, for example,
>> > >> >> >> > dnn_classify (see
>> > https://github.com/guoyejun/ffmpeg/tree/classify).
>> > >> >> >> >
>> > >> >> >> > The filter dnn_detect detects all the objects (cat, dog,
>> person ...)
>> > in a
>> > >> >> >> > frame, while dnn_classify classifies one detected object (for
>> > example,
>> > >> >> person)
>> > >> >> >> > for its attribute (for example, emotion, etc.)
>> > >> >> >> >
>> > >> >> >> > The filter dnn_classify have to check if the frame size is
>> changed
>> > after
>> > >> >> >> > it is detected, to handle the below filter chain:
>> > >> >> >> > dnn_detect -> scale -> dnn_classify
>> > >> >> >> >
>> > >> >> >>
>> > >> >> >> This doesn't look good. Why is dnn_classify needing to know
>> > >> >> >> the original frame size at all?
>> > >> >> >>
>> > >> >> >
>> > >> >> > For example, the original size of the frame is 100*100, and
>> > dnn_detect
>> > >> >> > detects a face at place (10, 10) -> (30, 40), such data will be saved
>> in
>> > >> >> > AVBoundingBox.top/left/right/bottom.
>> > >> >> >
>> > >> >> > Then, the frame is scaled into 50*50.
>> > >> >> >
>> > >> >> > Then, dnn_classify is used to analyze the emotion of the face, it
>> > needs to
>> > >> >> > know the frame size (100*100) when it is detected, otherwise, it
>> > does not
>> > >> >> > work with just (10,10), (30,40) and 50*50.
>> > >> >> >
>> > >> >>
>> > >> >> Why can't the scale filter also rescale this side data as well?
>> > >> >>
>> > >> >
>> > >> > I'm afraid that we could not make sure all such filters (including
>> filters
>> > in the
>> > >> > future) to do the rescale. And in the previous discussion, I got to
>> > know that
>> > >> > 'many other existing side-data types are invalidated by scaling'.
>> > >> >
>> > >> > So, we need frame_width and frame_height here.
>> > >> >
>> > >>
>> > >> No, you don't. You just need to make sure filters which change
>> > resolution
>> > >> or do cropping also change the side data parameters.
>> > >> It's called maintainership. As-is, this won't even work with cropping,
>> > >> only with basic aspect ratio preserving scaling.
>> > >> For the lack of a better term, this is a hack.
>> > >>
>> > >
>> > > As discussed in previous email, for the frame size change case,
>> > dnn_classify
>> > > (and other filters which use the detection result, for example drawbox)
>> > can
>> > > just output a warning message to tell user what happens, and don't do
>> > the
>> > > classification, otherwise, it will give a wrong/weird result which makes
>> > the
>> > > user confused.
>> > >
>> > >>
>> > >> I would accept just specifying that if the frame dimensions are
>> > >> altered in any way, the side-data is no longer valid and it's up
>> > >> to users to figure that out by out of bound coordinates.
>> > >> This is what we currently do with video_enc_params.
>> > >>
>> > >
>> > > frame_width/frame_height is not perfect (for the cases such as: scale
>> > down
>> > > + crop + scale up to the same size), but it provides more info than the
>> > checking
>> > > of 'out of bound coordinates'. There are many other possible issues
>> > when the
>> > > coordinates are within the frame.
>> > >
>> > > If we think we'd better not let user get more info from the warning
>> > message,
>> > > I'm ok to remove them.
>> > >
>> > > I'll remove them if there's another comment supporting the removal,
>> and
>> > > there's no objection.
>> > >
>> >
>> > We definitely shouldn't include variables in public API structs
>> > that only serve to print a warning if they don't match.
>>
>> Not just 'print a warning', it also impacts the behavior of dnn_classify.
>>
>> > Especially one that's fragile and flawed like this one.
>> > Users should know that scaling or altering a frame could break
>> > this side data, and filters could detect obvious out of bounds
>> > results and report them.
>>
>> I'll remove them since it is user's responsibility.
>>
>> >
>> > In the meantime, the main scaling and cropping filters could
>> > receive separate patches to preserve metadata at a later data.
>> > This is how the avframe cropping field work - they're just metadata,
>> > and cropping/scaling filters update those.
>> >
>> >
>> > >> >> >> >> > diff --git a/libavutil/frame.h b/libavutil/frame.h
>> > >> >> >> >> > index a5ed91b20a..41e22de02a 100644
>> > >> >> >> >> > --- a/libavutil/frame.h
>> > >> >> >> >> > +++ b/libavutil/frame.h
>> > >> >> >> >> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
>> > >> >> >> >> >  * Must be present for every frame which should have
>> film
>> > grain
>> > >> >> applied.
>> > >> >> >> >> >  */
>> > >> >> >> >> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
>> > >> >> >> >> > +
>> > >> >> >> >> > +    /**
>> > >> >> >> >> > +     * Bounding boxes for object detection and
>> > classification, the
>> > >> >> data is
>> > >> >> >> a
>> > >> >> >> >> AVBoundingBoxHeader
>> > >> >> >> >> > +     * followed with an array of AVBoudingBox, and
>> > >> >> >> >> AVBoundingBoxHeader.nb_bboxes is the number
>> > >> >> >> >> > +     * of array element.
>> > >> >> >> >> > +     */
>> > >> >> >> >> > +    AV_FRAME_DATA_BOUNDING_BOXES,
>> > >> >> >> >> >  };
>> > >> >> >> >> >
>> > >> >> >> >>
>> > >> >> >> >> Finally, why call it a Bounding Box? It's not descriptive at all.
>> > >> >> >> >> How about "Object Classification"? It makes much more
>> sense,
>> > it's
>> > >> >> >> >> exactly what this is. So AVObjectClassification,
>> > AVObjectClassification,
>> > >> >> >> >> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
>> > >> >> >> >>
>> > >> >> >> >
>> > >> >> >> > In object detection papers, bounding box is usually used.
>> > >> >> >> > We'd better use the same term, imho, thanks.
>> > >> >> >> >
>> > >> >> >>
>> > >> >> >> Not in this case, API users won't have any idea what this is or
>> > what
>> > >> >> >> it's for. This is user-facing code after all.
>> > >> >> >> Papers in fields can get away with overloading language, but
>> > we're
>> > >> >> >> trying to make a concise API. Object classification makes sense
>> > >> >> >> because this is exactly what this is.
>> > >> >> >>
>> > >> >> >
>> > >> >> > The term bounding box is dominating the domain, for example,
>> > even
>> > >> >> > HEVC spec uses this term, see page 317 of
>> > >> >> >
>> > >> >>
>> > >>
>> >
>> https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-H.265-201911-I
>> > !!P
>> > >> >> DF-E&type=items
>> > >> >> >
>> > >> >> > also copy some here for your convenient.
>> > >> >> > ar_bounding_box_top[ ar_object_idx[ i ] ] u(16)
>> > >> >> > ar_bounding_box_left[ ar_object_idx[ i ] ] u(16)
>> > >> >> > ar_bounding_box_width[ ar_object_idx[ i ] ] u(16)
>> > >> >> > ar_bounding_box_height[ ar_object_idx[ i ] ] u(16)
>> > >> >> >
>> > >> >> > I would prefer to use bounding box.
>> > >> >> >
>> > >> >>
>> > >> >> It's for an entirely different thing, and like I said, it's just an
>> > overloaded
>> > >> >> language because they can get away. We're trying to be generic.
>> > >> >> This side data is for detecting _and_ classifying objects. In fact,
>> most
>> > of
>> > >> >> the structure is dedicated towards classifying. If you'd like users to
>> > actually
>> > >> >> use this, give it a good name and don't leave them guessing what
>> > this
>> > >> >> structure is by throwing vague jargon some other paper or spec
>> has
>> > >> >> because it's close enough.
>> > >> >>
>> > >> >
>> > >> > all the people in the domain accepts bounding box, they can
>> > understand this
>> > >> > struct name easily and clearly, they might be bothered if we use
>> > another
>> > >> name.
>> > >> >
>> > >> > btw, AVObjectClassification confuses people who just want object
>> > detection.
>> > >> >
>> > >>
>> > >> As I said, the name "bounding box" makes no sense once it gets
>> > overloaded
>> > >> with object classification.
>> > >>
>> > >
>> > > dnn_detect creates an array of 'bounding box' for all detected objects,
>> > and
>> > > dnn_classify assigns attributes for a set of bounding boxes (with same
>> > object
>> > > id). 'bounding box' serves both detection and classification properly.
>> > >
>> > >
>> > >> Object classification is still the main use of the filters,
>> > >> because the original proposal was to have all of this info be
>> > ffmpeg-private,
>> > >> which would forbid simple object detection.
>> > >>
>> > >
>> > > The original proposal is to add it as side data which is ffmpeg-public,
>> and
>> > then,
>> > > we spent much time discussing/trying with ffmpeg-private as an
>> > temporary
>> > > method, and since it is not good to be temporary, we now switch back
>> to
>> > > ffmpeg-public.
>> > >
>> > > During the whole period, we don't have any intention to
>> > > 'forbid simple object detection', not quite understand your point here.
>> > >
>> > >
>> > >> So I still maintain this should be called "Object classification". I'd
>> accept
>> > >> "Object detection" as well, but definitely not "bounding box".
>> > >>
>> > >
>> > > imho, ' Object detection' and ' Object classification' are worse, they just
>> > > describe one aspect of the struct. The users might just use filter
>> > dnn_detect,
>> > > they might use filters dnn_detect + dnn_classify.
>> > >
>> >
>> > The whole reason why we have this side data is to both detect
>> > _and_ classify. Keyword being _both_. Hence object detection
>> > and object classification are much better names.
>> > I am opposed to merging this without a name change.
>>
>> I understand it is not a good name with codec background, but it is a good
>> (possible best) name with object detection background.
>>
>> To make things move forward, I'll change the name to 'object detection'
>> for both dnn_detect and dnn_classify, they would be:
>> AV_FRAME_DATA_OJBECT_DETECTION
>> AVObjectDetection
>> AVObjectDetectionHeader
>>
>
> I tried to change the code with this new name, but feel like it is not
> straight-forward during the coding.
>
> Pedro, who initialized DNN module, just commented on the naming, maybe
> I can just wait for more several days to get more comments on the naming,
> thanks.
>

If you want to move forward with this patch, please send a new one
with the field and name changes.
Pedro Arthur April 9, 2021, 6:08 p.m. UTC | #22
Em sex., 9 de abr. de 2021 às 12:15, Lynne <dev@lynne.ee> escreveu:
>
> Apr 9, 2021, 16:35 by bygrandao@gmail.com:
>
> > Em sex., 9 de abr. de 2021 às 01:13, Guo, Yejun <yejun.guo@intel.com> escreveu:
> >
> >>
> >>
> >>
> >> > -----Original Message-----
> >> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Lynne
> >> > Sent: 2021年4月9日 0:57
> >> > To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> >> > Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> >> > AV_FRAME_DATA_BOUNDING_BOXES
> >> >
> >>
> >> First of all, thanks for the quick replies, I see, all the discussions/comments are to
> >> make this patch better, thank you.
> >>
> >> > >> >
> >> > >> >> >> > +
> >> > >> >> >> > +typedef struct AVBoundingBoxHeader {
> >> > >> >> >> > +    /**
> >> > >> >> >> > +     * Information about how the bounding box is generated.
> >> > >> >> >> > +     * for example, the DNN model name.
> >> > >> >> >> > +     */
> >> > >> >> >> > +    char source[128];
> >> > >> >> >> > +
> >> > >> >> >> > +    /**
> >> > >> >> >> > +     * The size of frame when it is detected.
> >> > >> >> >> > +     */
> >> > >> >> >> > +    int frame_width;
> >> > >> >> >> > +    int frame_height;
> >> > >> >> >> >
> >> > >> >> >>
> >> > >> >> >> Why? This side data is attached to AVFrames only, where we
> >> > >> >> >> already have width and height.
> >> > >> >> >>
> >> > >> >> >
> >> > >> >> > The detection result will be used by other filters, for example,
> >> > >> >> > dnn_classify (see https://github.com/guoyejun/ffmpeg/tree/classify).
> >> > >> >> >
> >> > >> >> > The filter dnn_detect detects all the objects (cat, dog, person ...) in a
> >> > >> >> > frame, while dnn_classify classifies one detected object (for example,
> >> > >> person)
> >> > >> >> > for its attribute (for example, emotion, etc.)
> >> > >> >> >
> >> > >> >> > The filter dnn_classify have to check if the frame size is changed after
> >> > >> >> > it is detected, to handle the below filter chain:
> >> > >> >> > dnn_detect -> scale -> dnn_classify
> >> > >> >> >
> >> > >> >>
> >> > >> >> This doesn't look good. Why is dnn_classify needing to know
> >> > >> >> the original frame size at all?
> >> > >> >>
> >> > >> >
> >> > >> > For example, the original size of the frame is 100*100, and dnn_detect
> >> > >> > detects a face at place (10, 10) -> (30, 40), such data will be saved in
> >> > >> > AVBoundingBox.top/left/right/bottom.
> >> > >> >
> >> > >> > Then, the frame is scaled into 50*50.
> >> > >> >
> >> > >> > Then, dnn_classify is used to analyze the emotion of the face, it needs to
> >> > >> > know the frame size (100*100) when it is detected, otherwise, it does not
> >> > >> > work with just (10,10), (30,40) and 50*50.
> >> > >> >
> >> > >>
> >> > >> Why can't the scale filter also rescale this side data as well?
> >> > >>
> >> > >
> >> > > I'm afraid that we could not make sure all such filters (including filters in the
> >> > > future) to do the rescale. And in the previous discussion, I got to know that
> >> > > 'many other existing side-data types are invalidated by scaling'.
> >> > >
> >> > > So, we need frame_width and frame_height here.
> >> > >
> >> >
> >> > No, you don't. You just need to make sure filters which change resolution
> >> > or do cropping also change the side data parameters.
> >> > It's called maintainership. As-is, this won't even work with cropping,
> >> > only with basic aspect ratio preserving scaling.
> >> > For the lack of a better term, this is a hack.
> >>
> >> As discussed in previous email, for the frame size change case, dnn_classify
> >> (and other filters which use the detection result, for example drawbox) can
> >> just output a warning message to tell user what happens, and don't do the
> >> classification, otherwise, it will give a wrong/weird result which makes the
> >> user confused.
> >>
> >> >
> >> > I would accept just specifying that if the frame dimensions are
> >> > altered in any way, the side-data is no longer valid and it's up
> >> > to users to figure that out by out of bound coordinates.
> >> > This is what we currently do with video_enc_params.
> >>
> >> frame_width/frame_height is not perfect (for the cases such as: scale down
> >> + crop + scale up to the same size), but it provides more info than the checking
> >> of 'out of bound coordinates'. There are many other possible issues when the
> >> coordinates are within the frame.
> >>
> >> If we think we'd better not let user get more info from the warning message,
> >> I'm ok to remove them.
> >>
> >> I'll remove them if there's another comment supporting the removal, and
> >> there's no objection.
> >>
> >> >
> >> >
> >> > >> >> >> > diff --git a/libavutil/frame.h b/libavutil/frame.h
> >> > >> >> >> > index a5ed91b20a..41e22de02a 100644
> >> > >> >> >> > --- a/libavutil/frame.h
> >> > >> >> >> > +++ b/libavutil/frame.h
> >> > >> >> >> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
> >> > >> >> >> >  * Must be present for every frame which should have film grain
> >> > >> applied.
> >> > >> >> >> >  */
> >> > >> >> >> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
> >> > >> >> >> > +
> >> > >> >> >> > +    /**
> >> > >> >> >> > +     * Bounding boxes for object detection and classification, the
> >> > >> data is
> >> > >> >> a
> >> > >> >> >> AVBoundingBoxHeader
> >> > >> >> >> > +     * followed with an array of AVBoudingBox, and
> >> > >> >> >> AVBoundingBoxHeader.nb_bboxes is the number
> >> > >> >> >> > +     * of array element.
> >> > >> >> >> > +     */
> >> > >> >> >> > +    AV_FRAME_DATA_BOUNDING_BOXES,
> >> > >> >> >> >  };
> >> > >> >> >> >
> >> > >> >> >>
> >> > >> >> >> Finally, why call it a Bounding Box? It's not descriptive at all.
> >> > >> >> >> How about "Object Classification"? It makes much more sense, it's
> >> > >> >> >> exactly what this is. So AVObjectClassification, AVObjectClassification,
> >> > >> >> >> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
> >> > >> >> >>
> >> > >> >> >
> >> > >> >> > In object detection papers, bounding box is usually used.
> >> > >> >> > We'd better use the same term, imho, thanks.
> >> > >> >> >
> >> > >> >>
> >> > >> >> Not in this case, API users won't have any idea what this is or what
> >> > >> >> it's for. This is user-facing code after all.
> >> > >> >> Papers in fields can get away with overloading language, but we're
> >> > >> >> trying to make a concise API. Object classification makes sense
> >> > >> >> because this is exactly what this is.
> >> > >> >>
> >> > >> >
> >> > >> > The term bounding box is dominating the domain, for example, even
> >> > >> > HEVC spec uses this term, see page 317 of
> >> > >> >
> >> > >>
> >> > https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-H.265-201911-I!!P
> >> > >> DF-E&type=items
> >> > >> >
> >> > >> > also copy some here for your convenient.
> >> > >> > ar_bounding_box_top[ ar_object_idx[ i ] ] u(16)
> >> > >> > ar_bounding_box_left[ ar_object_idx[ i ] ] u(16)
> >> > >> > ar_bounding_box_width[ ar_object_idx[ i ] ] u(16)
> >> > >> > ar_bounding_box_height[ ar_object_idx[ i ] ] u(16)
> >> > >> >
> >> > >> > I would prefer to use bounding box.
> >> > >> >
> >> > >>
> >> > >> It's for an entirely different thing, and like I said, it's just an overloaded
> >> > >> language because they can get away. We're trying to be generic.
> >> > >> This side data is for detecting _and_ classifying objects. In fact, most of
> >> > >> the structure is dedicated towards classifying. If you'd like users to actually
> >> > >> use this, give it a good name and don't leave them guessing what this
> >> > >> structure is by throwing vague jargon some other paper or spec has
> >> > >> because it's close enough.
> >> > >>
> >> > >
> >> > > all the people in the domain accepts bounding box, they can understand this
> >> > > struct name easily and clearly, they might be bothered if we use another
> >> > name.
> >> > >
> >> > > btw, AVObjectClassification confuses people who just want object detection.
> >> > >
> >> >
> >> > As I said, the name "bounding box" makes no sense once it gets overloaded
> >> > with object classification.
> >>
> >> dnn_detect creates an array of 'bounding box' for all detected objects, and
> >> dnn_classify assigns attributes for a set of bounding boxes (with same object
> >> id). 'bounding box' serves both detection and classification properly.
> >>
> >>
> >> > Object classification is still the main use of the filters,
> >> > because the original proposal was to have all of this info be ffmpeg-private,
> >> > which would forbid simple object detection.
> >>
> >> The original proposal is to add it as side data which is ffmpeg-public, and then,
> >> we spent much time discussing/trying with ffmpeg-private as an temporary
> >> method, and since it is not good to be temporary, we now switch back to
> >> ffmpeg-public.
> >>
> >> During the whole period, we don't have any intention to
> >> 'forbid simple object detection', not quite understand your point here.
> >>
> >>
> >> > So I still maintain this should be called "Object classification". I'd accept
> >> > "Object detection" as well, but definitely not "bounding box".
> >>
> >> imho, ' Object detection' and ' Object classification' are worse, they just
> >> describe one aspect of the struct. The users might just use filter dnn_detect,
> >> they might use filters dnn_detect + dnn_classify.
> >>
> >>
> >> >
> >> > Since the decision was made to make the side data public, we have to make
> >> > very sure it contains no hacks or is impossible to extend, since we don't want
> >> > to have an
> >> > "AV_SIDE_DATA_OBJECT_CLASSIFICATION_VERSION_2_SORRY_WE_SCREWED_
> >> > UP"
> >> > faster than you can say "Recursive cascade correlation artificial neural
> >> > networks".
> >>
> >> sorry, not quite understand your point here.
> >>
> >> 'bounding box' is designed for general purpose to contain the info for
> >> detection/classification. It doesn't matter which DNN model is used, it doesn't
> >> matter if a traditional algorithm (non-dnn) is used.
> >>
> >> I'm open to use a better name. And bounding box is the best one for me till now.
> >> Everyone in the domain knows the exact meaning of bounding box without
> >> extra explanation. This word has been extended/evolved with such meaning in
> >> this domain.
> >>
> > +1
> >
> > I think it is wise to use the name which is widely used in the field.
> >
>
> Way to go, ignoring half the exchange to voice an opinion after we
> came to an agreement.
Indeed that's the opposite, I gave my opinion *precisely* because I'm
following the whole discussion.
IMHO, ignoring a naming standard in a whole field is not wise.

> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Lynne April 9, 2021, 7:20 p.m. UTC | #23
Apr 9, 2021, 20:08 by bygrandao@gmail.com:

> Em sex., 9 de abr. de 2021 às 12:15, Lynne <dev@lynne.ee> escreveu:
>
>>
>> Apr 9, 2021, 16:35 by bygrandao@gmail.com:
>>
>> > Em sex., 9 de abr. de 2021 às 01:13, Guo, Yejun <yejun.guo@intel.com> escreveu:
>> >
>> >>
>> >>
>> >>
>> >> > -----Original Message-----
>> >> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Lynne
>> >> > Sent: 2021年4月9日 0:57
>> >> > To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
>> >> > Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
>> >> > AV_FRAME_DATA_BOUNDING_BOXES
>> >> >
>> >>
>> >> First of all, thanks for the quick replies, I see, all the discussions/comments are to
>> >> make this patch better, thank you.
>> >>
>> >> > >> >
>> >> > >> >> >> > +
>> >> > >> >> >> > +typedef struct AVBoundingBoxHeader {
>> >> > >> >> >> > +    /**
>> >> > >> >> >> > +     * Information about how the bounding box is generated.
>> >> > >> >> >> > +     * for example, the DNN model name.
>> >> > >> >> >> > +     */
>> >> > >> >> >> > +    char source[128];
>> >> > >> >> >> > +
>> >> > >> >> >> > +    /**
>> >> > >> >> >> > +     * The size of frame when it is detected.
>> >> > >> >> >> > +     */
>> >> > >> >> >> > +    int frame_width;
>> >> > >> >> >> > +    int frame_height;
>> >> > >> >> >> >
>> >> > >> >> >>
>> >> > >> >> >> Why? This side data is attached to AVFrames only, where we
>> >> > >> >> >> already have width and height.
>> >> > >> >> >>
>> >> > >> >> >
>> >> > >> >> > The detection result will be used by other filters, for example,
>> >> > >> >> > dnn_classify (see https://github.com/guoyejun/ffmpeg/tree/classify).
>> >> > >> >> >
>> >> > >> >> > The filter dnn_detect detects all the objects (cat, dog, person ...) in a
>> >> > >> >> > frame, while dnn_classify classifies one detected object (for example,
>> >> > >> person)
>> >> > >> >> > for its attribute (for example, emotion, etc.)
>> >> > >> >> >
>> >> > >> >> > The filter dnn_classify have to check if the frame size is changed after
>> >> > >> >> > it is detected, to handle the below filter chain:
>> >> > >> >> > dnn_detect -> scale -> dnn_classify
>> >> > >> >> >
>> >> > >> >>
>> >> > >> >> This doesn't look good. Why is dnn_classify needing to know
>> >> > >> >> the original frame size at all?
>> >> > >> >>
>> >> > >> >
>> >> > >> > For example, the original size of the frame is 100*100, and dnn_detect
>> >> > >> > detects a face at place (10, 10) -> (30, 40), such data will be saved in
>> >> > >> > AVBoundingBox.top/left/right/bottom.
>> >> > >> >
>> >> > >> > Then, the frame is scaled into 50*50.
>> >> > >> >
>> >> > >> > Then, dnn_classify is used to analyze the emotion of the face, it needs to
>> >> > >> > know the frame size (100*100) when it is detected, otherwise, it does not
>> >> > >> > work with just (10,10), (30,40) and 50*50.
>> >> > >> >
>> >> > >>
>> >> > >> Why can't the scale filter also rescale this side data as well?
>> >> > >>
>> >> > >
>> >> > > I'm afraid that we could not make sure all such filters (including filters in the
>> >> > > future) to do the rescale. And in the previous discussion, I got to know that
>> >> > > 'many other existing side-data types are invalidated by scaling'.
>> >> > >
>> >> > > So, we need frame_width and frame_height here.
>> >> > >
>> >> >
>> >> > No, you don't. You just need to make sure filters which change resolution
>> >> > or do cropping also change the side data parameters.
>> >> > It's called maintainership. As-is, this won't even work with cropping,
>> >> > only with basic aspect ratio preserving scaling.
>> >> > For the lack of a better term, this is a hack.
>> >>
>> >> As discussed in previous email, for the frame size change case, dnn_classify
>> >> (and other filters which use the detection result, for example drawbox) can
>> >> just output a warning message to tell user what happens, and don't do the
>> >> classification, otherwise, it will give a wrong/weird result which makes the
>> >> user confused.
>> >>
>> >> >
>> >> > I would accept just specifying that if the frame dimensions are
>> >> > altered in any way, the side-data is no longer valid and it's up
>> >> > to users to figure that out by out of bound coordinates.
>> >> > This is what we currently do with video_enc_params.
>> >>
>> >> frame_width/frame_height is not perfect (for the cases such as: scale down
>> >> + crop + scale up to the same size), but it provides more info than the checking
>> >> of 'out of bound coordinates'. There are many other possible issues when the
>> >> coordinates are within the frame.
>> >>
>> >> If we think we'd better not let user get more info from the warning message,
>> >> I'm ok to remove them.
>> >>
>> >> I'll remove them if there's another comment supporting the removal, and
>> >> there's no objection.
>> >>
>> >> >
>> >> >
>> >> > >> >> >> > diff --git a/libavutil/frame.h b/libavutil/frame.h
>> >> > >> >> >> > index a5ed91b20a..41e22de02a 100644
>> >> > >> >> >> > --- a/libavutil/frame.h
>> >> > >> >> >> > +++ b/libavutil/frame.h
>> >> > >> >> >> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
>> >> > >> >> >> >  * Must be present for every frame which should have film grain
>> >> > >> applied.
>> >> > >> >> >> >  */
>> >> > >> >> >> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
>> >> > >> >> >> > +
>> >> > >> >> >> > +    /**
>> >> > >> >> >> > +     * Bounding boxes for object detection and classification, the
>> >> > >> data is
>> >> > >> >> a
>> >> > >> >> >> AVBoundingBoxHeader
>> >> > >> >> >> > +     * followed with an array of AVBoudingBox, and
>> >> > >> >> >> AVBoundingBoxHeader.nb_bboxes is the number
>> >> > >> >> >> > +     * of array element.
>> >> > >> >> >> > +     */
>> >> > >> >> >> > +    AV_FRAME_DATA_BOUNDING_BOXES,
>> >> > >> >> >> >  };
>> >> > >> >> >> >
>> >> > >> >> >>
>> >> > >> >> >> Finally, why call it a Bounding Box? It's not descriptive at all.
>> >> > >> >> >> How about "Object Classification"? It makes much more sense, it's
>> >> > >> >> >> exactly what this is. So AVObjectClassification, AVObjectClassification,
>> >> > >> >> >> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
>> >> > >> >> >>
>> >> > >> >> >
>> >> > >> >> > In object detection papers, bounding box is usually used.
>> >> > >> >> > We'd better use the same term, imho, thanks.
>> >> > >> >> >
>> >> > >> >>
>> >> > >> >> Not in this case, API users won't have any idea what this is or what
>> >> > >> >> it's for. This is user-facing code after all.
>> >> > >> >> Papers in fields can get away with overloading language, but we're
>> >> > >> >> trying to make a concise API. Object classification makes sense
>> >> > >> >> because this is exactly what this is.
>> >> > >> >>
>> >> > >> >
>> >> > >> > The term bounding box is dominating the domain, for example, even
>> >> > >> > HEVC spec uses this term, see page 317 of
>> >> > >> >
>> >> > >>
>> >> > https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-H.265-201911-I!!P
>> >> > >> DF-E&type=items
>> >> > >> >
>> >> > >> > also copy some here for your convenient.
>> >> > >> > ar_bounding_box_top[ ar_object_idx[ i ] ] u(16)
>> >> > >> > ar_bounding_box_left[ ar_object_idx[ i ] ] u(16)
>> >> > >> > ar_bounding_box_width[ ar_object_idx[ i ] ] u(16)
>> >> > >> > ar_bounding_box_height[ ar_object_idx[ i ] ] u(16)
>> >> > >> >
>> >> > >> > I would prefer to use bounding box.
>> >> > >> >
>> >> > >>
>> >> > >> It's for an entirely different thing, and like I said, it's just an overloaded
>> >> > >> language because they can get away. We're trying to be generic.
>> >> > >> This side data is for detecting _and_ classifying objects. In fact, most of
>> >> > >> the structure is dedicated towards classifying. If you'd like users to actually
>> >> > >> use this, give it a good name and don't leave them guessing what this
>> >> > >> structure is by throwing vague jargon some other paper or spec has
>> >> > >> because it's close enough.
>> >> > >>
>> >> > >
>> >> > > all the people in the domain accepts bounding box, they can understand this
>> >> > > struct name easily and clearly, they might be bothered if we use another
>> >> > name.
>> >> > >
>> >> > > btw, AVObjectClassification confuses people who just want object detection.
>> >> > >
>> >> >
>> >> > As I said, the name "bounding box" makes no sense once it gets overloaded
>> >> > with object classification.
>> >>
>> >> dnn_detect creates an array of 'bounding box' for all detected objects, and
>> >> dnn_classify assigns attributes for a set of bounding boxes (with same object
>> >> id). 'bounding box' serves both detection and classification properly.
>> >>
>> >>
>> >> > Object classification is still the main use of the filters,
>> >> > because the original proposal was to have all of this info be ffmpeg-private,
>> >> > which would forbid simple object detection.
>> >>
>> >> The original proposal is to add it as side data which is ffmpeg-public, and then,
>> >> we spent much time discussing/trying with ffmpeg-private as an temporary
>> >> method, and since it is not good to be temporary, we now switch back to
>> >> ffmpeg-public.
>> >>
>> >> During the whole period, we don't have any intention to
>> >> 'forbid simple object detection', not quite understand your point here.
>> >>
>> >>
>> >> > So I still maintain this should be called "Object classification". I'd accept
>> >> > "Object detection" as well, but definitely not "bounding box".
>> >>
>> >> imho, ' Object detection' and ' Object classification' are worse, they just
>> >> describe one aspect of the struct. The users might just use filter dnn_detect,
>> >> they might use filters dnn_detect + dnn_classify.
>> >>
>> >>
>> >> >
>> >> > Since the decision was made to make the side data public, we have to make
>> >> > very sure it contains no hacks or is impossible to extend, since we don't want
>> >> > to have an
>> >> > "AV_SIDE_DATA_OBJECT_CLASSIFICATION_VERSION_2_SORRY_WE_SCREWED_
>> >> > UP"
>> >> > faster than you can say "Recursive cascade correlation artificial neural
>> >> > networks".
>> >>
>> >> sorry, not quite understand your point here.
>> >>
>> >> 'bounding box' is designed for general purpose to contain the info for
>> >> detection/classification. It doesn't matter which DNN model is used, it doesn't
>> >> matter if a traditional algorithm (non-dnn) is used.
>> >>
>> >> I'm open to use a better name. And bounding box is the best one for me till now.
>> >> Everyone in the domain knows the exact meaning of bounding box without
>> >> extra explanation. This word has been extended/evolved with such meaning in
>> >> this domain.
>> >>
>> > +1
>> >
>> > I think it is wise to use the name which is widely used in the field.
>> >
>>
>> Way to go, ignoring half the exchange to voice an opinion after we
>> came to an agreement.
>>
> Indeed that's the opposite, I gave my opinion *precisely* because I'm
> following the whole discussion.
> IMHO, ignoring a naming standard in a whole field is not wise.
>

The convention is vague, broad, inappropriate, non-specific, incomplete,
will be mistaken for something it is not, something that's not it not will be
mistaken for what it is, and finally it doesn't cover the second part of what
it will be used for - classification.
We shouldn't follow a pseudo-naming convention if it clashes with other
parts of our code and isn't descriptive to the degree I described.
Guo, Yejun April 10, 2021, 9:44 a.m. UTC | #24
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Lynne
> Sent: 2021年4月9日 23:16
> To: FFmpeg development discussions and patches
> <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> Apr 9, 2021, 16:35 by bygrandao@gmail.com:
> 
> > Em sex., 9 de abr. de 2021 às 01:13, Guo, Yejun <yejun.guo@intel.com>
> escreveu:
> >
> >>
> >>
> >>
> >> > -----Original Message-----
> >> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf
> Of Lynne
> >> > Sent: 2021年4月9日 0:57
> >> > To: FFmpeg development discussions and patches
> <ffmpeg-devel@ffmpeg.org>
> >> > Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> >> > AV_FRAME_DATA_BOUNDING_BOXES
> >> >
> >>
> >> First of all, thanks for the quick replies, I see, all the
> discussions/comments are to
> >> make this patch better, thank you.
> >>
> >> > >> >
> >> > >> >> >> > +
> >> > >> >> >> > +typedef struct AVBoundingBoxHeader {
> >> > >> >> >> > +    /**
> >> > >> >> >> > +     * Information about how the bounding box is
> generated.
> >> > >> >> >> > +     * for example, the DNN model name.
> >> > >> >> >> > +     */
> >> > >> >> >> > +    char source[128];
> >> > >> >> >> > +
> >> > >> >> >> > +    /**
> >> > >> >> >> > +     * The size of frame when it is detected.
> >> > >> >> >> > +     */
> >> > >> >> >> > +    int frame_width;
> >> > >> >> >> > +    int frame_height;
> >> > >> >> >> >
> >> > >> >> >>
> >> > >> >> >> Why? This side data is attached to AVFrames only, where we
> >> > >> >> >> already have width and height.
> >> > >> >> >>
> >> > >> >> >
> >> > >> >> > The detection result will be used by other filters, for example,
> >> > >> >> > dnn_classify (see
> https://github.com/guoyejun/ffmpeg/tree/classify).
> >> > >> >> >
> >> > >> >> > The filter dnn_detect detects all the objects (cat, dog,
> person ...) in a
> >> > >> >> > frame, while dnn_classify classifies one detected object (for
> example,
> >> > >> person)
> >> > >> >> > for its attribute (for example, emotion, etc.)
> >> > >> >> >
> >> > >> >> > The filter dnn_classify have to check if the frame size is
> changed after
> >> > >> >> > it is detected, to handle the below filter chain:
> >> > >> >> > dnn_detect -> scale -> dnn_classify
> >> > >> >> >
> >> > >> >>
> >> > >> >> This doesn't look good. Why is dnn_classify needing to know
> >> > >> >> the original frame size at all?
> >> > >> >>
> >> > >> >
> >> > >> > For example, the original size of the frame is 100*100, and
> dnn_detect
> >> > >> > detects a face at place (10, 10) -> (30, 40), such data will be saved
> in
> >> > >> > AVBoundingBox.top/left/right/bottom.
> >> > >> >
> >> > >> > Then, the frame is scaled into 50*50.
> >> > >> >
> >> > >> > Then, dnn_classify is used to analyze the emotion of the face, it
> needs to
> >> > >> > know the frame size (100*100) when it is detected, otherwise, it
> does not
> >> > >> > work with just (10,10), (30,40) and 50*50.
> >> > >> >
> >> > >>
> >> > >> Why can't the scale filter also rescale this side data as well?
> >> > >>
> >> > >
> >> > > I'm afraid that we could not make sure all such filters (including
> filters in the
> >> > > future) to do the rescale. And in the previous discussion, I got to
> know that
> >> > > 'many other existing side-data types are invalidated by scaling'.
> >> > >
> >> > > So, we need frame_width and frame_height here.
> >> > >
> >> >
> >> > No, you don't. You just need to make sure filters which change
> resolution
> >> > or do cropping also change the side data parameters.
> >> > It's called maintainership. As-is, this won't even work with cropping,
> >> > only with basic aspect ratio preserving scaling.
> >> > For the lack of a better term, this is a hack.
> >>
> >> As discussed in previous email, for the frame size change case,
> dnn_classify
> >> (and other filters which use the detection result, for example drawbox)
> can
> >> just output a warning message to tell user what happens, and don't do
> the
> >> classification, otherwise, it will give a wrong/weird result which makes
> the
> >> user confused.
> >>
> >> >
> >> > I would accept just specifying that if the frame dimensions are
> >> > altered in any way, the side-data is no longer valid and it's up
> >> > to users to figure that out by out of bound coordinates.
> >> > This is what we currently do with video_enc_params.
> >>
> >> frame_width/frame_height is not perfect (for the cases such as: scale
> down
> >> + crop + scale up to the same size), but it provides more info than the
> checking
> >> of 'out of bound coordinates'. There are many other possible issues
> when the
> >> coordinates are within the frame.
> >>
> >> If we think we'd better not let user get more info from the warning
> message,
> >> I'm ok to remove them.
> >>
> >> I'll remove them if there's another comment supporting the removal,
> and
> >> there's no objection.
> >>
> >> >
> >> >
> >> > >> >> >> > diff --git a/libavutil/frame.h b/libavutil/frame.h
> >> > >> >> >> > index a5ed91b20a..41e22de02a 100644
> >> > >> >> >> > --- a/libavutil/frame.h
> >> > >> >> >> > +++ b/libavutil/frame.h
> >> > >> >> >> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
> >> > >> >> >> >  * Must be present for every frame which should have
> film grain
> >> > >> applied.
> >> > >> >> >> >  */
> >> > >> >> >> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
> >> > >> >> >> > +
> >> > >> >> >> > +    /**
> >> > >> >> >> > +     * Bounding boxes for object detection and
> classification, the
> >> > >> data is
> >> > >> >> a
> >> > >> >> >> AVBoundingBoxHeader
> >> > >> >> >> > +     * followed with an array of AVBoudingBox, and
> >> > >> >> >> AVBoundingBoxHeader.nb_bboxes is the number
> >> > >> >> >> > +     * of array element.
> >> > >> >> >> > +     */
> >> > >> >> >> > +    AV_FRAME_DATA_BOUNDING_BOXES,
> >> > >> >> >> >  };
> >> > >> >> >> >
> >> > >> >> >>
> >> > >> >> >> Finally, why call it a Bounding Box? It's not descriptive at all.
> >> > >> >> >> How about "Object Classification"? It makes much more
> sense, it's
> >> > >> >> >> exactly what this is. So AVObjectClassification,
> AVObjectClassification,
> >> > >> >> >> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
> >> > >> >> >>
> >> > >> >> >
> >> > >> >> > In object detection papers, bounding box is usually used.
> >> > >> >> > We'd better use the same term, imho, thanks.
> >> > >> >> >
> >> > >> >>
> >> > >> >> Not in this case, API users won't have any idea what this is or
> what
> >> > >> >> it's for. This is user-facing code after all.
> >> > >> >> Papers in fields can get away with overloading language, but
> we're
> >> > >> >> trying to make a concise API. Object classification makes sense
> >> > >> >> because this is exactly what this is.
> >> > >> >>
> >> > >> >
> >> > >> > The term bounding box is dominating the domain, for example,
> even
> >> > >> > HEVC spec uses this term, see page 317 of
> >> > >> >
> >> > >>
> >> >
> https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-H.265-201911-I
> !!P
> >> > >> DF-E&type=items
> >> > >> >
> >> > >> > also copy some here for your convenient.
> >> > >> > ar_bounding_box_top[ ar_object_idx[ i ] ] u(16)
> >> > >> > ar_bounding_box_left[ ar_object_idx[ i ] ] u(16)
> >> > >> > ar_bounding_box_width[ ar_object_idx[ i ] ] u(16)
> >> > >> > ar_bounding_box_height[ ar_object_idx[ i ] ] u(16)
> >> > >> >
> >> > >> > I would prefer to use bounding box.
> >> > >> >
> >> > >>
> >> > >> It's for an entirely different thing, and like I said, it's just an
> overloaded
> >> > >> language because they can get away. We're trying to be generic.
> >> > >> This side data is for detecting _and_ classifying objects. In fact,
> most of
> >> > >> the structure is dedicated towards classifying. If you'd like users to
> actually
> >> > >> use this, give it a good name and don't leave them guessing what
> this
> >> > >> structure is by throwing vague jargon some other paper or spec
> has
> >> > >> because it's close enough.
> >> > >>
> >> > >
> >> > > all the people in the domain accepts bounding box, they can
> understand this
> >> > > struct name easily and clearly, they might be bothered if we use
> another
> >> > name.
> >> > >
> >> > > btw, AVObjectClassification confuses people who just want object
> detection.
> >> > >
> >> >
> >> > As I said, the name "bounding box" makes no sense once it gets
> overloaded
> >> > with object classification.
> >>
> >> dnn_detect creates an array of 'bounding box' for all detected objects,
> and
> >> dnn_classify assigns attributes for a set of bounding boxes (with same
> object
> >> id). 'bounding box' serves both detection and classification properly.
> >>
> >>
> >> > Object classification is still the main use of the filters,
> >> > because the original proposal was to have all of this info be
> ffmpeg-private,
> >> > which would forbid simple object detection.
> >>
> >> The original proposal is to add it as side data which is ffmpeg-public,
> and then,
> >> we spent much time discussing/trying with ffmpeg-private as an
> temporary
> >> method, and since it is not good to be temporary, we now switch back
> to
> >> ffmpeg-public.
> >>
> >> During the whole period, we don't have any intention to
> >> 'forbid simple object detection', not quite understand your point here.
> >>
> >>
> >> > So I still maintain this should be called "Object classification". I'd
> accept
> >> > "Object detection" as well, but definitely not "bounding box".
> >>
> >> imho, ' Object detection' and ' Object classification' are worse, they just
> >> describe one aspect of the struct. The users might just use filter
> dnn_detect,
> >> they might use filters dnn_detect + dnn_classify.
> >>
> >>
> >> >
> >> > Since the decision was made to make the side data public, we have to
> make
> >> > very sure it contains no hacks or is impossible to extend, since we
> don't want
> >> > to have an
> >> >
> "AV_SIDE_DATA_OBJECT_CLASSIFICATION_VERSION_2_SORRY_WE_SCREWE
> D_
> >> > UP"
> >> > faster than you can say "Recursive cascade correlation artificial neural
> >> > networks".
> >>
> >> sorry, not quite understand your point here.
> >>
> >> 'bounding box' is designed for general purpose to contain the info for
> >> detection/classification. It doesn't matter which DNN model is used, it
> doesn't
> >> matter if a traditional algorithm (non-dnn) is used.
> >>
> >> I'm open to use a better name. And bounding box is the best one for
> me till now.
> >> Everyone in the domain knows the exact meaning of bounding box
> without
> >> extra explanation. This word has been extended/evolved with such
> meaning in
> >> this domain.
> >>
> > +1
> >
> > I think it is wise to use the name which is widely used in the field.
> >
> 
> Way to go, ignoring half the exchange to voice an opinion after we
> came to an agreement.

Some people take days for a thorough thinking and then give comment. 
It's nice and welcome. I even received some comments in the last day
before pushing.

I might be too busy for preparing the next version patch (we have some
video analytic feature codes/plans depending on this one, and hope to
move things forward better). I need to wait for several more days to get
more comments.

My point is that bounding box is a good (might the best) name if we have
object detection/classification background, it might be not the best name
without such background.

Anyway, I'm open to use a better name.
Guo, Yejun April 10, 2021, 9:52 a.m. UTC | #25
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Lynne
> Sent: 2021年4月10日 3:20
> To: FFmpeg development discussions and patches
> <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> Apr 9, 2021, 20:08 by bygrandao@gmail.com:
> 
> > Em sex., 9 de abr. de 2021 às 12:15, Lynne <dev@lynne.ee> escreveu:
> >
> >>
> >> Apr 9, 2021, 16:35 by bygrandao@gmail.com:
> >>
> >> > Em sex., 9 de abr. de 2021 às 01:13, Guo, Yejun <yejun.guo@intel.com>
> escreveu:
> >> >
> >> >>
> >> >>
> >> >>
> >> >> > -----Original Message-----
> >> >> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On
> Behalf Of Lynne
> >> >> > Sent: 2021年4月9日 0:57
> >> >> > To: FFmpeg development discussions and patches
> <ffmpeg-devel@ffmpeg.org>
> >> >> > Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> >> >> > AV_FRAME_DATA_BOUNDING_BOXES
> >> >> >
> >> >>
> >> >> First of all, thanks for the quick replies, I see, all the
> discussions/comments are to
> >> >> make this patch better, thank you.
> >> >>
> >> >> > >> >
> >> >> > >> >> >> > +
> >> >> > >> >> >> > +typedef struct AVBoundingBoxHeader {
> >> >> > >> >> >> > +    /**
> >> >> > >> >> >> > +     * Information about how the bounding box is
> generated.
> >> >> > >> >> >> > +     * for example, the DNN model name.
> >> >> > >> >> >> > +     */
> >> >> > >> >> >> > +    char source[128];
> >> >> > >> >> >> > +
> >> >> > >> >> >> > +    /**
> >> >> > >> >> >> > +     * The size of frame when it is detected.
> >> >> > >> >> >> > +     */
> >> >> > >> >> >> > +    int frame_width;
> >> >> > >> >> >> > +    int frame_height;
> >> >> > >> >> >> >
> >> >> > >> >> >>
> >> >> > >> >> >> Why? This side data is attached to AVFrames only, where
> we
> >> >> > >> >> >> already have width and height.
> >> >> > >> >> >>
> >> >> > >> >> >
> >> >> > >> >> > The detection result will be used by other filters, for
> example,
> >> >> > >> >> > dnn_classify (see
> https://github.com/guoyejun/ffmpeg/tree/classify).
> >> >> > >> >> >
> >> >> > >> >> > The filter dnn_detect detects all the objects (cat, dog,
> person ...) in a
> >> >> > >> >> > frame, while dnn_classify classifies one detected object (for
> example,
> >> >> > >> person)
> >> >> > >> >> > for its attribute (for example, emotion, etc.)
> >> >> > >> >> >
> >> >> > >> >> > The filter dnn_classify have to check if the frame size is
> changed after
> >> >> > >> >> > it is detected, to handle the below filter chain:
> >> >> > >> >> > dnn_detect -> scale -> dnn_classify
> >> >> > >> >> >
> >> >> > >> >>
> >> >> > >> >> This doesn't look good. Why is dnn_classify needing to know
> >> >> > >> >> the original frame size at all?
> >> >> > >> >>
> >> >> > >> >
> >> >> > >> > For example, the original size of the frame is 100*100, and
> dnn_detect
> >> >> > >> > detects a face at place (10, 10) -> (30, 40), such data will be
> saved in
> >> >> > >> > AVBoundingBox.top/left/right/bottom.
> >> >> > >> >
> >> >> > >> > Then, the frame is scaled into 50*50.
> >> >> > >> >
> >> >> > >> > Then, dnn_classify is used to analyze the emotion of the face,
> it needs to
> >> >> > >> > know the frame size (100*100) when it is detected, otherwise,
> it does not
> >> >> > >> > work with just (10,10), (30,40) and 50*50.
> >> >> > >> >
> >> >> > >>
> >> >> > >> Why can't the scale filter also rescale this side data as well?
> >> >> > >>
> >> >> > >
> >> >> > > I'm afraid that we could not make sure all such filters (including
> filters in the
> >> >> > > future) to do the rescale. And in the previous discussion, I got to
> know that
> >> >> > > 'many other existing side-data types are invalidated by scaling'.
> >> >> > >
> >> >> > > So, we need frame_width and frame_height here.
> >> >> > >
> >> >> >
> >> >> > No, you don't. You just need to make sure filters which change
> resolution
> >> >> > or do cropping also change the side data parameters.
> >> >> > It's called maintainership. As-is, this won't even work with
> cropping,
> >> >> > only with basic aspect ratio preserving scaling.
> >> >> > For the lack of a better term, this is a hack.
> >> >>
> >> >> As discussed in previous email, for the frame size change case,
> dnn_classify
> >> >> (and other filters which use the detection result, for example
> drawbox) can
> >> >> just output a warning message to tell user what happens, and don't
> do the
> >> >> classification, otherwise, it will give a wrong/weird result which
> makes the
> >> >> user confused.
> >> >>
> >> >> >
> >> >> > I would accept just specifying that if the frame dimensions are
> >> >> > altered in any way, the side-data is no longer valid and it's up
> >> >> > to users to figure that out by out of bound coordinates.
> >> >> > This is what we currently do with video_enc_params.
> >> >>
> >> >> frame_width/frame_height is not perfect (for the cases such as: scale
> down
> >> >> + crop + scale up to the same size), but it provides more info than
> the checking
> >> >> of 'out of bound coordinates'. There are many other possible issues
> when the
> >> >> coordinates are within the frame.
> >> >>
> >> >> If we think we'd better not let user get more info from the warning
> message,
> >> >> I'm ok to remove them.
> >> >>
> >> >> I'll remove them if there's another comment supporting the removal,
> and
> >> >> there's no objection.
> >> >>
> >> >> >
> >> >> >
> >> >> > >> >> >> > diff --git a/libavutil/frame.h b/libavutil/frame.h
> >> >> > >> >> >> > index a5ed91b20a..41e22de02a 100644
> >> >> > >> >> >> > --- a/libavutil/frame.h
> >> >> > >> >> >> > +++ b/libavutil/frame.h
> >> >> > >> >> >> > @@ -198,6 +198,13 @@ enum AVFrameSideDataType {
> >> >> > >> >> >> >  * Must be present for every frame which should have
> film grain
> >> >> > >> applied.
> >> >> > >> >> >> >  */
> >> >> > >> >> >> >  AV_FRAME_DATA_FILM_GRAIN_PARAMS,
> >> >> > >> >> >> > +
> >> >> > >> >> >> > +    /**
> >> >> > >> >> >> > +     * Bounding boxes for object detection and
> classification, the
> >> >> > >> data is
> >> >> > >> >> a
> >> >> > >> >> >> AVBoundingBoxHeader
> >> >> > >> >> >> > +     * followed with an array of AVBoudingBox, and
> >> >> > >> >> >> AVBoundingBoxHeader.nb_bboxes is the number
> >> >> > >> >> >> > +     * of array element.
> >> >> > >> >> >> > +     */
> >> >> > >> >> >> > +    AV_FRAME_DATA_BOUNDING_BOXES,
> >> >> > >> >> >> >  };
> >> >> > >> >> >> >
> >> >> > >> >> >>
> >> >> > >> >> >> Finally, why call it a Bounding Box? It's not descriptive at
> all.
> >> >> > >> >> >> How about "Object Classification"? It makes much more
> sense, it's
> >> >> > >> >> >> exactly what this is. So AVObjectClassification,
> AVObjectClassification,
> >> >> > >> >> >> AV_FRAME_DATA_OBJECT_CLASSIFICATION and so on.
> >> >> > >> >> >>
> >> >> > >> >> >
> >> >> > >> >> > In object detection papers, bounding box is usually used.
> >> >> > >> >> > We'd better use the same term, imho, thanks.
> >> >> > >> >> >
> >> >> > >> >>
> >> >> > >> >> Not in this case, API users won't have any idea what this is or
> what
> >> >> > >> >> it's for. This is user-facing code after all.
> >> >> > >> >> Papers in fields can get away with overloading language, but
> we're
> >> >> > >> >> trying to make a concise API. Object classification makes
> sense
> >> >> > >> >> because this is exactly what this is.
> >> >> > >> >>
> >> >> > >> >
> >> >> > >> > The term bounding box is dominating the domain, for
> example, even
> >> >> > >> > HEVC spec uses this term, see page 317 of
> >> >> > >> >
> >> >> > >>
> >> >> >
> https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-H.265-201911-I
> !!P
> >> >> > >> DF-E&type=items
> >> >> > >> >
> >> >> > >> > also copy some here for your convenient.
> >> >> > >> > ar_bounding_box_top[ ar_object_idx[ i ] ] u(16)
> >> >> > >> > ar_bounding_box_left[ ar_object_idx[ i ] ] u(16)
> >> >> > >> > ar_bounding_box_width[ ar_object_idx[ i ] ] u(16)
> >> >> > >> > ar_bounding_box_height[ ar_object_idx[ i ] ] u(16)
> >> >> > >> >
> >> >> > >> > I would prefer to use bounding box.
> >> >> > >> >
> >> >> > >>
> >> >> > >> It's for an entirely different thing, and like I said, it's just an
> overloaded
> >> >> > >> language because they can get away. We're trying to be generic.
> >> >> > >> This side data is for detecting _and_ classifying objects. In fact,
> most of
> >> >> > >> the structure is dedicated towards classifying. If you'd like users
> to actually
> >> >> > >> use this, give it a good name and don't leave them guessing
> what this
> >> >> > >> structure is by throwing vague jargon some other paper or spec
> has
> >> >> > >> because it's close enough.
> >> >> > >>
> >> >> > >
> >> >> > > all the people in the domain accepts bounding box, they can
> understand this
> >> >> > > struct name easily and clearly, they might be bothered if we use
> another
> >> >> > name.
> >> >> > >
> >> >> > > btw, AVObjectClassification confuses people who just want object
> detection.
> >> >> > >
> >> >> >
> >> >> > As I said, the name "bounding box" makes no sense once it gets
> overloaded
> >> >> > with object classification.
> >> >>
> >> >> dnn_detect creates an array of 'bounding box' for all detected
> objects, and
> >> >> dnn_classify assigns attributes for a set of bounding boxes (with
> same object
> >> >> id). 'bounding box' serves both detection and classification properly.
> >> >>
> >> >>
> >> >> > Object classification is still the main use of the filters,
> >> >> > because the original proposal was to have all of this info be
> ffmpeg-private,
> >> >> > which would forbid simple object detection.
> >> >>
> >> >> The original proposal is to add it as side data which is ffmpeg-public,
> and then,
> >> >> we spent much time discussing/trying with ffmpeg-private as an
> temporary
> >> >> method, and since it is not good to be temporary, we now switch
> back to
> >> >> ffmpeg-public.
> >> >>
> >> >> During the whole period, we don't have any intention to
> >> >> 'forbid simple object detection', not quite understand your point
> here.
> >> >>
> >> >>
> >> >> > So I still maintain this should be called "Object classification". I'd
> accept
> >> >> > "Object detection" as well, but definitely not "bounding box".
> >> >>
> >> >> imho, ' Object detection' and ' Object classification' are worse, they
> just
> >> >> describe one aspect of the struct. The users might just use filter
> dnn_detect,
> >> >> they might use filters dnn_detect + dnn_classify.
> >> >>
> >> >>
> >> >> >
> >> >> > Since the decision was made to make the side data public, we have
> to make
> >> >> > very sure it contains no hacks or is impossible to extend, since we
> don't want
> >> >> > to have an
> >> >> >
> "AV_SIDE_DATA_OBJECT_CLASSIFICATION_VERSION_2_SORRY_WE_SCREWE
> D_
> >> >> > UP"
> >> >> > faster than you can say "Recursive cascade correlation artificial
> neural
> >> >> > networks".
> >> >>
> >> >> sorry, not quite understand your point here.
> >> >>
> >> >> 'bounding box' is designed for general purpose to contain the info
> for
> >> >> detection/classification. It doesn't matter which DNN model is used,
> it doesn't
> >> >> matter if a traditional algorithm (non-dnn) is used.
> >> >>
> >> >> I'm open to use a better name. And bounding box is the best one for
> me till now.
> >> >> Everyone in the domain knows the exact meaning of bounding box
> without
> >> >> extra explanation. This word has been extended/evolved with such
> meaning in
> >> >> this domain.
> >> >>
> >> > +1
> >> >
> >> > I think it is wise to use the name which is widely used in the field.
> >> >
> >>
> >> Way to go, ignoring half the exchange to voice an opinion after we
> >> came to an agreement.
> >>
> > Indeed that's the opposite, I gave my opinion *precisely* because I'm
> > following the whole discussion.
> > IMHO, ignoring a naming standard in a whole field is not wise.
> >
> 
> The convention is vague, broad, inappropriate, non-specific, incomplete,
> will be mistaken for something it is not, something that's not it not will be
> mistaken for what it is, and finally it doesn't cover the second part of what
> it will be used for - classification.

'bounding box' serves both detection and classification, the detection
detects objects and creates an array of bounding box, the classification
assigns an attribute for each bounding box.

'object detection' or 'object classification' just covers one part.

> We shouldn't follow a pseudo-naming convention if it clashes with other
> parts of our code and isn't descriptive to the degree I described.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Anton Khirnov April 11, 2021, 10:57 a.m. UTC | #26
Quoting Pedro Arthur (2021-04-09 16:35:52)
> +1
> 
> I think it is wise to use the name which is widely used in the field.

We are a generic multimedia framework. "the field" for us is multimedia
in general, so we should use names meaningful in general multimedia
context.
I mostly agree with Lynne, "bounding box" is confusing and misleading
when this structure is built around object detection and classification.
Nicolas George April 11, 2021, 5:53 p.m. UTC | #27
Anton Khirnov (12021-04-11):
> We are a generic multimedia framework. "the field" for us is multimedia
> in general, so we should use names meaningful in general multimedia
> context.
> I mostly agree with Lynne, "bounding box" is confusing and misleading
> when this structure is built around object detection and classification.

I agree with both of you. When faced with this kind of choice, we must
choose the wording that will be as clear and not confusing as possible
for people familiar with general concepts of video encoding but not
necessarily familiar with the jargon of any particular sub-field. The
specialists of the particular subfield are supposed to be more capable
of adjusting.

Also, I have observed that the jargon of a narrow field is frequently
made of awful misnomers kept more as cargo cult than for any good
reason.

Regards,
Pedro Arthur April 11, 2021, 9:12 p.m. UTC | #28
Em dom., 11 de abr. de 2021 às 14:53, Nicolas George <george@nsup.org> escreveu:
>
> Anton Khirnov (12021-04-11):
> > We are a generic multimedia framework. "the field" for us is multimedia
> > in general, so we should use names meaningful in general multimedia
> > context.
> > I mostly agree with Lynne, "bounding box" is confusing and misleading
> > when this structure is built around object detection and classification.
>
> I agree with both of you. When faced with this kind of choice, we must
> choose the wording that will be as clear and not confusing as possible
> for people familiar with general concepts of video encoding but not
> necessarily familiar with the jargon of any particular sub-field. The
> specialists of the particular subfield are supposed to be more capable
> of adjusting.
>
Personally, "bounding box" is very clear to me, I might be biased.
It seems we are bikeshedding over a naming. I think it is more
constructive if we propose a better name as the only other option
proposed until now seems worst to me.

I think something like "AV_SIDE_DATA_DETECTION_BOUNDING_BOX" is
reasonable, as it conveys what it is and what it is used for.


> Also, I have observed that the jargon of a narrow field is frequently
> made of awful misnomers kept more as cargo cult than for any good
> reason.
>
> Regards,
>
> --
>   Nicolas George
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Guo, Yejun April 12, 2021, 2:18 a.m. UTC | #29
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Pedro Arthur
> Sent: 2021年4月12日 5:12
> To: FFmpeg development discussions and patches
> <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V7 4/6] lavu: add side data
> AV_FRAME_DATA_BOUNDING_BOXES
> 
> Em dom., 11 de abr. de 2021 às 14:53, Nicolas George <george@nsup.org>
> escreveu:
> >
> > Anton Khirnov (12021-04-11):
> > > We are a generic multimedia framework. "the field" for us is
> multimedia
> > > in general, so we should use names meaningful in general multimedia
> > > context.
> > > I mostly agree with Lynne, "bounding box" is confusing and misleading
> > > when this structure is built around object detection and classification.
> >
> > I agree with both of you. When faced with this kind of choice, we must
> > choose the wording that will be as clear and not confusing as possible
> > for people familiar with general concepts of video encoding but not
> > necessarily familiar with the jargon of any particular sub-field. The
> > specialists of the particular subfield are supposed to be more capable
> > of adjusting.

Thanks both, got the point.

> >
> Personally, "bounding box" is very clear to me, I might be biased.
> It seems we are bikeshedding over a naming. I think it is more
> constructive if we propose a better name as the only other option
> proposed until now seems worst to me.
> 
> I think something like "AV_SIDE_DATA_DETECTION_BOUNDING_BOX" is
> reasonable, as it conveys what it is and what it is used for.
> 

Agree, thanks. For people who use this feature will feel a bit
strange if they do not see bounding box or bbox. (bbox is the
abbreviation of bounding box).

We might use BBOX to replace BOUNDING_BOX since it is too long,
and people in the sub-filed know BBOX as well. I'm open to change
it back. And I also add 'ES' as the suffix since it is an array. So, the
code might looks like:

frame.h:
    /**
     * Bounding boxes for object detection and classification, ...
     */
    AV_FRAME_DATA_DETECTION_BBOXES,

detection_bbox.h/c:
struct AVDetectionBbox;
struct AVDetectionBboxHeader;

av_get_detection_bbox
av_detection_bboxes_alloc
av_detection_bboxes_create_side_data
diff mbox series

Patch

diff --git a/doc/APIchanges b/doc/APIchanges
index 9dfcc97d5c..81d01aac1e 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -14,6 +14,8 @@  libavutil:     2017-10-21
 
 
 API changes, most recent first:
+2021-04-xx - xxxxxxxxxx - lavu 56.xx.100 - frame.h boundingbox.h
+  Add AV_FRAME_DATA_BOUNDING_BOXES
 
 2021-04-06 - xxxxxxxxxx - lavf 58.78.100 - avformat.h
   Add avformat_index_get_entries_count(), avformat_index_get_entry(),
diff --git a/libavutil/Makefile b/libavutil/Makefile
index 27bafe9e12..f6d21bb5ce 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -11,6 +11,7 @@  HEADERS = adler32.h                                                     \
           avutil.h                                                      \
           base64.h                                                      \
           blowfish.h                                                    \
+          boundingbox.h                                                 \
           bprint.h                                                      \
           bswap.h                                                       \
           buffer.h                                                      \
@@ -104,6 +105,7 @@  OBJS = adler32.o                                                        \
        avsscanf.o                                                       \
        base64.o                                                         \
        blowfish.o                                                       \
+       boundingbox.o                                                    \
        bprint.o                                                         \
        buffer.o                                                         \
        cast5.o                                                          \
diff --git a/libavutil/boundingbox.c b/libavutil/boundingbox.c
new file mode 100644
index 0000000000..881661ec18
--- /dev/null
+++ b/libavutil/boundingbox.c
@@ -0,0 +1,73 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "boundingbox.h"
+
+AVBoundingBoxHeader *av_bbox_alloc(uint32_t nb_bboxes, size_t *out_size)
+{
+    size_t size;
+    struct {
+	    AVBoundingBoxHeader header;
+	    AVBoundingBox boxes[];
+    } *ret;
+
+    size = sizeof(*ret);
+    if (nb_bboxes > (SIZE_MAX - size) / sizeof(*ret->boxes))
+        return NULL;
+    size += sizeof(*ret->boxes) * nb_bboxes;
+
+    ret = av_mallocz(size);
+    if (!ret)
+        return NULL;
+
+    ret->header.nb_bboxes = nb_bboxes;
+    ret->header.bbox_size = sizeof(*ret->boxes);
+    ret->header.bboxes_offset = (char *)&ret->boxes - (char *)&ret->header;
+
+    if (out_size)
+        *out_size = sizeof(*ret);
+
+    return &ret->header;
+}
+
+AVBoundingBoxHeader* av_bbox_create_side_data(AVFrame *frame, uint32_t nb_bboxes)
+{
+    AVBufferRef         *buf;
+    AVBoundingBoxHeader *header;
+    size_t size;
+
+    header = av_bbox_alloc(nb_bboxes, &size);
+    if (!header)
+        return NULL;
+    if (size > INT_MAX) {
+        av_freep(&header);
+        return NULL;
+    }
+    buf = av_buffer_create((uint8_t *)header, size, NULL, NULL, 0);
+    if (!buf) {
+        av_freep(&header);
+        return NULL;
+    }
+
+    if (!av_frame_new_side_data_from_buf(frame, AV_FRAME_DATA_BOUNDING_BOXES, buf)) {
+        av_buffer_unref(&buf);
+        return NULL;
+    }
+
+    return header;
+}
diff --git a/libavutil/boundingbox.h b/libavutil/boundingbox.h
new file mode 100644
index 0000000000..2b77c6c0dc
--- /dev/null
+++ b/libavutil/boundingbox.h
@@ -0,0 +1,114 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_BOUNDINGBOX_H
+#define AVUTIL_BOUNDINGBOX_H
+
+#include "rational.h"
+#include "avassert.h"
+#include "frame.h"
+
+typedef struct AVBoundingBox {
+    /**
+     * Distance in pixels from the top edge of the frame to top
+     * and bottom, and from the left edge of the frame to left and
+     * right, defining the bounding box.
+     */
+    int top;
+    int left;
+    int bottom;
+    int right;
+
+#define AV_BBOX_LABEL_NAME_MAX_SIZE 32
+
+    /**
+     * Detect result with confidence
+     */
+    char detect_label[AV_BBOX_LABEL_NAME_MAX_SIZE];
+    AVRational detect_confidence;
+
+    /**
+     * At most 4 classifications based on the detected bounding box.
+     * For example, we can get max 4 different attributes with 4 different
+     * DNN models on one bounding box.
+     * classify_count is zero if no classification.
+     */
+#define AV_NUM_BBOX_CLASSIFY 4
+    uint32_t classify_count;
+    char classify_labels[AV_NUM_BBOX_CLASSIFY][AV_BBOX_LABEL_NAME_MAX_SIZE];
+    AVRational classify_confidences[AV_NUM_BBOX_CLASSIFY];
+} AVBoundingBox;
+
+typedef struct AVBoundingBoxHeader {
+    /**
+     * Information about how the bounding box is generated.
+     * for example, the DNN model name.
+     */
+    char source[128];
+
+    /**
+     * The size of frame when it is detected.
+     */
+    int frame_width;
+    int frame_height;
+
+    /**
+     * Number of bounding boxes in the array.
+     */
+    uint32_t nb_bboxes;
+
+    /**
+     * Offset in bytes from the beginning of this structure at which
+     * the array of bounding boxes starts.
+     */
+    size_t bboxes_offset;
+
+    /**
+     * Size of each bounding box in bytes.
+     */
+    size_t bbox_size;
+} AVBoundingBoxHeader;
+
+/*
+ * Get the bounding box at the specified {@code idx}. Must be between 0 and nb_bboxes.
+ */
+static av_always_inline AVBoundingBox*
+av_get_bounding_box(const AVBoundingBoxHeader *header, unsigned int idx)
+{
+    av_assert0(idx < header->nb_bboxes);
+    return (AVBoundingBox *)((uint8_t *)header + header->bboxes_offset +
+                             idx * header->bbox_size);
+}
+
+/**
+ * Allocates memory for AVBoundingBoxHeader, plus an array of {@code nb_bboxes}
+ * AVBoundingBox, and initializes the variables.
+ * Can be freed with a normal av_free() call.
+ *
+ * @param out_size if non-NULL, the size in bytes of the resulting data array is
+ * written here.
+ */
+AVBoundingBoxHeader *av_bbox_alloc(uint32_t nb_bboxes, size_t *out_size);
+
+/**
+ * Allocates memory for AVBoundingBoxHeader, plus an array of {@code nb_bboxes}
+ * AVBoundingBox, in the given AVFrame {@code frame} as AVFrameSideData of type
+ * AV_FRAME_DATA_BOUNDING_BOXES and initializes the variables.
+ */
+AVBoundingBoxHeader* av_bbox_create_side_data(AVFrame *frame, uint32_t nb_bboxes);
+#endif
diff --git a/libavutil/frame.c b/libavutil/frame.c
index fefb2b69c0..d8e86263af 100644
--- a/libavutil/frame.c
+++ b/libavutil/frame.c
@@ -853,6 +853,7 @@  const char *av_frame_side_data_name(enum AVFrameSideDataType type)
     case AV_FRAME_DATA_VIDEO_ENC_PARAMS:            return "Video encoding parameters";
     case AV_FRAME_DATA_SEI_UNREGISTERED:            return "H.26[45] User Data Unregistered SEI message";
     case AV_FRAME_DATA_FILM_GRAIN_PARAMS:           return "Film grain parameters";
+    case AV_FRAME_DATA_BOUNDING_BOXES:              return "Bounding boxes";
     }
     return NULL;
 }
diff --git a/libavutil/frame.h b/libavutil/frame.h
index a5ed91b20a..41e22de02a 100644
--- a/libavutil/frame.h
+++ b/libavutil/frame.h
@@ -198,6 +198,13 @@  enum AVFrameSideDataType {
      * Must be present for every frame which should have film grain applied.
      */
     AV_FRAME_DATA_FILM_GRAIN_PARAMS,
+
+    /**
+     * Bounding boxes for object detection and classification, the data is a AVBoundingBoxHeader
+     * followed with an array of AVBoudingBox, and AVBoundingBoxHeader.nb_bboxes is the number
+     * of array element.
+     */
+    AV_FRAME_DATA_BOUNDING_BOXES,
 };
 
 enum AVActiveFormatDescription {