diff mbox series

[FFmpeg-devel,V3,3/3] libavfilter: add filter dnn_detect for object detection

Message ID 20210222073045.16070-3-yejun.guo@intel.com
State New
Headers show
Series [FFmpeg-devel,V3,1/3] libavfilter/bbox.h: add BoundingBoxHeader and BoundingBox | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Guo, Yejun Feb. 22, 2021, 7:30 a.m. UTC
Below are the example steps to do object detection:

1. download and install l_openvino_toolkit_p_2021.1.110.tgz from
https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit/download.html
  or, we can get source code (tag 2021.1), build and install.
2. export LD_LIBRARY_PATH with openvino settings, for example:
.../deployment_tools/inference_engine/lib/intel64/:.../deployment_tools/inference_engine/external/tbb/lib/
3. rebuild ffmpeg from source code with configure option:
--enable-libopenvino
--extra-cflags='-I.../deployment_tools/inference_engine/include/'
--extra-ldflags='-L.../deployment_tools/inference_engine/lib/intel64'
4. download model files and test image
wget https://github.com/guoyejun/ffmpeg_dnn/raw/main/models/openvino/2021.1/face-detection-adas-0001.bin
wget https://github.com/guoyejun/ffmpeg_dnn/raw/main/models/openvino/2021.1/face-detection-adas-0001.xml
wget
https://github.com/guoyejun/ffmpeg_dnn/raw/main/models/openvino/2021.1/face-detection-adas-0001.label
wget https://github.com/guoyejun/ffmpeg_dnn/raw/main/images/cici.jpg
5. run ffmpeg with:
./ffmpeg -i cici.jpg -vf dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:input=data:output=detection_out:confidence=0.6:labels=face-detection-adas-0001.label -f null -

We'll see the detect result as below:
[Parsed_dnn_detect_0 @ 0x56226644e540] frame->private_ref (bounding boxes):
[Parsed_dnn_detect_0 @ 0x56226644e540] source: face-detection-adas-0001.xml
[Parsed_dnn_detect_0 @ 0x56226644e540] index: 0, region: (1005, 813) -> (1086, 905), label: face, confidence: 10000/10000.
[Parsed_dnn_detect_0 @ 0x56226644e540] index: 1, region: (888, 839) -> (967, 926), label: face, confidence: 6917/10000.

There are two faces detected with confidence 100% and 69.17%.

Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
---
 configure                              |   1 +
 doc/filters.texi                       |  40 +++
 libavfilter/Makefile                   |   1 +
 libavfilter/allfilters.c               |   1 +
 libavfilter/dnn/dnn_backend_openvino.c |  12 +
 libavfilter/dnn_filter_common.c        |   7 +
 libavfilter/dnn_filter_common.h        |   1 +
 libavfilter/dnn_interface.h            |   6 +-
 libavfilter/vf_dnn_detect.c            | 462 +++++++++++++++++++++++++
 9 files changed, 529 insertions(+), 2 deletions(-)
 create mode 100644 libavfilter/vf_dnn_detect.c

Comments

Guo, Yejun Feb. 25, 2021, 2:01 a.m. UTC | #1
> -----Original Message-----
> From: Guo, Yejun <yejun.guo@intel.com>
> Sent: 2021年2月22日 15:31
> To: ffmpeg-devel@ffmpeg.org
> Cc: Guo, Yejun <yejun.guo@intel.com>
> Subject: [PATCH V3 3/3] libavfilter: add filter dnn_detect for object detection
> 
> Below are the example steps to do object detection:
> 
> 1. download and install l_openvino_toolkit_p_2021.1.110.tgz from
> https://software.intel.com/content/www/us/en/develop/tools/openvino-toolk
> it/download.html
>   or, we can get source code (tag 2021.1), build and install.
> 2. export LD_LIBRARY_PATH with openvino settings, for example:
> .../deployment_tools/inference_engine/lib/intel64/:.../deployment_tools/infer
> ence_engine/external/tbb/lib/
> 3. rebuild ffmpeg from source code with configure option:
> --enable-libopenvino
> --extra-cflags='-I.../deployment_tools/inference_engine/include/'
> --extra-ldflags='-L.../deployment_tools/inference_engine/lib/intel64'
> 4. download model files and test image
> wget
> https://github.com/guoyejun/ffmpeg_dnn/raw/main/models/openvino/2021.
> 1/face-detection-adas-0001.bin
> wget
> https://github.com/guoyejun/ffmpeg_dnn/raw/main/models/openvino/2021.
> 1/face-detection-adas-0001.xml
> wget
> https://github.com/guoyejun/ffmpeg_dnn/raw/main/models/openvino/2021.
> 1/face-detection-adas-0001.label
> wget https://github.com/guoyejun/ffmpeg_dnn/raw/main/images/cici.jpg
> 5. run ffmpeg with:
> ./ffmpeg -i cici.jpg -vf
> dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:inp
> ut=data:output=detection_out:confidence=0.6:labels=face-detection-adas-000
> 1.label -f null -
> 
> We'll see the detect result as below:
> [Parsed_dnn_detect_0 @ 0x56226644e540] frame->private_ref (bounding
> boxes):
> [Parsed_dnn_detect_0 @ 0x56226644e540] source:
> face-detection-adas-0001.xml
> [Parsed_dnn_detect_0 @ 0x56226644e540] index: 0, region: (1005, 813) ->
> (1086, 905), label: face, confidence: 10000/10000.
> [Parsed_dnn_detect_0 @ 0x56226644e540] index: 1, region: (888, 839) ->
> (967, 926), label: face, confidence: 6917/10000.
> 
> There are two faces detected with confidence 100% and 69.17%.
> 
> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> ---
>  configure                              |   1 +
>  doc/filters.texi                       |  40 +++
>  libavfilter/Makefile                   |   1 +
>  libavfilter/allfilters.c               |   1 +
>  libavfilter/dnn/dnn_backend_openvino.c |  12 +
>  libavfilter/dnn_filter_common.c        |   7 +
>  libavfilter/dnn_filter_common.h        |   1 +
>  libavfilter/dnn_interface.h            |   6 +-
>  libavfilter/vf_dnn_detect.c            | 462
> +++++++++++++++++++++++++
>  9 files changed, 529 insertions(+), 2 deletions(-)  create mode 100644
> libavfilter/vf_dnn_detect.c
> 

any other comment for this patch set? thanks.
Guo, Yejun March 1, 2021, 12:26 a.m. UTC | #2
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Guo,
> Yejun
> Sent: 2021年2月25日 10:02
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter dnn_detect
> for object detection
> 
> 
> 
> > -----Original Message-----
> > From: Guo, Yejun <yejun.guo@intel.com>
> > Sent: 2021年2月22日 15:31
> > To: ffmpeg-devel@ffmpeg.org
> > Cc: Guo, Yejun <yejun.guo@intel.com>
> > Subject: [PATCH V3 3/3] libavfilter: add filter dnn_detect for object
> > detection
> >
> > Below are the example steps to do object detection:
> >
> > 1. download and install l_openvino_toolkit_p_2021.1.110.tgz from
> > https://software.intel.com/content/www/us/en/develop/tools/openvino-to
> > olk
> > it/download.html
> >   or, we can get source code (tag 2021.1), build and install.
> > 2. export LD_LIBRARY_PATH with openvino settings, for example:
> > .../deployment_tools/inference_engine/lib/intel64/:.../deployment_tool
> > s/infer
> > ence_engine/external/tbb/lib/
> > 3. rebuild ffmpeg from source code with configure option:
> > --enable-libopenvino
> > --extra-cflags='-I.../deployment_tools/inference_engine/include/'
> > --extra-ldflags='-L.../deployment_tools/inference_engine/lib/intel64'
> > 4. download model files and test image wget
> >
> https://github.com/guoyejun/ffmpeg_dnn/raw/main/models/openvino/2021.
> > 1/face-detection-adas-0001.bin
> > wget
> >
> https://github.com/guoyejun/ffmpeg_dnn/raw/main/models/openvino/2021.
> > 1/face-detection-adas-0001.xml
> > wget
> >
> https://github.com/guoyejun/ffmpeg_dnn/raw/main/models/openvino/2021.
> > 1/face-detection-adas-0001.label
> > wget https://github.com/guoyejun/ffmpeg_dnn/raw/main/images/cici.jpg
> > 5. run ffmpeg with:
> > ./ffmpeg -i cici.jpg -vf
> >
> dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:inp
> > ut=data:output=detection_out:confidence=0.6:labels=face-detection-adas
> > -000
> > 1.label -f null -
> >
> > We'll see the detect result as below:
> > [Parsed_dnn_detect_0 @ 0x56226644e540] frame->private_ref (bounding
> > boxes):
> > [Parsed_dnn_detect_0 @ 0x56226644e540] source:
> > face-detection-adas-0001.xml
> > [Parsed_dnn_detect_0 @ 0x56226644e540] index: 0, region: (1005, 813)
> > -> (1086, 905), label: face, confidence: 10000/10000.
> > [Parsed_dnn_detect_0 @ 0x56226644e540] index: 1, region: (888, 839) ->
> > (967, 926), label: face, confidence: 6917/10000.
> >
> > There are two faces detected with confidence 100% and 69.17%.
> >
> > Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> > ---
> >  configure                              |   1 +
> >  doc/filters.texi                       |  40 +++
> >  libavfilter/Makefile                   |   1 +
> >  libavfilter/allfilters.c               |   1 +
> >  libavfilter/dnn/dnn_backend_openvino.c |  12 +
> >  libavfilter/dnn_filter_common.c        |   7 +
> >  libavfilter/dnn_filter_common.h        |   1 +
> >  libavfilter/dnn_interface.h            |   6 +-
> >  libavfilter/vf_dnn_detect.c            | 462
> > +++++++++++++++++++++++++
> >  9 files changed, 529 insertions(+), 2 deletions(-)  create mode
> > 100644 libavfilter/vf_dnn_detect.c
> >
> 
> any other comment for this patch set? thanks.

will push tomorrow if no other objections, thanks.
Andreas Rheinhardt March 1, 2021, 4:50 a.m. UTC | #3
Guo, Yejun:
> Below are the example steps to do object detection:
> 
> 1. download and install l_openvino_toolkit_p_2021.1.110.tgz from
> https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit/download.html
>   or, we can get source code (tag 2021.1), build and install.
> 2. export LD_LIBRARY_PATH with openvino settings, for example:
> .../deployment_tools/inference_engine/lib/intel64/:.../deployment_tools/inference_engine/external/tbb/lib/
> 3. rebuild ffmpeg from source code with configure option:
> --enable-libopenvino
> --extra-cflags='-I.../deployment_tools/inference_engine/include/'
> --extra-ldflags='-L.../deployment_tools/inference_engine/lib/intel64'
> 4. download model files and test image
> wget https://github.com/guoyejun/ffmpeg_dnn/raw/main/models/openvino/2021.1/face-detection-adas-0001.bin
> wget https://github.com/guoyejun/ffmpeg_dnn/raw/main/models/openvino/2021.1/face-detection-adas-0001.xml
> wget
> https://github.com/guoyejun/ffmpeg_dnn/raw/main/models/openvino/2021.1/face-detection-adas-0001.label
> wget https://github.com/guoyejun/ffmpeg_dnn/raw/main/images/cici.jpg
> 5. run ffmpeg with:
> ./ffmpeg -i cici.jpg -vf dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:input=data:output=detection_out:confidence=0.6:labels=face-detection-adas-0001.label -f null -
> 
> We'll see the detect result as below:
> [Parsed_dnn_detect_0 @ 0x56226644e540] frame->private_ref (bounding boxes):
> [Parsed_dnn_detect_0 @ 0x56226644e540] source: face-detection-adas-0001.xml
> [Parsed_dnn_detect_0 @ 0x56226644e540] index: 0, region: (1005, 813) -> (1086, 905), label: face, confidence: 10000/10000.
> [Parsed_dnn_detect_0 @ 0x56226644e540] index: 1, region: (888, 839) -> (967, 926), label: face, confidence: 6917/10000.
> 
> There are two faces detected with confidence 100% and 69.17%.
> 
> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> ---
>  configure                              |   1 +
>  doc/filters.texi                       |  40 +++
>  libavfilter/Makefile                   |   1 +
>  libavfilter/allfilters.c               |   1 +
>  libavfilter/dnn/dnn_backend_openvino.c |  12 +
>  libavfilter/dnn_filter_common.c        |   7 +
>  libavfilter/dnn_filter_common.h        |   1 +
>  libavfilter/dnn_interface.h            |   6 +-
>  libavfilter/vf_dnn_detect.c            | 462 +++++++++++++++++++++++++
>  9 files changed, 529 insertions(+), 2 deletions(-)
>  create mode 100644 libavfilter/vf_dnn_detect.c
> 
> diff --git a/configure b/configure
> index 336301cb40..cdac292c2f 100755
> --- a/configure
> +++ b/configure
> @@ -3549,6 +3549,7 @@ derain_filter_select="dnn"
>  deshake_filter_select="pixelutils"
>  deshake_opencl_filter_deps="opencl"
>  dilation_opencl_filter_deps="opencl"
> +dnn_detect_filter_select="dnn"
>  dnn_processing_filter_select="dnn"
>  drawtext_filter_deps="libfreetype"
>  drawtext_filter_suggest="libfontconfig libfribidi"
> diff --git a/doc/filters.texi b/doc/filters.texi
> index 426cb158da..55368c6f1b 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -10133,6 +10133,46 @@ ffmpeg -i INPUT -f lavfi -i nullsrc=hd720,geq='r=128+80*(sin(sqrt((X-W/2)*(X-W/2
>  @end example
>  @end itemize
>  
> +@section dnn_detect
> +
> +Do object detection with deep neural networks.
> +
> +The filter accepts the following options:
> +
> +@table @option
> +@item dnn_backend
> +Specify which DNN backend to use for model loading and execution. This option accepts
> +only openvino now, tensorflow backends will be added.
> +
> +@item model
> +Set path to model file specifying network architecture and its parameters.
> +Note that different backends use different file formats.
> +
> +@item input
> +Set the input name of the dnn network.
> +
> +@item output
> +Set the output name of the dnn network.
> +
> +@item confidence
> +Set the confidence threshold (default: 0.5).
> +
> +@item labels
> +Set path to label file specifying the mapping between label id and name.
> +Each label name is written in one line, tailing spaces and empty lines are skipped.
> +The first line is the name of label id 0 (usually it is 'background'),
> +and the second line is the name of label id 1, etc.
> +The label id is considered as name if the label file is not provided.
> +
> +@item backend_configs
> +Set the configs to be passed into backend
> +
> +@item async
> +use DNN async execution if set (default: set),
> +roll back to sync execution if the backend does not support async.
> +
> +@end table
> +
>  @anchor{dnn_processing}
>  @section dnn_processing
>  
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index 359ea7f903..b14c0ecdb9 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -245,6 +245,7 @@ OBJS-$(CONFIG_DILATION_FILTER)               += vf_neighbor.o
>  OBJS-$(CONFIG_DILATION_OPENCL_FILTER)        += vf_neighbor_opencl.o opencl.o \
>                                                  opencl/neighbor.o
>  OBJS-$(CONFIG_DISPLACE_FILTER)               += vf_displace.o framesync.o
> +OBJS-$(CONFIG_DNN_DETECT_FILTER)             += vf_dnn_detect.o
>  OBJS-$(CONFIG_DNN_PROCESSING_FILTER)         += vf_dnn_processing.o
>  OBJS-$(CONFIG_DOUBLEWEAVE_FILTER)            += vf_weave.o
>  OBJS-$(CONFIG_DRAWBOX_FILTER)                += vf_drawbox.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index 452c030706..85419ec3f1 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -230,6 +230,7 @@ extern AVFilter ff_vf_detelecine;
>  extern AVFilter ff_vf_dilation;
>  extern AVFilter ff_vf_dilation_opencl;
>  extern AVFilter ff_vf_displace;
> +extern AVFilter ff_vf_dnn_detect;
>  extern AVFilter ff_vf_dnn_processing;
>  extern AVFilter ff_vf_doubleweave;
>  extern AVFilter ff_vf_drawbox;
> diff --git a/libavfilter/dnn/dnn_backend_openvino.c b/libavfilter/dnn/dnn_backend_openvino.c
> index 5be053b7f8..928d84b744 100644
> --- a/libavfilter/dnn/dnn_backend_openvino.c
> +++ b/libavfilter/dnn/dnn_backend_openvino.c
> @@ -621,6 +621,12 @@ DNNReturnType ff_dnn_execute_model_ov(const DNNModel *model, const char *input_n
>          return DNN_ERROR;
>      }
>  
> +    if (model->func_type != DFT_PROCESS_FRAME) {
> +        if (!out_frame) {
> +            out_frame = in_frame;
> +        }
> +    }
> +
>      if (nb_output != 1) {
>          // currently, the filter does not need multiple outputs,
>          // so we just pending the support until we really need it.
> @@ -674,6 +680,12 @@ DNNReturnType ff_dnn_execute_model_async_ov(const DNNModel *model, const char *i
>          return DNN_ERROR;
>      }
>  
> +    if (model->func_type != DFT_PROCESS_FRAME) {
> +        if (!out_frame) {
> +            out_frame = in_frame;
> +        }
> +    }
> +
>      task = av_malloc(sizeof(*task));
>      if (!task) {
>          av_log(ctx, AV_LOG_ERROR, "unable to alloc memory for task item.\n");
> diff --git a/libavfilter/dnn_filter_common.c b/libavfilter/dnn_filter_common.c
> index 413adba406..92b696e710 100644
> --- a/libavfilter/dnn_filter_common.c
> +++ b/libavfilter/dnn_filter_common.c
> @@ -64,6 +64,13 @@ int ff_dnn_init(DnnContext *ctx, DNNFunctionType func_type, AVFilterContext *fil
>      return 0;
>  }
>  
> +int ff_dnn_set_proc(DnnContext *ctx, PRE_POST_PROC pre_proc, PRE_POST_PROC post_proc)
> +{
> +    ctx->model->pre_proc = pre_proc;
> +    ctx->model->post_proc = post_proc;
> +    return 0;
> +}
> +
>  DNNReturnType ff_dnn_get_input(DnnContext *ctx, DNNData *input)
>  {
>      return ctx->model->get_input(ctx->model->model, input, ctx->model_inputname);
> diff --git a/libavfilter/dnn_filter_common.h b/libavfilter/dnn_filter_common.h
> index 79c4d3efe3..0e88b88bdd 100644
> --- a/libavfilter/dnn_filter_common.h
> +++ b/libavfilter/dnn_filter_common.h
> @@ -48,6 +48,7 @@ typedef struct DnnContext {
>  
>  
>  int ff_dnn_init(DnnContext *ctx, DNNFunctionType func_type, AVFilterContext *filter_ctx);
> +int ff_dnn_set_proc(DnnContext *ctx, PRE_POST_PROC pre_proc, PRE_POST_PROC post_proc);
>  DNNReturnType ff_dnn_get_input(DnnContext *ctx, DNNData *input);
>  DNNReturnType ff_dnn_get_output(DnnContext *ctx, int input_width, int input_height, int *output_width, int *output_height);
>  DNNReturnType ff_dnn_execute_model(DnnContext *ctx, AVFrame *in_frame, AVFrame *out_frame);
> diff --git a/libavfilter/dnn_interface.h b/libavfilter/dnn_interface.h
> index d3a0c58a61..90a08129f4 100644
> --- a/libavfilter/dnn_interface.h
> +++ b/libavfilter/dnn_interface.h
> @@ -63,6 +63,8 @@ typedef struct DNNData{
>      DNNColorOrder order;
>  } DNNData;
>  
> +typedef int (*PRE_POST_PROC)(AVFrame *frame, DNNData *model, AVFilterContext *filter_ctx);

Why uppercase? It is a typedef, not a macro.

> +
>  typedef struct DNNModel{
>      // Stores model that can be different for different backends.
>      void *model;
> @@ -80,10 +82,10 @@ typedef struct DNNModel{
>                                  const char *output_name, int *output_width, int *output_height);
>      // set the pre process to transfer data from AVFrame to DNNData
>      // the default implementation within DNN is used if it is not provided by the filter
> -    int (*pre_proc)(AVFrame *frame_in, DNNData *model_input, AVFilterContext *filter_ctx);
> +    PRE_POST_PROC pre_proc;
>      // set the post process to transfer data from DNNData to AVFrame
>      // the default implementation within DNN is used if it is not provided by the filter
> -    int (*post_proc)(AVFrame *frame_out, DNNData *model_output, AVFilterContext *filter_ctx);
> +    PRE_POST_PROC post_proc;

Spurious change.

>  } DNNModel;
>  
>  // Stores pointers to functions for loading, executing, freeing DNN models for one of the backends.
> diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c
> new file mode 100644
> index 0000000000..13b82579a9
> --- /dev/null
> +++ b/libavfilter/vf_dnn_detect.c
> @@ -0,0 +1,462 @@
> +/*
> + * Copyright (c) 2021
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * implementing an object detecting filter using deep learning networks.
> + */
> +
> +#include "libavformat/avio.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +#include "libavutil/avassert.h"
> +#include "libavutil/imgutils.h"
> +#include "filters.h"
> +#include "dnn_filter_common.h"
> +#include "formats.h"
> +#include "internal.h"
> +#include "libavutil/time.h"
> +#include "bbox.h"
> +
> +typedef struct DnnDetectContext {
> +    const AVClass *class;
> +    DnnContext dnnctx;
> +    float confidence;
> +    char *labels_filename;
> +    char **labels;
> +    int label_count;
> +} DnnDetectContext;
> +
> +#define OFFSET(x) offsetof(DnnDetectContext, dnnctx.x)
> +#define OFFSET2(x) offsetof(DnnDetectContext, x)
> +#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
> +static const AVOption dnn_detect_options[] = {
> +    { "dnn_backend", "DNN backend",                OFFSET(backend_type),     AV_OPT_TYPE_INT,       { .i64 = 2 },    INT_MIN, INT_MAX, FLAGS, "backend" },
> +#if (CONFIG_LIBOPENVINO == 1)
> +    { "openvino",    "openvino backend flag",      0,                        AV_OPT_TYPE_CONST,     { .i64 = 2 },    0, 0, FLAGS, "backend" },
> +#endif
> +    DNN_COMMON_OPTIONS
> +    { "confidence",  "threshold of confidence",    OFFSET2(confidence),      AV_OPT_TYPE_FLOAT,     { .dbl = 0.5 },  0, 1, FLAGS},
> +    { "labels",      "path to labels file",        OFFSET2(labels_filename), AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
> +    { NULL }
> +};
> +
> +AVFILTER_DEFINE_CLASS(dnn_detect);
> +
> +// remove this function once we update vf_drawbox/text
> +// to visualize the bbox
> +static void dump_boundingbox(AVFilterContext *ctx, AVFrame *frame)
> +{
> +    int nb_bbox = 0;
> +    BoundingBox *bbox;
> +    BoundingBoxHeader *header;
> +
> +    if (!frame->private_ref)
> +        return;
> +
> +    header = (BoundingBoxHeader *)frame->private_ref->data;
> +    bbox = (BoundingBox *)(header + 1);
> +
> +    if (!header->bbox_size || (frame->private_ref->size - sizeof(*header)) % header->bbox_size != 0) {
> +        av_log(ctx, AV_LOG_ERROR, "invalid size of bounding box\n");
> +        return;
> +    }
> +
> +    nb_bbox = (frame->private_ref->size - sizeof(*header)) / header->bbox_size;
> +    av_log(ctx, AV_LOG_INFO, "frame->private_ref (bounding boxes):\n");
> +    av_log(ctx, AV_LOG_INFO, "source: %s\n", header->source);
> +    for (int i = 0; i < nb_bbox; i++) {
> +        av_log(ctx, AV_LOG_INFO, "index: %d, region: (%d, %d) -> (%d, %d), label: %s, confidence: %d/%d.\n",
> +                                 i, bbox->left, bbox->top, bbox->right, bbox->bottom,
> +                                 bbox->detect_label, bbox->detect_confidence.num, bbox->detect_confidence.den);
> +        if (bbox->classify_count > 0) {
> +            for (int j = 0; j < bbox->classify_count; j++) {
> +                av_log(ctx, AV_LOG_INFO, "\t\tclassify:  label: %s, confidence: %d/%d.\n",
> +                       bbox->classify_labels[j], bbox->classify_confidences[j].num, bbox->classify_confidences[j].den);
> +            }
> +        }
> +        bbox++;
> +    }
> +}
> +
> +static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
> +{
> +    DnnDetectContext *ctx = filter_ctx->priv;
> +    float conf_threshold = ctx->confidence;
> +    int proposal_count = output->height;
> +    int detect_size = output->width;
> +    float *detections = output->data;
> +    int nb_bbox = 0;
> +    BoundingBox *bbox;
> +    BoundingBoxHeader *header;
> +
> +    for (int i = 0; i < proposal_count; ++i) {
> +        float conf = detections[i * detect_size + 2];
> +        if (conf < conf_threshold) {
> +            continue;
> +        }
> +        nb_bbox++;
> +    }
> +
> +    if (nb_bbox == 0) {
> +        av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
> +        return 0;
> +    }
> +
> +    if (frame->private_ref) {
> +        av_log(filter_ctx, AV_LOG_ERROR, "frame->private_ref is already occupied\n");
> +        return -1;
> +    }
> +
> +    frame->private_ref = av_buffer_alloc(sizeof(*header) + sizeof(*bbox) * nb_bbox);
> +    if (!frame->private_ref) {
> +        av_log(filter_ctx, AV_LOG_ERROR, "failed to allocate buffer for %d bboxes\n", nb_bbox);
> +        return AVERROR(ENOMEM);;

Double ;

> +    }
> +
> +    header = (BoundingBoxHeader *)frame->private_ref->data;
> +    strncpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));
> +    header->source[sizeof(header->source) - 1] = '\0';
> +    header->bbox_size = sizeof(*bbox);
> +
> +    bbox = (BoundingBox *)(header + 1);

This does not guarantee proper alignment. You could use a flexible array
member for that.

> +    for (int i = 0; i < proposal_count; ++i) {
> +        int av_unused image_id = (int)detections[i * detect_size + 0];
> +        int label_id = (int)detections[i * detect_size + 1];
> +        float conf   =      detections[i * detect_size + 2];
> +        float x0     =      detections[i * detect_size + 3];
> +        float y0     =      detections[i * detect_size + 4];
> +        float x1     =      detections[i * detect_size + 5];
> +        float y1     =      detections[i * detect_size + 6];
> +
> +        if (conf < conf_threshold) {
> +            continue;
> +        }
> +
> +        bbox->left      = (int)(x0 * frame->width);
> +        bbox->right     = (int)(x1 * frame->width);
> +        bbox->top       = (int)(y0 * frame->height);
> +        bbox->bottom    = (int)(y1 * frame->height);
> +
> +        bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000);
> +        bbox->classify_count = 0;
> +
> +        if (ctx->labels && label_id < ctx->label_count) {
> +            strcpy(bbox->detect_label, ctx->labels[label_id]);
> +        } else {
> +            snprintf(bbox->detect_label, BBOX_LABEL_NAME_MAX_LENGTH, "%d", label_id);
> +        }
> +
> +        nb_bbox--;
> +        if (nb_bbox == 0) {
> +            break;
> +        }
> +        bbox++;
> +    }
> +
> +    dump_boundingbox(filter_ctx, frame);
> +
> +    return 0;
> +}
> +
> +static void free_detect_labels(DnnDetectContext *ctx)
> +{
> +    for (int i = 0; i < ctx->label_count; i++) {
> +        av_freep(&ctx->labels[i]);
> +    }
> +    av_freep(&ctx->labels);
> +}
> +
> +static int read_detect_label_file(AVFilterContext *context)
> +{
> +    int line_len;
> +    FILE *file;
> +    DnnDetectContext *ctx = context->priv;
> +
> +    file = av_fopen_utf8(ctx->labels_filename, "r");
> +    if (!file){
> +        av_log(context, AV_LOG_ERROR, "failed to open file %s\n", ctx->labels_filename);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    while (!feof(file)) {
> +        char *label;
> +        char buf[256];
> +        if (!fgets(buf, 256, file)) {
> +            break;
> +        }
> +
> +        line_len = strlen(buf);
> +        while (line_len) {
> +            int i = line_len - 1;
> +            if (buf[i] == '\n' || buf[i] == '\r' || buf[i] == ' ') {
> +                buf[i] = '\0';
> +                line_len--;
> +            } else {
> +                break;
> +            }
> +        }
> +
> +        if (line_len == 0)  // empty line
> +            continue;
> +
> +        if (line_len > BBOX_LABEL_NAME_MAX_LENGTH) {
> +            av_log(context, AV_LOG_ERROR, "label %s too long\n", buf);
> +            fclose(file);
> +            free_detect_labels(ctx);
> +            return AVERROR(EINVAL);
> +        }
> +
> +        label = av_strdup(buf);
> +        if (!label) {
> +            av_log(context, AV_LOG_ERROR, "failed to allocate memory for label %s\n", buf);
> +            fclose(file);
> +            free_detect_labels(ctx);
> +            return AVERROR(ENOMEM);
> +        }
> +
> +        if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count, label) < 0) {
> +            av_log(context, AV_LOG_ERROR, "failed to do av_dynarray_add\n");
> +            fclose(file);
> +            free_detect_labels(ctx);

1. You are leaking label here.
2. You are repeating yourself with the cleanup code.
3. When you return an error in a filter's init function, the filter's
uninit function is called automatically. In this case this means that
free_detect_labels is called twice which is not only wasteful, but
harmful: You are freeing ctx->labels (and all labels contained in it) in
the first run, but you are not resetting the number of labels. If
ctx->label_count is > 0, there will be a segfault when
free_detect_labels is called the second time.
4. Return the error code.
(5. I consider your use of av_log for every error to be excessive.)

> +            return AVERROR(ENOMEM);
> +        }
> +    }
> +
> +    fclose(file);
> +    return 0;
> +}
> +
> +static av_cold int dnn_detect_init(AVFilterContext *context)
> +{
> +    DnnDetectContext *ctx = context->priv;
> +    int ret = ff_dnn_init(&ctx->dnnctx, DFT_ANALYTICS_DETECT, context);
> +    if (ret < 0)
> +        return ret;
> +    ff_dnn_set_proc(&ctx->dnnctx, NULL, dnn_detect_post_proc);
> +
> +    if (ctx->labels_filename) {
> +        return read_detect_label_file(context);
> +    }
> +    return 0;
> +}
> +
> +static int dnn_detect_query_formats(AVFilterContext *context)
> +{
> +    static const enum AVPixelFormat pix_fmts[] = {
> +        AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24,
> +        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAYF32,
> +        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
> +        AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
> +        AV_PIX_FMT_NV12,
> +        AV_PIX_FMT_NONE
> +    };
> +    AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
> +    return ff_set_common_formats(context, fmts_list);
> +}
> +
> +static int dnn_detect_filter_frame(AVFilterLink *inlink, AVFrame *in)
> +{
> +    AVFilterContext *context  = inlink->dst;
> +    AVFilterLink *outlink = context->outputs[0];
> +    DnnDetectContext *ctx = context->priv;
> +    DNNReturnType dnn_result;
> +
> +    dnn_result = ff_dnn_execute_model(&ctx->dnnctx, in, NULL);
> +    if (dnn_result != DNN_SUCCESS){
> +        av_log(ctx, AV_LOG_ERROR, "failed to execute model\n");
> +        av_frame_free(&in);
> +        return AVERROR(EIO);
> +    }
> +
> +    return ff_filter_frame(outlink, in);
> +}
> +
> +static int dnn_detect_activate_sync(AVFilterContext *filter_ctx)
> +{
> +    AVFilterLink *inlink = filter_ctx->inputs[0];
> +    AVFilterLink *outlink = filter_ctx->outputs[0];
> +    AVFrame *in = NULL;
> +    int64_t pts;
> +    int ret, status;
> +    int got_frame = 0;
> +
> +    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
> +
> +    do {
> +        // drain all input frames
> +        ret = ff_inlink_consume_frame(inlink, &in);
> +        if (ret < 0)
> +            return ret;
> +        if (ret > 0) {
> +            ret = dnn_detect_filter_frame(inlink, in);
> +            if (ret < 0)
> +                return ret;
> +            got_frame = 1;
> +        }
> +    } while (ret > 0);
> +
> +    // if frame got, schedule to next filter
> +    if (got_frame)
> +        return 0;
> +
> +    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
> +        if (status == AVERROR_EOF) {
> +            ff_outlink_set_status(outlink, status, pts);
> +            return ret;
> +        }
> +    }
> +
> +    FF_FILTER_FORWARD_WANTED(outlink, inlink);
> +
> +    return FFERROR_NOT_READY;
> +}
> +
> +static int dnn_detect_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
> +{
> +    DnnDetectContext *ctx = outlink->src->priv;
> +    int ret;
> +    DNNAsyncStatusType async_state;
> +
> +    ret = ff_dnn_flush(&ctx->dnnctx);
> +    if (ret != DNN_SUCCESS) {
> +        return -1;
> +    }
> +
> +    do {
> +        AVFrame *in_frame = NULL;
> +        AVFrame *out_frame = NULL;
> +        async_state = ff_dnn_get_async_result(&ctx->dnnctx, &in_frame, &out_frame);
> +        if (out_frame) {
> +            av_assert0(in_frame == out_frame);
> +            ret = ff_filter_frame(outlink, out_frame);
> +            if (ret < 0)
> +                return ret;
> +            if (out_pts)
> +                *out_pts = out_frame->pts + pts;
> +        }
> +        av_usleep(5000);
> +    } while (async_state >= DAST_NOT_READY);
> +
> +    return 0;
> +}
> +
> +static int dnn_detect_activate_async(AVFilterContext *filter_ctx)
> +{
> +    AVFilterLink *inlink = filter_ctx->inputs[0];
> +    AVFilterLink *outlink = filter_ctx->outputs[0];
> +    DnnDetectContext *ctx = filter_ctx->priv;
> +    AVFrame *in = NULL;
> +    int64_t pts;
> +    int ret, status;
> +    int got_frame = 0;
> +    int async_state;
> +
> +    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
> +
> +    do {
> +        // drain all input frames
> +        ret = ff_inlink_consume_frame(inlink, &in);
> +        if (ret < 0)
> +            return ret;
> +        if (ret > 0) {
> +            if (ff_dnn_execute_model_async(&ctx->dnnctx, in, NULL) != DNN_SUCCESS) {
> +                return AVERROR(EIO);
> +            }
> +        }
> +    } while (ret > 0);
> +
> +    // drain all processed frames
> +    do {
> +        AVFrame *in_frame = NULL;
> +        AVFrame *out_frame = NULL;
> +        async_state = ff_dnn_get_async_result(&ctx->dnnctx, &in_frame, &out_frame);
> +        if (out_frame) {
> +            av_assert0(in_frame == out_frame);
> +            ret = ff_filter_frame(outlink, out_frame);
> +            if (ret < 0)
> +                return ret;
> +            got_frame = 1;
> +        }
> +    } while (async_state == DAST_SUCCESS);
> +
> +    // if frame got, schedule to next filter
> +    if (got_frame)
> +        return 0;
> +
> +    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
> +        if (status == AVERROR_EOF) {
> +            int64_t out_pts = pts;
> +            ret = dnn_detect_flush_frame(outlink, pts, &out_pts);
> +            ff_outlink_set_status(outlink, status, out_pts);
> +            return ret;
> +        }
> +    }
> +
> +    FF_FILTER_FORWARD_WANTED(outlink, inlink);
> +
> +    return 0;
> +}
> +
> +static int dnn_detect_activate(AVFilterContext *filter_ctx)
> +{
> +    DnnDetectContext *ctx = filter_ctx->priv;
> +
> +    if (ctx->dnnctx.async)
> +        return dnn_detect_activate_async(filter_ctx);
> +    else
> +        return dnn_detect_activate_sync(filter_ctx);
> +}
> +
> +static av_cold void dnn_detect_uninit(AVFilterContext *context)
> +{
> +    DnnDetectContext *ctx = context->priv;
> +    ff_dnn_uninit(&ctx->dnnctx);
> +    free_detect_labels(ctx);
> +}
> +
> +static const AVFilterPad dnn_detect_inputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad dnn_detect_outputs[] = {
> +    {
> +        .name = "default",
> +        .type = AVMEDIA_TYPE_VIDEO,
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_dnn_detect = {
> +    .name          = "dnn_detect",
> +    .description   = NULL_IF_CONFIG_SMALL("Apply DNN detect filter to the input."),
> +    .priv_size     = sizeof(DnnDetectContext),
> +    .init          = dnn_detect_init,
> +    .uninit        = dnn_detect_uninit,
> +    .query_formats = dnn_detect_query_formats,
> +    .inputs        = dnn_detect_inputs,
> +    .outputs       = dnn_detect_outputs,
> +    .priv_class    = &dnn_detect_class,
> +    .activate      = dnn_detect_activate,
> +};
>
Guo, Yejun March 1, 2021, 8:02 a.m. UTC | #4
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Andreas Rheinhardt
> Sent: 2021年3月1日 12:50
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter dnn_detect
> for object detection
> 
> Guo, Yejun:
> > Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> > ---
> >  configure                              |   1 +
> >  doc/filters.texi                       |  40 +++
> >  libavfilter/Makefile                   |   1 +
> >  libavfilter/allfilters.c               |   1 +
> >  libavfilter/dnn/dnn_backend_openvino.c |  12 +
> >  libavfilter/dnn_filter_common.c        |   7 +
> >  libavfilter/dnn_filter_common.h        |   1 +
> >  libavfilter/dnn_interface.h            |   6 +-
> >  libavfilter/vf_dnn_detect.c            | 462
> +++++++++++++++++++++++++
> >  9 files changed, 529 insertions(+), 2 deletions(-)  create mode

> > b/libavfilter/dnn_interface.h index d3a0c58a61..90a08129f4 100644
> > --- a/libavfilter/dnn_interface.h
> > +++ b/libavfilter/dnn_interface.h
> > @@ -63,6 +63,8 @@ typedef struct DNNData{
> >      DNNColorOrder order;
> >  } DNNData;
> >
> > +typedef int (*PRE_POST_PROC)(AVFrame *frame, DNNData *model,
> > +AVFilterContext *filter_ctx);
> 
> Why uppercase? It is a typedef, not a macro.

will change to CamelCase, thanks.

> 
> > +
> >  typedef struct DNNModel{
> >      // Stores model that can be different for different backends.
> >      void *model;
> > @@ -80,10 +82,10 @@ typedef struct DNNModel{
> >                                  const char *output_name, int
> *output_width, int *output_height);
> >      // set the pre process to transfer data from AVFrame to DNNData
> >      // the default implementation within DNN is used if it is not provided
> by the filter
> > -    int (*pre_proc)(AVFrame *frame_in, DNNData *model_input,
> AVFilterContext *filter_ctx);
> > +    PRE_POST_PROC pre_proc;
> >      // set the post process to transfer data from DNNData to AVFrame
> >      // the default implementation within DNN is used if it is not provided
> by the filter
> > -    int (*post_proc)(AVFrame *frame_out, DNNData *model_output,
> AVFilterContext *filter_ctx);
> > +    PRE_POST_PROC post_proc;
> 
> Spurious change.

sorry, not quite understand this comment. It is used for code refine. 
Maybe I need to let it in a separate patch.

> 
> >  } DNNModel;
> >
> > +
> > +    frame->private_ref = av_buffer_alloc(sizeof(*header) + sizeof(*bbox) *
> nb_bbox);
> > +    if (!frame->private_ref) {
> > +        av_log(filter_ctx, AV_LOG_ERROR, "failed to allocate buffer for %d
> bboxes\n", nb_bbox);
> > +        return AVERROR(ENOMEM);;
> 
> Double ;

thanks, will remove it.

> 
> > +    }
> > +
> > +    header = (BoundingBoxHeader *)frame->private_ref->data;
> > +    strncpy(header->source, ctx->dnnctx.model_filename,
> sizeof(header->source));
> > +    header->source[sizeof(header->source) - 1] = '\0';
> > +    header->bbox_size = sizeof(*bbox);
> > +
> > +    bbox = (BoundingBox *)(header + 1);
> 
> This does not guarantee proper alignment. You could use a flexible array
> member for that.

The memory layout of frame->private_ref->data is:
sizeof(*header) + sizeof(*bbox) + sizeof(*bbox) + ...

I think 'header + 1' goes correctly to the first bbox. thanks.

> > +
> > +        label = av_strdup(buf);
> > +        if (!label) {
> > +            av_log(context, AV_LOG_ERROR, "failed to allocate memory
> for label %s\n", buf);
> > +            fclose(file);
> > +            free_detect_labels(ctx);
> > +            return AVERROR(ENOMEM);
> > +        }
> > +
> > +        if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count, label)
> < 0) {
> > +            av_log(context, AV_LOG_ERROR, "failed to do
> av_dynarray_add\n");
> > +            fclose(file);
> > +            free_detect_labels(ctx);
> 
> 1. You are leaking label here.
> 2. You are repeating yourself with the cleanup code.
> 3. When you return an error in a filter's init function, the filter's uninit function
> is called automatically. In this case this means that free_detect_labels is called
> twice which is not only wasteful, but
> harmful: You are freeing ctx->labels (and all labels contained in it) in the first
> run, but you are not resetting the number of labels. If
> ctx->label_count is > 0, there will be a segfault when
> free_detect_labels is called the second time.

good catch, will free label, remove the duplicate calling, and also reset ctx->label_count.

> 4. Return the error code.
> (5. I consider your use of av_log for every error to be excessive.)

not quite understand this one, thanks.

> 
> > +            return AVERROR(ENOMEM);
> > +        }
> > +    }
> > +
> > +    fclose(file);
> > +    return 0;
> > +}
> > +
Andreas Rheinhardt March 1, 2021, 8:30 a.m. UTC | #5
Guo, Yejun:
> 
> 
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>> Andreas Rheinhardt
>> Sent: 2021年3月1日 12:50
>> To: ffmpeg-devel@ffmpeg.org
>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter dnn_detect
>> for object detection
>>
>> Guo, Yejun:
>>> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
>>> ---
>>>  configure                              |   1 +
>>>  doc/filters.texi                       |  40 +++
>>>  libavfilter/Makefile                   |   1 +
>>>  libavfilter/allfilters.c               |   1 +
>>>  libavfilter/dnn/dnn_backend_openvino.c |  12 +
>>>  libavfilter/dnn_filter_common.c        |   7 +
>>>  libavfilter/dnn_filter_common.h        |   1 +
>>>  libavfilter/dnn_interface.h            |   6 +-
>>>  libavfilter/vf_dnn_detect.c            | 462
>> +++++++++++++++++++++++++
>>>  9 files changed, 529 insertions(+), 2 deletions(-)  create mode
> 
>>> b/libavfilter/dnn_interface.h index d3a0c58a61..90a08129f4 100644
>>> --- a/libavfilter/dnn_interface.h
>>> +++ b/libavfilter/dnn_interface.h
>>> @@ -63,6 +63,8 @@ typedef struct DNNData{
>>>      DNNColorOrder order;
>>>  } DNNData;
>>>
>>> +typedef int (*PRE_POST_PROC)(AVFrame *frame, DNNData *model,
>>> +AVFilterContext *filter_ctx);
>>
>> Why uppercase? It is a typedef, not a macro.
> 
> will change to CamelCase, thanks.
> 
>>
>>> +
>>>  typedef struct DNNModel{
>>>      // Stores model that can be different for different backends.
>>>      void *model;
>>> @@ -80,10 +82,10 @@ typedef struct DNNModel{
>>>                                  const char *output_name, int
>> *output_width, int *output_height);
>>>      // set the pre process to transfer data from AVFrame to DNNData
>>>      // the default implementation within DNN is used if it is not provided
>> by the filter
>>> -    int (*pre_proc)(AVFrame *frame_in, DNNData *model_input,
>> AVFilterContext *filter_ctx);
>>> +    PRE_POST_PROC pre_proc;
>>>      // set the post process to transfer data from DNNData to AVFrame
>>>      // the default implementation within DNN is used if it is not provided
>> by the filter
>>> -    int (*post_proc)(AVFrame *frame_out, DNNData *model_output,
>> AVFilterContext *filter_ctx);
>>> +    PRE_POST_PROC post_proc;
>>
>> Spurious change.
> 
> sorry, not quite understand this comment. It is used for code refine. 
> Maybe I need to let it in a separate patch.
> 
>>
>>>  } DNNModel;
>>>
>>> +
>>> +    frame->private_ref = av_buffer_alloc(sizeof(*header) + sizeof(*bbox) *
>> nb_bbox);
>>> +    if (!frame->private_ref) {
>>> +        av_log(filter_ctx, AV_LOG_ERROR, "failed to allocate buffer for %d
>> bboxes\n", nb_bbox);
>>> +        return AVERROR(ENOMEM);;
>>
>> Double ;
> 
> thanks, will remove it.
> 
>>
>>> +    }
>>> +
>>> +    header = (BoundingBoxHeader *)frame->private_ref->data;
>>> +    strncpy(header->source, ctx->dnnctx.model_filename,
>> sizeof(header->source));
>>> +    header->source[sizeof(header->source) - 1] = '\0';
>>> +    header->bbox_size = sizeof(*bbox);
>>> +
>>> +    bbox = (BoundingBox *)(header + 1);
>>
>> This does not guarantee proper alignment. You could use a flexible array
>> member for that.
> 
> The memory layout of frame->private_ref->data is:
> sizeof(*header) + sizeof(*bbox) + sizeof(*bbox) + ...
> 
> I think 'header + 1' goes correctly to the first bbox. thanks.
> 

header + 1 points to the position where another BoundingBoxHeader would
be if header were an array of BoundingBoxHeaders (i.e. it points
sizeof(BoundingBoxHeader) bytes after header). So this guarantees that
there is no overlap between your header and your actual boxes, but this
does not guarantee proper alignment. This will be a problem on platforms
where int is 64bit and requires eight byte alignment (unlikely) or if
you add something that requires alignment of eight bytes (like a 64bit
integer or a pointer on a 64bit system) to BoundingBox.

>>> +
>>> +        label = av_strdup(buf);
>>> +        if (!label) {
>>> +            av_log(context, AV_LOG_ERROR, "failed to allocate memory
>> for label %s\n", buf);
>>> +            fclose(file);
>>> +            free_detect_labels(ctx);
>>> +            return AVERROR(ENOMEM);
>>> +        }
>>> +
>>> +        if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count, label)
>> < 0) {
>>> +            av_log(context, AV_LOG_ERROR, "failed to do
>> av_dynarray_add\n");
>>> +            fclose(file);
>>> +            free_detect_labels(ctx);
>>
>> 1. You are leaking label here.
>> 2. You are repeating yourself with the cleanup code.
>> 3. When you return an error in a filter's init function, the filter's uninit function
>> is called automatically. In this case this means that free_detect_labels is called
>> twice which is not only wasteful, but
>> harmful: You are freeing ctx->labels (and all labels contained in it) in the first
>> run, but you are not resetting the number of labels. If
>> ctx->label_count is > 0, there will be a segfault when
>> free_detect_labels is called the second time.
> 
> good catch, will free label, remove the duplicate calling, and also reset ctx->label_count.
> 
>> 4. Return the error code.
>> (5. I consider your use of av_log for every error to be excessive.)
> 
> not quite understand this one, thanks.
> 

It means that it is unnecessary to add an av_log for every failed
allocation; returning the error code is enough.

>>
>>> +            return AVERROR(ENOMEM);
>>> +        }
>>> +    }
>>> +
>>> +    fclose(file);
>>> +    return 0;
>>> +}
>>> +
Guo, Yejun March 1, 2021, 11:47 a.m. UTC | #6
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Andreas Rheinhardt
> Sent: 2021年3月1日 16:31
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter dnn_detect
> for object detection
> 
> Guo, Yejun:
> >
> >
> >> -----Original Message-----
> >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> >> Andreas Rheinhardt
> >> Sent: 2021年3月1日 12:50
> >> To: ffmpeg-devel@ffmpeg.org
> >> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter
> >> dnn_detect for object detection
> >>
> >> Guo, Yejun:
> >>> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> >>> ---
> >>>  configure                              |   1 +
> >>>  doc/filters.texi                       |  40 +++
> >>>  libavfilter/Makefile                   |   1 +
> >>>  libavfilter/allfilters.c               |   1 +
> >>>  libavfilter/dnn/dnn_backend_openvino.c |  12 +
> >>>  libavfilter/dnn_filter_common.c        |   7 +
> >>>  libavfilter/dnn_filter_common.h        |   1 +
> >>>  libavfilter/dnn_interface.h            |   6 +-
> >>>  libavfilter/vf_dnn_detect.c            | 462
> >> +++++++++++++++++++++++++
> >>>  9 files changed, 529 insertions(+), 2 deletions(-)  create mode
> >
> >>> b/libavfilter/dnn_interface.h index d3a0c58a61..90a08129f4 100644
> >>> --- a/libavfilter/dnn_interface.h
> >>> +++ b/libavfilter/dnn_interface.h
> >>> @@ -63,6 +63,8 @@ typedef struct DNNData{
> >>>      DNNColorOrder order;
> >>>  } DNNData;
> >>>
> >>> +typedef int (*PRE_POST_PROC)(AVFrame *frame, DNNData *model,
> >>> +AVFilterContext *filter_ctx);
> >>
> >> Why uppercase? It is a typedef, not a macro.
> >
> > will change to CamelCase, thanks.
> >
> >>
> >>> +
> >>>  typedef struct DNNModel{
> >>>      // Stores model that can be different for different backends.
> >>>      void *model;
> >>> @@ -80,10 +82,10 @@ typedef struct DNNModel{
> >>>                                  const char *output_name, int
> >> *output_width, int *output_height);
> >>>      // set the pre process to transfer data from AVFrame to DNNData
> >>>      // the default implementation within DNN is used if it is not
> >>> provided
> >> by the filter
> >>> -    int (*pre_proc)(AVFrame *frame_in, DNNData *model_input,
> >> AVFilterContext *filter_ctx);
> >>> +    PRE_POST_PROC pre_proc;
> >>>      // set the post process to transfer data from DNNData to AVFrame
> >>>      // the default implementation within DNN is used if it is not
> >>> provided
> >> by the filter
> >>> -    int (*post_proc)(AVFrame *frame_out, DNNData *model_output,
> >> AVFilterContext *filter_ctx);
> >>> +    PRE_POST_PROC post_proc;
> >>
> >> Spurious change.
> >
> > sorry, not quite understand this comment. It is used for code refine.
> > Maybe I need to let it in a separate patch.
> >
> >>
> >>>  } DNNModel;
> >>>
> >>> +
> >>> +    frame->private_ref = av_buffer_alloc(sizeof(*header) +
> >>> + sizeof(*bbox) *
> >> nb_bbox);
> >>> +    if (!frame->private_ref) {
> >>> +        av_log(filter_ctx, AV_LOG_ERROR, "failed to allocate buffer
> >>> + for %d
> >> bboxes\n", nb_bbox);
> >>> +        return AVERROR(ENOMEM);;
> >>
> >> Double ;
> >
> > thanks, will remove it.
> >
> >>
> >>> +    }
> >>> +
> >>> +    header = (BoundingBoxHeader *)frame->private_ref->data;
> >>> +    strncpy(header->source, ctx->dnnctx.model_filename,
> >> sizeof(header->source));
> >>> +    header->source[sizeof(header->source) - 1] = '\0';
> >>> +    header->bbox_size = sizeof(*bbox);
> >>> +
> >>> +    bbox = (BoundingBox *)(header + 1);
> >>
> >> This does not guarantee proper alignment. You could use a flexible
> >> array member for that.
> >
> > The memory layout of frame->private_ref->data is:
> > sizeof(*header) + sizeof(*bbox) + sizeof(*bbox) + ...
> >
> > I think 'header + 1' goes correctly to the first bbox. thanks.
> >
> 
> header + 1 points to the position where another BoundingBoxHeader would be
> if header were an array of BoundingBoxHeaders (i.e. it points
> sizeof(BoundingBoxHeader) bytes after header). So this guarantees that there is
> no overlap between your header and your actual boxes, but this does not
> guarantee proper alignment. This will be a problem on platforms where int is
> 64bit and requires eight byte alignment (unlikely) or if you add something that
> requires alignment of eight bytes (like a 64bit integer or a pointer on a 64bit
> system) to BoundingBox.

got the point, thanks. Will add 'BoundingBox bboxes[0]' at the end of struct BoundingBoxHeader.

> 
> >>> +
> >>> +        label = av_strdup(buf);
> >>> +        if (!label) {
> >>> +            av_log(context, AV_LOG_ERROR, "failed to allocate
> >>> + memory
> >> for label %s\n", buf);
> >>> +            fclose(file);
> >>> +            free_detect_labels(ctx);
> >>> +            return AVERROR(ENOMEM);
> >>> +        }
> >>> +
> >>> +        if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count,
> >>> + label)
> >> < 0) {
> >>> +            av_log(context, AV_LOG_ERROR, "failed to do
> >> av_dynarray_add\n");
> >>> +            fclose(file);
> >>> +            free_detect_labels(ctx);
> >>
> >> 1. You are leaking label here.
> >> 2. You are repeating yourself with the cleanup code.
> >> 3. When you return an error in a filter's init function, the filter's
> >> uninit function is called automatically. In this case this means that
> >> free_detect_labels is called twice which is not only wasteful, but
> >> harmful: You are freeing ctx->labels (and all labels contained in it)
> >> in the first run, but you are not resetting the number of labels. If
> >> ctx->label_count is > 0, there will be a segfault when
> >> free_detect_labels is called the second time.
> >
> > good catch, will free label, remove the duplicate calling, and also reset
> ctx->label_count.
> >
> >> 4. Return the error code.
> >> (5. I consider your use of av_log for every error to be excessive.)
> >
> > not quite understand this one, thanks.
> >
> 
> It means that it is unnecessary to add an av_log for every failed allocation;
> returning the error code is enough.
> 
> >>
> >>> +            return AVERROR(ENOMEM);
> >>> +        }
> >>> +    }
> >>> +
> >>> +    fclose(file);
> >>> +    return 0;
> >>> +}
> >>> +
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org
> with subject "unsubscribe".
Andreas Rheinhardt March 1, 2021, 12:13 p.m. UTC | #7
Guo, Yejun:
> 
> 
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>> Andreas Rheinhardt
>> Sent: 2021年3月1日 16:31
>> To: ffmpeg-devel@ffmpeg.org
>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter dnn_detect
>> for object detection
>>
>> Guo, Yejun:
>>>
>>>
>>>> -----Original Message-----
>>>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>>>> Andreas Rheinhardt
>>>> Sent: 2021年3月1日 12:50
>>>> To: ffmpeg-devel@ffmpeg.org
>>>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter
>>>> dnn_detect for object detection
>>>>
>>>> Guo, Yejun:
>>>>> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
>>>>> ---
>>>>>  configure                              |   1 +
>>>>>  doc/filters.texi                       |  40 +++
>>>>>  libavfilter/Makefile                   |   1 +
>>>>>  libavfilter/allfilters.c               |   1 +
>>>>>  libavfilter/dnn/dnn_backend_openvino.c |  12 +
>>>>>  libavfilter/dnn_filter_common.c        |   7 +
>>>>>  libavfilter/dnn_filter_common.h        |   1 +
>>>>>  libavfilter/dnn_interface.h            |   6 +-
>>>>>  libavfilter/vf_dnn_detect.c            | 462
>>>> +++++++++++++++++++++++++
>>>>>  9 files changed, 529 insertions(+), 2 deletions(-)  create mode
>>>
>>>>> b/libavfilter/dnn_interface.h index d3a0c58a61..90a08129f4 100644
>>>>> --- a/libavfilter/dnn_interface.h
>>>>> +++ b/libavfilter/dnn_interface.h
>>>>> @@ -63,6 +63,8 @@ typedef struct DNNData{
>>>>>      DNNColorOrder order;
>>>>>  } DNNData;
>>>>>
>>>>> +typedef int (*PRE_POST_PROC)(AVFrame *frame, DNNData *model,
>>>>> +AVFilterContext *filter_ctx);
>>>>
>>>> Why uppercase? It is a typedef, not a macro.
>>>
>>> will change to CamelCase, thanks.
>>>
>>>>
>>>>> +
>>>>>  typedef struct DNNModel{
>>>>>      // Stores model that can be different for different backends.
>>>>>      void *model;
>>>>> @@ -80,10 +82,10 @@ typedef struct DNNModel{
>>>>>                                  const char *output_name, int
>>>> *output_width, int *output_height);
>>>>>      // set the pre process to transfer data from AVFrame to DNNData
>>>>>      // the default implementation within DNN is used if it is not
>>>>> provided
>>>> by the filter
>>>>> -    int (*pre_proc)(AVFrame *frame_in, DNNData *model_input,
>>>> AVFilterContext *filter_ctx);
>>>>> +    PRE_POST_PROC pre_proc;
>>>>>      // set the post process to transfer data from DNNData to AVFrame
>>>>>      // the default implementation within DNN is used if it is not
>>>>> provided
>>>> by the filter
>>>>> -    int (*post_proc)(AVFrame *frame_out, DNNData *model_output,
>>>> AVFilterContext *filter_ctx);
>>>>> +    PRE_POST_PROC post_proc;
>>>>
>>>> Spurious change.
>>>
>>> sorry, not quite understand this comment. It is used for code refine.
>>> Maybe I need to let it in a separate patch.
>>>
>>>>
>>>>>  } DNNModel;
>>>>>
>>>>> +
>>>>> +    frame->private_ref = av_buffer_alloc(sizeof(*header) +
>>>>> + sizeof(*bbox) *
>>>> nb_bbox);
>>>>> +    if (!frame->private_ref) {
>>>>> +        av_log(filter_ctx, AV_LOG_ERROR, "failed to allocate buffer
>>>>> + for %d
>>>> bboxes\n", nb_bbox);
>>>>> +        return AVERROR(ENOMEM);;
>>>>
>>>> Double ;
>>>
>>> thanks, will remove it.
>>>
>>>>
>>>>> +    }
>>>>> +
>>>>> +    header = (BoundingBoxHeader *)frame->private_ref->data;
>>>>> +    strncpy(header->source, ctx->dnnctx.model_filename,
>>>> sizeof(header->source));
>>>>> +    header->source[sizeof(header->source) - 1] = '\0';
>>>>> +    header->bbox_size = sizeof(*bbox);
>>>>> +
>>>>> +    bbox = (BoundingBox *)(header + 1);
>>>>
>>>> This does not guarantee proper alignment. You could use a flexible
>>>> array member for that.
>>>
>>> The memory layout of frame->private_ref->data is:
>>> sizeof(*header) + sizeof(*bbox) + sizeof(*bbox) + ...
>>>
>>> I think 'header + 1' goes correctly to the first bbox. thanks.
>>>
>>
>> header + 1 points to the position where another BoundingBoxHeader would be
>> if header were an array of BoundingBoxHeaders (i.e. it points
>> sizeof(BoundingBoxHeader) bytes after header). So this guarantees that there is
>> no overlap between your header and your actual boxes, but this does not
>> guarantee proper alignment. This will be a problem on platforms where int is
>> 64bit and requires eight byte alignment (unlikely) or if you add something that
>> requires alignment of eight bytes (like a 64bit integer or a pointer on a 64bit
>> system) to BoundingBox.
> 
> got the point, thanks. Will add 'BoundingBox bboxes[0]' at the end of struct BoundingBoxHeader.
> 

An array with zero elements is not allowed in ISO C (see 6.7.6.2 1 in
C11 or 6.7.5.2 1 in C99), although many compilers support it.

>>
>>>>> +
>>>>> +        label = av_strdup(buf);
>>>>> +        if (!label) {
>>>>> +            av_log(context, AV_LOG_ERROR, "failed to allocate
>>>>> + memory
>>>> for label %s\n", buf);
>>>>> +            fclose(file);
>>>>> +            free_detect_labels(ctx);
>>>>> +            return AVERROR(ENOMEM);
>>>>> +        }
>>>>> +
>>>>> +        if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count,
>>>>> + label)
>>>> < 0) {
>>>>> +            av_log(context, AV_LOG_ERROR, "failed to do
>>>> av_dynarray_add\n");
>>>>> +            fclose(file);
>>>>> +            free_detect_labels(ctx);
>>>>
>>>> 1. You are leaking label here.
>>>> 2. You are repeating yourself with the cleanup code.
>>>> 3. When you return an error in a filter's init function, the filter's
>>>> uninit function is called automatically. In this case this means that
>>>> free_detect_labels is called twice which is not only wasteful, but
>>>> harmful: You are freeing ctx->labels (and all labels contained in it)
>>>> in the first run, but you are not resetting the number of labels. If
>>>> ctx->label_count is > 0, there will be a segfault when
>>>> free_detect_labels is called the second time.
>>>
>>> good catch, will free label, remove the duplicate calling, and also reset
>> ctx->label_count.
>>>
>>>> 4. Return the error code.
>>>> (5. I consider your use of av_log for every error to be excessive.)
>>>
>>> not quite understand this one, thanks.
>>>
>>
>> It means that it is unnecessary to add an av_log for every failed allocation;
>> returning the error code is enough.
>>
>>>>
>>>>> +            return AVERROR(ENOMEM);
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    fclose(file);
>>>>> +    return 0;
>>>>> +}
>>>>> +
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org
>> with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Guo, Yejun March 1, 2021, 12:19 p.m. UTC | #8
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Andreas Rheinhardt
> Sent: 2021年3月1日 20:13
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter dnn_detect
> for object detection
> 
> Guo, Yejun:
> >
> >
> >> -----Original Message-----
> >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> >> Andreas Rheinhardt
> >> Sent: 2021年3月1日 16:31
> >> To: ffmpeg-devel@ffmpeg.org
> >> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter
> >> dnn_detect for object detection
> >>
> >> Guo, Yejun:
> >>>
> >>>
> >>>> -----Original Message-----
> >>>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> >>>> Andreas Rheinhardt
> >>>> Sent: 2021年3月1日 12:50
> >>>> To: ffmpeg-devel@ffmpeg.org
> >>>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter
> >>>> dnn_detect for object detection
> >>>>
> >>>> Guo, Yejun:
> >>>>> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> >>>>> ---
> >>>>>  configure                              |   1 +
> >>>>>  doc/filters.texi                       |  40 +++
> >>>>>  libavfilter/Makefile                   |   1 +
> >>>>>  libavfilter/allfilters.c               |   1 +
> >>>>>  libavfilter/dnn/dnn_backend_openvino.c |  12 +
> >>>>>  libavfilter/dnn_filter_common.c        |   7 +
> >>>>>  libavfilter/dnn_filter_common.h        |   1 +
> >>>>>  libavfilter/dnn_interface.h            |   6 +-
> >>>>>  libavfilter/vf_dnn_detect.c            | 462
> >>>> +++++++++++++++++++++++++
> >>>>>  9 files changed, 529 insertions(+), 2 deletions(-)  create mode
> >>>
> >>>>> b/libavfilter/dnn_interface.h index d3a0c58a61..90a08129f4 100644
> >>>>> --- a/libavfilter/dnn_interface.h
> >>>>> +++ b/libavfilter/dnn_interface.h
> >>>>> @@ -63,6 +63,8 @@ typedef struct DNNData{
> >>>>>      DNNColorOrder order;
> >>>>>  } DNNData;
> >>>>>
> >>>>> +typedef int (*PRE_POST_PROC)(AVFrame *frame, DNNData *model,
> >>>>> +AVFilterContext *filter_ctx);
> >>>>
> >>>> Why uppercase? It is a typedef, not a macro.
> >>>
> >>> will change to CamelCase, thanks.
> >>>
> >>>>
> >>>>> +
> >>>>>  typedef struct DNNModel{
> >>>>>      // Stores model that can be different for different backends.
> >>>>>      void *model;
> >>>>> @@ -80,10 +82,10 @@ typedef struct DNNModel{
> >>>>>                                  const char *output_name, int
> >>>> *output_width, int *output_height);
> >>>>>      // set the pre process to transfer data from AVFrame to DNNData
> >>>>>      // the default implementation within DNN is used if it is not
> >>>>> provided
> >>>> by the filter
> >>>>> -    int (*pre_proc)(AVFrame *frame_in, DNNData *model_input,
> >>>> AVFilterContext *filter_ctx);
> >>>>> +    PRE_POST_PROC pre_proc;
> >>>>>      // set the post process to transfer data from DNNData to
> AVFrame
> >>>>>      // the default implementation within DNN is used if it is not
> >>>>> provided
> >>>> by the filter
> >>>>> -    int (*post_proc)(AVFrame *frame_out, DNNData *model_output,
> >>>> AVFilterContext *filter_ctx);
> >>>>> +    PRE_POST_PROC post_proc;
> >>>>
> >>>> Spurious change.
> >>>
> >>> sorry, not quite understand this comment. It is used for code refine.
> >>> Maybe I need to let it in a separate patch.
> >>>
> >>>>
> >>>>>  } DNNModel;
> >>>>>
> >>>>> +
> >>>>> +    frame->private_ref = av_buffer_alloc(sizeof(*header) +
> >>>>> + sizeof(*bbox) *
> >>>> nb_bbox);
> >>>>> +    if (!frame->private_ref) {
> >>>>> +        av_log(filter_ctx, AV_LOG_ERROR, "failed to allocate
> >>>>> + buffer for %d
> >>>> bboxes\n", nb_bbox);
> >>>>> +        return AVERROR(ENOMEM);;
> >>>>
> >>>> Double ;
> >>>
> >>> thanks, will remove it.
> >>>
> >>>>
> >>>>> +    }
> >>>>> +
> >>>>> +    header = (BoundingBoxHeader *)frame->private_ref->data;
> >>>>> +    strncpy(header->source, ctx->dnnctx.model_filename,
> >>>> sizeof(header->source));
> >>>>> +    header->source[sizeof(header->source) - 1] = '\0';
> >>>>> +    header->bbox_size = sizeof(*bbox);
> >>>>> +
> >>>>> +    bbox = (BoundingBox *)(header + 1);
> >>>>
> >>>> This does not guarantee proper alignment. You could use a flexible
> >>>> array member for that.
> >>>
> >>> The memory layout of frame->private_ref->data is:
> >>> sizeof(*header) + sizeof(*bbox) + sizeof(*bbox) + ...
> >>>
> >>> I think 'header + 1' goes correctly to the first bbox. thanks.
> >>>
> >>
> >> header + 1 points to the position where another BoundingBoxHeader
> >> would be if header were an array of BoundingBoxHeaders (i.e. it
> >> points
> >> sizeof(BoundingBoxHeader) bytes after header). So this guarantees
> >> that there is no overlap between your header and your actual boxes,
> >> but this does not guarantee proper alignment. This will be a problem
> >> on platforms where int is 64bit and requires eight byte alignment
> >> (unlikely) or if you add something that requires alignment of eight
> >> bytes (like a 64bit integer or a pointer on a 64bit
> >> system) to BoundingBox.
> >
> > got the point, thanks. Will add 'BoundingBox bboxes[0]' at the end of struct
> BoundingBoxHeader.
> >
> 
> An array with zero elements is not allowed in ISO C (see 6.7.6.2 1 in
> C11 or 6.7.5.2 1 in C99), although many compilers support it.

thanks for the info, this struct is expected to be in side_data in the future, 
I'll add 'bboxes[1]' in it, and allocate sizeof(*header) + (nb_bbox - 1) * sizeof(*bbox).
Andreas Rheinhardt March 1, 2021, 12:24 p.m. UTC | #9
Guo, Yejun:
> 
> 
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>> Andreas Rheinhardt
>> Sent: 2021年3月1日 20:13
>> To: ffmpeg-devel@ffmpeg.org
>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter dnn_detect
>> for object detection
>>
>> Guo, Yejun:
>>>
>>>
>>>> -----Original Message-----
>>>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>>>> Andreas Rheinhardt
>>>> Sent: 2021年3月1日 16:31
>>>> To: ffmpeg-devel@ffmpeg.org
>>>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter
>>>> dnn_detect for object detection
>>>>
>>>> Guo, Yejun:
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>>>>>> Andreas Rheinhardt
>>>>>> Sent: 2021年3月1日 12:50
>>>>>> To: ffmpeg-devel@ffmpeg.org
>>>>>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter
>>>>>> dnn_detect for object detection
>>>>>>
>>>>>> Guo, Yejun:
>>>>>>> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
>>>>>>> ---
>>>>>>>  configure                              |   1 +
>>>>>>>  doc/filters.texi                       |  40 +++
>>>>>>>  libavfilter/Makefile                   |   1 +
>>>>>>>  libavfilter/allfilters.c               |   1 +
>>>>>>>  libavfilter/dnn/dnn_backend_openvino.c |  12 +
>>>>>>>  libavfilter/dnn_filter_common.c        |   7 +
>>>>>>>  libavfilter/dnn_filter_common.h        |   1 +
>>>>>>>  libavfilter/dnn_interface.h            |   6 +-
>>>>>>>  libavfilter/vf_dnn_detect.c            | 462
>>>>>> +++++++++++++++++++++++++
>>>>>>>  9 files changed, 529 insertions(+), 2 deletions(-)  create mode
>>>>>
>>>>>>> b/libavfilter/dnn_interface.h index d3a0c58a61..90a08129f4 100644
>>>>>>> --- a/libavfilter/dnn_interface.h
>>>>>>> +++ b/libavfilter/dnn_interface.h
>>>>>>> @@ -63,6 +63,8 @@ typedef struct DNNData{
>>>>>>>      DNNColorOrder order;
>>>>>>>  } DNNData;
>>>>>>>
>>>>>>> +typedef int (*PRE_POST_PROC)(AVFrame *frame, DNNData *model,
>>>>>>> +AVFilterContext *filter_ctx);
>>>>>>
>>>>>> Why uppercase? It is a typedef, not a macro.
>>>>>
>>>>> will change to CamelCase, thanks.
>>>>>
>>>>>>
>>>>>>> +
>>>>>>>  typedef struct DNNModel{
>>>>>>>      // Stores model that can be different for different backends.
>>>>>>>      void *model;
>>>>>>> @@ -80,10 +82,10 @@ typedef struct DNNModel{
>>>>>>>                                  const char *output_name, int
>>>>>> *output_width, int *output_height);
>>>>>>>      // set the pre process to transfer data from AVFrame to DNNData
>>>>>>>      // the default implementation within DNN is used if it is not
>>>>>>> provided
>>>>>> by the filter
>>>>>>> -    int (*pre_proc)(AVFrame *frame_in, DNNData *model_input,
>>>>>> AVFilterContext *filter_ctx);
>>>>>>> +    PRE_POST_PROC pre_proc;
>>>>>>>      // set the post process to transfer data from DNNData to
>> AVFrame
>>>>>>>      // the default implementation within DNN is used if it is not
>>>>>>> provided
>>>>>> by the filter
>>>>>>> -    int (*post_proc)(AVFrame *frame_out, DNNData *model_output,
>>>>>> AVFilterContext *filter_ctx);
>>>>>>> +    PRE_POST_PROC post_proc;
>>>>>>
>>>>>> Spurious change.
>>>>>
>>>>> sorry, not quite understand this comment. It is used for code refine.
>>>>> Maybe I need to let it in a separate patch.
>>>>>
>>>>>>
>>>>>>>  } DNNModel;
>>>>>>>
>>>>>>> +
>>>>>>> +    frame->private_ref = av_buffer_alloc(sizeof(*header) +
>>>>>>> + sizeof(*bbox) *
>>>>>> nb_bbox);
>>>>>>> +    if (!frame->private_ref) {
>>>>>>> +        av_log(filter_ctx, AV_LOG_ERROR, "failed to allocate
>>>>>>> + buffer for %d
>>>>>> bboxes\n", nb_bbox);
>>>>>>> +        return AVERROR(ENOMEM);;
>>>>>>
>>>>>> Double ;
>>>>>
>>>>> thanks, will remove it.
>>>>>
>>>>>>
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    header = (BoundingBoxHeader *)frame->private_ref->data;
>>>>>>> +    strncpy(header->source, ctx->dnnctx.model_filename,
>>>>>> sizeof(header->source));
>>>>>>> +    header->source[sizeof(header->source) - 1] = '\0';
>>>>>>> +    header->bbox_size = sizeof(*bbox);
>>>>>>> +
>>>>>>> +    bbox = (BoundingBox *)(header + 1);
>>>>>>
>>>>>> This does not guarantee proper alignment. You could use a flexible
>>>>>> array member for that.
>>>>>
>>>>> The memory layout of frame->private_ref->data is:
>>>>> sizeof(*header) + sizeof(*bbox) + sizeof(*bbox) + ...
>>>>>
>>>>> I think 'header + 1' goes correctly to the first bbox. thanks.
>>>>>
>>>>
>>>> header + 1 points to the position where another BoundingBoxHeader
>>>> would be if header were an array of BoundingBoxHeaders (i.e. it
>>>> points
>>>> sizeof(BoundingBoxHeader) bytes after header). So this guarantees
>>>> that there is no overlap between your header and your actual boxes,
>>>> but this does not guarantee proper alignment. This will be a problem
>>>> on platforms where int is 64bit and requires eight byte alignment
>>>> (unlikely) or if you add something that requires alignment of eight
>>>> bytes (like a 64bit integer or a pointer on a 64bit
>>>> system) to BoundingBox.
>>>
>>> got the point, thanks. Will add 'BoundingBox bboxes[0]' at the end of struct
>> BoundingBoxHeader.
>>>
>>
>> An array with zero elements is not allowed in ISO C (see 6.7.6.2 1 in
>> C11 or 6.7.5.2 1 in C99), although many compilers support it.
> 
> thanks for the info, this struct is expected to be in side_data in the future, 
> I'll add 'bboxes[1]' in it, and allocate sizeof(*header) + (nb_bbox - 1) * sizeof(*bbox).

Notice that in this case it is undefined behaviour to access any of the
boxes outside of BoundingBoxHeader (i.e. when using header->bboxes[i],
the compiler is allowed to infer that i == 0 as all other cases would be
undefined behaviour).

- Andreas
Guo, Yejun March 1, 2021, 12:31 p.m. UTC | #10
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Andreas Rheinhardt
> Sent: 2021年3月1日 20:24
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter dnn_detect
> for object detection
> 
> Guo, Yejun:
> >
> >
> >> -----Original Message-----
> >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> >> Andreas Rheinhardt
> >> Sent: 2021年3月1日 20:13
> >> To: ffmpeg-devel@ffmpeg.org
> >> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter dnn_detect
> >> for object detection
> >>
> >> Guo, Yejun:
> >>>
> >>>
> >>>> -----Original Message-----
> >>>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> >>>> Andreas Rheinhardt
> >>>> Sent: 2021年3月1日 16:31
> >>>> To: ffmpeg-devel@ffmpeg.org
> >>>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter
> >>>> dnn_detect for object detection
> >>>>
> >>>> Guo, Yejun:
> >>>>>
> >>>>>
> >>>>>> -----Original Message-----
> >>>>>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> >>>>>> Andreas Rheinhardt
> >>>>>> Sent: 2021年3月1日 12:50
> >>>>>> To: ffmpeg-devel@ffmpeg.org
> >>>>>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter
> >>>>>> dnn_detect for object detection
> >>>>>>
> >>>>>> Guo, Yejun:
> >>>>>>> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> >>>>>>> ---
> >>>>>>>  configure                              |   1 +
> >>>>>>>  doc/filters.texi                       |  40 +++
> >>>>>>>  libavfilter/Makefile                   |   1 +
> >>>>>>>  libavfilter/allfilters.c               |   1 +
> >>>>>>>  libavfilter/dnn/dnn_backend_openvino.c |  12 +
> >>>>>>>  libavfilter/dnn_filter_common.c        |   7 +
> >>>>>>>  libavfilter/dnn_filter_common.h        |   1 +
> >>>>>>>  libavfilter/dnn_interface.h            |   6 +-
> >>>>>>>  libavfilter/vf_dnn_detect.c            | 462
> >>>>>> +++++++++++++++++++++++++
> >>>>>>>  9 files changed, 529 insertions(+), 2 deletions(-)  create mode
> >>>>>
> >>>>>>> b/libavfilter/dnn_interface.h index d3a0c58a61..90a08129f4 100644
> >>>>>>> --- a/libavfilter/dnn_interface.h
> >>>>>>> +++ b/libavfilter/dnn_interface.h
> >>>>>>> @@ -63,6 +63,8 @@ typedef struct DNNData{
> >>>>>>>      DNNColorOrder order;
> >>>>>>>  } DNNData;
> >>>>>>>
> >>>>>>> +typedef int (*PRE_POST_PROC)(AVFrame *frame, DNNData *model,
> >>>>>>> +AVFilterContext *filter_ctx);
> >>>>>>
> >>>>>> Why uppercase? It is a typedef, not a macro.
> >>>>>
> >>>>> will change to CamelCase, thanks.
> >>>>>
> >>>>>>
> >>>>>>> +
> >>>>>>>  typedef struct DNNModel{
> >>>>>>>      // Stores model that can be different for different backends.
> >>>>>>>      void *model;
> >>>>>>> @@ -80,10 +82,10 @@ typedef struct DNNModel{
> >>>>>>>                                  const char *output_name, int
> >>>>>> *output_width, int *output_height);
> >>>>>>>      // set the pre process to transfer data from AVFrame to
> DNNData
> >>>>>>>      // the default implementation within DNN is used if it is not
> >>>>>>> provided
> >>>>>> by the filter
> >>>>>>> -    int (*pre_proc)(AVFrame *frame_in, DNNData *model_input,
> >>>>>> AVFilterContext *filter_ctx);
> >>>>>>> +    PRE_POST_PROC pre_proc;
> >>>>>>>      // set the post process to transfer data from DNNData to
> >> AVFrame
> >>>>>>>      // the default implementation within DNN is used if it is not
> >>>>>>> provided
> >>>>>> by the filter
> >>>>>>> -    int (*post_proc)(AVFrame *frame_out, DNNData
> *model_output,
> >>>>>> AVFilterContext *filter_ctx);
> >>>>>>> +    PRE_POST_PROC post_proc;
> >>>>>>
> >>>>>> Spurious change.
> >>>>>
> >>>>> sorry, not quite understand this comment. It is used for code refine.
> >>>>> Maybe I need to let it in a separate patch.
> >>>>>
> >>>>>>
> >>>>>>>  } DNNModel;
> >>>>>>>
> >>>>>>> +
> >>>>>>> +    frame->private_ref = av_buffer_alloc(sizeof(*header) +
> >>>>>>> + sizeof(*bbox) *
> >>>>>> nb_bbox);
> >>>>>>> +    if (!frame->private_ref) {
> >>>>>>> +        av_log(filter_ctx, AV_LOG_ERROR, "failed to allocate
> >>>>>>> + buffer for %d
> >>>>>> bboxes\n", nb_bbox);
> >>>>>>> +        return AVERROR(ENOMEM);;
> >>>>>>
> >>>>>> Double ;
> >>>>>
> >>>>> thanks, will remove it.
> >>>>>
> >>>>>>
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    header = (BoundingBoxHeader *)frame->private_ref->data;
> >>>>>>> +    strncpy(header->source, ctx->dnnctx.model_filename,
> >>>>>> sizeof(header->source));
> >>>>>>> +    header->source[sizeof(header->source) - 1] = '\0';
> >>>>>>> +    header->bbox_size = sizeof(*bbox);
> >>>>>>> +
> >>>>>>> +    bbox = (BoundingBox *)(header + 1);
> >>>>>>
> >>>>>> This does not guarantee proper alignment. You could use a flexible
> >>>>>> array member for that.
> >>>>>
> >>>>> The memory layout of frame->private_ref->data is:
> >>>>> sizeof(*header) + sizeof(*bbox) + sizeof(*bbox) + ...
> >>>>>
> >>>>> I think 'header + 1' goes correctly to the first bbox. thanks.
> >>>>>
> >>>>
> >>>> header + 1 points to the position where another BoundingBoxHeader
> >>>> would be if header were an array of BoundingBoxHeaders (i.e. it
> >>>> points
> >>>> sizeof(BoundingBoxHeader) bytes after header). So this guarantees
> >>>> that there is no overlap between your header and your actual boxes,
> >>>> but this does not guarantee proper alignment. This will be a problem
> >>>> on platforms where int is 64bit and requires eight byte alignment
> >>>> (unlikely) or if you add something that requires alignment of eight
> >>>> bytes (like a 64bit integer or a pointer on a 64bit
> >>>> system) to BoundingBox.
> >>>
> >>> got the point, thanks. Will add 'BoundingBox bboxes[0]' at the end of
> struct
> >> BoundingBoxHeader.
> >>>
> >>
> >> An array with zero elements is not allowed in ISO C (see 6.7.6.2 1 in
> >> C11 or 6.7.5.2 1 in C99), although many compilers support it.
> >
> > thanks for the info, this struct is expected to be in side_data in the future,
> > I'll add 'bboxes[1]' in it, and allocate sizeof(*header) + (nb_bbox - 1) *
> sizeof(*bbox).
> 
> Notice that in this case it is undefined behaviour to access any of the
> boxes outside of BoundingBoxHeader (i.e. when using header->bboxes[i],
> the compiler is allowed to infer that i == 0 as all other cases would be
> undefined behaviour).
> 
> - Andreas

Thanks for let me know it. I'll not use header->bboxes[i], but will use:
bbox = header->bboxes;
bbox++;

and i'll also add comment in code to warn: don't use header->bboxes[i].
Andreas Rheinhardt March 1, 2021, 1:50 p.m. UTC | #11
Guo, Yejun:
> 
> 
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>> Andreas Rheinhardt
>> Sent: 2021年3月1日 20:24
>> To: ffmpeg-devel@ffmpeg.org
>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter dnn_detect
>> for object detection
>>
>> Guo, Yejun:
>>>
>>>
>>>> -----Original Message-----
>>>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>>>> Andreas Rheinhardt
>>>> Sent: 2021年3月1日 20:13
>>>> To: ffmpeg-devel@ffmpeg.org
>>>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter dnn_detect
>>>> for object detection
>>>>
>>>> Guo, Yejun:
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>>>>>> Andreas Rheinhardt
>>>>>> Sent: 2021年3月1日 16:31
>>>>>> To: ffmpeg-devel@ffmpeg.org
>>>>>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter
>>>>>> dnn_detect for object detection
>>>>>>
>>>>>> Guo, Yejun:
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>>>>>>>> Andreas Rheinhardt
>>>>>>>> Sent: 2021年3月1日 12:50
>>>>>>>> To: ffmpeg-devel@ffmpeg.org
>>>>>>>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter
>>>>>>>> dnn_detect for object detection
>>>>>>>>
>>>>>>>> Guo, Yejun:
>>>>>>>>> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
>>>>>>>>> ---
>>>>>>>>>  configure                              |   1 +
>>>>>>>>>  doc/filters.texi                       |  40 +++
>>>>>>>>>  libavfilter/Makefile                   |   1 +
>>>>>>>>>  libavfilter/allfilters.c               |   1 +
>>>>>>>>>  libavfilter/dnn/dnn_backend_openvino.c |  12 +
>>>>>>>>>  libavfilter/dnn_filter_common.c        |   7 +
>>>>>>>>>  libavfilter/dnn_filter_common.h        |   1 +
>>>>>>>>>  libavfilter/dnn_interface.h            |   6 +-
>>>>>>>>>  libavfilter/vf_dnn_detect.c            | 462
>>>>>>>> +++++++++++++++++++++++++
>>>>>>>>>  9 files changed, 529 insertions(+), 2 deletions(-)  create mode
>>>>>>>
>>>>>>>>> b/libavfilter/dnn_interface.h index d3a0c58a61..90a08129f4 100644
>>>>>>>>> --- a/libavfilter/dnn_interface.h
>>>>>>>>> +++ b/libavfilter/dnn_interface.h
>>>>>>>>> @@ -63,6 +63,8 @@ typedef struct DNNData{
>>>>>>>>>      DNNColorOrder order;
>>>>>>>>>  } DNNData;
>>>>>>>>>
>>>>>>>>> +typedef int (*PRE_POST_PROC)(AVFrame *frame, DNNData *model,
>>>>>>>>> +AVFilterContext *filter_ctx);
>>>>>>>>
>>>>>>>> Why uppercase? It is a typedef, not a macro.
>>>>>>>
>>>>>>> will change to CamelCase, thanks.
>>>>>>>
>>>>>>>>
>>>>>>>>> +
>>>>>>>>>  typedef struct DNNModel{
>>>>>>>>>      // Stores model that can be different for different backends.
>>>>>>>>>      void *model;
>>>>>>>>> @@ -80,10 +82,10 @@ typedef struct DNNModel{
>>>>>>>>>                                  const char *output_name, int
>>>>>>>> *output_width, int *output_height);
>>>>>>>>>      // set the pre process to transfer data from AVFrame to
>> DNNData
>>>>>>>>>      // the default implementation within DNN is used if it is not
>>>>>>>>> provided
>>>>>>>> by the filter
>>>>>>>>> -    int (*pre_proc)(AVFrame *frame_in, DNNData *model_input,
>>>>>>>> AVFilterContext *filter_ctx);
>>>>>>>>> +    PRE_POST_PROC pre_proc;
>>>>>>>>>      // set the post process to transfer data from DNNData to
>>>> AVFrame
>>>>>>>>>      // the default implementation within DNN is used if it is not
>>>>>>>>> provided
>>>>>>>> by the filter
>>>>>>>>> -    int (*post_proc)(AVFrame *frame_out, DNNData
>> *model_output,
>>>>>>>> AVFilterContext *filter_ctx);
>>>>>>>>> +    PRE_POST_PROC post_proc;
>>>>>>>>
>>>>>>>> Spurious change.
>>>>>>>
>>>>>>> sorry, not quite understand this comment. It is used for code refine.
>>>>>>> Maybe I need to let it in a separate patch.
>>>>>>>
>>>>>>>>
>>>>>>>>>  } DNNModel;
>>>>>>>>>
>>>>>>>>> +
>>>>>>>>> +    frame->private_ref = av_buffer_alloc(sizeof(*header) +
>>>>>>>>> + sizeof(*bbox) *
>>>>>>>> nb_bbox);
>>>>>>>>> +    if (!frame->private_ref) {
>>>>>>>>> +        av_log(filter_ctx, AV_LOG_ERROR, "failed to allocate
>>>>>>>>> + buffer for %d
>>>>>>>> bboxes\n", nb_bbox);
>>>>>>>>> +        return AVERROR(ENOMEM);;
>>>>>>>>
>>>>>>>> Double ;
>>>>>>>
>>>>>>> thanks, will remove it.
>>>>>>>
>>>>>>>>
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    header = (BoundingBoxHeader *)frame->private_ref->data;
>>>>>>>>> +    strncpy(header->source, ctx->dnnctx.model_filename,
>>>>>>>> sizeof(header->source));
>>>>>>>>> +    header->source[sizeof(header->source) - 1] = '\0';
>>>>>>>>> +    header->bbox_size = sizeof(*bbox);
>>>>>>>>> +
>>>>>>>>> +    bbox = (BoundingBox *)(header + 1);
>>>>>>>>
>>>>>>>> This does not guarantee proper alignment. You could use a flexible
>>>>>>>> array member for that.
>>>>>>>
>>>>>>> The memory layout of frame->private_ref->data is:
>>>>>>> sizeof(*header) + sizeof(*bbox) + sizeof(*bbox) + ...
>>>>>>>
>>>>>>> I think 'header + 1' goes correctly to the first bbox. thanks.
>>>>>>>
>>>>>>
>>>>>> header + 1 points to the position where another BoundingBoxHeader
>>>>>> would be if header were an array of BoundingBoxHeaders (i.e. it
>>>>>> points
>>>>>> sizeof(BoundingBoxHeader) bytes after header). So this guarantees
>>>>>> that there is no overlap between your header and your actual boxes,
>>>>>> but this does not guarantee proper alignment. This will be a problem
>>>>>> on platforms where int is 64bit and requires eight byte alignment
>>>>>> (unlikely) or if you add something that requires alignment of eight
>>>>>> bytes (like a 64bit integer or a pointer on a 64bit
>>>>>> system) to BoundingBox.
>>>>>
>>>>> got the point, thanks. Will add 'BoundingBox bboxes[0]' at the end of
>> struct
>>>> BoundingBoxHeader.
>>>>>
>>>>
>>>> An array with zero elements is not allowed in ISO C (see 6.7.6.2 1 in
>>>> C11 or 6.7.5.2 1 in C99), although many compilers support it.
>>>
>>> thanks for the info, this struct is expected to be in side_data in the future,
>>> I'll add 'bboxes[1]' in it, and allocate sizeof(*header) + (nb_bbox - 1) *
>> sizeof(*bbox).
>>
>> Notice that in this case it is undefined behaviour to access any of the
>> boxes outside of BoundingBoxHeader (i.e. when using header->bboxes[i],
>> the compiler is allowed to infer that i == 0 as all other cases would be
>> undefined behaviour).
>>
>> - Andreas
> 
> Thanks for let me know it. I'll not use header->bboxes[i], but will use:
> bbox = header->bboxes;
> bbox++;
> 
> and i'll also add comment in code to warn: don't use header->bboxes[i].

Well, the spec-compliant and most elegant way to deal with such
situations is a flexible array member: Add "BoundingBox boxes[];" to
BoundingBoxHeader and allocate the memory the way you do now. You could
then access these via header->boxes[i]. But this is a C99 feature that
apparently older versions of MSVC don't support. So I don't know whether
we are allowed to use it or not. Just do whatever works.
Apologies for wasting your time here.

- Andreas
Nicolas George March 1, 2021, 1:57 p.m. UTC | #12
Andreas Rheinhardt (12021-03-01):
> > thanks for the info, this struct is expected to be in side_data in the future, 
> > I'll add 'bboxes[1]' in it, and allocate sizeof(*header) + (nb_bbox - 1) * sizeof(*bbox).
> 
> Notice that in this case it is undefined behaviour to access any of the
> boxes outside of BoundingBoxHeader (i.e. when using header->bboxes[i],
> the compiler is allowed to infer that i == 0 as all other cases would be
> undefined behaviour).

Are you sure about it? Can you quote the standard?

Anyway, even if this is true, we can work around it with an extra
pointer or a cast, possibly wrapped in a macro. Saving a few dynamic
allocation is well worth the unusual code; we should shoo away the
people who oppose to go work on GStreamer or something.

Regards,
Guo, Yejun March 1, 2021, 2:44 p.m. UTC | #13
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Nicolas
> George
> Sent: 2021年3月1日 21:57
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter dnn_detect
> for object detection
> 
> Andreas Rheinhardt (12021-03-01):
> > > thanks for the info, this struct is expected to be in side_data in
> > > the future, I'll add 'bboxes[1]' in it, and allocate sizeof(*header) + (nb_bbox
> - 1) * sizeof(*bbox).
> >
> > Notice that in this case it is undefined behaviour to access any of
> > the boxes outside of BoundingBoxHeader (i.e. when using
> > header->bboxes[i], the compiler is allowed to infer that i == 0 as all
> > other cases would be undefined behaviour).
> 
> Are you sure about it? Can you quote the standard?
> 
> Anyway, even if this is true, we can work around it with an extra pointer or a
> cast, possibly wrapped in a macro. Saving a few dynamic allocation is well
> worth the unusual code; 

Hi, glad to hear there's a good solution, could you share a bit of code as an example,
it would be nice if I could refine my code better when possible, thanks.
Guo, Yejun March 1, 2021, 2:44 p.m. UTC | #14
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Andreas Rheinhardt
> Sent: 2021年3月1日 21:50
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter dnn_detect
> for object detection
> 
> Guo, Yejun:
> >
> >
> >> -----Original Message-----
> >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> >> Andreas Rheinhardt
> >> Sent: 2021年3月1日 20:24
> >> To: ffmpeg-devel@ffmpeg.org
> >> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter
> >> dnn_detect for object detection
> >>
> >> Guo, Yejun:
> >>>
> >>>
> >>>> -----Original Message-----
> >>>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> >>>> Andreas Rheinhardt
> >>>> Sent: 2021年3月1日 20:13
> >>>> To: ffmpeg-devel@ffmpeg.org
> >>>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter
> >>>> dnn_detect for object detection
> >>>>
> >>>> Guo, Yejun:
> >>>>>
> >>>>>
> >>>>>> -----Original Message-----
> >>>>>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> >>>>>> Andreas Rheinhardt
> >>>>>> Sent: 2021年3月1日 16:31
> >>>>>> To: ffmpeg-devel@ffmpeg.org
> >>>>>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add
> >>>>>> filter dnn_detect for object detection
> >>>>>>
> >>>>>> Guo, Yejun:
> >>>>>>>
> >>>>>>>
> >>>>>>>> -----Original Message-----
> >>>>>>>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf
> >>>>>>>> Of Andreas Rheinhardt
> >>>>>>>> Sent: 2021年3月1日 12:50
> >>>>>>>> To: ffmpeg-devel@ffmpeg.org
> >>>>>>>> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add
> >>>>>>>> filter dnn_detect for object detection
> >>>>>>>>
> >>>>>>>> Guo, Yejun:
> >>>>>>>>> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> >>>>>>>>> ---
> >>>>>>>>>  configure                              |   1 +
> >>>>>>>>>  doc/filters.texi                       |  40 +++
> >>>>>>>>>  libavfilter/Makefile                   |   1 +
> >>>>>>>>>  libavfilter/allfilters.c               |   1 +
> >>>>>>>>>  libavfilter/dnn/dnn_backend_openvino.c |  12 +
> >>>>>>>>>  libavfilter/dnn_filter_common.c        |   7 +
> >>>>>>>>>  libavfilter/dnn_filter_common.h        |   1 +
> >>>>>>>>>  libavfilter/dnn_interface.h            |   6 +-
> >>>>>>>>>  libavfilter/vf_dnn_detect.c            | 462
> >>>>>>>> +++++++++++++++++++++++++
> >>>>>>>>>  9 files changed, 529 insertions(+), 2 deletions(-)  create
> >>>>>>>>> mode
> >>>>>>>
> >>>>>>>>> b/libavfilter/dnn_interface.h index d3a0c58a61..90a08129f4
> >>>>>>>>> 100644
> >>>>>>>>> --- a/libavfilter/dnn_interface.h
> >>>>>>>>> +++ b/libavfilter/dnn_interface.h
> >>>>>>>>> @@ -63,6 +63,8 @@ typedef struct DNNData{
> >>>>>>>>>      DNNColorOrder order;
> >>>>>>>>>  } DNNData;
> >>>>>>>>>
> >>>>>>>>> +typedef int (*PRE_POST_PROC)(AVFrame *frame, DNNData
> *model,
> >>>>>>>>> +AVFilterContext *filter_ctx);
> >>>>>>>>
> >>>>>>>> Why uppercase? It is a typedef, not a macro.
> >>>>>>>
> >>>>>>> will change to CamelCase, thanks.
> >>>>>>>
> >>>>>>>>
> >>>>>>>>> +
> >>>>>>>>>  typedef struct DNNModel{
> >>>>>>>>>      // Stores model that can be different for different backends.
> >>>>>>>>>      void *model;
> >>>>>>>>> @@ -80,10 +82,10 @@ typedef struct DNNModel{
> >>>>>>>>>                                  const char *output_name,
> int
> >>>>>>>> *output_width, int *output_height);
> >>>>>>>>>      // set the pre process to transfer data from AVFrame to
> >> DNNData
> >>>>>>>>>      // the default implementation within DNN is used if it is
> >>>>>>>>> not provided
> >>>>>>>> by the filter
> >>>>>>>>> -    int (*pre_proc)(AVFrame *frame_in, DNNData *model_input,
> >>>>>>>> AVFilterContext *filter_ctx);
> >>>>>>>>> +    PRE_POST_PROC pre_proc;
> >>>>>>>>>      // set the post process to transfer data from DNNData to
> >>>> AVFrame
> >>>>>>>>>      // the default implementation within DNN is used if it is
> >>>>>>>>> not provided
> >>>>>>>> by the filter
> >>>>>>>>> -    int (*post_proc)(AVFrame *frame_out, DNNData
> >> *model_output,
> >>>>>>>> AVFilterContext *filter_ctx);
> >>>>>>>>> +    PRE_POST_PROC post_proc;
> >>>>>>>>
> >>>>>>>> Spurious change.
> >>>>>>>
> >>>>>>> sorry, not quite understand this comment. It is used for code refine.
> >>>>>>> Maybe I need to let it in a separate patch.
> >>>>>>>
> >>>>>>>>
> >>>>>>>>>  } DNNModel;
> >>>>>>>>>
> >>>>>>>>> +
> >>>>>>>>> +    frame->private_ref = av_buffer_alloc(sizeof(*header) +
> >>>>>>>>> + sizeof(*bbox) *
> >>>>>>>> nb_bbox);
> >>>>>>>>> +    if (!frame->private_ref) {
> >>>>>>>>> +        av_log(filter_ctx, AV_LOG_ERROR, "failed to allocate
> >>>>>>>>> + buffer for %d
> >>>>>>>> bboxes\n", nb_bbox);
> >>>>>>>>> +        return AVERROR(ENOMEM);;
> >>>>>>>>
> >>>>>>>> Double ;
> >>>>>>>
> >>>>>>> thanks, will remove it.
> >>>>>>>
> >>>>>>>>
> >>>>>>>>> +    }
> >>>>>>>>> +
> >>>>>>>>> +    header = (BoundingBoxHeader *)frame->private_ref->data;
> >>>>>>>>> +    strncpy(header->source, ctx->dnnctx.model_filename,
> >>>>>>>> sizeof(header->source));
> >>>>>>>>> +    header->source[sizeof(header->source) - 1] = '\0';
> >>>>>>>>> +    header->bbox_size = sizeof(*bbox);
> >>>>>>>>> +
> >>>>>>>>> +    bbox = (BoundingBox *)(header + 1);
> >>>>>>>>
> >>>>>>>> This does not guarantee proper alignment. You could use a
> >>>>>>>> flexible array member for that.
> >>>>>>>
> >>>>>>> The memory layout of frame->private_ref->data is:
> >>>>>>> sizeof(*header) + sizeof(*bbox) + sizeof(*bbox) + ...
> >>>>>>>
> >>>>>>> I think 'header + 1' goes correctly to the first bbox. thanks.
> >>>>>>>
> >>>>>>
> >>>>>> header + 1 points to the position where another BoundingBoxHeader
> >>>>>> would be if header were an array of BoundingBoxHeaders (i.e. it
> >>>>>> points
> >>>>>> sizeof(BoundingBoxHeader) bytes after header). So this guarantees
> >>>>>> that there is no overlap between your header and your actual
> >>>>>> boxes, but this does not guarantee proper alignment. This will be
> >>>>>> a problem on platforms where int is 64bit and requires eight byte
> >>>>>> alignment
> >>>>>> (unlikely) or if you add something that requires alignment of
> >>>>>> eight bytes (like a 64bit integer or a pointer on a 64bit
> >>>>>> system) to BoundingBox.
> >>>>>
> >>>>> got the point, thanks. Will add 'BoundingBox bboxes[0]' at the end
> >>>>> of
> >> struct
> >>>> BoundingBoxHeader.
> >>>>>
> >>>>
> >>>> An array with zero elements is not allowed in ISO C (see 6.7.6.2 1
> >>>> in
> >>>> C11 or 6.7.5.2 1 in C99), although many compilers support it.
> >>>
> >>> thanks for the info, this struct is expected to be in side_data in
> >>> the future, I'll add 'bboxes[1]' in it, and allocate sizeof(*header)
> >>> + (nb_bbox - 1) *
> >> sizeof(*bbox).
> >>
> >> Notice that in this case it is undefined behaviour to access any of
> >> the boxes outside of BoundingBoxHeader (i.e. when using
> >> header->bboxes[i], the compiler is allowed to infer that i == 0 as
> >> all other cases would be undefined behaviour).
> >>
> >> - Andreas
> >
> > Thanks for let me know it. I'll not use header->bboxes[i], but will use:
> > bbox = header->bboxes;
> > bbox++;
> >
> > and i'll also add comment in code to warn: don't use header->bboxes[i].
> 
> Well, the spec-compliant and most elegant way to deal with such situations is
> a flexible array member: Add "BoundingBox boxes[];" to BoundingBoxHeader
> and allocate the memory the way you do now. You could then access these via
> header->boxes[i]. But this is a C99 feature that apparently older versions of
> MSVC don't support. So I don't know whether we are allowed to use it or not.
> Just do whatever works.
> Apologies for wasting your time here.
> 

Anyway, I at least learned a lot, thanks.
Nicolas George March 1, 2021, 2:50 p.m. UTC | #15
Guo, Yejun (12021-03-01):
> Hi, glad to hear there's a good solution, could you share a bit of
> code as an example, it would be nice if I could refine my code better
> when possible, thanks.

The best choice is to go with BoundingBox boxes[], as Andreas pointed,
and see if people who use compilers where it does not work manifest.
Probably they will not, and if they do we can decide if we want to
continue supporting these obsolete compilers.

The second best choice is to go with BoundingBox boxes[1], if it is not
proven that the compiler can make assumptions that will break the code.

The third choice would be something like:

#define BOX(p, i) (((BoundingBox *)&(p)->boxes)[i])

or similar. But best try to make the best choice work.

Regards,
Paul B Mahol March 1, 2021, 2:56 p.m. UTC | #16
On Mon, Mar 1, 2021 at 3:51 PM Nicolas George <george@nsup.org> wrote:

> Guo, Yejun (12021-03-01):
> > Hi, glad to hear there's a good solution, could you share a bit of
> > code as an example, it would be nice if I could refine my code better
> > when possible, thanks.
>
> The best choice is to go with BoundingBox boxes[], as Andreas pointed,
> and see if people who use compilers where it does not work manifest.
> Probably they will not, and if they do we can decide if we want to
> continue supporting these obsolete compilers.
>
> The second best choice is to go with BoundingBox boxes[1], if it is not
> proven that the compiler can make assumptions that will break the code.
>
> The third choice would be something like:
>
> #define BOX(p, i) (((BoundingBox *)&(p)->boxes)[i])
>
or similar. But best try to make the best choice work


Why not use frame metadata for this? Instead of clumsy structures?


> .
>
> Regards,
>
> --
>   Nicolas George
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Andreas Rheinhardt March 1, 2021, 3:28 p.m. UTC | #17
Nicolas George:
> Andreas Rheinhardt (12021-03-01):
>>> thanks for the info, this struct is expected to be in side_data in the future, 
>>> I'll add 'bboxes[1]' in it, and allocate sizeof(*header) + (nb_bbox - 1) * sizeof(*bbox).
>>
>> Notice that in this case it is undefined behaviour to access any of the
>> boxes outside of BoundingBoxHeader (i.e. when using header->bboxes[i],
>> the compiler is allowed to infer that i == 0 as all other cases would be
>> undefined behaviour).
> 
> Are you sure about it? Can you quote the standard?
> 
> Anyway, even if this is true, we can work around it with an extra
> pointer or a cast, possibly wrapped in a macro. Saving a few dynamic
> allocation is well worth the unusual code; we should shoo away the
> people who oppose to go work on GStreamer or something.
> 
1. The C standards committee seems to think so (at least, it did so a
few decades ago; the current composition has probably changed considerably):
"The validity of this construct has always been questionable.  In the
response to one Defect Report, the Committee decided that it was
undefined behavior because the array p->itemscontains only one item,
irrespective of whether the space exists." (p.74 from
http://www.open-std.org/jtc1/sc22/wg14/www/C99RationaleV5.10.pdf)
2. 6.5.6 (the section on pointer arithmetic) contains this:
"When an expression that has integer type is added to or subtracted from
a pointer, the result has the type of the pointer operand. If the
pointer operand points to an element of an array object, and the array
is large enough,"
It also contains a provision that pointer arithmetic is legal up to one
past the last element of the array as long as the pointer pointing one
past the last element is not accessed. Everything else is UB.
So whether using [1] as a flexible-array member is UB or not depends
upon whether the array object still counts as containing only one
element when it is declared as [1] even when the allocated size is
sufficient for many more elements. Let's call them the small- and
big-object interpretations.
[1] contains a bug report where someone claims that using [1] led to a
miscompilation and fixed it by using [0] (a GCC extension). Yet said
code also did not put the [1] array at the end of the struct, so IMO it
doesn't count.
Compilers seem to intentionally treat [0] and [1] special in order not
to break code using this feature. When accessing array[3] of a two
element array located at the end of a struct, I get a warning from GCC
and Clang; I get no such warning for a one element array.
Also notice that when using an array of arrays GCC uses the small-object
interpretation: In the code fragment

extern int a[][2];

int f (int i, int j)
{
  int t = a[1][j];
  a[0][i] = 0;
  return a[1][j] - t;
}

GCC does always returns 0, i.e it treats it as UB if i were outside of
0..1, so that a[0][i] and a[1][j] don't alias. Yet if the big-object
interpretation is the correct one, then this optimization is invalid: a
is first transformed to a pointer to its first element which (as an
array) is transformed to a pointer to its first element and then pointer
arithmetic (adding i) is applied to this pointer. And with the
big-object interpretation the compiler knows nothing about the real size
of the object it points to (except that it is at least two ints big).
If one makes the implicit array-to-pointer transformation explicit (i.e.
uses "(&a[0][0])[i] = 0;" or "((int*)(a[0]))[i] = 0;"), then GCC no
longer makes this optimization.
Other compilers are not that aggressive [2].
3. As you probably guessed from the last part of 2., I don't think that
casting avoids the possibility of undefined behaviour* (because this
conversion is done automatically anyway); but it might very well protect
the code from aggressively optimizing compilers.

- Andreas

[1]: https://lkml.org/lkml/2015/2/18/407
[2]: https://godbolt.org/z/j46bMn

*: In the int a[][2] case using ((int*)a)[i] would indeed avoid the
undefined behaviour, because even with the little-object interpretation
the object that is pointed to is the big one. But for the cases that you
are interested in this is not so.
Guo, Yejun March 2, 2021, 2:36 a.m. UTC | #18
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Nicolas
> George
> Sent: 2021年3月1日 22:51
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V3 3/3] libavfilter: add filter dnn_detect
> for object detection
> 
> Guo, Yejun (12021-03-01):
> > Hi, glad to hear there's a good solution, could you share a bit of
> > code as an example, it would be nice if I could refine my code better
> > when possible, thanks.
> 
> The best choice is to go with BoundingBox boxes[], as Andreas pointed, and see
> if people who use compilers where it does not work manifest.
> Probably they will not, and if they do we can decide if we want to continue
> supporting these obsolete compilers.

thanks, I will try this one.

> 
> The second best choice is to go with BoundingBox boxes[1], if it is not proven
> that the compiler can make assumptions that will break the code.
> 
> The third choice would be something like:
> 
> #define BOX(p, i) (((BoundingBox *)&(p)->boxes)[i])
> 
> or similar. But best try to make the best choice work.
> 
> Regards,
> 
> --
>   Nicolas George
Nicolas George March 28, 2021, 5:58 p.m. UTC | #19
Andreas Rheinhardt (12021-03-01):
> 3. As you probably guessed from the last part of 2., I don't think that
> casting avoids the possibility of undefined behaviour* (because this
> conversion is done automatically anyway); but it might very well protect
> the code from aggressively optimizing compilers.

Possibly not.

On the other hand, there is no doubt that (type *)((char *)structp +
offsetof(field)) would be valid, since:

- (char *)structp + offsetof(field) is really a pointer on field, so it
  has proper alignment for type;

- the entire array is within a memory block obtained through malloc().

So, worst case scenario, we can use this construct to work around a
compiler that does stupid optimizations.

Regards,
diff mbox series

Patch

diff --git a/configure b/configure
index 336301cb40..cdac292c2f 100755
--- a/configure
+++ b/configure
@@ -3549,6 +3549,7 @@  derain_filter_select="dnn"
 deshake_filter_select="pixelutils"
 deshake_opencl_filter_deps="opencl"
 dilation_opencl_filter_deps="opencl"
+dnn_detect_filter_select="dnn"
 dnn_processing_filter_select="dnn"
 drawtext_filter_deps="libfreetype"
 drawtext_filter_suggest="libfontconfig libfribidi"
diff --git a/doc/filters.texi b/doc/filters.texi
index 426cb158da..55368c6f1b 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -10133,6 +10133,46 @@  ffmpeg -i INPUT -f lavfi -i nullsrc=hd720,geq='r=128+80*(sin(sqrt((X-W/2)*(X-W/2
 @end example
 @end itemize
 
+@section dnn_detect
+
+Do object detection with deep neural networks.
+
+The filter accepts the following options:
+
+@table @option
+@item dnn_backend
+Specify which DNN backend to use for model loading and execution. This option accepts
+only openvino now, tensorflow backends will be added.
+
+@item model
+Set path to model file specifying network architecture and its parameters.
+Note that different backends use different file formats.
+
+@item input
+Set the input name of the dnn network.
+
+@item output
+Set the output name of the dnn network.
+
+@item confidence
+Set the confidence threshold (default: 0.5).
+
+@item labels
+Set path to label file specifying the mapping between label id and name.
+Each label name is written in one line, tailing spaces and empty lines are skipped.
+The first line is the name of label id 0 (usually it is 'background'),
+and the second line is the name of label id 1, etc.
+The label id is considered as name if the label file is not provided.
+
+@item backend_configs
+Set the configs to be passed into backend
+
+@item async
+use DNN async execution if set (default: set),
+roll back to sync execution if the backend does not support async.
+
+@end table
+
 @anchor{dnn_processing}
 @section dnn_processing
 
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 359ea7f903..b14c0ecdb9 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -245,6 +245,7 @@  OBJS-$(CONFIG_DILATION_FILTER)               += vf_neighbor.o
 OBJS-$(CONFIG_DILATION_OPENCL_FILTER)        += vf_neighbor_opencl.o opencl.o \
                                                 opencl/neighbor.o
 OBJS-$(CONFIG_DISPLACE_FILTER)               += vf_displace.o framesync.o
+OBJS-$(CONFIG_DNN_DETECT_FILTER)             += vf_dnn_detect.o
 OBJS-$(CONFIG_DNN_PROCESSING_FILTER)         += vf_dnn_processing.o
 OBJS-$(CONFIG_DOUBLEWEAVE_FILTER)            += vf_weave.o
 OBJS-$(CONFIG_DRAWBOX_FILTER)                += vf_drawbox.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 452c030706..85419ec3f1 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -230,6 +230,7 @@  extern AVFilter ff_vf_detelecine;
 extern AVFilter ff_vf_dilation;
 extern AVFilter ff_vf_dilation_opencl;
 extern AVFilter ff_vf_displace;
+extern AVFilter ff_vf_dnn_detect;
 extern AVFilter ff_vf_dnn_processing;
 extern AVFilter ff_vf_doubleweave;
 extern AVFilter ff_vf_drawbox;
diff --git a/libavfilter/dnn/dnn_backend_openvino.c b/libavfilter/dnn/dnn_backend_openvino.c
index 5be053b7f8..928d84b744 100644
--- a/libavfilter/dnn/dnn_backend_openvino.c
+++ b/libavfilter/dnn/dnn_backend_openvino.c
@@ -621,6 +621,12 @@  DNNReturnType ff_dnn_execute_model_ov(const DNNModel *model, const char *input_n
         return DNN_ERROR;
     }
 
+    if (model->func_type != DFT_PROCESS_FRAME) {
+        if (!out_frame) {
+            out_frame = in_frame;
+        }
+    }
+
     if (nb_output != 1) {
         // currently, the filter does not need multiple outputs,
         // so we just pending the support until we really need it.
@@ -674,6 +680,12 @@  DNNReturnType ff_dnn_execute_model_async_ov(const DNNModel *model, const char *i
         return DNN_ERROR;
     }
 
+    if (model->func_type != DFT_PROCESS_FRAME) {
+        if (!out_frame) {
+            out_frame = in_frame;
+        }
+    }
+
     task = av_malloc(sizeof(*task));
     if (!task) {
         av_log(ctx, AV_LOG_ERROR, "unable to alloc memory for task item.\n");
diff --git a/libavfilter/dnn_filter_common.c b/libavfilter/dnn_filter_common.c
index 413adba406..92b696e710 100644
--- a/libavfilter/dnn_filter_common.c
+++ b/libavfilter/dnn_filter_common.c
@@ -64,6 +64,13 @@  int ff_dnn_init(DnnContext *ctx, DNNFunctionType func_type, AVFilterContext *fil
     return 0;
 }
 
+int ff_dnn_set_proc(DnnContext *ctx, PRE_POST_PROC pre_proc, PRE_POST_PROC post_proc)
+{
+    ctx->model->pre_proc = pre_proc;
+    ctx->model->post_proc = post_proc;
+    return 0;
+}
+
 DNNReturnType ff_dnn_get_input(DnnContext *ctx, DNNData *input)
 {
     return ctx->model->get_input(ctx->model->model, input, ctx->model_inputname);
diff --git a/libavfilter/dnn_filter_common.h b/libavfilter/dnn_filter_common.h
index 79c4d3efe3..0e88b88bdd 100644
--- a/libavfilter/dnn_filter_common.h
+++ b/libavfilter/dnn_filter_common.h
@@ -48,6 +48,7 @@  typedef struct DnnContext {
 
 
 int ff_dnn_init(DnnContext *ctx, DNNFunctionType func_type, AVFilterContext *filter_ctx);
+int ff_dnn_set_proc(DnnContext *ctx, PRE_POST_PROC pre_proc, PRE_POST_PROC post_proc);
 DNNReturnType ff_dnn_get_input(DnnContext *ctx, DNNData *input);
 DNNReturnType ff_dnn_get_output(DnnContext *ctx, int input_width, int input_height, int *output_width, int *output_height);
 DNNReturnType ff_dnn_execute_model(DnnContext *ctx, AVFrame *in_frame, AVFrame *out_frame);
diff --git a/libavfilter/dnn_interface.h b/libavfilter/dnn_interface.h
index d3a0c58a61..90a08129f4 100644
--- a/libavfilter/dnn_interface.h
+++ b/libavfilter/dnn_interface.h
@@ -63,6 +63,8 @@  typedef struct DNNData{
     DNNColorOrder order;
 } DNNData;
 
+typedef int (*PRE_POST_PROC)(AVFrame *frame, DNNData *model, AVFilterContext *filter_ctx);
+
 typedef struct DNNModel{
     // Stores model that can be different for different backends.
     void *model;
@@ -80,10 +82,10 @@  typedef struct DNNModel{
                                 const char *output_name, int *output_width, int *output_height);
     // set the pre process to transfer data from AVFrame to DNNData
     // the default implementation within DNN is used if it is not provided by the filter
-    int (*pre_proc)(AVFrame *frame_in, DNNData *model_input, AVFilterContext *filter_ctx);
+    PRE_POST_PROC pre_proc;
     // set the post process to transfer data from DNNData to AVFrame
     // the default implementation within DNN is used if it is not provided by the filter
-    int (*post_proc)(AVFrame *frame_out, DNNData *model_output, AVFilterContext *filter_ctx);
+    PRE_POST_PROC post_proc;
 } DNNModel;
 
 // Stores pointers to functions for loading, executing, freeing DNN models for one of the backends.
diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c
new file mode 100644
index 0000000000..13b82579a9
--- /dev/null
+++ b/libavfilter/vf_dnn_detect.c
@@ -0,0 +1,462 @@ 
+/*
+ * Copyright (c) 2021
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * implementing an object detecting filter using deep learning networks.
+ */
+
+#include "libavformat/avio.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/avassert.h"
+#include "libavutil/imgutils.h"
+#include "filters.h"
+#include "dnn_filter_common.h"
+#include "formats.h"
+#include "internal.h"
+#include "libavutil/time.h"
+#include "bbox.h"
+
+typedef struct DnnDetectContext {
+    const AVClass *class;
+    DnnContext dnnctx;
+    float confidence;
+    char *labels_filename;
+    char **labels;
+    int label_count;
+} DnnDetectContext;
+
+#define OFFSET(x) offsetof(DnnDetectContext, dnnctx.x)
+#define OFFSET2(x) offsetof(DnnDetectContext, x)
+#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
+static const AVOption dnn_detect_options[] = {
+    { "dnn_backend", "DNN backend",                OFFSET(backend_type),     AV_OPT_TYPE_INT,       { .i64 = 2 },    INT_MIN, INT_MAX, FLAGS, "backend" },
+#if (CONFIG_LIBOPENVINO == 1)
+    { "openvino",    "openvino backend flag",      0,                        AV_OPT_TYPE_CONST,     { .i64 = 2 },    0, 0, FLAGS, "backend" },
+#endif
+    DNN_COMMON_OPTIONS
+    { "confidence",  "threshold of confidence",    OFFSET2(confidence),      AV_OPT_TYPE_FLOAT,     { .dbl = 0.5 },  0, 1, FLAGS},
+    { "labels",      "path to labels file",        OFFSET2(labels_filename), AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(dnn_detect);
+
+// remove this function once we update vf_drawbox/text
+// to visualize the bbox
+static void dump_boundingbox(AVFilterContext *ctx, AVFrame *frame)
+{
+    int nb_bbox = 0;
+    BoundingBox *bbox;
+    BoundingBoxHeader *header;
+
+    if (!frame->private_ref)
+        return;
+
+    header = (BoundingBoxHeader *)frame->private_ref->data;
+    bbox = (BoundingBox *)(header + 1);
+
+    if (!header->bbox_size || (frame->private_ref->size - sizeof(*header)) % header->bbox_size != 0) {
+        av_log(ctx, AV_LOG_ERROR, "invalid size of bounding box\n");
+        return;
+    }
+
+    nb_bbox = (frame->private_ref->size - sizeof(*header)) / header->bbox_size;
+    av_log(ctx, AV_LOG_INFO, "frame->private_ref (bounding boxes):\n");
+    av_log(ctx, AV_LOG_INFO, "source: %s\n", header->source);
+    for (int i = 0; i < nb_bbox; i++) {
+        av_log(ctx, AV_LOG_INFO, "index: %d, region: (%d, %d) -> (%d, %d), label: %s, confidence: %d/%d.\n",
+                                 i, bbox->left, bbox->top, bbox->right, bbox->bottom,
+                                 bbox->detect_label, bbox->detect_confidence.num, bbox->detect_confidence.den);
+        if (bbox->classify_count > 0) {
+            for (int j = 0; j < bbox->classify_count; j++) {
+                av_log(ctx, AV_LOG_INFO, "\t\tclassify:  label: %s, confidence: %d/%d.\n",
+                       bbox->classify_labels[j], bbox->classify_confidences[j].num, bbox->classify_confidences[j].den);
+            }
+        }
+        bbox++;
+    }
+}
+
+static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
+{
+    DnnDetectContext *ctx = filter_ctx->priv;
+    float conf_threshold = ctx->confidence;
+    int proposal_count = output->height;
+    int detect_size = output->width;
+    float *detections = output->data;
+    int nb_bbox = 0;
+    BoundingBox *bbox;
+    BoundingBoxHeader *header;
+
+    for (int i = 0; i < proposal_count; ++i) {
+        float conf = detections[i * detect_size + 2];
+        if (conf < conf_threshold) {
+            continue;
+        }
+        nb_bbox++;
+    }
+
+    if (nb_bbox == 0) {
+        av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
+        return 0;
+    }
+
+    if (frame->private_ref) {
+        av_log(filter_ctx, AV_LOG_ERROR, "frame->private_ref is already occupied\n");
+        return -1;
+    }
+
+    frame->private_ref = av_buffer_alloc(sizeof(*header) + sizeof(*bbox) * nb_bbox);
+    if (!frame->private_ref) {
+        av_log(filter_ctx, AV_LOG_ERROR, "failed to allocate buffer for %d bboxes\n", nb_bbox);
+        return AVERROR(ENOMEM);;
+    }
+
+    header = (BoundingBoxHeader *)frame->private_ref->data;
+    strncpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));
+    header->source[sizeof(header->source) - 1] = '\0';
+    header->bbox_size = sizeof(*bbox);
+
+    bbox = (BoundingBox *)(header + 1);
+    for (int i = 0; i < proposal_count; ++i) {
+        int av_unused image_id = (int)detections[i * detect_size + 0];
+        int label_id = (int)detections[i * detect_size + 1];
+        float conf   =      detections[i * detect_size + 2];
+        float x0     =      detections[i * detect_size + 3];
+        float y0     =      detections[i * detect_size + 4];
+        float x1     =      detections[i * detect_size + 5];
+        float y1     =      detections[i * detect_size + 6];
+
+        if (conf < conf_threshold) {
+            continue;
+        }
+
+        bbox->left      = (int)(x0 * frame->width);
+        bbox->right     = (int)(x1 * frame->width);
+        bbox->top       = (int)(y0 * frame->height);
+        bbox->bottom    = (int)(y1 * frame->height);
+
+        bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000);
+        bbox->classify_count = 0;
+
+        if (ctx->labels && label_id < ctx->label_count) {
+            strcpy(bbox->detect_label, ctx->labels[label_id]);
+        } else {
+            snprintf(bbox->detect_label, BBOX_LABEL_NAME_MAX_LENGTH, "%d", label_id);
+        }
+
+        nb_bbox--;
+        if (nb_bbox == 0) {
+            break;
+        }
+        bbox++;
+    }
+
+    dump_boundingbox(filter_ctx, frame);
+
+    return 0;
+}
+
+static void free_detect_labels(DnnDetectContext *ctx)
+{
+    for (int i = 0; i < ctx->label_count; i++) {
+        av_freep(&ctx->labels[i]);
+    }
+    av_freep(&ctx->labels);
+}
+
+static int read_detect_label_file(AVFilterContext *context)
+{
+    int line_len;
+    FILE *file;
+    DnnDetectContext *ctx = context->priv;
+
+    file = av_fopen_utf8(ctx->labels_filename, "r");
+    if (!file){
+        av_log(context, AV_LOG_ERROR, "failed to open file %s\n", ctx->labels_filename);
+        return AVERROR(EINVAL);
+    }
+
+    while (!feof(file)) {
+        char *label;
+        char buf[256];
+        if (!fgets(buf, 256, file)) {
+            break;
+        }
+
+        line_len = strlen(buf);
+        while (line_len) {
+            int i = line_len - 1;
+            if (buf[i] == '\n' || buf[i] == '\r' || buf[i] == ' ') {
+                buf[i] = '\0';
+                line_len--;
+            } else {
+                break;
+            }
+        }
+
+        if (line_len == 0)  // empty line
+            continue;
+
+        if (line_len > BBOX_LABEL_NAME_MAX_LENGTH) {
+            av_log(context, AV_LOG_ERROR, "label %s too long\n", buf);
+            fclose(file);
+            free_detect_labels(ctx);
+            return AVERROR(EINVAL);
+        }
+
+        label = av_strdup(buf);
+        if (!label) {
+            av_log(context, AV_LOG_ERROR, "failed to allocate memory for label %s\n", buf);
+            fclose(file);
+            free_detect_labels(ctx);
+            return AVERROR(ENOMEM);
+        }
+
+        if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count, label) < 0) {
+            av_log(context, AV_LOG_ERROR, "failed to do av_dynarray_add\n");
+            fclose(file);
+            free_detect_labels(ctx);
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    fclose(file);
+    return 0;
+}
+
+static av_cold int dnn_detect_init(AVFilterContext *context)
+{
+    DnnDetectContext *ctx = context->priv;
+    int ret = ff_dnn_init(&ctx->dnnctx, DFT_ANALYTICS_DETECT, context);
+    if (ret < 0)
+        return ret;
+    ff_dnn_set_proc(&ctx->dnnctx, NULL, dnn_detect_post_proc);
+
+    if (ctx->labels_filename) {
+        return read_detect_label_file(context);
+    }
+    return 0;
+}
+
+static int dnn_detect_query_formats(AVFilterContext *context)
+{
+    static const enum AVPixelFormat pix_fmts[] = {
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAYF32,
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
+        AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
+        AV_PIX_FMT_NV12,
+        AV_PIX_FMT_NONE
+    };
+    AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
+    return ff_set_common_formats(context, fmts_list);
+}
+
+static int dnn_detect_filter_frame(AVFilterLink *inlink, AVFrame *in)
+{
+    AVFilterContext *context  = inlink->dst;
+    AVFilterLink *outlink = context->outputs[0];
+    DnnDetectContext *ctx = context->priv;
+    DNNReturnType dnn_result;
+
+    dnn_result = ff_dnn_execute_model(&ctx->dnnctx, in, NULL);
+    if (dnn_result != DNN_SUCCESS){
+        av_log(ctx, AV_LOG_ERROR, "failed to execute model\n");
+        av_frame_free(&in);
+        return AVERROR(EIO);
+    }
+
+    return ff_filter_frame(outlink, in);
+}
+
+static int dnn_detect_activate_sync(AVFilterContext *filter_ctx)
+{
+    AVFilterLink *inlink = filter_ctx->inputs[0];
+    AVFilterLink *outlink = filter_ctx->outputs[0];
+    AVFrame *in = NULL;
+    int64_t pts;
+    int ret, status;
+    int got_frame = 0;
+
+    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
+
+    do {
+        // drain all input frames
+        ret = ff_inlink_consume_frame(inlink, &in);
+        if (ret < 0)
+            return ret;
+        if (ret > 0) {
+            ret = dnn_detect_filter_frame(inlink, in);
+            if (ret < 0)
+                return ret;
+            got_frame = 1;
+        }
+    } while (ret > 0);
+
+    // if frame got, schedule to next filter
+    if (got_frame)
+        return 0;
+
+    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
+        if (status == AVERROR_EOF) {
+            ff_outlink_set_status(outlink, status, pts);
+            return ret;
+        }
+    }
+
+    FF_FILTER_FORWARD_WANTED(outlink, inlink);
+
+    return FFERROR_NOT_READY;
+}
+
+static int dnn_detect_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
+{
+    DnnDetectContext *ctx = outlink->src->priv;
+    int ret;
+    DNNAsyncStatusType async_state;
+
+    ret = ff_dnn_flush(&ctx->dnnctx);
+    if (ret != DNN_SUCCESS) {
+        return -1;
+    }
+
+    do {
+        AVFrame *in_frame = NULL;
+        AVFrame *out_frame = NULL;
+        async_state = ff_dnn_get_async_result(&ctx->dnnctx, &in_frame, &out_frame);
+        if (out_frame) {
+            av_assert0(in_frame == out_frame);
+            ret = ff_filter_frame(outlink, out_frame);
+            if (ret < 0)
+                return ret;
+            if (out_pts)
+                *out_pts = out_frame->pts + pts;
+        }
+        av_usleep(5000);
+    } while (async_state >= DAST_NOT_READY);
+
+    return 0;
+}
+
+static int dnn_detect_activate_async(AVFilterContext *filter_ctx)
+{
+    AVFilterLink *inlink = filter_ctx->inputs[0];
+    AVFilterLink *outlink = filter_ctx->outputs[0];
+    DnnDetectContext *ctx = filter_ctx->priv;
+    AVFrame *in = NULL;
+    int64_t pts;
+    int ret, status;
+    int got_frame = 0;
+    int async_state;
+
+    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
+
+    do {
+        // drain all input frames
+        ret = ff_inlink_consume_frame(inlink, &in);
+        if (ret < 0)
+            return ret;
+        if (ret > 0) {
+            if (ff_dnn_execute_model_async(&ctx->dnnctx, in, NULL) != DNN_SUCCESS) {
+                return AVERROR(EIO);
+            }
+        }
+    } while (ret > 0);
+
+    // drain all processed frames
+    do {
+        AVFrame *in_frame = NULL;
+        AVFrame *out_frame = NULL;
+        async_state = ff_dnn_get_async_result(&ctx->dnnctx, &in_frame, &out_frame);
+        if (out_frame) {
+            av_assert0(in_frame == out_frame);
+            ret = ff_filter_frame(outlink, out_frame);
+            if (ret < 0)
+                return ret;
+            got_frame = 1;
+        }
+    } while (async_state == DAST_SUCCESS);
+
+    // if frame got, schedule to next filter
+    if (got_frame)
+        return 0;
+
+    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
+        if (status == AVERROR_EOF) {
+            int64_t out_pts = pts;
+            ret = dnn_detect_flush_frame(outlink, pts, &out_pts);
+            ff_outlink_set_status(outlink, status, out_pts);
+            return ret;
+        }
+    }
+
+    FF_FILTER_FORWARD_WANTED(outlink, inlink);
+
+    return 0;
+}
+
+static int dnn_detect_activate(AVFilterContext *filter_ctx)
+{
+    DnnDetectContext *ctx = filter_ctx->priv;
+
+    if (ctx->dnnctx.async)
+        return dnn_detect_activate_async(filter_ctx);
+    else
+        return dnn_detect_activate_sync(filter_ctx);
+}
+
+static av_cold void dnn_detect_uninit(AVFilterContext *context)
+{
+    DnnDetectContext *ctx = context->priv;
+    ff_dnn_uninit(&ctx->dnnctx);
+    free_detect_labels(ctx);
+}
+
+static const AVFilterPad dnn_detect_inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+    },
+    { NULL }
+};
+
+static const AVFilterPad dnn_detect_outputs[] = {
+    {
+        .name = "default",
+        .type = AVMEDIA_TYPE_VIDEO,
+    },
+    { NULL }
+};
+
+AVFilter ff_vf_dnn_detect = {
+    .name          = "dnn_detect",
+    .description   = NULL_IF_CONFIG_SMALL("Apply DNN detect filter to the input."),
+    .priv_size     = sizeof(DnnDetectContext),
+    .init          = dnn_detect_init,
+    .uninit        = dnn_detect_uninit,
+    .query_formats = dnn_detect_query_formats,
+    .inputs        = dnn_detect_inputs,
+    .outputs       = dnn_detect_outputs,
+    .priv_class    = &dnn_detect_class,
+    .activate      = dnn_detect_activate,
+};