diff mbox

[FFmpeg-devel,V2,4/4] avfilter/vf_dnn_processing: add a generic filter for image processing with dnn networks

Message ID 1571661520-21573-1-git-send-email-yejun.guo@intel.com
State New
Headers show

Commit Message

Guo, Yejun Oct. 21, 2019, 12:38 p.m. UTC
This filter accepts any dnn network that does image processing.
Currently, frames in the rgb24 and bgr24 formats are supported. Other
formats such as gray and YUV will be supported next. The dnn network
can accept data in float32 or uint8 format, and it can change the
frame size.

Let's take an example with the following Python script. This script
halves the value of the first channel of each pixel.
import tensorflow as tf
import numpy as np
import scipy.misc
in_img = scipy.misc.imread('in.bmp')
in_img = in_img.astype(np.float32)/255.0
in_data = in_img[np.newaxis, :]
filter_data = np.array([0.5, 0, 0, 0, 1., 0, 0, 0, 1.]).reshape(1,1,3,3).astype(np.float32)
filter = tf.Variable(filter_data)
x = tf.placeholder(tf.float32, shape=[1, None, None, 3], name='dnn_in')
y = tf.nn.conv2d(x, filter, strides=[1, 1, 1, 1], padding='VALID', name='dnn_out')
sess=tf.Session()
sess.run(tf.global_variables_initializer())
output = sess.run(y, feed_dict={x: in_data})
graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, ['dnn_out'])
tf.train.write_graph(graph_def, '.', 'halve_first_channel.pb', as_text=False)
output = output * 255.0
output = output.astype(np.uint8)
scipy.misc.imsave("out.bmp", np.squeeze(output))

- generate halve_first_channel.pb with the above script
- generate halve_first_channel.model with tools/python/convert.py
- try the following commands
  ./ffmpeg -i input.jpg -vf dnn_processing=model=halve_first_channel.model:input=dnn_in:output=dnn_out:fmt=rgb24:dnn_backend=native -y out.native.png
  ./ffmpeg -i input.jpg -vf dnn_processing=model=halve_first_channel.pb:input=dnn_in:output=dnn_out:fmt=rgb24:dnn_backend=tensorflow -y out.tf.png

Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
---
 configure                       |   1 +
 doc/filters.texi                |  44 ++++++
 libavfilter/Makefile            |   1 +
 libavfilter/allfilters.c        |   1 +
 libavfilter/vf_dnn_processing.c | 333 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 380 insertions(+)
 create mode 100644 libavfilter/vf_dnn_processing.c

Comments

Guo, Yejun Oct. 28, 2019, 6:27 a.m. UTC | #1
> -----Original Message-----
> From: Guo, Yejun
> Sent: Monday, October 21, 2019 8:39 PM
> To: ffmpeg-devel@ffmpeg.org
> Cc: Guo, Yejun <yejun.guo@intel.com>
> Subject: [PATCH V2 4/4] avfilter/vf_dnn_processing: add a generic filter for
> image proccessing with dnn networks
> 
> This filter accepts all the dnn networks which do image processing.
> Currently, frame with formats rgb24 and bgr24 are supported. Other
> formats such as gray and YUV will be supported next. The dnn network
> can accept data in float32 or uint8 format. And the dnn network can
> change frame size.
> 
> Let's take an example with the following python script. This script
> halves the value of the first channel of the pixel.
> import tensorflow as tf
> import numpy as np
> import scipy.misc
> in_img = scipy.misc.imread('in.bmp')
> in_img = in_img.astype(np.float32)/255.0
> in_data = in_img[np.newaxis, :]
> filter_data = np.array([0.5, 0, 0, 0, 1., 0, 0, 0,
> 1.]).reshape(1,1,3,3).astype(np.float32)
> filter = tf.Variable(filter_data)
> x = tf.placeholder(tf.float32, shape=[1, None, None, 3], name='dnn_in')
> y = tf.nn.conv2d(x, filter, strides=[1, 1, 1, 1], padding='VALID', name='dnn_out')
> sess=tf.Session()
> sess.run(tf.global_variables_initializer())
> output = sess.run(y, feed_dict={x: in_data})
> graph_def = tf.graph_util.convert_variables_to_constants(sess,
> sess.graph_def, ['dnn_out'])
> tf.train.write_graph(graph_def, '.', 'halve_first_channel.pb', as_text=False)
> output = output * 255.0
> output = output.astype(np.uint8)
> scipy.misc.imsave("out.bmp", np.squeeze(output))
> 
> - generate halve_first_channel.pb with the above script
> - generate halve_first_channel.model with tools/python/convert.py
> - try with following commands
>   ./ffmpeg -i input.jpg -vf
> dnn_processing=model=halve_first_channel.model:input=dnn_in:output=dnn_
> out:fmt=rgb24:dnn_backend=native -y out.native.png
>   ./ffmpeg -i input.jpg -vf
> dnn_processing=model=halve_first_channel.pb:input=dnn_in:output=dnn_out:f
> mt=rgb24:dnn_backend=tensorflow -y out.tf.png
> 
> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> ---
>  configure                       |   1 +
>  doc/filters.texi                |  44 ++++++
>  libavfilter/Makefile            |   1 +
>  libavfilter/allfilters.c        |   1 +
>  libavfilter/vf_dnn_processing.c | 333
> ++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 380 insertions(+)


This patch set is awaiting review, thanks.

By the way, the first 3 patches improve the dnn module base and can be reviewed first.
The fourth patch is a filter based on dnn and can be reviewed second.
I put them together to explain why the dnn module needs such changes. Thanks.
Paul B Mahol Oct. 28, 2019, 7:59 a.m. UTC | #2
On 10/21/19, Guo, Yejun <yejun.guo@intel.com> wrote:
> This filter accepts all the dnn networks which do image processing.
> Currently, frame with formats rgb24 and bgr24 are supported. Other
> formats such as gray and YUV will be supported next. The dnn network
> can accept data in float32 or uint8 format. And the dnn network can
> change frame size.
>
> Let's take an example with the following python script. This script
> halves the value of the first channel of the pixel.
> import tensorflow as tf
> import numpy as np
> import scipy.misc
> in_img = scipy.misc.imread('in.bmp')
> in_img = in_img.astype(np.float32)/255.0
> in_data = in_img[np.newaxis, :]
> filter_data = np.array([0.5, 0, 0, 0, 1., 0, 0, 0,
> 1.]).reshape(1,1,3,3).astype(np.float32)
> filter = tf.Variable(filter_data)
> x = tf.placeholder(tf.float32, shape=[1, None, None, 3], name='dnn_in')
> y = tf.nn.conv2d(x, filter, strides=[1, 1, 1, 1], padding='VALID',
> name='dnn_out')
> sess=tf.Session()
> sess.run(tf.global_variables_initializer())
> output = sess.run(y, feed_dict={x: in_data})
> graph_def = tf.graph_util.convert_variables_to_constants(sess,
> sess.graph_def, ['dnn_out'])
> tf.train.write_graph(graph_def, '.', 'halve_first_channel.pb',
> as_text=False)
> output = output * 255.0
> output = output.astype(np.uint8)
> scipy.misc.imsave("out.bmp", np.squeeze(output))

So this one executes python code without ever returning or using AVFrame* ?
This is extremely limited usage.

>
> - generate halve_first_channel.pb with the above script
> - generate halve_first_channel.model with tools/python/convert.py
> - try with following commands
>   ./ffmpeg -i input.jpg -vf
> dnn_processing=model=halve_first_channel.model:input=dnn_in:output=dnn_out:fmt=rgb24:dnn_backend=native
> -y out.native.png
>   ./ffmpeg -i input.jpg -vf
> dnn_processing=model=halve_first_channel.pb:input=dnn_in:output=dnn_out:fmt=rgb24:dnn_backend=tensorflow
> -y out.tf.png
>
> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> ---
>  configure                       |   1 +
>  doc/filters.texi                |  44 ++++++
>  libavfilter/Makefile            |   1 +
>  libavfilter/allfilters.c        |   1 +
>  libavfilter/vf_dnn_processing.c | 333
> ++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 380 insertions(+)
>  create mode 100644 libavfilter/vf_dnn_processing.c
>
> diff --git a/configure b/configure
> index 8413826..bf2bac9 100755
> --- a/configure
> +++ b/configure
> @@ -3460,6 +3460,7 @@ derain_filter_select="dnn"
>  deshake_filter_select="pixelutils"
>  deshake_opencl_filter_deps="opencl"
>  dilation_opencl_filter_deps="opencl"
> +dnn_processing_filter_select="dnn"
>  drawtext_filter_deps="libfreetype"
>  drawtext_filter_suggest="libfontconfig libfribidi"
>  elbg_filter_deps="avcodec"
> diff --git a/doc/filters.texi b/doc/filters.texi
> index bdc4136..c11a616 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -8928,6 +8928,50 @@ ffmpeg -i INPUT -f lavfi -i
> nullsrc=hd720,geq='r=128+80*(sin(sqrt((X-W/2)*(X-W/2
>  @end example
>  @end itemize
>
> +@section dnn_processing
> +
> +Do image processing with deep neural networks. Currently only AVFrame with
> RGB24
> +and BGR24 are supported, more formats will be added later.
> +
> +The filter accepts the following options:
> +
> +@table @option
> +@item dnn_backend
> +Specify which DNN backend to use for model loading and execution. This
> option accepts
> +the following values:
> +
> +@table @samp
> +@item native
> +Native implementation of DNN loading and execution.
> +
> +@item tensorflow
> +TensorFlow backend. To enable this backend you
> +need to install the TensorFlow for C library (see
> +@url{https://www.tensorflow.org/install/install_c}) and configure FFmpeg
> with
> +@code{--enable-libtensorflow}
> +@end table
> +
> +Default value is @samp{native}.
> +
> +@item model
> +Set path to model file specifying network architecture and its parameters.
> +Note that different backends use different file formats. TensorFlow and
> native
> +backend can load files for only its format.
> +
> +Native model file (.model) can be generated from TensorFlow model file
> (.pb) by using tools/python/convert.py
> +
> +@item input
> +Set the input name of the dnn network.
> +
> +@item output
> +Set the output name of the dnn network.
> +
> +@item fmt
> +Set the pixel format for the Frame. Allowed values are
> @code{AV_PIX_FMT_RGB24}, and @code{AV_PIX_FMT_BGR24}.
> +Default value is @code{AV_PIX_FMT_RGB24}.
> +
> +@end table
> +
>  @section drawbox
>
>  Draw a colored box on the input image.
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index 63d2fba..47a485a 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -224,6 +224,7 @@ OBJS-$(CONFIG_DILATION_OPENCL_FILTER)        +=
> vf_neighbor_opencl.o opencl.o \
>                                                  opencl/neighbor.o
>  OBJS-$(CONFIG_DISPLACE_FILTER)               += vf_displace.o framesync.o
>  OBJS-$(CONFIG_DOUBLEWEAVE_FILTER)            += vf_weave.o
> +OBJS-$(CONFIG_DNN_PROCESSING_FILTER)         += vf_dnn_processing.o
>  OBJS-$(CONFIG_DRAWBOX_FILTER)                += vf_drawbox.o
>  OBJS-$(CONFIG_DRAWGRAPH_FILTER)              += f_drawgraph.o
>  OBJS-$(CONFIG_DRAWGRID_FILTER)               += vf_drawbox.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index e4186f9..485409f 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -209,6 +209,7 @@ extern AVFilter ff_vf_detelecine;
>  extern AVFilter ff_vf_dilation;
>  extern AVFilter ff_vf_dilation_opencl;
>  extern AVFilter ff_vf_displace;
> +extern AVFilter ff_vf_dnn_processing;
>  extern AVFilter ff_vf_doubleweave;
>  extern AVFilter ff_vf_drawbox;
>  extern AVFilter ff_vf_drawgraph;
> diff --git a/libavfilter/vf_dnn_processing.c
> b/libavfilter/vf_dnn_processing.c
> new file mode 100644
> index 0000000..de89af4
> --- /dev/null
> +++ b/libavfilter/vf_dnn_processing.c
> @@ -0,0 +1,333 @@
> +/*
> + * Copyright (c) 2019 Guo Yejun
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +/**
> + * @file
> + * implementing a generic image processing filter using deep learning
> networks.
> + */
> +
> +#include "libavformat/avio.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +#include "libavutil/avassert.h"
> +#include "avfilter.h"
> +#include "dnn_interface.h"
> +#include "formats.h"
> +#include "internal.h"
> +
> +typedef struct DnnProcessingContext {
> +    const AVClass *class;
> +
> +    char *model_filename;
> +    DNNBackendType backend_type;
> +    enum AVPixelFormat fmt;

This should be int.

> +    char *model_inputname;
> +    char *model_outputname;
> +
> +    DNNModule *dnn_module;
> +    DNNModel *model;
> +
> +    // input & output of the model at execution time
> +    DNNData input;
> +    DNNData output;
> +} DnnProcessingContext;
> +
> +#define OFFSET(x) offsetof(DnnProcessingContext, x)
> +#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
> +static const AVOption dnn_processing_options[] = {
> +    { "dnn_backend", "DNN backend",                OFFSET(backend_type),
>  AV_OPT_TYPE_INT,       { .i64 = 0 },    0, 1, FLAGS, "backend" },
> +    { "native",      "native backend flag",        0,
>  AV_OPT_TYPE_CONST,     { .i64 = 0 },    0, 0, FLAGS, "backend" },
> +#if (CONFIG_LIBTENSORFLOW == 1)
> +    { "tensorflow",  "tensorflow backend flag",    0,
>  AV_OPT_TYPE_CONST,     { .i64 = 1 },    0, 0, FLAGS, "backend" },
> +#endif
> +    { "model",       "path to model file",         OFFSET(model_filename),
>  AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
> +    { "input",       "input name of the model",    OFFSET(model_inputname),
>  AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
> +    { "output",      "output name of the model",
> OFFSET(model_outputname), AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0,
> FLAGS },
> +    { "fmt",         "AVPixelFormat of the frame", OFFSET(fmt),
>  AV_OPT_TYPE_PIXEL_FMT, { .i64=AV_PIX_FMT_RGB24 }, AV_PIX_FMT_NONE,
> AV_PIX_FMT_NB - 1, FLAGS },
> +    { NULL }
> +};
> +
> +AVFILTER_DEFINE_CLASS(dnn_processing);
> +
> +static av_cold int init(AVFilterContext *context)
> +{
> +    DnnProcessingContext *ctx = context->priv;
> +    int supported = 0;
> +    // as the first step, only rgb24 and bgr24 are supported
> +    const enum AVPixelFormat supported_pixel_fmts[] = {
> +        AV_PIX_FMT_RGB24,
> +        AV_PIX_FMT_BGR24,
> +    };
> +    for (int i = 0; i < sizeof(supported_pixel_fmts) / sizeof(enum
> AVPixelFormat); ++i) {
> +        if (supported_pixel_fmts[i] == ctx->fmt) {
> +            supported = 1;
> +            break;
> +        }
> +    }
> +    if (!supported) {
> +        av_log(context, AV_LOG_ERROR, "pixel fmt %s not supported yet\n",
> +                                       av_get_pix_fmt_name(ctx->fmt));
> +        return AVERROR(AVERROR_INVALIDDATA);
> +    }
> +
> +    if (!ctx->model_filename) {
> +        av_log(ctx, AV_LOG_ERROR, "model file for network is not
> specified\n");
> +        return AVERROR(EINVAL);
> +    }
> +    if (!ctx->model_inputname) {
> +        av_log(ctx, AV_LOG_ERROR, "intput name of the model network is not

Typo

> specified\n");
> +        return AVERROR(EINVAL);
> +    }
> +    if (!ctx->model_outputname) {
> +        av_log(ctx, AV_LOG_ERROR, "output name of the model network is not
> specified\n");
> +        return AVERROR(EINVAL);
> +    }
> +
> +    ctx->dnn_module = ff_get_dnn_module(ctx->backend_type);
> +    if (!ctx->dnn_module) {
> +        av_log(ctx, AV_LOG_ERROR, "could not create DNN module for
> requested backend\n");
> +        return AVERROR(ENOMEM);
> +    }
> +    if (!ctx->dnn_module->load_model) {
> +        av_log(ctx, AV_LOG_ERROR, "load_model for network is not
> specified\n");
> +        return AVERROR(EINVAL);
> +    }
> +
> +    ctx->model = (ctx->dnn_module->load_model)(ctx->model_filename);
> +    if (!ctx->model) {
> +        av_log(ctx, AV_LOG_ERROR, "could not load DNN model\n");
> +        return AVERROR(EINVAL);
> +    }
> +
> +    return 0;
> +}
> +
> +static int query_formats(AVFilterContext *context)
> +{
> +    AVFilterFormats *formats;
> +    DnnProcessingContext *ctx = context->priv;
> +    enum AVPixelFormat pixel_fmts[2];
> +    pixel_fmts[0] = ctx->fmt;
> +    pixel_fmts[1] = AV_PIX_FMT_NONE;
> +
> +    formats = ff_make_format_list(pixel_fmts);
> +    return ff_set_common_formats(context, formats);
> +}
> +
> +static int config_input(AVFilterLink *inlink)
> +{
> +    AVFilterContext *context     = inlink->dst;
> +    DnnProcessingContext *ctx = context->priv;
> +    DNNReturnType result;
> +    DNNData dnn_data;
> +
> +    result = ctx->model->get_input(ctx->model->model, &dnn_data,
> ctx->model_inputname);
> +    if (result != DNN_SUCCESS) {
> +        av_log(ctx, AV_LOG_ERROR, "could not get input from the model\n");
> +        return AVERROR(EIO);
> +    }
> +
> +    // the design is to add explicit scale filter before this filter
> +    if (dnn_data.height != -1 && dnn_data.height != inlink->h) {
> +        av_log(ctx, AV_LOG_ERROR, "the model requires frame height %d but
> got %d\n",
> +                                   dnn_data.height, inlink->h);
> +        return AVERROR(EIO);
> +    }
> +    if (dnn_data.width != -1 && dnn_data.width != inlink->w) {
> +        av_log(ctx, AV_LOG_ERROR, "the model requires frame width %d but
> got %d\n",
> +                                   dnn_data.width, inlink->w);
> +        return AVERROR(EIO);
> +    }
> +
> +    if (dnn_data.channels != 3) {
> +        av_log(ctx, AV_LOG_ERROR, "the model requires input channels %d\n",
> +                                   dnn_data.channels);
> +        return AVERROR(EIO);
> +    }
> +    if (dnn_data.dt != DNN_FLOAT && dnn_data.dt != DNN_UINT8) {
> +        av_log(ctx, AV_LOG_ERROR, "only support dnn models with input data
> type as float32 and uint8.\n");
> +        return AVERROR(EIO);
> +    }
> +
> +    ctx->input.width    = inlink->w;
> +    ctx->input.height   = inlink->h;
> +    ctx->input.channels = dnn_data.channels;
> +    ctx->input.dt = dnn_data.dt;
> +
> +    result = (ctx->model->set_input_output)(ctx->model->model,
> +                                        &ctx->input, ctx->model_inputname,
> +                                        (const char
> **)&ctx->model_outputname, 1);
> +    if (result != DNN_SUCCESS) {
> +        av_log(ctx, AV_LOG_ERROR, "could not set input and output for the
> model\n");
> +        return AVERROR(EIO);
> +    }
> +
> +    return 0;
> +}
> +
> +static int config_output(AVFilterLink *outlink)
> +{
> +    AVFilterContext *context = outlink->src;
> +    DnnProcessingContext *ctx = context->priv;
> +    DNNReturnType result;
> +
> +    // have a try run in case that the dnn model resize the frame
> +    result = (ctx->dnn_module->execute_model)(ctx->model, &ctx->output, 1);
> +    if (result != DNN_SUCCESS){
> +        av_log(ctx, AV_LOG_ERROR, "failed to execute model\n");
> +        return AVERROR(EIO);
> +    }
> +
> +    outlink->w = ctx->output.width;
> +    outlink->h = ctx->output.height;
> +
> +    return 0;
> +}
> +
> +static int copy_from_frame_to_dnn(DNNData *dnn_data, const AVFrame *in)
> +{
> +    // extend this function to support more formats
> +    av_assert0(in->format == AV_PIX_FMT_RGB24 || in->format ==
> AV_PIX_FMT_RGB24);
> +
> +    if (dnn_data->dt == DNN_FLOAT) {
> +        float *dnn_input = dnn_data->data;
> +        for (int i = 0; i < in->height; i++) {
> +            for(int j = 0; j < in->width * 3; j++) {
> +                int k = i * in->linesize[0] + j;
> +                int t = i * in->width * 3 + j;
> +                dnn_input[t] = in->data[0][k] / 255.0f;
> +            }
> +        }
> +    } else {
> +        uint8_t *dnn_input = dnn_data->data;
> +        av_assert0(dnn_data->dt == DNN_UINT8);
> +        for (int i = 0; i < in->height; i++) {
> +            for(int j = 0; j < in->width * 3; j++) {
> +                int k = i * in->linesize[0] + j;
> +                int t = i * in->width * 3 + j;
> +                dnn_input[t] = in->data[0][k];
> +            }
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static int copy_from_dnn_to_frame(AVFrame *out, const DNNData *dnn_data)
> +{
> +    // extend this function to support more formats
> +    av_assert0(out->format == AV_PIX_FMT_RGB24 || out->format ==
> AV_PIX_FMT_RGB24);
> +
> +    if (dnn_data->dt == DNN_FLOAT) {
> +        float *dnn_output = dnn_data->data;
> +        for (int i = 0; i < out->height; i++) {
> +            for(int j = 0; j < out->width * 3; j++) {
> +                int k = i * out->linesize[0] + j;
> +                int t = i * out->width * 3 + j;
> +                out->data[0][k] = av_clip((int)(dnn_output[t] * 255.0f), 0,
> 255);
> +            }
> +        }
> +    } else {
> +        uint8_t *dnn_output = dnn_data->data;
> +        av_assert0(dnn_data->dt == DNN_UINT8);
> +        for (int i = 0; i < out->height; i++) {
> +            for(int j = 0; j < out->width * 3; j++) {
> +                int k = i * out->linesize[0] + j;
> +                int t = i * out->width * 3 + j;
> +                out->data[0][k] = dnn_output[t];
> +            }
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static int filter_frame(AVFilterLink *inlink, AVFrame *in)
> +{
> +    AVFilterContext *context  = inlink->dst;
> +    AVFilterLink *outlink = context->outputs[0];
> +    DnnProcessingContext *ctx = context->priv;
> +    DNNReturnType dnn_result;
> +    AVFrame *out;
> +
> +    copy_from_frame_to_dnn(&ctx->input, in);
> +
> +    dnn_result = (ctx->dnn_module->execute_model)(ctx->model, &ctx->output,
> 1);
> +    if (dnn_result != DNN_SUCCESS){
> +        av_log(ctx, AV_LOG_ERROR, "failed to execute model\n");
> +        av_frame_free(&in);
> +        return AVERROR(EIO);
> +    }
> +    av_assert0(ctx->output.channels == 3);
> +
> +    out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> +    if (!out) {
> +        av_log(ctx, AV_LOG_ERROR, "could not allocate memory for output
> frame\n");

This log message should be removed, as it is not useful at all.

> +        av_frame_free(&in);
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    av_frame_copy_props(out, in);
> +    copy_from_dnn_to_frame(out, &ctx->output);
> +    av_frame_free(&in);
> +    return ff_filter_frame(outlink, out);
> +}
> +
> +static av_cold void uninit(AVFilterContext *ctx)
> +{
> +    DnnProcessingContext *context = ctx->priv;
> +
> +    if (context->dnn_module)
> +        (context->dnn_module->free_model)(&context->model);
> +
> +    av_freep(&context->dnn_module);
> +}
> +
> +static const AVFilterPad dnn_processing_inputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .config_props = config_input,
> +        .filter_frame = filter_frame,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad dnn_processing_outputs[] = {
> +    {
> +        .name = "default",
> +        .type = AVMEDIA_TYPE_VIDEO,
> +        .config_props  = config_output,
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_dnn_processing = {
> +    .name          = "dnn_processing",
> +    .description   = NULL_IF_CONFIG_SMALL("Apply DNN processing filter to
> the input."),
> +    .priv_size     = sizeof(DnnProcessingContext),
> +    .init          = init,
> +    .uninit        = uninit,
> +    .query_formats = query_formats,
> +    .inputs        = dnn_processing_inputs,
> +    .outputs       = dnn_processing_outputs,
> +    .priv_class    = &dnn_processing_class,
> +    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC,

If the filter changes w/h, this cannot be supported.

> +};
> --
> 2.7.4
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Guo, Yejun Oct. 28, 2019, 11:04 a.m. UTC | #3
> -----Original Message-----

> From: Paul B Mahol [mailto:onemda@gmail.com]

> Sent: Monday, October 28, 2019 4:00 PM

> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>

> Cc: Guo, Yejun <yejun.guo@intel.com>

> Subject: Re: [FFmpeg-devel] [PATCH V2 4/4] avfilter/vf_dnn_processing: add a

> generic filter for image proccessing with dnn networks

> 

> On 10/21/19, Guo, Yejun <yejun.guo@intel.com> wrote:

> > This filter accepts all the dnn networks which do image processing.

> > Currently, frame with formats rgb24 and bgr24 are supported. Other

> > formats such as gray and YUV will be supported next. The dnn network

> > can accept data in float32 or uint8 format. And the dnn network can

> > change frame size.

> >

> > Let's take an example with the following python script. This script

> > halves the value of the first channel of the pixel.

> > import tensorflow as tf

> > import numpy as np

> > import scipy.misc

> > in_img = scipy.misc.imread('in.bmp')

> > in_img = in_img.astype(np.float32)/255.0

> > in_data = in_img[np.newaxis, :]

> > filter_data = np.array([0.5, 0, 0, 0, 1., 0, 0, 0,

> > 1.]).reshape(1,1,3,3).astype(np.float32)

> > filter = tf.Variable(filter_data)

> > x = tf.placeholder(tf.float32, shape=[1, None, None, 3], name='dnn_in')

> > y = tf.nn.conv2d(x, filter, strides=[1, 1, 1, 1], padding='VALID',

> > name='dnn_out')

> > sess=tf.Session()

> > sess.run(tf.global_variables_initializer())

> > output = sess.run(y, feed_dict={x: in_data})

> > graph_def = tf.graph_util.convert_variables_to_constants(sess,

> > sess.graph_def, ['dnn_out'])

> > tf.train.write_graph(graph_def, '.', 'halve_first_channel.pb',

> > as_text=False)

> > output = output * 255.0

> > output = output.astype(np.uint8)

> > scipy.misc.imsave("out.bmp", np.squeeze(output))

> 

> So this one executes python code without ever returning or using AVFrame* ?

> This is extremely limited usage.


The purpose of this script is to demonstrate how to set up and execute dnn models with python+tensorflow.
Its only relationship with ffmpeg is that it prepares the model file halve_first_channel.pb.

The description that follows shows how ffmpeg can execute the model in a filter.

I'll update the commit log to avoid the misleading wording, thanks.

> 

> >

> > - generate halve_first_channel.pb with the above script

> > - generate halve_first_channel.model with tools/python/convert.py

> > - try with following commands

> >   ./ffmpeg -i input.jpg -vf

> >

> dnn_processing=model=halve_first_channel.model:input=dnn_in:output=dnn_

> out:fmt=rgb24:dnn_backend=native

> > -y out.native.png

> >   ./ffmpeg -i input.jpg -vf

> >

> dnn_processing=model=halve_first_channel.pb:input=dnn_in:output=dnn_out:f

> mt=rgb24:dnn_backend=tensorflow

> > -y out.tf.png

> >

> > Signed-off-by: Guo, Yejun <yejun.guo@intel.com>

> > ---

> >  configure                       |   1 +

> >  doc/filters.texi                |  44 ++++++

> >  libavfilter/Makefile            |   1 +

> >  libavfilter/allfilters.c        |   1 +

> >  libavfilter/vf_dnn_processing.c | 333

> > ++++++++++++++++++++++++++++++++++++++++

> >  5 files changed, 380 insertions(+)

> >  create mode 100644 libavfilter/vf_dnn_processing.c

> >

> > diff --git a/configure b/configure

> > index 8413826..bf2bac9 100755

> > --- a/configure

> > +++ b/configure

> > @@ -3460,6 +3460,7 @@ derain_filter_select="dnn"

> >  deshake_filter_select="pixelutils"

> >  deshake_opencl_filter_deps="opencl"

> >  dilation_opencl_filter_deps="opencl"

> > +dnn_processing_filter_select="dnn"

> >  drawtext_filter_deps="libfreetype"

> >  drawtext_filter_suggest="libfontconfig libfribidi"

> >  elbg_filter_deps="avcodec"

> > diff --git a/doc/filters.texi b/doc/filters.texi

> > index bdc4136..c11a616 100644

> > --- a/doc/filters.texi

> > +++ b/doc/filters.texi

> > @@ -8928,6 +8928,50 @@ ffmpeg -i INPUT -f lavfi -i

> > nullsrc=hd720,geq='r=128+80*(sin(sqrt((X-W/2)*(X-W/2

> >  @end example

> >  @end itemize

> >

> > +@section dnn_processing

> > +

> > +Do image processing with deep neural networks. Currently only AVFrame

> with

> > RGB24

> > +and BGR24 are supported, more formats will be added later.

> > +

> > +The filter accepts the following options:

> > +

> > +@table @option

> > +@item dnn_backend

> > +Specify which DNN backend to use for model loading and execution. This

> > option accepts

> > +the following values:

> > +

> > +@table @samp

> > +@item native

> > +Native implementation of DNN loading and execution.

> > +

> > +@item tensorflow

> > +TensorFlow backend. To enable this backend you

> > +need to install the TensorFlow for C library (see

> > +@url{https://www.tensorflow.org/install/install_c}) and configure FFmpeg

> > with

> > +@code{--enable-libtensorflow}

> > +@end table

> > +

> > +Default value is @samp{native}.

> > +

> > +@item model

> > +Set path to model file specifying network architecture and its parameters.

> > +Note that different backends use different file formats. TensorFlow and

> > native

> > +backend can load files for only its format.

> > +

> > +Native model file (.model) can be generated from TensorFlow model file

> > (.pb) by using tools/python/convert.py

> > +

> > +@item input

> > +Set the input name of the dnn network.

> > +

> > +@item output

> > +Set the output name of the dnn network.

> > +

> > +@item fmt

> > +Set the pixel format for the Frame. Allowed values are

> > @code{AV_PIX_FMT_RGB24}, and @code{AV_PIX_FMT_BGR24}.

> > +Default value is @code{AV_PIX_FMT_RGB24}.

> > +

> > +@end table

> > +

> >  @section drawbox

> >

> >  Draw a colored box on the input image.

> > diff --git a/libavfilter/Makefile b/libavfilter/Makefile

> > index 63d2fba..47a485a 100644

> > --- a/libavfilter/Makefile

> > +++ b/libavfilter/Makefile

> > @@ -224,6 +224,7 @@ OBJS-$(CONFIG_DILATION_OPENCL_FILTER)

> +=

> > vf_neighbor_opencl.o opencl.o \

> >

> opencl/neighbor.o

> >  OBJS-$(CONFIG_DISPLACE_FILTER)               += vf_displace.o

> framesync.o

> >  OBJS-$(CONFIG_DOUBLEWEAVE_FILTER)            += vf_weave.o

> > +OBJS-$(CONFIG_DNN_PROCESSING_FILTER)         +=

> vf_dnn_processing.o

> >  OBJS-$(CONFIG_DRAWBOX_FILTER)                += vf_drawbox.o

> >  OBJS-$(CONFIG_DRAWGRAPH_FILTER)              += f_drawgraph.o

> >  OBJS-$(CONFIG_DRAWGRID_FILTER)               += vf_drawbox.o

> > diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c

> > index e4186f9..485409f 100644

> > --- a/libavfilter/allfilters.c

> > +++ b/libavfilter/allfilters.c

> > @@ -209,6 +209,7 @@ extern AVFilter ff_vf_detelecine;

> >  extern AVFilter ff_vf_dilation;

> >  extern AVFilter ff_vf_dilation_opencl;

> >  extern AVFilter ff_vf_displace;

> > +extern AVFilter ff_vf_dnn_processing;

> >  extern AVFilter ff_vf_doubleweave;

> >  extern AVFilter ff_vf_drawbox;

> >  extern AVFilter ff_vf_drawgraph;

> > diff --git a/libavfilter/vf_dnn_processing.c

> > b/libavfilter/vf_dnn_processing.c

> > new file mode 100644

> > index 0000000..de89af4

> > --- /dev/null

> > +++ b/libavfilter/vf_dnn_processing.c

> > @@ -0,0 +1,333 @@

> > +/*

> > + * Copyright (c) 2019 Guo Yejun

> > + *

> > + * This file is part of FFmpeg.

> > + *

> > + * FFmpeg is free software; you can redistribute it and/or

> > + * modify it under the terms of the GNU Lesser General Public

> > + * License as published by the Free Software Foundation; either

> > + * version 2.1 of the License, or (at your option) any later version.

> > + *

> > + * FFmpeg is distributed in the hope that it will be useful,

> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of

> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

> GNU

> > + * Lesser General Public License for more details.

> > + *

> > + * You should have received a copy of the GNU Lesser General Public

> > + * License along with FFmpeg; if not, write to the Free Software

> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301

> > USA

> > + */

> > +

> > +/**

> > + * @file

> > + * implementing a generic image processing filter using deep learning

> > networks.

> > + */

> > +

> > +#include "libavformat/avio.h"

> > +#include "libavutil/opt.h"

> > +#include "libavutil/pixdesc.h"

> > +#include "libavutil/avassert.h"

> > +#include "avfilter.h"

> > +#include "dnn_interface.h"

> > +#include "formats.h"

> > +#include "internal.h"

> > +

> > +typedef struct DnnProcessingContext {

> > +    const AVClass *class;

> > +

> > +    char *model_filename;

> > +    DNNBackendType backend_type;

> > +    enum AVPixelFormat fmt;

> 

> This should be int.


Could you please explain a bit more why 'enum AVPixelFormat' should be int?

I searched for 'AV_OPT_TYPE_PIXEL_FMT' in the vf_* files and found that 'enum AVPixelFormat' is used in
vf_mergeplanes.c, vf_program_opencl.c and vf_tonemap_opencl.c.

> 

> > +    char *model_inputname;

> > +    char *model_outputname;

> > +

> > +    DNNModule *dnn_module;

> > +    DNNModel *model;

> > +

> > +    // input & output of the model at execution time

> > +    DNNData input;

> > +    DNNData output;

> > +} DnnProcessingContext;

> > +

> > +#define OFFSET(x) offsetof(DnnProcessingContext, x)

> > +#define FLAGS AV_OPT_FLAG_FILTERING_PARAM |

> AV_OPT_FLAG_VIDEO_PARAM

> > +static const AVOption dnn_processing_options[] = {

> > +    { "dnn_backend", "DNN backend",

> OFFSET(backend_type),

> >  AV_OPT_TYPE_INT,       { .i64 = 0 },    0, 1, FLAGS, "backend" },

> > +    { "native",      "native backend flag",        0,

> >  AV_OPT_TYPE_CONST,     { .i64 = 0 },    0, 0, FLAGS, "backend" },

> > +#if (CONFIG_LIBTENSORFLOW == 1)

> > +    { "tensorflow",  "tensorflow backend flag",    0,

> >  AV_OPT_TYPE_CONST,     { .i64 = 1 },    0, 0, FLAGS, "backend" },

> > +#endif

> > +    { "model",       "path to model file",

> OFFSET(model_filename),

> >  AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },

> > +    { "input",       "input name of the model",

> OFFSET(model_inputname),

> >  AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },

> > +    { "output",      "output name of the model",

> > OFFSET(model_outputname), AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0,

> > FLAGS },

> > +    { "fmt",         "AVPixelFormat of the frame", OFFSET(fmt),

> >  AV_OPT_TYPE_PIXEL_FMT, { .i64=AV_PIX_FMT_RGB24 },

> AV_PIX_FMT_NONE,

> > AV_PIX_FMT_NB - 1, FLAGS },

> > +    { NULL }

> > +};

> > +

> > +AVFILTER_DEFINE_CLASS(dnn_processing);

> > +

> > +static av_cold int init(AVFilterContext *context)

> > +{

> > +    DnnProcessingContext *ctx = context->priv;

> > +    int supported = 0;

> > +    // as the first step, only rgb24 and bgr24 are supported

> > +    const enum AVPixelFormat supported_pixel_fmts[] = {

> > +        AV_PIX_FMT_RGB24,

> > +        AV_PIX_FMT_BGR24,

> > +    };

> > +    for (int i = 0; i < sizeof(supported_pixel_fmts) / sizeof(enum

> > AVPixelFormat); ++i) {

> > +        if (supported_pixel_fmts[i] == ctx->fmt) {

> > +            supported = 1;

> > +            break;

> > +        }

> > +    }

> > +    if (!supported) {

> > +        av_log(context, AV_LOG_ERROR, "pixel fmt %s not supported

> yet\n",

> > +

> av_get_pix_fmt_name(ctx->fmt));

> > +        return AVERROR(AVERROR_INVALIDDATA);

> > +    }

> > +

> > +    if (!ctx->model_filename) {

> > +        av_log(ctx, AV_LOG_ERROR, "model file for network is not

> > specified\n");

> > +        return AVERROR(EINVAL);

> > +    }

> > +    if (!ctx->model_inputname) {

> > +        av_log(ctx, AV_LOG_ERROR, "intput name of the model network is

> not

> 

> Typo


thanks, will fix.

> 

> > specified\n");

> > +        return AVERROR(EINVAL);

> > +    }

> > +    if (!ctx->model_outputname) {

> > +        av_log(ctx, AV_LOG_ERROR, "output name of the model network

> is not

> > specified\n");

> > +        return AVERROR(EINVAL);

> > +    }

> > +

> > +    ctx->dnn_module = ff_get_dnn_module(ctx->backend_type);

> > +    if (!ctx->dnn_module) {

> > +        av_log(ctx, AV_LOG_ERROR, "could not create DNN module for

> > requested backend\n");

> > +        return AVERROR(ENOMEM);

> > +    }

> > +    if (!ctx->dnn_module->load_model) {

> > +        av_log(ctx, AV_LOG_ERROR, "load_model for network is not

> > specified\n");

> > +        return AVERROR(EINVAL);

> > +    }

> > +

> > +    ctx->model = (ctx->dnn_module->load_model)(ctx->model_filename);

> > +    if (!ctx->model) {

> > +        av_log(ctx, AV_LOG_ERROR, "could not load DNN model\n");

> > +        return AVERROR(EINVAL);

> > +    }

> > +

> > +    return 0;

> > +}

> > +

> > +static int query_formats(AVFilterContext *context)

> > +{

> > +    AVFilterFormats *formats;

> > +    DnnProcessingContext *ctx = context->priv;

> > +    enum AVPixelFormat pixel_fmts[2];

> > +    pixel_fmts[0] = ctx->fmt;

> > +    pixel_fmts[1] = AV_PIX_FMT_NONE;

> > +

> > +    formats = ff_make_format_list(pixel_fmts);

> > +    return ff_set_common_formats(context, formats);

> > +}

> > +

> > +static int config_input(AVFilterLink *inlink)

> > +{

> > +    AVFilterContext *context     = inlink->dst;

> > +    DnnProcessingContext *ctx = context->priv;

> > +    DNNReturnType result;

> > +    DNNData dnn_data;

> > +

> > +    result = ctx->model->get_input(ctx->model->model, &dnn_data,

> > ctx->model_inputname);

> > +    if (result != DNN_SUCCESS) {

> > +        av_log(ctx, AV_LOG_ERROR, "could not get input from the

> model\n");

> > +        return AVERROR(EIO);

> > +    }

> > +

> > +    // the design is to add explicit scale filter before this filter

> > +    if (dnn_data.height != -1 && dnn_data.height != inlink->h) {

> > +        av_log(ctx, AV_LOG_ERROR, "the model requires frame height %d

> but

> > got %d\n",

> > +                                   dnn_data.height, inlink->h);

> > +        return AVERROR(EIO);

> > +    }

> > +    if (dnn_data.width != -1 && dnn_data.width != inlink->w) {

> > +        av_log(ctx, AV_LOG_ERROR, "the model requires frame width %d

> but

> > got %d\n",

> > +                                   dnn_data.width, inlink->w);

> > +        return AVERROR(EIO);

> > +    }

> > +

> > +    if (dnn_data.channels != 3) {

> > +        av_log(ctx, AV_LOG_ERROR, "the model requires input

> channels %d\n",

> > +                                   dnn_data.channels);

> > +        return AVERROR(EIO);

> > +    }

> > +    if (dnn_data.dt != DNN_FLOAT && dnn_data.dt != DNN_UINT8) {

> > +        av_log(ctx, AV_LOG_ERROR, "only support dnn models with input

> data

> > type as float32 and uint8.\n");

> > +        return AVERROR(EIO);

> > +    }

> > +

> > +    ctx->input.width    = inlink->w;

> > +    ctx->input.height   = inlink->h;

> > +    ctx->input.channels = dnn_data.channels;

> > +    ctx->input.dt = dnn_data.dt;

> > +

> > +    result = (ctx->model->set_input_output)(ctx->model->model,

> > +                                        &ctx->input,

> ctx->model_inputname,

> > +                                        (const char

> > **)&ctx->model_outputname, 1);

> > +    if (result != DNN_SUCCESS) {

> > +        av_log(ctx, AV_LOG_ERROR, "could not set input and output for

> the

> > model\n");

> > +        return AVERROR(EIO);

> > +    }

> > +

> > +    return 0;

> > +}

> > +

> > +static int config_output(AVFilterLink *outlink)

> > +{

> > +    AVFilterContext *context = outlink->src;

> > +    DnnProcessingContext *ctx = context->priv;

> > +    DNNReturnType result;

> > +

> > +    // have a try run in case that the dnn model resize the frame

> > +    result = (ctx->dnn_module->execute_model)(ctx->model, &ctx->output,

> 1);

> > +    if (result != DNN_SUCCESS){

> > +        av_log(ctx, AV_LOG_ERROR, "failed to execute model\n");

> > +        return AVERROR(EIO);

> > +    }

> > +

> > +    outlink->w = ctx->output.width;

> > +    outlink->h = ctx->output.height;

> > +

> > +    return 0;

> > +}

> > +

> > +static int copy_from_frame_to_dnn(DNNData *dnn_data, const AVFrame

> *in)

> > +{

> > +    // extend this function to support more formats

> > +    av_assert0(in->format == AV_PIX_FMT_RGB24 || in->format ==

> > AV_PIX_FMT_RGB24);

> > +

> > +    if (dnn_data->dt == DNN_FLOAT) {

> > +        float *dnn_input = dnn_data->data;

> > +        for (int i = 0; i < in->height; i++) {

> > +            for(int j = 0; j < in->width * 3; j++) {

> > +                int k = i * in->linesize[0] + j;

> > +                int t = i * in->width * 3 + j;

> > +                dnn_input[t] = in->data[0][k] / 255.0f;

> > +            }

> > +        }

> > +    } else {

> > +        uint8_t *dnn_input = dnn_data->data;

> > +        av_assert0(dnn_data->dt == DNN_UINT8);

> > +        for (int i = 0; i < in->height; i++) {

> > +            for(int j = 0; j < in->width * 3; j++) {

> > +                int k = i * in->linesize[0] + j;

> > +                int t = i * in->width * 3 + j;

> > +                dnn_input[t] = in->data[0][k];

> > +            }

> > +        }

> > +    }

> > +

> > +    return 0;

> > +}

> > +

> > +static int copy_from_dnn_to_frame(AVFrame *out, const DNNData

> *dnn_data)

> > +{

> > +    // extend this function to support more formats

> > +    av_assert0(out->format == AV_PIX_FMT_RGB24 || out->format ==

> > AV_PIX_FMT_RGB24);

> > +

> > +    if (dnn_data->dt == DNN_FLOAT) {

> > +        float *dnn_output = dnn_data->data;

> > +        for (int i = 0; i < out->height; i++) {

> > +            for(int j = 0; j < out->width * 3; j++) {

> > +                int k = i * out->linesize[0] + j;

> > +                int t = i * out->width * 3 + j;

> > +                out->data[0][k] = av_clip((int)(dnn_output[t] * 255.0f), 0,

> > 255);

> > +            }

> > +        }

> > +    } else {

> > +        uint8_t *dnn_output = dnn_data->data;

> > +        av_assert0(dnn_data->dt == DNN_UINT8);

> > +        for (int i = 0; i < out->height; i++) {

> > +            for(int j = 0; j < out->width * 3; j++) {

> > +                int k = i * out->linesize[0] + j;

> > +                int t = i * out->width * 3 + j;

> > +                out->data[0][k] = dnn_output[t];

> > +            }

> > +        }

> > +    }

> > +

> > +    return 0;

> > +}

> > +

> > +static int filter_frame(AVFilterLink *inlink, AVFrame *in)

> > +{

> > +    AVFilterContext *context  = inlink->dst;

> > +    AVFilterLink *outlink = context->outputs[0];

> > +    DnnProcessingContext *ctx = context->priv;

> > +    DNNReturnType dnn_result;

> > +    AVFrame *out;

> > +

> > +    copy_from_frame_to_dnn(&ctx->input, in);

> > +

> > +    dnn_result = (ctx->dnn_module->execute_model)(ctx->model,

> &ctx->output,

> > 1);

> > +    if (dnn_result != DNN_SUCCESS){

> > +        av_log(ctx, AV_LOG_ERROR, "failed to execute model\n");

> > +        av_frame_free(&in);

> > +        return AVERROR(EIO);

> > +    }

> > +    av_assert0(ctx->output.channels == 3);

> > +

> > +    out = ff_get_video_buffer(outlink, outlink->w, outlink->h);

> > +    if (!out) {

> > +        av_log(ctx, AV_LOG_ERROR, "could not allocate memory for

> output

> > frame\n");

> 

> This log message should be removed, as it is not useful at all.


thanks, will remove

> 

> > +        av_frame_free(&in);

> > +        return AVERROR(ENOMEM);

> > +    }

> > +

> > +    av_frame_copy_props(out, in);

> > +    copy_from_dnn_to_frame(out, &ctx->output);

> > +    av_frame_free(&in);

> > +    return ff_filter_frame(outlink, out);

> > +}

> > +

> > +static av_cold void uninit(AVFilterContext *ctx)

> > +{

> > +    DnnProcessingContext *context = ctx->priv;

> > +

> > +    if (context->dnn_module)

> > +        (context->dnn_module->free_model)(&context->model);

> > +

> > +    av_freep(&context->dnn_module);

> > +}

> > +

> > +static const AVFilterPad dnn_processing_inputs[] = {

> > +    {

> > +        .name         = "default",

> > +        .type         = AVMEDIA_TYPE_VIDEO,

> > +        .config_props = config_input,

> > +        .filter_frame = filter_frame,

> > +    },

> > +    { NULL }

> > +};

> > +

> > +static const AVFilterPad dnn_processing_outputs[] = {

> > +    {

> > +        .name = "default",

> > +        .type = AVMEDIA_TYPE_VIDEO,

> > +        .config_props  = config_output,

> > +    },

> > +    { NULL }

> > +};

> > +

> > +AVFilter ff_vf_dnn_processing = {

> > +    .name          = "dnn_processing",

> > +    .description   = NULL_IF_CONFIG_SMALL("Apply DNN processing

> filter to

> > the input."),

> > +    .priv_size     = sizeof(DnnProcessingContext),

> > +    .init          = init,

> > +    .uninit        = uninit,

> > +    .query_formats = query_formats,

> > +    .inputs        = dnn_processing_inputs,

> > +    .outputs       = dnn_processing_outputs,

> > +    .priv_class    = &dnn_processing_class,

> > +    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC,

> 

> If filter changes w/h, this can not be supported.


got it, will remove this line, thank you.

> 

> > +};

> > --

> > 2.7.4

> >

> > _______________________________________________

> > ffmpeg-devel mailing list

> > ffmpeg-devel@ffmpeg.org

> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> >

> > To unsubscribe, visit link above, or email

> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Paul B Mahol Oct. 28, 2019, 11:51 a.m. UTC | #4
On 10/28/19, Guo, Yejun <yejun.guo@intel.com> wrote:
>
>
>> -----Original Message-----
>> From: Paul B Mahol [mailto:onemda@gmail.com]
>> Sent: Monday, October 28, 2019 4:00 PM
>> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
>> Cc: Guo, Yejun <yejun.guo@intel.com>
>> Subject: Re: [FFmpeg-devel] [PATCH V2 4/4] avfilter/vf_dnn_processing: add
>> a
>> generic filter for image proccessing with dnn networks
>>
>> On 10/21/19, Guo, Yejun <yejun.guo@intel.com> wrote:
>> > This filter accepts all the dnn networks which do image processing.
>> > Currently, frame with formats rgb24 and bgr24 are supported. Other
>> > formats such as gray and YUV will be supported next. The dnn network
>> > can accept data in float32 or uint8 format. And the dnn network can
>> > change frame size.
>> >
>> > Let's take an example with the following python script. This script
>> > halves the value of the first channel of the pixel.
>> > import tensorflow as tf
>> > import numpy as np
>> > import scipy.misc
>> > in_img = scipy.misc.imread('in.bmp')
>> > in_img = in_img.astype(np.float32)/255.0
>> > in_data = in_img[np.newaxis, :]
>> > filter_data = np.array([0.5, 0, 0, 0, 1., 0, 0, 0,
>> > 1.]).reshape(1,1,3,3).astype(np.float32)
>> > filter = tf.Variable(filter_data)
>> > x = tf.placeholder(tf.float32, shape=[1, None, None, 3], name='dnn_in')
>> > y = tf.nn.conv2d(x, filter, strides=[1, 1, 1, 1], padding='VALID',
>> > name='dnn_out')
>> > sess=tf.Session()
>> > sess.run(tf.global_variables_initializer())
>> > output = sess.run(y, feed_dict={x: in_data})
>> > graph_def = tf.graph_util.convert_variables_to_constants(sess,
>> > sess.graph_def, ['dnn_out'])
>> > tf.train.write_graph(graph_def, '.', 'halve_first_channel.pb',
>> > as_text=False)
>> > output = output * 255.0
>> > output = output.astype(np.uint8)
>> > scipy.misc.imsave("out.bmp", np.squeeze(output))
>>
>> So this one executes python code without ever returning or using AVFrame*
>> ?
>> This is extremely limited usage.
>
> the purpose of this script is to demo how to setup and execute dnn models
> with python+tensorflow.
> The only relationship with ffmpeg is that the script prepares the model file
> halve_first_channel.pb.
>
> The next description shows how ffmpeg can execute the model in a filter.
>
> I'll try to update the commit log to avoid misleading words, thanks.
>
>>
>> >
>> > - generate halve_first_channel.pb with the above script
>> > - generate halve_first_channel.model with tools/python/convert.py
>> > - try with following commands
>> >   ./ffmpeg -i input.jpg -vf
>> >
>> dnn_processing=model=halve_first_channel.model:input=dnn_in:output=dnn_
>> out:fmt=rgb24:dnn_backend=native
>> > -y out.native.png
>> >   ./ffmpeg -i input.jpg -vf
>> >
>> dnn_processing=model=halve_first_channel.pb:input=dnn_in:output=dnn_out:f
>> mt=rgb24:dnn_backend=tensorflow
>> > -y out.tf.png
>> >
>> > Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
>> > ---
>> >  configure                       |   1 +
>> >  doc/filters.texi                |  44 ++++++
>> >  libavfilter/Makefile            |   1 +
>> >  libavfilter/allfilters.c        |   1 +
>> >  libavfilter/vf_dnn_processing.c | 333
>> > ++++++++++++++++++++++++++++++++++++++++
>> >  5 files changed, 380 insertions(+)
>> >  create mode 100644 libavfilter/vf_dnn_processing.c
>> >
>> > diff --git a/configure b/configure
>> > index 8413826..bf2bac9 100755
>> > --- a/configure
>> > +++ b/configure
>> > @@ -3460,6 +3460,7 @@ derain_filter_select="dnn"
>> >  deshake_filter_select="pixelutils"
>> >  deshake_opencl_filter_deps="opencl"
>> >  dilation_opencl_filter_deps="opencl"
>> > +dnn_processing_filter_select="dnn"
>> >  drawtext_filter_deps="libfreetype"
>> >  drawtext_filter_suggest="libfontconfig libfribidi"
>> >  elbg_filter_deps="avcodec"
>> > diff --git a/doc/filters.texi b/doc/filters.texi
>> > index bdc4136..c11a616 100644
>> > --- a/doc/filters.texi
>> > +++ b/doc/filters.texi
>> > @@ -8928,6 +8928,50 @@ ffmpeg -i INPUT -f lavfi -i
>> > nullsrc=hd720,geq='r=128+80*(sin(sqrt((X-W/2)*(X-W/2
>> >  @end example
>> >  @end itemize
>> >
>> > +@section dnn_processing
>> > +
>> > +Do image processing with deep neural networks. Currently only AVFrame
>> with
>> > RGB24
>> > +and BGR24 are supported, more formats will be added later.
>> > +
>> > +The filter accepts the following options:
>> > +
>> > +@table @option
>> > +@item dnn_backend
>> > +Specify which DNN backend to use for model loading and execution. This
>> > option accepts
>> > +the following values:
>> > +
>> > +@table @samp
>> > +@item native
>> > +Native implementation of DNN loading and execution.
>> > +
>> > +@item tensorflow
>> > +TensorFlow backend. To enable this backend you
>> > +need to install the TensorFlow for C library (see
>> > +@url{https://www.tensorflow.org/install/install_c}) and configure
>> > FFmpeg
>> > with
>> > +@code{--enable-libtensorflow}
>> > +@end table
>> > +
>> > +Default value is @samp{native}.
>> > +
>> > +@item model
>> > +Set path to model file specifying network architecture and its
>> > parameters.
>> > +Note that different backends use different file formats. TensorFlow
>> > and
>> > native
>> > +backend can load files for only its format.
>> > +
>> > +Native model file (.model) can be generated from TensorFlow model file
>> > (.pb) by using tools/python/convert.py
>> > +
>> > +@item input
>> > +Set the input name of the dnn network.
>> > +
>> > +@item output
>> > +Set the output name of the dnn network.
>> > +
>> > +@item fmt
>> > +Set the pixel format for the Frame. Allowed values are
>> > @code{AV_PIX_FMT_RGB24}, and @code{AV_PIX_FMT_BGR24}.
>> > +Default value is @code{AV_PIX_FMT_RGB24}.
>> > +
>> > +@end table
>> > +
>> >  @section drawbox
>> >
>> >  Draw a colored box on the input image.
>> > diff --git a/libavfilter/Makefile b/libavfilter/Makefile
>> > index 63d2fba..47a485a 100644
>> > --- a/libavfilter/Makefile
>> > +++ b/libavfilter/Makefile
>> > @@ -224,6 +224,7 @@ OBJS-$(CONFIG_DILATION_OPENCL_FILTER)
>> +=
>> > vf_neighbor_opencl.o opencl.o \
>> >
>> opencl/neighbor.o
>> >  OBJS-$(CONFIG_DISPLACE_FILTER)               += vf_displace.o
>> framesync.o
>> >  OBJS-$(CONFIG_DOUBLEWEAVE_FILTER)            += vf_weave.o
>> > +OBJS-$(CONFIG_DNN_PROCESSING_FILTER)         +=
>> vf_dnn_processing.o
>> >  OBJS-$(CONFIG_DRAWBOX_FILTER)                += vf_drawbox.o
>> >  OBJS-$(CONFIG_DRAWGRAPH_FILTER)              += f_drawgraph.o
>> >  OBJS-$(CONFIG_DRAWGRID_FILTER)               += vf_drawbox.o
>> > diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
>> > index e4186f9..485409f 100644
>> > --- a/libavfilter/allfilters.c
>> > +++ b/libavfilter/allfilters.c
>> > @@ -209,6 +209,7 @@ extern AVFilter ff_vf_detelecine;
>> >  extern AVFilter ff_vf_dilation;
>> >  extern AVFilter ff_vf_dilation_opencl;
>> >  extern AVFilter ff_vf_displace;
>> > +extern AVFilter ff_vf_dnn_processing;
>> >  extern AVFilter ff_vf_doubleweave;
>> >  extern AVFilter ff_vf_drawbox;
>> >  extern AVFilter ff_vf_drawgraph;
>> > diff --git a/libavfilter/vf_dnn_processing.c
>> > b/libavfilter/vf_dnn_processing.c
>> > new file mode 100644
>> > index 0000000..de89af4
>> > --- /dev/null
>> > +++ b/libavfilter/vf_dnn_processing.c
>> > @@ -0,0 +1,333 @@
>> > +/*
>> > + * Copyright (c) 2019 Guo Yejun
>> > + *
>> > + * This file is part of FFmpeg.
>> > + *
>> > + * FFmpeg is free software; you can redistribute it and/or
>> > + * modify it under the terms of the GNU Lesser General Public
>> > + * License as published by the Free Software Foundation; either
>> > + * version 2.1 of the License, or (at your option) any later version.
>> > + *
>> > + * FFmpeg is distributed in the hope that it will be useful,
>> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> GNU
>> > + * Lesser General Public License for more details.
>> > + *
>> > + * You should have received a copy of the GNU Lesser General Public
>> > + * License along with FFmpeg; if not, write to the Free Software
>> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> > 02110-1301
>> > USA
>> > + */
>> > +
>> > +/**
>> > + * @file
>> > + * implementing a generic image processing filter using deep learning
>> > networks.
>> > + */
>> > +
>> > +#include "libavformat/avio.h"
>> > +#include "libavutil/opt.h"
>> > +#include "libavutil/pixdesc.h"
>> > +#include "libavutil/avassert.h"
>> > +#include "avfilter.h"
>> > +#include "dnn_interface.h"
>> > +#include "formats.h"
>> > +#include "internal.h"
>> > +
>> > +typedef struct DnnProcessingContext {
>> > +    const AVClass *class;
>> > +
>> > +    char *model_filename;
>> > +    DNNBackendType backend_type;
>> > +    enum AVPixelFormat fmt;
>>
>> This should be int.
>
> could you please help to explain a bit more why 'enum AVPixelFormat' should
> be int.
>
> I searched 'AV_OPT_TYPE_PIXEL_FMT' in vf_* files and found 'enum
> AVPixelFormat' is used in
> vf_mergeplanes.c, vf_program_opencl.c and vf_tonemap_opencl.c.

That is an error; I am going to fix mergeplanes ASAP. Sometimes an enum may use
a completely different type; Michael may know more.
Besides, options assume ints and not enums.

>
>>
>> > +    char *model_inputname;
>> > +    char *model_outputname;
>> > +
>> > +    DNNModule *dnn_module;
>> > +    DNNModel *model;
>> > +
>> > +    // input & output of the model at execution time
>> > +    DNNData input;
>> > +    DNNData output;
>> > +} DnnProcessingContext;
>> > +
>> > +#define OFFSET(x) offsetof(DnnProcessingContext, x)
>> > +#define FLAGS AV_OPT_FLAG_FILTERING_PARAM |
>> AV_OPT_FLAG_VIDEO_PARAM
>> > +static const AVOption dnn_processing_options[] = {
>> > +    { "dnn_backend", "DNN backend",
>> OFFSET(backend_type),
>> >  AV_OPT_TYPE_INT,       { .i64 = 0 },    0, 1, FLAGS, "backend" },
>> > +    { "native",      "native backend flag",        0,
>> >  AV_OPT_TYPE_CONST,     { .i64 = 0 },    0, 0, FLAGS, "backend" },
>> > +#if (CONFIG_LIBTENSORFLOW == 1)
>> > +    { "tensorflow",  "tensorflow backend flag",    0,
>> >  AV_OPT_TYPE_CONST,     { .i64 = 1 },    0, 0, FLAGS, "backend" },
>> > +#endif
>> > +    { "model",       "path to model file",
>> OFFSET(model_filename),
>> >  AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
>> > +    { "input",       "input name of the model",
>> OFFSET(model_inputname),
>> >  AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
>> > +    { "output",      "output name of the model",
>> > OFFSET(model_outputname), AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0,
>> > FLAGS },
>> > +    { "fmt",         "AVPixelFormat of the frame", OFFSET(fmt),
>> >  AV_OPT_TYPE_PIXEL_FMT, { .i64=AV_PIX_FMT_RGB24 },
>> AV_PIX_FMT_NONE,
>> > AV_PIX_FMT_NB - 1, FLAGS },
>> > +    { NULL }
>> > +};
>> > +
>> > +AVFILTER_DEFINE_CLASS(dnn_processing);
>> > +
>> > +static av_cold int init(AVFilterContext *context)
>> > +{
>> > +    DnnProcessingContext *ctx = context->priv;
>> > +    int supported = 0;
>> > +    // as the first step, only rgb24 and bgr24 are supported
>> > +    const enum AVPixelFormat supported_pixel_fmts[] = {
>> > +        AV_PIX_FMT_RGB24,
>> > +        AV_PIX_FMT_BGR24,
>> > +    };
>> > +    for (int i = 0; i < sizeof(supported_pixel_fmts) / sizeof(enum
>> > AVPixelFormat); ++i) {
>> > +        if (supported_pixel_fmts[i] == ctx->fmt) {
>> > +            supported = 1;
>> > +            break;
>> > +        }
>> > +    }
>> > +    if (!supported) {
>> > +        av_log(context, AV_LOG_ERROR, "pixel fmt %s not supported
>> yet\n",
>> > +
>> av_get_pix_fmt_name(ctx->fmt));
>> > +        return AVERROR(AVERROR_INVALIDDATA);
>> > +    }
>> > +
>> > +    if (!ctx->model_filename) {
>> > +        av_log(ctx, AV_LOG_ERROR, "model file for network is not
>> > specified\n");
>> > +        return AVERROR(EINVAL);
>> > +    }
>> > +    if (!ctx->model_inputname) {
>> > +        av_log(ctx, AV_LOG_ERROR, "intput name of the model network is
>> not
>>
>> Typo
>
> thanks, will fix.
>
>>
>> > specified\n");
>> > +        return AVERROR(EINVAL);
>> > +    }
>> > +    if (!ctx->model_outputname) {
>> > +        av_log(ctx, AV_LOG_ERROR, "output name of the model network
>> is not
>> > specified\n");
>> > +        return AVERROR(EINVAL);
>> > +    }
>> > +
>> > +    ctx->dnn_module = ff_get_dnn_module(ctx->backend_type);
>> > +    if (!ctx->dnn_module) {
>> > +        av_log(ctx, AV_LOG_ERROR, "could not create DNN module for
>> > requested backend\n");
>> > +        return AVERROR(ENOMEM);
>> > +    }
>> > +    if (!ctx->dnn_module->load_model) {
>> > +        av_log(ctx, AV_LOG_ERROR, "load_model for network is not
>> > specified\n");
>> > +        return AVERROR(EINVAL);
>> > +    }
>> > +
>> > +    ctx->model = (ctx->dnn_module->load_model)(ctx->model_filename);
>> > +    if (!ctx->model) {
>> > +        av_log(ctx, AV_LOG_ERROR, "could not load DNN model\n");
>> > +        return AVERROR(EINVAL);
>> > +    }
>> > +
>> > +    return 0;
>> > +}
>> > +
>> > +static int query_formats(AVFilterContext *context)
>> > +{
>> > +    AVFilterFormats *formats;
>> > +    DnnProcessingContext *ctx = context->priv;
>> > +    enum AVPixelFormat pixel_fmts[2];
>> > +    pixel_fmts[0] = ctx->fmt;
>> > +    pixel_fmts[1] = AV_PIX_FMT_NONE;
>> > +
>> > +    formats = ff_make_format_list(pixel_fmts);
>> > +    return ff_set_common_formats(context, formats);
>> > +}
>> > +
>> > +static int config_input(AVFilterLink *inlink)
>> > +{
>> > +    AVFilterContext *context     = inlink->dst;
>> > +    DnnProcessingContext *ctx = context->priv;
>> > +    DNNReturnType result;
>> > +    DNNData dnn_data;
>> > +
>> > +    result = ctx->model->get_input(ctx->model->model, &dnn_data,
>> > ctx->model_inputname);
>> > +    if (result != DNN_SUCCESS) {
>> > +        av_log(ctx, AV_LOG_ERROR, "could not get input from the
>> model\n");
>> > +        return AVERROR(EIO);
>> > +    }
>> > +
>> > +    // the design is to add explicit scale filter before this filter
>> > +    if (dnn_data.height != -1 && dnn_data.height != inlink->h) {
>> > +        av_log(ctx, AV_LOG_ERROR, "the model requires frame height %d
>> but
>> > got %d\n",
>> > +                                   dnn_data.height, inlink->h);
>> > +        return AVERROR(EIO);
>> > +    }
>> > +    if (dnn_data.width != -1 && dnn_data.width != inlink->w) {
>> > +        av_log(ctx, AV_LOG_ERROR, "the model requires frame width %d
>> but
>> > got %d\n",
>> > +                                   dnn_data.width, inlink->w);
>> > +        return AVERROR(EIO);
>> > +    }
>> > +
>> > +    if (dnn_data.channels != 3) {
>> > +        av_log(ctx, AV_LOG_ERROR, "the model requires input
>> channels %d\n",
>> > +                                   dnn_data.channels);
>> > +        return AVERROR(EIO);
>> > +    }
>> > +    if (dnn_data.dt != DNN_FLOAT && dnn_data.dt != DNN_UINT8) {
>> > +        av_log(ctx, AV_LOG_ERROR, "only support dnn models with input
>> data
>> > type as float32 and uint8.\n");
>> > +        return AVERROR(EIO);
>> > +    }
>> > +
>> > +    ctx->input.width    = inlink->w;
>> > +    ctx->input.height   = inlink->h;
>> > +    ctx->input.channels = dnn_data.channels;
>> > +    ctx->input.dt = dnn_data.dt;
>> > +
>> > +    result = (ctx->model->set_input_output)(ctx->model->model,
>> > +                                        &ctx->input,
>> ctx->model_inputname,
>> > +                                        (const char
>> > **)&ctx->model_outputname, 1);
>> > +    if (result != DNN_SUCCESS) {
>> > +        av_log(ctx, AV_LOG_ERROR, "could not set input and output for
>> the
>> > model\n");
>> > +        return AVERROR(EIO);
>> > +    }
>> > +
>> > +    return 0;
>> > +}
>> > +
>> > +static int config_output(AVFilterLink *outlink)
>> > +{
>> > +    AVFilterContext *context = outlink->src;
>> > +    DnnProcessingContext *ctx = context->priv;
>> > +    DNNReturnType result;
>> > +
>> > +    // have a try run in case that the dnn model resize the frame
>> > +    result = (ctx->dnn_module->execute_model)(ctx->model,
>> > &ctx->output,
>> 1);
>> > +    if (result != DNN_SUCCESS){
>> > +        av_log(ctx, AV_LOG_ERROR, "failed to execute model\n");
>> > +        return AVERROR(EIO);
>> > +    }
>> > +
>> > +    outlink->w = ctx->output.width;
>> > +    outlink->h = ctx->output.height;
>> > +
>> > +    return 0;
>> > +}
>> > +
>> > +static int copy_from_frame_to_dnn(DNNData *dnn_data, const AVFrame
>> *in)
>> > +{
>> > +    // extend this function to support more formats
>> > +    av_assert0(in->format == AV_PIX_FMT_RGB24 || in->format ==
>> > AV_PIX_FMT_RGB24);
>> > +
>> > +    if (dnn_data->dt == DNN_FLOAT) {
>> > +        float *dnn_input = dnn_data->data;
>> > +        for (int i = 0; i < in->height; i++) {
>> > +            for(int j = 0; j < in->width * 3; j++) {
>> > +                int k = i * in->linesize[0] + j;
>> > +                int t = i * in->width * 3 + j;
>> > +                dnn_input[t] = in->data[0][k] / 255.0f;
>> > +            }
>> > +        }
>> > +    } else {
>> > +        uint8_t *dnn_input = dnn_data->data;
>> > +        av_assert0(dnn_data->dt == DNN_UINT8);
>> > +        for (int i = 0; i < in->height; i++) {
>> > +            for(int j = 0; j < in->width * 3; j++) {
>> > +                int k = i * in->linesize[0] + j;
>> > +                int t = i * in->width * 3 + j;
>> > +                dnn_input[t] = in->data[0][k];
>> > +            }
>> > +        }
>> > +    }
>> > +
>> > +    return 0;
>> > +}
>> > +
>> > +static int copy_from_dnn_to_frame(AVFrame *out, const DNNData
>> *dnn_data)
>> > +{
>> > +    // extend this function to support more formats
>> > +    av_assert0(out->format == AV_PIX_FMT_RGB24 || out->format ==
>> > AV_PIX_FMT_RGB24);
>> > +
>> > +    if (dnn_data->dt == DNN_FLOAT) {
>> > +        float *dnn_output = dnn_data->data;
>> > +        for (int i = 0; i < out->height; i++) {
>> > +            for(int j = 0; j < out->width * 3; j++) {
>> > +                int k = i * out->linesize[0] + j;
>> > +                int t = i * out->width * 3 + j;
>> > +                out->data[0][k] = av_clip((int)(dnn_output[t] *
>> > 255.0f), 0,
>> > 255);
>> > +            }
>> > +        }
>> > +    } else {
>> > +        uint8_t *dnn_output = dnn_data->data;
>> > +        av_assert0(dnn_data->dt == DNN_UINT8);
>> > +        for (int i = 0; i < out->height; i++) {
>> > +            for(int j = 0; j < out->width * 3; j++) {
>> > +                int k = i * out->linesize[0] + j;
>> > +                int t = i * out->width * 3 + j;
>> > +                out->data[0][k] = dnn_output[t];
>> > +            }
>> > +        }
>> > +    }
>> > +
>> > +    return 0;
>> > +}
>> > +
>> > +static int filter_frame(AVFilterLink *inlink, AVFrame *in)
>> > +{
>> > +    AVFilterContext *context  = inlink->dst;
>> > +    AVFilterLink *outlink = context->outputs[0];
>> > +    DnnProcessingContext *ctx = context->priv;
>> > +    DNNReturnType dnn_result;
>> > +    AVFrame *out;
>> > +
>> > +    copy_from_frame_to_dnn(&ctx->input, in);
>> > +
>> > +    dnn_result = (ctx->dnn_module->execute_model)(ctx->model,
>> &ctx->output,
>> > 1);
>> > +    if (dnn_result != DNN_SUCCESS){
>> > +        av_log(ctx, AV_LOG_ERROR, "failed to execute model\n");
>> > +        av_frame_free(&in);
>> > +        return AVERROR(EIO);
>> > +    }
>> > +    av_assert0(ctx->output.channels == 3);
>> > +
>> > +    out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
>> > +    if (!out) {
>> > +        av_log(ctx, AV_LOG_ERROR, "could not allocate memory for
>> output
>> > frame\n");
>>
>> This log message should be removed, as it is not useful at all.
>
> thanks, will remove
>
>>
>> > +        av_frame_free(&in);
>> > +        return AVERROR(ENOMEM);
>> > +    }
>> > +
>> > +    av_frame_copy_props(out, in);
>> > +    copy_from_dnn_to_frame(out, &ctx->output);
>> > +    av_frame_free(&in);
>> > +    return ff_filter_frame(outlink, out);
>> > +}
>> > +
>> > +static av_cold void uninit(AVFilterContext *ctx)
>> > +{
>> > +    DnnProcessingContext *context = ctx->priv;
>> > +
>> > +    if (context->dnn_module)
>> > +        (context->dnn_module->free_model)(&context->model);
>> > +
>> > +    av_freep(&context->dnn_module);
>> > +}
>> > +
>> > +static const AVFilterPad dnn_processing_inputs[] = {
>> > +    {
>> > +        .name         = "default",
>> > +        .type         = AVMEDIA_TYPE_VIDEO,
>> > +        .config_props = config_input,
>> > +        .filter_frame = filter_frame,
>> > +    },
>> > +    { NULL }
>> > +};
>> > +
>> > +static const AVFilterPad dnn_processing_outputs[] = {
>> > +    {
>> > +        .name = "default",
>> > +        .type = AVMEDIA_TYPE_VIDEO,
>> > +        .config_props  = config_output,
>> > +    },
>> > +    { NULL }
>> > +};
>> > +
>> > +AVFilter ff_vf_dnn_processing = {
>> > +    .name          = "dnn_processing",
>> > +    .description   = NULL_IF_CONFIG_SMALL("Apply DNN processing
>> filter to
>> > the input."),
>> > +    .priv_size     = sizeof(DnnProcessingContext),
>> > +    .init          = init,
>> > +    .uninit        = uninit,
>> > +    .query_formats = query_formats,
>> > +    .inputs        = dnn_processing_inputs,
>> > +    .outputs       = dnn_processing_outputs,
>> > +    .priv_class    = &dnn_processing_class,
>> > +    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC,
>>
>> If filter changes w/h, this can not be supported.
>
> got it, will remove this line, thank you.
>
>>
>> > +};
>> > --
>> > 2.7.4
>> >
>> > _______________________________________________
>> > ffmpeg-devel mailing list
>> > ffmpeg-devel@ffmpeg.org
>> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>> >
>> > To unsubscribe, visit link above, or email
>> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Paul B Mahol Oct. 28, 2019, 12:01 p.m. UTC | #5
On 10/28/19, Paul B Mahol <onemda@gmail.com> wrote:
> On 10/28/19, Guo, Yejun <yejun.guo@intel.com> wrote:
>>
>>
>>> -----Original Message-----
>>> From: Paul B Mahol [mailto:onemda@gmail.com]
>>> Sent: Monday, October 28, 2019 4:00 PM
>>> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
>>> Cc: Guo, Yejun <yejun.guo@intel.com>
>>> Subject: Re: [FFmpeg-devel] [PATCH V2 4/4] avfilter/vf_dnn_processing:
>>> add
>>> a
>>> generic filter for image proccessing with dnn networks
>>>
>>> On 10/21/19, Guo, Yejun <yejun.guo@intel.com> wrote:
>>> > This filter accepts all the dnn networks which do image processing.
>>> > Currently, frame with formats rgb24 and bgr24 are supported. Other
>>> > formats such as gray and YUV will be supported next. The dnn network
>>> > can accept data in float32 or uint8 format. And the dnn network can
>>> > change frame size.
>>> >
>>> > Let's take an example with the following python script. This script
>>> > halves the value of the first channel of the pixel.
>>> > import tensorflow as tf
>>> > import numpy as np
>>> > import scipy.misc
>>> > in_img = scipy.misc.imread('in.bmp')
>>> > in_img = in_img.astype(np.float32)/255.0
>>> > in_data = in_img[np.newaxis, :]
>>> > filter_data = np.array([0.5, 0, 0, 0, 1., 0, 0, 0,
>>> > 1.]).reshape(1,1,3,3).astype(np.float32)
>>> > filter = tf.Variable(filter_data)
>>> > x = tf.placeholder(tf.float32, shape=[1, None, None, 3],
>>> > name='dnn_in')
>>> > y = tf.nn.conv2d(x, filter, strides=[1, 1, 1, 1], padding='VALID',
>>> > name='dnn_out')
>>> > sess=tf.Session()
>>> > sess.run(tf.global_variables_initializer())
>>> > output = sess.run(y, feed_dict={x: in_data})
>>> > graph_def = tf.graph_util.convert_variables_to_constants(sess,
>>> > sess.graph_def, ['dnn_out'])
>>> > tf.train.write_graph(graph_def, '.', 'halve_first_channel.pb',
>>> > as_text=False)
>>> > output = output * 255.0
>>> > output = output.astype(np.uint8)
>>> > scipy.misc.imsave("out.bmp", np.squeeze(output))
>>>
>>> So this one executes python code without ever returning or using
>>> AVFrame*
>>> ?
>>> This is extremely limited usage.
>>
>> the purpose of this script is to demo how to setup and execute dnn models
>> with python+tensorflow.
>> The only relationship with ffmpeg is that the script prepares the model
>> file
>> halve_first_channel.pb.
>>
>> The next description shows how ffmpeg can execute the model in a filter.
>>
>> I'll try to update the commit log to avoid misleading words, thanks.
>>
>>>
>>> >
>>> > - generate halve_first_channel.pb with the above script
>>> > - generate halve_first_channel.model with tools/python/convert.py
>>> > - try with following commands
>>> >   ./ffmpeg -i input.jpg -vf
>>> >
>>> dnn_processing=model=halve_first_channel.model:input=dnn_in:output=dnn_
>>> out:fmt=rgb24:dnn_backend=native
>>> > -y out.native.png
>>> >   ./ffmpeg -i input.jpg -vf
>>> >
>>> dnn_processing=model=halve_first_channel.pb:input=dnn_in:output=dnn_out:f
>>> mt=rgb24:dnn_backend=tensorflow
>>> > -y out.tf.png
>>> >
>>> > Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
>>> > ---
>>> >  configure                       |   1 +
>>> >  doc/filters.texi                |  44 ++++++
>>> >  libavfilter/Makefile            |   1 +
>>> >  libavfilter/allfilters.c        |   1 +
>>> >  libavfilter/vf_dnn_processing.c | 333
>>> > ++++++++++++++++++++++++++++++++++++++++
>>> >  5 files changed, 380 insertions(+)
>>> >  create mode 100644 libavfilter/vf_dnn_processing.c
>>> >
>>> > diff --git a/configure b/configure
>>> > index 8413826..bf2bac9 100755
>>> > --- a/configure
>>> > +++ b/configure
>>> > @@ -3460,6 +3460,7 @@ derain_filter_select="dnn"
>>> >  deshake_filter_select="pixelutils"
>>> >  deshake_opencl_filter_deps="opencl"
>>> >  dilation_opencl_filter_deps="opencl"
>>> > +dnn_processing_filter_select="dnn"
>>> >  drawtext_filter_deps="libfreetype"
>>> >  drawtext_filter_suggest="libfontconfig libfribidi"
>>> >  elbg_filter_deps="avcodec"
>>> > diff --git a/doc/filters.texi b/doc/filters.texi
>>> > index bdc4136..c11a616 100644
>>> > --- a/doc/filters.texi
>>> > +++ b/doc/filters.texi
>>> > @@ -8928,6 +8928,50 @@ ffmpeg -i INPUT -f lavfi -i
>>> > nullsrc=hd720,geq='r=128+80*(sin(sqrt((X-W/2)*(X-W/2
>>> >  @end example
>>> >  @end itemize
>>> >
>>> > +@section dnn_processing
>>> > +
>>> > +Do image processing with deep neural networks. Currently only AVFrame
>>> with
>>> > RGB24
>>> > +and BGR24 are supported, more formats will be added later.
>>> > +
>>> > +The filter accepts the following options:
>>> > +
>>> > +@table @option
>>> > +@item dnn_backend
>>> > +Specify which DNN backend to use for model loading and execution.
>>> > This
>>> > option accepts
>>> > +the following values:
>>> > +
>>> > +@table @samp
>>> > +@item native
>>> > +Native implementation of DNN loading and execution.
>>> > +
>>> > +@item tensorflow
>>> > +TensorFlow backend. To enable this backend you
>>> > +need to install the TensorFlow for C library (see
>>> > +@url{https://www.tensorflow.org/install/install_c}) and configure
>>> > FFmpeg
>>> > with
>>> > +@code{--enable-libtensorflow}
>>> > +@end table
>>> > +
>>> > +Default value is @samp{native}.
>>> > +
>>> > +@item model
>>> > +Set path to model file specifying network architecture and its
>>> > parameters.
>>> > +Note that different backends use different file formats. TensorFlow
>>> > and
>>> > native
>>> > +backend can load files for only its format.
>>> > +
>>> > +Native model file (.model) can be generated from TensorFlow model
>>> > file
>>> > (.pb) by using tools/python/convert.py
>>> > +
>>> > +@item input
>>> > +Set the input name of the dnn network.
>>> > +
>>> > +@item output
>>> > +Set the output name of the dnn network.
>>> > +
>>> > +@item fmt
>>> > +Set the pixel format for the Frame. Allowed values are
>>> > @code{AV_PIX_FMT_RGB24}, and @code{AV_PIX_FMT_BGR24}.
>>> > +Default value is @code{AV_PIX_FMT_RGB24}.
>>> > +
>>> > +@end table
>>> > +
>>> >  @section drawbox
>>> >
>>> >  Draw a colored box on the input image.
>>> > diff --git a/libavfilter/Makefile b/libavfilter/Makefile
>>> > index 63d2fba..47a485a 100644
>>> > --- a/libavfilter/Makefile
>>> > +++ b/libavfilter/Makefile
>>> > @@ -224,6 +224,7 @@ OBJS-$(CONFIG_DILATION_OPENCL_FILTER)
>>> +=
>>> > vf_neighbor_opencl.o opencl.o \
>>> >
>>> opencl/neighbor.o
>>> >  OBJS-$(CONFIG_DISPLACE_FILTER)               += vf_displace.o
>>> framesync.o
>>> >  OBJS-$(CONFIG_DOUBLEWEAVE_FILTER)            += vf_weave.o
>>> > +OBJS-$(CONFIG_DNN_PROCESSING_FILTER)         +=
>>> vf_dnn_processing.o
>>> >  OBJS-$(CONFIG_DRAWBOX_FILTER)                += vf_drawbox.o
>>> >  OBJS-$(CONFIG_DRAWGRAPH_FILTER)              += f_drawgraph.o
>>> >  OBJS-$(CONFIG_DRAWGRID_FILTER)               += vf_drawbox.o
>>> > diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
>>> > index e4186f9..485409f 100644
>>> > --- a/libavfilter/allfilters.c
>>> > +++ b/libavfilter/allfilters.c
>>> > @@ -209,6 +209,7 @@ extern AVFilter ff_vf_detelecine;
>>> >  extern AVFilter ff_vf_dilation;
>>> >  extern AVFilter ff_vf_dilation_opencl;
>>> >  extern AVFilter ff_vf_displace;
>>> > +extern AVFilter ff_vf_dnn_processing;
>>> >  extern AVFilter ff_vf_doubleweave;
>>> >  extern AVFilter ff_vf_drawbox;
>>> >  extern AVFilter ff_vf_drawgraph;
>>> > diff --git a/libavfilter/vf_dnn_processing.c
>>> > b/libavfilter/vf_dnn_processing.c
>>> > new file mode 100644
>>> > index 0000000..de89af4
>>> > --- /dev/null
>>> > +++ b/libavfilter/vf_dnn_processing.c
>>> > @@ -0,0 +1,333 @@
>>> > +/*
>>> > + * Copyright (c) 2019 Guo Yejun
>>> > + *
>>> > + * This file is part of FFmpeg.
>>> > + *
>>> > + * FFmpeg is free software; you can redistribute it and/or
>>> > + * modify it under the terms of the GNU Lesser General Public
>>> > + * License as published by the Free Software Foundation; either
>>> > + * version 2.1 of the License, or (at your option) any later version.
>>> > + *
>>> > + * FFmpeg is distributed in the hope that it will be useful,
>>> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>> GNU
>>> > + * Lesser General Public License for more details.
>>> > + *
>>> > + * You should have received a copy of the GNU Lesser General Public
>>> > + * License along with FFmpeg; if not, write to the Free Software
>>> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>>> > 02110-1301
>>> > USA
>>> > + */
>>> > +
>>> > +/**
>>> > + * @file
>>> > + * implementing a generic image processing filter using deep learning
>>> > networks.
>>> > + */
>>> > +
>>> > +#include "libavformat/avio.h"
>>> > +#include "libavutil/opt.h"
>>> > +#include "libavutil/pixdesc.h"
>>> > +#include "libavutil/avassert.h"
>>> > +#include "avfilter.h"
>>> > +#include "dnn_interface.h"
>>> > +#include "formats.h"
>>> > +#include "internal.h"
>>> > +
>>> > +typedef struct DnnProcessingContext {
>>> > +    const AVClass *class;
>>> > +
>>> > +    char *model_filename;
>>> > +    DNNBackendType backend_type;
>>> > +    enum AVPixelFormat fmt;
>>>
>>> This should be int.
>>
>> could you please help to explain a bit more why 'enum AVPixelFormat'
>> should
>> be int.
>>
>> I searched 'AV_OPT_TYPE_PIXEL_FMT' in vf_* files and found 'enum
>> AVPixelFormat' is used in
>> vf_mergeplanes.c, vf_program_opencl.c and vf_tonemap_opencl.c.
>
> That is error, I gonna fix mergeplanes ASAP. Sometimes enum may use
> completely another type, Michael may know more.
> Besides options assumes ints and not enums.

Actually they take also enums, so it should be safe. So ignore this one. Sorry.

>
>>
>>>
>>> > +    char *model_inputname;
>>> > +    char *model_outputname;
>>> > +
>>> > +    DNNModule *dnn_module;
>>> > +    DNNModel *model;
>>> > +
>>> > +    // input & output of the model at execution time
>>> > +    DNNData input;
>>> > +    DNNData output;
>>> > +} DnnProcessingContext;
>>> > +
>>> > +#define OFFSET(x) offsetof(DnnProcessingContext, x)
>>> > +#define FLAGS AV_OPT_FLAG_FILTERING_PARAM |
>>> AV_OPT_FLAG_VIDEO_PARAM
>>> > +static const AVOption dnn_processing_options[] = {
>>> > +    { "dnn_backend", "DNN backend",
>>> OFFSET(backend_type),
>>> >  AV_OPT_TYPE_INT,       { .i64 = 0 },    0, 1, FLAGS, "backend" },
>>> > +    { "native",      "native backend flag",        0,
>>> >  AV_OPT_TYPE_CONST,     { .i64 = 0 },    0, 0, FLAGS, "backend" },
>>> > +#if (CONFIG_LIBTENSORFLOW == 1)
>>> > +    { "tensorflow",  "tensorflow backend flag",    0,
>>> >  AV_OPT_TYPE_CONST,     { .i64 = 1 },    0, 0, FLAGS, "backend" },
>>> > +#endif
>>> > +    { "model",       "path to model file",
>>> OFFSET(model_filename),
>>> >  AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
>>> > +    { "input",       "input name of the model",
>>> OFFSET(model_inputname),
>>> >  AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
>>> > +    { "output",      "output name of the model",
>>> > OFFSET(model_outputname), AV_OPT_TYPE_STRING,    { .str = NULL }, 0,
>>> > 0,
>>> > FLAGS },
>>> > +    { "fmt",         "AVPixelFormat of the frame", OFFSET(fmt),
>>> >  AV_OPT_TYPE_PIXEL_FMT, { .i64=AV_PIX_FMT_RGB24 },
>>> AV_PIX_FMT_NONE,
>>> > AV_PIX_FMT_NB - 1, FLAGS },
>>> > +    { NULL }
>>> > +};
>>> > +
>>> > +AVFILTER_DEFINE_CLASS(dnn_processing);
>>> > +
>>> > +static av_cold int init(AVFilterContext *context)
>>> > +{
>>> > +    DnnProcessingContext *ctx = context->priv;
>>> > +    int supported = 0;
>>> > +    // as the first step, only rgb24 and bgr24 are supported
>>> > +    const enum AVPixelFormat supported_pixel_fmts[] = {
>>> > +        AV_PIX_FMT_RGB24,
>>> > +        AV_PIX_FMT_BGR24,
>>> > +    };
>>> > +    for (int i = 0; i < sizeof(supported_pixel_fmts) / sizeof(enum
>>> > AVPixelFormat); ++i) {
>>> > +        if (supported_pixel_fmts[i] == ctx->fmt) {
>>> > +            supported = 1;
>>> > +            break;
>>> > +        }
>>> > +    }
>>> > +    if (!supported) {
>>> > +        av_log(context, AV_LOG_ERROR, "pixel fmt %s not supported
>>> yet\n",
>>> > +
>>> av_get_pix_fmt_name(ctx->fmt));
>>> > +        return AVERROR(AVERROR_INVALIDDATA);
>>> > +    }
>>> > +
>>> > +    if (!ctx->model_filename) {
>>> > +        av_log(ctx, AV_LOG_ERROR, "model file for network is not
>>> > specified\n");
>>> > +        return AVERROR(EINVAL);
>>> > +    }
>>> > +    if (!ctx->model_inputname) {
>>> > +        av_log(ctx, AV_LOG_ERROR, "intput name of the model network
>>> > is
>>> not
>>>
>>> Typo
>>
>> thanks, will fix.
>>
>>>
>>> > specified\n");
>>> > +        return AVERROR(EINVAL);
>>> > +    }
>>> > +    if (!ctx->model_outputname) {
>>> > +        av_log(ctx, AV_LOG_ERROR, "output name of the model network
>>> is not
>>> > specified\n");
>>> > +        return AVERROR(EINVAL);
>>> > +    }
>>> > +
>>> > +    ctx->dnn_module = ff_get_dnn_module(ctx->backend_type);
>>> > +    if (!ctx->dnn_module) {
>>> > +        av_log(ctx, AV_LOG_ERROR, "could not create DNN module for
>>> > requested backend\n");
>>> > +        return AVERROR(ENOMEM);
>>> > +    }
>>> > +    if (!ctx->dnn_module->load_model) {
>>> > +        av_log(ctx, AV_LOG_ERROR, "load_model for network is not
>>> > specified\n");
>>> > +        return AVERROR(EINVAL);
>>> > +    }
>>> > +
>>> > +    ctx->model = (ctx->dnn_module->load_model)(ctx->model_filename);
>>> > +    if (!ctx->model) {
>>> > +        av_log(ctx, AV_LOG_ERROR, "could not load DNN model\n");
>>> > +        return AVERROR(EINVAL);
>>> > +    }
>>> > +
>>> > +    return 0;
>>> > +}
>>> > +
>>> > +static int query_formats(AVFilterContext *context)
>>> > +{
>>> > +    AVFilterFormats *formats;
>>> > +    DnnProcessingContext *ctx = context->priv;
>>> > +    enum AVPixelFormat pixel_fmts[2];
>>> > +    pixel_fmts[0] = ctx->fmt;
>>> > +    pixel_fmts[1] = AV_PIX_FMT_NONE;
>>> > +
>>> > +    formats = ff_make_format_list(pixel_fmts);
>>> > +    return ff_set_common_formats(context, formats);
>>> > +}
>>> > +
>>> > +static int config_input(AVFilterLink *inlink)
>>> > +{
>>> > +    AVFilterContext *context     = inlink->dst;
>>> > +    DnnProcessingContext *ctx = context->priv;
>>> > +    DNNReturnType result;
>>> > +    DNNData dnn_data;
>>> > +
>>> > +    result = ctx->model->get_input(ctx->model->model, &dnn_data,
>>> > ctx->model_inputname);
>>> > +    if (result != DNN_SUCCESS) {
>>> > +        av_log(ctx, AV_LOG_ERROR, "could not get input from the
>>> model\n");
>>> > +        return AVERROR(EIO);
>>> > +    }
>>> > +
>>> > +    // the design is to add explicit scale filter before this filter
>>> > +    if (dnn_data.height != -1 && dnn_data.height != inlink->h) {
>>> > +        av_log(ctx, AV_LOG_ERROR, "the model requires frame height %d
>>> but
>>> > got %d\n",
>>> > +                                   dnn_data.height, inlink->h);
>>> > +        return AVERROR(EIO);
>>> > +    }
>>> > +    if (dnn_data.width != -1 && dnn_data.width != inlink->w) {
>>> > +        av_log(ctx, AV_LOG_ERROR, "the model requires frame width %d
>>> but
>>> > got %d\n",
>>> > +                                   dnn_data.width, inlink->w);
>>> > +        return AVERROR(EIO);
>>> > +    }
>>> > +
>>> > +    if (dnn_data.channels != 3) {
>>> > +        av_log(ctx, AV_LOG_ERROR, "the model requires input
>>> channels %d\n",
>>> > +                                   dnn_data.channels);
>>> > +        return AVERROR(EIO);
>>> > +    }
>>> > +    if (dnn_data.dt != DNN_FLOAT && dnn_data.dt != DNN_UINT8) {
>>> > +        av_log(ctx, AV_LOG_ERROR, "only support dnn models with input
>>> data
>>> > type as float32 and uint8.\n");
>>> > +        return AVERROR(EIO);
>>> > +    }
>>> > +
>>> > +    ctx->input.width    = inlink->w;
>>> > +    ctx->input.height   = inlink->h;
>>> > +    ctx->input.channels = dnn_data.channels;
>>> > +    ctx->input.dt = dnn_data.dt;
>>> > +
>>> > +    result = (ctx->model->set_input_output)(ctx->model->model,
>>> > +                                        &ctx->input,
>>> ctx->model_inputname,
>>> > +                                        (const char
>>> > **)&ctx->model_outputname, 1);
>>> > +    if (result != DNN_SUCCESS) {
>>> > +        av_log(ctx, AV_LOG_ERROR, "could not set input and output for
>>> the
>>> > model\n");
>>> > +        return AVERROR(EIO);
>>> > +    }
>>> > +
>>> > +    return 0;
>>> > +}
>>> > +
>>> > +static int config_output(AVFilterLink *outlink)
>>> > +{
>>> > +    AVFilterContext *context = outlink->src;
>>> > +    DnnProcessingContext *ctx = context->priv;
>>> > +    DNNReturnType result;
>>> > +
>>> > +    // have a try run in case that the dnn model resize the frame
>>> > +    result = (ctx->dnn_module->execute_model)(ctx->model,
>>> > &ctx->output,
>>> 1);
>>> > +    if (result != DNN_SUCCESS){
>>> > +        av_log(ctx, AV_LOG_ERROR, "failed to execute model\n");
>>> > +        return AVERROR(EIO);
>>> > +    }
>>> > +
>>> > +    outlink->w = ctx->output.width;
>>> > +    outlink->h = ctx->output.height;
>>> > +
>>> > +    return 0;
>>> > +}
>>> > +
>>> > +static int copy_from_frame_to_dnn(DNNData *dnn_data, const AVFrame
>>> *in)
>>> > +{
>>> > +    // extend this function to support more formats
>>> > +    av_assert0(in->format == AV_PIX_FMT_RGB24 || in->format ==
>>> > AV_PIX_FMT_RGB24);
>>> > +
>>> > +    if (dnn_data->dt == DNN_FLOAT) {
>>> > +        float *dnn_input = dnn_data->data;
>>> > +        for (int i = 0; i < in->height; i++) {
>>> > +            for(int j = 0; j < in->width * 3; j++) {
>>> > +                int k = i * in->linesize[0] + j;
>>> > +                int t = i * in->width * 3 + j;
>>> > +                dnn_input[t] = in->data[0][k] / 255.0f;
>>> > +            }
>>> > +        }
>>> > +    } else {
>>> > +        uint8_t *dnn_input = dnn_data->data;
>>> > +        av_assert0(dnn_data->dt == DNN_UINT8);
>>> > +        for (int i = 0; i < in->height; i++) {
>>> > +            for(int j = 0; j < in->width * 3; j++) {
>>> > +                int k = i * in->linesize[0] + j;
>>> > +                int t = i * in->width * 3 + j;
>>> > +                dnn_input[t] = in->data[0][k];
>>> > +            }
>>> > +        }
>>> > +    }
>>> > +
>>> > +    return 0;
>>> > +}
>>> > +
>>> > +static int copy_from_dnn_to_frame(AVFrame *out, const DNNData
>>> *dnn_data)
>>> > +{
>>> > +    // extend this function to support more formats
>>> > +    av_assert0(out->format == AV_PIX_FMT_RGB24 || out->format ==
>>> > AV_PIX_FMT_RGB24);
>>> > +
>>> > +    if (dnn_data->dt == DNN_FLOAT) {
>>> > +        float *dnn_output = dnn_data->data;
>>> > +        for (int i = 0; i < out->height; i++) {
>>> > +            for(int j = 0; j < out->width * 3; j++) {
>>> > +                int k = i * out->linesize[0] + j;
>>> > +                int t = i * out->width * 3 + j;
>>> > +                out->data[0][k] = av_clip((int)(dnn_output[t] *
>>> > 255.0f), 0,
>>> > 255);
>>> > +            }
>>> > +        }
>>> > +    } else {
>>> > +        uint8_t *dnn_output = dnn_data->data;
>>> > +        av_assert0(dnn_data->dt == DNN_UINT8);
>>> > +        for (int i = 0; i < out->height; i++) {
>>> > +            for(int j = 0; j < out->width * 3; j++) {
>>> > +                int k = i * out->linesize[0] + j;
>>> > +                int t = i * out->width * 3 + j;
>>> > +                out->data[0][k] = dnn_output[t];
>>> > +            }
>>> > +        }
>>> > +    }
>>> > +
>>> > +    return 0;
>>> > +}
>>> > +
>>> > +static int filter_frame(AVFilterLink *inlink, AVFrame *in)
>>> > +{
>>> > +    AVFilterContext *context  = inlink->dst;
>>> > +    AVFilterLink *outlink = context->outputs[0];
>>> > +    DnnProcessingContext *ctx = context->priv;
>>> > +    DNNReturnType dnn_result;
>>> > +    AVFrame *out;
>>> > +
>>> > +    copy_from_frame_to_dnn(&ctx->input, in);
>>> > +
>>> > +    dnn_result = (ctx->dnn_module->execute_model)(ctx->model,
>>> &ctx->output,
>>> > 1);
>>> > +    if (dnn_result != DNN_SUCCESS){
>>> > +        av_log(ctx, AV_LOG_ERROR, "failed to execute model\n");
>>> > +        av_frame_free(&in);
>>> > +        return AVERROR(EIO);
>>> > +    }
>>> > +    av_assert0(ctx->output.channels == 3);
>>> > +
>>> > +    out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
>>> > +    if (!out) {
>>> > +        av_log(ctx, AV_LOG_ERROR, "could not allocate memory for
>>> output
>>> > frame\n");
>>>
>>> This log message should be removed, as it is not useful at all.
>>
>> thanks, will remove
>>
>>>
>>> > +        av_frame_free(&in);
>>> > +        return AVERROR(ENOMEM);
>>> > +    }
>>> > +
>>> > +    av_frame_copy_props(out, in);
>>> > +    copy_from_dnn_to_frame(out, &ctx->output);
>>> > +    av_frame_free(&in);
>>> > +    return ff_filter_frame(outlink, out);
>>> > +}
>>> > +
>>> > +static av_cold void uninit(AVFilterContext *ctx)
>>> > +{
>>> > +    DnnProcessingContext *context = ctx->priv;
>>> > +
>>> > +    if (context->dnn_module)
>>> > +        (context->dnn_module->free_model)(&context->model);
>>> > +
>>> > +    av_freep(&context->dnn_module);
>>> > +}
>>> > +
>>> > +static const AVFilterPad dnn_processing_inputs[] = {
>>> > +    {
>>> > +        .name         = "default",
>>> > +        .type         = AVMEDIA_TYPE_VIDEO,
>>> > +        .config_props = config_input,
>>> > +        .filter_frame = filter_frame,
>>> > +    },
>>> > +    { NULL }
>>> > +};
>>> > +
>>> > +static const AVFilterPad dnn_processing_outputs[] = {
>>> > +    {
>>> > +        .name = "default",
>>> > +        .type = AVMEDIA_TYPE_VIDEO,
>>> > +        .config_props  = config_output,
>>> > +    },
>>> > +    { NULL }
>>> > +};
>>> > +
>>> > +AVFilter ff_vf_dnn_processing = {
>>> > +    .name          = "dnn_processing",
>>> > +    .description   = NULL_IF_CONFIG_SMALL("Apply DNN processing
>>> filter to
>>> > the input."),
>>> > +    .priv_size     = sizeof(DnnProcessingContext),
>>> > +    .init          = init,
>>> > +    .uninit        = uninit,
>>> > +    .query_formats = query_formats,
>>> > +    .inputs        = dnn_processing_inputs,
>>> > +    .outputs       = dnn_processing_outputs,
>>> > +    .priv_class    = &dnn_processing_class,
>>> > +    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC,
>>>
>>> If filter changes w/h, this can not be supported.
>>
>> got it, will remove this line, thank you.
>>
>>>
>>> > +};
>>> > --
>>> > 2.7.4
>>> >
>>> > _______________________________________________
>>> > ffmpeg-devel mailing list
>>> > ffmpeg-devel@ffmpeg.org
>>> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>> >
>>> > To unsubscribe, visit link above, or email
>>> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
>
Guo, Yejun Oct. 28, 2019, 12:56 p.m. UTC | #6
> -----Original Message-----

> From: Paul B Mahol [mailto:onemda@gmail.com]

> Sent: Monday, October 28, 2019 8:01 PM

> To: Guo, Yejun <yejun.guo@intel.com>

> Cc: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>

> Subject: Re: [FFmpeg-devel] [PATCH V2 4/4] avfilter/vf_dnn_processing: add a

> generic filter for image proccessing with dnn networks

> 

> On 10/28/19, Paul B Mahol <onemda@gmail.com> wrote:

> > On 10/28/19, Guo, Yejun <yejun.guo@intel.com> wrote:

> >>

> >>

> >>> -----Original Message-----

> >>> From: Paul B Mahol [mailto:onemda@gmail.com]

> >>> Sent: Monday, October 28, 2019 4:00 PM

> >>> To: FFmpeg development discussions and patches

> <ffmpeg-devel@ffmpeg.org>

> >>> Cc: Guo, Yejun <yejun.guo@intel.com>

> >>> Subject: Re: [FFmpeg-devel] [PATCH V2 4/4] avfilter/vf_dnn_processing:

> >>> add

> >>> a

> >>> generic filter for image proccessing with dnn networks

> >>>

> >>> On 10/21/19, Guo, Yejun <yejun.guo@intel.com> wrote:

> >>> > This filter accepts all the dnn networks which do image processing.

> >>> > Currently, frame with formats rgb24 and bgr24 are supported. Other

> >>> > formats such as gray and YUV will be supported next. The dnn network

> >>> > can accept data in float32 or uint8 format. And the dnn network can

...
> >>> > +

> >>> > +typedef struct DnnProcessingContext {

> >>> > +    const AVClass *class;

> >>> > +

> >>> > +    char *model_filename;

> >>> > +    DNNBackendType backend_type;

> >>> > +    enum AVPixelFormat fmt;

> >>>

> >>> This should be int.

> >>

> >> could you please help to explain a bit more why 'enum AVPixelFormat'

> >> should

> >> be int.

> >>

> >> I searched 'AV_OPT_TYPE_PIXEL_FMT' in vf_* files and found 'enum

> >> AVPixelFormat' is used in

> >> vf_mergeplanes.c, vf_program_opencl.c and vf_tonemap_opencl.c.

> >

> > That is error, I gonna fix mergeplanes ASAP. Sometimes enum may use

> > completely another type, Michael may know more.

> > Besides options assumes ints and not enums.

> 

> Actually they take also enums, so it should be safe. So ignore this one. Sorry.


got it, no problem, thanks.
diff mbox

Patch

diff --git a/configure b/configure
index 8413826..bf2bac9 100755
--- a/configure
+++ b/configure
@@ -3460,6 +3460,7 @@  derain_filter_select="dnn"
 deshake_filter_select="pixelutils"
 deshake_opencl_filter_deps="opencl"
 dilation_opencl_filter_deps="opencl"
+dnn_processing_filter_select="dnn"
 drawtext_filter_deps="libfreetype"
 drawtext_filter_suggest="libfontconfig libfribidi"
 elbg_filter_deps="avcodec"
diff --git a/doc/filters.texi b/doc/filters.texi
index bdc4136..c11a616 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -8928,6 +8928,50 @@  ffmpeg -i INPUT -f lavfi -i nullsrc=hd720,geq='r=128+80*(sin(sqrt((X-W/2)*(X-W/2
 @end example
 @end itemize
 
+@section dnn_processing
+
+Do image processing with deep neural networks. Currently only frames in the RGB24
+and BGR24 pixel formats are supported; more formats will be added later.
+
+The filter accepts the following options:
+
+@table @option
+@item dnn_backend
+Specify which DNN backend to use for model loading and execution. This option accepts
+the following values:
+
+@table @samp
+@item native
+Native implementation of DNN loading and execution.
+
+@item tensorflow
+TensorFlow backend. To enable this backend you
+need to install the TensorFlow for C library (see
+@url{https://www.tensorflow.org/install/install_c}) and configure FFmpeg with
+@code{--enable-libtensorflow}.
+@end table
+
+Default value is @samp{native}.
+
+@item model
+Set path to model file specifying network architecture and its parameters.
+Note that different backends use different file formats. The TensorFlow and native
+backends can each load only files in their own format.
+
+Native model file (.model) can be generated from TensorFlow model file (.pb) by using tools/python/convert.py.
+
+@item input
+Set the input name of the dnn network.
+
+@item output
+Set the output name of the dnn network.
+
+@item fmt
+Set the pixel format for the Frame. Allowed values are @code{AV_PIX_FMT_RGB24} and @code{AV_PIX_FMT_BGR24}.
+Default value is @code{AV_PIX_FMT_RGB24}.
+
+@end table
+
 @section drawbox
 
 Draw a colored box on the input image.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 63d2fba..47a485a 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -224,6 +224,7 @@  OBJS-$(CONFIG_DILATION_OPENCL_FILTER)        += vf_neighbor_opencl.o opencl.o \
                                                 opencl/neighbor.o
 OBJS-$(CONFIG_DISPLACE_FILTER)               += vf_displace.o framesync.o
 OBJS-$(CONFIG_DOUBLEWEAVE_FILTER)            += vf_weave.o
+OBJS-$(CONFIG_DNN_PROCESSING_FILTER)         += vf_dnn_processing.o
 OBJS-$(CONFIG_DRAWBOX_FILTER)                += vf_drawbox.o
 OBJS-$(CONFIG_DRAWGRAPH_FILTER)              += f_drawgraph.o
 OBJS-$(CONFIG_DRAWGRID_FILTER)               += vf_drawbox.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index e4186f9..485409f 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -209,6 +209,7 @@  extern AVFilter ff_vf_detelecine;
 extern AVFilter ff_vf_dilation;
 extern AVFilter ff_vf_dilation_opencl;
 extern AVFilter ff_vf_displace;
+extern AVFilter ff_vf_dnn_processing;
 extern AVFilter ff_vf_doubleweave;
 extern AVFilter ff_vf_drawbox;
 extern AVFilter ff_vf_drawgraph;
diff --git a/libavfilter/vf_dnn_processing.c b/libavfilter/vf_dnn_processing.c
new file mode 100644
index 0000000..de89af4
--- /dev/null
+++ b/libavfilter/vf_dnn_processing.c
@@ -0,0 +1,333 @@ 
+/*
+ * Copyright (c) 2019 Guo Yejun
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * implementing a generic image processing filter using deep learning networks.
+ */
+
+#include "libavformat/avio.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/avassert.h"
+#include "avfilter.h"
+#include "dnn_interface.h"
+#include "formats.h"
+#include "internal.h"
+
+/**
+ * Private context of the dnn_processing filter.
+ *
+ * The first group of fields is filled from the user options (see
+ * dnn_processing_options); the remaining fields are runtime state created
+ * in init() and the pad configuration callbacks.
+ */
+typedef struct DnnProcessingContext {
+    const AVClass *class;
+
+    // path given with the "model" option
+    char *model_filename;
+    // backend selected with the "dnn_backend" option
+    DNNBackendType backend_type;
+    // frame pixel format given with the "fmt" option
+    enum AVPixelFormat fmt;
+    // name of the model input, given with the "input" option
+    char *model_inputname;
+    // name of the model output, given with the "output" option
+    char *model_outputname;
+
+    // backend module returned by ff_get_dnn_module() in init()
+    DNNModule *dnn_module;
+    // model returned by the module's load_model() in init()
+    DNNModel *model;
+
+    // input & output of the model at execution time
+    DNNData input;
+    DNNData output;
+} DnnProcessingContext;
+
+/* Byte offset of an option's storage inside DnnProcessingContext. */
+#define OFFSET(x) offsetof(DnnProcessingContext, x)
+/* All options are video filtering parameters. */
+#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
+/*
+ * User-visible options of the filter.
+ *
+ * "dnn_backend" is an integer in [0, 1] with the named constants "native" (0)
+ * and, only when CONFIG_LIBTENSORFLOW is 1, "tensorflow" (1).
+ * "model", "input" and "output" default to NULL; init() rejects a NULL value
+ * for any of them.
+ * "fmt" defaults to AV_PIX_FMT_RGB24; the option range accepts any pixel
+ * format, and init() narrows it to the formats the filter can convert.
+ */
+static const AVOption dnn_processing_options[] = {
+    { "dnn_backend", "DNN backend",                OFFSET(backend_type),     AV_OPT_TYPE_INT,       { .i64 = 0 },    0, 1, FLAGS, "backend" },
+    { "native",      "native backend flag",        0,                        AV_OPT_TYPE_CONST,     { .i64 = 0 },    0, 0, FLAGS, "backend" },
+#if (CONFIG_LIBTENSORFLOW == 1)
+    { "tensorflow",  "tensorflow backend flag",    0,                        AV_OPT_TYPE_CONST,     { .i64 = 1 },    0, 0, FLAGS, "backend" },
+#endif
+    { "model",       "path to model file",         OFFSET(model_filename),   AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
+    { "input",       "input name of the model",    OFFSET(model_inputname),  AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
+    { "output",      "output name of the model",   OFFSET(model_outputname), AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
+    { "fmt",         "AVPixelFormat of the frame", OFFSET(fmt),              AV_OPT_TYPE_PIXEL_FMT, { .i64=AV_PIX_FMT_RGB24 }, AV_PIX_FMT_NONE, AV_PIX_FMT_NB - 1, FLAGS },
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(dnn_processing);
+
+/**
+ * Filter init callback: validate the options and load the DNN model.
+ *
+ * Checks that the requested pixel format is one this filter can convert
+ * (rgb24 or bgr24) and that the model file and the input/output names were
+ * given, then obtains the backend module and loads the model through it.
+ *
+ * @param context filter context whose priv data is a DnnProcessingContext
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static av_cold int init(AVFilterContext *context)
+{
+    DnnProcessingContext *ctx = context->priv;
+    int supported = 0;
+    // as the first step, only rgb24 and bgr24 are supported
+    const enum AVPixelFormat supported_pixel_fmts[] = {
+        AV_PIX_FMT_RGB24,
+        AV_PIX_FMT_BGR24,
+    };
+    for (int i = 0; i < sizeof(supported_pixel_fmts) / sizeof(enum AVPixelFormat); ++i) {
+        if (supported_pixel_fmts[i] == ctx->fmt) {
+            supported = 1;
+            break;
+        }
+    }
+    if (!supported) {
+        // the option range admits AV_PIX_FMT_NONE, which has no name:
+        // never hand a NULL pointer to the %s conversion
+        const char *fmt_name = av_get_pix_fmt_name(ctx->fmt);
+        av_log(context, AV_LOG_ERROR, "pixel fmt %s not supported yet\n",
+                                       fmt_name ? fmt_name : "none");
+        // AVERROR() must wrap a POSIX errno value; wrapping an AVERROR_*
+        // constant would negate an already negative code
+        return AVERROR(EINVAL);
+    }
+
+    if (!ctx->model_filename) {
+        av_log(ctx, AV_LOG_ERROR, "model file for network is not specified\n");
+        return AVERROR(EINVAL);
+    }
+    if (!ctx->model_inputname) {
+        av_log(ctx, AV_LOG_ERROR, "input name of the model network is not specified\n");
+        return AVERROR(EINVAL);
+    }
+    if (!ctx->model_outputname) {
+        av_log(ctx, AV_LOG_ERROR, "output name of the model network is not specified\n");
+        return AVERROR(EINVAL);
+    }
+
+    ctx->dnn_module = ff_get_dnn_module(ctx->backend_type);
+    if (!ctx->dnn_module) {
+        av_log(ctx, AV_LOG_ERROR, "could not create DNN module for requested backend\n");
+        return AVERROR(ENOMEM);
+    }
+    if (!ctx->dnn_module->load_model) {
+        av_log(ctx, AV_LOG_ERROR, "load_model for network is not specified\n");
+        return AVERROR(EINVAL);
+    }
+
+    ctx->model = (ctx->dnn_module->load_model)(ctx->model_filename);
+    if (!ctx->model) {
+        av_log(ctx, AV_LOG_ERROR, "could not load DNN model\n");
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
+/**
+ * Advertise the single pixel format selected with the "fmt" option on all
+ * links of the filter.
+ *
+ * @return the result of ff_set_common_formats()
+ */
+static int query_formats(AVFilterContext *context)
+{
+    const DnnProcessingContext *ctx = context->priv;
+    // AV_PIX_FMT_NONE terminates the list
+    const enum AVPixelFormat fmt_list[] = { ctx->fmt, AV_PIX_FMT_NONE };
+
+    return ff_set_common_formats(context, ff_make_format_list(fmt_list));
+}
+
+/**
+ * Input pad configuration: check that the link matches what the model
+ * expects and register the input/output with the model.
+ *
+ * A model dimension of -1 is treated as "any size"; otherwise the link size
+ * must match exactly. The model input must have 3 channels and a float32 or
+ * uint8 data type.
+ *
+ * @return 0 on success, AVERROR(EIO) on any mismatch or backend failure
+ */
+static int config_input(AVFilterLink *inlink)
+{
+    AVFilterContext *context  = inlink->dst;
+    DnnProcessingContext *ctx = context->priv;
+    DNNData model_input;
+
+    if (ctx->model->get_input(ctx->model->model, &model_input, ctx->model_inputname) != DNN_SUCCESS) {
+        av_log(ctx, AV_LOG_ERROR, "could not get input from the model\n");
+        return AVERROR(EIO);
+    }
+
+    // the design is to add explicit scale filter before this filter
+    if (model_input.height != -1 && inlink->h != model_input.height) {
+        av_log(ctx, AV_LOG_ERROR, "the model requires frame height %d but got %d\n",
+               model_input.height, inlink->h);
+        return AVERROR(EIO);
+    }
+    if (model_input.width != -1 && inlink->w != model_input.width) {
+        av_log(ctx, AV_LOG_ERROR, "the model requires frame width %d but got %d\n",
+               model_input.width, inlink->w);
+        return AVERROR(EIO);
+    }
+
+    if (model_input.channels != 3) {
+        av_log(ctx, AV_LOG_ERROR, "the model requires input channels %d\n",
+               model_input.channels);
+        return AVERROR(EIO);
+    }
+    if (!(model_input.dt == DNN_FLOAT || model_input.dt == DNN_UINT8)) {
+        av_log(ctx, AV_LOG_ERROR, "only support dnn models with input data type as float32 and uint8.\n");
+        return AVERROR(EIO);
+    }
+
+    // the execution-time input takes its size from the link, not the model
+    ctx->input.dt       = model_input.dt;
+    ctx->input.channels = model_input.channels;
+    ctx->input.height   = inlink->h;
+    ctx->input.width    = inlink->w;
+
+    if ((ctx->model->set_input_output)(ctx->model->model,
+                                       &ctx->input, ctx->model_inputname,
+                                       (const char **)&ctx->model_outputname, 1) != DNN_SUCCESS) {
+        av_log(ctx, AV_LOG_ERROR, "could not set input and output for the model\n");
+        return AVERROR(EIO);
+    }
+
+    return 0;
+}
+
+/**
+ * Output pad configuration: derive the output link size from the model.
+ *
+ * The model is executed once here so that the width and height it reports
+ * in the output data can be used as the output frame size.
+ *
+ * @return 0 on success, AVERROR(EIO) if the model could not be executed
+ */
+static int config_output(AVFilterLink *outlink)
+{
+    DnnProcessingContext *ctx = outlink->src->priv;
+
+    // have a try run in case that the dnn model resize the frame
+    if ((ctx->dnn_module->execute_model)(ctx->model, &ctx->output, 1) != DNN_SUCCESS) {
+        av_log(ctx, AV_LOG_ERROR, "failed to execute model\n");
+        return AVERROR(EIO);
+    }
+
+    outlink->h = ctx->output.height;
+    outlink->w = ctx->output.width;
+
+    return 0;
+}
+
+/**
+ * Copy the pixels of a packed 24-bit RGB/BGR frame into the model input.
+ *
+ * The frame rows are linesize[0] bytes apart while the model buffer is
+ * written tightly packed (width * 3 elements per row). For a float model the
+ * bytes are scaled to [0, 1]; for a uint8 model they are copied unchanged.
+ *
+ * @param dnn_data model input; its data buffer must hold at least
+ *                 width * height * 3 elements of type dnn_data->dt
+ * @param in       source frame, format rgb24 or bgr24
+ * @return 0
+ */
+static int copy_from_frame_to_dnn(DNNData *dnn_data, const AVFrame *in)
+{
+    // number of elements per row that actually carry pixels
+    const int row_elems = in->width * 3;
+
+    // extend this function to support more formats
+    av_assert0(in->format == AV_PIX_FMT_RGB24 || in->format == AV_PIX_FMT_BGR24);
+
+    if (dnn_data->dt == DNN_FLOAT) {
+        float *dnn_input = dnn_data->data;
+        for (int i = 0; i < in->height; i++) {
+            const uint8_t *src = in->data[0] + i * in->linesize[0];
+            float *dst = dnn_input + i * row_elems;
+            for (int j = 0; j < row_elems; j++)
+                dst[j] = src[j] / 255.0f;
+        }
+    } else {
+        uint8_t *dnn_input = dnn_data->data;
+        av_assert0(dnn_data->dt == DNN_UINT8);
+        for (int i = 0; i < in->height; i++) {
+            const uint8_t *src = in->data[0] + i * in->linesize[0];
+            uint8_t *dst = dnn_input + i * row_elems;
+            for (int j = 0; j < row_elems; j++)
+                dst[j] = src[j];
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Copy the model output into a packed 24-bit RGB/BGR frame.
+ *
+ * The model buffer is read tightly packed (width * 3 elements per row) and
+ * written to frame rows that are linesize[0] bytes apart. Float values are
+ * multiplied by 255, truncated to int and clipped to [0, 255]; uint8 values
+ * are copied unchanged.
+ *
+ * @param out      destination frame, format rgb24 or bgr24; its width and
+ *                 height define how much of the model output is read
+ * @param dnn_data model output of type DNN_FLOAT or DNN_UINT8
+ * @return 0
+ */
+static int copy_from_dnn_to_frame(AVFrame *out, const DNNData *dnn_data)
+{
+    // number of elements per row that actually carry pixels
+    const int row_elems = out->width * 3;
+
+    // extend this function to support more formats
+    av_assert0(out->format == AV_PIX_FMT_RGB24 || out->format == AV_PIX_FMT_BGR24);
+
+    if (dnn_data->dt == DNN_FLOAT) {
+        const float *dnn_output = dnn_data->data;
+        for (int i = 0; i < out->height; i++) {
+            const float *src = dnn_output + i * row_elems;
+            uint8_t *dst = out->data[0] + i * out->linesize[0];
+            for (int j = 0; j < row_elems; j++)
+                dst[j] = av_clip((int)(src[j] * 255.0f), 0, 255);
+        }
+    } else {
+        const uint8_t *dnn_output = dnn_data->data;
+        av_assert0(dnn_data->dt == DNN_UINT8);
+        for (int i = 0; i < out->height; i++) {
+            const uint8_t *src = dnn_output + i * row_elems;
+            uint8_t *dst = out->data[0] + i * out->linesize[0];
+            for (int j = 0; j < row_elems; j++)
+                dst[j] = src[j];
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Process one frame: feed it to the model, execute the model and send the
+ * result downstream in a newly allocated frame.
+ *
+ * The input frame is always freed here, on success and on failure.
+ *
+ * @return the result of ff_filter_frame() on success, AVERROR(EIO) if the
+ *         model fails to execute, AVERROR(ENOMEM) if no output frame could
+ *         be allocated
+ */
+static int filter_frame(AVFilterLink *inlink, AVFrame *in)
+{
+    AVFilterContext *context  = inlink->dst;
+    DnnProcessingContext *ctx = context->priv;
+    AVFilterLink *outlink     = context->outputs[0];
+    AVFrame *out;
+    int err;
+
+    copy_from_frame_to_dnn(&ctx->input, in);
+
+    if ((ctx->dnn_module->execute_model)(ctx->model, &ctx->output, 1) != DNN_SUCCESS) {
+        av_log(ctx, AV_LOG_ERROR, "failed to execute model\n");
+        err = AVERROR(EIO);
+        goto fail;
+    }
+    av_assert0(ctx->output.channels == 3);
+
+    out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+    if (!out) {
+        av_log(ctx, AV_LOG_ERROR, "could not allocate memory for output frame\n");
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    av_frame_copy_props(out, in);
+    copy_from_dnn_to_frame(out, &ctx->output);
+    av_frame_free(&in);
+    return ff_filter_frame(outlink, out);
+
+fail:
+    // common failure exit: only the input frame is owned at this point
+    av_frame_free(&in);
+    return err;
+}
+
+/**
+ * Filter uninit callback: release the model through the backend module (if
+ * a module was obtained) and then free the module itself.
+ */
+static av_cold void uninit(AVFilterContext *ctx)
+{
+    DnnProcessingContext *s = ctx->priv;
+    DNNModule *module = s->dnn_module;
+
+    // the model can only be freed through the module that loaded it
+    if (module)
+        (module->free_model)(&s->model);
+
+    av_freep(&s->dnn_module);
+}
+
+/*
+ * Single video input pad: config_input() validates the link against the
+ * model, filter_frame() processes each incoming frame.
+ */
+static const AVFilterPad dnn_processing_inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = config_input,
+        .filter_frame = filter_frame,
+    },
+    { NULL }
+};
+
+/*
+ * Single video output pad: config_output() sets the output size from the
+ * dimensions reported by the model.
+ */
+static const AVFilterPad dnn_processing_outputs[] = {
+    {
+        .name = "default",
+        .type = AVMEDIA_TYPE_VIDEO,
+        .config_props  = config_output,
+    },
+    { NULL }
+};
+
+/*
+ * Filter definition: ties the private context, the options class, the
+ * callbacks and the pads together under the name "dnn_processing".
+ *
+ * NOTE(review): the generic timeline flag presumably lets disabled frames
+ * pass through untouched; since config_output() may set an output size that
+ * differs from the input, confirm that this flag is safe for resizing models.
+ */
+AVFilter ff_vf_dnn_processing = {
+    .name          = "dnn_processing",
+    .description   = NULL_IF_CONFIG_SMALL("Apply DNN processing filter to the input."),
+    .priv_size     = sizeof(DnnProcessingContext),
+    .init          = init,
+    .uninit        = uninit,
+    .query_formats = query_formats,
+    .inputs        = dnn_processing_inputs,
+    .outputs       = dnn_processing_outputs,
+    .priv_class    = &dnn_processing_class,
+    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC,
+};