
[FFmpeg-devel] libavfilter: Add more operation supports in FFmpeg dnn native mode.

Message ID 3d9d5b50.458db.16a63450325.Coremail.xwmeng@pku.edu.cn
State New

Commit Message

xwmeng@pku.edu.cn April 28, 2019, 9:27 a.m. UTC
This patch is for the support of the derain filter project in GSoC. It adds support for the following operations: 




 (1) Conv padding method: "SAME" and "VALID"

 (2) Dilation

 (3) Activation: "NONE" and "LEAKY_RELU"




These operations are all needed by the derain filter. If the dnn native mode in FFmpeg is modified in this way, the generation process of the Super Resolution model should be changed accordingly, e.g. by adding a padding method parameter (= 0) and a dilation parameter (= 1).




In addition, I have a question about the Super Resolution implementation. The model training process of SR uses the "VALID" method. According to my understanding of "VALID" mode in TensorFlow, the size of the output image should be smaller than in the current SR design, because pixels near the boundary are not processed in "VALID" mode; in the current dnn native mode, however, these unprocessed pixels are filled with adjacent pixels. I wonder why it is done this way here.
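
As a rough illustration of that size question, the following standalone sketch computes the stride-1 output size for "SAME" and "VALID" padding with dilation, using the usual TensorFlow-style definitions; it is illustrative C only, not code from the patch:

#include <stdio.h>

/* Stride-1 output size (illustrative, not patch code):
 *   SAME : output = input                                   (border is padded)
 *   VALID: output = input - (kernel_size - 1) * dilation    (border is dropped)
 */
static int conv_output_size(int input, int kernel_size, int dilation, int same_padding)
{
    int effective_kernel = (kernel_size - 1) * dilation + 1;
    return same_padding ? input : input - (effective_kernel - 1);
}

int main(void)
{
    /* 3x3 kernel, dilation 1: VALID loses 2 pixels per axis. */
    printf("SAME : 100 -> %d\n", conv_output_size(100, 3, 1, 1));   /* 100 */
    printf("VALID: 100 -> %d\n", conv_output_size(100, 3, 1, 0));   /* 98  */
    return 0;
}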




From 4d92ef21a5acf064122c51f442d0e2f5437b3343 Mon Sep 17 00:00:00 2001
From: Xuewei Meng <xwmeng@pku.edu.cn>
Date: Sun, 28 Apr 2019 17:21:35 +0800
Subject: [PATCH] Add operation supports in dnn_native

Signed-off-by: Xuewei Meng <xwmeng@pku.edu.cn>
---
 libavfilter/dnn_backend_native.c | 36 +++++++++++++++++++++-----------
 libavfilter/dnn_backend_native.h |  6 +++++-
 2 files changed, 29 insertions(+), 13 deletions(-)

Comments

Jun Zhao April 28, 2019, 10:28 a.m. UTC | #1
On Sun, Apr 28, 2019 at 5:27 PM <xwmeng@pku.edu.cn> wrote:
>
> This patch is for the support of derain filter project in GSoC. It adds supports for the following operations:
>
>
>
>
>  (1) Conv padding method: "SAME" and "VALID"
>
>  (2) Dilation
>
>  (3) Activation: "NONE" and "LEAKY_RELU"
>
>
>
>
> These operations are all needed in derain filter. And if modify the dnn native mode in FFmpeg, the generation process of Super Resolution model should be changed accordingly, e.g. add padding method parameter (= 0) and dilation parameter (= 1).
>
>
>
>
> In addition, I have a question about the Super Resulotion implementation. The model training process of SR uses "VALID" method. According to my understanding of "VALID" mode in tensorflow, the size of output image should be smaller than the current design in SR. Because pixels near the boundary are not processed in "VALID" mode, however, these unprocessed pixels are filled with adjacent pixels in current dnn native mode. I wonder why to do like this here.
>
>
>
>
> From 4d92ef21a5acf064122c51f442d0e2f5437b3343 Mon Sep 17 00:00:00 2001
> From: Xuewei Meng <xwmeng@pku.edu.cn>
> Date: Sun, 28 Apr 2019 17:21:35 +0800
> Subject: [PATCH] Add operation supports in dnn_native
>
> Signed-off-by: Xuewei Meng <xwmeng@pku.edu.cn>
> ---
>  libavfilter/dnn_backend_native.c | 36 +++++++++++++++++++++-----------
>  libavfilter/dnn_backend_native.h |  6 +++++-
>  2 files changed, 29 insertions(+), 13 deletions(-)
>
> diff --git a/libavfilter/dnn_backend_native.c b/libavfilter/dnn_backend_native.c
> index 70d857f5f2..0e3ef5d64d 100644
> --- a/libavfilter/dnn_backend_native.c
> +++ b/libavfilter/dnn_backend_native.c
> @@ -157,13 +157,15 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename)
>                  ff_dnn_free_model_native(&model);
>                  return NULL;
>              }
> +            conv_params->dilation = (int32_t)avio_rl32(model_file_context);
> +            conv_params->padding_method = (int32_t)avio_rl32(model_file_context);
>              conv_params->activation = (int32_t)avio_rl32(model_file_context);
>              conv_params->input_num = (int32_t)avio_rl32(model_file_context);
>              conv_params->output_num = (int32_t)avio_rl32(model_file_context);
>              conv_params->kernel_size = (int32_t)avio_rl32(model_file_context);
>              kernel_size = conv_params->input_num * conv_params->output_num *
>                            conv_params->kernel_size * conv_params->kernel_size;
> -            dnn_size += 16 + (kernel_size + conv_params->output_num << 2);
> +            dnn_size += 24 + (kernel_size + conv_params->output_num << 2);
Add some comments for the numbers 16 and 24?
>              if (dnn_size > file_size || conv_params->input_num <= 0 ||
>                  conv_params->output_num <= 0 || conv_params->kernel_size <= 0){
>                  avio_closep(&model_file_context);
> @@ -221,23 +223,28 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename)
>
>  static void convolve(const float *input, float *output, const ConvolutionalParams *conv_params, int width, int height)
>  {
> -    int y, x, n_filter, ch, kernel_y, kernel_x;
Why?
>      int radius = conv_params->kernel_size >> 1;
>      int src_linesize = width * conv_params->input_num;
>      int filter_linesize = conv_params->kernel_size * conv_params->input_num;
>      int filter_size = conv_params->kernel_size * filter_linesize;
> +    int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
>
> -    for (y = 0; y < height; ++y){
> -        for (x = 0; x < width; ++x){
> -            for (n_filter = 0; n_filter < conv_params->output_num; ++n_filter){
> +    for (int y = pad_size; y < height - pad_size; ++y){
> +        for (int x = pad_size; x < width - pad_size; ++x){
> +            for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter){
>                  output[n_filter] = conv_params->biases[n_filter];
> -                for (ch = 0; ch < conv_params->input_num; ++ch){
> -                    for (kernel_y = 0; kernel_y < conv_params->kernel_size; ++kernel_y){
> -                        for (kernel_x = 0; kernel_x < conv_params->kernel_size; ++kernel_x){
> -                            output[n_filter] += input[CLAMP_TO_EDGE(y + kernel_y - radius, height) * src_linesize +
> -                                                      CLAMP_TO_EDGE(x + kernel_x - radius, width) * conv_params->input_num + ch] *
> -                                                conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
> -                                                                    kernel_x * conv_params->input_num + ch];
> +
> +                for (int ch = 0; ch < conv_params->input_num; ++ch){
> +                    for (int kernel_y = 0; kernel_y < conv_params->kernel_size; ++kernel_y){
> +                        for (int kernel_x = 0; kernel_x < conv_params->kernel_size; ++kernel_x){
> +                            int y_pos = y + (kernel_y - radius) * conv_params->dilation;
> +                            int x_pos = x + (kernel_x - radius) * conv_params->dilation;
> +
> +                            float input_pel = (x_pos < 0 || x_pos >= width || y_pos < 0 || y_pos >= height) ? 0.0 :
> +                                               input[y_pos * src_linesize + x_pos * conv_params->input_num + ch];
> +
> +                            output[n_filter] += input_pel * conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
> +                                                                                kernel_x * conv_params->input_num + ch];
>                          }
>                      }
>                  }
> @@ -250,6 +257,11 @@ static void convolve(const float *input, float *output, const ConvolutionalParam
>                      break;
>                  case SIGMOID:
>                      output[n_filter] = 1.0f / (1.0f + exp(-output[n_filter]));
> +                    break;
> +                case NONE:
> +                    break;
> +                case LEAKY_RELU:
> +                    output[n_filter] = FFMAX(output[n_filter], 0.0) + 0.2 * FFMIN(output[n_filter], 0.0);
>                  }
>              }
>              output += conv_params->output_num;
> diff --git a/libavfilter/dnn_backend_native.h b/libavfilter/dnn_backend_native.h
> index 51d4cac955..f7d4eb823b 100644
> --- a/libavfilter/dnn_backend_native.h
> +++ b/libavfilter/dnn_backend_native.h
> @@ -32,7 +32,9 @@
>
>  typedef enum {INPUT, CONV, DEPTH_TO_SPACE} DNNLayerType;
>
> -typedef enum {RELU, TANH, SIGMOID} DNNActivationFunc;
> +typedef enum {RELU, TANH, SIGMOID, NONE, LEAKY_RELU} DNNActivationFunc;
> +
> +typedef enum {VALID, SAME} DNNPaddingFunc;
>
>  typedef struct Layer{
>      DNNLayerType type;
> @@ -43,6 +45,8 @@ typedef struct Layer{
>  typedef struct ConvolutionalParams{
>      int32_t input_num, output_num, kernel_size;
>      DNNActivationFunc activation;
> +    DNNPaddingFunc padding_method;
> +    int32_t dilation;
>      float *kernel;
>      float *biases;
>  } ConvolutionalParams;
> --
> 2.17.1
xwmeng@pku.edu.cn April 28, 2019, 11 a.m. UTC | #2
> -----Original Message-----
> From: "mypopy@gmail.com" <mypopy@gmail.com>
> Sent: 2019-04-28 18:28:21 (Sunday)
> To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
> Cc: 
> Subject: Re: [FFmpeg-devel] [PATCH] libavfilter: Add more operation supports in FFmpeg dnn native mode.
> 
> On Sun, Apr 28, 2019 at 5:27 PM <xwmeng@pku.edu.cn> wrote:
> >
> > This patch is for the support of derain filter project in GSoC. It adds supports for the following operations:
> >
> >
> >
> >
> >  (1) Conv padding method: "SAME" and "VALID"
> >
> >  (2) Dilation
> >
> >  (3) Activation: "NONE" and "LEAKY_RELU"
> >
> >
> >
> >
> > These operations are all needed in derain filter. And if modify the dnn native mode in FFmpeg, the generation process of Super Resolution model should be changed accordingly, e.g. add padding method parameter (= 0) and dilation parameter (= 1).
> >
> >
> >
> >
> > In addition, I have a question about the Super Resulotion implementation. The model training process of SR uses "VALID" method. According to my understanding of "VALID" mode in tensorflow, the size of output image should be smaller than the current design in SR. Because pixels near the boundary are not processed in "VALID" mode, however, these unprocessed pixels are filled with adjacent pixels in current dnn native mode. I wonder why to do like this here.
> >
> >
> >
> >
> > From 4d92ef21a5acf064122c51f442d0e2f5437b3343 Mon Sep 17 00:00:00 2001
> > From: Xuewei Meng <xwmeng@pku.edu.cn>
> > Date: Sun, 28 Apr 2019 17:21:35 +0800
> > Subject: [PATCH] Add operation supports in dnn_native
> >
> > Signed-off-by: Xuewei Meng <xwmeng@pku.edu.cn>
> > ---
> >  libavfilter/dnn_backend_native.c | 36 +++++++++++++++++++++-----------
> >  libavfilter/dnn_backend_native.h |  6 +++++-
> >  2 files changed, 29 insertions(+), 13 deletions(-)
> >
> > diff --git a/libavfilter/dnn_backend_native.c b/libavfilter/dnn_backend_native.c
> > index 70d857f5f2..0e3ef5d64d 100644
> > --- a/libavfilter/dnn_backend_native.c
> > +++ b/libavfilter/dnn_backend_native.c
> > @@ -157,13 +157,15 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename)
> >                  ff_dnn_free_model_native(&model);
> >                  return NULL;
> >              }
> > +            conv_params->dilation = (int32_t)avio_rl32(model_file_context);
> > +            conv_params->padding_method = (int32_t)avio_rl32(model_file_context);
> >              conv_params->activation = (int32_t)avio_rl32(model_file_context);
> >              conv_params->input_num = (int32_t)avio_rl32(model_file_context);
> >              conv_params->output_num = (int32_t)avio_rl32(model_file_context);
> >              conv_params->kernel_size = (int32_t)avio_rl32(model_file_context);
> >              kernel_size = conv_params->input_num * conv_params->output_num *
> >                            conv_params->kernel_size * conv_params->kernel_size;
> > -            dnn_size += 16 + (kernel_size + conv_params->output_num << 2);
> > +            dnn_size += 24 + (kernel_size + conv_params->output_num << 2);
> Add some comments for the number 16 or 24 ?

dnn_size is the size in bytes of the dnn model parameters. Because of the addition of the parameters "conv_params->dilation" and "conv_params->padding_method", dnn_size should be increased by 8.
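
Spelled out (a sketch of the accounting only; the field list comes from the loader code quoted above):

#include <stdio.h>

int main(void)
{
    /* Per-CONV-layer header fields, each read with avio_rl32(), i.e. 4 bytes:
     * before the patch: activation, input_num, output_num, kernel_size  -> 16 bytes
     * after the patch : + dilation, padding_method                      -> 24 bytes
     * The weights and biases are counted separately as
     * (kernel_size + output_num) floats, 4 bytes each, i.e. the "<< 2" term. */
    printf("old per-layer header: %d bytes\n", 4 * 4);
    printf("new per-layer header: %d bytes\n", 6 * 4);
    return 0;
}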

> >              if (dnn_size > file_size || conv_params->input_num <= 0 ||
> >                  conv_params->output_num <= 0 || conv_params->kernel_size <= 0){
> >                  avio_closep(&model_file_context);
> > @@ -221,23 +223,28 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename)
> >
> >  static void convolve(const float *input, float *output, const ConvolutionalParams *conv_params, int width, int height)
> >  {
> > -    int y, x, n_filter, ch, kernel_y, kernel_x;
> Why?
Because I think it is better to define the variables at the point where they are used, in the for loops. 

> >      int radius = conv_params->kernel_size >> 1;
> >      int src_linesize = width * conv_params->input_num;
> >      int filter_linesize = conv_params->kernel_size * conv_params->input_num;
> >      int filter_size = conv_params->kernel_size * filter_linesize;
> > +    int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
> >
> > -    for (y = 0; y < height; ++y){
> > -        for (x = 0; x < width; ++x){
> > -            for (n_filter = 0; n_filter < conv_params->output_num; ++n_filter){
> > +    for (int y = pad_size; y < height - pad_size; ++y){
> > +        for (int x = pad_size; x < width - pad_size; ++x){
> > +            for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter){
> >                  output[n_filter] = conv_params->biases[n_filter];
> > -                for (ch = 0; ch < conv_params->input_num; ++ch){
> > -                    for (kernel_y = 0; kernel_y < conv_params->kernel_size; ++kernel_y){
> > -                        for (kernel_x = 0; kernel_x < conv_params->kernel_size; ++kernel_x){
> > -                            output[n_filter] += input[CLAMP_TO_EDGE(y + kernel_y - radius, height) * src_linesize +
> > -                                                      CLAMP_TO_EDGE(x + kernel_x - radius, width) * conv_params->input_num + ch] *
> > -                                                conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
> > -                                                                    kernel_x * conv_params->input_num + ch];
> > +
> > +                for (int ch = 0; ch < conv_params->input_num; ++ch){
> > +                    for (int kernel_y = 0; kernel_y < conv_params->kernel_size; ++kernel_y){
> > +                        for (int kernel_x = 0; kernel_x < conv_params->kernel_size; ++kernel_x){
> > +                            int y_pos = y + (kernel_y - radius) * conv_params->dilation;
> > +                            int x_pos = x + (kernel_x - radius) * conv_params->dilation;
> > +
> > +                            float input_pel = (x_pos < 0 || x_pos >= width || y_pos < 0 || y_pos >= height) ? 0.0 :
> > +                                               input[y_pos * src_linesize + x_pos * conv_params->input_num + ch];
> > +
> > +                            output[n_filter] += input_pel * conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
> > +                                                                                kernel_x * conv_params->input_num + ch];
> >                          }
> >                      }
> >                  }
> > @@ -250,6 +257,11 @@ static void convolve(const float *input, float *output, const ConvolutionalParam
> >                      break;
> >                  case SIGMOID:
> >                      output[n_filter] = 1.0f / (1.0f + exp(-output[n_filter]));
> > +                    break;
> > +                case NONE:
> > +                    break;
> > +                case LEAKY_RELU:
> > +                    output[n_filter] = FFMAX(output[n_filter], 0.0) + 0.2 * FFMIN(output[n_filter], 0.0);
> >                  }
> >              }
> >              output += conv_params->output_num;
> > diff --git a/libavfilter/dnn_backend_native.h b/libavfilter/dnn_backend_native.h
> > index 51d4cac955..f7d4eb823b 100644
> > --- a/libavfilter/dnn_backend_native.h
> > +++ b/libavfilter/dnn_backend_native.h
> > @@ -32,7 +32,9 @@
> >
> >  typedef enum {INPUT, CONV, DEPTH_TO_SPACE} DNNLayerType;
> >
> > -typedef enum {RELU, TANH, SIGMOID} DNNActivationFunc;
> > +typedef enum {RELU, TANH, SIGMOID, NONE, LEAKY_RELU} DNNActivationFunc;
> > +
> > +typedef enum {VALID, SAME} DNNPaddingFunc;
> >
> >  typedef struct Layer{
> >      DNNLayerType type;
> > @@ -43,6 +45,8 @@ typedef struct Layer{
> >  typedef struct ConvolutionalParams{
> >      int32_t input_num, output_num, kernel_size;
> >      DNNActivationFunc activation;
> > +    DNNPaddingFunc padding_method;
> > +    int32_t dilation;
> >      float *kernel;
> >      float *biases;
> >  } ConvolutionalParams;
> > --
> > 2.17.1
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Guo, Yejun April 29, 2019, 2:03 a.m. UTC | #3
> -----Original Message-----
> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
> xwmeng@pku.edu.cn
> Sent: Sunday, April 28, 2019 5:27 PM
> To: ffmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: [FFmpeg-devel] [PATCH] libavfilter: Add more operation supports in
> FFmpeg dnn native mode.
> 
> This patch is for the support of derain filter project in GSoC. It adds supports for
> the following operations:
> 
>  (1) Conv padding method: "SAME" and "VALID"
> 
>  (2) Dilation
> 
>  (3) Activation: "NONE" and "LEAKY_RELU"

how about separate this single patch into 3 patches.

> 
> These operations are all needed in derain filter. And if modify the dnn native
> mode in FFmpeg, the generation process of Super Resolution model should be
> changed accordingly, e.g. add padding method parameter (= 0) and dilation
> parameter (= 1).

you can create a PR at https://github.com/HighVoltageRocknRoll/sr 

> 
> In addition, I have a question about the Super Resulotion implementation. The
> model training process of SR uses "VALID" method. According to my
> understanding of "VALID" mode in tensorflow, the size of output image should
> be smaller than the current design in SR. Because pixels near the boundary are
> not processed in "VALID" mode, however, these unprocessed pixels are filled
> with adjacent pixels in current dnn native mode. I wonder why to do like this
> here.

I have the same concern that why the native model is not exactly the same as tf model,
the pad layer is missed, and the native model also change the behavior of pad parameter of conv layer.

it is only suitable for vf_sr, and not general for other models.

> 
> From 4d92ef21a5acf064122c51f442d0e2f5437b3343 Mon Sep 17 00:00:00 2001
> From: Xuewei Meng <xwmeng@pku.edu.cn>
> Date: Sun, 28 Apr 2019 17:21:35 +0800
> Subject: [PATCH] Add operation supports in dnn_native
> 
> Signed-off-by: Xuewei Meng <xwmeng@pku.edu.cn>
> ---
>  libavfilter/dnn_backend_native.c | 36 +++++++++++++++++++++-----------
>  libavfilter/dnn_backend_native.h |  6 +++++-
>  2 files changed, 29 insertions(+), 13 deletions(-)
> 
> diff --git a/libavfilter/dnn_backend_native.c b/libavfilter/dnn_backend_native.c
> index 70d857f5f2..0e3ef5d64d 100644
> --- a/libavfilter/dnn_backend_native.c
> +++ b/libavfilter/dnn_backend_native.c
> @@ -157,13 +157,15 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename)
>                  ff_dnn_free_model_native(&model);
>                  return NULL;
>              }
> +            conv_params->dilation = (int32_t)avio_rl32(model_file_context);
> +            conv_params->padding_method = (int32_t)avio_rl32(model_file_context);
>              conv_params->activation = (int32_t)avio_rl32(model_file_context);
>              conv_params->input_num = (int32_t)avio_rl32(model_file_context);
>              conv_params->output_num = (int32_t)avio_rl32(model_file_context);
>              conv_params->kernel_size = (int32_t)avio_rl32(model_file_context);
>              kernel_size = conv_params->input_num * conv_params->output_num *
>                            conv_params->kernel_size * conv_params->kernel_size;
> -            dnn_size += 16 + (kernel_size + conv_params->output_num << 2);
> +            dnn_size += 24 + (kernel_size + conv_params->output_num << 2);
>              if (dnn_size > file_size || conv_params->input_num <= 0 ||
>                  conv_params->output_num <= 0 || conv_params->kernel_size <= 0){
>                  avio_closep(&model_file_context);
> @@ -221,23 +223,28 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename)
> 
>  static void convolve(const float *input, float *output, const ConvolutionalParams *conv_params, int width, int height)
>  {
> -    int y, x, n_filter, ch, kernel_y, kernel_x;
>      int radius = conv_params->kernel_size >> 1;
>      int src_linesize = width * conv_params->input_num;
>      int filter_linesize = conv_params->kernel_size * conv_params->input_num;
>      int filter_size = conv_params->kernel_size * filter_linesize;
> +    int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;

for parameter 'valid', the size of feature map is changed, it should be reflected at function set_input_output_native,
for example, the size of network->layers[layer].output should be changed, and we might add the size info into struct Layer.

> 
> -    for (y = 0; y < height; ++y){
> -        for (x = 0; x < width; ++x){
> -            for (n_filter = 0; n_filter < conv_params->output_num; ++n_filter){
> +    for (int y = pad_size; y < height - pad_size; ++y){
> +        for (int x = pad_size; x < width - pad_size; ++x){
> +            for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter){
>                  output[n_filter] = conv_params->biases[n_filter];
> -                for (ch = 0; ch < conv_params->input_num; ++ch){
> -                    for (kernel_y = 0; kernel_y < conv_params->kernel_size; ++kernel_y){
> -                        for (kernel_x = 0; kernel_x < conv_params->kernel_size; ++kernel_x){
> -                            output[n_filter] += input[CLAMP_TO_EDGE(y + kernel_y - radius, height) * src_linesize +
> -                                                      CLAMP_TO_EDGE(x + kernel_x - radius, width) * conv_params->input_num + ch]

to compatible with vf_sr.c, as a step by step method, we can keep clamp_to_edge at the first step.

it means that we can support 3 parameters for conv pad, same, valid, and this extra same_clamp_to_edge,
we can remove same_clamp_to_edge after all the things are settled.

> *
> -                                                conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
> -                                                                    kernel_x * conv_params->input_num + ch];
> +
> +                for (int ch = 0; ch < conv_params->input_num; ++ch){
> +                    for (int kernel_y = 0; kernel_y < conv_params->kernel_size; ++kernel_y){
> +                        for (int kernel_x = 0; kernel_x < conv_params->kernel_size; ++kernel_x){
> +                            int y_pos = y + (kernel_y - radius) * conv_params->dilation;
> +                            int x_pos = x + (kernel_x - radius) * conv_params->dilation;
> +
> +                            float input_pel = (x_pos < 0 || x_pos >= width || y_pos < 0 || y_pos >= height) ? 0.0 :
> +                                               input[y_pos * src_linesize + x_pos * conv_params->input_num + ch];
> +
> +                            output[n_filter] += input_pel * conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
> +                                                                                kernel_x * conv_params->input_num + ch];
>                          }
>                      }
>                  }
> @@ -250,6 +257,11 @@ static void convolve(const float *input, float *output, const ConvolutionalParam
>                      break;
>                  case SIGMOID:
>                      output[n_filter] = 1.0f / (1.0f + exp(-output[n_filter]));
> +                    break;
> +                case NONE:
> +                    break;
> +                case LEAKY_RELU:
> +                    output[n_filter] = FFMAX(output[n_filter], 0.0) + 0.2 * FFMIN(output[n_filter], 0.0);
>                  }
>              }
>              output += conv_params->output_num;
> diff --git a/libavfilter/dnn_backend_native.h b/libavfilter/dnn_backend_native.h
> index 51d4cac955..f7d4eb823b 100644
> --- a/libavfilter/dnn_backend_native.h
> +++ b/libavfilter/dnn_backend_native.h
> @@ -32,7 +32,9 @@
> 
>  typedef enum {INPUT, CONV, DEPTH_TO_SPACE} DNNLayerType;
> 
> -typedef enum {RELU, TANH, SIGMOID} DNNActivationFunc;
> +typedef enum {RELU, TANH, SIGMOID, NONE, LEAKY_RELU} DNNActivationFunc;
> +
> +typedef enum {VALID, SAME} DNNPaddingFunc;
> 
>  typedef struct Layer{
>      DNNLayerType type;
> @@ -43,6 +45,8 @@ typedef struct Layer{
>  typedef struct ConvolutionalParams{
>      int32_t input_num, output_num, kernel_size;
>      DNNActivationFunc activation;
> +    DNNPaddingFunc padding_method;
> +    int32_t dilation;
>      float *kernel;
>      float *biases;
>  } ConvolutionalParams;
> --
> 2.17.1
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
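
One way to make the feature-map-size bookkeeping Guo suggests above concrete, sketched here with hypothetical field names (out_width/out_height are not part of the posted patch):

#include <stdint.h>

typedef enum {INPUT, CONV, DEPTH_TO_SPACE} DNNLayerType;

/* Hypothetical extension of struct Layer: carry the output feature-map size so a
 * VALID convolution can shrink it and the following layer can read the new size. */
typedef struct Layer {
    DNNLayerType type;
    void *output;
    void *params;
    int32_t out_width;   /* invented for illustration */
    int32_t out_height;  /* invented for illustration */
} Layer;

/* Sketch of what set_input_output_native could compute for a stride-1 conv layer. */
static void update_conv_output_size(Layer *layer, int in_width, int in_height,
                                    int kernel_size, int dilation, int valid_padding)
{
    int pad = valid_padding ? (kernel_size - 1) / 2 * dilation : 0;
    layer->out_width  = in_width  - 2 * pad;
    layer->out_height = in_height - 2 * pad;
}
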
xwmeng@pku.edu.cn April 29, 2019, 2:21 a.m. UTC | #4
> -----Original Message-----
> From: "Guo, Yejun" <yejun.guo@intel.com>
> Sent: 2019-04-29 10:03:43 (Monday)
> To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
> Cc: 
> Subject: Re: [FFmpeg-devel] [PATCH] libavfilter: Add more operation supports in FFmpeg dnn native mode.
> 
> 
> 
> > -----Original Message-----
> > From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
> > xwmeng@pku.edu.cn
> > Sent: Sunday, April 28, 2019 5:27 PM
> > To: ffmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> > Subject: [FFmpeg-devel] [PATCH] libavfilter: Add more operation supports in
> > FFmpeg dnn native mode.
> > 
> > This patch is for the support of derain filter project in GSoC. It adds supports for
> > the following operations:
> > 
> > 
> > 
> > 
> >  (1) Conv padding method: "SAME" and "VALID"
> > 
> >  (2) Dilation
> > 
> >  (3) Activation: "NONE" and "LEAKY_RELU"
> 
> how about separate this single patch into 3 patches.
So, first, we can separate this single patch into 3 patches ('padding', 'dilation', and 'activation'). For 'padding', we can support 3 parameters: same, valid, and the extra same_clamp_to_edge used in sr. For the 'dilation' and 'padding' patches, the generation process of sr should be changed, and we can create a PR at https://github.com/HighVoltageRocknRoll/sr.  
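
A minimal sketch of what that three-value padding parameter could look like in dnn_backend_native.h; SAME_CLAMP_TO_EDGE is the transitional value discussed in this thread, and the naming here is illustrative, not taken from the patch:

/* Sketch only: VALID and SAME follow the TensorFlow semantics added by the patch;
 * SAME_CLAMP_TO_EDGE keeps the old vf_sr behavior and could be dropped once the
 * SR model generation is updated. */
typedef enum {VALID, SAME, SAME_CLAMP_TO_EDGE} DNNPaddingFunc;

/* In convolve(), the border handling would then branch per mode, roughly:
 *   VALID              -> skip a pad_size border, the output shrinks
 *   SAME               -> read 0.0 outside the image, the output keeps its size
 *   SAME_CLAMP_TO_EDGE -> clamp coordinates to the nearest edge pixel (current vf_sr behavior)
 */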

> 
> > 
> > 
> > 
> > 
> > These operations are all needed in derain filter. And if modify the dnn native
> > mode in FFmpeg, the generation process of Super Resolution model should be
> > changed accordingly, e.g. add padding method parameter (= 0) and dilation
> > parameter (= 1).
> 
> you can create a PR at https://github.com/HighVoltageRocknRoll/sr 
> 
> > 
> > 
> > 
> > 
> > In addition, I have a question about the Super Resulotion implementation. The
> > model training process of SR uses "VALID" method. According to my
> > understanding of "VALID" mode in tensorflow, the size of output image should
> > be smaller than the current design in SR. Because pixels near the boundary are
> > not processed in "VALID" mode, however, these unprocessed pixels are filled
> > with adjacent pixels in current dnn native mode. I wonder why to do like this
> > here.
> 
> I have the same concern that why the native model is not exactly the same as tf model,
> the pad layer is missed, and the native model also change the behavior of pad parameter of conv layer.
> 
> it is only suitable for vf_sr, and not general for other models.
> 
> > 
> > 
> > 
> > 
> > From 4d92ef21a5acf064122c51f442d0e2f5437b3343 Mon Sep 17 00:00:00
> > 2001
> > From: Xuewei Meng <xwmeng@pku.edu.cn>
> > Date: Sun, 28 Apr 2019 17:21:35 +0800
> > Subject: [PATCH] Add operation supports in dnn_native
> > 
> > Signed-off-by: Xuewei Meng <xwmeng@pku.edu.cn>
> > ---
> >  libavfilter/dnn_backend_native.c | 36 +++++++++++++++++++++-----------
> >  libavfilter/dnn_backend_native.h |  6 +++++-
> >  2 files changed, 29 insertions(+), 13 deletions(-)
> > 
> > diff --git a/libavfilter/dnn_backend_native.c b/libavfilter/dnn_backend_native.c
> > index 70d857f5f2..0e3ef5d64d 100644
> > --- a/libavfilter/dnn_backend_native.c
> > +++ b/libavfilter/dnn_backend_native.c
> > @@ -157,13 +157,15 @@ DNNModel *ff_dnn_load_model_native(const char
> > *model_filename)
> >                  ff_dnn_free_model_native(&model);
> >                  return NULL;
> >              }
> > +            conv_params->dilation =
> > (int32_t)avio_rl32(model_file_context);
> > +            conv_params->padding_method =
> > (int32_t)avio_rl32(model_file_context);
> >              conv_params->activation =
> > (int32_t)avio_rl32(model_file_context);
> >              conv_params->input_num =
> > (int32_t)avio_rl32(model_file_context);
> >              conv_params->output_num =
> > (int32_t)avio_rl32(model_file_context);
> >              conv_params->kernel_size =
> > (int32_t)avio_rl32(model_file_context);
> >              kernel_size = conv_params->input_num *
> > conv_params->output_num *
> >                            conv_params->kernel_size *
> > conv_params->kernel_size;
> > -            dnn_size += 16 + (kernel_size + conv_params->output_num <<
> > 2);
> > +            dnn_size += 24 + (kernel_size + conv_params->output_num <<
> > 2);
> >              if (dnn_size > file_size || conv_params->input_num <= 0 ||
> >                  conv_params->output_num <= 0 ||
> > conv_params->kernel_size <= 0){
> >                  avio_closep(&model_file_context);
> > @@ -221,23 +223,28 @@ DNNModel *ff_dnn_load_model_native(const char
> > *model_filename)
> > 
> >  static void convolve(const float *input, float *output, const
> > ConvolutionalParams *conv_params, int width, int height)
> >  {
> > -    int y, x, n_filter, ch, kernel_y, kernel_x;
> >      int radius = conv_params->kernel_size >> 1;
> >      int src_linesize = width * conv_params->input_num;
> >      int filter_linesize = conv_params->kernel_size *
> > conv_params->input_num;
> >      int filter_size = conv_params->kernel_size * filter_linesize;
> > +    int pad_size = (conv_params->padding_method == VALID) ?
> > (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
> 
> for parameter 'valid', the size of feature map is changed, it should be reflected at function set_input_output_native,
> for example, the size of network->layers[layer].output should be changed, and we might add the size info into struct Layer.
> 
> > 
> > -    for (y = 0; y < height; ++y){
> > -        for (x = 0; x < width; ++x){
> > -            for (n_filter = 0; n_filter < conv_params->output_num;
> > ++n_filter){
> > +    for (int y = pad_size; y < height - pad_size; ++y){
> > +        for (int x = pad_size; x < width - pad_size; ++x){
> > +            for (int n_filter = 0; n_filter < conv_params->output_num;
> > ++n_filter){
> >                  output[n_filter] = conv_params->biases[n_filter];
> > -                for (ch = 0; ch < conv_params->input_num; ++ch){
> > -                    for (kernel_y = 0; kernel_y <
> > conv_params->kernel_size; ++kernel_y){
> > -                        for (kernel_x = 0; kernel_x <
> > conv_params->kernel_size; ++kernel_x){
> > -                            output[n_filter] +=
> > input[CLAMP_TO_EDGE(y + kernel_y - radius, height) * src_linesize +
> > -
> > CLAMP_TO_EDGE(x + kernel_x - radius, width) * conv_params->input_num + ch]
> 
> to compatible with vf_sr.c, as a step by step method, we can keep clamp_to_edge at the first step.
> 
> it means that we can support 3 parameters for conv pad, same, valid, and this extra same_clamp_to_edge,
> we can remove same_clamp_to_edge after all the things are settled.
> 
> > *
> > -
> > conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
> > -
> > kernel_x * conv_params->input_num + ch];
> > +
> > +                for (int ch = 0; ch < conv_params->input_num; ++ch){
> > +                    for (int kernel_y = 0; kernel_y <
> > conv_params->kernel_size; ++kernel_y){
> > +                        for (int kernel_x = 0; kernel_x <
> > conv_params->kernel_size; ++kernel_x){
> > +                            int y_pos = y + (kernel_y - radius) *
> > conv_params->dilation;
> > +                            int x_pos = x + (kernel_x - radius) *
> > conv_params->dilation;
> > +
> > +                            float input_pel = (x_pos < 0 || x_pos >=
> > width || y_pos < 0 || y_pos >= height) ? 0.0 :
> > +                                               input[y_pos *
> > src_linesize + x_pos * conv_params->input_num + ch];
> > +
> > +                            output[n_filter] += input_pel *
> > conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
> > +
> > kernel_x * conv_params->input_num + ch];
> >                          }
> >                      }
> >                  }
> > @@ -250,6 +257,11 @@ static void convolve(const float *input, float *output,
> > const ConvolutionalParam
> >                      break;
> >                  case SIGMOID:
> >                      output[n_filter] = 1.0f / (1.0f + exp(-output[n_filter]));
> > +                    break;
> > +                case NONE:
> > +                    break;
> > +                case LEAKY_RELU:
> > +                    output[n_filter] = FFMAX(output[n_filter], 0.0) + 0.2 *
> > FFMIN(output[n_filter], 0.0);
> >                  }
> >              }
> >              output += conv_params->output_num;
> > diff --git a/libavfilter/dnn_backend_native.h b/libavfilter/dnn_backend_native.h
> > index 51d4cac955..f7d4eb823b 100644
> > --- a/libavfilter/dnn_backend_native.h
> > +++ b/libavfilter/dnn_backend_native.h
> > @@ -32,7 +32,9 @@
> > 
> >  typedef enum {INPUT, CONV, DEPTH_TO_SPACE} DNNLayerType;
> > 
> > -typedef enum {RELU, TANH, SIGMOID} DNNActivationFunc;
> > +typedef enum {RELU, TANH, SIGMOID, NONE, LEAKY_RELU}
> > DNNActivationFunc;
> > +
> > +typedef enum {VALID, SAME} DNNPaddingFunc;
> > 
> >  typedef struct Layer{
> >      DNNLayerType type;
> > @@ -43,6 +45,8 @@ typedef struct Layer{
> >  typedef struct ConvolutionalParams{
> >      int32_t input_num, output_num, kernel_size;
> >      DNNActivationFunc activation;
> > +    DNNPaddingFunc padding_method;
> > +    int32_t dilation;
> >      float *kernel;
> >      float *biases;
> >  } ConvolutionalParams;
> > --
> > 2.17.1
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> > 
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Pedro Arthur April 29, 2019, 2:42 a.m. UTC | #5
On Sun, Apr 28, 2019 at 11:07 PM Guo, Yejun <yejun.guo@intel.com> wrote:
>
>
>
> > -----Original Message-----
> > From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
> > xwmeng@pku.edu.cn
> > Sent: Sunday, April 28, 2019 5:27 PM
> > To: ffmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> > Subject: [FFmpeg-devel] [PATCH] libavfilter: Add more operation supports in
> > FFmpeg dnn native mode.
> >
> > This patch is for the support of derain filter project in GSoC. It adds supports for
> > the following operations:
> >
> >
> >
> >
> >  (1) Conv padding method: "SAME" and "VALID"
> >
> >  (2) Dilation
> >
> >  (3) Activation: "NONE" and "LEAKY_RELU"
>
> how about separate this single patch into 3 patches.
>
> >
> >
> >
> >
> > These operations are all needed in derain filter. And if modify the dnn native
> > mode in FFmpeg, the generation process of Super Resolution model should be
> > changed accordingly, e.g. add padding method parameter (= 0) and dilation
> > parameter (= 1).
>
> you can create a PR at https://github.com/HighVoltageRocknRoll/sr
>
> >
> >
> >
> >
> > In addition, I have a question about the Super Resulotion implementation. The
> > model training process of SR uses "VALID" method. According to my
> > understanding of "VALID" mode in tensorflow, the size of output image should
> > be smaller than the current design in SR. Because pixels near the boundary are
> > not processed in "VALID" mode, however, these unprocessed pixels are filled
> > with adjacent pixels in current dnn native mode. I wonder why to do like this
> > here.
>
> I have the same concern that why the native model is not exactly the same as tf model,
> the pad layer is missed, and the native model also change the behavior of pad parameter of conv layer.
>
> it is only suitable for vf_sr, and not general for other models.
>
I think for training these filters the preferred method is VALID, as it
uses only the data available (without filling the borders) and gives
the best possible result.
However, for inference one usually expects to output an image with the
same size as the original (imagine the case of chained filters where
each one reduces the image by a few pixels; in the end one may have a
useless output).
Therefore it makes perfect sense to use different padding methods for
training and inference.

The clamp_to_edge padding was introduced before the TF backend, thus it
stayed in the native backend even after the introduction of the TF
backend.
Indeed, clamp_to_edge is simpler than the other padding methods and
also gives a slightly better result. If I remember correctly, the student
who implemented the TF backend did not find an equivalent padding
method in TF; that's why it uses different paddings.
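
The two border behaviors being compared, as a self-contained sketch (it mirrors the zero fill used by the patch and the clamp-to-edge style indexing of the existing native backend, but it is not code from either backend):

#include <stdio.h>

/* Zero padding: samples outside the image contribute 0. */
static float sample_zero(const float *img, int w, int h, int x, int y)
{
    return (x < 0 || x >= w || y < 0 || y >= h) ? 0.0f : img[y * w + x];
}

/* Clamp-to-edge: out-of-range coordinates are clamped to the nearest border pixel. */
static float sample_clamp(const float *img, int w, int h, int x, int y)
{
    x = x < 0 ? 0 : (x >= w ? w - 1 : x);
    y = y < 0 ? 0 : (y >= h ? h - 1 : y);
    return img[y * w + x];
}

int main(void)
{
    float img[2 * 2] = {1, 2, 3, 4};
    /* One pixel left of (0, 0): zero padding reads 0, clamp-to-edge reads img[0]. */
    printf("zero padding : %.1f\n", sample_zero(img, 2, 2, -1, 0));
    printf("clamp to edge: %.1f\n", sample_clamp(img, 2, 2, -1, 0));
    return 0;
}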

> >
> >
> >
> >
> > From 4d92ef21a5acf064122c51f442d0e2f5437b3343 Mon Sep 17 00:00:00
> > 2001
> > From: Xuewei Meng <xwmeng@pku.edu.cn>
> > Date: Sun, 28 Apr 2019 17:21:35 +0800
> > Subject: [PATCH] Add operation supports in dnn_native
> >
> > Signed-off-by: Xuewei Meng <xwmeng@pku.edu.cn>
> > ---
> >  libavfilter/dnn_backend_native.c | 36 +++++++++++++++++++++-----------
> >  libavfilter/dnn_backend_native.h |  6 +++++-
> >  2 files changed, 29 insertions(+), 13 deletions(-)
> >
> > diff --git a/libavfilter/dnn_backend_native.c b/libavfilter/dnn_backend_native.c
> > index 70d857f5f2..0e3ef5d64d 100644
> > --- a/libavfilter/dnn_backend_native.c
> > +++ b/libavfilter/dnn_backend_native.c
> > @@ -157,13 +157,15 @@ DNNModel *ff_dnn_load_model_native(const char
> > *model_filename)
> >                  ff_dnn_free_model_native(&model);
> >                  return NULL;
> >              }
> > +            conv_params->dilation =
> > (int32_t)avio_rl32(model_file_context);
> > +            conv_params->padding_method =
> > (int32_t)avio_rl32(model_file_context);
> >              conv_params->activation =
> > (int32_t)avio_rl32(model_file_context);
> >              conv_params->input_num =
> > (int32_t)avio_rl32(model_file_context);
> >              conv_params->output_num =
> > (int32_t)avio_rl32(model_file_context);
> >              conv_params->kernel_size =
> > (int32_t)avio_rl32(model_file_context);
> >              kernel_size = conv_params->input_num *
> > conv_params->output_num *
> >                            conv_params->kernel_size *
> > conv_params->kernel_size;
> > -            dnn_size += 16 + (kernel_size + conv_params->output_num <<
> > 2);
> > +            dnn_size += 24 + (kernel_size + conv_params->output_num <<
> > 2);
> >              if (dnn_size > file_size || conv_params->input_num <= 0 ||
> >                  conv_params->output_num <= 0 ||
> > conv_params->kernel_size <= 0){
> >                  avio_closep(&model_file_context);
> > @@ -221,23 +223,28 @@ DNNModel *ff_dnn_load_model_native(const char
> > *model_filename)
> >
> >  static void convolve(const float *input, float *output, const
> > ConvolutionalParams *conv_params, int width, int height)
> >  {
> > -    int y, x, n_filter, ch, kernel_y, kernel_x;
> >      int radius = conv_params->kernel_size >> 1;
> >      int src_linesize = width * conv_params->input_num;
> >      int filter_linesize = conv_params->kernel_size *
> > conv_params->input_num;
> >      int filter_size = conv_params->kernel_size * filter_linesize;
> > +    int pad_size = (conv_params->padding_method == VALID) ?
> > (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
>
> for parameter 'valid', the size of feature map is changed, it should be reflected at function set_input_output_native,
> for example, the size of network->layers[layer].output should be changed, and we might add the size info into struct Layer.
>
> >
> > -    for (y = 0; y < height; ++y){
> > -        for (x = 0; x < width; ++x){
> > -            for (n_filter = 0; n_filter < conv_params->output_num;
> > ++n_filter){
> > +    for (int y = pad_size; y < height - pad_size; ++y){
> > +        for (int x = pad_size; x < width - pad_size; ++x){
> > +            for (int n_filter = 0; n_filter < conv_params->output_num;
> > ++n_filter){
> >                  output[n_filter] = conv_params->biases[n_filter];
> > -                for (ch = 0; ch < conv_params->input_num; ++ch){
> > -                    for (kernel_y = 0; kernel_y <
> > conv_params->kernel_size; ++kernel_y){
> > -                        for (kernel_x = 0; kernel_x <
> > conv_params->kernel_size; ++kernel_x){
> > -                            output[n_filter] +=
> > input[CLAMP_TO_EDGE(y + kernel_y - radius, height) * src_linesize +
> > -
> > CLAMP_TO_EDGE(x + kernel_x - radius, width) * conv_params->input_num + ch]
>
> to compatible with vf_sr.c, as a step by step method, we can keep clamp_to_edge at the first step.
>
> it means that we can support 3 parameters for conv pad, same, valid, and this extra same_clamp_to_edge,
> we can remove same_clamp_to_edge after all the things are settled.
>
> > *
> > -
> > conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
> > -
> > kernel_x * conv_params->input_num + ch];
> > +
> > +                for (int ch = 0; ch < conv_params->input_num; ++ch){
> > +                    for (int kernel_y = 0; kernel_y <
> > conv_params->kernel_size; ++kernel_y){
> > +                        for (int kernel_x = 0; kernel_x <
> > conv_params->kernel_size; ++kernel_x){
> > +                            int y_pos = y + (kernel_y - radius) *
> > conv_params->dilation;
> > +                            int x_pos = x + (kernel_x - radius) *
> > conv_params->dilation;
> > +
> > +                            float input_pel = (x_pos < 0 || x_pos >=
> > width || y_pos < 0 || y_pos >= height) ? 0.0 :
> > +                                               input[y_pos *
> > src_linesize + x_pos * conv_params->input_num + ch];
> > +
> > +                            output[n_filter] += input_pel *
> > conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
> > +
> > kernel_x * conv_params->input_num + ch];
> >                          }
> >                      }
> >                  }
> > @@ -250,6 +257,11 @@ static void convolve(const float *input, float *output,
> > const ConvolutionalParam
> >                      break;
> >                  case SIGMOID:
> >                      output[n_filter] = 1.0f / (1.0f + exp(-output[n_filter]));
> > +                    break;
> > +                case NONE:
> > +                    break;
> > +                case LEAKY_RELU:
> > +                    output[n_filter] = FFMAX(output[n_filter], 0.0) + 0.2 *
> > FFMIN(output[n_filter], 0.0);
> >                  }
> >              }
> >              output += conv_params->output_num;
> > diff --git a/libavfilter/dnn_backend_native.h b/libavfilter/dnn_backend_native.h
> > index 51d4cac955..f7d4eb823b 100644
> > --- a/libavfilter/dnn_backend_native.h
> > +++ b/libavfilter/dnn_backend_native.h
> > @@ -32,7 +32,9 @@
> >
> >  typedef enum {INPUT, CONV, DEPTH_TO_SPACE} DNNLayerType;
> >
> > -typedef enum {RELU, TANH, SIGMOID} DNNActivationFunc;
> > +typedef enum {RELU, TANH, SIGMOID, NONE, LEAKY_RELU}
> > DNNActivationFunc;
> > +
> > +typedef enum {VALID, SAME} DNNPaddingFunc;
> >
> >  typedef struct Layer{
> >      DNNLayerType type;
> > @@ -43,6 +45,8 @@ typedef struct Layer{
> >  typedef struct ConvolutionalParams{
> >      int32_t input_num, output_num, kernel_size;
> >      DNNActivationFunc activation;
> > +    DNNPaddingFunc padding_method;
> > +    int32_t dilation;
> >      float *kernel;
> >      float *biases;
> >  } ConvolutionalParams;
> > --
> > 2.17.1
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
xwmeng@pku.edu.cn April 29, 2019, 3:06 a.m. UTC | #6
> -----Original Message-----
> From: "Pedro Arthur" <bygrandao@gmail.com>
> Sent: 2019-04-29 10:42:42 (Monday)
> To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
> Cc: 
> Subject: Re: [FFmpeg-devel] [PATCH] libavfilter: Add more operation supports in FFmpeg dnn native mode.
> 
> On Sun, Apr 28, 2019 at 11:07 PM Guo, Yejun <yejun.guo@intel.com> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
> > > xwmeng@pku.edu.cn
> > > Sent: Sunday, April 28, 2019 5:27 PM
> > > To: ffmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> > > Subject: [FFmpeg-devel] [PATCH] libavfilter: Add more operation supports in
> > > FFmpeg dnn native mode.
> > >
> > > This patch is for the support of derain filter project in GSoC. It adds supports for
> > > the following operations:
> > >
> > >
> > >
> > >
> > >  (1) Conv padding method: "SAME" and "VALID"
> > >
> > >  (2) Dilation
> > >
> > >  (3) Activation: "NONE" and "LEAKY_RELU"
> >
> > how about separate this single patch into 3 patches.
> >
> > >
> > >
> > >
> > >
> > > These operations are all needed in derain filter. And if modify the dnn native
> > > mode in FFmpeg, the generation process of Super Resolution model should be
> > > changed accordingly, e.g. add padding method parameter (= 0) and dilation
> > > parameter (= 1).
> >
> > you can create a PR at https://github.com/HighVoltageRocknRoll/sr
> >
> > >
> > >
> > >
> > >
> > > In addition, I have a question about the Super Resulotion implementation. The
> > > model training process of SR uses "VALID" method. According to my
> > > understanding of "VALID" mode in tensorflow, the size of output image should
> > > be smaller than the current design in SR. Because pixels near the boundary are
> > > not processed in "VALID" mode, however, these unprocessed pixels are filled
> > > with adjacent pixels in current dnn native mode. I wonder why to do like this
> > > here.
> >
> > I have the same concern that why the native model is not exactly the same as tf model,
> > the pad layer is missed, and the native model also change the behavior of pad parameter of conv layer.
> >
> > it is only suitable for vf_sr, and not general for other models.
> >
> I think for training these filters the preferred method is VALID as it
> uses only the data available (without filling the borders) and gives
> the best possible result.
> However for inference usually one expects to output an image with the
> same size of the original (imagine the case of chained filters where
> each one reduces the image by a few pixels, in the end one may have a
> useless output).
> Therefore it makes perfect sense to use different padding methods for
> training/inference.
> 
> The clamp_to_edge padding was introduced before the TF backend thus it
> stayed in the native backend even after the introduction of the TF
> backend.
> Indeed the clamp_to_edge is simpler than the other padding methods and
> also gives a slight better result, If I remember correct the student
> which implemented the TF backend did not find an equivalent padding
> method in TF, thats why it uses different paddings.
> 
Yes, I think clamp_to_edge is a good method to keep the output the same size as the input. However, I don't think "VALID" is the best method for giving the best possible result. So, for "VALID" mode, maybe we can use the clamp_to_edge method in the current dnn native mode? And then, we should also add a "SAME" option to support other filters.

> > >
> > >
> > >
> > >
> > > From 4d92ef21a5acf064122c51f442d0e2f5437b3343 Mon Sep 17 00:00:00
> > > 2001
> > > From: Xuewei Meng <xwmeng@pku.edu.cn>
> > > Date: Sun, 28 Apr 2019 17:21:35 +0800
> > > Subject: [PATCH] Add operation supports in dnn_native
> > >
> > > Signed-off-by: Xuewei Meng <xwmeng@pku.edu.cn>
> > > ---
> > >  libavfilter/dnn_backend_native.c | 36 +++++++++++++++++++++-----------
> > >  libavfilter/dnn_backend_native.h |  6 +++++-
> > >  2 files changed, 29 insertions(+), 13 deletions(-)
> > >
> > > diff --git a/libavfilter/dnn_backend_native.c b/libavfilter/dnn_backend_native.c
> > > index 70d857f5f2..0e3ef5d64d 100644
> > > --- a/libavfilter/dnn_backend_native.c
> > > +++ b/libavfilter/dnn_backend_native.c
> > > @@ -157,13 +157,15 @@ DNNModel *ff_dnn_load_model_native(const char
> > > *model_filename)
> > >                  ff_dnn_free_model_native(&model);
> > >                  return NULL;
> > >              }
> > > +            conv_params->dilation =
> > > (int32_t)avio_rl32(model_file_context);
> > > +            conv_params->padding_method =
> > > (int32_t)avio_rl32(model_file_context);
> > >              conv_params->activation =
> > > (int32_t)avio_rl32(model_file_context);
> > >              conv_params->input_num =
> > > (int32_t)avio_rl32(model_file_context);
> > >              conv_params->output_num =
> > > (int32_t)avio_rl32(model_file_context);
> > >              conv_params->kernel_size =
> > > (int32_t)avio_rl32(model_file_context);
> > >              kernel_size = conv_params->input_num *
> > > conv_params->output_num *
> > >                            conv_params->kernel_size *
> > > conv_params->kernel_size;
> > > -            dnn_size += 16 + (kernel_size + conv_params->output_num <<
> > > 2);
> > > +            dnn_size += 24 + (kernel_size + conv_params->output_num <<
> > > 2);
> > >              if (dnn_size > file_size || conv_params->input_num <= 0 ||
> > >                  conv_params->output_num <= 0 ||
> > > conv_params->kernel_size <= 0){
> > >                  avio_closep(&model_file_context);
> > > @@ -221,23 +223,28 @@ DNNModel *ff_dnn_load_model_native(const char
> > > *model_filename)
> > >
> > >  static void convolve(const float *input, float *output, const
> > > ConvolutionalParams *conv_params, int width, int height)
> > >  {
> > > -    int y, x, n_filter, ch, kernel_y, kernel_x;
> > >      int radius = conv_params->kernel_size >> 1;
> > >      int src_linesize = width * conv_params->input_num;
> > >      int filter_linesize = conv_params->kernel_size *
> > > conv_params->input_num;
> > >      int filter_size = conv_params->kernel_size * filter_linesize;
> > > +    int pad_size = (conv_params->padding_method == VALID) ?
> > > (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
> >
> > for parameter 'valid', the size of feature map is changed, it should be reflected at function set_input_output_native,
> > for example, the size of network->layers[layer].output should be changed, and we might add the size info into struct Layer.
> >
> > >
> > > -    for (y = 0; y < height; ++y){
> > > -        for (x = 0; x < width; ++x){
> > > -            for (n_filter = 0; n_filter < conv_params->output_num;
> > > ++n_filter){
> > > +    for (int y = pad_size; y < height - pad_size; ++y){
> > > +        for (int x = pad_size; x < width - pad_size; ++x){
> > > +            for (int n_filter = 0; n_filter < conv_params->output_num;
> > > ++n_filter){
> > >                  output[n_filter] = conv_params->biases[n_filter];
> > > -                for (ch = 0; ch < conv_params->input_num; ++ch){
> > > -                    for (kernel_y = 0; kernel_y <
> > > conv_params->kernel_size; ++kernel_y){
> > > -                        for (kernel_x = 0; kernel_x <
> > > conv_params->kernel_size; ++kernel_x){
> > > -                            output[n_filter] +=
> > > input[CLAMP_TO_EDGE(y + kernel_y - radius, height) * src_linesize +
> > > -
> > > CLAMP_TO_EDGE(x + kernel_x - radius, width) * conv_params->input_num + ch]
> >
> > to compatible with vf_sr.c, as a step by step method, we can keep clamp_to_edge at the first step.
> >
> > it means that we can support 3 parameters for conv pad, same, valid, and this extra same_clamp_to_edge,
> > we can remove same_clamp_to_edge after all the things are settled.
> >
> > > *
> > > -
> > > conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
> > > -
> > > kernel_x * conv_params->input_num + ch];
> > > +
> > > +                for (int ch = 0; ch < conv_params->input_num; ++ch){
> > > +                    for (int kernel_y = 0; kernel_y <
> > > conv_params->kernel_size; ++kernel_y){
> > > +                        for (int kernel_x = 0; kernel_x <
> > > conv_params->kernel_size; ++kernel_x){
> > > +                            int y_pos = y + (kernel_y - radius) *
> > > conv_params->dilation;
> > > +                            int x_pos = x + (kernel_x - radius) *
> > > conv_params->dilation;
> > > +
> > > +                            float input_pel = (x_pos < 0 || x_pos >=
> > > width || y_pos < 0 || y_pos >= height) ? 0.0 :
> > > +                                               input[y_pos *
> > > src_linesize + x_pos * conv_params->input_num + ch];
> > > +
> > > +                            output[n_filter] += input_pel *
> > > conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
> > > +
> > > kernel_x * conv_params->input_num + ch];
> > >                          }
> > >                      }
> > >                  }
> > > @@ -250,6 +257,11 @@ static void convolve(const float *input, float *output,
> > > const ConvolutionalParam
> > >                      break;
> > >                  case SIGMOID:
> > >                      output[n_filter] = 1.0f / (1.0f + exp(-output[n_filter]));
> > > +                    break;
> > > +                case NONE:
> > > +                    break;
> > > +                case LEAKY_RELU:
> > > +                    output[n_filter] = FFMAX(output[n_filter], 0.0) + 0.2 *
> > > FFMIN(output[n_filter], 0.0);
> > >                  }
> > >              }
> > >              output += conv_params->output_num;
> > > diff --git a/libavfilter/dnn_backend_native.h b/libavfilter/dnn_backend_native.h
> > > index 51d4cac955..f7d4eb823b 100644
> > > --- a/libavfilter/dnn_backend_native.h
> > > +++ b/libavfilter/dnn_backend_native.h
> > > @@ -32,7 +32,9 @@
> > >
> > >  typedef enum {INPUT, CONV, DEPTH_TO_SPACE} DNNLayerType;
> > >
> > > -typedef enum {RELU, TANH, SIGMOID} DNNActivationFunc;
> > > +typedef enum {RELU, TANH, SIGMOID, NONE, LEAKY_RELU}
> > > DNNActivationFunc;
> > > +
> > > +typedef enum {VALID, SAME} DNNPaddingFunc;
> > >
> > >  typedef struct Layer{
> > >      DNNLayerType type;
> > > @@ -43,6 +45,8 @@ typedef struct Layer{
> > >  typedef struct ConvolutionalParams{
> > >      int32_t input_num, output_num, kernel_size;
> > >      DNNActivationFunc activation;
> > > +    DNNPaddingFunc padding_method;
> > > +    int32_t dilation;
> > >      float *kernel;
> > >      float *biases;
> > >  } ConvolutionalParams;
> > > --
> > > 2.17.1
> > >
Pedro Arthur April 29, 2019, 3:26 p.m. UTC | #7
On Mon, Apr 29, 2019 at 00:06, <xwmeng@pku.edu.cn> wrote:
>
>
>
>
> > -----Original Message-----
> > From: "Pedro Arthur" <bygrandao@gmail.com>
> > Sent: 2019-04-29 10:42:42 (Monday)
> > To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
> > Cc:
> > Subject: Re: [FFmpeg-devel] [PATCH] libavfilter: Add more operation supports in FFmpeg dnn native mode.
> >
> > I think for training these filters the preferred method is VALID, as it
> > uses only the data available (without filling the borders) and gives
> > the best possible result.
> > However, for inference one usually expects to output an image with the
> > same size as the original (imagine the case of chained filters where
> > each one reduces the image by a few pixels; in the end one may have a
> > useless output).
> > Therefore it makes perfect sense to use different padding methods for
> > training/inference.
> >
> > The clamp_to_edge padding was introduced before the TF backend, thus it
> > stayed in the native backend even after the introduction of the TF
> > backend.
> > Indeed clamp_to_edge is simpler than the other padding methods and
> > also gives a slightly better result. If I remember correctly, the student
> > who implemented the TF backend did not find an equivalent padding
> > method in TF; that's why it uses different paddings.
> >
> Yes, I think clamp_to_edge is a good method to keep the output the same size as the input. However, I don't think "VALID" is the method that gives the best possible result. So, for "VALID" mode, maybe we can use the clamp_to_edge method in the current dnn native mode? And then, we should also add a "SAME" option to support other filters.
>

I think it is best not to make any assumptions like VALID =>
clamp_to_edge, but you can keep it for now.
Ideally the model should have a padding layer which the backend
properly implements. Currently the TF backend, when reading a native
model, adds this padding layer implicitly, therefore it would be a
matter of changing it to have an explicit padding layer in the model.

Maybe you can assume VALID => clamp_to_edge, so you can add what you
need without changing the SR code, and later you implement the
explicit padding support and send a PR to the original repo
(https://github.com/HighVoltageRocknRoll/sr) properly modifying the
model.
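
To make the size effect of the two TF-style padding modes concrete, here is a minimal sketch (with a hypothetical helper name, not part of the patch) of the stride-1 output size under VALID and SAME, dilation included:

/* Hypothetical sketch: spatial output size of a stride-1 convolution. */
#include <stdio.h>

static int conv_output_size(int input_size, int kernel_size, int dilation, int same_padding)
{
    /* effective kernel extent once dilation spreads the taps apart */
    int effective_kernel = (kernel_size - 1) * dilation + 1;
    /* SAME keeps the size (stride 1); VALID drops the border that lacks context */
    return same_padding ? input_size : input_size - effective_kernel + 1;
}

int main(void)
{
    /* e.g. a 3x3 kernel with dilation 2 on a 100-pixel row */
    printf("VALID: %d\n", conv_output_size(100, 3, 2, 0)); /* 96 */
    printf("SAME : %d\n", conv_output_size(100, 3, 2, 1)); /* 100 */
    return 0;
}

With a 3x3 kernel and dilation 2, a 100-pixel dimension shrinks to 96 under VALID and stays at 100 under SAME, which is why a chain of VALID filters can eat the image away.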
Guo, Yejun April 30, 2019, 12:43 a.m. UTC | #8
> -----Original Message-----
> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of Pedro Arthur
> Sent: Monday, April 29, 2019 11:26 PM
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH] libavfilter: Add more operation supports in FFmpeg dnn native mode.
>
> On Mon, Apr 29, 2019 at 00:06, <xwmeng@pku.edu.cn> wrote:
> >
> > > -----Original Message-----
> > > From: "Pedro Arthur" <bygrandao@gmail.com>
> > > Sent: 2019-04-29 10:42:42 (Monday)
> > > To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
> > > Cc:
> > > Subject: Re: [FFmpeg-devel] [PATCH] libavfilter: Add more operation supports in FFmpeg dnn native mode.
> > >
> > > I think for training these filters the preferred method is VALID, as it
> > > uses only the data available (without filling the borders) and gives
> > > the best possible result.
> > > However, for inference one usually expects to output an image with the
> > > same size as the original (imagine the case of chained filters where
> > > each one reduces the image by a few pixels; in the end one may have a
> > > useless output).
> > > Therefore it makes perfect sense to use different padding methods for
> > > training/inference.
> > >
> > > The clamp_to_edge padding was introduced before the TF backend, thus it
> > > stayed in the native backend even after the introduction of the TF
> > > backend.
> > > Indeed clamp_to_edge is simpler than the other padding methods and
> > > also gives a slightly better result. If I remember correctly, the student
> > > who implemented the TF backend did not find an equivalent padding
> > > method in TF; that's why it uses different paddings.
> > >
> > Yes, I think clamp_to_edge is a good method to keep the output the same
> > size as the input. However, I don't think "VALID" is the method that gives
> > the best possible result. So, for "VALID" mode, maybe we can use the
> > clamp_to_edge method in the current dnn native mode? And then, we should
> > also add a "SAME" option to support other filters.


@xwmeng, we can now support 3 padding options for the conv layer.
- valid: the same definition as in the TF model (the size of the feature map changes).
- same: the same definition as in the TF model (zero fill for outside pixels).
- same_clamp_to_edge: just like the 'same' option, but edge fill for outside pixels, so we don't need to change the SR code for the padding.
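
A minimal sketch of how a single input tap could be fetched under these three options (the enum and function names below are hypothetical, not part of the patch; 'valid' never reads outside the image because the output loop is shrunk instead):

/* Hypothetical per-tap fetch for the three padding behaviours. */
enum pad_mode { PAD_VALID, PAD_SAME, PAD_SAME_CLAMP_TO_EDGE };

static float fetch_pel(const float *input, int width, int height, int channels,
                       int x, int y, int ch, enum pad_mode mode)
{
    if (x < 0 || x >= width || y < 0 || y >= height) {
        if (mode == PAD_SAME)
            return 0.0f;   /* zero fill outside the image */
        if (mode == PAD_VALID)
            return 0.0f;   /* not reached: the output loop is shrunk, so
                              out-of-range taps are never requested */
        /* PAD_SAME_CLAMP_TO_EDGE: replicate the nearest edge pixel */
        x = x < 0 ? 0 : (x >= width  ? width  - 1 : x);
        y = y < 0 ? 0 : (y >= height ? height - 1 : y);
    }
    return input[(y * width + x) * channels + ch];
}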

> >
>
> I think it is best not to make any assumptions like VALID =>
> clamp_to_edge, but you can keep it for now.
> Ideally the model should have a padding layer which the backend
> properly implements. Currently the TF backend, when reading a native
> model, adds this padding layer implicitly, therefore it would be a
> matter of changing it to have an explicit padding layer in the model.
>
> Maybe you can assume VALID => clamp_to_edge, so you can add what you
> need without changing the SR code, and later you implement the
> explicit padding support and send a PR to the original repo
> (https://github.com/HighVoltageRocknRoll/sr) properly modifying the
> model.


Yes, the ideal solution to make the native SR model the same as the TF model is to:
- add an explicit padding layer to the SR native model (the padding layer is explicitly added in the TF model).
- export the 'valid' option for the conv layer in the SR native model (the TF model uses the 'valid' option).
- add padding layer execution support in native mode.
- add padding option support for the conv layer in native mode.

And we can first keep most of the SR code unchanged by adding a 'same_clamp_to_edge' option for the conv layer
in native mode. This can be our current choice, and the ideal solution can be our next choice, just as you mentioned.
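
As a rough sketch, and only under the assumption of a hypothetical pad_edge() helper that is not part of this patch, an explicit replicate-edge padding layer in the native backend could look like:

/* Hypothetical replicate-edge padding layer: dst must hold
 * (width + 2*pad) * (height + 2*pad) * channels floats. */
static void pad_edge(const float *src, float *dst,
                     int width, int height, int channels, int pad)
{
    int dst_w = width + 2 * pad;
    int dst_h = height + 2 * pad;
    for (int y = 0; y < dst_h; ++y) {
        int sy = y - pad;
        sy = sy < 0 ? 0 : (sy >= height ? height - 1 : sy);
        for (int x = 0; x < dst_w; ++x) {
            int sx = x - pad;
            sx = sx < 0 ? 0 : (sx >= width ? width - 1 : sx);
            for (int ch = 0; ch < channels; ++ch)
                dst[(y * dst_w + x) * channels + ch] = src[(sy * width + sx) * channels + ch];
        }
    }
}

A 'valid' convolution applied after such a padding step would then produce an output of the same size as the original input, matching what clamp_to_edge achieves today.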

xwmeng@pku.edu.cn May 1, 2019, 9:08 a.m. UTC | #9
> -----Original Message-----
> From: "Guo, Yejun" <yejun.guo@intel.com>
> Sent: 2019-04-30 08:43:43 (Tuesday)
> To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
> Cc:
> Subject: Re: [FFmpeg-devel] [PATCH] libavfilter: Add more operation supports in FFmpeg dnn native mode.
> 
> 
> 
> > -----Original Message-----
> > From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
> > Pedro Arthur
> > Sent: Monday, April 29, 2019 11:26 PM
> > To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> > Subject: Re: [FFmpeg-devel] [PATCH] libavfilter: Add more operation supports in
> > FFmpeg dnn native mode.
> > 
> > On Mon, Apr 29, 2019 at 00:06, <xwmeng@pku.edu.cn> wrote:
> > >
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: "Pedro Arthur" <bygrandao@gmail.com>
> > > > Sent: 2019-04-29 10:42:42 (Monday)
> > > > To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
> > > > Cc:
> > > > Subject: Re: [FFmpeg-devel] [PATCH] libavfilter: Add more operation supports in FFmpeg dnn native mode.
> > > >
> > > > I think for training these filters the preferred method is VALID as it
> > > > uses only the data available (without filling the borders) and gives
> > > > the best possible result.
> > > > However for inference usually one expects to output an image with the
> > > > same size of the original (imagine the case of chained filters where
> > > > each one reduces the image by a few pixels, in the end one may have a
> > > > useless output).
> > > > Therefore it makes perfect sense to use different padding methods for
> > > > training/inference.
> > > >
> > > > The clamp_to_edge padding was introduced before the TF backend thus it
> > > > stayed in the native backend even after the introduction of the TF
> > > > backend.
> > > > Indeed the clamp_to_edge is simpler than the other padding methods and
> > > > also gives a slight better result, If I remember correct the student
> > > > which implemented the TF backend did not find an equivalent padding
> > > > method in TF, thats why it uses different paddings.
> > > >
> > > Yes, I think clamp_to_edge is a good method to keep the output with the
> > same size as input. However, I don't think "VALID" is the best method giving
> > best possible result. So, for "VALID" mode, maybe we can use the
> > clamp_to_edge method in the current dnn native mode? And then, we should
> > also add "SAME" option to support other filters.
> 
> @xwmeng, We now can support 3 padding options for conv layer.
> - valid:  the same definition as TF model. (the size of feature map changed).
> - same:  the same definition as TF model. (zero filled for outside pixels).
> - same_clamp_to_edge: just like 'same' option, but edge filled for outside pixels. So we don't need change SR code for the padding.

I think as long as we add different padding options, we need to change the SR model generation code.
> 
> > >
> > 
> > I think it is best to not make any assumptions like VALID =>
> > clamp_to_edge, but you can keep it for now.
> > Ideally the model should have a padding layer which the backend
> > properly implements. Currently the TF backend when reading a native
> > model adds this padding layer implicitly, therefore it would be a
> > matter of changing it to have an explicity padding layer in the model.
> > 
> > Maybe you can assume VALID => clamp_to_edge, so you can add what you
> > need without changing the SR code and later you implement the
> > explicity padding support and send a PR to the original repo
> > (https://github.com/HighVoltageRocknRoll/sr) properly modifying the
> > model.
> 
> yes, the ideal solution to make native SR model the same as TF model, is to:
> - add explicit padding layer in SR native model. (the padding layer is explicitly added in TF model)
> - export 'valid' option for conv layer in SR native model. (the TF model uses 'valid' option)
> - add padding layer execution support in native mode.
> - add padding option support in conv layer in native mode.
> 
> And, we can first keep most of SR code unchanged, by adding 'same_clamp_to_edge' option in conv layer
> in native mode. This can be our current choice. And the ideal solution can be our next choice, just as you mentioned.
> 
Guo, Yejun May 4, 2019, 10:48 a.m. UTC | #10
> >
> > @xwmeng, we can now support 3 padding options for the conv layer.
> > - valid: the same definition as in the TF model (the size of the feature map changes).
> > - same: the same definition as in the TF model (zero fill for outside pixels).
> > - same_clamp_to_edge: just like the 'same' option, but edge fill for outside pixels, so we don't need to change the SR code for the padding.
>
> I think as long as we add different padding options, we need to change the SR model generation code.
> >


Yes, you are right. Sorry for not making it clear: I meant that the key logic/code for padding does not change.
We have to add code to export the value of the pad option into the SR native model, no matter which method is used.
Patch

diff --git a/libavfilter/dnn_backend_native.c b/libavfilter/dnn_backend_native.c
index 70d857f5f2..0e3ef5d64d 100644
--- a/libavfilter/dnn_backend_native.c
+++ b/libavfilter/dnn_backend_native.c
@@ -157,13 +157,15 @@  DNNModel *ff_dnn_load_model_native(const char *model_filename)
                 ff_dnn_free_model_native(&model);
                 return NULL;
             }
+            conv_params->dilation = (int32_t)avio_rl32(model_file_context);
+            conv_params->padding_method = (int32_t)avio_rl32(model_file_context);
             conv_params->activation = (int32_t)avio_rl32(model_file_context);
             conv_params->input_num = (int32_t)avio_rl32(model_file_context);
             conv_params->output_num = (int32_t)avio_rl32(model_file_context);
             conv_params->kernel_size = (int32_t)avio_rl32(model_file_context);
             kernel_size = conv_params->input_num * conv_params->output_num *
                           conv_params->kernel_size * conv_params->kernel_size;
-            dnn_size += 16 + (kernel_size + conv_params->output_num << 2);
+            dnn_size += 24 + (kernel_size + conv_params->output_num << 2);
             if (dnn_size > file_size || conv_params->input_num <= 0 ||
                 conv_params->output_num <= 0 || conv_params->kernel_size <= 0){
                 avio_closep(&model_file_context);
@@ -221,23 +223,28 @@  DNNModel *ff_dnn_load_model_native(const char *model_filename)
 
 static void convolve(const float *input, float *output, const ConvolutionalParams *conv_params, int width, int height)
 {
-    int y, x, n_filter, ch, kernel_y, kernel_x;
     int radius = conv_params->kernel_size >> 1;
     int src_linesize = width * conv_params->input_num;
     int filter_linesize = conv_params->kernel_size * conv_params->input_num;
     int filter_size = conv_params->kernel_size * filter_linesize;
+    int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
 
-    for (y = 0; y < height; ++y){
-        for (x = 0; x < width; ++x){
-            for (n_filter = 0; n_filter < conv_params->output_num; ++n_filter){
+    for (int y = pad_size; y < height - pad_size; ++y){
+        for (int x = pad_size; x < width - pad_size; ++x){
+            for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter){
                 output[n_filter] = conv_params->biases[n_filter];
-                for (ch = 0; ch < conv_params->input_num; ++ch){
-                    for (kernel_y = 0; kernel_y < conv_params->kernel_size; ++kernel_y){
-                        for (kernel_x = 0; kernel_x < conv_params->kernel_size; ++kernel_x){
-                            output[n_filter] += input[CLAMP_TO_EDGE(y + kernel_y - radius, height) * src_linesize +
-                                                      CLAMP_TO_EDGE(x + kernel_x - radius, width) * conv_params->input_num + ch] *
-                                                conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
-                                                                    kernel_x * conv_params->input_num + ch];
+
+                for (int ch = 0; ch < conv_params->input_num; ++ch){
+                    for (int kernel_y = 0; kernel_y < conv_params->kernel_size; ++kernel_y){
+                        for (int kernel_x = 0; kernel_x < conv_params->kernel_size; ++kernel_x){
+                            int y_pos = y + (kernel_y - radius) * conv_params->dilation;
+                            int x_pos = x + (kernel_x - radius) * conv_params->dilation;
+
+                            float input_pel = (x_pos < 0 || x_pos >= width || y_pos < 0 || y_pos >= height) ? 0.0 : 
+                                               input[y_pos * src_linesize + x_pos * conv_params->input_num + ch];
+
+                            output[n_filter] += input_pel * conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
+                                                                                kernel_x * conv_params->input_num + ch];
                         }
                     }
                 }
@@ -250,6 +257,11 @@  static void convolve(const float *input, float *output, const ConvolutionalParam
                     break;
                 case SIGMOID:
                     output[n_filter] = 1.0f / (1.0f + exp(-output[n_filter]));
+                    break;
+                case NONE:
+                    break;
+                case LEAKY_RELU:
+                    output[n_filter] = FFMAX(output[n_filter], 0.0) + 0.2 * FFMIN(output[n_filter], 0.0);
                 }
             }
             output += conv_params->output_num;
diff --git a/libavfilter/dnn_backend_native.h b/libavfilter/dnn_backend_native.h
index 51d4cac955..f7d4eb823b 100644
--- a/libavfilter/dnn_backend_native.h
+++ b/libavfilter/dnn_backend_native.h
@@ -32,7 +32,9 @@ 
 
 typedef enum {INPUT, CONV, DEPTH_TO_SPACE} DNNLayerType;
 
-typedef enum {RELU, TANH, SIGMOID} DNNActivationFunc;
+typedef enum {RELU, TANH, SIGMOID, NONE, LEAKY_RELU} DNNActivationFunc;
+
+typedef enum {VALID, SAME} DNNPaddingFunc;
 
 typedef struct Layer{
     DNNLayerType type;
@@ -43,6 +45,8 @@  typedef struct Layer{
 typedef struct ConvolutionalParams{
     int32_t input_num, output_num, kernel_size;
     DNNActivationFunc activation;
+    DNNPaddingFunc padding_method;
+    int32_t dilation;
     float *kernel;
     float *biases;
 } ConvolutionalParams;
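
For reference, the LEAKY_RELU case added in convolve() evaluates FFMAX(x, 0.0) + 0.2 * FFMIN(x, 0.0), i.e. a leaky ReLU with a 0.2 slope for negative inputs; a minimal standalone sketch of the same function (not part of the patch):

/* Hypothetical standalone equivalent of the LEAKY_RELU case above. */
#include <stdio.h>

static float leaky_relu(float x)
{
    return x > 0.0f ? x : 0.2f * x;
}

int main(void)
{
    printf("%f %f\n", leaky_relu(1.5f), leaky_relu(-1.5f)); /* 1.5 -0.3 */
    return 0;
}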