
[FFmpeg-devel,1/3] dnn: introduce dnn operand (in c code) to hold operand infos within network

Message ID 1566291015-12763-1-git-send-email-yejun.guo@intel.com
State Superseded
Headers show

Commit Message

Guo, Yejun Aug. 20, 2019, 8:50 a.m. UTC
the info can be saved in the dnn operand object without being regenerated again and again,
and it is also needed for layer split/merge and for memory reuse.

to keep the change incremental, this patch focuses on the c code only;
the corresponding change to the python script will be added later.

Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
---
 libavfilter/dnn/dnn_backend_native.c           | 226 ++++++++++++-------------
 libavfilter/dnn/dnn_backend_native.h           |  54 +++++-
 libavfilter/dnn/dnn_backend_native_layer_pad.c |  24 ++-
 libavfilter/dnn/dnn_backend_native_layer_pad.h |   4 +-
 tests/dnn/Makefile                             |   2 +-
 tests/dnn/dnn-layer-pad-test.c                 |  60 +++++--
 6 files changed, 236 insertions(+), 134 deletions(-)
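
To make the design described in the commit message concrete, here is a minimal standalone sketch of layers referring to their input and output operands by index into a shared operand array. It uses simplified stand-in types and plain malloc(), not the DnnOperand/Layer structs or FFmpeg helpers added by the patch itself:

/*
 * Standalone illustration only (simplified stand-in types, not the
 * DnnOperand/Layer structs added by this patch).
 * Build: cc -o operand_demo operand_demo.c
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct ToyOperand {
    int dims[4];   /* N, H, W, C (NHWC) */
    float *data;   /* filled in by the layer that produces this operand */
} ToyOperand;

typedef struct ToyLayer {
    int input_operand_index;   /* index into a shared operand array */
    int output_operand_index;
} ToyLayer;

static size_t toy_operand_data_length(const ToyOperand *op)
{
    return (size_t)op->dims[0] * op->dims[1] * op->dims[2] * op->dims[3] * sizeof(float);
}

/* a toy "pad by 1 pixel on each side" layer: reads the input shape from
 * the operand it points at, then sets the shape/buffer of its output operand */
static int toy_pad_layer(ToyOperand *operands, const ToyLayer *layer)
{
    const ToyOperand *in = &operands[layer->input_operand_index];
    ToyOperand *out = &operands[layer->output_operand_index];

    out->dims[0] = in->dims[0];
    out->dims[1] = in->dims[1] + 2;
    out->dims[2] = in->dims[2] + 2;
    out->dims[3] = in->dims[3];
    out->data = malloc(toy_operand_data_length(out));
    return out->data ? 0 : -1;
}

int main(void)
{
    ToyOperand operands[2] = { { {1, 4, 4, 3}, NULL }, { {0, 0, 0, 0}, NULL } };
    ToyLayer pad = { 0, 1 };

    if (toy_pad_layer(operands, &pad) < 0)
        return 1;
    printf("output operand: %dx%dx%dx%d, %zu bytes\n",
           operands[1].dims[0], operands[1].dims[1],
           operands[1].dims[2], operands[1].dims[3],
           toy_operand_data_length(&operands[1]));
    free(operands[1].data);
    return 0;
}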

Comments

Pedro Arthur Aug. 27, 2019, 2:24 p.m. UTC | #1
Hi,


Em ter, 20 de ago de 2019 às 05:54, Guo, Yejun <yejun.guo@intel.com> escreveu:
>
> the info can be saved in dnn operand object without regenerating again and again,
> and it is also needed for layer split/merge, and for memory reuse.
>
> to make things step by step, this patch just focuses on c code,
> the change within python script will be added later.
>
> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> ---
>  libavfilter/dnn/dnn_backend_native.c           | 226 ++++++++++++-------------
>  libavfilter/dnn/dnn_backend_native.h           |  54 +++++-
>  libavfilter/dnn/dnn_backend_native_layer_pad.c |  24 ++-
>  libavfilter/dnn/dnn_backend_native_layer_pad.h |   4 +-
>  tests/dnn/Makefile                             |   2 +-
>  tests/dnn/dnn-layer-pad-test.c                 |  60 +++++--
>  6 files changed, 236 insertions(+), 134 deletions(-)
>
> diff --git a/libavfilter/dnn/dnn_backend_native.c b/libavfilter/dnn/dnn_backend_native.c
> index d52abc6..78227a5 100644
> --- a/libavfilter/dnn/dnn_backend_native.c
> +++ b/libavfilter/dnn/dnn_backend_native.c
> @@ -30,77 +30,30 @@
>  static DNNReturnType set_input_output_native(void *model, DNNInputData *input, const char *input_name, const char **output_names, uint32_t nb_output)
>  {
>      ConvolutionalNetwork *network = (ConvolutionalNetwork *)model;
> -    InputParams *input_params;
> -    ConvolutionalParams *conv_params;
> -    DepthToSpaceParams *depth_to_space_params;
> -    LayerPadParams *pad_params;
> -    int cur_width, cur_height, cur_channels;
> -    int32_t layer;
>
> -    if (network->layers_num <= 0 || network->layers[0].type != INPUT){
> +    if (network->layers_num <= 0 || network->operands_num <= 0)
>          return DNN_ERROR;
> -    }
> -    else{
> -        input_params = (InputParams *)network->layers[0].params;
> -        input_params->width = cur_width = input->width;
> -        input_params->height = cur_height = input->height;
> -        input_params->channels = cur_channels = input->channels;
> -        if (input->data){
> -            av_freep(&input->data);
> -        }
> -        av_assert0(input->dt == DNN_FLOAT);
> -        network->layers[0].output = input->data = av_malloc(cur_height * cur_width * cur_channels * sizeof(float));
> -        if (!network->layers[0].output){
> -            return DNN_ERROR;
> -        }
> -    }
> -
> -    for (layer = 1; layer < network->layers_num; ++layer){
> -        switch (network->layers[layer].type){
> -        case CONV:
> -            conv_params = (ConvolutionalParams *)network->layers[layer].params;
> -            if (conv_params->input_num != cur_channels){
> -                return DNN_ERROR;
> -            }
> -            cur_channels = conv_params->output_num;
> -
> -            if (conv_params->padding_method == VALID) {
> -                int pad_size = (conv_params->kernel_size - 1) * conv_params->dilation;
> -                cur_height -= pad_size;
> -                cur_width -= pad_size;
> -            }
> -            break;
> -        case DEPTH_TO_SPACE:
> -            depth_to_space_params = (DepthToSpaceParams *)network->layers[layer].params;
> -            if (cur_channels % (depth_to_space_params->block_size * depth_to_space_params->block_size) != 0){
> -                return DNN_ERROR;
> -            }
> -            cur_channels = cur_channels / (depth_to_space_params->block_size * depth_to_space_params->block_size);
> -            cur_height *= depth_to_space_params->block_size;
> -            cur_width *= depth_to_space_params->block_size;
> -            break;
> -        case MIRROR_PAD:
> -            pad_params = (LayerPadParams *)network->layers[layer].params;
> -            cur_height = cur_height + pad_params->paddings[1][0] + pad_params->paddings[1][1];
> -            cur_width = cur_width + pad_params->paddings[2][0] + pad_params->paddings[2][1];
> -            cur_channels = cur_channels + pad_params->paddings[3][0] + pad_params->paddings[3][1];
> -            break;
> -        default:
> -            return DNN_ERROR;
> -        }
> -        if (network->layers[layer].output){
> -            av_freep(&network->layers[layer].output);
> -        }
> -
> -        if (cur_height <= 0 || cur_width <= 0)
> -            return DNN_ERROR;
>
> -        network->layers[layer].output = av_malloc(cur_height * cur_width * cur_channels * sizeof(float));
> -        if (!network->layers[layer].output){
> -            return DNN_ERROR;
> -        }
> -    }
> +    av_assert0(input->dt == DNN_FLOAT);
> +
> +    /**
> +     * as the first step, suppose network->operands[0] is the input operand.
> +     */
> +    network->operands[0].dims[0] = 1;
> +    network->operands[0].dims[1] = input->height;
> +    network->operands[0].dims[2] = input->width;
> +    network->operands[0].dims[3] = input->channels;
> +    network->operands[0].type = DOT_INPUT;
> +    network->operands[0].data_type = DNN_FLOAT;
> +    network->operands[0].isNHWC = 1;
> +
> +    av_freep(&network->operands[0].data);
> +    network->operands[0].length = calculate_operand_data_length(&network->operands[0]);
> +    network->operands[0].data = av_malloc(network->operands[0].length);
> +    if (!network->operands[0].data)
> +        return DNN_ERROR;
>
> +    input->data = network->operands[0].data;
>      return DNN_SUCCESS;
>  }
>
> @@ -119,6 +72,7 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename)
>      ConvolutionalParams *conv_params;
>      DepthToSpaceParams *depth_to_space_params;
>      LayerPadParams *pad_params;
> +    int32_t operand_index = 0;
>
>      model = av_malloc(sizeof(DNNModel));
>      if (!model){
> @@ -131,7 +85,7 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename)
>      }
>      file_size = avio_size(model_file_context);
>
> -    network = av_malloc(sizeof(ConvolutionalNetwork));
> +    network = av_mallocz(sizeof(ConvolutionalNetwork));
>      if (!network){
>          avio_closep(&model_file_context);
>          av_freep(&model);
> @@ -139,32 +93,33 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename)
>      }
>      model->model = (void *)network;
>
> -    network->layers_num = 1 + (int32_t)avio_rl32(model_file_context);
> +    network->layers_num = (int32_t)avio_rl32(model_file_context);
>      dnn_size = 4;
>
> -    network->layers = av_malloc(network->layers_num * sizeof(Layer));
> +    network->layers = av_mallocz(network->layers_num * sizeof(Layer));
>      if (!network->layers){
> -        av_freep(&network);
>          avio_closep(&model_file_context);
> -        av_freep(&model);
> +        ff_dnn_free_model_native(&model);
>          return NULL;
>      }
>
> -    for (layer = 0; layer < network->layers_num; ++layer){
> -        network->layers[layer].output = NULL;
> -        network->layers[layer].params = NULL;
> -    }
> -    network->layers[0].type = INPUT;
> -    network->layers[0].params = av_malloc(sizeof(InputParams));
> -    if (!network->layers[0].params){
> +    /**
> +     * Operands should be read from model file, the whole change will be huge.
> +     * to make things step by step, we first mock the operands, instead of reading from model file.
> +     */
> +    network->operands_num = network->layers_num + 1;
> +    network->operands = av_mallocz(network->operands_num * sizeof(DnnOperand));
> +    if (!network->operands){
>          avio_closep(&model_file_context);
>          ff_dnn_free_model_native(&model);
>          return NULL;
>      }
>
> -    for (layer = 1; layer < network->layers_num; ++layer){
> +    for (layer = 0; layer < network->layers_num; ++layer){
>          layer_type = (int32_t)avio_rl32(model_file_context);
>          dnn_size += 4;
> +        network->layers[layer].input_operand_indexes[0] = operand_index++;
> +        network->layers[layer].output_operand_index = operand_index;
>          switch (layer_type){
>          case CONV:
>              conv_params = av_malloc(sizeof(ConvolutionalParams));
> @@ -258,14 +213,35 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename)
>
>  #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
>
> -static void convolve(const float *input, float *output, const ConvolutionalParams *conv_params, int width, int height)
> +static int convolve(DnnOperand *operands, const int32_t *input_operand_indexes, int32_t output_operand_index, const ConvolutionalParams *conv_params)
>  {
> +    float *output;
> +    int32_t input_operand_index = input_operand_indexes[0];
> +    int number = operands[input_operand_index].dims[0];
> +    int height = operands[input_operand_index].dims[1];
> +    int width = operands[input_operand_index].dims[2];
> +    int channel = operands[input_operand_index].dims[3];
> +    const float *input = operands[input_operand_index].data;
> +
>      int radius = conv_params->kernel_size >> 1;
>      int src_linesize = width * conv_params->input_num;
>      int filter_linesize = conv_params->kernel_size * conv_params->input_num;
>      int filter_size = conv_params->kernel_size * filter_linesize;
>      int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
>
> +    DnnOperand *output_operand = &operands[output_operand_index];
> +    output_operand->dims[0] = number;
> +    output_operand->dims[1] = height - pad_size * 2;
> +    output_operand->dims[2] = width - pad_size * 2;
> +    output_operand->dims[3] = conv_params->output_num;
> +    output_operand->length = calculate_operand_data_length(output_operand);
> +    output_operand->data = av_realloc(output_operand->data, output_operand->length);
> +    if (!output_operand->data)
> +        return -1;
> +    output = output_operand->data;
> +
> +    av_assert0(channel == conv_params->input_num);
> +
>      for (int y = pad_size; y < height - pad_size; ++y) {
>          for (int x = pad_size; x < width - pad_size; ++x) {
>              for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) {
> @@ -311,16 +287,36 @@ static void convolve(const float *input, float *output, const ConvolutionalParam
>              output += conv_params->output_num;
>          }
>      }
> +    return 0;
>  }
>
> -static void depth_to_space(const float *input, float *output, int block_size, int width, int height, int channels)
> +static int depth_to_space(DnnOperand *operands, const int32_t *input_operand_indexes, int32_t output_operand_index, int block_size)
>  {
> +    float *output;
> +    int32_t input_operand_index = input_operand_indexes[0];
> +    int number = operands[input_operand_index].dims[0];
> +    int height = operands[input_operand_index].dims[1];
> +    int width = operands[input_operand_index].dims[2];
> +    int channels = operands[input_operand_index].dims[3];
> +    const float *input = operands[input_operand_index].data;
> +
>      int y, x, by, bx, ch;
>      int new_channels = channels / (block_size * block_size);
>      int output_linesize = width * channels;
>      int by_linesize = output_linesize / block_size;
>      int x_linesize = new_channels * block_size;
>
> +    DnnOperand *output_operand = &operands[output_operand_index];
> +    output_operand->dims[0] = number;
> +    output_operand->dims[1] = height * block_size;
> +    output_operand->dims[2] = width * block_size;
> +    output_operand->dims[3] = new_channels;
> +    output_operand->length = calculate_operand_data_length(output_operand);
> +    output_operand->data = av_realloc(output_operand->data, output_operand->length);
> +    if (!output_operand->data)
> +        return -1;
> +    output = output_operand->data;
> +
>      for (y = 0; y < height; ++y){
>          for (x = 0; x < width; ++x){
>              for (by = 0; by < block_size; ++by){
> @@ -334,58 +330,38 @@ static void depth_to_space(const float *input, float *output, int block_size, in
>          }
>          output += output_linesize;
>      }
> +    return 0;
>  }
>
>  DNNReturnType ff_dnn_execute_model_native(const DNNModel *model, DNNData *outputs, uint32_t nb_output)
>  {
>      ConvolutionalNetwork *network = (ConvolutionalNetwork *)model->model;
> -    int cur_width, cur_height, cur_channels;
>      int32_t layer;
> -    InputParams *input_params;
>      ConvolutionalParams *conv_params;
>      DepthToSpaceParams *depth_to_space_params;
>      LayerPadParams *pad_params;
>
> -    if (network->layers_num <= 0 || network->layers[0].type != INPUT || !network->layers[0].output){
> +    if (network->layers_num <= 0 || network->operands_num <= 0)
> +        return DNN_ERROR;
> +    if (!network->operands[0].data)
>          return DNN_ERROR;
> -    }
> -    else{
> -        input_params = (InputParams *)network->layers[0].params;
> -        cur_width = input_params->width;
> -        cur_height = input_params->height;
> -        cur_channels = input_params->channels;
> -    }
>
> -    for (layer = 1; layer < network->layers_num; ++layer){
> -        if (!network->layers[layer].output){
> -            return DNN_ERROR;
> -        }
> +    for (layer = 0; layer < network->layers_num; ++layer){
>          switch (network->layers[layer].type){
>          case CONV:
>              conv_params = (ConvolutionalParams *)network->layers[layer].params;
> -            convolve(network->layers[layer - 1].output, network->layers[layer].output, conv_params, cur_width, cur_height);
> -            cur_channels = conv_params->output_num;
> -            if (conv_params->padding_method == VALID) {
> -                int pad_size = (conv_params->kernel_size - 1) * conv_params->dilation;
> -                cur_height -= pad_size;
> -                cur_width -= pad_size;
> -            }
> +            convolve(network->operands, network->layers[layer].input_operand_indexes,
> +                     network->layers[layer].output_operand_index, conv_params);
>              break;
>          case DEPTH_TO_SPACE:
>              depth_to_space_params = (DepthToSpaceParams *)network->layers[layer].params;
> -            depth_to_space(network->layers[layer - 1].output, network->layers[layer].output,
> -                           depth_to_space_params->block_size, cur_width, cur_height, cur_channels);
> -            cur_height *= depth_to_space_params->block_size;
> -            cur_width *= depth_to_space_params->block_size;
> -            cur_channels /= depth_to_space_params->block_size * depth_to_space_params->block_size;
> +            depth_to_space(network->operands, network->layers[layer].input_operand_indexes,
> +                           network->layers[layer].output_operand_index, depth_to_space_params->block_size);
>              break;
>          case MIRROR_PAD:
>              pad_params = (LayerPadParams *)network->layers[layer].params;
> -            dnn_execute_layer_pad(network->layers[layer - 1].output, network->layers[layer].output,
> -                                  pad_params, 1, cur_height, cur_width, cur_channels);
> -            cur_height = cur_height + pad_params->paddings[1][0] + pad_params->paddings[1][1];
> -            cur_width = cur_width + pad_params->paddings[2][0] + pad_params->paddings[2][1];
> -            cur_channels = cur_channels + pad_params->paddings[3][0] + pad_params->paddings[3][1];
> +            dnn_execute_layer_pad(network->operands, network->layers[layer].input_operand_indexes,
> +                                  network->layers[layer].output_operand_index, pad_params);
>              break;
>          case INPUT:
>              return DNN_ERROR;
> @@ -395,14 +371,24 @@ DNNReturnType ff_dnn_execute_model_native(const DNNModel *model, DNNData *output
>      // native mode does not support multiple outputs yet
>      if (nb_output > 1)
>          return DNN_ERROR;
> -    outputs[0].data = network->layers[network->layers_num - 1].output;
> -    outputs[0].height = cur_height;
> -    outputs[0].width = cur_width;
> -    outputs[0].channels = cur_channels;
> +
> +    /**
> +     * as the first step, suppose network->operands[network->operands_num - 1] is the output operand.
> +     */
> +    outputs[0].data = network->operands[network->operands_num - 1].data;
> +    outputs[0].height = network->operands[network->operands_num - 1].dims[1];
> +    outputs[0].width = network->operands[network->operands_num - 1].dims[2];
> +    outputs[0].channels = network->operands[network->operands_num - 1].dims[3];
>
>      return DNN_SUCCESS;
>  }
>
> +int32_t calculate_operand_data_length(DnnOperand* operand)
> +{
> +    //av_assert0(operand->data_type == DNN_FLOAT);
Please remove the whole commented line or uncomment the assert if it
is relevant.

> +    return operand->dims[0] * operand->dims[1] * operand->dims[2] * operand->dims[3] * sizeof(float);
> +}
> +
>  void ff_dnn_free_model_native(DNNModel **model)
>  {
>      ConvolutionalNetwork *network;
> @@ -413,7 +399,6 @@ void ff_dnn_free_model_native(DNNModel **model)
>      {
>          network = (ConvolutionalNetwork *)(*model)->model;
>          for (layer = 0; layer < network->layers_num; ++layer){
> -            av_freep(&network->layers[layer].output);
>              if (network->layers[layer].type == CONV){
>                  conv_params = (ConvolutionalParams *)network->layers[layer].params;
>                  av_freep(&conv_params->kernel);
> @@ -422,6 +407,11 @@ void ff_dnn_free_model_native(DNNModel **model)
>              av_freep(&network->layers[layer].params);
>          }
>          av_freep(&network->layers);
> +
> +        for (uint32_t operand = 0; operand < network->operands_num; ++operand)
> +            av_freep(&network->operands[operand].data);
> +        av_freep(&network->operands);
> +
>          av_freep(&network);
>          av_freep(model);
>      }
> diff --git a/libavfilter/dnn/dnn_backend_native.h b/libavfilter/dnn/dnn_backend_native.h
> index b6f9533..d7737ac 100644
> --- a/libavfilter/dnn/dnn_backend_native.h
> +++ b/libavfilter/dnn/dnn_backend_native.h
> @@ -36,12 +36,60 @@ typedef enum {RELU, TANH, SIGMOID, NONE, LEAKY_RELU} DNNActivationFunc;
>
>  typedef enum {VALID, SAME, SAME_CLAMP_TO_EDGE} DNNConvPaddingParam;
>
> +typedef enum {DOT_INPUT, DOT_INTERMEDIATE, DOT_OUTPUT} DNNOperandType;
> +
>  typedef struct Layer{
>      DNNLayerType type;
> -    float *output;
> +    /**
> +     * a layer can have multiple inputs and one output.
> +     * 4 is just a big enough number for input operands (increase it if necessary),
> +     * do not use 'int32_t *input_operand_indexes', so we don't worry about mem leaks.
> +     */
> +    int32_t input_operand_indexes[4];
> +    int32_t output_operand_index;
>      void *params;
>  } Layer;
>
> +typedef struct DnnOperand{
> +    /**
> +     * there are two memory layouts, NHWC or NCHW, so we use dims,
> +     * dims[0] is Number.
> +     */
> +    int32_t dims[4];
> +
> +    /**
> +     * input/output/intermediate operand of the network
> +     */
> +    DNNOperandType type;
> +
> +    /**
> +     * support different kinds of data type such as float, half float, int8 etc,
> +     * first support float now.
> +     */
> +    DNNDataType data_type;
> +
> +    /**
> +     * NHWC if 1, otherwise NCHW.
> +     * let's first support NHWC only, this flag is for extensive usage.
> +     */
> +    int8_t isNHWC;
> +
> +    /**
> +     * to avoid possible memory leak, do not use char *name
> +     */
> +    char name[512];
512 bytes seems a bit too large for a name, but if you think it is
really necessary I'm ok with it.

> +
> +    /**
> +     * data pointer with data length in bytes.
> +     * usedNumbersLeft is only valid for intermediate operand,
> +     * it means how many layers still depend on this operand,
> +     * todo: the memory can be reused when usedNumbersLeft is zero.
> +     */
> +    void *data;
> +    int32_t length;
> +    int32_t usedNumbersLeft;
> +}DnnOperand;
> +
>  typedef struct ConvolutionalParams{
>      int32_t input_num, output_num, kernel_size;
>      DNNActivationFunc activation;
> @@ -63,6 +111,8 @@ typedef struct DepthToSpaceParams{
>  typedef struct ConvolutionalNetwork{
>      Layer *layers;
>      int32_t layers_num;
> +    DnnOperand *operands;
> +    int32_t operands_num;
>  } ConvolutionalNetwork;
>
>  DNNModel *ff_dnn_load_model_native(const char *model_filename);
> @@ -71,4 +121,6 @@ DNNReturnType ff_dnn_execute_model_native(const DNNModel *model, DNNData *output
>
>  void ff_dnn_free_model_native(DNNModel **model);
>
> +int32_t calculate_operand_data_length(DnnOperand *operand);
> +
>  #endif
> diff --git a/libavfilter/dnn/dnn_backend_native_layer_pad.c b/libavfilter/dnn/dnn_backend_native_layer_pad.c
> index 5417d73..c2905a7 100644
> --- a/libavfilter/dnn/dnn_backend_native_layer_pad.c
> +++ b/libavfilter/dnn/dnn_backend_native_layer_pad.c
> @@ -48,12 +48,21 @@ static int after_get_buddy(int given, int border, LayerPadModeParam mode)
>      }
>  }
>
> -void dnn_execute_layer_pad(const float *input, float *output, const LayerPadParams *params, int number, int height, int width, int channel)
> +int dnn_execute_layer_pad(DnnOperand *operands, const int32_t *input_operand_indexes, int32_t output_operand_index,
> +                           const LayerPadParams *params)
>  {
>      int32_t before_paddings;
>      int32_t after_paddings;
> +    float* output;
>
>      // suppose format is <N, H, W, C>
> +    int32_t input_operand_index = input_operand_indexes[0];
> +    int number = operands[input_operand_index].dims[0];
> +    int height = operands[input_operand_index].dims[1];
> +    int width = operands[input_operand_index].dims[2];
> +    int channel = operands[input_operand_index].dims[3];
> +    const float *input = operands[input_operand_index].data;
> +
>      int new_number = number + params->paddings[0][0] + params->paddings[0][1];
>      int new_height = height + params->paddings[1][0] + params->paddings[1][1];
>      int new_width = width + params->paddings[2][0] + params->paddings[2][1];
> @@ -67,6 +76,17 @@ void dnn_execute_layer_pad(const float *input, float *output, const LayerPadPara
>      int new_wc_stride = new_c_stride * new_width;
>      int new_hwc_stride = new_wc_stride * new_height;
>
> +    DnnOperand *output_operand = &operands[output_operand_index];
> +    output_operand->dims[0] = new_number;
> +    output_operand->dims[1] = new_height;
> +    output_operand->dims[2] = new_width;
> +    output_operand->dims[3] = new_channel;
> +    output_operand->length = calculate_operand_data_length(output_operand);
> +    output_operand->data = av_realloc(output_operand->data, output_operand->length);
> +    if (!output_operand->data)
> +        return -1;
> +    output = output_operand->data;
> +
>      // copy the original data
>      for (int n = 0; n < number; n++) {
>          for (int h = 0; h < height; h++) {
> @@ -208,4 +228,6 @@ void dnn_execute_layer_pad(const float *input, float *output, const LayerPadPara
>              }
>          }
>      }
> +
> +    return 0;
>  }
> diff --git a/libavfilter/dnn/dnn_backend_native_layer_pad.h b/libavfilter/dnn/dnn_backend_native_layer_pad.h
> index 0fbe652..7cc8213 100644
> --- a/libavfilter/dnn/dnn_backend_native_layer_pad.h
> +++ b/libavfilter/dnn/dnn_backend_native_layer_pad.h
> @@ -26,6 +26,7 @@
>  #define AVFILTER_DNN_DNN_BACKEND_NATIVE_LAYER_PAD_H
>
>  #include <stdint.h>
> +#include "dnn_backend_native.h"
>
>  typedef enum {LPMP_CONSTANT, LPMP_REFLECT, LPMP_SYMMETRIC} LayerPadModeParam;
>
> @@ -35,6 +36,7 @@ typedef struct LayerPadParams{
>      float constant_values;
>  } LayerPadParams;
>
> -void dnn_execute_layer_pad(const float *input, float *output, const LayerPadParams *params, int number, int height, int width, int channel);
> +int dnn_execute_layer_pad(DnnOperand *operands, const int32_t *input_operand_indexes, int32_t output_operand_index,
> +                           const LayerPadParams *params);
>
>  #endif
> diff --git a/tests/dnn/Makefile b/tests/dnn/Makefile
> index 0e050ea..fabed75 100644
> --- a/tests/dnn/Makefile
> +++ b/tests/dnn/Makefile
> @@ -5,7 +5,7 @@ DNNTESTPROGS := $(DNNTESTPROGS:%=$(DNNTESTSDIR)/%-test$(EXESUF))
>  -include $(wildcard $(DNNTESTOBJS:.o=.d))
>
>  $(DNNTESTPROGS): %$(EXESUF): %.o $(FF_STATIC_DEP_LIBS)
> -       $(LD) $(LDFLAGS) $(LDEXEFLAGS) $(LD_O) $(filter %.o,$^) $(FF_STATIC_DEP_LIBS) $(ELIBS)
> +       $(LD) $(LDFLAGS) $(LDEXEFLAGS) $(LD_O) $(filter %.o,$^) $(FF_STATIC_DEP_LIBS) $(EXTRALIBS-avcodec) $(EXTRALIBS-avfilter) $(EXTRALIBS-avformat) $(EXTRALIBS-avutil) $(EXTRALIBS-swresample) $(EXTRALIBS)
>
>  testclean::
>         $(RM) $(addprefix $(DNNTESTSDIR)/,$(CLEANSUFFIXES) *-test$(EXESUF))
> diff --git a/tests/dnn/dnn-layer-pad-test.c b/tests/dnn/dnn-layer-pad-test.c
> index 28a49eb..1fb2be1 100644
> --- a/tests/dnn/dnn-layer-pad-test.c
> +++ b/tests/dnn/dnn-layer-pad-test.c
> @@ -44,6 +44,8 @@ static int test_with_mode_symmetric(void)
>      */
>
>      LayerPadParams params;
> +    DnnOperand operands[2];
> +    int32_t input_indexes[1];
>      float input[1*4*4*3] = {
>          0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
>      };
> @@ -57,8 +59,7 @@ static int test_with_mode_symmetric(void)
>          27.0, 28.0, 29.0, 24.0, 25.0, 26.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 33.0, 34.0, 35.0, 30.0, 31.0, 32.0, 18.0, 19.0, 20.0, 15.0, 16.0, 17.0, 12.0,
>          13.0, 14.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 21.0, 22.0, 23.0, 18.0, 19.0, 20.0
>      };
> -    float output[1*9*9*3];
> -    memset(output, 0, sizeof(output));
> +    float *output;
>
>      params.mode = LPMP_SYMMETRIC;
>      params.paddings[0][0] = 0;
> @@ -70,15 +71,26 @@ static int test_with_mode_symmetric(void)
>      params.paddings[3][0] = 0;
>      params.paddings[3][1] = 0;
>
> -    dnn_execute_layer_pad(input, output, &params, 1, 4, 4, 3);
> +    operands[0].data = input;
> +    operands[0].dims[0] = 1;
> +    operands[0].dims[1] = 4;
> +    operands[0].dims[2] = 4;
> +    operands[0].dims[3] = 3;
> +    operands[1].data = NULL;
>
> -    for (int i = 0; i < sizeof(output) / sizeof(float); i++) {
> +    input_indexes[0] = 0;
> +    dnn_execute_layer_pad(operands, input_indexes, 1, &params);
> +
> +    output = operands[1].data;
> +    for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
>          if (fabs(output[i] - expected_output[i]) > EPSON) {
>              printf("at index %d, output: %f, expected_output: %f\n", i, output[i], expected_output[i]);
> +            av_freep(&output);
>              return 1;
>          }
>      }
>
> +    av_freep(&output);
>      return 0;
>
>  }
> @@ -102,6 +114,8 @@ static int test_with_mode_reflect(void)
>      */
>
>      LayerPadParams params;
> +    DnnOperand operands[2];
> +    int32_t input_indexes[1];
>      float input[3*2*2*3] = {
>          0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35
>      };
> @@ -110,8 +124,7 @@ static int test_with_mode_reflect(void)
>          12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0,
>          35.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0
>      };
> -    float output[6*2*2*3];
> -    memset(output, 0, sizeof(output));
> +    float *output;
>
>      params.mode = LPMP_REFLECT;
>      params.paddings[0][0] = 1;
> @@ -123,15 +136,26 @@ static int test_with_mode_reflect(void)
>      params.paddings[3][0] = 0;
>      params.paddings[3][1] = 0;
>
> -    dnn_execute_layer_pad(input, output, &params, 3, 2, 2, 3);
> +    operands[0].data = input;
> +    operands[0].dims[0] = 3;
> +    operands[0].dims[1] = 2;
> +    operands[0].dims[2] = 2;
> +    operands[0].dims[3] = 3;
> +    operands[1].data = NULL;
> +
> +    input_indexes[0] = 0;
> +    dnn_execute_layer_pad(operands, input_indexes, 1, &params);
>
> -    for (int i = 0; i < sizeof(output) / sizeof(float); i++) {
> +    output = operands[1].data;
> +    for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
>          if (fabs(output[i] - expected_output[i]) > EPSON) {
>              printf("at index %d, output: %f, expected_output: %f\n", i, output[i], expected_output[i]);
> +            av_freep(&output);
>              return 1;
>          }
>      }
>
> +    av_freep(&output);
>      return 0;
>
>  }
> @@ -155,6 +179,8 @@ static int test_with_mode_constant(void)
>      */
>
>      LayerPadParams params;
> +    DnnOperand operands[2];
> +    int32_t input_indexes[1];
>      float input[1*2*2*3] = {
>          0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
>      };
> @@ -163,8 +189,7 @@ static int test_with_mode_constant(void)
>          728.0, 728.0, 0.0, 1.0, 2.0, 728.0, 728.0, 728.0, 3.0, 4.0, 5.0, 728.0, 728.0,
>          728.0, 6.0, 7.0, 8.0, 728.0, 728.0, 728.0, 9.0, 10.0, 11.0, 728.0, 728.0
>      };
> -    float output[1*3*2*6];
> -    memset(output, 0, sizeof(output));
> +    float *output;
>
>      params.mode = LPMP_CONSTANT;
>      params.constant_values = 728;
> @@ -177,15 +202,26 @@ static int test_with_mode_constant(void)
>      params.paddings[3][0] = 1;
>      params.paddings[3][1] = 2;
>
> -    dnn_execute_layer_pad(input, output, &params, 1, 2, 2, 3);
> +    operands[0].data = input;
> +    operands[0].dims[0] = 3;
> +    operands[0].dims[1] = 2;
> +    operands[0].dims[2] = 2;
> +    operands[0].dims[3] = 3;
> +    operands[1].data = NULL;
> +
> +    input_indexes[0] = 0;
> +    dnn_execute_layer_pad(operands, input_indexes, 1, &params);
>
> -    for (int i = 0; i < sizeof(output) / sizeof(float); i++) {
> +    output = operands[1].data;
> +    for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
>          if (fabs(output[i] - expected_output[i]) > EPSON) {
>              printf("at index %d, output: %f, expected_output: %f\n", i, output[i], expected_output[i]);
> +            av_freep(&output);
>              return 1;
>          }
>      }
>
> +    av_freep(&output);
>      return 0;
>
>  }
> --
> 2.7.4
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
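
For reference, a minimal standalone sketch of what the helper could look like with the data-type check kept as an active assertion, roughly the direction the comment above suggests; it uses a plain C assert() and simplified stand-in types rather than FFmpeg's av_assert0() and DNNDataType, and is not part of the submitted patch:

/* Illustrative only; simplified stand-ins for DnnOperand and DNNDataType. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef enum { TOY_FLOAT, TOY_UINT8 } ToyDataType;

typedef struct {
    int32_t dims[4];
    ToyDataType data_type;
} ToyOperand;

static int32_t toy_calculate_operand_data_length(const ToyOperand *operand)
{
    /* only float data is supported for now, so assert instead of
     * silently computing a wrong byte count for other types */
    assert(operand->data_type == TOY_FLOAT);
    return operand->dims[0] * operand->dims[1] * operand->dims[2] *
           operand->dims[3] * (int32_t)sizeof(float);
}

int main(void)
{
    ToyOperand op = { {1, 4, 4, 3}, TOY_FLOAT };
    printf("%d bytes\n", toy_calculate_operand_data_length(&op));  /* prints 192 */
    return 0;
}
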
Guo, Yejun Aug. 29, 2019, 3:11 a.m. UTC | #2
> -----Original Message-----
> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of Pedro Arthur
> Sent: Tuesday, August 27, 2019 10:24 PM
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH 1/3] dnn: introduce dnn operand (in c code) to hold operand infos within network
>
> Hi,
>
> Em ter, 20 de ago de 2019 às 05:54, Guo, Yejun <yejun.guo@intel.com> escreveu:
> >
> > the info can be saved in dnn operand object without regenerating again and again,
> > and it is also needed for layer split/merge, and for memory reuse.
> >
> > to make things step by step, this patch just focuses on c code,
> > the change within python script will be added later.
> >
> > Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> > ---
> >  libavfilter/dnn/dnn_backend_native.c           | 226 ++++++++++++-------------
> >  libavfilter/dnn/dnn_backend_native.h           |  54 +++++-
> >  libavfilter/dnn/dnn_backend_native_layer_pad.c |  24 ++-
> >  libavfilter/dnn/dnn_backend_native_layer_pad.h |   4 +-
> >  tests/dnn/Makefile                             |   2 +-
> >  tests/dnn/dnn-layer-pad-test.c                 |  60 +++++--
> >  6 files changed, 236 insertions(+), 134 deletions(-)
> >
> > diff --git a/libavfilter/dnn/dnn_backend_native.c b/libavfilter/dnn/dnn_backend_native.c
> > index d52abc6..78227a5 100644
> > --- a/libavfilter/dnn/dnn_backend_native.c
> > +++ b/libavfilter/dnn/dnn_backend_native.c
> >
> > +int32_t calculate_operand_data_length(DnnOperand* operand)
> > +{
> > +    //av_assert0(operand->data_type == DNN_FLOAT);
> Please remove the whole commented line or uncomment the assert if it
> is relevant.

thanks, I'll remove it and add comments.

>
> > +    return operand->dims[0] * operand->dims[1] * operand->dims[2] * operand->dims[3] * sizeof(float);
> > +}
> > +
> > +
> > +    /**
> > +     * to avoid possible memory leak, do not use char *name
> > +     */
> > +    char name[512];
> 512 bytes seems a bit too large for a name, but if you think it is
> really necessary I'm ok with it.

good point, I'll change to 128.
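
The DnnOperand struct introduced below reserves a usedNumbersLeft field, with a todo note that an operand's memory can be reused once no layer depends on it any more. A minimal standalone sketch of how such consumer counting could work, purely as an illustration of that todo (simplified types, not code from this patch):

/* Illustrative only; simplified stand-in for the DnnOperand refcount idea. */
#include <stdio.h>
#include <stdlib.h>

typedef struct {
    float *data;
    size_t length;
    int used_numbers_left;   /* how many layers still need to read this buffer */
} ToyOperand;

/* called once per layer after it has consumed one of its inputs;
 * frees the buffer as soon as no remaining layer depends on it */
static void toy_operand_release(ToyOperand *op)
{
    if (op->used_numbers_left > 0 && --op->used_numbers_left == 0) {
        free(op->data);
        op->data = NULL;
        op->length = 0;
    }
}

int main(void)
{
    ToyOperand intermediate = { malloc(48 * sizeof(float)), 48 * sizeof(float), 2 };

    if (!intermediate.data)
        return 1;

    toy_operand_release(&intermediate);   /* first consumer done, buffer kept */
    printf("after 1st consumer: data %s\n", intermediate.data ? "kept" : "freed");
    toy_operand_release(&intermediate);   /* last consumer done, buffer freed */
    printf("after 2nd consumer: data %s\n", intermediate.data ? "kept" : "freed");
    return 0;
}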

Patch

diff --git a/libavfilter/dnn/dnn_backend_native.c b/libavfilter/dnn/dnn_backend_native.c
index d52abc6..78227a5 100644
--- a/libavfilter/dnn/dnn_backend_native.c
+++ b/libavfilter/dnn/dnn_backend_native.c
@@ -30,77 +30,30 @@ 
 static DNNReturnType set_input_output_native(void *model, DNNInputData *input, const char *input_name, const char **output_names, uint32_t nb_output)
 {
     ConvolutionalNetwork *network = (ConvolutionalNetwork *)model;
-    InputParams *input_params;
-    ConvolutionalParams *conv_params;
-    DepthToSpaceParams *depth_to_space_params;
-    LayerPadParams *pad_params;
-    int cur_width, cur_height, cur_channels;
-    int32_t layer;
 
-    if (network->layers_num <= 0 || network->layers[0].type != INPUT){
+    if (network->layers_num <= 0 || network->operands_num <= 0)
         return DNN_ERROR;
-    }
-    else{
-        input_params = (InputParams *)network->layers[0].params;
-        input_params->width = cur_width = input->width;
-        input_params->height = cur_height = input->height;
-        input_params->channels = cur_channels = input->channels;
-        if (input->data){
-            av_freep(&input->data);
-        }
-        av_assert0(input->dt == DNN_FLOAT);
-        network->layers[0].output = input->data = av_malloc(cur_height * cur_width * cur_channels * sizeof(float));
-        if (!network->layers[0].output){
-            return DNN_ERROR;
-        }
-    }
-
-    for (layer = 1; layer < network->layers_num; ++layer){
-        switch (network->layers[layer].type){
-        case CONV:
-            conv_params = (ConvolutionalParams *)network->layers[layer].params;
-            if (conv_params->input_num != cur_channels){
-                return DNN_ERROR;
-            }
-            cur_channels = conv_params->output_num;
-
-            if (conv_params->padding_method == VALID) {
-                int pad_size = (conv_params->kernel_size - 1) * conv_params->dilation;
-                cur_height -= pad_size;
-                cur_width -= pad_size;
-            }
-            break;
-        case DEPTH_TO_SPACE:
-            depth_to_space_params = (DepthToSpaceParams *)network->layers[layer].params;
-            if (cur_channels % (depth_to_space_params->block_size * depth_to_space_params->block_size) != 0){
-                return DNN_ERROR;
-            }
-            cur_channels = cur_channels / (depth_to_space_params->block_size * depth_to_space_params->block_size);
-            cur_height *= depth_to_space_params->block_size;
-            cur_width *= depth_to_space_params->block_size;
-            break;
-        case MIRROR_PAD:
-            pad_params = (LayerPadParams *)network->layers[layer].params;
-            cur_height = cur_height + pad_params->paddings[1][0] + pad_params->paddings[1][1];
-            cur_width = cur_width + pad_params->paddings[2][0] + pad_params->paddings[2][1];
-            cur_channels = cur_channels + pad_params->paddings[3][0] + pad_params->paddings[3][1];
-            break;
-        default:
-            return DNN_ERROR;
-        }
-        if (network->layers[layer].output){
-            av_freep(&network->layers[layer].output);
-        }
-
-        if (cur_height <= 0 || cur_width <= 0)
-            return DNN_ERROR;
 
-        network->layers[layer].output = av_malloc(cur_height * cur_width * cur_channels * sizeof(float));
-        if (!network->layers[layer].output){
-            return DNN_ERROR;
-        }
-    }
+    av_assert0(input->dt == DNN_FLOAT);
+
+    /**
+     * as the first step, suppose network->operands[0] is the input operand.
+     */
+    network->operands[0].dims[0] = 1;
+    network->operands[0].dims[1] = input->height;
+    network->operands[0].dims[2] = input->width;
+    network->operands[0].dims[3] = input->channels;
+    network->operands[0].type = DOT_INPUT;
+    network->operands[0].data_type = DNN_FLOAT;
+    network->operands[0].isNHWC = 1;
+
+    av_freep(&network->operands[0].data);
+    network->operands[0].length = calculate_operand_data_length(&network->operands[0]);
+    network->operands[0].data = av_malloc(network->operands[0].length);
+    if (!network->operands[0].data)
+        return DNN_ERROR;
 
+    input->data = network->operands[0].data;
     return DNN_SUCCESS;
 }
 
@@ -119,6 +72,7 @@  DNNModel *ff_dnn_load_model_native(const char *model_filename)
     ConvolutionalParams *conv_params;
     DepthToSpaceParams *depth_to_space_params;
     LayerPadParams *pad_params;
+    int32_t operand_index = 0;
 
     model = av_malloc(sizeof(DNNModel));
     if (!model){
@@ -131,7 +85,7 @@  DNNModel *ff_dnn_load_model_native(const char *model_filename)
     }
     file_size = avio_size(model_file_context);
 
-    network = av_malloc(sizeof(ConvolutionalNetwork));
+    network = av_mallocz(sizeof(ConvolutionalNetwork));
     if (!network){
         avio_closep(&model_file_context);
         av_freep(&model);
@@ -139,32 +93,33 @@  DNNModel *ff_dnn_load_model_native(const char *model_filename)
     }
     model->model = (void *)network;
 
-    network->layers_num = 1 + (int32_t)avio_rl32(model_file_context);
+    network->layers_num = (int32_t)avio_rl32(model_file_context);
     dnn_size = 4;
 
-    network->layers = av_malloc(network->layers_num * sizeof(Layer));
+    network->layers = av_mallocz(network->layers_num * sizeof(Layer));
     if (!network->layers){
-        av_freep(&network);
         avio_closep(&model_file_context);
-        av_freep(&model);
+        ff_dnn_free_model_native(&model);
         return NULL;
     }
 
-    for (layer = 0; layer < network->layers_num; ++layer){
-        network->layers[layer].output = NULL;
-        network->layers[layer].params = NULL;
-    }
-    network->layers[0].type = INPUT;
-    network->layers[0].params = av_malloc(sizeof(InputParams));
-    if (!network->layers[0].params){
+    /**
+     * Operands should be read from model file, the whole change will be huge.
+     * to make things step by step, we first mock the operands, instead of reading from model file.
+     */
+    network->operands_num = network->layers_num + 1;
+    network->operands = av_mallocz(network->operands_num * sizeof(DnnOperand));
+    if (!network->operands){
         avio_closep(&model_file_context);
         ff_dnn_free_model_native(&model);
         return NULL;
     }
 
-    for (layer = 1; layer < network->layers_num; ++layer){
+    for (layer = 0; layer < network->layers_num; ++layer){
         layer_type = (int32_t)avio_rl32(model_file_context);
         dnn_size += 4;
+        network->layers[layer].input_operand_indexes[0] = operand_index++;
+        network->layers[layer].output_operand_index = operand_index;
         switch (layer_type){
         case CONV:
             conv_params = av_malloc(sizeof(ConvolutionalParams));
@@ -258,14 +213,35 @@  DNNModel *ff_dnn_load_model_native(const char *model_filename)
 
 #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
 
-static void convolve(const float *input, float *output, const ConvolutionalParams *conv_params, int width, int height)
+static int convolve(DnnOperand *operands, const int32_t *input_operand_indexes, int32_t output_operand_index, const ConvolutionalParams *conv_params)
 {
+    float *output;
+    int32_t input_operand_index = input_operand_indexes[0];
+    int number = operands[input_operand_index].dims[0];
+    int height = operands[input_operand_index].dims[1];
+    int width = operands[input_operand_index].dims[2];
+    int channel = operands[input_operand_index].dims[3];
+    const float *input = operands[input_operand_index].data;
+
     int radius = conv_params->kernel_size >> 1;
     int src_linesize = width * conv_params->input_num;
     int filter_linesize = conv_params->kernel_size * conv_params->input_num;
     int filter_size = conv_params->kernel_size * filter_linesize;
     int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
 
+    DnnOperand *output_operand = &operands[output_operand_index];
+    output_operand->dims[0] = number;
+    output_operand->dims[1] = height - pad_size * 2;
+    output_operand->dims[2] = width - pad_size * 2;
+    output_operand->dims[3] = conv_params->output_num;
+    output_operand->length = calculate_operand_data_length(output_operand);
+    output_operand->data = av_realloc(output_operand->data, output_operand->length);
+    if (!output_operand->data)
+        return -1;
+    output = output_operand->data;
+
+    av_assert0(channel == conv_params->input_num);
+
     for (int y = pad_size; y < height - pad_size; ++y) {
         for (int x = pad_size; x < width - pad_size; ++x) {
             for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) {
@@ -311,16 +287,36 @@  static void convolve(const float *input, float *output, const ConvolutionalParam
             output += conv_params->output_num;
         }
     }
+    return 0;
 }
 
-static void depth_to_space(const float *input, float *output, int block_size, int width, int height, int channels)
+static int depth_to_space(DnnOperand *operands, const int32_t *input_operand_indexes, int32_t output_operand_index, int block_size)
 {
+    float *output;
+    int32_t input_operand_index = input_operand_indexes[0];
+    int number = operands[input_operand_index].dims[0];
+    int height = operands[input_operand_index].dims[1];
+    int width = operands[input_operand_index].dims[2];
+    int channels = operands[input_operand_index].dims[3];
+    const float *input = operands[input_operand_index].data;
+
     int y, x, by, bx, ch;
     int new_channels = channels / (block_size * block_size);
     int output_linesize = width * channels;
     int by_linesize = output_linesize / block_size;
     int x_linesize = new_channels * block_size;
 
+    DnnOperand *output_operand = &operands[output_operand_index];
+    output_operand->dims[0] = number;
+    output_operand->dims[1] = height * block_size;
+    output_operand->dims[2] = width * block_size;
+    output_operand->dims[3] = new_channels;
+    output_operand->length = calculate_operand_data_length(output_operand);
+    output_operand->data = av_realloc(output_operand->data, output_operand->length);
+    if (!output_operand->data)
+        return -1;
+    output = output_operand->data;
+
     for (y = 0; y < height; ++y){
         for (x = 0; x < width; ++x){
             for (by = 0; by < block_size; ++by){
@@ -334,58 +330,38 @@  static void depth_to_space(const float *input, float *output, int block_size, in
         }
         output += output_linesize;
     }
+    return 0;
 }
 
 DNNReturnType ff_dnn_execute_model_native(const DNNModel *model, DNNData *outputs, uint32_t nb_output)
 {
     ConvolutionalNetwork *network = (ConvolutionalNetwork *)model->model;
-    int cur_width, cur_height, cur_channels;
     int32_t layer;
-    InputParams *input_params;
     ConvolutionalParams *conv_params;
     DepthToSpaceParams *depth_to_space_params;
     LayerPadParams *pad_params;
 
-    if (network->layers_num <= 0 || network->layers[0].type != INPUT || !network->layers[0].output){
+    if (network->layers_num <= 0 || network->operands_num <= 0)
+        return DNN_ERROR;
+    if (!network->operands[0].data)
         return DNN_ERROR;
-    }
-    else{
-        input_params = (InputParams *)network->layers[0].params;
-        cur_width = input_params->width;
-        cur_height = input_params->height;
-        cur_channels = input_params->channels;
-    }
 
-    for (layer = 1; layer < network->layers_num; ++layer){
-        if (!network->layers[layer].output){
-            return DNN_ERROR;
-        }
+    for (layer = 0; layer < network->layers_num; ++layer){
         switch (network->layers[layer].type){
         case CONV:
             conv_params = (ConvolutionalParams *)network->layers[layer].params;
-            convolve(network->layers[layer - 1].output, network->layers[layer].output, conv_params, cur_width, cur_height);
-            cur_channels = conv_params->output_num;
-            if (conv_params->padding_method == VALID) {
-                int pad_size = (conv_params->kernel_size - 1) * conv_params->dilation;
-                cur_height -= pad_size;
-                cur_width -= pad_size;
-            }
+            convolve(network->operands, network->layers[layer].input_operand_indexes,
+                     network->layers[layer].output_operand_index, conv_params);
             break;
         case DEPTH_TO_SPACE:
             depth_to_space_params = (DepthToSpaceParams *)network->layers[layer].params;
-            depth_to_space(network->layers[layer - 1].output, network->layers[layer].output,
-                           depth_to_space_params->block_size, cur_width, cur_height, cur_channels);
-            cur_height *= depth_to_space_params->block_size;
-            cur_width *= depth_to_space_params->block_size;
-            cur_channels /= depth_to_space_params->block_size * depth_to_space_params->block_size;
+            depth_to_space(network->operands, network->layers[layer].input_operand_indexes,
+                           network->layers[layer].output_operand_index, depth_to_space_params->block_size);
             break;
         case MIRROR_PAD:
             pad_params = (LayerPadParams *)network->layers[layer].params;
-            dnn_execute_layer_pad(network->layers[layer - 1].output, network->layers[layer].output,
-                                  pad_params, 1, cur_height, cur_width, cur_channels);
-            cur_height = cur_height + pad_params->paddings[1][0] + pad_params->paddings[1][1];
-            cur_width = cur_width + pad_params->paddings[2][0] + pad_params->paddings[2][1];
-            cur_channels = cur_channels + pad_params->paddings[3][0] + pad_params->paddings[3][1];
+            dnn_execute_layer_pad(network->operands, network->layers[layer].input_operand_indexes,
+                                  network->layers[layer].output_operand_index, pad_params);
             break;
         case INPUT:
             return DNN_ERROR;
@@ -395,14 +371,24 @@  DNNReturnType ff_dnn_execute_model_native(const DNNModel *model, DNNData *output
     // native mode does not support multiple outputs yet
     if (nb_output > 1)
         return DNN_ERROR;
-    outputs[0].data = network->layers[network->layers_num - 1].output;
-    outputs[0].height = cur_height;
-    outputs[0].width = cur_width;
-    outputs[0].channels = cur_channels;
+
+    /**
+     * as the first step, suppose network->operands[network->operands_num - 1] is the output operand.
+     */
+    outputs[0].data = network->operands[network->operands_num - 1].data;
+    outputs[0].height = network->operands[network->operands_num - 1].dims[1];
+    outputs[0].width = network->operands[network->operands_num - 1].dims[2];
+    outputs[0].channels = network->operands[network->operands_num - 1].dims[3];
 
     return DNN_SUCCESS;
 }
 
+int32_t calculate_operand_data_length(DnnOperand* operand)
+{
+    //av_assert0(operand->data_type == DNN_FLOAT);
+    return operand->dims[0] * operand->dims[1] * operand->dims[2] * operand->dims[3] * sizeof(float);
+}
+
 void ff_dnn_free_model_native(DNNModel **model)
 {
     ConvolutionalNetwork *network;
@@ -413,7 +399,6 @@  void ff_dnn_free_model_native(DNNModel **model)
     {
         network = (ConvolutionalNetwork *)(*model)->model;
         for (layer = 0; layer < network->layers_num; ++layer){
-            av_freep(&network->layers[layer].output);
             if (network->layers[layer].type == CONV){
                 conv_params = (ConvolutionalParams *)network->layers[layer].params;
                 av_freep(&conv_params->kernel);
@@ -422,6 +407,11 @@  void ff_dnn_free_model_native(DNNModel **model)
             av_freep(&network->layers[layer].params);
         }
         av_freep(&network->layers);
+
+        for (uint32_t operand = 0; operand < network->operands_num; ++operand)
+            av_freep(&network->operands[operand].data);
+        av_freep(&network->operands);
+
         av_freep(&network);
         av_freep(model);
     }
diff --git a/libavfilter/dnn/dnn_backend_native.h b/libavfilter/dnn/dnn_backend_native.h
index b6f9533..d7737ac 100644
--- a/libavfilter/dnn/dnn_backend_native.h
+++ b/libavfilter/dnn/dnn_backend_native.h
@@ -36,12 +36,60 @@  typedef enum {RELU, TANH, SIGMOID, NONE, LEAKY_RELU} DNNActivationFunc;
 
 typedef enum {VALID, SAME, SAME_CLAMP_TO_EDGE} DNNConvPaddingParam;
 
+typedef enum {DOT_INPUT, DOT_INTERMEDIATE, DOT_OUTPUT} DNNOperandType;
+
 typedef struct Layer{
     DNNLayerType type;
-    float *output;
+    /**
+     * a layer can have multiple inputs and one output.
+     * 4 is just a big enough number for input operands (increase it if necessary),
+     * do not use 'int32_t *input_operand_indexes', so we don't worry about mem leaks.
+     */
+    int32_t input_operand_indexes[4];
+    int32_t output_operand_index;
     void *params;
 } Layer;
 
+typedef struct DnnOperand{
+    /**
+     * there are two memory layouts, NHWC or NCHW, so we use dims,
+     * dims[0] is Number.
+     */
+    int32_t dims[4];
+
+    /**
+     * input/output/intermediate operand of the network
+     */
+    DNNOperandType type;
+
+    /**
+     * support different kinds of data type such as float, half float, int8 etc,
+     * first support float now.
+     */
+    DNNDataType data_type;
+
+    /**
+     * NHWC if 1, otherwise NCHW.
+     * let's first support NHWC only, this flag is for extensive usage.
+     */
+    int8_t isNHWC;
+
+    /**
+     * to avoid possible memory leak, do not use char *name
+     */
+    char name[512];
+
+    /**
+     * data pointer with data length in bytes.
+     * usedNumbersLeft is only valid for intermediate operand,
+     * it means how many layers still depend on this operand,
+     * todo: the memory can be reused when usedNumbersLeft is zero.
+     */
+    void *data;
+    int32_t length;
+    int32_t usedNumbersLeft;
+}DnnOperand;
+
 typedef struct ConvolutionalParams{
     int32_t input_num, output_num, kernel_size;
     DNNActivationFunc activation;
@@ -63,6 +111,8 @@  typedef struct DepthToSpaceParams{
 typedef struct ConvolutionalNetwork{
     Layer *layers;
     int32_t layers_num;
+    DnnOperand *operands;
+    int32_t operands_num;
 } ConvolutionalNetwork;
 
 DNNModel *ff_dnn_load_model_native(const char *model_filename);
@@ -71,4 +121,6 @@  DNNReturnType ff_dnn_execute_model_native(const DNNModel *model, DNNData *output
 
 void ff_dnn_free_model_native(DNNModel **model);
 
+int32_t calculate_operand_data_length(DnnOperand *operand);
+
 #endif
diff --git a/libavfilter/dnn/dnn_backend_native_layer_pad.c b/libavfilter/dnn/dnn_backend_native_layer_pad.c
index 5417d73..c2905a7 100644
--- a/libavfilter/dnn/dnn_backend_native_layer_pad.c
+++ b/libavfilter/dnn/dnn_backend_native_layer_pad.c
@@ -48,12 +48,21 @@  static int after_get_buddy(int given, int border, LayerPadModeParam mode)
     }
 }
 
-void dnn_execute_layer_pad(const float *input, float *output, const LayerPadParams *params, int number, int height, int width, int channel)
+int dnn_execute_layer_pad(DnnOperand *operands, const int32_t *input_operand_indexes, int32_t output_operand_index,
+                           const LayerPadParams *params)
 {
     int32_t before_paddings;
     int32_t after_paddings;
+    float* output;
 
     // suppose format is <N, H, W, C>
+    int32_t input_operand_index = input_operand_indexes[0];
+    int number = operands[input_operand_index].dims[0];
+    int height = operands[input_operand_index].dims[1];
+    int width = operands[input_operand_index].dims[2];
+    int channel = operands[input_operand_index].dims[3];
+    const float *input = operands[input_operand_index].data;
+
     int new_number = number + params->paddings[0][0] + params->paddings[0][1];
     int new_height = height + params->paddings[1][0] + params->paddings[1][1];
     int new_width = width + params->paddings[2][0] + params->paddings[2][1];
@@ -67,6 +76,17 @@  void dnn_execute_layer_pad(const float *input, float *output, const LayerPadPara
     int new_wc_stride = new_c_stride * new_width;
     int new_hwc_stride = new_wc_stride * new_height;
 
+    DnnOperand *output_operand = &operands[output_operand_index];
+    output_operand->dims[0] = new_number;
+    output_operand->dims[1] = new_height;
+    output_operand->dims[2] = new_width;
+    output_operand->dims[3] = new_channel;
+    output_operand->length = calculate_operand_data_length(output_operand);
+    output_operand->data = av_realloc(output_operand->data, output_operand->length);
+    if (!output_operand->data)
+        return -1;
+    output = output_operand->data;
+
     // copy the original data
     for (int n = 0; n < number; n++) {
         for (int h = 0; h < height; h++) {
@@ -208,4 +228,6 @@  void dnn_execute_layer_pad(const float *input, float *output, const LayerPadPara
             }
         }
     }
+
+    return 0;
 }
diff --git a/libavfilter/dnn/dnn_backend_native_layer_pad.h b/libavfilter/dnn/dnn_backend_native_layer_pad.h
index 0fbe652..7cc8213 100644
--- a/libavfilter/dnn/dnn_backend_native_layer_pad.h
+++ b/libavfilter/dnn/dnn_backend_native_layer_pad.h
@@ -26,6 +26,7 @@ 
 #define AVFILTER_DNN_DNN_BACKEND_NATIVE_LAYER_PAD_H
 
 #include <stdint.h>
+#include "dnn_backend_native.h"
 
 typedef enum {LPMP_CONSTANT, LPMP_REFLECT, LPMP_SYMMETRIC} LayerPadModeParam;
 
@@ -35,6 +36,7 @@  typedef struct LayerPadParams{
     float constant_values;
 } LayerPadParams;
 
-void dnn_execute_layer_pad(const float *input, float *output, const LayerPadParams *params, int number, int height, int width, int channel);
+int dnn_execute_layer_pad(DnnOperand *operands, const int32_t *input_operand_indexes, int32_t output_operand_index,
+                           const LayerPadParams *params);
 
 #endif
diff --git a/tests/dnn/Makefile b/tests/dnn/Makefile
index 0e050ea..fabed75 100644
--- a/tests/dnn/Makefile
+++ b/tests/dnn/Makefile
@@ -5,7 +5,7 @@  DNNTESTPROGS := $(DNNTESTPROGS:%=$(DNNTESTSDIR)/%-test$(EXESUF))
 -include $(wildcard $(DNNTESTOBJS:.o=.d))
 
 $(DNNTESTPROGS): %$(EXESUF): %.o $(FF_STATIC_DEP_LIBS)
-	$(LD) $(LDFLAGS) $(LDEXEFLAGS) $(LD_O) $(filter %.o,$^) $(FF_STATIC_DEP_LIBS) $(ELIBS)
+	$(LD) $(LDFLAGS) $(LDEXEFLAGS) $(LD_O) $(filter %.o,$^) $(FF_STATIC_DEP_LIBS) $(EXTRALIBS-avcodec) $(EXTRALIBS-avfilter) $(EXTRALIBS-avformat) $(EXTRALIBS-avutil) $(EXTRALIBS-swresample) $(EXTRALIBS)
 
 testclean::
 	$(RM) $(addprefix $(DNNTESTSDIR)/,$(CLEANSUFFIXES) *-test$(EXESUF))
diff --git a/tests/dnn/dnn-layer-pad-test.c b/tests/dnn/dnn-layer-pad-test.c
index 28a49eb..1fb2be1 100644
--- a/tests/dnn/dnn-layer-pad-test.c
+++ b/tests/dnn/dnn-layer-pad-test.c
@@ -44,6 +44,8 @@  static int test_with_mode_symmetric(void)
     */
 
     LayerPadParams params;
+    DnnOperand operands[2];
+    int32_t input_indexes[1];
     float input[1*4*4*3] = {
         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
     };
@@ -57,8 +59,7 @@  static int test_with_mode_symmetric(void)
         27.0, 28.0, 29.0, 24.0, 25.0, 26.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 33.0, 34.0, 35.0, 30.0, 31.0, 32.0, 18.0, 19.0, 20.0, 15.0, 16.0, 17.0, 12.0,
         13.0, 14.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 21.0, 22.0, 23.0, 18.0, 19.0, 20.0
     };
-    float output[1*9*9*3];
-    memset(output, 0, sizeof(output));
+    float *output;
 
     params.mode = LPMP_SYMMETRIC;
     params.paddings[0][0] = 0;
@@ -70,15 +71,26 @@  static int test_with_mode_symmetric(void)
     params.paddings[3][0] = 0;
     params.paddings[3][1] = 0;
 
-    dnn_execute_layer_pad(input, output, &params, 1, 4, 4, 3);
+    operands[0].data = input;
+    operands[0].dims[0] = 1;
+    operands[0].dims[1] = 4;
+    operands[0].dims[2] = 4;
+    operands[0].dims[3] = 3;
+    operands[1].data = NULL;
 
-    for (int i = 0; i < sizeof(output) / sizeof(float); i++) {
+    input_indexes[0] = 0;
+    dnn_execute_layer_pad(operands, input_indexes, 1, &params);
+
+    output = operands[1].data;
+    for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
         if (fabs(output[i] - expected_output[i]) > EPSON) {
             printf("at index %d, output: %f, expected_output: %f\n", i, output[i], expected_output[i]);
+            av_freep(&output);
             return 1;
         }
     }
 
+    av_freep(&output);
     return 0;
 
 }
@@ -102,6 +114,8 @@  static int test_with_mode_reflect(void)
     */
 
     LayerPadParams params;
+    DnnOperand operands[2];
+    int32_t input_indexes[1];
     float input[3*2*2*3] = {
         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35
     };
@@ -110,8 +124,7 @@  static int test_with_mode_reflect(void)
         12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0,
         35.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0
     };
-    float output[6*2*2*3];
-    memset(output, 0, sizeof(output));
+    float *output;
 
     params.mode = LPMP_REFLECT;
     params.paddings[0][0] = 1;
@@ -123,15 +136,26 @@  static int test_with_mode_reflect(void)
     params.paddings[3][0] = 0;
     params.paddings[3][1] = 0;
 
-    dnn_execute_layer_pad(input, output, &params, 3, 2, 2, 3);
+    operands[0].data = input;
+    operands[0].dims[0] = 3;
+    operands[0].dims[1] = 2;
+    operands[0].dims[2] = 2;
+    operands[0].dims[3] = 3;
+    operands[1].data = NULL;
+
+    input_indexes[0] = 0;
+    dnn_execute_layer_pad(operands, input_indexes, 1, &params);
 
-    for (int i = 0; i < sizeof(output) / sizeof(float); i++) {
+    output = operands[1].data;
+    for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
         if (fabs(output[i] - expected_output[i]) > EPSON) {
             printf("at index %d, output: %f, expected_output: %f\n", i, output[i], expected_output[i]);
+            av_freep(&output);
             return 1;
         }
     }
 
+    av_freep(&output);
     return 0;
 
 }
@@ -155,6 +179,8 @@  static int test_with_mode_constant(void)
     */
 
     LayerPadParams params;
+    DnnOperand operands[2];
+    int32_t input_indexes[1];
     float input[1*2*2*3] = {
         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
     };
@@ -163,8 +189,7 @@  static int test_with_mode_constant(void)
         728.0, 728.0, 0.0, 1.0, 2.0, 728.0, 728.0, 728.0, 3.0, 4.0, 5.0, 728.0, 728.0,
         728.0, 6.0, 7.0, 8.0, 728.0, 728.0, 728.0, 9.0, 10.0, 11.0, 728.0, 728.0
     };
-    float output[1*3*2*6];
-    memset(output, 0, sizeof(output));
+    float *output;
 
     params.mode = LPMP_CONSTANT;
     params.constant_values = 728;
@@ -177,15 +202,26 @@  static int test_with_mode_constant(void)
     params.paddings[3][0] = 1;
     params.paddings[3][1] = 2;
 
-    dnn_execute_layer_pad(input, output, &params, 1, 2, 2, 3);
+    operands[0].data = input;
+    operands[0].dims[0] = 3;
+    operands[0].dims[1] = 2;
+    operands[0].dims[2] = 2;
+    operands[0].dims[3] = 3;
+    operands[1].data = NULL;
+
+    input_indexes[0] = 0;
+    dnn_execute_layer_pad(operands, input_indexes, 1, &params);
 
-    for (int i = 0; i < sizeof(output) / sizeof(float); i++) {
+    output = operands[1].data;
+    for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
         if (fabs(output[i] - expected_output[i]) > EPSON) {
             printf("at index %d, output: %f, expected_output: %f\n", i, output[i], expected_output[i]);
+            av_freep(&output);
             return 1;
         }
     }
 
+    av_freep(&output);
     return 0;
 
 }