From patchwork Wed Aug 5 03:43:54 2020
X-Patchwork-Submitter: "Fu, Ting"
X-Patchwork-Id: 21486
From: Ting Fu
To: ffmpeg-devel@ffmpeg.org
Date: Wed, 5 Aug 2020 11:43:54 +0800
Message-Id: <20200805034355.22993-1-ting.fu@intel.com>
X-Mailer: git-send-email 2.17.1
Subject: [FFmpeg-devel] [PATCH V4 1/2] dnn/native: add native support for avg_pool
List-Id: FFmpeg development discussions and patches

Pooling strides in the channel dimension are not supported for now.
It can be tested with the model generated by the python script below:

import tensorflow as tf
import numpy as np
import imageio

in_img = imageio.imread('input_odd.jpg')
in_img = in_img.astype(np.float32)/255.0
in_data = in_img[np.newaxis, :]

x = tf.placeholder(tf.float32, shape=[1, None, None, 3], name='dnn_in')
x_pool = tf.nn.avg_pool(x, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME') # please alter the params as needed
y = tf.identity(x_pool, name='dnn_out')

sess = tf.Session()
sess.run(tf.global_variables_initializer())

graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, ['dnn_out'])
tf.train.write_graph(graph_def, '.', 'image_process.pb', as_text=False)
print("image_process.pb generated, please use \
path_to_ffmpeg/tools/python/convert.py to generate image_process.model\n")

output = sess.run(y, feed_dict={x: in_data})
imageio.imsave("out.jpg", np.squeeze(output))

Signed-off-by: Ting Fu
---
 libavfilter/dnn/Makefile                      |   1 +
 libavfilter/dnn/dnn_backend_native.h          |   2 +
 .../dnn/dnn_backend_native_layer_avgpool.c    | 141 ++++++++++++++++++
 .../dnn/dnn_backend_native_layer_avgpool.h    |  40 +++++
 .../dnn/dnn_backend_native_layer_conv2d.h     |   3 +-
 libavfilter/dnn/dnn_backend_native_layers.c   |   2 +
 tools/python/convert_from_tensorflow.py       |  37 ++++-
 7 files changed, 223 insertions(+), 3 deletions(-)
 create mode 100644 libavfilter/dnn/dnn_backend_native_layer_avgpool.c
 create mode 100644 libavfilter/dnn/dnn_backend_native_layer_avgpool.h
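
As a quick sanity check for the script above (not part of the diff), the output
shape under SAME padding follows the ceil(dim / stride) rule that the native
layer below uses as well; the input size here is only an assumed example:

import math

def avg_pool_same_output_shape(height, width, stride):
    # SAME padding keeps ceil(dim / stride) samples per spatial dimension
    return math.ceil(height / stride), math.ceil(width / stride)

# e.g. an assumed 1080x1919 'input_odd.jpg' pooled with stride 2 -> (540, 960)
print(avg_pool_same_output_shape(1080, 1919, 2))
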
diff --git a/libavfilter/dnn/Makefile b/libavfilter/dnn/Makefile
index d90137ec42..e0957073ee 100644
--- a/libavfilter/dnn/Makefile
+++ b/libavfilter/dnn/Makefile
@@ -1,6 +1,7 @@
 OBJS-$(CONFIG_DNN)                           += dnn/dnn_interface.o
 OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native.o
 OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layers.o
+OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_avgpool.o
 OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_pad.o
 OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_conv2d.o
 OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_depth2space.o
diff --git a/libavfilter/dnn/dnn_backend_native.h b/libavfilter/dnn/dnn_backend_native.h
index 62191ffe88..26e9a33387 100644
--- a/libavfilter/dnn/dnn_backend_native.h
+++ b/libavfilter/dnn/dnn_backend_native.h
@@ -43,10 +43,12 @@ typedef enum {
     DLT_MAXIMUM = 4,
     DLT_MATH_BINARY = 5,
     DLT_MATH_UNARY = 6,
+    DLT_AVG_POOL = 7,
     DLT_COUNT
 } DNNLayerType;
 
 typedef enum {DOT_INPUT = 1, DOT_OUTPUT = 2, DOT_INTERMEDIATE = DOT_INPUT | DOT_OUTPUT} DNNOperandType;
+typedef enum {VALID, SAME, SAME_CLAMP_TO_EDGE} DNNPaddingParam;
 
 typedef struct Layer{
     DNNLayerType type;
diff --git a/libavfilter/dnn/dnn_backend_native_layer_avgpool.c b/libavfilter/dnn/dnn_backend_native_layer_avgpool.c
new file mode 100644
index 0000000000..d745c35b4a
--- /dev/null
+++ b/libavfilter/dnn/dnn_backend_native_layer_avgpool.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2020
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * DNN native backend implementation.
+ */
+
+#include "libavutil/avassert.h"
+#include "dnn_backend_native_layer_avgpool.h"
+
+int dnn_load_layer_avg_pool(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num)
+{
+    AvgPoolParams *avgpool_params;
+    int dnn_size = 0;
+    avgpool_params = av_malloc(sizeof(*avgpool_params));
+    if (!avgpool_params)
+        return 0;
+
+    avgpool_params->strides = (int32_t)avio_rl32(model_file_context);
+    avgpool_params->padding_method = (int32_t)avio_rl32(model_file_context);
+    avgpool_params->kernel_size = (int32_t)avio_rl32(model_file_context);
+    dnn_size += 12;
+
+    if (dnn_size > file_size || avgpool_params->kernel_size <= 0 || avgpool_params->strides <= 0) {
+        av_freep(&avgpool_params);
+        return 0;
+    }
+
+    layer->params = avgpool_params;
+    layer->input_operand_indexes[0] = (int32_t)avio_rl32(model_file_context);
+    layer->output_operand_index = (int32_t)avio_rl32(model_file_context);
+    dnn_size += 8;
+
+    if (layer->input_operand_indexes[0] >= operands_num || layer->output_operand_index >= operands_num) {
+        return 0;
+    }
+    return dnn_size;
+}
+
+int dnn_execute_layer_avg_pool(DnnOperand *operands, const int32_t *input_operand_indexes,
+                               int32_t output_operand_index, const void *parameters)
+{
+    float *output;
+    int height_end, width_end, height_radius, width_radius, output_height, output_width, kernel_area;
+    int32_t input_operand_index = input_operand_indexes[0];
+    int number = operands[input_operand_index].dims[0];
+    int height = operands[input_operand_index].dims[1];
+    int width = operands[input_operand_index].dims[2];
+    int channel = operands[input_operand_index].dims[3];
+    const float *input = operands[input_operand_index].data;
+    const AvgPoolParams *avgpool_params = (const AvgPoolParams *)parameters;
+
+    int kernel_strides = avgpool_params->strides;
+    int src_linesize = width * channel;
+    DnnOperand *output_operand = &operands[output_operand_index];
+
+    /**
+     * When padding_method == SAME, TensorFlow pads (ksize - remainder) zero pixels in total
+     * per spatial dimension, the smaller half before the input and the rest after it.
+     * E.g.: assuming the input height = 1080 and the strides = 11, the remainder is 1080 % 11 = 2.
+     * If ksize = 5: it fills (5 - 2) >> 1 = 1 line before the first line of the input image,
+     * and 5 - 2 - 1 = 2 lines after the last line.
+     * If ksize = 7: it fills (7 - 2) >> 1 = 2 lines before the first line of the input image,
+     * and 7 - 2 - 2 = 3 lines after the last line.
+     */
+    if (avgpool_params->padding_method == SAME) {
+        height_end = height;
+        width_end = width;
+        height_radius = avgpool_params->kernel_size - ((height - 1) % kernel_strides + 1);
+        width_radius = avgpool_params->kernel_size - ((width - 1) % kernel_strides + 1);
+        height_radius = height_radius < 0 ? 0 : height_radius >> 1;
+        width_radius = width_radius < 0 ? 0 : width_radius >> 1;
+        output_height = ceil(height / (kernel_strides * 1.0));
+        output_width = ceil(width / (kernel_strides * 1.0));
+    } else {
+        av_assert0(avgpool_params->padding_method == VALID);
+        height_end = height - avgpool_params->kernel_size + 1;
+        width_end = width - avgpool_params->kernel_size + 1;
+        height_radius = 0;
+        width_radius = 0;
+        output_height = ceil((height - avgpool_params->kernel_size + 1) / (kernel_strides * 1.0));
+        output_width = ceil((width - avgpool_params->kernel_size + 1) / (kernel_strides * 1.0));
+    }
+
+    output_operand->dims[0] = number;
+    output_operand->dims[1] = output_height;
+    output_operand->dims[2] = output_width;
+    // pooling in the channel dimension is not supported yet
+    output_operand->dims[3] = channel;
+    output_operand->data_type = operands[input_operand_index].data_type;
+    output_operand->length = calculate_operand_data_length(output_operand);
+    output_operand->data = av_realloc(output_operand->data, output_operand->length);
+    if (!output_operand->data)
+        return -1;
+    output = output_operand->data;
+
+    for (int y = 0; y < height_end; y += kernel_strides) {
+        for (int x = 0; x < width_end; x += kernel_strides) {
+            for (int n_channel = 0; n_channel < channel; ++n_channel) {
+                output[n_channel] = 0.0;
+                kernel_area = 0;
+                for (int kernel_y = 0; kernel_y < avgpool_params->kernel_size; ++kernel_y) {
+                    for (int kernel_x = 0; kernel_x < avgpool_params->kernel_size; ++kernel_x) {
+                        float input_pel;
+                        int y_pos = y + (kernel_y - height_radius);
+                        int x_pos = x + (kernel_x - width_radius);
+                        if (x_pos < 0 || x_pos >= width || y_pos < 0 || y_pos >= height) {
+                            input_pel = 0.0;
+                        } else {
+                            kernel_area++;
+                            input_pel = input[y_pos * src_linesize + x_pos * channel + n_channel];
+                        }
+                        output[n_channel] += input_pel;
+                    }
+                }
+                output[n_channel] /= kernel_area;
+            }
+            output += channel;
+        }
+    }
+
+    return 0;
+}
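
For reference (not part of the diff), the SAME-padding bookkeeping described in the
comment inside dnn_execute_layer_avg_pool() can be reproduced with a few lines of
Python; the numbers are the "height = 1080, strides = 11" example from that comment:

import math

def same_padding(dim, stride, ksize):
    remainder = (dim - 1) % stride + 1      # remainder mapped into 1..stride
    total_pad = max(ksize - remainder, 0)   # zero lines added in this dimension
    pad_before = total_pad >> 1             # the "radius" used by the C code
    pad_after = total_pad - pad_before
    out_dim = math.ceil(dim / stride)
    return pad_before, pad_after, out_dim

print(same_padding(1080, 11, 5))   # (1, 2, 99)
print(same_padding(1080, 11, 7))   # (2, 3, 99)
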
diff --git a/libavfilter/dnn/dnn_backend_native_layer_avgpool.h b/libavfilter/dnn/dnn_backend_native_layer_avgpool.h
new file mode 100644
index 0000000000..8e31ddb7c8
--- /dev/null
+++ b/libavfilter/dnn/dnn_backend_native_layer_avgpool.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2020
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * DNN inference functions interface for native backend.
+ */
+
+#ifndef AVFILTER_DNN_DNN_BACKEND_NATIVE_LAYER_AVGPOOL_H
+#define AVFILTER_DNN_DNN_BACKEND_NATIVE_LAYER_AVGPOOL_H
+
+#include "dnn_backend_native.h"
+
+typedef struct AvgPoolParams{
+    int32_t strides, kernel_size;
+    DNNPaddingParam padding_method;
+} AvgPoolParams;
+
+int dnn_load_layer_avg_pool(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num);
+int dnn_execute_layer_avg_pool(DnnOperand *operands, const int32_t *input_operand_indexes,
+                               int32_t output_operand_index, const void *parameters);
+
+#endif
diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.h b/libavfilter/dnn/dnn_backend_native_layer_conv2d.h
index eeb15fdf01..b240b7ef6b 100644
--- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.h
+++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.h
@@ -24,12 +24,11 @@
 #include "dnn_backend_native.h"
 
 typedef enum {RELU, TANH, SIGMOID, NONE, LEAKY_RELU} DNNActivationFunc;
-typedef enum {VALID, SAME, SAME_CLAMP_TO_EDGE} DNNConvPaddingParam;
 
 typedef struct ConvolutionalParams{
     int32_t input_num, output_num, kernel_size;
     DNNActivationFunc activation;
-    DNNConvPaddingParam padding_method;
+    DNNPaddingParam padding_method;
     int32_t dilation;
     int32_t has_bias;
     float *kernel;
diff --git a/libavfilter/dnn/dnn_backend_native_layers.c b/libavfilter/dnn/dnn_backend_native_layers.c
index 70f9a5f958..4f42f62abb 100644
--- a/libavfilter/dnn/dnn_backend_native_layers.c
+++ b/libavfilter/dnn/dnn_backend_native_layers.c
@@ -26,6 +26,7 @@
 #include "dnn_backend_native_layer_maximum.h"
 #include "dnn_backend_native_layer_mathbinary.h"
 #include "dnn_backend_native_layer_mathunary.h"
+#include "dnn_backend_native_layer_avgpool.h"
 
 LayerFunc layer_funcs[DLT_COUNT] = {
     {NULL, NULL},
@@ -35,4 +36,5 @@ LayerFunc layer_funcs[DLT_COUNT] = {
     {dnn_execute_layer_maximum, dnn_load_layer_maximum},
     {dnn_execute_layer_math_binary, dnn_load_layer_math_binary},
     {dnn_execute_layer_math_unary, dnn_load_layer_math_unary},
+    {dnn_execute_layer_avg_pool, dnn_load_layer_avg_pool},
 };
diff --git a/tools/python/convert_from_tensorflow.py b/tools/python/convert_from_tensorflow.py
index 85db7bf710..baff602cf2 100644
--- a/tools/python/convert_from_tensorflow.py
+++ b/tools/python/convert_from_tensorflow.py
@@ -67,10 +67,12 @@ class TFConverter:
         self.edges = {}
         self.conv_activations = {'Relu':0, 'Tanh':1, 'Sigmoid':2, 'None':3, 'LeakyRelu':4}
         self.conv_paddings = {'VALID':0, 'SAME':1}
+        self.pool_paddings = {'VALID':0, 'SAME':1}
         self.converted_nodes = set()
         self.conv2d_scope_names = set()
         self.conv2d_scopename_inputname_dict = {}
-        self.op2code = {'Conv2D':1, 'DepthToSpace':2, 'MirrorPad':3, 'Maximum':4, 'MathBinary':5, 'MathUnary':6}
+        self.op2code = {'Conv2D':1, 'DepthToSpace':2, 'MirrorPad':3, 'Maximum':4,
+                        'MathBinary':5, 'MathUnary':6, 'AvgPool':7}
         self.mathbin2code = {'Sub':0, 'Add':1, 'Mul':2, 'RealDiv':3, 'Minimum':4}
         self.mathun2code = {'Abs':0, 'Sin':1, 'Cos':2, 'Tan':3, 'Asin':4, 'Acos':5, 'Atan':6, 'Sinh':7, 'Cosh':8, 'Tanh':9, 'Asinh':10, 'Acosh':11, 'Atanh':12}
         self.mirrorpad_mode = {'CONSTANT':0, 'REFLECT':1, 'SYMMETRIC':2}
@@ -298,6 +300,37 @@ class TFConverter:
         np.array([output_operand_index],dtype=np.uint32).tofile(f)
 
 
+    def dump_avg_pool_to_file(self, node, f):
+        assert(node.op == 'AvgPool')
+        self.layer_number = self.layer_number + 1
+        self.converted_nodes.add(node.name)
+        node0 = self.name_node_dict[node.input[0]]
+        strides = node.attr['strides']
+
+        # TensorFlow does not support pooling strides in the batch dimension and
+        # the current native NN does not support them in the channel dimension, hence the asserts.
+        assert(strides.list.i[1]==strides.list.i[2])
+        assert(strides.list.i[0]==1)
+        assert(strides.list.i[3]==1)
+        strides = strides.list.i[1]
+        filter_node = node.attr['ksize']
+        input_name = node.input[0]
+
+        # TensorFlow does not support pooling ksize in the batch and channel dimensions.
+        assert(filter_node.list.i[0]==1)
+        assert(filter_node.list.i[3]==1)
+        filter_height = filter_node.list.i[1]
+        filter_width = filter_node.list.i[2]
+
+        padding = node.attr['padding'].s.decode("utf-8")
+        np.array([self.op2code[node.op], strides, self.pool_paddings[padding], filter_height],
+                 dtype=np.uint32).tofile(f)
+
+        input_operand_index = self.add_operand(input_name, Operand.IOTYPE_INPUT)
+        output_operand_index = self.add_operand(node.name, Operand.IOTYPE_OUTPUT)
+        np.array([input_operand_index, output_operand_index],dtype=np.uint32).tofile(f)
+
+
     def dump_layers_to_file(self, f):
         for node in self.nodes:
             if node.name in self.converted_nodes:
@@ -311,6 +344,8 @@ class TFConverter:
 
             if node.op == 'Conv2D':
                 self.dump_simple_conv2d_to_file(node, f)
+            elif node.op == 'AvgPool':
+                self.dump_avg_pool_to_file(node, f)
             elif node.op == 'DepthToSpace':
                 self.dump_depth2space_to_file(node, f)
             elif node.op == 'MirrorPad':
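
For reviewers' reference (not part of the diff), the on-disk record that
dump_avg_pool_to_file() emits and the native backend reads back is six 32-bit
integers; a rough sketch with made-up operand indexes:

import struct

# op2code['AvgPool'] (== DLT_AVG_POOL == 7), strides, pool_paddings[padding], kernel_size,
# then input and output operand indexes. The generic model loader consumes the layer type;
# dnn_load_layer_avg_pool() then reads 12 bytes of params plus 8 bytes of operand indexes
# with avio_rl32(), so the values must be little-endian on disk.
record = struct.pack('<6I', 7, 2, 1, 2, 0, 1)   # stride 2, SAME, ksize 2, operand 0 -> 1
assert len(record) == 4 + 12 + 8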