From patchwork Tue Sep 22 07:11:09 2020
X-Patchwork-Submitter: Mingyu Yin <mingyu.yin@intel.com>
X-Patchwork-Id: 22557
From: Mingyu Yin <mingyu.yin@intel.com>
To: ffmpeg-devel@ffmpeg.org
Date: Tue, 22 Sep 2020 15:11:09 +0800
Message-Id: <20200922071109.16531-1-mingyu.yin@intel.com>
X-Mailer: git-send-email 2.17.1
Subject: [FFmpeg-devel] [PATCH V2] dnn/native: add native support for dense

Signed-off-by: Mingyu Yin <mingyu.yin@intel.com>
---
 libavfilter/dnn/Makefile                      |   1 +
 libavfilter/dnn/dnn_backend_native.h          |   2 +
 .../dnn/dnn_backend_native_layer_conv2d.h     |   1 -
 .../dnn/dnn_backend_native_layer_dense.c      | 151 ++++++++++++++++++
 .../dnn/dnn_backend_native_layer_dense.h      |  37 +++++
 libavfilter/dnn/dnn_backend_native_layers.c   |   2 +
 tests/dnn/dnn-layer-dense-test.c              | 131 +++++++++++++++
 tools/python/convert_from_tensorflow.py       | 126 ++++++++++++++-
 8 files changed, 442 insertions(+), 9 deletions(-)
 create mode 100644 libavfilter/dnn/dnn_backend_native_layer_dense.c
 create mode 100644 libavfilter/dnn/dnn_backend_native_layer_dense.h
 create mode 100644 tests/dnn/dnn-layer-dense-test.c

diff --git a/libavfilter/dnn/Makefile b/libavfilter/dnn/Makefile
index e0957073ee..3681801892 100644
--- a/libavfilter/dnn/Makefile
+++ b/libavfilter/dnn/Makefile
@@ -2,6 +2,7 @@ OBJS-$(CONFIG_DNN)                           += dnn/dnn_interface.o
 OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native.o
 OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layers.o
 OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_avgpool.o
+OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_dense.o
 OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_pad.o
 OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_conv2d.o
 OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_depth2space.o
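For orientation before the loader code below: dump_dense_to_file() in the convert_from_tensorflow.py hunks at the end of this patch serializes a dense layer as five uint32 fields (layer code, activation code, input_num, output_num, has_bias), followed by the kernel in [output_num, input_num] order, the optional bias, and the input/output operand indexes; dnn_load_layer_dense() reads the same fields back (the layer code itself is consumed by the generic native loader before it dispatches to the layer). A minimal sketch of that layout, illustrative only and not part of the patch, assuming the converter's existing activation codes (Relu=0, Tanh=1, Sigmoid=2, None=3, LeakyRelu=4) and an already-open binary file object f:

    # pack_dense_layer: illustrative sketch of the serialized dense layer
    import numpy as np

    def pack_dense_layer(f, kernel_tf, bias, activation_code, in_idx, out_idx):
        # kernel_tf is in TensorFlow order [input_num, output_num]; it is stored
        # transposed so the C code can index kernel[n_filter * input_num + ch]
        input_num, output_num = kernel_tf.shape
        has_bias = 1 if bias is not None else 0
        np.array([8, activation_code, input_num, output_num, has_bias],
                 dtype=np.uint32).tofile(f)      # 8 == op2code['MatMul'] == DLT_DENSE
        np.transpose(kernel_tf, [1, 0]).astype(np.float32).tofile(f)
        if has_bias:
            np.asarray(bias, dtype=np.float32).tofile(f)
        np.array([in_idx, out_idx], dtype=np.uint32).tofile(f)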
diff --git a/libavfilter/dnn/dnn_backend_native.h b/libavfilter/dnn/dnn_backend_native.h
index b1f8f3d6bf..0c98fd1a0c 100644
--- a/libavfilter/dnn/dnn_backend_native.h
+++ b/libavfilter/dnn/dnn_backend_native.h
@@ -45,11 +45,13 @@ typedef enum {
     DLT_MATH_BINARY = 5,
     DLT_MATH_UNARY = 6,
     DLT_AVG_POOL = 7,
+    DLT_DENSE = 8,
     DLT_COUNT
 } DNNLayerType;
 
 typedef enum {DOT_INPUT = 1, DOT_OUTPUT = 2, DOT_INTERMEDIATE = DOT_INPUT | DOT_OUTPUT} DNNOperandType;
 typedef enum {VALID, SAME, SAME_CLAMP_TO_EDGE} DNNPaddingParam;
+typedef enum {RELU, TANH, SIGMOID, NONE, LEAKY_RELU} DNNActivationFunc;
 
 typedef struct Layer{
     DNNLayerType type;
diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.h b/libavfilter/dnn/dnn_backend_native_layer_conv2d.h
index 72319f2ebe..1295028c46 100644
--- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.h
+++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.h
@@ -23,7 +23,6 @@
 
 #include "dnn_backend_native.h"
 
-typedef enum {RELU, TANH, SIGMOID, NONE, LEAKY_RELU} DNNActivationFunc;
 
 typedef struct ConvolutionalParams{
     int32_t input_num, output_num, kernel_size;
diff --git a/libavfilter/dnn/dnn_backend_native_layer_dense.c b/libavfilter/dnn/dnn_backend_native_layer_dense.c
new file mode 100644
index 0000000000..1029137792
--- /dev/null
+++ b/libavfilter/dnn/dnn_backend_native_layer_dense.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2020
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "dnn_backend_native_layer_dense.h"
+
+int dnn_load_layer_dense(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num)
+{
+    DenseParams *dense_params;
+    int kernel_size;
+    int dnn_size = 0;
+    dense_params = av_malloc(sizeof(*dense_params));
+    if (!dense_params)
+        return 0;
+
+    dense_params->activation = (int32_t)avio_rl32(model_file_context);
+    dense_params->input_num = (int32_t)avio_rl32(model_file_context);
+    dense_params->output_num = (int32_t)avio_rl32(model_file_context);
+    dense_params->has_bias = (int32_t)avio_rl32(model_file_context);
+    dnn_size += 16;
+
+    kernel_size = dense_params->input_num * dense_params->output_num;
+    dnn_size += kernel_size * 4;
+    if (dense_params->has_bias)
+        dnn_size += dense_params->output_num * 4;
+
+    if (dnn_size > file_size || dense_params->input_num <= 0 ||
+        dense_params->output_num <= 0){
+        av_freep(&dense_params);
+        return 0;
+    }
+
+    dense_params->kernel = av_malloc(kernel_size * sizeof(float));
+    if (!dense_params->kernel) {
+        av_freep(&dense_params);
+        return 0;
+    }
+    for (int i = 0; i < kernel_size; ++i) {
+        dense_params->kernel[i] = av_int2float(avio_rl32(model_file_context));
+    }
+
+    dense_params->biases = NULL;
+    if (dense_params->has_bias) {
+        dense_params->biases = av_malloc(dense_params->output_num * sizeof(float));
+        if (!dense_params->biases){
+            av_freep(&dense_params->kernel);
+            av_freep(&dense_params);
+            return 0;
+        }
+        for (int i = 0; i < dense_params->output_num; ++i){
+            dense_params->biases[i] = av_int2float(avio_rl32(model_file_context));
+        }
+    }
+
+    layer->params = dense_params;
+
+    layer->input_operand_indexes[0] = (int32_t)avio_rl32(model_file_context);
+    layer->output_operand_index = (int32_t)avio_rl32(model_file_context);
+    dnn_size += 8;
+
+    if (layer->input_operand_indexes[0] >= operands_num || layer->output_operand_index >= operands_num) {
+        return 0;
+    }
+
+    return dnn_size;
+}
+
+int dnn_execute_layer_dense(DnnOperand *operands, const int32_t *input_operand_indexes,
+                            int32_t output_operand_index, const void *parameters, NativeContext *ctx)
+{
+    float *output;
+    int32_t input_operand_index = input_operand_indexes[0];
+    int number = operands[input_operand_index].dims[0];
+    int height = operands[input_operand_index].dims[1];
+    int width = operands[input_operand_index].dims[2];
+    int channel = operands[input_operand_index].dims[3];
+    const float *input = operands[input_operand_index].data;
+    const DenseParams *dense_params = (const DenseParams *)parameters;
+
+    int src_linesize = width * channel;
+    DnnOperand *output_operand = &operands[output_operand_index];
+    output_operand->dims[0] = number;
+    output_operand->dims[1] = height;
+    output_operand->dims[2] = width;
+    output_operand->dims[3] = dense_params->output_num;
+    output_operand->data_type = operands[input_operand_index].data_type;
+    output_operand->length = calculate_operand_data_length(output_operand);
+    if (output_operand->length <= 0) {
+        av_log(ctx, AV_LOG_ERROR, "The output data length overflow\n");
+        return DNN_ERROR;
+    }
+    output_operand->data = av_realloc(output_operand->data, output_operand->length);
+    if (!output_operand->data) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
+        return DNN_ERROR;
+    }
+    output = output_operand->data;
+
+    av_assert0(channel == dense_params->input_num);
+
+    for (int y = 0; y < height; ++y) {
+        for (int x = 0; x < width; ++x) {
+            for (int n_filter = 0; n_filter < dense_params->output_num; ++n_filter) {
+                if (dense_params->has_bias)
+                    output[n_filter] = dense_params->biases[n_filter];
+                else
+                    output[n_filter] = 0.f;
+
+                for (int ch = 0; ch < dense_params->input_num; ++ch) {
+                    float input_pel;
+                    input_pel = input[y * src_linesize + x * dense_params->input_num + ch];
+                    output[n_filter] += input_pel * dense_params->kernel[n_filter*dense_params->input_num + ch];
+                }
+                switch (dense_params->activation){
+                case RELU:
+                    output[n_filter] = FFMAX(output[n_filter], 0.0);
+                    break;
+                case TANH:
+                    output[n_filter] = 2.0f / (1.0f + exp(-2.0f * output[n_filter])) - 1.0f;
+                    break;
+                case SIGMOID:
+                    output[n_filter] = 1.0f / (1.0f + exp(-output[n_filter]));
+                    break;
+                case NONE:
+                    break;
+                case LEAKY_RELU:
+                    output[n_filter] = FFMAX(output[n_filter], 0.0) + 0.2 * FFMIN(output[n_filter], 0.0);
+                }
+            }
+            output += dense_params->output_num;
+        }
+    }
+    return 0;
+}
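The nested loops in dnn_execute_layer_dense() amount to, for every spatial position, a matrix product over the channel dimension followed by a bias add and the activation; the TANH case above, 2/(1+exp(-2x))-1, is just tanh(x) written out. A hedged NumPy reference of the same computation (illustrative only, not part of the patch), with the kernel in the [output_num, input_num] order the loader produces:

    # dense_ref: NumPy reference for dnn_execute_layer_dense (illustrative)
    import numpy as np

    def dense_ref(inp, kernel, bias=None, activation="none"):
        # inp: [N, H, W, input_num], kernel: [output_num, input_num], bias: [output_num]
        out = np.einsum('nhwc,oc->nhwo', inp, kernel)
        if bias is not None:
            out = out + bias
        if activation == "relu":
            out = np.maximum(out, 0.0)
        elif activation == "tanh":
            out = np.tanh(out)
        elif activation == "sigmoid":
            out = 1.0 / (1.0 + np.exp(-out))
        elif activation == "leaky_relu":
            out = np.maximum(out, 0.0) + 0.2 * np.minimum(out, 0.0)
        return out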
diff --git a/libavfilter/dnn/dnn_backend_native_layer_dense.h b/libavfilter/dnn/dnn_backend_native_layer_dense.h
new file mode 100644
index 0000000000..f98284b154
--- /dev/null
+++ b/libavfilter/dnn/dnn_backend_native_layer_dense.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2020
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_DNN_DNN_BACKEND_NATIVE_LAYER_DENSE_H
+#define AVFILTER_DNN_DNN_BACKEND_NATIVE_LAYER_DENSE_H
+
+#include "dnn_backend_native.h"
+
+typedef struct DenseParams{
+    int32_t input_num, output_num;
+    DNNActivationFunc activation;
+    int32_t has_bias;
+    float *kernel;
+    float *biases;
+} DenseParams;
+
+int dnn_load_layer_dense(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num);
+int dnn_execute_layer_dense(DnnOperand *operands, const int32_t *input_operand_indexes,
+                            int32_t output_operand_index, const void *parameters, NativeContext *ctx);
+#endif
diff --git a/libavfilter/dnn/dnn_backend_native_layers.c b/libavfilter/dnn/dnn_backend_native_layers.c
index 4f42f62abb..638a94e9a3 100644
--- a/libavfilter/dnn/dnn_backend_native_layers.c
+++ b/libavfilter/dnn/dnn_backend_native_layers.c
@@ -27,6 +27,7 @@
 #include "dnn_backend_native_layer_mathbinary.h"
 #include "dnn_backend_native_layer_mathunary.h"
 #include "dnn_backend_native_layer_avgpool.h"
+#include "dnn_backend_native_layer_dense.h"
 
 LayerFunc layer_funcs[DLT_COUNT] = {
     {NULL, NULL},
@@ -37,4 +38,5 @@ LayerFunc layer_funcs[DLT_COUNT] = {
     {dnn_execute_layer_math_binary, dnn_load_layer_math_binary},
     {dnn_execute_layer_math_unary, dnn_load_layer_math_unary},
     {dnn_execute_layer_avg_pool, dnn_load_layer_avg_pool},
+    {dnn_execute_layer_dense, dnn_load_layer_dense},
 };
diff --git a/tests/dnn/dnn-layer-dense-test.c b/tests/dnn/dnn-layer-dense-test.c
new file mode 100644
index 0000000000..2c11ec5218
--- /dev/null
+++ b/tests/dnn/dnn-layer-dense-test.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2020
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "libavfilter/dnn/dnn_backend_native_layer_dense.h"
+
+#define EPSON 0.00001
+
+static int test(void)
+{
+    // the input data and expected data are generated with below python code.
+    /*
+    x = tf.placeholder(tf.float32, shape=[1, None, None, 3])
+    y = tf.layers.dense(x, 3, activation=tf.nn.tanh, bias_initializer=tf.keras.initializers.he_normal())
+    data = np.random.rand(1, 5, 6, 3);
+
+    sess=tf.Session()
+    sess.run(tf.global_variables_initializer())
+
+    weights = dict([(var.name, sess.run(var)) for var in tf.trainable_variables()])
+    kernel = weights['dense/kernel:0']
+    kernel = np.transpose(kernel, [1, 0])
+    print("kernel:")
+    print(kernel.shape)
+    print(list(kernel.flatten()))
+
+    bias = weights['dense/bias:0']
+    print("bias:")
+    print(bias.shape)
+    print(list(bias.flatten()))
+
+    output = sess.run(y, feed_dict={x: data})
+
+    print("input:")
+    print(data.shape)
+    print(list(data.flatten()))
+
+    print("output:")
+    print(output.shape)
+    print(list(output.flatten()))
+    */
+
+    DenseParams params;
+    DnnOperand operands[2];
+    int32_t input_indexes[1];
+    float input[1*5*6*3] = {
+        0.5552418686576308, 0.20653189262022464, 0.31115120939398877, 0.5897014433221428, 0.37340078861060655, 0.6470921693941893, 0.8039950367872679, 0.8762700891949274,
+        0.6556655583829558, 0.5911096107039339, 0.18640250865290997, 0.2803248779238966, 0.31586613136402053, 0.9447300740056483, 0.9443980824873418, 0.8158851991115941,
+        0.5631010340387631, 0.9407402251929046, 0.6485434876551682, 0.5631376966470001, 0.17581924875609634, 0.7033802439103178, 0.04802402495561675, 0.9183681450194972,
+        0.46059317944364, 0.07964160481596883, 0.871787076270302, 0.973743142324361, 0.15923146943258415, 0.8212946080584571, 0.5415954459227064, 0.9552813822803975,
+        0.4908552668172057, 0.33723691635292274, 0.46588057864910026, 0.8994239961321776, 0.09845220457674186, 0.1713400292123486, 0.39570294912818826, 0.08018956486392803,
+        0.5290478278169032, 0.7141906125920976, 0.0320878067840098, 0.6412406575332606, 0.0075712007102423096, 0.7150828462386156, 0.1311989216968138, 0.4706847944253756,
+        0.5447610794883336, 0.3430923933318001, 0.536082357943209, 0.4371629342483694, 0.40227962985019927, 0.3553806249465469, 0.031806622424259245, 0.7053916426174,
+        0.3261570237309813, 0.419500213292063, 0.3155691223480851, 0.05664028113178088, 0.3636491555914486, 0.8502419746667123, 0.9836596530684955, 0.1628681802975801,
+        0.09410832912479894, 0.28407218939480294, 0.7983417928813697, 0.24132158596506748, 0.8154729498062224, 0.29173768373895637, 0.13407102008052096, 0.18705786678800385,
+        0.7167943621295573, 0.09222004247174376, 0.2319220738766018, 0.17708964382285064, 0.1391440370249517, 0.3254088083499256, 0.4013916894718289, 0.4819742663322323,
+        0.15080103744648077, 0.9302407847555013, 0.9397597961319524, 0.5719200825550793, 0.9538938024682824, 0.9583882089203861, 0.5168861091262276, 0.1926396841842669,
+        0.6781176744337578, 0.719366447288566
+    };
+    float expected_output[1*5*6*3] = {
+        -0.3921688, -0.9243112, -0.29659146, -0.64000785, -0.9466343, -0.62125254, -0.71759033, -0.9171336, -0.735589, -0.34365994,
+        -0.92100817, -0.23903961, -0.8962277, -0.9521279, -0.90962386, -0.7488303, -0.9563761, -0.7701762, -0.40800542, -0.87684774,
+        -0.3339763, -0.6354543, -0.97068924, -0.6246325, -0.6992075, -0.9706726, -0.6818918, -0.51864433, -0.9592881, -0.51187396,
+        -0.7423632, -0.89911884, -0.7457824, -0.82009757, -0.96402895, -0.8235518, -0.61980766, -0.94494647, -0.5410502, -0.8281218,
+        -0.95508635, -0.8201453, -0.5937325, -0.8679507, -0.500767, -0.39430764, -0.93967676, -0.32183182, -0.58913624, -0.939717,
        -0.55179894, -0.55004454, -0.9214453, -0.4889004, -0.75294703, -0.9118363, -0.7200309, -0.3248641, -0.8878874, -0.18977344,
+        -0.8873837, -0.9571257, -0.90145934, -0.50521654, -0.93739635, -0.39051685, -0.61143184, -0.9591179, -0.605999, -0.40008977,
+        -0.92219675, -0.26732883, -0.19607787, -0.9172511, -0.07068595, -0.5409857, -0.9387041, -0.44181606, -0.4705004, -0.8899935,
+        -0.37997037, -0.66105115, -0.89754754, -0.68141997, -0.6324047, -0.886776, -0.65066385, -0.8334821, -0.94801456, -0.83297
+    };
+    float *output;
+    float kernel[3*3] = {
+        0.56611896, -0.5144603, -0.82600045, 0.19219112, 0.3835776, -0.7475352, 0.5209291, -0.6301091, -0.99442935};
+    float bias[3] = {-0.3654299, -1.5711838, -0.15546428};
+
+    params.activation = TANH;
+    params.has_bias = 1;
+    params.biases = bias;
+    params.input_num = 3;
+    params.kernel = kernel;
+    params.output_num = 3;
+
+    operands[0].data = input;
+    operands[0].dims[0] = 1;
+    operands[0].dims[1] = 5;
+    operands[0].dims[2] = 6;
+    operands[0].dims[3] = 3;
+    operands[1].data = NULL;
+
+    input_indexes[0] = 0;
+    dnn_execute_layer_dense(operands, input_indexes, 1, &params, NULL);
+
+    output = operands[1].data;
+    for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
+        if (fabs(output[i] - expected_output[i]) > EPSON) {
+            printf("at index %d, output: %f, expected_output: %f\n", i, output[i], expected_output[i]);
+            av_freep(&output);
+            return 1;
+        }
+    }
+
+    av_freep(&output);
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+    if (test())
+        return 1;
+
+    return 0;
+}
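The converter changes that follow recognize a dense layer by its MatMul node, both when tf.layers.dense lowers to a plain MatMul and when a 4-D input routes it through a Tensordot scope. A hedged TensorFlow 1.x sketch for producing a frozen graph that exercises this path, mirroring the snippet embedded in the test above; the node names and the freezing call are illustrative and should be checked against the actual graph, they are not part of the patch:

    # generate_dense_model: illustrative TF 1.x sketch (not part of the patch)
    import tensorflow as tf

    x = tf.placeholder(tf.float32, shape=[1, None, None, 3], name='dnn_in')
    y = tf.layers.dense(x, 3, activation=tf.nn.tanh, name='dnn_out')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # the output node is the activation op inside the layer scope;
        # inspect sess.graph_def if the name differs
        graph_def = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph_def, ['dnn_out/Tanh'])
        tf.train.write_graph(graph_def, '.', 'dense.pb', as_text=False)

The resulting dense.pb can then be converted to the native format with tools/python/convert_from_tensorflow.py (see that script's argument parser for the exact invocation).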
diff --git a/tools/python/convert_from_tensorflow.py b/tools/python/convert_from_tensorflow.py
index 1762091fdd..1a5d93fbb7 100644
--- a/tools/python/convert_from_tensorflow.py
+++ b/tools/python/convert_from_tensorflow.py
@@ -48,9 +48,9 @@ class Operand(object):
         self.used_count = self.used_count + 1
 
     def __str__(self):
-        return "{}: (name: {}, iotype: {}, dtype: {}, dims: ({},{},{},{}) used_count: {})".format(self.index,
+        return "{}: (name: {}, iotype: {}, dtype: {}, dims: {}, used_count: {})".format(self.index,
                 self.name, self.iotype2str[self.iotype], self.dtype2str[self.dtype],
-                self.dims[0], self.dims[1], self.dims[2], self.dims[3], self.used_count)
+                self.dims, self.used_count)
 
     def __lt__(self, other):
         return self.index < other.index
@@ -71,8 +71,10 @@ class TFConverter:
         self.converted_nodes = set()
         self.conv2d_scope_names = set()
         self.conv2d_scopename_inputname_dict = {}
+        self.dense_scope_names = set()
+        self.dense_scopename_inputname_dict = {}
         self.op2code = {'Conv2D':1, 'DepthToSpace':2, 'MirrorPad':3, 'Maximum':4,
-                        'MathBinary':5, 'MathUnary':6, 'AvgPool':7}
+                        'MathBinary':5, 'MathUnary':6, 'AvgPool':7, 'MatMul':8}
         self.mathbin2code = {'Sub':0, 'Add':1, 'Mul':2, 'RealDiv':3, 'Minimum':4, 'FloorMod':5}
         self.mathun2code = {'Abs':0, 'Sin':1, 'Cos':2, 'Tan':3, 'Asin':4,
                             'Acos':5, 'Atan':6, 'Sinh':7, 'Cosh':8, 'Tanh':9, 'Asinh':10,
@@ -126,6 +128,22 @@ class TFConverter:
 
         return knode, bnode, dnode, anode
 
+    def get_dense_params(self, dense_scope_name):
+        knode = self.name_node_dict[dense_scope_name + '/kernel']
+        bnode = self.name_node_dict.get(dense_scope_name + '/bias')
+        # the BiasAdd name may have been changed into the output name,
+        # if the activation is None and BiasAdd's next op is the last op, which is Identity
+        anode = None
+        if bnode:
+            if dense_scope_name + '/BiasAdd' in self.edges:
+                anode = self.edges[dense_scope_name + '/BiasAdd'][0]
+                if anode.op not in self.conv_activations:
+                    anode = None
+            else:
+                anode = None
+        return knode, bnode, anode
+
+
     def dump_complex_conv2d_to_file(self, node, f):
         assert(node.op == 'Conv2D')
         self.layer_number = self.layer_number + 1
@@ -181,6 +199,57 @@ class TFConverter:
         output_operand_index = self.add_operand(self.edges[bnode.name][0].name, Operand.IOTYPE_OUTPUT)
         np.array([input_operand_index, output_operand_index], dtype=np.uint32).tofile(f)
 
+    def dump_dense_to_file(self, node, f):
+        assert(node.op == 'MatMul')
+        self.layer_number = self.layer_number + 1
+        self.converted_nodes.add(node.name)
+
+        scope_name = TFConverter.get_scope_name(node.name)
+        # knode for kernel, bnode for bias, anode for activation
+        knode, bnode, anode = self.get_dense_params(scope_name.split('/')[0])
+
+        if bnode is not None:
+            has_bias = 1
+            btensor = bnode.attr['value'].tensor
+            if btensor.tensor_shape.dim[0].size == 1:
+                bias = struct.pack("f", btensor.float_val[0])
+            else:
+                bias = btensor.tensor_content
+        else:
+            has_bias = 0
+
+        if anode is not None:
+            activation = anode.op
+        else:
+            activation = 'None'
+
+        ktensor = knode.attr['value'].tensor
+        in_channels = ktensor.tensor_shape.dim[0].size
+        out_channels = ktensor.tensor_shape.dim[1].size
+        if in_channels * out_channels == 1:
+            kernel = np.float32(ktensor.float_val[0])
+        else:
+            kernel = np.frombuffer(ktensor.tensor_content, dtype=np.float32)
+            kernel = kernel.reshape(in_channels, out_channels)
+            kernel = np.transpose(kernel, [1, 0])
+
+        np.array([self.op2code[node.op], self.conv_activations[activation], in_channels, out_channels, has_bias], dtype=np.uint32).tofile(f)
+        kernel.tofile(f)
+        if has_bias:
+            f.write(bias)
+
+        input_name = self.dense_scopename_inputname_dict[scope_name.split('/')[0]]
+        input_operand_index = self.add_operand(input_name, Operand.IOTYPE_INPUT)
+
+        if anode is not None:
+            output_operand_index = self.add_operand(anode.name, Operand.IOTYPE_OUTPUT)
+        else:
+            if bnode is not None:
+                output_operand_index = self.add_operand(self.edges[bnode.name][0].name, Operand.IOTYPE_OUTPUT)
+            else:
+                output_operand_index = self.add_operand(self.edges[scope_name+'/concat_1'][0].name, Operand.IOTYPE_OUTPUT)
+        np.array([input_operand_index, output_operand_index], dtype=np.uint32).tofile(f)
+
 
     def dump_simple_conv2d_to_file(self, node, f):
         assert(node.op == 'Conv2D')
@@ -343,9 +412,19 @@ class TFConverter:
             if node.op == 'Conv2D':
                 self.dump_complex_conv2d_to_file(node, f)
                 continue
+            if self.in_dense_scope(node.name):
+                if node.op == 'MatMul':
+                    self.dump_dense_to_file(node, f)
+                    continue
 
             if node.op == 'Conv2D':
                 self.dump_simple_conv2d_to_file(node, f)
+                continue
+
+            if node.name in self.output_names:
+                input_name = self.id_different_scope_dict[node.name]
+                if TFConverter.get_scope_name(input_name)!=TFConverter.get_scope_name(node.name):
+                    continue
             if node.op == 'AvgPool':
                 self.dump_avg_pool_to_file(node, f)
             elif node.op == 'DepthToSpace':
@@ -367,7 +446,7 @@ class TFConverter:
             np.array([operand.index, len(operand.name)], dtype=np.uint32).tofile(f)
             f.write(operand.name.encode('utf-8'))
             np.array([operand.iotype, operand.dtype], dtype=np.uint32).tofile(f)
-            np.array([operand.dims[0], operand.dims[1], operand.dims[2], operand.dims[3]], dtype=np.uint32).tofile(f)
+            np.array(operand.dims, dtype=np.uint32).tofile(f)
 
 
     def dump_to_file(self):
@@ -396,6 +475,7 @@ class TFConverter:
 
 
     def remove_identity(self):
+        self.id_different_scope_dict = {}
         id_nodes = []
         id_dict = {}
         for node in self.nodes:
@@ -408,6 +488,7 @@ class TFConverter:
                     self.name_node_dict[input].name = name
                     self.name_node_dict[name] = self.name_node_dict[input]
                     del self.name_node_dict[input]
+                    self.id_different_scope_dict[name] = input
                 else:
                     id_dict[name] = input
 
@@ -449,8 +530,18 @@ class TFConverter:
 
         return False
 
-    def generate_conv2d_scope_info(self):
-        # mostly, conv2d is a sub block in graph, get the scope name
+    def in_dense_scope(self, name):
+        inner_scope = TFConverter.get_scope_name(name)
+        if inner_scope == "":
+            return False
+        for scope in self.dense_scope_names:
+            index = inner_scope.find(scope)
+            if index == 0:
+                return True
+        return False
+
+    def generate_sub_block_op_scope_info(self):
+        # mostly, conv2d/dense is a sub block in graph, get the scope name
         for node in self.nodes:
             if node.op == 'Conv2D':
                 scope = TFConverter.get_scope_name(node.name)
@@ -461,8 +552,17 @@ class TFConverter:
                 if scope + '/kernel' not in self.name_node_dict:
                     continue
                 self.conv2d_scope_names.add(scope)
+            elif node.op == 'MatMul':
+                scope = TFConverter.get_scope_name(node.name)
+                # for the case tf.layers.dense is called directly
+                if scope == '':
+                    continue
+                # for the case tf.layers.dense is called within a scope
+                if scope + '/kernel' not in self.name_node_dict and scope.split('/Tensordot')[0] + '/kernel' not in self.name_node_dict:
+                    continue
+                self.dense_scope_names.add(scope.split('/Tensordot')[0])
 
-        # get the input name to the conv2d sub block
+        # get the input name to the conv2d/dense sub block
         for node in self.nodes:
             scope = TFConverter.get_scope_name(node.name)
             if scope in self.conv2d_scope_names:
@@ -470,6 +570,16 @@ class TFConverter:
                     for inp in node.input:
                         if TFConverter.get_scope_name(inp) != scope:
                             self.conv2d_scopename_inputname_dict[scope] = inp
+            elif scope in self.dense_scope_names:
+                if node.op == 'MatMul' or node.op == 'Shape':
+                    for inp in node.input:
+                        if TFConverter.get_scope_name(inp) != scope:
+                            self.dense_scopename_inputname_dict[scope] = inp
+            elif scope.split('/Tensordot')[0] in self.dense_scope_names:
+                if node.op == 'Transpose':
+                    for inp in node.input:
+                        if TFConverter.get_scope_name(inp).find(scope)<0 and TFConverter.get_scope_name(inp).find(scope.split('/')[0])<0:
+                            self.dense_scopename_inputname_dict[scope.split('/Tensordot')[0]] = inp
 
 
     def run(self):
@@ -477,7 +587,7 @@ class TFConverter:
         self.generate_output_names()
         self.remove_identity()
         self.generate_edges()
-        self.generate_conv2d_scope_info()
+        self.generate_sub_block_op_scope_info()
 
         if self.dump4tb:
             self.dump_for_tensorboard()
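Once converted, a native model containing a dense layer is used like any other native model, for example through the dnn_processing filter. A hedged usage example, assuming the model was produced from the generation sketch above so that its input and output tensors are named dnn_in and dnn_out (adjust the names to match the actual model):

    ffmpeg -i input.png -vf dnn_processing=dnn_backend=native:model=dense.model:input=dnn_in:output=dnn_out output.png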