From patchwork Thu Sep  3 15:57:23 2020
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Xu Jun <xujunzz@sjtu.edu.cn>
X-Patchwork-Id: 22076
Return-Path: <ffmpeg-devel-bounces@ffmpeg.org>
X-Original-To: patchwork@ffaux-bg.ffmpeg.org
Delivered-To: patchwork@ffaux-bg.ffmpeg.org
Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100])
	by ffaux.localdomain (Postfix) with ESMTP id B055A44BCF7
	for <patchwork@ffaux-bg.ffmpeg.org>; Thu,  3 Sep 2020 18:57:40 +0300 (EEST)
Received: from [127.0.1.1] (localhost [127.0.0.1])
	by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 8C634680A18;
	Thu,  3 Sep 2020 18:57:40 +0300 (EEST)
X-Original-To: ffmpeg-devel@ffmpeg.org
Delivered-To: ffmpeg-devel@ffmpeg.org
Received: from smtp181.sjtu.edu.cn (smtp181.sjtu.edu.cn [202.120.2.181])
 by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 3DABD68034C
 for <ffmpeg-devel@ffmpeg.org>; Thu,  3 Sep 2020 18:57:33 +0300 (EEST)
Received: from proxy02.sjtu.edu.cn (smtp188.sjtu.edu.cn [202.120.2.188])
 by smtp181.sjtu.edu.cn (Postfix) with ESMTPS id CD8261008CBC2
 for <ffmpeg-devel@ffmpeg.org>; Thu,  3 Sep 2020 23:57:28 +0800 (CST)
Received: from localhost (localhost.localdomain [127.0.0.1])
 by proxy02.sjtu.edu.cn (Postfix) with ESMTP id CC988200B4498;
 Thu,  3 Sep 2020 23:57:28 +0800 (CST)
X-Virus-Scanned: amavisd-new at 
Received: from proxy02.sjtu.edu.cn ([127.0.0.1])
 by localhost (proxy02.sjtu.edu.cn [127.0.0.1]) (amavisd-new, port 10026)
 with ESMTP id IcPUZQuJ7gYW; Thu,  3 Sep 2020 23:57:28 +0800 (CST)
Received: from localhost.localdomain (unknown [202.120.39.204])
 (Authenticated sender: xujunzz@sjtu.edu.cn)
 by proxy02.sjtu.edu.cn (Postfix) with ESMTPSA id EB112200B448D;
 Thu,  3 Sep 2020 23:57:27 +0800 (CST)
From: xujunzz@sjtu.edu.cn
To: ffmpeg-devel@ffmpeg.org
Date: Thu,  3 Sep 2020 23:57:23 +0800
Message-Id: <20200903155724.167477-1-xujunzz@sjtu.edu.cn>
X-Mailer: git-send-email 2.28.0
MIME-Version: 1.0
Subject: [FFmpeg-devel] [PATCH 1/2] dnn_backend_native.c: parse options in
	native backend
X-BeenThere: ffmpeg-devel@ffmpeg.org
X-Mailman-Version: 2.1.20
Precedence: list
List-Id: FFmpeg development discussions and patches <ffmpeg-devel.ffmpeg.org>
List-Unsubscribe: <https://ffmpeg.org/mailman/options/ffmpeg-devel>,
 <mailto:ffmpeg-devel-request@ffmpeg.org?subject=unsubscribe>
List-Archive: <https://ffmpeg.org/pipermail/ffmpeg-devel>
List-Post: <mailto:ffmpeg-devel@ffmpeg.org>
List-Help: <mailto:ffmpeg-devel-request@ffmpeg.org?subject=help>
List-Subscribe: <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>,
 <mailto:ffmpeg-devel-request@ffmpeg.org?subject=subscribe>
Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Cc: xujunzz@sjtu.edu.cn
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

From: Xu Jun <xujunzz@sjtu.edu.cn>

Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
---
 libavfilter/dnn/dnn_backend_native.c | 22 ++++++++++++++++++++--
 libavfilter/dnn/dnn_backend_native.h | 13 +++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/libavfilter/dnn/dnn_backend_native.c b/libavfilter/dnn/dnn_backend_native.c
index a8fe6b94eb..83205aac72 100644
--- a/libavfilter/dnn/dnn_backend_native.c
+++ b/libavfilter/dnn/dnn_backend_native.c
@@ -31,7 +31,7 @@
 static const AVClass dnn_native_class = {
     .class_name = "dnn_native",
     .item_name  = av_default_item_name,
-    .option     = NULL,
+    .option     = dnn_native_options,
     .version    = LIBAVUTIL_VERSION_INT,
     .category   = AV_CLASS_CATEGORY_FILTER,
 };
@@ -112,6 +112,22 @@ static DNNReturnType set_input_native(void *model, DNNData *input, const char *i
     return DNN_SUCCESS;
 }
 
+//Reset ctx to its option defaults, then apply the "key=value" pairs ('&' separated) in options.
+//Returns the negative av_dict_parse_string() result, otherwise the av_opt_set_dict() result.
+static int dnn_parse_options(void *ctx, const char *options)
+{
+    AVDictionary *opts = NULL;
+    int ret = av_dict_parse_string(&opts, options, "=", "&", 0);
+    if (ret >= 0) {
+        //defaults are applied only once the string has parsed cleanly
+        av_opt_set_defaults(ctx);
+        ret = av_opt_set_dict(ctx, &opts);
+    }
+    //single exit: opts is released on both the failure and the success path
+    av_dict_free(&opts);
+    return ret;
+}
+
 // Loads model and its parameters that are stored in a binary file with following structure:
 // layers_num,layer_type,layer_parameterss,layer_type,layer_parameters...
 // For CONV layer: activation_function, input_num, output_num, kernel_size, kernel, biases
@@ -174,6 +190,9 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename, const char *optio
     }
 
     native_model->ctx.class = &dnn_native_class;
+    model->options = options;
+    if (dnn_parse_options(&native_model->ctx, model->options) < 0)
+        goto fail;
     model->model = (void *)native_model;
 
     avio_seek(model_file_context, file_size - 8, SEEK_SET);
@@ -248,7 +267,6 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename, const char *optio
 
     model->set_input = &set_input_native;
     model->get_input = &get_input_native;
-    model->options = options;
 
     return model;
 
diff --git a/libavfilter/dnn/dnn_backend_native.h b/libavfilter/dnn/dnn_backend_native.h
index 197f557dee..a19b9a4233 100644
--- a/libavfilter/dnn/dnn_backend_native.h
+++ b/libavfilter/dnn/dnn_backend_native.h
@@ -29,6 +29,7 @@
 
 #include "../dnn_interface.h"
 #include "libavformat/avio.h"
+#include "libavutil/opt.h"
 
 /**
  * the enum value of DNNLayerType should not be changed,
@@ -106,10 +107,22 @@ typedef struct InputParams{
     int height, width, channels;
 } InputParams;
 
+typedef struct NativeOptions{ //user-settable options of the native backend
+    int conv2d_threads; //written through the AV_OPT_TYPE_INT option "conv2d_threads", hence a plain int
+} NativeOptions;
+
 typedef struct NativeContext {
     const AVClass *class;
+    NativeOptions options;
 } NativeContext;
 
+#define OFFSET(x) offsetof(NativeContext, x) //byte offset of member x within NativeContext
+#define FLAGS AV_OPT_FLAG_FILTERING_PARAM //flag set applied to every entry of the table below
+static const AVOption dnn_native_options[] = { //option table of the native backend context
+    { "conv2d_threads", "threads num for conv2d layer", OFFSET(options.conv2d_threads), AV_OPT_TYPE_INT,  { .i64 = 0 }, INT_MIN, INT_MAX, FLAGS }, //default 0, any int accepted
+    { NULL }, //terminating entry
+}; //NOTE(review): a static definition in a header gives every including file its own copy
+
 // Represents simple feed-forward convolutional network.
 typedef struct NativeModel{
     NativeContext ctx;

From patchwork Thu Sep  3 15:57:24 2020
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Xu Jun <xujunzz@sjtu.edu.cn>
X-Patchwork-Id: 22077
Return-Path: <ffmpeg-devel-bounces@ffmpeg.org>
X-Original-To: patchwork@ffaux-bg.ffmpeg.org
Delivered-To: patchwork@ffaux-bg.ffmpeg.org
Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100])
	by ffaux.localdomain (Postfix) with ESMTP id EFC9644BCF7
	for <patchwork@ffaux-bg.ffmpeg.org>; Thu,  3 Sep 2020 18:57:43 +0300 (EEST)
Received: from [127.0.1.1] (localhost [127.0.0.1])
	by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id D7DC368AAFD;
	Thu,  3 Sep 2020 18:57:43 +0300 (EEST)
X-Original-To: ffmpeg-devel@ffmpeg.org
Delivered-To: ffmpeg-devel@ffmpeg.org
Received: from smtp181.sjtu.edu.cn (smtp181.sjtu.edu.cn [202.120.2.181])
 by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id A1A47689248
 for <ffmpeg-devel@ffmpeg.org>; Thu,  3 Sep 2020 18:57:37 +0300 (EEST)
Received: from proxy02.sjtu.edu.cn (smtp188.sjtu.edu.cn [202.120.2.188])
 by smtp181.sjtu.edu.cn (Postfix) with ESMTPS id 0DC8E1008CBCD
 for <ffmpeg-devel@ffmpeg.org>; Thu,  3 Sep 2020 23:57:35 +0800 (CST)
Received: from localhost (localhost.localdomain [127.0.0.1])
 by proxy02.sjtu.edu.cn (Postfix) with ESMTP id 0DA27200B4498;
 Thu,  3 Sep 2020 23:57:35 +0800 (CST)
X-Virus-Scanned: amavisd-new at 
Received: from proxy02.sjtu.edu.cn ([127.0.0.1])
 by localhost (proxy02.sjtu.edu.cn [127.0.0.1]) (amavisd-new, port 10026)
 with ESMTP id sabr97KA6Rwv; Thu,  3 Sep 2020 23:57:34 +0800 (CST)
Received: from localhost.localdomain (unknown [202.120.39.204])
 (Authenticated sender: xujunzz@sjtu.edu.cn)
 by proxy02.sjtu.edu.cn (Postfix) with ESMTPSA id 26EF4200B448D;
 Thu,  3 Sep 2020 23:57:33 +0800 (CST)
From: xujunzz@sjtu.edu.cn
To: ffmpeg-devel@ffmpeg.org
Date: Thu,  3 Sep 2020 23:57:24 +0800
Message-Id: <20200903155724.167477-2-xujunzz@sjtu.edu.cn>
X-Mailer: git-send-email 2.28.0
In-Reply-To: <20200903155724.167477-1-xujunzz@sjtu.edu.cn>
References: <20200903155724.167477-1-xujunzz@sjtu.edu.cn>
MIME-Version: 1.0
Subject: [FFmpeg-devel] [PATCH 2/2] Add multithread function for
	dnn_backend_native_layer_conv2d.c
X-BeenThere: ffmpeg-devel@ffmpeg.org
X-Mailman-Version: 2.1.20
Precedence: list
List-Id: FFmpeg development discussions and patches <ffmpeg-devel.ffmpeg.org>
List-Unsubscribe: <https://ffmpeg.org/mailman/options/ffmpeg-devel>,
 <mailto:ffmpeg-devel-request@ffmpeg.org?subject=unsubscribe>
List-Archive: <https://ffmpeg.org/pipermail/ffmpeg-devel>
List-Post: <mailto:ffmpeg-devel@ffmpeg.org>
List-Help: <mailto:ffmpeg-devel-request@ffmpeg.org?subject=help>
List-Subscribe: <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>,
 <mailto:ffmpeg-devel-request@ffmpeg.org?subject=subscribe>
Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Cc: xujunzz@sjtu.edu.cn
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

From: Xu Jun <xujunzz@sjtu.edu.cn>

Use pthreads to run dnn_execute_layer_conv2d on multiple threads.
This can be tested with the command "./ffmpeg_g -i input.png -vf \
format=yuvj420p,dnn_processing=dnn_backend=native:model= \
espcn.model:input=x:output=y:options=conv2d_threads=23 \
 -y sr_native.jpg -benchmark"

before patch: utime=11.238s stime=0.005s rtime=11.248s
after patch:  utime=20.817s stime=0.047s rtime=1.051s
on my 3900X 12c24t @4.2GHz

The increase in utime is expected: Hyper-Threading exposes twice as
many logical cores as physical cores, but the CPU's computing
throughput improves by less than a factor of two. Because utime is
the sum of the run time on all logical cores, using a thread count
close to the number of logical cores roughly doubles utime while
reducing rtime by less than half on Hyper-Threading CPUs.

Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
---
 .../dnn/dnn_backend_native_layer_conv2d.c     | 92 ++++++++++++++++---
 1 file changed, 81 insertions(+), 11 deletions(-)

diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
index d079795bf8..8da99540ed 100644
--- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
+++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
@@ -19,10 +19,27 @@
  */
 
 #include "libavutil/avassert.h"
+#include "libavutil/thread.h"
+#include "libavutil/cpu.h"
 #include "dnn_backend_native_layer_conv2d.h"
 
 #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
 
+//parameters shared by all worker threads of one conv2d layer execution
+typedef struct thread_common_param{
+    DnnOperand *operands; //operand array, indexed by the two index fields below
+    const int32_t *input_operand_indexes; //the worker reads element 0 only
+    int32_t output_operand_index; //index of the operand that receives the result
+    const void *parameters; //cast to const ConvolutionalParams * by the worker
+    NativeContext *ctx; //passed to av_log() for error messages
+    int thread_num; //number of slices the output rows are divided into
+} thread_common_param;
+
+typedef struct thread_param{ //argument handed to one worker thread
+    thread_common_param *thread_common_param; //settings shared by all workers
+    int thread_index; //which slice this worker handles, 0 .. thread_num - 1
+} thread_param;
+
 int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num)
 {
     ConvolutionalParams *conv_params;
@@ -88,17 +105,20 @@ int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int fil
     return dnn_size;
 }
 
-int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
-                             int32_t output_operand_index, const void *parameters, NativeContext *ctx)
+static void * dnn_execute_layer_conv2d_thread(void *threadarg)
 {
+    //pass parameters
+    thread_param *thread_param = (struct thread_param *)threadarg;
+    thread_common_param *thread_common_param = thread_param->thread_common_param;
+    DnnOperand *operands = thread_common_param->operands;
     float *output;
-    int32_t input_operand_index = input_operand_indexes[0];
+    int32_t input_operand_index = thread_common_param->input_operand_indexes[0];
     int number = operands[input_operand_index].dims[0];
     int height = operands[input_operand_index].dims[1];
     int width = operands[input_operand_index].dims[2];
     int channel = operands[input_operand_index].dims[3];
     const float *input = operands[input_operand_index].data;
-    const ConvolutionalParams *conv_params = (const ConvolutionalParams *)parameters;
+    const ConvolutionalParams *conv_params = (const ConvolutionalParams *)(thread_common_param->parameters);
 
     int radius = conv_params->kernel_size >> 1;
     int src_linesize = width * conv_params->input_num;
@@ -106,7 +126,11 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
     int filter_size = conv_params->kernel_size * filter_linesize;
     int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
 
-    DnnOperand *output_operand = &operands[output_operand_index];
+    int thread_stride = (height - pad_size * 2) / thread_common_param->thread_num;
+    int thread_start = thread_stride * thread_param->thread_index + pad_size;
+    int thread_end = (thread_param->thread_index == thread_common_param->thread_num - 1) ? (height - pad_size) : (thread_start + thread_stride);
+
+    DnnOperand *output_operand = &operands[thread_common_param->output_operand_index];
     output_operand->dims[0] = number;
     output_operand->dims[1] = height - pad_size * 2;
     output_operand->dims[2] = width - pad_size * 2;
@@ -114,19 +138,21 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
     output_operand->data_type = operands[input_operand_index].data_type;
     output_operand->length = calculate_operand_data_length(output_operand);
     if (output_operand->length <= 0) {
-        av_log(ctx, AV_LOG_ERROR, "The output data length overflow\n");
-        return DNN_ERROR;
+        av_log(thread_common_param->ctx, AV_LOG_ERROR, "The output data length overflow\n");
+        return (void *)DNN_ERROR;
     }
     output_operand->data = av_realloc(output_operand->data, output_operand->length);
     if (!output_operand->data) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
-        return DNN_ERROR;
+        av_log(thread_common_param->ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
+        return (void *)DNN_ERROR;
     }
+
     output = output_operand->data;
+    output += (conv_params->output_num) * (width - 2 * pad_size) * (thread_start - pad_size);
 
     av_assert0(channel == conv_params->input_num);
 
-    for (int y = pad_size; y < height - pad_size; ++y) {
+    for (int y = thread_start; y < thread_end; ++y) {
         for (int x = pad_size; x < width - pad_size; ++x) {
             for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) {
                 if (conv_params->has_bias)
@@ -174,5 +200,49 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
             output += conv_params->output_num;
         }
     }
-    return 0;
+    return (void *)0;
+}
+
+
+//Split the conv2d layer into row slices, run one worker thread per slice, then join them all.
+//Returns 0 on success, DNN_ERROR if setup fails, a thread cannot be started, or a worker fails.
+int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
+                             int32_t output_operand_index, const void *parameters, NativeContext *ctx)
+{
+    //a request that is <= 0 or above the logical core count falls back to av_cpu_count() + 1
+    int requested = (int)ctx->options.conv2d_threads;
+    int thread_num = (requested <= 0 || requested > av_cpu_count()) ? av_cpu_count() + 1 : requested;
+    pthread_t *thread_id = av_malloc_array(thread_num, sizeof(*thread_id));
+    //one array of per-thread arguments, sized from the element type and not from a pointer
+    thread_param *params = av_malloc_array(thread_num, sizeof(*params));
+    thread_common_param common = {
+        .operands = operands, .input_operand_indexes = input_operand_indexes,
+        .output_operand_index = output_operand_index, .parameters = parameters,
+        .ctx = ctx, .thread_num = thread_num,
+    };
+    int created = 0, error_flag = 0;
+
+    if (!thread_id || !params) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to allocate memory for conv2d threads\n");
+        error_flag = DNN_ERROR;
+        thread_num = 0; //start nothing, go straight to cleanup
+    }
+    //NOTE(review): each worker av_realloc()s the shared output operand itself - confirm this is safe
+    for (; created < thread_num; created++) {
+        params[created].thread_common_param = &common;
+        params[created].thread_index = created;
+        if (pthread_create(&thread_id[created], NULL, dnn_execute_layer_conv2d_thread, &params[created])) {
+            error_flag = DNN_ERROR; //this slice is never computed, so the layer output is incomplete
+            break;
+        }
+    }
+    //join only the threads that were really started; any non-zero worker result is an error
+    for (int i = 0; i < created; i++) {
+        void *res = NULL;
+        if (pthread_join(thread_id[i], &res) || (intptr_t)res != 0)
+            error_flag = DNN_ERROR;
+    }
+    av_free(params);
+    av_free(thread_id);
+    return error_flag;
 }