From patchwork Sun Sep 6 12:28:51 2020
From: Xu Jun <xujunzz@sjtu.edu.cn>
To: ffmpeg-devel@ffmpeg.org
Date: Sun, 6 Sep 2020 20:28:51 +0800
Message-Id: <20200906122851.159892-1-xujunzz@sjtu.edu.cn>
Subject: [FFmpeg-devel] [PATCH v5 1/2] dnn_backend_native.c: parse options in native backend

From: Xu Jun <xujunzz@sjtu.edu.cn>

Parse the options string passed to the native backend with
av_opt_set_from_string(), so that backend options such as
conv2d_threads can be set through the AVOption system.

Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
---
v2: use av_opt_set_from_string() instead of the function dnn_parse_option().
v3: make all the options supported, not just conv2d_threads.
v4: move dnn_native_options and dnn_native_class from the .h to the .c file.
 libavfilter/dnn/dnn_backend_native.c | 22 +++++++++++++++++++---
 libavfilter/dnn/dnn_backend_native.h |  6 ++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/libavfilter/dnn/dnn_backend_native.c b/libavfilter/dnn/dnn_backend_native.c
index a8fe6b94eb..a9ecbdc88b 100644
--- a/libavfilter/dnn/dnn_backend_native.c
+++ b/libavfilter/dnn/dnn_backend_native.c
@@ -28,10 +28,17 @@
 #include "dnn_backend_native_layer_conv2d.h"
 #include "dnn_backend_native_layers.h"
 
-static const AVClass dnn_native_class = {
+#define OFFSET(x) offsetof(NativeContext, x)
+#define FLAGS AV_OPT_FLAG_FILTERING_PARAM
+static const AVOption dnn_native_options[] = {
+    { "conv2d_threads", "threads num for conv2d layer", OFFSET(options.conv2d_threads), AV_OPT_TYPE_INT, { .i64 = 0 }, INT_MIN, INT_MAX, FLAGS },
+    { NULL },
+};
+
+const AVClass dnn_native_class = {
     .class_name = "dnn_native",
     .item_name  = av_default_item_name,
-    .option     = NULL,
+    .option     = dnn_native_options,
     .version    = LIBAVUTIL_VERSION_INT,
     .category   = AV_CLASS_CATEGORY_FILTER,
 };
@@ -174,8 +181,18 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename, const char *optio
     }
 
     native_model->ctx.class = &dnn_native_class;
+    model->options = options;
+    if (av_opt_set_from_string(&native_model->ctx, model->options, NULL, "=", "&") < 0)
+        goto fail;
     model->model = (void *)native_model;
 
+#if !HAVE_PTHREAD_CANCEL
+    if (native_model->ctx.options.conv2d_threads > 1){
+        av_log(&native_model->ctx, AV_LOG_WARNING, "'conv2d_threads' option was set but it is not supported "
+               "on this build (pthread support is required)\n");
+    }
+#endif
+
     avio_seek(model_file_context, file_size - 8, SEEK_SET);
     native_model->layers_num = (int32_t)avio_rl32(model_file_context);
     native_model->operands_num = (int32_t)avio_rl32(model_file_context);
@@ -248,7 +265,6 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename, const char *optio
 
     model->set_input = &set_input_native;
     model->get_input = &get_input_native;
-    model->options = options;
 
     return model;
 
diff --git a/libavfilter/dnn/dnn_backend_native.h b/libavfilter/dnn/dnn_backend_native.h
index 197f557dee..b1f8f3d6bf 100644
--- a/libavfilter/dnn/dnn_backend_native.h
+++ b/libavfilter/dnn/dnn_backend_native.h
@@ -29,6 +29,7 @@
 
 #include "../dnn_interface.h"
 #include "libavformat/avio.h"
+#include "libavutil/opt.h"
 
 /**
  * the enum value of DNNLayerType should not be changed,
@@ -106,8 +107,13 @@ typedef struct InputParams{
     int height, width, channels;
 } InputParams;
 
+typedef struct NativeOptions{
+    uint32_t conv2d_threads;
+} NativeOptions;
+
 typedef struct NativeContext {
     const AVClass *class;
+    NativeOptions options;
 } NativeContext;
 
 // Represents simple feed-forward convolutional network.
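
For context (not part of the patch), here is a minimal standalone sketch of the
mechanism patch 1/2 relies on: an AVOption table attached to a context's AVClass,
filled from a "key=value&key=value" string via av_opt_set_from_string(), using the
same separators the patch passes. DemoContext, demo_options and demo_class are
illustrative names only; compile and link against libavutil.

    // Sketch: AVOption-based option parsing, as the native backend now does.
    #include <limits.h>
    #include <stddef.h>
    #include <stdio.h>
    #include <libavutil/opt.h>

    typedef struct DemoContext {
        const AVClass *class;      // must be the first field for av_log/av_opt
        int conv2d_threads;
    } DemoContext;

    #define OFFSET(x) offsetof(DemoContext, x)
    static const AVOption demo_options[] = {
        { "conv2d_threads", "thread count for conv2d", OFFSET(conv2d_threads),
          AV_OPT_TYPE_INT, { .i64 = 0 }, INT_MIN, INT_MAX, AV_OPT_FLAG_FILTERING_PARAM },
        { NULL },
    };

    static const AVClass demo_class = {
        .class_name = "demo",
        .item_name  = av_default_item_name,
        .option     = demo_options,
        .version    = LIBAVUTIL_VERSION_INT,
    };

    int main(void)
    {
        DemoContext ctx = { .class = &demo_class };

        av_opt_set_defaults(&ctx);
        // '=' separates key and value, '&' separates pairs, as in the patch.
        if (av_opt_set_from_string(&ctx, "conv2d_threads=8", NULL, "=", "&") < 0)
            return 1;

        printf("conv2d_threads = %d\n", ctx.conv2d_threads);
        return 0;
    }

With this in place, a filter option string such as options=conv2d_threads=8 ends up
directly in the context field, and unknown keys are rejected by the option system.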
From patchwork Sun Sep 6 12:28:53 2020
From: Xu Jun <xujunzz@sjtu.edu.cn>
To: ffmpeg-devel@ffmpeg.org
Date: Sun, 6 Sep 2020 20:28:53 +0800
Message-Id: <20200906122851.159892-2-xujunzz@sjtu.edu.cn>
In-Reply-To: <20200906122851.159892-1-xujunzz@sjtu.edu.cn>
References: <20200906122851.159892-1-xujunzz@sjtu.edu.cn>
Subject: [FFmpeg-devel] [PATCH v5 2/2] dnn_backend_native_layer_conv2d.c: add multithread function

From: Xu Jun <xujunzz@sjtu.edu.cn>

Use pthread to multithread dnn_execute_layer_conv2d.
Can be tested with the command:
./ffmpeg_g -i input.png -vf \
format=yuvj420p,dnn_processing=dnn_backend=native:model=espcn.model:\
input=x:output=y:options=conv2d_threads=23 \
-y sr_native.jpg -benchmark

before patch: utime=11.238s stime=0.005s rtime=11.248s
after patch:  utime=20.817s stime=0.047s rtime=1.051s
(measured on my 3900X, 12 cores / 24 threads @ 4.2 GHz)

The increase in utime is expected: Hyper-Threading exposes twice as many
logical cores as there are physical cores, but throughput improves by less
than a factor of two, and utime sums the CPU time spent on all logical
cores. Running with a thread count close to the number of logical cores
therefore roughly doubles utime while reducing rtime to well below half on
Hyper-Threaded CPUs. For example, 23 threads kept busy for about 1 s of
wall time accumulate roughly 21 s of utime even though rtime stays near 1 s.

Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
---
v2: add a check for HAVE_PTHREAD_CANCEL and adjust the FATE test
    dnn-layer-conv2d-test.c.
v4: use extern to reference dnn_native_class in dnn-layer-conv2d-test.c.
v5: return DNN_SUCCESS instead of 0.
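
Not part of the patch: a minimal standalone sketch (with made-up values for
height, pad_size and thread_num) of the row-partitioning arithmetic the patch
applies in dnn_execute_layer_conv2d_thread. The valid rows
[pad_size, height - pad_size) are split into thread_num contiguous slices, and
the last thread absorbs the remainder so every row is processed exactly once.

    #include <stdio.h>

    int main(void)
    {
        int height = 100, pad_size = 2, thread_num = 3;   // example dimensions
        int thread_stride = (height - pad_size * 2) / thread_num;

        for (int i = 0; i < thread_num; i++) {
            int thread_start = thread_stride * i + pad_size;
            int thread_end   = (i == thread_num - 1) ? (height - pad_size)
                                                     : (thread_start + thread_stride);
            // each thread also offsets its output pointer by
            // output_num * (width - 2 * pad_size) * (thread_start - pad_size)
            printf("thread %d: rows [%d, %d)\n", i, thread_start, thread_end);
        }
        return 0;
    }

For 100 rows with pad_size 2 and 3 threads this prints slices [2,34), [34,66)
and [66,98), i.e. all 96 valid rows with no overlap.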
 .../dnn/dnn_backend_native_layer_conv2d.c | 107 ++++++++++++++++--
 tests/dnn/dnn-layer-conv2d-test.c         |  14 ++-
 2 files changed, 108 insertions(+), 13 deletions(-)

diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
index d079795bf8..777a54db43 100644
--- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
+++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
@@ -19,10 +19,27 @@
  */
 
 #include "libavutil/avassert.h"
+#include "libavutil/thread.h"
+#include "libavutil/cpu.h"
 #include "dnn_backend_native_layer_conv2d.h"
 
 #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
 
+//struct to pass parameters
+typedef struct thread_common_param{
+    DnnOperand *operands;
+    const int32_t *input_operand_indexes;
+    int32_t output_operand_index;
+    const void *parameters;
+    NativeContext *ctx;
+    int thread_num;
+} thread_common_param;
+
+typedef struct thread_param{
+    thread_common_param *thread_common_param;
+    int thread_index;
+} thread_param;
+
 int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num)
 {
     ConvolutionalParams *conv_params;
@@ -88,17 +105,20 @@ int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int fil
     return dnn_size;
 }
 
-int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
-                             int32_t output_operand_index, const void *parameters, NativeContext *ctx)
+static void * dnn_execute_layer_conv2d_thread(void *threadarg)
 {
+    //pass parameters
+    thread_param *thread_param = (struct thread_param *)threadarg;
+    thread_common_param *thread_common_param = thread_param->thread_common_param;
+    DnnOperand *operands = thread_common_param->operands;
     float *output;
-    int32_t input_operand_index = input_operand_indexes[0];
+    int32_t input_operand_index = thread_common_param->input_operand_indexes[0];
     int number = operands[input_operand_index].dims[0];
     int height = operands[input_operand_index].dims[1];
     int width = operands[input_operand_index].dims[2];
     int channel = operands[input_operand_index].dims[3];
     const float *input = operands[input_operand_index].data;
-    const ConvolutionalParams *conv_params = (const ConvolutionalParams *)parameters;
+    const ConvolutionalParams *conv_params = (const ConvolutionalParams *)(thread_common_param->parameters);
 
     int radius = conv_params->kernel_size >> 1;
     int src_linesize = width * conv_params->input_num;
@@ -106,7 +126,11 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
     int filter_size = conv_params->kernel_size * filter_linesize;
     int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
 
-    DnnOperand *output_operand = &operands[output_operand_index];
+    int thread_stride = (height - pad_size * 2) / thread_common_param->thread_num;
+    int thread_start = thread_stride * thread_param->thread_index + pad_size;
+    int thread_end = (thread_param->thread_index == thread_common_param->thread_num - 1) ? (height - pad_size) : (thread_start + thread_stride);
+
+    DnnOperand *output_operand = &operands[thread_common_param->output_operand_index];
     output_operand->dims[0] = number;
     output_operand->dims[1] = height - pad_size * 2;
     output_operand->dims[2] = width - pad_size * 2;
@@ -114,19 +138,21 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
     output_operand->data_type = operands[input_operand_index].data_type;
     output_operand->length = calculate_operand_data_length(output_operand);
     if (output_operand->length <= 0) {
-        av_log(ctx, AV_LOG_ERROR, "The output data length overflow\n");
-        return DNN_ERROR;
+        av_log(thread_common_param->ctx, AV_LOG_ERROR, "The output data length overflow\n");
+        return (void *)DNN_ERROR;
     }
     output_operand->data = av_realloc(output_operand->data, output_operand->length);
     if (!output_operand->data) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
-        return DNN_ERROR;
+        av_log(thread_common_param->ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
+        return (void *)DNN_ERROR;
     }
+
     output = output_operand->data;
+    output += (conv_params->output_num) * (width - 2 * pad_size) * (thread_start - pad_size);
 
     av_assert0(channel == conv_params->input_num);
 
-    for (int y = pad_size; y < height - pad_size; ++y) {
+    for (int y = thread_start; y < thread_end; ++y) {
         for (int x = pad_size; x < width - pad_size; ++x) {
             for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) {
                 if (conv_params->has_bias)
@@ -174,5 +200,64 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
             output += conv_params->output_num;
         }
     }
-    return 0;
+    return (void *)DNN_SUCCESS;
+}
+
+
+int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
+                             int32_t output_operand_index, const void *parameters, NativeContext *ctx)
+{
+    int thread_num = (ctx->options.conv2d_threads <= 0 || ctx->options.conv2d_threads > av_cpu_count())
+        ? (av_cpu_count() + 1) : (ctx->options.conv2d_threads);
+#if HAVE_PTHREAD_CANCEL
+    pthread_t *thread_id = av_malloc(thread_num * sizeof(pthread_t));
+#endif
+    thread_param **thread_param = av_malloc(thread_num * sizeof(*thread_param));
+    void *res;
+    int error_flag = DNN_SUCCESS;
+
+    //struct used to pass parameters
+    thread_common_param thread_common_param;
+    thread_common_param.operands = operands;
+    thread_common_param.input_operand_indexes = input_operand_indexes;
+    thread_common_param.output_operand_index = output_operand_index;
+    thread_common_param.parameters = parameters;
+    thread_common_param.ctx = ctx;
+#if HAVE_PTHREAD_CANCEL
+    thread_common_param.thread_num = thread_num;
+
+    //create threads
+    for (int i = 0; i < thread_num; i++){
+        thread_param[i] = av_malloc(sizeof(thread_param));
+        thread_param[i]->thread_common_param = &thread_common_param;
+        thread_param[i]->thread_index = i;
+        pthread_create(&thread_id[i], NULL, dnn_execute_layer_conv2d_thread, (void *)thread_param[i]);
+    }
+
+    //join threads, res gets function return
+    for (int i = 0; i < thread_num; i++){
+        pthread_join(thread_id[i], &res);
+        if ((int)res != DNN_SUCCESS)
+            error_flag = (int)res;
+    }
+
+    //release memory
+    av_free(thread_id);
+
+    for (int i = 0; i < thread_num; i++){
+        av_free(thread_param[i]);
+    }
+#else
+    thread_common_param.thread_num = 1;
+    thread_param[0] = av_malloc(sizeof(thread_param));
+    thread_param[0]->thread_common_param = &thread_common_param;
+    thread_param[0]->thread_index = 0;
+    res = dnn_execute_layer_conv2d_thread((void *)thread_param[0]);
+    if ((int)res != DNN_SUCCESS)
+        error_flag = (int)res;
+    av_free(thread_param[0]);
+#endif
+
+    av_free(thread_param);
+    return error_flag;
 }
diff --git a/tests/dnn/dnn-layer-conv2d-test.c b/tests/dnn/dnn-layer-conv2d-test.c
index 836839cc64..378a05eafc 100644
--- a/tests/dnn/dnn-layer-conv2d-test.c
+++ b/tests/dnn/dnn-layer-conv2d-test.c
@@ -25,6 +25,8 @@
 
 #define EPSON 0.00001
 
+extern const AVClass dnn_native_class;
+
 static int test_with_same_dilate(void)
 {
     // the input data and expected data are generated with below python code.
@@ -96,6 +98,10 @@ static int test_with_same_dilate(void)
     };
     float bias[2] = { -1.6574852, -0.72915393 };
 
+    NativeContext ctx;
+    ctx.class = &dnn_native_class;
+    ctx.options.conv2d_threads = 1;
+
     params.activation = TANH;
     params.has_bias = 1;
     params.biases = bias;
@@ -114,7 +120,7 @@ static int test_with_same_dilate(void)
     operands[1].data = NULL;
 
     input_indexes[0] = 0;
-    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL);
+    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx);
 
     output = operands[1].data;
     for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
@@ -196,6 +202,10 @@ static int test_with_valid(void)
     };
     float bias[2] = { -0.4773722, -0.19620377 };
 
+    NativeContext ctx;
+    ctx.class = &dnn_native_class;
+    ctx.options.conv2d_threads = 1;
+
     params.activation = TANH;
     params.has_bias = 1;
     params.biases = bias;
@@ -214,7 +224,7 @@ static int test_with_valid(void)
     operands[1].data = NULL;
 
     input_indexes[0] = 0;
-    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL);
+    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx);
 
     output = operands[1].data;
     for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
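
As a standalone illustration (hypothetical names, not FFmpeg code) of the
create/join pattern the new dnn_execute_layer_conv2d wrapper uses: each worker
reports a status through pthread's void * return value, and the caller joins
every thread and keeps any failure before returning. intptr_t is used here for
the integer/pointer casts; that is a portability choice in this sketch, not
something the patch itself does. Build with -pthread.

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    #define WORKERS 4

    static void *worker(void *arg)
    {
        int slice = (int)(intptr_t)arg;
        // ... process one slice of rows here ...
        printf("worker %d done\n", slice);
        return (void *)(intptr_t)0;   // 0 = success, nonzero = error code
    }

    int main(void)
    {
        pthread_t tid[WORKERS];
        void *res;
        int status = 0;

        for (int i = 0; i < WORKERS; i++)
            pthread_create(&tid[i], NULL, worker, (void *)(intptr_t)i);

        // Join every thread, even after a failure, so no worker is leaked.
        for (int i = 0; i < WORKERS; i++) {
            pthread_join(tid[i], &res);
            if ((intptr_t)res != 0)
                status = (int)(intptr_t)res;
        }

        printf("overall status: %d\n", status);
        return status;
    }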