diff mbox series

[FFmpeg-devel,1/3,GSoC] Add multithread function for dnn_backend_native_layer_conv2d.c

Message ID 20200831170341.879003-1-xujunzz@sjtu.edu.cn
State New
Series [FFmpeg-devel,1/3,GSoC] Add multithread function for dnn_backend_native_layer_conv2d.c

Checks

Context Check Description
andriy/default pending
andriy/make success Make finished
andriy/make_fate success Make fate finished

Commit Message

Xu Jun Aug. 31, 2020, 5:03 p.m. UTC
From: Xu Jun <xujunzz@sjtu.edu.cn>

Use pthread to multithread dnn_execute_layer_conv2d.
Can be tested with command "./ffmpeg_g -i input.png -vf \
format=yuvj420p,dnn_processing=dnn_backend=native:model= \
espcn.model:input=x:output=y -y sr_native.jpg -benchmark"

before patch: utime=11.238s stime=0.005s rtime=11.248s
after patch:  utime=20.817s stime=0.047s rtime=1.051s

Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
---
 .../dnn/dnn_backend_native_layer_conv2d.c     | 95 ++++++++++++++++---
 1 file changed, 84 insertions(+), 11 deletions(-)

Comments

Mark Thompson Aug. 31, 2020, 8:41 p.m. UTC | #1
On 31/08/2020 18:03, xujunzz@sjtu.edu.cn wrote:
> From: Xu Jun <xujunzz@sjtu.edu.cn>
> 
> Use pthread to multithread dnn_execute_layer_conv2d.
> Can be tested with command "./ffmpeg_g -i input.png -vf \
> format=yuvj420p,dnn_processing=dnn_backend=native:model= \
> espcn.model:input=x:output=y -y sr_native.jpg -benchmark"
> 
> before patch: utime=11.238s stime=0.005s rtime=11.248s
> after patch:  utime=20.817s stime=0.047s rtime=1.051s

Can you explain why it uses almost twice as much total CPU time after the patch?  That seems rather more than can be explained away as scheduling overhead.

If it's actually doing significantly more then maybe you want to document somewhere that enabling threading will improve latency at the cost of throughput.

- Mark
Xu Jun Sept. 1, 2020, 2:35 p.m. UTC | #2
Hi, Mark

----- Original Message -----
> From: "Mark Thompson" <sw@jkqxz.net>
> To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
> Sent: Tuesday, September 1, 2020 4:41:06 AM
> Subject: Re: [FFmpeg-devel] [PATCH 1/3][GSoC] Add mutithread function for dnn_backend_native_layer_conv2d.c

> On 31/08/2020 18:03, xujunzz@sjtu.edu.cn wrote:
>> From: Xu Jun <xujunzz@sjtu.edu.cn>
>> 
>> Use pthread to multithread dnn_execute_layer_conv2d.
>> Can be tested with command "./ffmpeg_g -i input.png -vf \
>> format=yuvj420p,dnn_processing=dnn_backend=native:model= \
>> espcn.model:input=x:output=y -y sr_native.jpg -benchmark"
>> 
>> before patch: utime=11.238s stime=0.005s rtime=11.248s
>> after patch:  utime=20.817s stime=0.047s rtime=1.051s
> 
> Can you explain why it uses almost twice as much total CPU time after the patch?
> That seems rather more than can be explained away as scheduling overhead.
> 
> If it's actually doing significantly more then maybe you want to document
> somewhere that enabling threading will improve latency at the cost of
> throughput.

I have done some tests and found that utime is strongly correlated with CPU Hyper-Threading.

When I turn off Hyper-Threading with the command "echo off > /sys/devices/system/cpu/smt/control" as root, the utime stays stable no matter how many threads I create, and is the same as before the patch.

When Hyper-Threading is on, once the number of threads I create gets close to (or exceeds) the number of physical cores my CPU has, the utime grows accordingly. When I use as many threads as there are logical cores, the utime is twice that of before the patch.

Therefore, I think Hyper-Threading doubles the number of logical cores while the computing power is not doubled. And since ffmpeg's utime sums the runtime over all logical cores, it comes out as twice that of before the patch.

In the next version, I will expose an option for the user to choose how many threads the native backend uses. I plan to set the default thread count to the number of physical cores minus one, to get better performance without increasing utime much on platforms that support Hyper-Threading.

As for the rtime, setting the thread count to the number of logical cores minus one gives about a 20%-30% performance improvement over setting it to the number of physical cores minus one in my tests.

- Xu Jun

> 
> - Mark
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Paul B Mahol Sept. 1, 2020, 2:46 p.m. UTC | #3
On 9/1/20, Xu Jun <xujunzz@sjtu.edu.cn> wrote:
> Hi, Mark
>
> ----- Original Message -----
>> From: "Mark Thompson" <sw@jkqxz.net>
>> To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
>> Sent: Tuesday, September 1, 2020 4:41:06 AM
>> Subject: Re: [FFmpeg-devel] [PATCH 1/3][GSoC] Add mutithread function for
>> dnn_backend_native_layer_conv2d.c
>
>> On 31/08/2020 18:03, xujunzz@sjtu.edu.cn wrote:
>>> From: Xu Jun <xujunzz@sjtu.edu.cn>
>>>
>>> Use pthread to multithread dnn_execute_layer_conv2d.
>>> Can be tested with command "./ffmpeg_g -i input.png -vf \
>>> format=yuvj420p,dnn_processing=dnn_backend=native:model= \
>>> espcn.model:input=x:output=y -y sr_native.jpg -benchmark"
>>>
>>> before patch: utime=11.238s stime=0.005s rtime=11.248s
>>> after patch:  utime=20.817s stime=0.047s rtime=1.051s
>>
>> Can you explain why it uses almost twice as much total CPU time after the
>> patch?
>> That seems rather more than can be explained away as scheduling overhead.
>>
>> If it's actually doing significantly more then maybe you want to document
>> somewhere that enabling threading will improve latency at the cost of
>> throughput.
>
> I have done some test and find that utime is strongly correlated with CPU
> HyperThreading technology.
>
> When I turn off my CPU HyperThreading technology using command "echo off >
> /sys/devices/system/cpu/smt/control" in root user, the utime gets stable
> whatever the number of threads I have created, and is same to that before
> patch.
>
> When CPU HyperThreading technology is on, once the number of threads I
> create gets close to physical cores' number my cpu has, or even bigger, the
> utime will get bigger simultaneously. When I use as many threads as the
> logical cores' number of my cpu, the utime will be twice of that before
> patch.
>
> Therefore, I think HyperThreading technology make the logical cores twice
> the physical cores while the counting power is not twiced. And for ffmpeg
> utime, it sums all logical cores' runtime. So it seems to be twice of that
> before patch.
>
> In the next version, I will open an API for user to choose how many threads
> to use in native backend. And I'm going to set the default threads number to
> physical cores' number - 1 in order to get better performance while not
> increasing utime much on the plantforms which support HyperThreading.

The -threads option is already available for filters that use slice threading.

Make sure that your threads do not share the same memory for reading/writing.

>
> As for the rtime, setting threads' number to logical cores - 1 will get
> about 20%-30% performance improvement over setting threads' number to
> physical cores - 1 in my test.
>
> - Xu Jun
>
>>
>> - Mark
Xu Jun Sept. 2, 2020, 1:55 p.m. UTC | #4
Hi, Paul

----- Original Message -----
> From: "Paul B Mahol" <onemda@gmail.com>
> To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
> Sent: Tuesday, September 1, 2020 10:46:54 PM
> Subject: Re: [FFmpeg-devel] [PATCH 1/3][GSoC] Add mutithread function for	dnn_backend_native_layer_conv2d.c

> On 9/1/20, Xu Jun <xujunzz@sjtu.edu.cn> wrote:
>> Hi, Mark
>>
>> ----- Original Message -----
>>> From: "Mark Thompson" <sw@jkqxz.net>
>>> To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
>>> Sent: Tuesday, September 1, 2020 4:41:06 AM
>>> Subject: Re: [FFmpeg-devel] [PATCH 1/3][GSoC] Add mutithread function for
>>> dnn_backend_native_layer_conv2d.c
>>
>>> On 31/08/2020 18:03, xujunzz@sjtu.edu.cn wrote:
>>>> From: Xu Jun <xujunzz@sjtu.edu.cn>
>>>>
>>>> Use pthread to multithread dnn_execute_layer_conv2d.
>>>> Can be tested with command "./ffmpeg_g -i input.png -vf \
>>>> format=yuvj420p,dnn_processing=dnn_backend=native:model= \
>>>> espcn.model:input=x:output=y -y sr_native.jpg -benchmark"
>>>>
>>>> before patch: utime=11.238s stime=0.005s rtime=11.248s
>>>> after patch:  utime=20.817s stime=0.047s rtime=1.051s
>>>
>>> Can you explain why it uses almost twice as much total CPU time after the
>>> patch?
>>> That seems rather more than can be explained away as scheduling overhead.
>>>
>>> If it's actually doing significantly more then maybe you want to document
>>> somewhere that enabling threading will improve latency at the cost of
>>> throughput.
>>
>> I have done some test and find that utime is strongly correlated with CPU
>> HyperThreading technology.
>>
>> When I turn off my CPU HyperThreading technology using command "echo off >
>> /sys/devices/system/cpu/smt/control" in root user, the utime gets stable
>> whatever the number of threads I have created, and is same to that before
>> patch.
>>
>> When CPU HyperThreading technology is on, once the number of threads I
>> create gets close to physical cores' number my cpu has, or even bigger, the
>> utime will get bigger simultaneously. When I use as many threads as the
>> logical cores' number of my cpu, the utime will be twice of that before
>> patch.
>>
>> Therefore, I think HyperThreading technology make the logical cores twice
>> the physical cores while the counting power is not twiced. And for ffmpeg
>> utime, it sums all logical cores' runtime. So it seems to be twice of that
>> before patch.
>>
>> In the next version, I will open an API for user to choose how many threads
>> to use in native backend. And I'm going to set the default threads number to
>> physical cores' number - 1 in order to get better performance while not
>> increasing utime much on the plantforms which support HyperThreading.
> 
> -threads option is already available for filters that use slice threads.

Actually, the native backend of the DNN module in libavfilter does not support slice threading, so it does not use the -threads option. The thread function I added only takes effect in the conv2d layer of DNN's native backend, and is not the same as slice threading at the filter level.

> 
> Make sure that your threads do not share same memory for reading/writting.

I will carefully think about the timing of memory accesses while the program runs, and run adequate tests to avoid such bugs.

- Xu Jun

> 
>>
>> As for the rtime, setting threads' number to logical cores - 1 will get
>> about 20%-30% performance improvement over setting threads' number to
>> physical cores - 1 in my test.
>>
>> - Xu Jun
>>
>>>
>>> - Mark

Patch

diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
index d079795bf8..570b974052 100644
--- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
+++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
@@ -19,10 +19,23 @@ 
  */
 
 #include "libavutil/avassert.h"
+#include "libavutil/thread.h"
+#include "libavutil/cpu.h"
 #include "dnn_backend_native_layer_conv2d.h"
 
 #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
 
+//struct to pass parameters
+typedef struct thread_data{
+    DnnOperand *operands;
+    const int32_t *input_operand_indexes;
+    int32_t output_operand_index;
+    const void *parameters;
+    NativeContext *ctx;
+    int32_t thread_num;
+    int32_t thread_index;
+} thread_data;
+
 int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num)
 {
     ConvolutionalParams *conv_params;
@@ -88,17 +101,27 @@  int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int fil
     return dnn_size;
 }
 
-int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
-                             int32_t output_operand_index, const void *parameters, NativeContext *ctx)
+static void * dnn_execute_layer_conv2d_thread(void *threadarg)
 {
+    static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
+    //use mutex to protect thread_index
+
+    //pass parameters
+    struct thread_data *thread_data = (struct thread_data *)threadarg;
+    DnnOperand *operands = thread_data->operands;
+
+    int thread_stride;
+    int thread_start;
+    int thread_end;
+
     float *output;
-    int32_t input_operand_index = input_operand_indexes[0];
+    int32_t input_operand_index = thread_data->input_operand_indexes[0];
     int number = operands[input_operand_index].dims[0];
     int height = operands[input_operand_index].dims[1];
     int width = operands[input_operand_index].dims[2];
     int channel = operands[input_operand_index].dims[3];
     const float *input = operands[input_operand_index].data;
-    const ConvolutionalParams *conv_params = (const ConvolutionalParams *)parameters;
+    const ConvolutionalParams *conv_params = (const ConvolutionalParams *)(thread_data->parameters);
 
     int radius = conv_params->kernel_size >> 1;
     int src_linesize = width * conv_params->input_num;
@@ -106,7 +129,7 @@  int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
     int filter_size = conv_params->kernel_size * filter_linesize;
     int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
 
-    DnnOperand *output_operand = &operands[output_operand_index];
+    DnnOperand *output_operand = &operands[thread_data->output_operand_index];
     output_operand->dims[0] = number;
     output_operand->dims[1] = height - pad_size * 2;
     output_operand->dims[2] = width - pad_size * 2;
@@ -114,19 +137,30 @@  int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
     output_operand->data_type = operands[input_operand_index].data_type;
     output_operand->length = calculate_operand_data_length(output_operand);
     if (output_operand->length <= 0) {
-        av_log(ctx, AV_LOG_ERROR, "The output data length overflow\n");
-        return DNN_ERROR;
+        av_log(thread_data->ctx, AV_LOG_ERROR, "The output data length overflow\n");
+        return (void *)DNN_ERROR;
     }
     output_operand->data = av_realloc(output_operand->data, output_operand->length);
     if (!output_operand->data) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
-        return DNN_ERROR;
+        av_log(thread_data->ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
+        return (void *)DNN_ERROR;
     }
+
+    //calculate area for this thread
+    thread_stride = (height - pad_size * 2) / thread_data->thread_num;
+    pthread_mutex_lock(&mtx);
+    thread_start = thread_stride * thread_data->thread_index + pad_size;
+    thread_end = (thread_data->thread_index == thread_data->thread_num - 1) ? (height - pad_size) : (thread_start + thread_stride);
+    thread_data->thread_index += 1;
+    pthread_mutex_unlock(&mtx);
+
     output = output_operand->data;
+    //calculate output start pos for this thread
+    output += (conv_params->output_num) * (width - 2 * pad_size) * (thread_start - pad_size);
 
     av_assert0(channel == conv_params->input_num);
 
-    for (int y = pad_size; y < height - pad_size; ++y) {
+    for (int y = thread_start; y < thread_end; ++y) {
         for (int x = pad_size; x < width - pad_size; ++x) {
             for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) {
                 if (conv_params->has_bias)
@@ -174,5 +208,44 @@  int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
             output += conv_params->output_num;
         }
     }
-    return 0;
+    return (void *)0;
+}
+
+
+int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
+                             int32_t output_operand_index, const void *parameters, NativeContext *ctx)
+{
+    //get cpu available cores, -1 for higher efficiency
+    const int thread_num = av_cpu_count() - 1;
+    pthread_t *thread_id = av_malloc(thread_num * sizeof(pthread_t));
+    void *res;
+    int error_flag = 0;
+
+    //struct used to pass parameters
+    struct thread_data *thread_data;
+    thread_data = av_malloc(sizeof(*thread_data));
+    thread_data->operands = operands;
+    thread_data->input_operand_indexes = input_operand_indexes;
+    thread_data->output_operand_index = output_operand_index;
+    thread_data->parameters = parameters;
+    thread_data->ctx = ctx;
+    thread_data->thread_num = thread_num;
+    thread_data->thread_index = 0;
+
+    //create threads
+    for (int i = 0; i < thread_num; i++){
+        pthread_create(&thread_id[i], NULL, dnn_execute_layer_conv2d_thread, (void *)thread_data);
+    }
+
+    //join threads, res gets function return
+    for (int i = 0; i < thread_num; i++){
+        pthread_join(thread_id[i], &res);
+        if ((int)res != 0)
+            error_flag = (int)res;
+    }
+
+    //release memory
+    av_free(thread_id);
+    av_free(thread_data);
+    return error_flag;
 }