From patchwork Mon Aug 31 17:03:40 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Xu Jun X-Patchwork-Id: 22025 Return-Path: X-Original-To: patchwork@ffaux-bg.ffmpeg.org Delivered-To: patchwork@ffaux-bg.ffmpeg.org Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by ffaux.localdomain (Postfix) with ESMTP id 9020444A6B9 for ; Mon, 31 Aug 2020 20:04:50 +0300 (EEST) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 7508F68AB1D; Mon, 31 Aug 2020 20:04:50 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from smtp181.sjtu.edu.cn (smtp181.sjtu.edu.cn [202.120.2.181]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 87E456808E9 for ; Mon, 31 Aug 2020 20:04:43 +0300 (EEST) Received: from proxy02.sjtu.edu.cn (smtp188.sjtu.edu.cn [202.120.2.188]) by smtp181.sjtu.edu.cn (Postfix) with ESMTPS id 5BA751008CBC1 for ; Tue, 1 Sep 2020 01:04:39 +0800 (CST) Received: from localhost (localhost.localdomain [127.0.0.1]) by proxy02.sjtu.edu.cn (Postfix) with ESMTP id 4BE07200B4496; Tue, 1 Sep 2020 01:04:39 +0800 (CST) X-Virus-Scanned: amavisd-new at Received: from proxy02.sjtu.edu.cn ([127.0.0.1]) by localhost (proxy02.sjtu.edu.cn [127.0.0.1]) (amavisd-new, port 10026) with ESMTP id zahSrTqSbnoC; Tue, 1 Sep 2020 01:04:39 +0800 (CST) Received: from localhost.localdomain (unknown [202.120.39.204]) (Authenticated sender: xujunzz@sjtu.edu.cn) by proxy02.sjtu.edu.cn (Postfix) with ESMTPSA id 69A7E200B448D; Tue, 1 Sep 2020 01:04:37 +0800 (CST) From: xujunzz@sjtu.edu.cn To: ffmpeg-devel@ffmpeg.org Date: Tue, 1 Sep 2020 01:03:40 +0800 Message-Id: <20200831170341.879003-1-xujunzz@sjtu.edu.cn> X-Mailer: git-send-email 2.28.0 MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 1/3][GSoC] Add multithread function for dnn_backend_native_layer_conv2d.c X-BeenThere: ffmpeg-devel@ffmpeg.org 
X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: xujunzz@sjtu.edu.cn Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" From: Xu Jun Use pthread to multithread dnn_execute_layer_conv2d. Can be tested with command "./ffmpeg_g -i input.png -vf \ format=yuvj420p,dnn_processing=dnn_backend=native:model= \ espcn.model:input=x:output=y -y sr_native.jpg -benchmark" before patch: utime=11.238s stime=0.005s rtime=11.248s after patch: utime=20.817s stime=0.047s rtime=1.051s Signed-off-by: Xu Jun --- .../dnn/dnn_backend_native_layer_conv2d.c | 95 ++++++++++++++++--- 1 file changed, 84 insertions(+), 11 deletions(-) diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c index d079795bf8..570b974052 100644 --- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c +++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c @@ -19,10 +19,23 @@ */ #include "libavutil/avassert.h" +#include "libavutil/thread.h" +#include "libavutil/cpu.h" #include "dnn_backend_native_layer_conv2d.h" #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? 
(w - 1) : (x))) +//struct to pass parameters +typedef struct thread_data{ + DnnOperand *operands; + const int32_t *input_operand_indexes; + int32_t output_operand_index; + const void *parameters; + NativeContext *ctx; + int32_t thread_num; + int32_t thread_index; +} thread_data; + int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num) { ConvolutionalParams *conv_params; @@ -88,17 +101,27 @@ int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int fil return dnn_size; } -int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes, - int32_t output_operand_index, const void *parameters, NativeContext *ctx) +static void * dnn_execute_layer_conv2d_thread(void *threadarg) { + static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; + //use mutexe to protect thread_index + + //pass parameters + struct thread_data *thread_data = (struct thread_data *)threadarg; + DnnOperand *operands = thread_data->operands; + + int thread_stride; + int thread_start; + int thread_end; + float *output; - int32_t input_operand_index = input_operand_indexes[0]; + int32_t input_operand_index = thread_data->input_operand_indexes[0]; int number = operands[input_operand_index].dims[0]; int height = operands[input_operand_index].dims[1]; int width = operands[input_operand_index].dims[2]; int channel = operands[input_operand_index].dims[3]; const float *input = operands[input_operand_index].data; - const ConvolutionalParams *conv_params = (const ConvolutionalParams *)parameters; + const ConvolutionalParams *conv_params = (const ConvolutionalParams *)(thread_data->parameters); int radius = conv_params->kernel_size >> 1; int src_linesize = width * conv_params->input_num; @@ -106,7 +129,7 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_ int filter_size = conv_params->kernel_size * filter_linesize; int pad_size = (conv_params->padding_method == VALID) ? 
(conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0; - DnnOperand *output_operand = &operands[output_operand_index]; + DnnOperand *output_operand = &operands[thread_data->output_operand_index]; output_operand->dims[0] = number; output_operand->dims[1] = height - pad_size * 2; output_operand->dims[2] = width - pad_size * 2; @@ -114,19 +137,30 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_ output_operand->data_type = operands[input_operand_index].data_type; output_operand->length = calculate_operand_data_length(output_operand); if (output_operand->length <= 0) { - av_log(ctx, AV_LOG_ERROR, "The output data length overflow\n"); - return DNN_ERROR; + av_log(thread_data->ctx, AV_LOG_ERROR, "The output data length overflow\n"); + return (void *)DNN_ERROR; } output_operand->data = av_realloc(output_operand->data, output_operand->length); if (!output_operand->data) { - av_log(ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n"); - return DNN_ERROR; + av_log(thread_data->ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n"); + return (void *)DNN_ERROR; } + + //calculate area for this thread + thread_stride = (height - pad_size * 2) / thread_data->thread_num; + pthread_mutex_lock(&mtx); + thread_start = thread_stride * thread_data->thread_index + pad_size; + thread_end = (thread_data->thread_index == thread_data->thread_num - 1) ? 
(height - pad_size) : (thread_start + thread_stride); + thread_data->thread_index += 1; + pthread_mutex_unlock(&mtx); + output = output_operand->data; + //calculate output start pos for this thread + output += (conv_params->output_num) * (width - 2 * pad_size) * (thread_start - pad_size); av_assert0(channel == conv_params->input_num); - for (int y = pad_size; y < height - pad_size; ++y) { + for (int y = thread_start; y < thread_end; ++y) { for (int x = pad_size; x < width - pad_size; ++x) { for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) { if (conv_params->has_bias) @@ -174,5 +208,44 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_ output += conv_params->output_num; } } - return 0; + return (void *)0; +} + + +int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes, + int32_t output_operand_index, const void *parameters, NativeContext *ctx) +{ + //get cpu available cores, -1 for higher efficiency + const int thread_num = av_cpu_count() - 1; + pthread_t *thread_id = av_malloc(thread_num * sizeof(pthread_t)); + void *res; + int error_flag = 0; + + //struct used to pass parameters + struct thread_data *thread_data; + thread_data = av_malloc(sizeof(*thread_data)); + thread_data->operands = operands; + thread_data->input_operand_indexes = input_operand_indexes; + thread_data->output_operand_index = output_operand_index; + thread_data->parameters = parameters; + thread_data->ctx = ctx; + thread_data->thread_num = thread_num; + thread_data->thread_index = 0; + + //create threads + for (int i = 0; i < thread_num; i++){ + pthread_create(&thread_id[i], NULL, dnn_execute_layer_conv2d_thread, (void *)thread_data); + } + + //join threads, res gets function return + for (int i = 0; i < thread_num; i++){ + pthread_join(thread_id[i], &res); + if ((int)res != 0) + error_flag = (int)res; + } + + //release memory + av_free(thread_id); + av_free(thread_data); + return error_flag; } From patchwork 
Mon Aug 31 17:03:42 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Xu Jun X-Patchwork-Id: 22026 Return-Path: X-Original-To: patchwork@ffaux-bg.ffmpeg.org Delivered-To: patchwork@ffaux-bg.ffmpeg.org Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by ffaux.localdomain (Postfix) with ESMTP id 0C3B644ABC9 for ; Mon, 31 Aug 2020 20:05:11 +0300 (EEST) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id E072E68ABFD; Mon, 31 Aug 2020 20:05:10 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from smtp181.sjtu.edu.cn (smtp181.sjtu.edu.cn [202.120.2.181]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id BBFA568AB82 for ; Mon, 31 Aug 2020 20:05:04 +0300 (EEST) Received: from proxy02.sjtu.edu.cn (smtp188.sjtu.edu.cn [202.120.2.188]) by smtp181.sjtu.edu.cn (Postfix) with ESMTPS id 531071008CBC1 for ; Tue, 1 Sep 2020 01:05:02 +0800 (CST) Received: from localhost (localhost.localdomain [127.0.0.1]) by proxy02.sjtu.edu.cn (Postfix) with ESMTP id 16397200B4498; Tue, 1 Sep 2020 01:05:02 +0800 (CST) X-Virus-Scanned: amavisd-new at Received: from proxy02.sjtu.edu.cn ([127.0.0.1]) by localhost (proxy02.sjtu.edu.cn [127.0.0.1]) (amavisd-new, port 10026) with ESMTP id fbR57rSMtNJq; Tue, 1 Sep 2020 01:05:02 +0800 (CST) Received: from localhost.localdomain (unknown [202.120.39.204]) (Authenticated sender: xujunzz@sjtu.edu.cn) by proxy02.sjtu.edu.cn (Postfix) with ESMTPSA id 32D6C200B448D; Tue, 1 Sep 2020 01:05:00 +0800 (CST) From: xujunzz@sjtu.edu.cn To: ffmpeg-devel@ffmpeg.org Date: Tue, 1 Sep 2020 01:03:42 +0800 Message-Id: <20200831170341.879003-2-xujunzz@sjtu.edu.cn> X-Mailer: git-send-email 2.28.0 In-Reply-To: <20200831170341.879003-1-xujunzz@sjtu.edu.cn> References: <20200831170341.879003-1-xujunzz@sjtu.edu.cn> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 2/3][GSoC] Add x86-sse4 
optimization for dnn_execute_layer_conv2d X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: xujunzz@sjtu.edu.cn Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" From: Xu Jun Can be tested with command "./ffmpeg_g -i input.png -vf \ format=yuvj420p,dnn_processing=dnn_backend=native:model= \ espcn.model:input=x:output=y -y sr_native.jpg -benchmark \ -cpuflags 0x100" before patch: utime=20.817s stime=0.047s rtime=1.051s after patch: utime=3.744s stime=0.037s rtime=0.252s Signed-off-by: Xu Jun --- libavfilter/dnn/Makefile | 1 + .../dnn/dnn_backend_native_layer_conv2d.c | 123 ++++++++-- .../dnn_backend_native_layer_conv2d_x86.asm | 214 ++++++++++++++++++ 3 files changed, 314 insertions(+), 24 deletions(-) create mode 100644 libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm diff --git a/libavfilter/dnn/Makefile b/libavfilter/dnn/Makefile index e0957073ee..bdd334b192 100644 --- a/libavfilter/dnn/Makefile +++ b/libavfilter/dnn/Makefile @@ -8,6 +8,7 @@ OBJS-$(CONFIG_DNN) += dnn/dnn_backend_native_layer_dep OBJS-$(CONFIG_DNN) += dnn/dnn_backend_native_layer_maximum.o OBJS-$(CONFIG_DNN) += dnn/dnn_backend_native_layer_mathbinary.o OBJS-$(CONFIG_DNN) += dnn/dnn_backend_native_layer_mathunary.o +OBJS-$(CONFIG_DNN) += dnn/dnn_backend_native_layer_conv2d_x86.o DNN-OBJS-$(CONFIG_LIBTENSORFLOW) += dnn/dnn_backend_tf.o DNN-OBJS-$(CONFIG_LIBOPENVINO) += dnn/dnn_backend_openvino.o diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c index 570b974052..92cc5313dc 100644 --- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c +++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c @@ -21,6 +21,7 @@ #include "libavutil/avassert.h" #include "libavutil/thread.h" #include 
"libavutil/cpu.h" +#include "libavutil/x86/cpu.h" #include "dnn_backend_native_layer_conv2d.h" #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x))) @@ -34,8 +35,20 @@ typedef struct thread_data{ NativeContext *ctx; int32_t thread_num; int32_t thread_index; + int step; } thread_data; +typedef struct execute_data{ + int thread_start, thread_end, input_num, output_num, kernel_size, padding_method, dilation; + int pad_size, width, height, radius, src_linesize, filter_size, filter_linesize; + float *input; + float *output; + float *kernel; +} execute_data; + +void ff_dnn_execute_layer_conv2d_sse4(execute_data *execute_data); +void ff_dnn_execute_layer_conv2d_c(execute_data *execute_data); + int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num) { ConvolutionalParams *conv_params; @@ -101,6 +114,56 @@ int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int fil return dnn_size; } +void ff_dnn_execute_layer_conv2d_c(execute_data *execute_data){ + int thread_start = execute_data->thread_start; + int thread_end = execute_data->thread_end; + float *input = execute_data->input; + float *output = execute_data->output; + float *kernel = execute_data->kernel; + int input_num = execute_data->input_num; + int output_num = execute_data->output_num; + int kernel_size = execute_data->kernel_size; + int padding_method = execute_data->padding_method; + int dilation = execute_data->dilation; + int pad_size = execute_data->pad_size; + int width = execute_data->width; + int height = execute_data->height; + int radius = execute_data->radius; + int src_linesize = execute_data->src_linesize; + int filter_size = execute_data->filter_size; + int filter_linesize = execute_data->filter_linesize; + + for (int y = thread_start; y < thread_end; ++y) { + for (int x = pad_size; x < width - pad_size; ++x) { + for (int n_filter = 0; n_filter < output_num; ++n_filter) { + output[n_filter] = 0.0f; + for (int ch = 
0; ch < input_num; ++ch) { + for (int kernel_y = 0; kernel_y < kernel_size; ++kernel_y) { + for (int kernel_x = 0; kernel_x < kernel_size; ++kernel_x) { + float input_pel; + if (padding_method == SAME_CLAMP_TO_EDGE) { + int y_pos = CLAMP_TO_EDGE(y + (kernel_y - radius) * dilation, height); + int x_pos = CLAMP_TO_EDGE(x + (kernel_x - radius) * dilation, width); + input_pel = input[y_pos * src_linesize + x_pos * input_num + ch]; + } else { + int y_pos = y + (kernel_y - radius) * dilation; + int x_pos = x + (kernel_x - radius) * dilation; + input_pel = (x_pos < 0 || x_pos >= width || y_pos < 0 || y_pos >= height) ? 0.0 : + input[y_pos * src_linesize + x_pos * input_num + ch]; + } + + + output[n_filter] += input_pel * kernel[n_filter * filter_size + kernel_y * filter_linesize + + kernel_x * input_num + ch]; + } + } + } + } + output += output_num; + } + } +} + static void * dnn_execute_layer_conv2d_thread(void *threadarg) { static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; @@ -160,35 +223,40 @@ static void * dnn_execute_layer_conv2d_thread(void *threadarg) av_assert0(channel == conv_params->input_num); + struct execute_data *execute_data; + execute_data = av_malloc(sizeof(*execute_data)); + execute_data->thread_start = thread_start; + execute_data->thread_end = thread_end; + execute_data->input = input; + execute_data->output = output; + execute_data->kernel = conv_params->kernel; + execute_data->input_num = conv_params->input_num; + execute_data->output_num = conv_params->output_num; + execute_data->kernel_size = conv_params->kernel_size; + execute_data->padding_method = conv_params->padding_method; + execute_data->dilation = conv_params->dilation; + execute_data->pad_size = pad_size; + execute_data->width = width; + execute_data->height = height; + execute_data->radius = radius; + execute_data->src_linesize = src_linesize; + execute_data->filter_size = filter_size; + execute_data->filter_linesize = filter_linesize; + if ((thread_data->step >= 4) && 
(conv_params->input_num >= 4)) { + ff_dnn_execute_layer_conv2d_sse4(execute_data); + } + else { + ff_dnn_execute_layer_conv2d_c(execute_data); + } + + output = output_operand->data; + output += (conv_params->output_num) * (width - 2 * pad_size) * (thread_start - pad_size); for (int y = thread_start; y < thread_end; ++y) { for (int x = pad_size; x < width - pad_size; ++x) { for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) { if (conv_params->has_bias) - output[n_filter] = conv_params->biases[n_filter]; - else - output[n_filter] = 0.f; + output[n_filter] += conv_params->biases[n_filter]; - for (int ch = 0; ch < conv_params->input_num; ++ch) { - for (int kernel_y = 0; kernel_y < conv_params->kernel_size; ++kernel_y) { - for (int kernel_x = 0; kernel_x < conv_params->kernel_size; ++kernel_x) { - float input_pel; - if (conv_params->padding_method == SAME_CLAMP_TO_EDGE) { - int y_pos = CLAMP_TO_EDGE(y + (kernel_y - radius) * conv_params->dilation, height); - int x_pos = CLAMP_TO_EDGE(x + (kernel_x - radius) * conv_params->dilation, width); - input_pel = input[y_pos * src_linesize + x_pos * conv_params->input_num + ch]; - } else { - int y_pos = y + (kernel_y - radius) * conv_params->dilation; - int x_pos = x + (kernel_x - radius) * conv_params->dilation; - input_pel = (x_pos < 0 || x_pos >= width || y_pos < 0 || y_pos >= height) ? 
0.0 : - input[y_pos * src_linesize + x_pos * conv_params->input_num + ch]; - } - - - output[n_filter] += input_pel * conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize + - kernel_x * conv_params->input_num + ch]; - } - } - } switch (conv_params->activation){ case RELU: output[n_filter] = FFMAX(output[n_filter], 0.0); @@ -208,6 +276,7 @@ static void * dnn_execute_layer_conv2d_thread(void *threadarg) output += conv_params->output_num; } } + return (void *)0; } @@ -231,6 +300,12 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_ thread_data->ctx = ctx; thread_data->thread_num = thread_num; thread_data->thread_index = 0; + thread_data->step = 1; + #if ARCH_X86_64 + int cpu_flags = av_get_cpu_flags(); + if (EXTERNAL_SSE4(cpu_flags)) + thread_data->step = 4; + #endif //create threads for (int i = 0; i < thread_num; i++){ diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm b/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm new file mode 100644 index 0000000000..dc781d42e5 --- /dev/null +++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm @@ -0,0 +1,214 @@ +;***************************************************************************** +;* x86-optimized functions for dnn native backend convolution +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +%macro COUNT_INPUT 0 + mov tmp1d, padding_method + cmp tmp1d, SAME_CLAMP_TO_EDGE + je .clamp + + cmp y_posd, 0 + jl .out_of_th + mov tmp2d, height + cmp y_posd, tmp2d + jge .out_of_th + + cmp x_posd, 0 + jl .out_of_th + mov tmp2d, width + cmp x_posd, tmp2d + jge .out_of_th + + mov tmp1d, y_posd + imul tmp1d, src_linesize + mov tmp2d, x_posd + imul tmp2d, input_num + add tmp1d, tmp2d + jmp .count_end + + .out_of_th: + mov tmp1d, -1 + jmp .count_end + + .clamp: + cmp y_posd, 0 + jl .y_clamp_zero + mov tmp1d, height + cmp y_posd, tmp1d + jge .y_clamp_height + mov tmp1d, y_posd + jmp .y_normal + + .y_clamp_zero: + xor tmp1d, tmp1d + jmp .y_normal + + .y_clamp_height: + sub tmp1d, 1 + + .y_normal: + + cmp x_posd, 0 + jl .x_clamp_zero + mov tmp2d, width + cmp x_posd, tmp2d + jge .x_clamp_width + mov tmp2d, x_posd + jmp .x_normal + + .x_clamp_zero: + xor tmp2d, tmp2d + jmp .x_normal + + .x_clamp_width: + sub tmp2d, 1 + + .x_normal: + + imul tmp1d, src_linesize + imul tmp2d, input_num + add tmp1d, tmp2d + + .count_end: +%endmacro + +; void ff_dnn_execute_layer_conv2d_sse4(execute_data *execute_data); + +%if ARCH_X86_64 +INIT_XMM sse4 +cglobal dnn_execute_layer_conv2d, 8, 15, 3, execute_data,\ + x, y, n_filter, cha, kernel_x, kernel_y, x_pos, y_pos, kernel_pos,\ + input, output, kernel, tmp1, tmp2 + +%define thread_start [execute_dataq] +%define thread_end [execute_dataq + 1 * 4] +%define input_num [execute_dataq + 2 * 4] +%define output_num [execute_dataq + 3 * 4] +%define kernel_size [execute_dataq + 4 * 4] +%define padding_method [execute_dataq + 5 * 4] +%define dilation [execute_dataq + 6 * 4] +%define pad_size 
[execute_dataq + 7 * 4] +%define width [execute_dataq + 8 * 4] +%define height [execute_dataq + 9 * 4] +%define radius [execute_dataq + 10 * 4] +%define src_linesize [execute_dataq + 11 * 4] +%define filter_size [execute_dataq + 12 * 4] +%define filter_linesize [execute_dataq + 13 * 4] +%define SAME_CLAMP_TO_EDGE 2 + + mov inputq, [execute_dataq + 14 * 4] + mov outputq, [execute_dataq + 14 * 4 + 8] + mov kernelq, [execute_dataq + 14 * 4 + 2 * 8] + + mov yd, thread_start +.loop_y: + mov xd, pad_size + .loop_x: + xor n_filterd, n_filterd + xor kernel_posq, kernel_posq + .loop_filter: + xorps m2, m2 + xor kernel_yd, kernel_yd + + mov tmp1d, kernel_yd + sub tmp1d, radius + mov y_posd, dilation + imul y_posd, tmp1d + add y_posd, yd + + .loop_kery: + xor kernel_xd, kernel_xd + + mov tmp1d, kernel_xd + sub tmp1d, radius + mov x_posd, dilation + imul x_posd, tmp1d + add x_posd, xd + + .loop_kerx: + COUNT_INPUT + xor chad, chad + .loop_ch: + cmp tmp1d, -1 + je .out + + movsxdifnidn tmp1q, tmp1d + movups m0, [inputq + tmp1q * 4] + add tmp1d, 4 + jmp .load_end + + .out: + xorps m0, m0 + + .load_end: + + movups m1, [kernelq + kernel_posq * 4] + add kernel_posq, 4 + + mulps m0, m1 + addps m2, m0 + + add chad, 4 + mov tmp2d, input_num + cmp chad, tmp2d + jl .loop_ch + + add x_posd, dilation + add kernel_xd, 1 + mov tmp1d, kernel_size + cmp kernel_xd, tmp1d + jl .loop_kerx + + add y_posd, dilation + add kernel_yd, 1 + mov tmp1d, kernel_size + cmp kernel_yd, tmp1d + jl .loop_kery + + haddps m2, m2 + haddps m2, m2 + movsxdifnidn n_filterq, n_filterd + movss [outputq + n_filterq * 4], m2 + + add n_filterd, 1 + mov tmp1d, output_num + cmp n_filterd, tmp1d + jl .loop_filter + + mov tmp1d, output_num + movsxdifnidn tmp1q, tmp1d + shl tmp1d, 2 + add outputq, tmp1q + add xd, 1 + mov tmp2d, width + sub tmp2d, pad_size + cmp xd, tmp2d + jl .loop_x + + add yd, 1 + mov tmp1d, thread_end + cmp yd, tmp1d + jl .loop_y + + RET +%endif From patchwork Mon Aug 31 17:03:44 2020 Content-Type: 
text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Xu Jun X-Patchwork-Id: 22027 Return-Path: X-Original-To: patchwork@ffaux-bg.ffmpeg.org Delivered-To: patchwork@ffaux-bg.ffmpeg.org Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by ffaux.localdomain (Postfix) with ESMTP id 426B044ABC9 for ; Mon, 31 Aug 2020 20:05:30 +0300 (EEST) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 2495968AC31; Mon, 31 Aug 2020 20:05:30 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from smtp181.sjtu.edu.cn (smtp181.sjtu.edu.cn [202.120.2.181]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 314BF6883D0 for ; Mon, 31 Aug 2020 20:05:24 +0300 (EEST) Received: from proxy02.sjtu.edu.cn (smtp188.sjtu.edu.cn [202.120.2.188]) by smtp181.sjtu.edu.cn (Postfix) with ESMTPS id B588C1008CBC1 for ; Tue, 1 Sep 2020 01:05:21 +0800 (CST) Received: from localhost (localhost.localdomain [127.0.0.1]) by proxy02.sjtu.edu.cn (Postfix) with ESMTP id B4D42200B4496; Tue, 1 Sep 2020 01:05:21 +0800 (CST) X-Virus-Scanned: amavisd-new at Received: from proxy02.sjtu.edu.cn ([127.0.0.1]) by localhost (proxy02.sjtu.edu.cn [127.0.0.1]) (amavisd-new, port 10026) with ESMTP id cTc9B_-hSMW6; Tue, 1 Sep 2020 01:05:21 +0800 (CST) Received: from localhost.localdomain (unknown [202.120.39.204]) (Authenticated sender: xujunzz@sjtu.edu.cn) by proxy02.sjtu.edu.cn (Postfix) with ESMTPSA id D1FBA200B448D; Tue, 1 Sep 2020 01:05:20 +0800 (CST) From: xujunzz@sjtu.edu.cn To: ffmpeg-devel@ffmpeg.org Date: Tue, 1 Sep 2020 01:03:44 +0800 Message-Id: <20200831170341.879003-3-xujunzz@sjtu.edu.cn> X-Mailer: git-send-email 2.28.0 In-Reply-To: <20200831170341.879003-1-xujunzz@sjtu.edu.cn> References: <20200831170341.879003-1-xujunzz@sjtu.edu.cn> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 3/3][GSoC] Add x86-avx2 optimization for 
dnn_execute_layer_conv2d X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: xujunzz@sjtu.edu.cn Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" From: Xu Jun Can be tested with command "./ffmpeg_g -i test_1s.mp4 -vf \ format=yuvj420p,dnn_processing=dnn_backend=native:model= \ espcn.model:input=x:output=y -y sr_native.mp4 -benchmark" before patch: utime=826.044s stime=0.550s rtime=39.680s after patch: utime=545.137s stime=0.467s rtime=27.113s Signed-off-by: Xu Jun --- .../dnn/dnn_backend_native_layer_conv2d.c | 10 +- .../dnn_backend_native_layer_conv2d_x86.asm | 121 ++++++++++++++++++ 2 files changed, 130 insertions(+), 1 deletion(-) diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c index 92cc5313dc..089f724156 100644 --- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c +++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c @@ -46,6 +46,7 @@ typedef struct execute_data{ float *kernel; } execute_data; +void ff_dnn_execute_layer_conv2d_avx2(execute_data *execute_data); void ff_dnn_execute_layer_conv2d_sse4(execute_data *execute_data); void ff_dnn_execute_layer_conv2d_c(execute_data *execute_data); @@ -243,7 +244,12 @@ static void * dnn_execute_layer_conv2d_thread(void *threadarg) execute_data->filter_size = filter_size; execute_data->filter_linesize = filter_linesize; if ((thread_data->step >= 4) && (conv_params->input_num >= 4)) { - ff_dnn_execute_layer_conv2d_sse4(execute_data); + if ((thread_data->step == 8) && (conv_params->input_num >= 8)) { + ff_dnn_execute_layer_conv2d_avx2(execute_data); + } + else { + ff_dnn_execute_layer_conv2d_sse4(execute_data); + } } else { ff_dnn_execute_layer_conv2d_c(execute_data); @@ -305,6 +311,8 @@ int dnn_execute_layer_conv2d(DnnOperand 
*operands, const int32_t *input_operand_ int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_SSE4(cpu_flags)) thread_data->step = 4; + if (EXTERNAL_AVX2(cpu_flags)) + thread_data->step = 8; #endif //create threads diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm b/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm index dc781d42e5..7c7285c4c5 100644 --- a/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm +++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm @@ -210,5 +210,126 @@ cglobal dnn_execute_layer_conv2d, 8, 15, 3, execute_data,\ cmp yd, tmp1d jl .loop_y + RET + +; void ff_dnn_execute_layer_conv2d_avx2(execute_data *execute_data); + +INIT_YMM avx2 +cglobal dnn_execute_layer_conv2d, 8, 15, 3, execute_data,\ + x, y, n_filter, cha, kernel_x, kernel_y, x_pos, y_pos, kernel_pos,\ + input, output, kernel, tmp1, tmp2 + +%define thread_start [execute_dataq] +%define thread_end [execute_dataq + 1 * 4] +%define input_num [execute_dataq + 2 * 4] +%define output_num [execute_dataq + 3 * 4] +%define kernel_size [execute_dataq + 4 * 4] +%define padding_method [execute_dataq + 5 * 4] +%define dilation [execute_dataq + 6 * 4] +%define pad_size [execute_dataq + 7 * 4] +%define width [execute_dataq + 8 * 4] +%define height [execute_dataq + 9 * 4] +%define radius [execute_dataq + 10 * 4] +%define src_linesize [execute_dataq + 11 * 4] +%define filter_size [execute_dataq + 12 * 4] +%define filter_linesize [execute_dataq + 13 * 4] +%define SAME_CLAMP_TO_EDGE 2 + + mov inputq, [execute_dataq + 14 * 4] + mov outputq, [execute_dataq + 14 * 4 + 8] + mov kernelq, [execute_dataq + 14 * 4 + 2 * 8] + + mov yd, thread_start +.loop_y: + mov xd, pad_size + .loop_x: + xor n_filterd, n_filterd + xor kernel_posq, kernel_posq + .loop_filter: + xorps m2, m2 + xor kernel_yd, kernel_yd + + mov tmp1d, kernel_yd + sub tmp1d, radius + mov y_posd, dilation + imul y_posd, tmp1d + add y_posd, yd + + .loop_kery: + xor kernel_xd, kernel_xd + + mov tmp1d, kernel_xd + sub 
tmp1d, radius + mov x_posd, dilation + imul x_posd, tmp1d + add x_posd, xd + + .loop_kerx: + COUNT_INPUT + xor chad, chad + .loop_ch: + cmp tmp1d, -1 + je .out + + movsxdifnidn tmp1q, tmp1d + movups m0, [inputq + tmp1q * 4] + add tmp1d, 8 + jmp .load_end + + .out: + xorps m0, m0 + + .load_end: + + movups m1, [kernelq + kernel_posq * 4] + add kernel_posq, 8 + + mulps m0, m1 + addps m2, m0 + + add chad, 8 + mov tmp2d, input_num + cmp chad, tmp2d + jl .loop_ch + + add x_posd, dilation + add kernel_xd, 1 + mov tmp1d, kernel_size + cmp kernel_xd, tmp1d + jl .loop_kerx + + add y_posd, dilation + add kernel_yd, 1 + mov tmp1d, kernel_size + cmp kernel_yd, tmp1d + jl .loop_kery + + vperm2f128 m1, m2, m2, 1 + addps m2, m1 + haddps m2, m2 + haddps m2, m2 + movsxdifnidn n_filterq, n_filterd + movss [outputq + n_filterq * 4], xm2 + + add n_filterd, 1 + mov tmp1d, output_num + cmp n_filterd, tmp1d + jl .loop_filter + + mov tmp1d, output_num + movsxdifnidn tmp1q, tmp1d + shl tmp1d, 2 + add outputq, tmp1q + add xd, 1 + mov tmp2d, width + sub tmp2d, pad_size + cmp xd, tmp2d + jl .loop_x + + add yd, 1 + mov tmp1d, thread_end + cmp yd, tmp1d + jl .loop_y + RET %endif