From patchwork Mon Aug 31 17:03:44 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Xu Jun X-Patchwork-Id: 22027 Return-Path: X-Original-To: patchwork@ffaux-bg.ffmpeg.org Delivered-To: patchwork@ffaux-bg.ffmpeg.org Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by ffaux.localdomain (Postfix) with ESMTP id 426B044ABC9 for ; Mon, 31 Aug 2020 20:05:30 +0300 (EEST) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 2495968AC31; Mon, 31 Aug 2020 20:05:30 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from smtp181.sjtu.edu.cn (smtp181.sjtu.edu.cn [202.120.2.181]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 314BF6883D0 for ; Mon, 31 Aug 2020 20:05:24 +0300 (EEST) Received: from proxy02.sjtu.edu.cn (smtp188.sjtu.edu.cn [202.120.2.188]) by smtp181.sjtu.edu.cn (Postfix) with ESMTPS id B588C1008CBC1 for ; Tue, 1 Sep 2020 01:05:21 +0800 (CST) Received: from localhost (localhost.localdomain [127.0.0.1]) by proxy02.sjtu.edu.cn (Postfix) with ESMTP id B4D42200B4496; Tue, 1 Sep 2020 01:05:21 +0800 (CST) X-Virus-Scanned: amavisd-new at Received: from proxy02.sjtu.edu.cn ([127.0.0.1]) by localhost (proxy02.sjtu.edu.cn [127.0.0.1]) (amavisd-new, port 10026) with ESMTP id cTc9B_-hSMW6; Tue, 1 Sep 2020 01:05:21 +0800 (CST) Received: from localhost.localdomain (unknown [202.120.39.204]) (Authenticated sender: xujunzz@sjtu.edu.cn) by proxy02.sjtu.edu.cn (Postfix) with ESMTPSA id D1FBA200B448D; Tue, 1 Sep 2020 01:05:20 +0800 (CST) From: xujunzz@sjtu.edu.cn To: ffmpeg-devel@ffmpeg.org Date: Tue, 1 Sep 2020 01:03:44 +0800 Message-Id: <20200831170341.879003-3-xujunzz@sjtu.edu.cn> X-Mailer: git-send-email 2.28.0 In-Reply-To: <20200831170341.879003-1-xujunzz@sjtu.edu.cn> References: <20200831170341.879003-1-xujunzz@sjtu.edu.cn> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 3/3][GSoC] 
Add x86-avx2 optimization for dnn_execute_layer_conv2d X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: xujunzz@sjtu.edu.cn Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" From: Xu Jun Can be tested with command "./ffmpeg_g -i test_1s.mp4 -vf \ format=yuvj420p,dnn_processing=dnn_backend=native:model= \ espcn.model:input=x:output=y -y sr_native.mp4 -benchmark" before patch: utime=826.044s stime=0.550s rtime=39.680s after patch: utime=545.137s stime=0.467s rtime=27.113s Signed-off-by: Xu Jun --- .../dnn/dnn_backend_native_layer_conv2d.c | 10 +- .../dnn_backend_native_layer_conv2d_x86.asm | 121 ++++++++++++++++++ 2 files changed, 130 insertions(+), 1 deletion(-) diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c index 92cc5313dc..089f724156 100644 --- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c +++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c @@ -46,6 +46,7 @@ typedef struct execute_data{ float *kernel; } execute_data; +void ff_dnn_execute_layer_conv2d_avx2(execute_data *execute_data); void ff_dnn_execute_layer_conv2d_sse4(execute_data *execute_data); void ff_dnn_execute_layer_conv2d_c(execute_data *execute_data); @@ -243,7 +244,12 @@ static void * dnn_execute_layer_conv2d_thread(void *threadarg) execute_data->filter_size = filter_size; execute_data->filter_linesize = filter_linesize; if ((thread_data->step >= 4) && (conv_params->input_num >= 4)) { - ff_dnn_execute_layer_conv2d_sse4(execute_data); + if ((thread_data->step == 8) && (conv_params->input_num >= 8)) { + ff_dnn_execute_layer_conv2d_avx2(execute_data); + } + else { + ff_dnn_execute_layer_conv2d_sse4(execute_data); + } } else { ff_dnn_execute_layer_conv2d_c(execute_data); @@ -305,6 +311,8 @@ int 
dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_ int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_SSE4(cpu_flags)) thread_data->step = 4; + if (EXTERNAL_AVX2(cpu_flags)) + thread_data->step = 8; #endif //create threads diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm b/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm index dc781d42e5..7c7285c4c5 100644 --- a/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm +++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm @@ -210,5 +210,126 @@ cglobal dnn_execute_layer_conv2d, 8, 15, 3, execute_data,\ cmp yd, tmp1d jl .loop_y + RET + +; void ff_dnn_execute_layer_conv2d_avx2(execute_data *execute_data); + +INIT_YMM avx2 +cglobal dnn_execute_layer_conv2d, 8, 15, 3, execute_data,\ + x, y, n_filter, cha, kernel_x, kernel_y, x_pos, y_pos, kernel_pos,\ + input, output, kernel, tmp1, tmp2 + +%define thread_start [execute_dataq] +%define thread_end [execute_dataq + 1 * 4] +%define input_num [execute_dataq + 2 * 4] +%define output_num [execute_dataq + 3 * 4] +%define kernel_size [execute_dataq + 4 * 4] +%define padding_method [execute_dataq + 5 * 4] +%define dilation [execute_dataq + 6 * 4] +%define pad_size [execute_dataq + 7 * 4] +%define width [execute_dataq + 8 * 4] +%define height [execute_dataq + 9 * 4] +%define radius [execute_dataq + 10 * 4] +%define src_linesize [execute_dataq + 11 * 4] +%define filter_size [execute_dataq + 12 * 4] +%define filter_linesize [execute_dataq + 13 * 4] +%define SAME_CLAMP_TO_EDGE 2 + + mov inputq, [execute_dataq + 14 * 4] + mov outputq, [execute_dataq + 14 * 4 + 8] + mov kernelq, [execute_dataq + 14 * 4 + 2 * 8] + + mov yd, thread_start +.loop_y: + mov xd, pad_size + .loop_x: + xor n_filterd, n_filterd + xor kernel_posq, kernel_posq + .loop_filter: + xorps m2, m2 + xor kernel_yd, kernel_yd + + mov tmp1d, kernel_yd + sub tmp1d, radius + mov y_posd, dilation + imul y_posd, tmp1d + add y_posd, yd + + .loop_kery: + xor kernel_xd, 
kernel_xd + + mov tmp1d, kernel_xd + sub tmp1d, radius + mov x_posd, dilation + imul x_posd, tmp1d + add x_posd, xd + + .loop_kerx: + COUNT_INPUT + xor chad, chad + .loop_ch: + cmp tmp1d, -1 + je .out + + movsxdifnidn tmp1q, tmp1d + movups m0, [inputq + tmp1q * 4] + add tmp1d, 8 + jmp .load_end + + .out: + xorps m0, m0 + + .load_end: + + movups m1, [kernelq + kernel_posq * 4] + add kernel_posq, 8 + + mulps m0, m1 + addps m2, m0 + + add chad, 8 + mov tmp2d, input_num + cmp chad, tmp2d + jl .loop_ch + + add x_posd, dilation + add kernel_xd, 1 + mov tmp1d, kernel_size + cmp kernel_xd, tmp1d + jl .loop_kerx + + add y_posd, dilation + add kernel_yd, 1 + mov tmp1d, kernel_size + cmp kernel_yd, tmp1d + jl .loop_kery + + vperm2f128 m1, m2, m2, 1 + addps m2, m1 + haddps m2, m2 + haddps m2, m2 + movsxdifnidn n_filterq, n_filterd + movss [outputq + n_filterq * 4], xm2 + + add n_filterd, 1 + mov tmp1d, output_num + cmp n_filterd, tmp1d + jl .loop_filter + + mov tmp1d, output_num + movsxdifnidn tmp1q, tmp1d + shl tmp1d, 2 + add outputq, tmp1q + add xd, 1 + mov tmp2d, width + sub tmp2d, pad_size + cmp xd, tmp2d + jl .loop_x + + add yd, 1 + mov tmp1d, thread_end + cmp yd, tmp1d + jl .loop_y + RET %endif