From patchwork Mon Aug 31 17:03:44 2020
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Xu Jun <xujunzz@sjtu.edu.cn>
X-Patchwork-Id: 22027
Return-Path: <ffmpeg-devel-bounces@ffmpeg.org>
X-Original-To: patchwork@ffaux-bg.ffmpeg.org
Delivered-To: patchwork@ffaux-bg.ffmpeg.org
Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100])
	by ffaux.localdomain (Postfix) with ESMTP id 426B044ABC9
	for <patchwork@ffaux-bg.ffmpeg.org>; Mon, 31 Aug 2020 20:05:30 +0300 (EEST)
Received: from [127.0.1.1] (localhost [127.0.0.1])
	by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 2495968AC31;
	Mon, 31 Aug 2020 20:05:30 +0300 (EEST)
X-Original-To: ffmpeg-devel@ffmpeg.org
Delivered-To: ffmpeg-devel@ffmpeg.org
Received: from smtp181.sjtu.edu.cn (smtp181.sjtu.edu.cn [202.120.2.181])
 by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 314BF6883D0
 for <ffmpeg-devel@ffmpeg.org>; Mon, 31 Aug 2020 20:05:24 +0300 (EEST)
Received: from proxy02.sjtu.edu.cn (smtp188.sjtu.edu.cn [202.120.2.188])
 by smtp181.sjtu.edu.cn (Postfix) with ESMTPS id B588C1008CBC1
 for <ffmpeg-devel@ffmpeg.org>; Tue,  1 Sep 2020 01:05:21 +0800 (CST)
Received: from localhost (localhost.localdomain [127.0.0.1])
 by proxy02.sjtu.edu.cn (Postfix) with ESMTP id B4D42200B4496;
 Tue,  1 Sep 2020 01:05:21 +0800 (CST)
X-Virus-Scanned: amavisd-new at 
Received: from proxy02.sjtu.edu.cn ([127.0.0.1])
 by localhost (proxy02.sjtu.edu.cn [127.0.0.1]) (amavisd-new, port 10026)
 with ESMTP id cTc9B_-hSMW6; Tue,  1 Sep 2020 01:05:21 +0800 (CST)
Received: from localhost.localdomain (unknown [202.120.39.204])
 (Authenticated sender: xujunzz@sjtu.edu.cn)
 by proxy02.sjtu.edu.cn (Postfix) with ESMTPSA id D1FBA200B448D;
 Tue,  1 Sep 2020 01:05:20 +0800 (CST)
From: xujunzz@sjtu.edu.cn
To: ffmpeg-devel@ffmpeg.org
Date: Tue,  1 Sep 2020 01:03:44 +0800
Message-Id: <20200831170341.879003-3-xujunzz@sjtu.edu.cn>
X-Mailer: git-send-email 2.28.0
In-Reply-To: <20200831170341.879003-1-xujunzz@sjtu.edu.cn>
References: <20200831170341.879003-1-xujunzz@sjtu.edu.cn>
MIME-Version: 1.0
Subject: [FFmpeg-devel] [PATCH 3/3][GSoC] Add x86-avx2 optimization for
	dnn_execute_layer_conv2d
X-BeenThere: ffmpeg-devel@ffmpeg.org
X-Mailman-Version: 2.1.20
Precedence: list
List-Id: FFmpeg development discussions and patches <ffmpeg-devel.ffmpeg.org>
List-Unsubscribe: <https://ffmpeg.org/mailman/options/ffmpeg-devel>,
 <mailto:ffmpeg-devel-request@ffmpeg.org?subject=unsubscribe>
List-Archive: <https://ffmpeg.org/pipermail/ffmpeg-devel>
List-Post: <mailto:ffmpeg-devel@ffmpeg.org>
List-Help: <mailto:ffmpeg-devel-request@ffmpeg.org?subject=help>
List-Subscribe: <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>,
 <mailto:ffmpeg-devel-request@ffmpeg.org?subject=subscribe>
Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Cc: xujunzz@sjtu.edu.cn
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

From: Xu Jun <xujunzz@sjtu.edu.cn>

Can be tested with the command "./ffmpeg_g -i test_1s.mp4 -vf \
format=yuvj420p,dnn_processing=dnn_backend=native:model=\
espcn.model:input=x:output=y -y sr_native.mp4 -benchmark"

before patch: utime=826.044s stime=0.550s rtime=39.680s
after patch:  utime=545.137s stime=0.467s rtime=27.113s

Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
---
 .../dnn/dnn_backend_native_layer_conv2d.c     |  10 +-
 .../dnn_backend_native_layer_conv2d_x86.asm   | 121 ++++++++++++++++++
 2 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
index 92cc5313dc..089f724156 100644
--- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
+++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
@@ -46,6 +46,7 @@ typedef struct execute_data{
     float *kernel;
 } execute_data;
 
+void ff_dnn_execute_layer_conv2d_avx2(execute_data *execute_data);
 void ff_dnn_execute_layer_conv2d_sse4(execute_data *execute_data);
 void ff_dnn_execute_layer_conv2d_c(execute_data *execute_data);
 
@@ -243,7 +244,12 @@ static void * dnn_execute_layer_conv2d_thread(void *threadarg)
     execute_data->filter_size = filter_size;
     execute_data->filter_linesize = filter_linesize;
     if ((thread_data->step >= 4) && (conv_params->input_num >= 4)) {
-        ff_dnn_execute_layer_conv2d_sse4(execute_data);
+        if ((thread_data->step == 8) && (conv_params->input_num >= 8)) {
+            ff_dnn_execute_layer_conv2d_avx2(execute_data);
+        }
+        else {
+            ff_dnn_execute_layer_conv2d_sse4(execute_data);
+        }
     }
     else {
         ff_dnn_execute_layer_conv2d_c(execute_data);
@@ -305,6 +311,8 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
         int cpu_flags = av_get_cpu_flags();
         if (EXTERNAL_SSE4(cpu_flags))
             thread_data->step = 4;
+        if (EXTERNAL_AVX2(cpu_flags))
+            thread_data->step = 8;
     #endif
 
     //create threads
diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm b/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm
index dc781d42e5..7c7285c4c5 100644
--- a/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm
+++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm
@@ -210,5 +210,126 @@ cglobal dnn_execute_layer_conv2d, 8, 15, 3, execute_data,\
     cmp yd, tmp1d
     jl .loop_y
 
+    RET
+
+; void ff_dnn_execute_layer_conv2d_avx4(execute_data *execute_data);
+
+INIT_YMM avx2
+cglobal dnn_execute_layer_conv2d, 8, 15, 3, execute_data,\
+    x, y, n_filter, cha, kernel_x, kernel_y, x_pos, y_pos, kernel_pos,\
+    input, output, kernel, tmp1, tmp2
+
+%define thread_start [execute_dataq]
+%define thread_end [execute_dataq + 1 * 4]
+%define input_num [execute_dataq + 2 * 4]
+%define output_num [execute_dataq + 3 * 4]
+%define kernel_size [execute_dataq + 4 * 4]
+%define padding_method [execute_dataq + 5 * 4]
+%define dilation [execute_dataq + 6 * 4]
+%define pad_size [execute_dataq + 7 * 4]
+%define width [execute_dataq + 8 * 4]
+%define height [execute_dataq + 9 * 4]
+%define radius [execute_dataq + 10 * 4]
+%define src_linesize [execute_dataq + 11 * 4]
+%define filter_size [execute_dataq + 12 * 4]
+%define filter_linesize [execute_dataq + 13 * 4]
+%define SAME_CLAMP_TO_EDGE 2
+
+    mov inputq, [execute_dataq + 14 * 4]
+    mov outputq, [execute_dataq + 14 * 4 + 8]
+    mov kernelq, [execute_dataq + 14 * 4 + 2 * 8]
+
+    mov yd, thread_start
+.loop_y:
+    mov xd, pad_size
+    .loop_x:
+        xor n_filterd, n_filterd
+        xor kernel_posq, kernel_posq
+        .loop_filter:
+            xorps m2, m2
+            xor kernel_yd, kernel_yd
+
+            mov tmp1d, kernel_yd
+            sub tmp1d, radius
+            mov y_posd, dilation
+            imul y_posd, tmp1d
+            add y_posd, yd
+
+            .loop_kery:
+                xor kernel_xd, kernel_xd
+
+                mov tmp1d, kernel_xd
+                sub tmp1d, radius
+                mov x_posd, dilation
+                imul x_posd, tmp1d
+                add x_posd, xd
+
+                .loop_kerx:
+                    COUNT_INPUT
+                    xor chad, chad
+                    .loop_ch:
+                        cmp tmp1d, -1
+                        je .out
+
+                        movsxdifnidn tmp1q, tmp1d
+                        movups m0, [inputq + tmp1q * 4]
+                        add tmp1d, 8
+                        jmp .load_end
+
+                        .out:
+                        xorps m0, m0
+
+                        .load_end:
+
+                        movups m1, [kernelq + kernel_posq * 4]
+                        add kernel_posq, 8
+
+                        mulps m0, m1
+                        addps m2, m0
+
+                        add chad, 8
+                        mov tmp2d, input_num
+                        cmp chad, tmp2d
+                        jl .loop_ch
+
+                    add x_posd, dilation
+                    add kernel_xd, 1
+                    mov tmp1d, kernel_size
+                    cmp kernel_xd, tmp1d
+                    jl .loop_kerx
+
+                add y_posd, dilation
+                add kernel_yd, 1
+                mov tmp1d, kernel_size
+                cmp kernel_yd, tmp1d
+                jl .loop_kery
+
+            vperm2f128 m1, m2, m2, 1
+            addps m2, m1
+            haddps m2, m2
+            haddps m2, m2
+            movsxdifnidn n_filterq, n_filterd
+            movss [outputq + n_filterq * 4], xm2
+
+            add n_filterd, 1
+            mov tmp1d, output_num
+            cmp n_filterd, tmp1d
+            jl .loop_filter
+
+        mov tmp1d, output_num
+        movsxdifnidn tmp1q, tmp1d
+        shl tmp1d, 2
+        add outputq, tmp1q
+        add xd, 1
+        mov tmp2d, width
+        sub tmp2d, pad_size
+        cmp xd, tmp2d
+        jl .loop_x
+
+    add yd, 1
+    mov tmp1d, thread_end
+    cmp yd, tmp1d
+    jl .loop_y
+
     RET
 %endif