From patchwork Mon Aug 31 17:03:42 2020
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Xu Jun <xujunzz@sjtu.edu.cn>
X-Patchwork-Id: 22026
Return-Path: <ffmpeg-devel-bounces@ffmpeg.org>
X-Original-To: patchwork@ffaux-bg.ffmpeg.org
Delivered-To: patchwork@ffaux-bg.ffmpeg.org
Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100])
	by ffaux.localdomain (Postfix) with ESMTP id 0C3B644ABC9
	for <patchwork@ffaux-bg.ffmpeg.org>; Mon, 31 Aug 2020 20:05:11 +0300 (EEST)
Received: from [127.0.1.1] (localhost [127.0.0.1])
	by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id E072E68ABFD;
	Mon, 31 Aug 2020 20:05:10 +0300 (EEST)
X-Original-To: ffmpeg-devel@ffmpeg.org
Delivered-To: ffmpeg-devel@ffmpeg.org
Received: from smtp181.sjtu.edu.cn (smtp181.sjtu.edu.cn [202.120.2.181])
 by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id BBFA568AB82
 for <ffmpeg-devel@ffmpeg.org>; Mon, 31 Aug 2020 20:05:04 +0300 (EEST)
Received: from proxy02.sjtu.edu.cn (smtp188.sjtu.edu.cn [202.120.2.188])
 by smtp181.sjtu.edu.cn (Postfix) with ESMTPS id 531071008CBC1
 for <ffmpeg-devel@ffmpeg.org>; Tue,  1 Sep 2020 01:05:02 +0800 (CST)
Received: from localhost (localhost.localdomain [127.0.0.1])
 by proxy02.sjtu.edu.cn (Postfix) with ESMTP id 16397200B4498;
 Tue,  1 Sep 2020 01:05:02 +0800 (CST)
X-Virus-Scanned: amavisd-new at 
Received: from proxy02.sjtu.edu.cn ([127.0.0.1])
 by localhost (proxy02.sjtu.edu.cn [127.0.0.1]) (amavisd-new, port 10026)
 with ESMTP id fbR57rSMtNJq; Tue,  1 Sep 2020 01:05:02 +0800 (CST)
Received: from localhost.localdomain (unknown [202.120.39.204])
 (Authenticated sender: xujunzz@sjtu.edu.cn)
 by proxy02.sjtu.edu.cn (Postfix) with ESMTPSA id 32D6C200B448D;
 Tue,  1 Sep 2020 01:05:00 +0800 (CST)
From: xujunzz@sjtu.edu.cn
To: ffmpeg-devel@ffmpeg.org
Date: Tue,  1 Sep 2020 01:03:42 +0800
Message-Id: <20200831170341.879003-2-xujunzz@sjtu.edu.cn>
X-Mailer: git-send-email 2.28.0
In-Reply-To: <20200831170341.879003-1-xujunzz@sjtu.edu.cn>
References: <20200831170341.879003-1-xujunzz@sjtu.edu.cn>
MIME-Version: 1.0
Subject: [FFmpeg-devel] [PATCH 2/3][GSoC] Add x86-sse4 optimization for
	dnn_execute_layer_conv2d
X-BeenThere: ffmpeg-devel@ffmpeg.org
X-Mailman-Version: 2.1.20
Precedence: list
List-Id: FFmpeg development discussions and patches <ffmpeg-devel.ffmpeg.org>
List-Unsubscribe: <https://ffmpeg.org/mailman/options/ffmpeg-devel>,
 <mailto:ffmpeg-devel-request@ffmpeg.org?subject=unsubscribe>
List-Archive: <https://ffmpeg.org/pipermail/ffmpeg-devel>
List-Post: <mailto:ffmpeg-devel@ffmpeg.org>
List-Help: <mailto:ffmpeg-devel-request@ffmpeg.org?subject=help>
List-Subscribe: <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>,
 <mailto:ffmpeg-devel-request@ffmpeg.org?subject=subscribe>
Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Cc: xujunzz@sjtu.edu.cn
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

From: Xu Jun <xujunzz@sjtu.edu.cn>

Can be tested with the command
"./ffmpeg_g -i input.png -vf \
format=yuvj420p,dnn_processing=dnn_backend=native:model=espcn.model:input=x:output=y \
-y sr_native.jpg -benchmark -cpuflags 0x100"

before patch: utime=20.817s stime=0.047s rtime=1.051s
after patch:  utime=3.744s stime=0.037s rtime=0.252s

Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
---
 libavfilter/dnn/Makefile                      |   1 +
 .../dnn/dnn_backend_native_layer_conv2d.c     | 123 ++++++++--
 .../dnn_backend_native_layer_conv2d_x86.asm   | 214 ++++++++++++++++++
 3 files changed, 314 insertions(+), 24 deletions(-)
 create mode 100644 libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm

diff --git a/libavfilter/dnn/Makefile b/libavfilter/dnn/Makefile
index e0957073ee..bdd334b192 100644
--- a/libavfilter/dnn/Makefile
+++ b/libavfilter/dnn/Makefile
@@ -8,6 +8,7 @@ OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_dep
 OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_maximum.o
 OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_mathbinary.o
 OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_mathunary.o
+OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_conv2d_x86.o
 
 DNN-OBJS-$(CONFIG_LIBTENSORFLOW)             += dnn/dnn_backend_tf.o
 DNN-OBJS-$(CONFIG_LIBOPENVINO)               += dnn/dnn_backend_openvino.o
diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
index 570b974052..92cc5313dc 100644
--- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
+++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
@@ -21,6 +21,7 @@
 #include "libavutil/avassert.h"
 #include "libavutil/thread.h"
 #include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
 #include "dnn_backend_native_layer_conv2d.h"
 
 #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
@@ -34,8 +35,20 @@ typedef struct thread_data{
     NativeContext *ctx;
     int32_t thread_num;
     int32_t thread_index;
+    int step;
 } thread_data;
 
+typedef struct execute_data{ // conv2d arguments for one call; dnn_backend_native_layer_conv2d_x86.asm reads these fields by fixed byte offset, so keep order and types in sync
+    int thread_start, thread_end, input_num, output_num, kernel_size, padding_method, dilation; // asm offsets 0*4 .. 6*4
+    int pad_size, width, height, radius, src_linesize, filter_size, filter_linesize; // asm offsets 7*4 .. 13*4
+    float *input;  // asm loads this from offset 14*4
+    float *output; // asm loads this from offset 14*4 + 8
+    float *kernel; // asm loads this from offset 14*4 + 2*8
+} execute_data;
+
+void ff_dnn_execute_layer_conv2d_sse4(execute_data *execute_data); // defined in dnn_backend_native_layer_conv2d_x86.asm inside %if ARCH_X86_64; NOTE(review): declared and called here without an arch guard - confirm other targets still link
+void ff_dnn_execute_layer_conv2d_c(execute_data *execute_data); // scalar C implementation, defined later in this file
+
 int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num)
 {
     ConvolutionalParams *conv_params;
@@ -101,6 +114,56 @@ int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int fil
     return dnn_size;
 }
 
+void ff_dnn_execute_layer_conv2d_c(execute_data *execute_data){ // scalar path: stores the raw convolution sum for each output value; no bias or activation is applied here
+    int thread_start = execute_data->thread_start;
+    int thread_end = execute_data->thread_end;
+    float *input = execute_data->input;
+    float *output = execute_data->output; // advanced pixel by pixel below, so it must already point at this call's first output pixel
+    float *kernel = execute_data->kernel;
+    int input_num = execute_data->input_num;
+    int output_num = execute_data->output_num;
+    int kernel_size = execute_data->kernel_size;
+    int padding_method = execute_data->padding_method;
+    int dilation = execute_data->dilation;
+    int pad_size = execute_data->pad_size;
+    int width = execute_data->width;
+    int height = execute_data->height;
+    int radius = execute_data->radius;
+    int src_linesize = execute_data->src_linesize;
+    int filter_size = execute_data->filter_size;
+    int filter_linesize = execute_data->filter_linesize;
+    // Visits rows [thread_start, thread_end) and columns [pad_size, width - pad_size); each visited pixel produces output_num floats.
+    for (int y = thread_start; y < thread_end; ++y) {
+        for (int x = pad_size; x < width - pad_size; ++x) {
+            for (int n_filter = 0; n_filter < output_num; ++n_filter) {
+                output[n_filter] = 0.0f;
+                for (int ch = 0; ch < input_num; ++ch) {
+                    for (int kernel_y = 0; kernel_y < kernel_size; ++kernel_y) {
+                        for (int kernel_x = 0; kernel_x < kernel_size; ++kernel_x) {
+                            float input_pel;
+                            if (padding_method == SAME_CLAMP_TO_EDGE) { // out-of-range taps reuse the nearest edge pixel
+                                int y_pos = CLAMP_TO_EDGE(y + (kernel_y - radius) * dilation, height);
+                                int x_pos = CLAMP_TO_EDGE(x + (kernel_x - radius) * dilation, width);
+                                input_pel = input[y_pos * src_linesize + x_pos * input_num + ch];
+                            } else { // every other padding mode: taps outside the image contribute 0
+                                int y_pos = y + (kernel_y - radius) * dilation;
+                                int x_pos = x + (kernel_x - radius) * dilation;
+                                input_pel = (x_pos < 0 || x_pos >= width || y_pos < 0 || y_pos >= height) ? 0.0 :
+                                                input[y_pos * src_linesize + x_pos * input_num + ch];
+                            }
+                            // Weight index = n_filter * filter_size + kernel_y * filter_linesize + kernel_x * input_num + ch;
+                            // filter_size and filter_linesize are strides supplied by the caller.
+                            output[n_filter] += input_pel * kernel[n_filter * filter_size + kernel_y * filter_linesize +
+                                                                                kernel_x * input_num + ch];
+                        }
+                    }
+                }
+            }
+            output += output_num; // step to the next output pixel
+        }
+    }
+}
+
 static void * dnn_execute_layer_conv2d_thread(void *threadarg)
 {
     static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
@@ -160,35 +223,40 @@ static void * dnn_execute_layer_conv2d_thread(void *threadarg)
 
     av_assert0(channel == conv_params->input_num);
 
+    struct execute_data *execute_data;
+    execute_data = av_malloc(sizeof(*execute_data));
+    execute_data->thread_start = thread_start;
+    execute_data->thread_end = thread_end;
+    execute_data->input = input;
+    execute_data->output = output;
+    execute_data->kernel = conv_params->kernel;
+    execute_data->input_num = conv_params->input_num;
+    execute_data->output_num = conv_params->output_num;
+    execute_data->kernel_size = conv_params->kernel_size;
+    execute_data->padding_method = conv_params->padding_method;
+    execute_data->dilation = conv_params->dilation;
+    execute_data->pad_size = pad_size;
+    execute_data->width = width;
+    execute_data->height = height;
+    execute_data->radius = radius;
+    execute_data->src_linesize = src_linesize;
+    execute_data->filter_size = filter_size;
+    execute_data->filter_linesize = filter_linesize;
+    if ((thread_data->step >= 4) && (conv_params->input_num >= 4)) {
+        ff_dnn_execute_layer_conv2d_sse4(execute_data);
+    }
+    else {
+        ff_dnn_execute_layer_conv2d_c(execute_data);
+    }
+
+    output = output_operand->data;
+    output += (conv_params->output_num) * (width - 2 * pad_size) * (thread_start - pad_size);
     for (int y = thread_start; y < thread_end; ++y) {
         for (int x = pad_size; x < width - pad_size; ++x) {
             for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) {
                 if (conv_params->has_bias)
-                    output[n_filter] = conv_params->biases[n_filter];
-                else
-                    output[n_filter] = 0.f;
+                    output[n_filter] += conv_params->biases[n_filter];
 
-                for (int ch = 0; ch < conv_params->input_num; ++ch) {
-                    for (int kernel_y = 0; kernel_y < conv_params->kernel_size; ++kernel_y) {
-                        for (int kernel_x = 0; kernel_x < conv_params->kernel_size; ++kernel_x) {
-                            float input_pel;
-                            if (conv_params->padding_method == SAME_CLAMP_TO_EDGE) {
-                                int y_pos = CLAMP_TO_EDGE(y + (kernel_y - radius) * conv_params->dilation, height);
-                                int x_pos = CLAMP_TO_EDGE(x + (kernel_x - radius) * conv_params->dilation, width);
-                                input_pel = input[y_pos * src_linesize + x_pos * conv_params->input_num + ch];
-                            } else {
-                                int y_pos = y + (kernel_y - radius) * conv_params->dilation;
-                                int x_pos = x + (kernel_x - radius) * conv_params->dilation;
-                                input_pel = (x_pos < 0 || x_pos >= width || y_pos < 0 || y_pos >= height) ? 0.0 :
-                                                   input[y_pos * src_linesize + x_pos * conv_params->input_num + ch];
-                            }
-
-
-                            output[n_filter] += input_pel * conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
-                                                                                kernel_x * conv_params->input_num + ch];
-                        }
-                    }
-                }
                 switch (conv_params->activation){
                 case RELU:
                     output[n_filter] = FFMAX(output[n_filter], 0.0);
@@ -208,6 +276,7 @@ static void * dnn_execute_layer_conv2d_thread(void *threadarg)
             output += conv_params->output_num;
         }
     }
+
     return (void *)0;
 }
 
@@ -231,6 +300,12 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
     thread_data->ctx = ctx;
     thread_data->thread_num = thread_num;
     thread_data->thread_index = 0;
+    thread_data->step = 1;
+    #if ARCH_X86_64
+        int cpu_flags = av_get_cpu_flags();
+        if (EXTERNAL_SSE4(cpu_flags))
+            thread_data->step = 4;
+    #endif
 
     //create threads
     for (int i = 0; i < thread_num; i++){
diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm b/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm
new file mode 100644
index 0000000000..dc781d42e5
--- /dev/null
+++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm
@@ -0,0 +1,214 @@
+;*****************************************************************************
+;* x86-optimized functions for dnn native backend convolution
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro COUNT_INPUT 0 ; tmp1d = y*src_linesize + x*input_num for the (possibly clamped) tap at (x_pos, y_pos), or -1 for a zero-padded tap; clobbers tmp2d
+    mov tmp1d, padding_method
+    cmp tmp1d, SAME_CLAMP_TO_EDGE
+    je .clamp
+    ; padding_method != SAME_CLAMP_TO_EDGE: a tap outside [0, height) x [0, width) yields the -1 sentinel
+    cmp y_posd, 0
+    jl .out_of_th
+    mov tmp2d, height
+    cmp y_posd, tmp2d
+    jge .out_of_th
+
+    cmp x_posd, 0
+    jl .out_of_th
+    mov tmp2d, width
+    cmp x_posd, tmp2d
+    jge .out_of_th
+    ; in range: index = y_pos * src_linesize + x_pos * input_num
+    mov tmp1d, y_posd
+    imul tmp1d, src_linesize
+    mov tmp2d, x_posd
+    imul tmp2d, input_num
+    add tmp1d, tmp2d
+    jmp .count_end
+
+    .out_of_th: ; out of range in the zero-padding path
+        mov tmp1d, -1
+        jmp .count_end
+    ; SAME_CLAMP_TO_EDGE: clamp y into tmp1d and x into tmp2d, then build the index
+    .clamp:
+    cmp y_posd, 0
+    jl .y_clamp_zero
+    mov tmp1d, height
+    cmp y_posd, tmp1d
+    jge .y_clamp_height
+    mov tmp1d, y_posd
+    jmp .y_normal
+
+    .y_clamp_zero:
+        xor tmp1d, tmp1d
+        jmp .y_normal
+
+    .y_clamp_height:
+        sub tmp1d, 1 ; tmp1d still holds height here, so this gives height - 1
+
+    .y_normal:
+
+    cmp x_posd, 0
+    jl .x_clamp_zero
+    mov tmp2d, width
+    cmp x_posd, tmp2d
+    jge .x_clamp_width
+    mov tmp2d, x_posd
+    jmp .x_normal
+
+    .x_clamp_zero:
+        xor tmp2d, tmp2d
+        jmp .x_normal
+
+    .x_clamp_width:
+        sub tmp2d, 1 ; tmp2d still holds width here, so this gives width - 1
+
+    .x_normal:
+
+    imul tmp1d, src_linesize
+    imul tmp2d, input_num
+    add tmp1d, tmp2d
+    ; NOTE(review): the labels are plain local labels, not %%-prefixed, so a second expansion in the same function would presumably redefine them
+    .count_end:
+%endmacro
+
+; void ff_dnn_execute_layer_conv2d_sse4(execute_data *execute_data);
+; Stores the raw convolution sum (no bias, no activation) for every output value, consuming 4 input channels per SIMD step.
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal dnn_execute_layer_conv2d, 8, 15, 3, execute_data,\
+    x, y, n_filter, cha, kernel_x, kernel_y, x_pos, y_pos, kernel_pos,\
+    input, output, kernel, tmp1, tmp2
+; The offsets below mirror struct execute_data in dnn_backend_native_layer_conv2d.c: 14 ints followed by three 8-byte pointers.
+%define thread_start [execute_dataq]
+%define thread_end [execute_dataq + 1 * 4]
+%define input_num [execute_dataq + 2 * 4]
+%define output_num [execute_dataq + 3 * 4]
+%define kernel_size [execute_dataq + 4 * 4]
+%define padding_method [execute_dataq + 5 * 4]
+%define dilation [execute_dataq + 6 * 4]
+%define pad_size [execute_dataq + 7 * 4]
+%define width [execute_dataq + 8 * 4]
+%define height [execute_dataq + 9 * 4]
+%define radius [execute_dataq + 10 * 4]
+%define src_linesize [execute_dataq + 11 * 4]
+%define filter_size [execute_dataq + 12 * 4]
+%define filter_linesize [execute_dataq + 13 * 4]
+%define SAME_CLAMP_TO_EDGE 2
+    ; NOTE(review): cglobal above declares 8 arguments while the C prototype passes only execute_data - confirm the count.
+    mov inputq, [execute_dataq + 14 * 4]
+    mov outputq, [execute_dataq + 14 * 4 + 8]
+    mov kernelq, [execute_dataq + 14 * 4 + 2 * 8]
+    ; Every loop below tests its condition at the bottom, so each loop body runs at least once.
+    mov yd, thread_start
+.loop_y:
+    mov xd, pad_size
+    .loop_x:
+        xor n_filterd, n_filterd
+        xor kernel_posq, kernel_posq ; weight cursor restarts per output pixel and then runs linearly through all filters
+        .loop_filter:
+            xorps m2, m2 ; m2 = four partial sums for this filter
+            xor kernel_yd, kernel_yd
+            ; y_pos = y + (0 - radius) * dilation, the first kernel row
+            mov tmp1d, kernel_yd
+            sub tmp1d, radius
+            mov y_posd, dilation
+            imul y_posd, tmp1d
+            add y_posd, yd
+
+            .loop_kery:
+                xor kernel_xd, kernel_xd
+                ; x_pos = x + (0 - radius) * dilation, the first kernel column
+                mov tmp1d, kernel_xd
+                sub tmp1d, radius
+                mov x_posd, dilation
+                imul x_posd, tmp1d
+                add x_posd, xd
+
+                .loop_kerx: ; COUNT_INPUT leaves the input index for (x_pos, y_pos) in tmp1d, or -1 for a zero-padded tap
+                    COUNT_INPUT
+                    xor chad, chad
+                    .loop_ch:
+                        cmp tmp1d, -1
+                        je .out
+                        ; valid tap: unaligned load of 4 consecutive input floats, then advance the index by 4
+                        movsxdifnidn tmp1q, tmp1d
+                        movups m0, [inputq + tmp1q * 4]
+                        add tmp1d, 4
+                        jmp .load_end
+
+                        .out: ; zero-padded tap: tmp1d stays -1 for the remaining channel groups
+                        xorps m0, m0
+
+                        .load_end:
+                        ; weights are read in loop order: filter, kernel row, kernel column, channel
+                        movups m1, [kernelq + kernel_posq * 4]
+                        add kernel_posq, 4
+
+                        mulps m0, m1
+                        addps m2, m0
+                        ; NOTE(review): steps by 4 while cha < input_num; if input_num is not a multiple of 4 the last step reads past this pixel's channels - confirm the caller guarantees it
+                        add chad, 4
+                        mov tmp2d, input_num
+                        cmp chad, tmp2d
+                        jl .loop_ch
+
+                    add x_posd, dilation
+                    add kernel_xd, 1
+                    mov tmp1d, kernel_size
+                    cmp kernel_xd, tmp1d
+                    jl .loop_kerx
+
+                add y_posd, dilation
+                add kernel_yd, 1
+                mov tmp1d, kernel_size
+                cmp kernel_yd, tmp1d
+                jl .loop_kery
+            ; two horizontal adds fold the four lanes of m2 into one sum, stored as output[n_filter]
+            haddps m2, m2
+            haddps m2, m2
+            movsxdifnidn n_filterq, n_filterd
+            movss [outputq + n_filterq * 4], m2
+
+            add n_filterd, 1
+            mov tmp1d, output_num
+            cmp n_filterd, tmp1d
+            jl .loop_filter
+        ; advance outputq by output_num floats (output_num * 4 bytes) to the next output pixel
+        mov tmp1d, output_num
+        movsxdifnidn tmp1q, tmp1d
+        shl tmp1d, 2
+        add outputq, tmp1q
+        add xd, 1
+        mov tmp2d, width
+        sub tmp2d, pad_size
+        cmp xd, tmp2d
+        jl .loop_x
+
+    add yd, 1
+    mov tmp1d, thread_end
+    cmp yd, tmp1d
+    jl .loop_y
+
+    RET
+%endif