[FFmpeg-devel] Improve dnn_backend_native convolution speed

Submitted by Pedro Arthur on June 12, 2018, 9:07 p.m.

Details

Message ID CAKN1MR7iiy+RaG+E252Y08CN+JDM6+j5SNrN22z4EvbK3gUZ_g@mail.gmail.com
State New
Headers show

Commit Message

Pedro Arthur June 12, 2018, 9:07 p.m.
The attached patch adds some specialized convolution functions based on the
filter size.

Benchmark (1190x670px image):
Filter        New        Old      Diff (%)
9x9x1x64 |  3.093662   5.135679   39.76%
1x1x64x32|  0.912451   5.670451   83.90%
5x5x32x1 |  0.502857   0.787371   36.13%
Total    |  4.51023    11.5954    61.10%

Patch hide | download patch | download mbox

From 3868e5f033c62b84d29a3592bb7997fa348c2e9c Mon Sep 17 00:00:00 2001
From: Pedro Arthur <bygrandao@gmail.com>
Date: Tue, 12 Jun 2018 17:47:05 -0300
Subject: [PATCH] Improve dnn_backend_native convolution speed

Changed memory layout from i x j x k (height, width, ch) to k x i x j
Added convolve function for 1x1xn filter case
Added convolve using 32x32 blocks of input
---
 libavfilter/dnn_backend_native.c | 212 ++++++++++++++++++++++++++-----
 1 file changed, 181 insertions(+), 31 deletions(-)

diff --git a/libavfilter/dnn_backend_native.c b/libavfilter/dnn_backend_native.c
index 6e80dd3663..9f6b690a82 100644
--- a/libavfilter/dnn_backend_native.c
+++ b/libavfilter/dnn_backend_native.c
@@ -51,6 +51,187 @@  typedef struct ConvolutionalNetwork{
     int32_t layers_num;
 } ConvolutionalNetwork;
 
+/* Element accessors for a Tensor_view:
+ * VINDEX3  - interleaved layout (i, j, k): data[c * (w * i + j) + k] (kernels)
+ * VINDEX2  - single plane (i, j):          data[w * i + j]
+ * VINDEX3A - planar layout (k, i, j):      data[w * (h * k + i) + j] (images)
+ */
+#define VINDEX3(view, i, j, k) (view)->data[((view)->c * ((view)->w * (i) + (j)) + (k))]
+#define VINDEX2(view, i, j) (view)->data[((view)->w * (i) + (j))]
+#define VINDEX3A(view, i, j, k) (view)->data[((view)->w * ((view)->h * (k) + (i)) + (j))]
+/* Clamp x into [0, w - 1] (edge replication). Every use of a parameter is
+ * parenthesized so expression arguments expand correctly. */
+#define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? ((w) - 1) : (x)))
+
+/* Lightweight non-owning view of a w x h x c float buffer; the pointed-to
+ * data is borrowed from the caller and never freed through the view. */
+typedef struct Tensor_view {
+    float *data; /* first element of the viewed buffer */
+    int w, h, c; /* width, height and channel count */
+} Tensor_view;
+
+/* Fill the single-plane buffer `buff` with a size x size window of channel
+ * `channel` from `in`, positioned `half` pixels up/left of (row, col);
+ * coordinates falling outside the image replicate the nearest edge pixel. */
+static void copy(Tensor_view *in, Tensor_view *buff, int size, int row, int col, int channel, int half)
+{
+    const int height = in->h;
+    const int width  = in->w;
+
+    for (int y = 0; y < size; ++y) {
+        const int src_row = CLAMP_TO_EDGE(row + y - half, height);
+        for (int x = 0; x < size; ++x) {
+            const int src_col = CLAMP_TO_EDGE(col + x - half, width);
+            VINDEX2(buff, y, x) = VINDEX3A(in, src_row, src_col, channel);
+        }
+    }
+}
+
+/* Copy the (ilen + 1) x (jlen + 1) plane `in` (inclusive bounds) into `out`
+ * at offset (row, col), adding `bias` and clamping negatives to zero (ReLU). */
+static void copy_relu(Tensor_view *in, Tensor_view *out, int row, int col, int ilen, int jlen, float bias)
+{
+    for (int y = 0; y <= ilen; ++y) {
+        for (int x = 0; x <= jlen; ++x) {
+            const float v = VINDEX2(in, y, x) + bias;
+            VINDEX3A(out, row + y, col + x, 0) = v > 0 ? v : 0;
+        }
+    }
+}
+
+/* Convolve one tile of the image: for each input channel, gather the clamped
+ * 32x32 input window centered on (row, col) into a stack buffer, accumulate
+ * that channel's filter response into a zeroed output tile, then store the
+ * tile back with bias + ReLU applied via copy_relu(). */
+static void do_block(Tensor_view *in, Tensor_view *out, Tensor_view *kern, float bias, const int row, const int col, int w, int h, int fw)
+{
+    float tmp[32 * 32];
+    float tmp2[32 * 32];
+
+    int half = fw / 2;
+    /* Valid output extent of this tile: up to 32 - fw + 1 rows/cols, clipped
+     * at the image border. Bounds are inclusive, hence the <= loops below. */
+    int ilen = FFMIN(32 - fw, h - row - 1);
+    int jlen = FFMIN(32 - fw, w - col - 1);
+
+    Tensor_view buf = {tmp, 32, 32, 1};
+    Tensor_view obuf = { tmp2, 32, 32, 1 };
+    memset(tmp2, 0, sizeof(float) * 32 * 32);
+
+
+    for (int k = 0; k < kern->c; ++k) {
+        /* Stage channel k's edge-replicated window once per channel. */
+        copy(in, &buf, 32, row, col, k, half);
+        for (int ii = 0; ii <= ilen; ++ii) {
+            for (int jj = 0; jj <= jlen; ++jj) {
+
+                float acc = 0;
+                for (int i = 0; i < fw; ++i) {
+                    for (int j = 0; j < fw; ++j) {
+                        acc += VINDEX2(&buf, ii + i, jj + j) * VINDEX3(kern, i, j, k);
+                    }
+                }
+                VINDEX2(&obuf, ii, jj) += acc;
+            }
+        }
+    }
+    copy_relu(&obuf, out, row, col, ilen, jlen, bias);
+}
+
+
+/* Tiled convolution driver: walk the image in output tiles of
+ * (32 - fw + 1) x (32 - fw + 1) pixels so that each tile's padded input
+ * window fits the 32x32 stack buffers used by do_block(). */
+static void convolve_block_32(Tensor_view *in, Tensor_view *kernel, Tensor_view *out, float bias, int w, int h, int c, int fw)
+{
+    const int step = 32 - fw + 1;
+
+    for (int row = 0; row < h; row += step) {
+        for (int col = 0; col < w; col += step) {
+            do_block(in, out, kernel, bias, row, col, w, h, fw);
+        }
+    }
+}
+
+/* Pointwise (1x1xc) convolution: out = ReLU(bias + sum_k in[k] * kernel[k]),
+ * computed one channel plane at a time for cache-friendly access.
+ * Fix: the previous version only initialized the output when c > 1, so for
+ * c == 1 the final pass accumulated into uninitialized memory. Channel 0 now
+ * always initializes the plane and the last-channel accumulation is guarded. */
+static void convolve_1x1(Tensor_view *in, Tensor_view *kernel, Tensor_view *out, float bias, int w, int h, int c, int fw)
+{
+    /* Channel 0: plain store, so the output never needs pre-clearing. */
+    for (int i = 0; i < h; ++i) {
+        for (int j = 0; j < w; ++j) {
+            VINDEX3A(out, i, j, 0) = VINDEX3A(in, i, j, 0) * kernel->data[0];
+        }
+    }
+
+    /* Channels 1 .. c-2: accumulate. */
+    for (int k = 1; k < c - 1; ++k) {
+        for (int i = 0; i < h; ++i) {
+            for (int j = 0; j < w; ++j) {
+                VINDEX3A(out, i, j, 0) += VINDEX3A(in, i, j, k) * kernel->data[k];
+            }
+        }
+    }
+
+    /* Last channel (distinct from channel 0 only when c > 1), bias, ReLU. */
+    for (int i = 0; i < h; ++i) {
+        for (int j = 0; j < w; ++j) {
+            float v = VINDEX3A(out, i, j, 0);
+            if (c > 1)
+                v += VINDEX3A(in, i, j, c - 1) * kernel->data[c - 1];
+            VINDEX3A(out, i, j, 0) = FFMAX(v + bias, 0);
+        }
+    }
+}
+
+/* Accumulate the fw x fw filter response of channel k at output pixel (i, j),
+ * replicating edge pixels for out-of-bounds taps. */
+static float convolve_channel_at(Tensor_view *in, Tensor_view *kernel, int i, int j, int k, int w, int h, int fw, int half)
+{
+    float acc = 0;
+
+    for (int ii = 0; ii < fw; ++ii) {
+        int row = CLAMP_TO_EDGE(i + ii - half, h);
+        for (int jj = 0; jj < fw; ++jj) {
+            int col = CLAMP_TO_EDGE(j + jj - half, w);
+            acc += VINDEX3A(in, row, col, k) * VINDEX3(kernel, ii, jj, k);
+        }
+    }
+    return acc;
+}
+
+/* Generic fallback convolution for arbitrary filter sizes, processed one
+ * channel plane at a time.
+ * Fix: the previous version only initialized the output when c > 1, so for
+ * c == 1 the final pass accumulated into uninitialized memory. Channel 0 now
+ * always initializes the plane and the last-channel pass is guarded. */
+static void convolve_generic(Tensor_view *in, Tensor_view *kernel, Tensor_view *out, float bias, int w, int h, int c, int fw)
+{
+    int half = fw / 2;
+
+    /* Channel 0: plain store, so the output never needs pre-clearing. */
+    for (int i = 0; i < h; ++i) {
+        for (int j = 0; j < w; ++j) {
+            VINDEX3A(out, i, j, 0) = convolve_channel_at(in, kernel, i, j, 0, w, h, fw, half);
+        }
+    }
+
+    /* Channels 1 .. c-2: accumulate. */
+    for (int k = 1; k < c - 1; ++k) {
+        for (int i = 0; i < h; ++i) {
+            for (int j = 0; j < w; ++j) {
+                VINDEX3A(out, i, j, 0) += convolve_channel_at(in, kernel, i, j, k, w, h, fw, half);
+            }
+        }
+    }
+
+    /* Last channel (distinct from channel 0 only when c > 1), bias, ReLU. */
+    for (int i = 0; i < h; ++i) {
+        for (int j = 0; j < w; ++j) {
+            float acc = VINDEX3A(out, i, j, 0);
+            if (c > 1)
+                acc += convolve_channel_at(in, kernel, i, j, c - 1, w, h, fw, half);
+            VINDEX3A(out, i, j, 0) = FFMAX(acc + bias, 0);
+        }
+    }
+}
+
+/* Run one convolutional layer: for every output feature map, pick the fastest
+ * routine for this filter size (pointwise, 32x32 blocked, generic fallback)
+ * and convolve the planar input into one planar output plane. */
+static void convolve(const float* input, float* output, const ConvolutionalParams* conv_params, int32_t width, int32_t height)
+{
+    const int plane_size  = width * height;
+    const int filter_size = conv_params->kernel_size * conv_params->kernel_size * conv_params->input_num;
+    Tensor_view in = {(float*)input, width, height, conv_params->input_num};
+
+    for (int filt = 0; filt < conv_params->output_num; ++filt) {
+        float bias = conv_params->biases[filt];
+        Tensor_view out = {output + filt * plane_size, width, height, 1};
+        Tensor_view kern = {conv_params->kernel + filt * filter_size, conv_params->kernel_size, conv_params->kernel_size, conv_params->input_num};
+
+        if (kern.w == 1 && kern.h == 1) {
+            convolve_1x1(&in, &kern, &out, bias, width, height, conv_params->input_num, conv_params->kernel_size);
+        } else if (kern.w < 16 && kern.h < 16) {
+            convolve_block_32(&in, &kern, &out, bias, width, height, conv_params->input_num, conv_params->kernel_size);
+        } else {
+            convolve_generic(&in, &kern, &out, bias, width, height, conv_params->input_num, conv_params->kernel_size);
+        }
+    }
+}
+
 static DNNReturnType set_input_output_native(void* model, const DNNData* input, const DNNData* output)
 {
     ConvolutionalNetwork* network = (ConvolutionalNetwork*)model;
@@ -289,37 +470,6 @@  DNNModel* ff_dnn_load_default_model_native(DNNDefaultModel model_type)
     }
 }
 
-#define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
-
-static void convolve(const float* input, float* output, const ConvolutionalParams* conv_params, int32_t width, int32_t height)
-{
-    int y, x, n_filter, ch, kernel_y, kernel_x;
-    int radius = conv_params->kernel_size >> 1;
-    int src_linesize = width * conv_params->input_num;
-    int filter_linesize = conv_params->kernel_size * conv_params->input_num;
-    int filter_size = conv_params->kernel_size * filter_linesize;
-
-    for (y = 0; y < height; ++y){
-        for (x = 0; x < width; ++x){
-            for (n_filter = 0; n_filter < conv_params->output_num; ++n_filter){
-                output[n_filter] = conv_params->biases[n_filter];
-                for (ch = 0; ch < conv_params->input_num; ++ch){
-                    for (kernel_y = 0; kernel_y < conv_params->kernel_size; ++kernel_y){
-                        for (kernel_x = 0; kernel_x < conv_params->kernel_size; ++kernel_x){
-                            output[n_filter] += input[CLAMP_TO_EDGE(y + kernel_y - radius, height) * src_linesize +
-                                                      CLAMP_TO_EDGE(x + kernel_x - radius, width) * conv_params->input_num + ch] *
-                                                conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
-                                                                    kernel_x * conv_params->input_num + ch];
-                        }
-                    }
-                }
-                output[n_filter] = FFMAX(output[n_filter], 0.0);
-            }
-            output += conv_params->output_num;
-        }
-    }
-}
-
 DNNReturnType ff_dnn_execute_model_native(const DNNModel* model)
 {
     ConvolutionalNetwork* network = (ConvolutionalNetwork*)model->model;
-- 
2.17.1