From patchwork Tue Dec  3 07:52:06 2019
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Xu Jun <xujunzz@sjtu.edu.cn>
X-Patchwork-Id: 16551
Return-Path: <ffmpeg-devel-bounces@ffmpeg.org>
X-Original-To: patchwork@ffaux-bg.ffmpeg.org
Delivered-To: patchwork@ffaux-bg.ffmpeg.org
Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100])
	by ffaux.localdomain (Postfix) with ESMTP id D603044AB5A
	for <patchwork@ffaux-bg.ffmpeg.org>;
	Tue,  3 Dec 2019 09:52:26 +0200 (EET)
Received: from [127.0.1.1] (localhost [127.0.0.1])
	by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id C572868AD86;
	Tue,  3 Dec 2019 09:52:26 +0200 (EET)
X-Original-To: ffmpeg-devel@ffmpeg.org
Delivered-To: ffmpeg-devel@ffmpeg.org
Received: from smtp180.sjtu.edu.cn (smtp180.sjtu.edu.cn [202.120.2.180])
	by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 2389F68AD9A
	for <ffmpeg-devel@ffmpeg.org>; Tue,  3 Dec 2019 09:52:18 +0200 (EET)
Received: from proxy06.sjtu.edu.cn (smtp188.sjtu.edu.cn [202.120.2.188])
	by smtp180.sjtu.edu.cn (Postfix) with ESMTPS id 6A91D1008CBC9
	for <ffmpeg-devel@ffmpeg.org>; Tue,  3 Dec 2019 15:52:12 +0800 (CST)
Received: from localhost (localhost.localdomain [127.0.0.1])
	by proxy06.sjtu.edu.cn (Postfix) with ESMTP id 09C177DBEF;
	Tue,  3 Dec 2019 15:52:11 +0800 (CST)
X-Virus-Scanned: amavisd-new at proxy06.sjtu.edu.cn
Received: from proxy06.sjtu.edu.cn ([127.0.0.1])
	by localhost (proxy06.sjtu.edu.cn [127.0.0.1]) (amavisd-new,
	port 10026)
	with ESMTP id DLtkYIYBFAPG; Tue,  3 Dec 2019 15:52:10 +0800 (CST)
Received: from localhost.localdomain (unknown [202.120.39.2])
	by proxy06.sjtu.edu.cn (Postfix) with ESMTPSA id F33543A740;
	Tue,  3 Dec 2019 15:52:09 +0800 (CST)
From: xujunzz@sjtu.edu.cn
To: ffmpeg-devel@ffmpeg.org
Date: Tue,  3 Dec 2019 15:52:06 +0800
Message-Id: <20191203075207.26243-2-xujunzz@sjtu.edu.cn>
X-Mailer: git-send-email 2.17.1
In-Reply-To: <20191203075207.26243-1-xujunzz@sjtu.edu.cn>
References: <20191203075207.26243-1-xujunzz@sjtu.edu.cn>
Subject: [FFmpeg-devel] [PATCH 2/3] avfilter/vf_convolution: Add x86 SIMD
	optimizations for filter_row()
X-BeenThere: ffmpeg-devel@ffmpeg.org
X-Mailman-Version: 2.1.20
Precedence: list
List-Id: FFmpeg development discussions and patches <ffmpeg-devel.ffmpeg.org>
List-Unsubscribe: <https://ffmpeg.org/mailman/options/ffmpeg-devel>,
	<mailto:ffmpeg-devel-request@ffmpeg.org?subject=unsubscribe>
List-Archive: <https://ffmpeg.org/pipermail/ffmpeg-devel>
List-Post: <mailto:ffmpeg-devel@ffmpeg.org>
List-Help: <mailto:ffmpeg-devel-request@ffmpeg.org?subject=help>
List-Subscribe: <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>,
	<mailto:ffmpeg-devel-request@ffmpeg.org?subject=subscribe>
Reply-To: FFmpeg development discussions and patches
	<ffmpeg-devel@ffmpeg.org>
Cc: xujunzz@sjtu.edu.cn
MIME-Version: 1.0
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

From: Xu Jun <xujunzz@sjtu.edu.cn>

Benchmarked with the following command (8-bit yuv420p input, "row" mode on all four planes):
./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:row:row:row:row" -an -vframes 5000 -f null /dev/null -benchmark

after patch:
frame= 4317 fps=477 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=19.1x
video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=29.840s stime=2.121s rtime=9.047s
bench: maxrss=15156kB

before patch:
frame= 4317 fps=187 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed= 7.5x
video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=106.948s stime=2.382s rtime=23.039s
bench: maxrss=15224kB

Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
---
 libavfilter/x86/vf_convolution.asm    | 104 ++++++++++++++++++++++++++
 libavfilter/x86/vf_convolution_init.c |   9 +++
 2 files changed, 113 insertions(+)
 mode change 100644 => 100755 libavfilter/x86/vf_convolution.asm

diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm
old mode 100644
new mode 100755
index 754d4d1064..b71e9720fb
--- a/libavfilter/x86/vf_convolution.asm
+++ b/libavfilter/x86/vf_convolution.asm
@@ -154,3 +154,107 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c
 INIT_XMM sse4
 FILTER_3X3
 %endif
+
+; void filter_row_sse4(uint8_t *dst, int width,
+;                      float rdiv, float bias, const int *const matrix,
+;                      const uint8_t *c[], int peak, int radius,
+;                      int dstride, int stride)
+; For each x in [0, width): sum = c[i][x] * matrix[i] over i in [0, 2*radius+1),
+; then dst[x] = clip_uint8((int)(sum * rdiv + bias + 0.5)). No-op if width <= 0.
+%if ARCH_X86_64
+INIT_XMM sse4
+%if UNIX64
+cglobal filter_row, 6, 10, 6, dst, width, matrix, ptr, mult, rad, r, x, i, ci
+%else
+cglobal filter_row, 4, 10, 6, dst, width, rdiv, bias, matrix, ptr, mult, rad, r, x, i, ci
+%endif
+
+%if WIN64
+    SWAP m0, m2                 ; rdiv is passed in xmm2
+    SWAP m1, m3                 ; bias is passed in xmm3
+    mov r2q, matrixmp
+    mov r3q, ptrmp
+    mov r5q, radmp
+    DEFINE_ARGS dst, width, matrix, ptr, mult, rad, r, x, i, ci
+%endif
+
+    movsxdifnidn widthq, widthd
+    test widthq, widthq
+    jle .end                    ; width <= 0: nothing to write
+    movsxdifnidn radq, radd
+    lea radq, [2*radq + 1]      ; number of taps = 2*radius+1
+    VBROADCASTSS m0, m0         ; rdiv in all lanes
+    VBROADCASTSS m1, m1         ; bias in all lanes
+    movss m5, [half]
+    VBROADCASTSS m5, m5         ; rounding term in all lanes
+
+    xor xq, xq
+    cmp widthq, mmsize/4
+    jl .loop2                   ; too narrow for one vector: scalar only
+
+    mov rq, widthq
+    and rq, mmsize/4-1          ; r = pixels left for the scalar tail
+    sub widthq, rq              ; width = vectorizable pixel count
+
+.loop1:                         ; mmsize/4 pixels per iteration
+    pxor m4, m4                 ; per-pixel int32 accumulators
+    xor iq, iq
+    .loop1_1:
+        movss m2, [matrixq + 4*iq]
+        VBROADCASTSS m2, m2     ; coefficient i in all lanes
+        mov ciq, [ptrq + iq * gprsize]
+        pmovzxbd m3, [ciq + xq] ; 4 source bytes -> 4 dwords
+        pmulld m2, m3
+        paddd m4, m2
+
+        add iq, 1
+        cmp iq, radq
+        jl .loop1_1
+
+    cvtdq2ps m4, m4
+    mulps m4, m0     ; sum *= rdiv
+    addps m4, m1     ; sum += bias
+    addps m4, m5     ; sum += 0.5
+    cvttps2dq m4, m4
+    packssdw m4, m4
+    packuswb m4, m4             ; saturate to 0..255
+    movd [dstq + xq], m4
+
+    add xq, mmsize/4
+    cmp xq, widthq
+    jl .loop1
+
+    add widthq, rq              ; restore the full width
+    cmp xq, widthq
+    jge .end
+
+.loop2:                         ; scalar path, one pixel per iteration
+    xor rd, rd                  ; r is reused as the running sum
+    xor iq, iq
+    .loop2_2:
+        mov ciq, [ptrq + iq * gprsize]
+        movzx multd, byte [ciq + xq]
+        imul multd, [matrixq + 4*iq]
+        add rd, multd
+
+        add iq, 1
+        cmp iq, radq
+        jl .loop2_2
+
+    pxor m4, m4
+    cvtsi2ss m4, rd
+    mulss m4, m0     ; sum *= rdiv
+    addss m4, m1     ; sum += bias
+    addss m4, m5     ; sum += 0.5
+    cvttps2dq m4, m4
+    packssdw m4, m4
+    packuswb m4, m4
+    movd rd, m4
+    mov [dstq + xq], rb
+
+    add xq, 1
+    cmp xq, widthq
+    jl .loop2
+.end:
+    RET
+%endif
diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c
index 51432406ed..5eb3b3bee1 100644
--- a/libavfilter/x86/vf_convolution_init.c
+++ b/libavfilter/x86/vf_convolution_init.c
@@ -29,6 +29,11 @@ void ff_filter_3x3_sse4(uint8_t *dst, int width,
                         const uint8_t *c[], int peak, int radius,
                         int dstride, int stride);
 
+void ff_filter_row_sse4(uint8_t *dst, int width,
+                        float rdiv, float bias, const int *const matrix,
+                        const uint8_t *c[], int peak, int radius,
+                        int dstride, int stride);
+
 av_cold void ff_convolution_init_x86(ConvolutionContext *s)
 {
 #if ARCH_X86_64
@@ -41,6 +46,10 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s)
                     s->filter[i] = ff_filter_3x3_sse4;
             }
         }
+        if (s->mode[i] == MATRIX_ROW && s->depth == 8) { /* asm is 8-bit only */
+            if (EXTERNAL_SSE4(cpu_flags))
+                s->filter[i] = ff_filter_row_sse4;
+        }
     }
 #endif
 }