From patchwork Wed Nov 27 15:13:54 2019
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Xu Jun <xujunzz@sjtu.edu.cn>
X-Patchwork-Id: 16460
Return-Path: <ffmpeg-devel-bounces@ffmpeg.org>
X-Original-To: patchwork@ffaux-bg.ffmpeg.org
Delivered-To: patchwork@ffaux-bg.ffmpeg.org
Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100])
	by ffaux.localdomain (Postfix) with ESMTP id 5F54B44A600
	for <patchwork@ffaux-bg.ffmpeg.org>;
	Wed, 27 Nov 2019 17:14:21 +0200 (EET)
Received: from [127.0.1.1] (localhost [127.0.0.1])
	by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 37A3668B0E7;
	Wed, 27 Nov 2019 17:14:21 +0200 (EET)
X-Original-To: ffmpeg-devel@ffmpeg.org
Delivered-To: ffmpeg-devel@ffmpeg.org
Received: from smtp180.sjtu.edu.cn (smtp180.sjtu.edu.cn [202.120.2.180])
	by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 7798768B07B
	for <ffmpeg-devel@ffmpeg.org>; Wed, 27 Nov 2019 17:14:13 +0200 (EET)
Received: from proxy01.sjtu.edu.cn (unknown [202.112.26.54])
	by smtp180.sjtu.edu.cn (Postfix) with ESMTPS id A46621008CBC3
	for <ffmpeg-devel@ffmpeg.org>; Wed, 27 Nov 2019 23:14:07 +0800 (CST)
Received: from localhost (localhost [127.0.0.1])
	by proxy01.sjtu.edu.cn (Postfix) with ESMTP id 92482201AEBD5;
	Wed, 27 Nov 2019 23:14:07 +0800 (CST)
X-Virus-Scanned: amavisd-new at proxy01.sjtu.edu.cn
Received: from proxy01.sjtu.edu.cn ([127.0.0.1])
	by localhost (proxy01.sjtu.edu.cn [127.0.0.1]) (amavisd-new,
	port 10026)
	with ESMTP id 8PJ5tv0SiJ9U; Wed, 27 Nov 2019 23:14:07 +0800 (CST)
Received: from localhost.localdomain (unknown [59.78.63.241])
	(Authenticated sender: xujunzz@sjtu.edu.cn)
	by proxy01.sjtu.edu.cn (Postfix) with ESMTPSA id 44EE720426A73;
	Wed, 27 Nov 2019 23:14:05 +0800 (CST)
From: xujunzz@sjtu.edu.cn
To: ffmpeg-devel@ffmpeg.org
Date: Wed, 27 Nov 2019 23:13:54 +0800
Message-Id: <20191127151354.7726-1-xujunzz@sjtu.edu.cn>
X-Mailer: git-send-email 2.17.1
Subject: [FFmpeg-devel] [PATCH] avfilter/vf_convolution: add x86 SIMD for
	filter_column()
X-BeenThere: ffmpeg-devel@ffmpeg.org
X-Mailman-Version: 2.1.20
Precedence: list
List-Id: FFmpeg development discussions and patches <ffmpeg-devel.ffmpeg.org>
List-Unsubscribe: <https://ffmpeg.org/mailman/options/ffmpeg-devel>,
	<mailto:ffmpeg-devel-request@ffmpeg.org?subject=unsubscribe>
List-Archive: <https://ffmpeg.org/pipermail/ffmpeg-devel>
List-Post: <mailto:ffmpeg-devel@ffmpeg.org>
List-Help: <mailto:ffmpeg-devel-request@ffmpeg.org?subject=help>
List-Subscribe: <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>,
	<mailto:ffmpeg-devel-request@ffmpeg.org?subject=subscribe>
Reply-To: FFmpeg development discussions and patches
	<ffmpeg-devel@ffmpeg.org>
Cc: xujunzz@sjtu.edu.cn
MIME-Version: 1.0
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

From: Xu Jun <xujunzz@sjtu.edu.cn>

Tested using a simple command:
./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 1000 -f null /dev/null

The fps increases from 284 to 693 on my local machine.

Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
---
 libavfilter/x86/vf_convolution.asm    | 129 ++++++++++++++++++++++++++
 libavfilter/x86/vf_convolution_init.c |   7 ++
 2 files changed, 136 insertions(+)

diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm
index b71e9720fb..49dfbab9c0 100755
--- a/libavfilter/x86/vf_convolution.asm
+++ b/libavfilter/x86/vf_convolution.asm
@@ -258,3 +258,132 @@ sub widthq, rq
 .end:
     RET
 %endif
+
+; void ff_filter_column16_sse4(uint8_t *dst, int height,
+;                              float rdiv, float bias, const int *const matrix,
+;                              const uint8_t *c[], int length, int radius,
+;                              int dstride, int stride);
+; For each of height rows and length columns (length is named width below):
+;   dst[x] = clip_uint8((int)(sum(c[i][x] * matrix[i]) * rdiv + bias + 0.5))
+; with i = 0..2*radius; per row the source offset advances by stride and the
+; destination offset by dstride. Returns at once if height or length is <= 0.
+%if ARCH_X86_64
+INIT_XMM sse4
+%if UNIX64
+cglobal filter_column16, 8, 15, 7, dst, height, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r
+%else
+cglobal filter_column16, 8, 15, 7, dst, height, rdiv, bias, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r
+%endif
+
+%if WIN64
+    SWAP m0, m2
+    SWAP m1, m3
+    mov r2q, matrixmp
+    mov r3q, ptrmp
+    mov r4q, widthmp
+    mov r5q, radmp
+    mov r6q, dstridemp
+    mov r7q, stridemp
+    DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r
+%endif
+
+    movsxdifnidn widthq, widthd
+    movsxdifnidn radq, radd
+    movsxdifnidn dstrideq, dstrided
+    movsxdifnidn strideq, strided
+    movsxdifnidn heightq, heightd
+    test heightq, heightq
+    jle .end                    ; no rows: the row loop below always runs once
+    test widthq, widthq
+    jle .end                    ; no columns: .loopr below always runs once
+    lea radq, [radq*2 + 1]      ; tap count = 2*radius+1
+    VBROADCASTSS m0, m0         ; m0 = rdiv in every lane
+    VBROADCASTSS m1, m1         ; m1 = bias in every lane
+    pxor m6, m6                 ; m6 = 0, used to zero-extend bytes
+    movss m5, [half]
+    VBROADCASTSS m5, m5         ; m5 = rounding term in every lane
+    xor dst_offq, dst_offq
+    xor c_offq, c_offq
+
+.loopy:
+    xor off16q, off16q
+    cmp widthq, mmsize/4
+    jl .loopr                   ; row narrower than one vector: scalar only
+
+    mov rq, widthq
+    and rq, mmsize/4-1          ; r = columns left over for the scalar tail
+    sub widthq, rq              ; width = vectorizable part (restored below)
+
+    .loop16: ; mmsize/4 pixels per iteration
+        pxor m4, m4
+        xor iq, iq
+        .loopi:
+            movss m2, [matrixq + 4*iq]
+            VBROADCASTSS m2, m2
+            mov ciq, [ptrq + iq * gprsize]
+            movss m3, [ciq + c_offq] ; mmsize/4 source bytes of c[i]
+            punpcklbw m3, m6         ; bytes -> words
+            punpcklwd m3, m6         ; words -> dwords
+            pmulld m2, m3
+            paddd m4, m2
+            add iq, 1
+            cmp iq, radq
+            jl .loopi
+
+        cvtdq2ps m4, m4
+        mulps m4, m0     ; sum *= rdiv
+        addps m4, m1     ; sum += bias
+        addps m4, m5     ; sum += 0.5
+        cvttps2dq m4, m4 ; truncate toward zero
+        packssdw m4, m4  ; the two saturating packs clip to 0..255
+        packuswb m4, m4
+        movss [dstq + dst_offq], m4
+        add c_offq, mmsize/4
+        add dst_offq, mmsize/4
+        add off16q, mmsize/4
+        cmp off16q, widthq
+        jl .loop16
+
+    add widthq, rq              ; restore the full row width
+    cmp off16q, widthq
+    jge .paraend                ; no tail columns left
+
+    .loopr: ; scalar tail, one pixel per iteration
+        xor sumd, sumd
+        xor iq, iq
+        .loopr_i:
+            mov ciq, [ptrq + iq * gprsize]
+            movzx rd, byte [ciq + c_offq] ; r is reused as scratch here
+            imul rd, [matrixq + 4*iq]
+            add sumd, rd
+            add iq, 1
+            cmp iq, radq
+            jl .loopr_i
+
+        pxor m4, m4      ; keep the unused upper lanes zero
+        cvtsi2ss m4, sumd
+        mulss m4, m0     ; sum *= rdiv
+        addss m4, m1     ; sum += bias
+        addss m4, m5     ; sum += 0.5
+        cvttps2dq m4, m4
+        packssdw m4, m4
+        packuswb m4, m4
+        movd sumd, m4
+        mov [dstq + dst_offq], sumb
+        add c_offq, 1
+        add dst_offq, 1
+        add off16q, 1
+        cmp off16q, widthq
+        jl .loopr
+
+    .paraend:
+    sub c_offq, widthq          ; rewind to the start of the row ...
+    sub dst_offq, widthq
+    add c_offq, strideq         ; ... then step to the next row
+    add dst_offq, dstrideq
+    sub heightq, 1
+    jg .loopy                   ; flags come from the sub above
+
+.end:
+    RET
+%endif
\ No newline at end of file
diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c
index 6b1c2f0e9f..d9e93296b9 100644
--- a/libavfilter/x86/vf_convolution_init.c
+++ b/libavfilter/x86/vf_convolution_init.c
@@ -56,6 +56,11 @@ static void filter_column16(uint8_t *dst, int height,
 
 }
 
+void ff_filter_column16_sse4(uint8_t *dst, int height,
+                             float rdiv, float bias, const int *const matrix,
+                             const uint8_t *c[], int length, int radius,
+                             int dstride, int stride); /* x86/vf_convolution.asm */
+
 av_cold void ff_convolution_init_x86(ConvolutionContext *s)
 {
 #if ARCH_X86_64
@@ -74,6 +79,8 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s)
         }
         if (s->mode[i] == MATRIX_COLUMN)
             s->filter[i] = filter_column16;
+        if (s->mode[i] == MATRIX_COLUMN && EXTERNAL_SSE4(cpu_flags))
+            s->filter[i] = ff_filter_column16_sse4; /* column mode only */
     }
 #endif
 }