From patchwork Sun Dec 22 08:37:03 2019
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Xu Jun <xujunzz@sjtu.edu.cn>
X-Patchwork-Id: 16919
Return-Path: <ffmpeg-devel-bounces@ffmpeg.org>
X-Original-To: patchwork@ffaux-bg.ffmpeg.org
Delivered-To: patchwork@ffaux-bg.ffmpeg.org
Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100])
	by ffaux.localdomain (Postfix) with ESMTP id 3CA6E44123C
	for <patchwork@ffaux-bg.ffmpeg.org>;
	Sun, 22 Dec 2019 10:37:25 +0200 (EET)
Received: from [127.0.1.1] (localhost [127.0.0.1])
	by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 0D20E68A71F;
	Sun, 22 Dec 2019 10:37:25 +0200 (EET)
X-Original-To: ffmpeg-devel@ffmpeg.org
Delivered-To: ffmpeg-devel@ffmpeg.org
Received: from smtp180.sjtu.edu.cn (smtp180.sjtu.edu.cn [202.120.2.180])
	by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 915BD68A3A3
	for <ffmpeg-devel@ffmpeg.org>; Sun, 22 Dec 2019 10:37:17 +0200 (EET)
Received: from proxy06.sjtu.edu.cn (smtp188.sjtu.edu.cn [202.120.2.188])
	by smtp180.sjtu.edu.cn (Postfix) with ESMTPS id A6047100AFC26
	for <ffmpeg-devel@ffmpeg.org>; Sun, 22 Dec 2019 16:37:10 +0800 (CST)
Received: from localhost (localhost.localdomain [127.0.0.1])
	by proxy06.sjtu.edu.cn (Postfix) with ESMTP id 991F895F3C7;
	Sun, 22 Dec 2019 16:37:10 +0800 (CST)
X-Virus-Scanned: amavisd-new at proxy06.sjtu.edu.cn
Received: from proxy06.sjtu.edu.cn ([127.0.0.1])
	by localhost (proxy06.sjtu.edu.cn [127.0.0.1]) (amavisd-new,
	port 10026)
	with ESMTP id 4oPksQJRBv2l; Sun, 22 Dec 2019 16:37:10 +0800 (CST)
Received: from localhost.localdomain (unknown [202.120.39.204])
	by proxy06.sjtu.edu.cn (Postfix) with ESMTPSA id C8C88AAF684;
	Sun, 22 Dec 2019 16:37:07 +0800 (CST)
From: xujunzz@sjtu.edu.cn
To: ffmpeg-devel@ffmpeg.org
Date: Sun, 22 Dec 2019 16:37:03 +0800
Message-Id: <20191222083703.3080-3-xujunzz@sjtu.edu.cn>
X-Mailer: git-send-email 2.17.1
In-Reply-To: <20191222083703.3080-1-xujunzz@sjtu.edu.cn>
References: <20191222083703.3080-1-xujunzz@sjtu.edu.cn>
Subject: [FFmpeg-devel] [PATCH v2 3/3] avfilter/vf_convolution: Add X86 SIMD
	optimizations for filter_column()
X-BeenThere: ffmpeg-devel@ffmpeg.org
X-Mailman-Version: 2.1.20
Precedence: list
List-Id: FFmpeg development discussions and patches <ffmpeg-devel.ffmpeg.org>
List-Unsubscribe: <https://ffmpeg.org/mailman/options/ffmpeg-devel>,
	<mailto:ffmpeg-devel-request@ffmpeg.org?subject=unsubscribe>
List-Archive: <https://ffmpeg.org/pipermail/ffmpeg-devel>
List-Post: <mailto:ffmpeg-devel@ffmpeg.org>
List-Help: <mailto:ffmpeg-devel-request@ffmpeg.org?subject=help>
List-Subscribe: <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>,
	<mailto:ffmpeg-devel-request@ffmpeg.org?subject=subscribe>
Reply-To: FFmpeg development discussions and patches
	<ffmpeg-devel@ffmpeg.org>
Cc: xujunzz@sjtu.edu.cn
MIME-Version: 1.0
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

From: Xu Jun <xujunzz@sjtu.edu.cn>

Performance improves about 10% compared to v1.

Tested using this command:
./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 5000 -f null /dev/null -benchmark

after patch:
frame= 4317 fps=600 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=  24x
video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=21.540s stime=2.091s rtime=7.197s

before patch:
frame= 4317 fps=263 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=10.5x
video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=74.377s stime=1.880s rtime=16.420s

Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
---
 libavfilter/x86/vf_convolution.asm    | 202 ++++++++++++++++++++++++++
 libavfilter/x86/vf_convolution_init.c |   9 ++
 2 files changed, 211 insertions(+)

diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm
index 2a09374b00..4c700656d6 100755
--- a/libavfilter/x86/vf_convolution.asm
+++ b/libavfilter/x86/vf_convolution.asm
@@ -22,6 +22,8 @@
 
 SECTION_RODATA
 half:   dd 0.5
+shuf_init:   dd 0x80808000, 0x80808001, 0x80808002, 0x80808003    ; pshufb mask: dword k takes source byte k in its low byte, 0x80 zeroes the other bytes; dwords instead of one 128-bit ddq literal
+shuf_step:   times 4 dd 4    ; paddd increment: moves every mask dword on to the next group of 4 source bytes
 
 SECTION .text
 
@@ -285,3 +287,203 @@ sub widthq, rq
 .end:
     RET
 %endif
+
+; void filter_column(uint8_t *dst, int height,
+;                         float rdiv, float bias, const int *const matrix,
+;                         const uint8_t *c[], int length, int radius,
+;                         int dstride, int stride);
+
+%macro COMPUTE_4COL 1    ; %1 = column group 0-3; accumulates into m1%1 (m10-m13); reads m4, m5, m6; clobbers m7
+    pshufb m7, m6, m4    ; m7 = the 4 source bytes of m6 selected by mask m4 (mask bytes with 0x80 produce zero)
+    pmulld m7, m5    ; multiply each dword by m5 (callers broadcast matrix[i] into m5)
+    paddd m1%1, m7    ; m1%1 += products
+%endmacro
+
+%macro CVT_PACK_COL 1    ; %1 = column group 0-3; converts the 4 int32 sums in m1%1 to 4 saturated uint8s; reads m0, m1, m3
+    cvtdq2ps m1%1, m1%1    ; int32 -> float
+    mulps m1%1, m0     ; sum *= rdiv
+    addps m1%1, m1     ; sum += bias
+    addps m1%1, m3     ; sum += 0.5
+    cvttps2dq m1%1, m1%1    ; float -> int32 with truncation
+    packssdw m1%1, m1%1    ; int32 -> int16, signed saturation
+    packuswb m1%1, m1%1    ; int16 -> uint8, unsigned saturation; the 4 results sit in the low 32 bits
+%endmacro
+
+%if ARCH_X86_64    ; the function declares 14 GPRs and uses m8-m13
+INIT_XMM sse4    ; 128-bit registers, mmsize = 16
+%if UNIX64    ; rdiv and bias are not in the GPR argument list here; they are read from m0/m1 below
+cglobal filter_column, 8, 14, 14, dst, height, matrix, ptr, width, rad, dstride, stride, \
+i, ci, ystride, sum, r, off16
+%else    ; rdiv and bias occupy argument slots 2 and 3 in this signature
+cglobal filter_column, 8, 14, 14, dst, height, rdiv, bias, matrix, ptr, width, rad, dstride, stride, \
+i, ci, ystride, sum, r, off16
+%endif
+
+%if WIN64
+    SWAP m0, m2    ; make the name m0 refer to the register that holds rdiv
+    SWAP m1, m3    ; make the name m1 refer to the register that holds bias
+    mov r2q, matrixmp    ; reload the remaining arguments from their argument slots into r2-r7 ...
+    mov r3q, ptrmp
+    mov r4q, widthmp
+    mov r5q, radmp
+    mov r6q, dstridemp
+    mov r7q, stridemp
+    DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, \
+    i, ci, ystride, sum, r, off16
+%endif    ; ... so that from here on both ABIs share the same register names
+
+movsxdifnidn widthq, widthd    ; sign-extend the int arguments to 64 bit for use in addressing and compares
+movsxdifnidn radq, radd
+lea radq, [radq * 2 + 1]    ; radq = 2*radius + 1 = number of matrix taps
+movsxdifnidn dstrideq, dstrided
+movsxdifnidn strideq, strided
+movsxdifnidn heightq, heightd
+
+VBROADCASTSS m0, m0    ; rdiv
+VBROADCASTSS m1, m1    ; bias
+pxor m2, m2    ; zero (NOTE(review): m2 is not read again in this function)
+movss m3, [half]
+VBROADCASTSS m3, m3    ; 0.5
+movdqu m8, [shuf_init]      ; shuffle initialization
+movdqu m9, [shuf_step]    ; shuffle step
+
+xor ystrideq, ystrideq    ; y*stride, starting at row 0
+
+cmp widthq, mmsize    ; width < 16 -> .less16; otherwise .equal16, which always processes exactly 16 columns per row
+jl .less16
+
+.equal16:    ; one iteration per output row
+    pxor m10, m10
+    pxor m11, m11
+    pxor m12, m12
+    pxor m13, m13
+    ; m10-13 hold sums
+
+    lea iq, [radq - 1]    ; i = 2*radius; taps are walked from the last to the first
+    .loopi:
+        movd m5, [matrixq + 4*iq]    ; matrix[i]
+        VBROADCASTSS m5, m5
+        mov ciq, [ptrq + iq * gprsize]    ; ci = c[i]
+        movdqu m6, [ciq + ystrideq]    ; c[i][y*stride] 16 uint8s
+
+        ;m4 controls shuffle
+        movdqa m4, m8
+        COMPUTE_4COL 0    ; process 0-3 cols, sum in m10
+        paddd m4, m9
+        COMPUTE_4COL 1    ; process 4-7 cols, sum in m11
+        paddd m4, m9
+        COMPUTE_4COL 2    ; process 8-11 cols, sum in m12
+        paddd m4, m9
+        COMPUTE_4COL 3    ; process 12-15 cols, sum in m13
+
+        sub iq, 1
+        jns .loopi    ; continue while i >= 0
+
+    CVT_PACK_COL 0    ; process 0-3 cols, result in m10's low 32bit
+    CVT_PACK_COL 1    ; process 4-7 cols, result in m11's low 32bit
+    CVT_PACK_COL 2    ; process 8-11 cols, result in m12's low 32bit
+    CVT_PACK_COL 3    ; process 12-15 cols, result in m13's low 32bit
+    punpckldq m10, m11    ; m10 low 64 bits = cols 0-7
+    punpckldq m12, m13    ; m12 low 64 bits = cols 8-15
+    punpcklqdq m10, m12    ; pack 16 results in m10
+    movdqu [dstq], m10
+
+    add dstq, dstrideq    ; next output row
+    add ystrideq, strideq    ; next input row offset
+    sub heightq, 1
+    jnz .equal16
+    jmp .end
+
+.less16:    ; one iteration per output row: groups of 4 columns with SIMD, leftover columns in the scalar loop
+    xor off16q, off16q    ; scalar column index starts at 0 (used as-is when width < 4)
+    cmp widthq, mmsize/4
+    jl .loopr    ; fewer than 4 columns: scalar loop only
+
+    mov   rq, widthq
+    and   rq, mmsize/4-1    ; r = width % 4, columns left for the scalar loop
+    sub   widthq, rq    ; width = number of columns handled with SIMD (multiple of 4)
+
+    pxor m10, m10
+    pxor m11, m11
+    pxor m12, m12
+
+    lea iq, [radq - 1]    ; i = 2*radius; taps are walked from the last to the first
+    .loopi_4:
+        movd m5, [matrixq + 4*iq]    ; matrix[i]
+        VBROADCASTSS m5, m5
+        mov ciq, [ptrq + iq * gprsize]    ; ci = c[i]
+        movdqu m6, [ciq + ystrideq]    ; c[i][y*stride] 16 uint8s (NOTE(review): full 16-byte load although width < 16 on this path)
+
+        ;m4 controls shuffle
+        movdqa m4, m8
+        COMPUTE_4COL 0    ; process 0-3 cols, sum in m10
+        cmp widthq, mmsize/4 ; width = 4
+        je .i4_end
+
+        paddd m4, m9
+        COMPUTE_4COL 1    ; process 4-7 cols, sum in m11
+        cmp widthq, mmsize/2 ; width = 8
+        je .i4_end
+
+        paddd m4, m9
+        COMPUTE_4COL 2    ; process 8-11 cols, sum in m12
+
+        .i4_end:
+        sub iq, 1
+        jns .loopi_4    ; continue while i >= 0
+
+    CVT_PACK_COL 0    ; process 0-3 cols, result in m10's low 32bit
+    movd [dstq], m10
+    cmp widthq, mmsize/4 ; width = 4
+    je .cvt_end
+
+    CVT_PACK_COL 1    ; process 4-7 cols, result in m11's low 32bit
+    movd [dstq + mmsize/4], m11
+    cmp widthq, mmsize/2 ; width = 8
+    je .cvt_end
+
+    CVT_PACK_COL 2    ; process 8-11 cols, result in m12's low 32bit
+    movd [dstq + mmsize/2], m12
+
+    .cvt_end:
+    cmp rq, 0
+    je .loopr_end    ; no leftover columns in this row
+    mov off16q, widthq    ; scalar loop resumes right after the SIMD-processed columns
+    add widthq, rq    ; restore the full width: scalar loop bound, and input for the next row
+
+    .loopr:    ; one iteration per leftover column
+        xor sumq, sumq
+        lea iq, [radq - 1]
+        .loopr_i:
+            mov ciq, [ptrq + iq * gprsize]
+            add ciq, ystrideq
+            movzx rd, byte [ciq + off16q]    ; r = c[i][y*stride + off16]; overwrites the remainder, which .less16 recomputes
+            imul rd, [matrixq + 4*iq]    ; r *= matrix[i]
+            add sumd, rd
+
+            sub iq, 1
+            jns .loopr_i
+
+        pxor m7, m7
+        cvtsi2ss m7, sumd
+        mulss m7, m0     ; sum *= rdiv
+        addss m7, m1     ; sum += bias
+        addss m7, m3     ; sum += 0.5
+        cvttps2dq m7, m7    ; float -> int32 with truncation; only the low lane is stored below
+        packssdw m7, m7    ; int32 -> int16, signed saturation
+        packuswb m7, m7    ; int16 -> uint8, unsigned saturation
+        movd sumd, m7
+        mov [dstq + off16q], sumb    ; store one output byte
+        add off16q, 1
+        cmp off16q, widthq
+        jl .loopr
+
+    .loopr_end:
+    add dstq, dstrideq    ; next output row
+    add ystrideq, strideq    ; next input row offset
+    sub heightq, 1
+    jnz .less16
+
+.end:
+    RET
+%endif
diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c
index 5eb3b3bee1..da39b8a400 100644
--- a/libavfilter/x86/vf_convolution_init.c
+++ b/libavfilter/x86/vf_convolution_init.c
@@ -34,6 +34,11 @@ void ff_filter_row_sse4(uint8_t *dst, int width,
                         const uint8_t *c[], int peak, int radius,
                         int dstride, int stride);
 
+void ff_filter_column_sse4(uint8_t *dst, int height,
+                        float rdiv, float bias, const int *const matrix,
+                        const uint8_t *c[], int length, int radius,
+                        int dstride, int stride);
+
 av_cold void ff_convolution_init_x86(ConvolutionContext *s)
 {
 #if ARCH_X86_64
@@ -50,6 +55,10 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s)
                 if (EXTERNAL_SSE4(cpu_flags))
                     s->filter[i] = ff_filter_row_sse4;
         }
+        if (s->mode[i] == MATRIX_COLUMN) {
+                if (EXTERNAL_SSE4(cpu_flags))
+                    s->filter[i] = ff_filter_column_sse4;
+        }
     }
 #endif
 }