From patchwork Wed Nov 27 15:13:54 2019
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Xu Jun
X-Patchwork-Id: 16460
Return-Path:
X-Original-To: patchwork@ffaux-bg.ffmpeg.org
Delivered-To: patchwork@ffaux-bg.ffmpeg.org
Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100])
 by ffaux.localdomain (Postfix) with ESMTP id 5F54B44A600
 for ; Wed, 27 Nov 2019 17:14:21 +0200 (EET)
Received: from [127.0.1.1] (localhost [127.0.0.1])
 by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 37A3668B0E7;
 Wed, 27 Nov 2019 17:14:21 +0200 (EET)
X-Original-To: ffmpeg-devel@ffmpeg.org
Delivered-To: ffmpeg-devel@ffmpeg.org
Received: from smtp180.sjtu.edu.cn (smtp180.sjtu.edu.cn [202.120.2.180])
 by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 7798768B07B
 for ; Wed, 27 Nov 2019 17:14:13 +0200 (EET)
Received: from proxy01.sjtu.edu.cn (unknown [202.112.26.54])
 by smtp180.sjtu.edu.cn (Postfix) with ESMTPS id A46621008CBC3
 for ; Wed, 27 Nov 2019 23:14:07 +0800 (CST)
Received: from localhost (localhost [127.0.0.1])
 by proxy01.sjtu.edu.cn (Postfix) with ESMTP id 92482201AEBD5;
 Wed, 27 Nov 2019 23:14:07 +0800 (CST)
X-Virus-Scanned: amavisd-new at proxy01.sjtu.edu.cn
Received: from proxy01.sjtu.edu.cn ([127.0.0.1])
 by localhost (proxy01.sjtu.edu.cn [127.0.0.1]) (amavisd-new, port 10026)
 with ESMTP id 8PJ5tv0SiJ9U; Wed, 27 Nov 2019 23:14:07 +0800 (CST)
Received: from localhost.localdomain (unknown [59.78.63.241])
 (Authenticated sender: xujunzz@sjtu.edu.cn)
 by proxy01.sjtu.edu.cn (Postfix) with ESMTPSA id 44EE720426A73;
 Wed, 27 Nov 2019 23:14:05 +0800 (CST)
From: xujunzz@sjtu.edu.cn
To: ffmpeg-devel@ffmpeg.org
Date: Wed, 27 Nov 2019 23:13:54 +0800
Message-Id: <20191127151354.7726-1-xujunzz@sjtu.edu.cn>
X-Mailer: git-send-email 2.17.1
Subject: [FFmpeg-devel] [PATCH] avfilter/vf_convolution: add x86 SIMD for
 filter_column()
X-BeenThere: ffmpeg-devel@ffmpeg.org
X-Mailman-Version: 2.1.20
Precedence: list
List-Id: FFmpeg development discussions and patches
List-Unsubscribe: ,
List-Archive:
List-Post:
List-Help:
List-Subscribe: ,
Reply-To: FFmpeg development discussions and patches
Cc: xujunzz@sjtu.edu.cn
MIME-Version: 1.0
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel"

From: Xu Jun

Tested using a simple command:
./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 1000 -f null /dev/null

The fps increases from 284 to 693 on my local machine.

Signed-off-by: Xu Jun
---
 libavfilter/x86/vf_convolution.asm    | 129 ++++++++++++++++++++++++++
 libavfilter/x86/vf_convolution_init.c |   7 ++
 2 files changed, 136 insertions(+)

diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm
index b71e9720fb..49dfbab9c0 100755
--- a/libavfilter/x86/vf_convolution.asm
+++ b/libavfilter/x86/vf_convolution.asm
@@ -258,3 +258,132 @@ sub widthq, rq
 .end:
     RET
 %endif
+
+; void filter_column16(uint8_t *dst, int height,
+;                      float rdiv, float bias, const int *const matrix,
+;                      const uint8_t *c[], int length, int radius,
+;                      int dstride, int stride);
+
+%if ARCH_X86_64
+INIT_XMM sse4
+%if UNIX64
+cglobal filter_column16, 8, 15, 7, dst, height, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r
+%else
+cglobal filter_column16, 8, 15, 7, dst, height, rdiv, bias, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r
+%endif
+
+%if WIN64
+    SWAP m0, m2 ; move rdiv into m0, where the code below expects it
+    SWAP m1, m3 ; move bias into m1, where the code below expects it
+    mov r2q, matrixmp
+    mov r3q, ptrmp
+    mov r4q, widthmp
+    mov r5q, radmp
+    mov r6q, dstridemp
+    mov r7q, stridemp
+    DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r
+%endif
+
+movsxdifnidn widthq, widthd
+movsxdifnidn radq, radd
+movsxdifnidn dstrideq, dstrided
+movsxdifnidn strideq, strided
+sal radq, 1
+add radq, 1 ;2*radius+1 = number of taps, used as the inner loop bound
+movsxdifnidn heightq, heightd
+VBROADCASTSS m0, m0 ; rdiv in every lane
+VBROADCASTSS m1, m1 ; bias in every lane
+pxor m6, m6 ; m6 = 0, second operand of the unpacks below
+movss m5, [half]
+VBROADCASTSS m5, m5 ; rounding term in every lane
+
+xor dst_offq, dst_offq
+xor c_offq, c_offq
+
+.loopy: ; one iteration per row, counted down in heightq
+    xor off16q, off16q
+    cmp widthq, mmsize/4
+    jl .loopr ; row shorter than one vector step: scalar path only
+
+    mov rq, widthq
+    and rq, mmsize/4-1 ; r = leftover pixels handled by .loopr
+    sub widthq, rq ; width rounded down to a multiple of mmsize/4
+
+    .loop16: ;process mmsize/4 pixels per iteration (one dword lane each)
+        pxor m4, m4 ; per-lane integer accumulators
+        xor iq, iq
+        .loopi:
+            movss m2, [matrixq + 4*iq]
+            VBROADCASTSS m2, m2 ; matrix[i] in every lane
+            mov ciq, [ptrq + iq * gprsize]
+            movss m3, [ciq + c_offq] ;c[i][y*stride + off16]
+            punpcklbw m3, m6 ; bytes -> words
+            punpcklwd m3, m6 ; words -> dwords
+            pmulld m2, m3
+            paddd m4, m2
+
+            add iq, 1
+            cmp iq, radq
+            jl .loopi
+
+        cvtdq2ps m4, m4
+        mulps m4, m0 ; sum *= rdiv
+        addps m4, m1 ; sum += bias
+        addps m4, m5 ; sum += 0.5
+        cvttps2dq m4, m4
+        packssdw m4, m4 ; dwords -> words, signed saturation
+        packuswb m4, m4 ; words -> bytes, unsigned saturation
+        movss [dstq + dst_offq], m4 ; store the low 4 bytes
+        add c_offq, mmsize/4
+        add dst_offq, mmsize/4
+
+        add off16q, mmsize/4
+        cmp off16q, widthq
+        jl .loop16
+
+    add widthq, rq ; restore the full row width
+    cmp off16q, widthq
+    jge .paraend ; no leftover pixels in this row
+
+    .loopr: ; scalar path, one pixel per iteration
+        xor sumd, sumd
+        xor iq, iq
+        .loopr_i:
+            mov ciq, [ptrq + iq * gprsize]
+            movzx rd, byte [ciq + c_offq]
+            imul rd, [matrixq + 4*iq]
+            add sumd, rd
+
+            add iq, 1
+            cmp iq, radq
+            jl .loopr_i
+
+        pxor m4, m4
+        cvtsi2ss m4, sumd
+        mulss m4, m0 ; sum *= rdiv
+        addss m4, m1 ; sum += bias
+        addss m4, m5 ; sum += 0.5
+        cvttps2dq m4, m4
+        packssdw m4, m4
+        packuswb m4, m4
+        movd sumd, m4
+        mov [dstq + dst_offq], sumb ; store one byte
+        add c_offq, 1
+        add dst_offq, 1
+        add off16q, 1
+        cmp off16q, widthq
+        jl .loopr
+
+    .paraend:
+        sub c_offq, widthq ; rewind both offsets to the start of the row
+        sub dst_offq, widthq
+        add c_offq, strideq ; then step to the next row
+        add dst_offq, dstrideq
+
+        sub heightq, 1
+        cmp heightq, 0
+        jg .loopy
+
+.end:
+    RET
+%endif
diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c
index 6b1c2f0e9f..d9e93296b9 100644
--- a/libavfilter/x86/vf_convolution_init.c
+++ b/libavfilter/x86/vf_convolution_init.c
@@ -56,6 +56,11 @@ static void filter_column16(uint8_t *dst, int height,
 
 }
 
+void ff_filter_column16_sse4(uint8_t *dst, int height,
+                             float rdiv, float bias, const int *const matrix,
+                             const uint8_t *c[], int length, int radius,
+                             int dstride, int stride);
+
 av_cold void ff_convolution_init_x86(ConvolutionContext *s)
 {
 #if ARCH_X86_64
@@ -74,6 +79,8 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s)
         }
         if (s->mode[i] == MATRIX_COLUMN)
             s->filter[i] = filter_column16;
+        if (s->mode[i] == MATRIX_COLUMN && EXTERNAL_SSE4(cpu_flags))
+            s->filter[i] = ff_filter_column16_sse4;
     }
 #endif
 }