diff mbox

[FFmpeg-devel,v2,2/3] avfilter/vf_convolution: Add x86 SIMD optimizations for filter_row()

Message ID 20191222083703.3080-2-xujunzz@sjtu.edu.cn
State New
Headers show

Commit Message

xujunzz@sjtu.edu.cn Dec. 22, 2019, 8:37 a.m. UTC
From: Xu Jun <xujunzz@sjtu.edu.cn>

Read 16 elements from memory, shuffle and compute 4 rows at a time in parallel, then shuffle and write 16 results to memory in parallel.
Performance improves about 15% compared to v1.

Tested using this command:
./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:row:row:row:row" -an -vframes 5000 -f null /dev/null -benchmark

after patch:
frame= 4317 fps=622 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=24.9x
video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=20.539s stime=1.834s rtime=6.943s

before patch(c version):
frame= 4317 fps=306 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=12.2x
video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=60.591s stime=1.787s rtime=14.100s

Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
---
 libavfilter/x86/vf_convolution.asm    | 131 ++++++++++++++++++++++++++
 libavfilter/x86/vf_convolution_init.c |   9 ++
 2 files changed, 140 insertions(+)
 mode change 100644 => 100755 libavfilter/x86/vf_convolution.asm

Comments

Andriy Gelman Dec. 22, 2019, 4:50 p.m. UTC | #1
Xu, 

On Sun, 22. Dec 16:37, xujunzz@sjtu.edu.cn wrote:
> From: Xu Jun <xujunzz@sjtu.edu.cn>
> 
> Read 16 elements from memory, shuffle and parallally compute 4 rows at a time, shuffle and parallelly write 16 results to memory.
> Performance improves about 15% compared to v1.
> 
> Tested using this command:
> ./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:row:row:row:row" -an -vframes 5000 -f null /dev/null -benchmark
> 
> after patch:
> frame= 4317 fps=622 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=24.9x
> video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
> bench: utime=20.539s stime=1.834s rtime=6.943s
> 
> before patch(c version):
> frame= 4317 fps=306 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=12.2x
> video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
> bench: utime=60.591s stime=1.787s rtime=14.100s
> 
> Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
> ---
>  libavfilter/x86/vf_convolution.asm    | 131 ++++++++++++++++++++++++++
>  libavfilter/x86/vf_convolution_init.c |   9 ++
>  2 files changed, 140 insertions(+)
>  mode change 100644 => 100755 libavfilter/x86/vf_convolution.asm
> 
> diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm
> old mode 100644
> new mode 100755
> index 754d4d1064..2a09374b00
> --- a/libavfilter/x86/vf_convolution.asm
> +++ b/libavfilter/x86/vf_convolution.asm
> @@ -154,3 +154,134 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c
>  INIT_XMM sse4
>  FILTER_3X3
>  %endif
> +

Patches 2-3 are failing to build:
https://unofficial.patchwork-ffmpeg.org/project/FFmpeg/list/?series=26
xujunzz@sjtu.edu.cn Dec. 30, 2019, 2:34 a.m. UTC | #2
Hi, Andriy

----- Original Message -----
> From: "Andriy Gelman" <andriy.gelman@gmail.com>
> To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
> Cc: xujunzz@sjtu.edu.cn
> Sent: Monday, December 23, 2019 12:50:48 AM
> Subject: Re: [FFmpeg-devel] [PATCH v2 2/3] avfilter/vf_convolution: Add x86 SIMD optimizations for filter_row()

> Xu,
> 
> On Sun, 22. Dec 16:37, xujunzz@sjtu.edu.cn wrote:
>> From: Xu Jun <xujunzz@sjtu.edu.cn>
>> 
>> Read 16 elements from memory, shuffle and parallally compute 4 rows at a time,
>> shuffle and parallelly write 16 results to memory.
>> Performance improves about 15% compared to v1.
>> 
>> Tested using this command:
>> ./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6
>> 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8
>> 9:1/45:1/45:1/45:1/45:1:2:3:4:row:row:row:row" -an -vframes 5000 -f null
>> /dev/null -benchmark
>> 
>> after patch:
>> frame= 4317 fps=622 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=24.9x
>> video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
>> overhead: unknown
>> bench: utime=20.539s stime=1.834s rtime=6.943s
>> 
>> before patch(c version):
>> frame= 4317 fps=306 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=12.2x
>> video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
>> overhead: unknown
>> bench: utime=60.591s stime=1.787s rtime=14.100s
>> 
>> Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
>> ---
>>  libavfilter/x86/vf_convolution.asm    | 131 ++++++++++++++++++++++++++
>>  libavfilter/x86/vf_convolution_init.c |   9 ++
>>  2 files changed, 140 insertions(+)
>>  mode change 100644 => 100755 libavfilter/x86/vf_convolution.asm
>> 
>> diff --git a/libavfilter/x86/vf_convolution.asm
>> b/libavfilter/x86/vf_convolution.asm
>> old mode 100644
>> new mode 100755
>> index 754d4d1064..2a09374b00
>> --- a/libavfilter/x86/vf_convolution.asm
>> +++ b/libavfilter/x86/vf_convolution.asm
>> @@ -154,3 +154,134 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias,
>> matrix, ptr, c0, c1, c2, c
>>  INIT_XMM sse4
>>  FILTER_3X3
>>  %endif
>> +
> 
> Patch 2-3 are failing to build:
> https://unofficial.patchwork-ffmpeg.org/project/FFmpeg/list/?series=26
> 
> --
> Andriy

I'm sorry, I didn't build the patches independently. There seem to be some problems with the dependencies between the patches.
I'll fix them in v3.

Xu Jun
diff mbox

Patch

diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm
old mode 100644
new mode 100755
index 754d4d1064..2a09374b00
--- a/libavfilter/x86/vf_convolution.asm
+++ b/libavfilter/x86/vf_convolution.asm
@@ -154,3 +154,134 @@  cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c
 INIT_XMM sse4
 FILTER_3X3
 %endif
+
+; void filter_row_sse4(uint8_t *dst, int width,
+;                      float rdiv, float bias, const int *const matrix,
+;                      const uint8_t *c[], int peak, int radius,
+;                      int dstride, int stride)
+
+%macro COMPUTE_4ROW 1
+pshufb m7, m6, m4 ; m7 = bytes of m6 (the 16 loaded uint8s) selected by shuffle control m4; presumably 4 bytes zero-extended to dwords -- confirm against shuf_init/shuf_step
+pmulld m7, m5 ; multiply each dword by m5 (the caller broadcasts matrix[i] into it)
+paddd m1%1, m7 ; accumulate into the sum register picked by the macro argument (0..3 -> m10..m13)
+%endmacro
+
+%macro CVT_PACK_ROW 1
+cvtdq2ps m1%1, m1%1 ; 4 int32 sums -> 4 floats
+mulps m1%1, m0 ; sum *= rdiv
+addps m1%1, m1 ; sum += bias
+addps m1%1, m3 ; sum += 0.5, so the truncation below rounds non-negative values to nearest
+cvttps2dq m1%1, m1%1 ; floats -> int32 with truncation
+packssdw m1%1, m1%1 ; int32 -> int16 with signed saturation
+packuswb m1%1, m1%1 ; int16 -> uint8 with unsigned saturation; the low 32 bits now hold the 4 results
+%endmacro
+
+%if ARCH_X86_64
+INIT_XMM sse4
+%if UNIX64
+cglobal filter_row, 6, 10, 14, dst, width, matrix, ptr, mult, rad, r, x, i, ci
+%else
+cglobal filter_row, 4, 10, 14, dst, width, rdiv, bias, matrix, ptr, mult, rad, r, x, i, ci
+%endif
+
+%if WIN64
+    SWAP m0, m2 ; rdiv is the 3rd argument (xmm2 on Win64): rename so that m0 = rdiv
+    SWAP m1, m3 ; bias is the 4th argument (xmm3 on Win64): rename so that m1 = bias
+    mov r2q, matrixmp ; 5th argument, not in a register on Win64
+    mov r3q, ptrmp ; 6th argument (c[] in the C prototype)
+    mov r5q, radmp ; 8th argument (radius in the C prototype)
+    DEFINE_ARGS dst, width, matrix, ptr, mult, rad, r, x, i, ci
+%endif
+
+movsxdifnidn widthq, widthd ; width is a C int: sign-extend it to the full register
+movsxdifnidn radq, radd ; same for radius
+lea radq, [radq * 2 + 1] ; radq = 2 * radius + 1, the number of matrix[]/c[] entries iterated below
+VBROADCASTSS m0, m0 ; rdiv in every lane
+VBROADCASTSS m1, m1 ; bias in every lane
+pxor m2, m2 ; zero -- NOTE(review): m2 is not read again anywhere in this function
+movss m3, [half] ; NOTE(review): half is not defined in the visible part of this patch -- confirm
+VBROADCASTSS m3, m3 ; 0.5
+movdqu m8, [shuf_init] ; shuffle initialization -- NOTE(review): shuf_init is not defined in the visible part of this patch
+movdqu m9, [shuf_step] ; shuffle step -- NOTE(review): shuf_step is not defined in the visible part of this patch
+
+xor xq, xq ; x = 0, byte offset into the row
+cmp widthq, mmsize
+jl .less16 ; fewer than mmsize pixels: use the scalar loop only
+
+mov rq, widthq
+and rq, mmsize-1 ; r = width % mmsize, the leftover pixels
+sub widthq, rq ; width rounded down to a multiple of mmsize
+
+.equal16:
+    pxor m10, m10
+    pxor m11, m11
+    pxor m12, m12
+    pxor m13, m13
+    ; m10-13 hold the int32 sums for the 16 outputs of this iteration
+
+   lea iq, [radq - 1] ; i = index of the last entry, counts down to 0
+    .loopi:
+        movd m5, [matrixq + 4*iq] ; matrix[i]
+        VBROADCASTSS m5, m5 ; matrix[i] in every dword lane
+        mov ciq, [ptrq + iq * gprsize] ; ci = c[i]
+        movdqu m6, [ciq + xq] ; 16 uint8s starting at c[i][x]
+
+        ; m4 controls the shuffle: restart from shuf_init for every i
+        movdqa m4, m8
+        COMPUTE_4ROW 0 ; outputs x+0..x+3, sum in m10
+        paddd m4, m9 ; advance the shuffle control by shuf_step
+        COMPUTE_4ROW 1 ; outputs x+4..x+7, sum in m11
+        paddd m4, m9
+        COMPUTE_4ROW 2 ; outputs x+8..x+11, sum in m12
+        paddd m4, m9
+        COMPUTE_4ROW 3 ; outputs x+12..x+15, sum in m13
+
+        sub iq, 1
+        jns .loopi ; repeat while i >= 0
+
+    CVT_PACK_ROW 0 ; outputs x+0..x+3, result in m10's low 32 bits
+    CVT_PACK_ROW 1 ; outputs x+4..x+7, result in m11's low 32 bits
+    CVT_PACK_ROW 2 ; outputs x+8..x+11, result in m12's low 32 bits
+    CVT_PACK_ROW 3 ; outputs x+12..x+15, result in m13's low 32 bits
+    punpckldq m10, m11 ; low 64 bits of m10 = results 0-7
+    punpckldq m12, m13 ; low 64 bits of m12 = results 8-15
+    punpcklqdq m10, m12 ; pack 16 results in m10
+    movdqu [dstq+xq], m10 ; store dst[x .. x+15]
+
+    add xq, mmsize
+    cmp xq, widthq
+    jl .equal16 ; more full vectors left
+
+    add widthq, rq ; restore the original width
+    cmp xq, widthq
+    jge .end ; no leftover pixels
+
+.less16:
+    xor rd, rd ; sum = 0 (r is reused; the remainder it held is no longer needed)
+    lea iq, [radq - 1] ; i = index of the last entry, counts down to 0
+    .loopr_i:
+        mov ciq, [ptrq + iq * gprsize] ; ci = c[i]
+        movzx multd, byte [ciq + xq] ; c[i][x]; mult (the peak argument slot) is only used as scratch
+        imul multd, [matrixq + 4*iq] ; c[i][x] * matrix[i]
+        add rd, multd ; sum += c[i][x] * matrix[i]
+
+        sub iq, 1
+        jns .loopr_i ; repeat while i >= 0
+
+    pxor m7, m7 ; clear all lanes before the scalar conversion
+    cvtsi2ss m7, rd ; sum as float in the low lane
+    mulss m7, m0 ; sum *= rdiv
+    addss m7, m1 ; sum += bias
+    addss m7, m3 ; sum += 0.5
+    cvttps2dq m7, m7 ; float -> int32 with truncation
+    packssdw m7, m7 ; int32 -> int16 with signed saturation
+    packuswb m7, m7 ; int16 -> uint8 with unsigned saturation
+    movd rd, m7
+    mov [dstq + xq], rb ; dst[x] = low byte of the result
+
+    add xq, 1
+    cmp xq, widthq
+    jl .less16 ; NOTE(review): the test is at the bottom, so one byte is written even if this loop is entered with width <= 0
+.end:
+    RET
+%endif
diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c
index 51432406ed..5eb3b3bee1 100644
--- a/libavfilter/x86/vf_convolution_init.c
+++ b/libavfilter/x86/vf_convolution_init.c
@@ -29,6 +29,11 @@  void ff_filter_3x3_sse4(uint8_t *dst, int width,
                         const uint8_t *c[], int peak, int radius,
                         int dstride, int stride);
 
+void ff_filter_row_sse4(uint8_t *dst, int width,
+                        float rdiv, float bias, const int *const matrix,
+                        const uint8_t *c[], int peak, int radius,
+                        int dstride, int stride); /* installed by ff_convolution_init_x86() for MATRIX_ROW when SSE4 is available */
+
 av_cold void ff_convolution_init_x86(ConvolutionContext *s)
 {
 #if ARCH_X86_64
@@ -41,6 +46,10 @@  av_cold void ff_convolution_init_x86(ConvolutionContext *s)
                     s->filter[i] = ff_filter_3x3_sse4;
             }
         }
+        if (s->mode[i] == MATRIX_ROW) {
+                if (EXTERNAL_SSE4(cpu_flags))
+                    s->filter[i] = ff_filter_row_sse4;
+        }
     }
 #endif
 }