diff mbox

[FFmpeg-devel,v2,3/3] avfilter/vf_convolution: Add X86 SIMD optimizations for filter_column()

Message ID 20191222083703.3080-3-xujunzz@sjtu.edu.cn
State New
Headers show

Commit Message

Xu Jun Dec. 22, 2019, 8:37 a.m. UTC
From: Xu Jun <xujunzz@sjtu.edu.cn>

Performance improves about 10% compared to v1.

Tested using this command:
./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 5000 -f null /dev/null -benchmark

after patch:
frame= 4317 fps=600 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=  24x
video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=21.540s stime=2.091s rtime=7.197s

before patch:
frame= 4317 fps=263 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=10.5x
video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=74.377s stime=1.880s rtime=16.420s

Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
---
 libavfilter/x86/vf_convolution.asm    | 202 ++++++++++++++++++++++++++
 libavfilter/x86/vf_convolution_init.c |   9 ++
 2 files changed, 211 insertions(+)

Comments

chen Dec. 23, 2019, 4:59 a.m. UTC | #1
comments inlined
At 2019-12-22 16:37:03, xujunzz@sjtu.edu.cn wrote:
>From: Xu Jun <xujunzz@sjtu.edu.cn>
>
>Performance improves about 10% compared to v1.
>
>Tested using this command:
>./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 5000 -f null /dev/null -benchmark
>
>after patch:
>frame= 4317 fps=600 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=  24x
>video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
>bench: utime=21.540s stime=2.091s rtime=7.197s
>
>before patch:
>frame= 4317 fps=263 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=10.5x
>video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
>bench: utime=74.377s stime=1.880s rtime=16.420s
>
>Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
>---
> libavfilter/x86/vf_convolution.asm    | 202 ++++++++++++++++++++++++++
> libavfilter/x86/vf_convolution_init.c |   9 ++
> 2 files changed, 211 insertions(+)
>
>diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm
>index 2a09374b00..4c700656d6 100755
>--- a/libavfilter/x86/vf_convolution.asm
>+++ b/libavfilter/x86/vf_convolution.asm
>@@ -22,6 +22,8 @@
> 
> SECTION_RODATA
> half:   dd 0.5

>+shuf_init:   ddq 0x80808003808080028080800180808000
TBD
PS: constants defined as bytes (db) or words (dw) are more readable; in this case you use it with pshufb, so define it as bytes (db).


>+shuf_step: ddq 0x00000004000000040000000400000004
> 
> SECTION .text
> 
>@@ -285,3 +287,203 @@ sub widthq, rq
> .end:
>     RET
> %endif
>+
>+; void filter_column(uint8_t *dst, int height,
>+;                         float rdiv, float bias, const int *const matrix,
>+;                         const uint8_t *c[], int length, int radius,
>+;                         int dstride, int stride);
>+
>+%macro COMPUTE_4COL 1

>+    pshufb m7, m6, m4    ; get 4 uint8s from the 16 uint8s
Unnecessary; see the comment below.


>+    pmulld m7, m5

>+    paddd m1%1, m7
Not an error, but in general this accumulation creates a new dependency chain, which may stall the pipeline; I suggest accumulating into 4 registers in parallel.
In this case I am not sure about the dynamic range of the matrix, so I am not sure whether the sum stays in range or overflows when (2 * radius + 1) elements are accumulated.


>+%endmacro
>+
>+%macro CVT_PACK_COL 1
>+    cvtdq2ps m1%1, m1%1
>+    mulps m1%1, m0     ; sum *= rdiv
>+    addps m1%1, m1     ; sum += bias
>+    addps m1%1, m3     ; sum += 0.5
>+    cvttps2dq m1%1, m1%1
>+    packssdw m1%1, m1%1
>+    packuswb m1%1, m1%1
>+%endmacro
>+
>+%if ARCH_X86_64
>+INIT_XMM sse4
>+%if UNIX64
>+cglobal filter_column, 8, 14, 14, dst, height, matrix, ptr, width, rad, dstride, stride, \
>+i, ci, ystride, sum, r, off16
>+%else
>+cglobal filter_column, 8, 14, 14, dst, height, rdiv, bias, matrix, ptr, width, rad, dstride, stride, \
>+i, ci, ystride, sum, r, off16
>+%endif
>+
>+%if WIN64
>+    SWAP m0, m2
>+    SWAP m1, m3
>+    mov r2q, matrixmp
>+    mov r3q, ptrmp
>+    mov r4q, widthmp
>+    mov r5q, radmp
>+    mov r6q, dstridemp
>+    mov r7q, stridemp
>+    DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, \
>+    i, ci, ystride, sum, r, off16
>+%endif
>+
>+movsxdifnidn widthq, widthd
>+movsxdifnidn radq, radd
>+lea radq, [radq * 2 + 1]
>+movsxdifnidn dstrideq, dstrided
>+movsxdifnidn strideq, strided
>+movsxdifnidn heightq, heightd
>+
>+VBROADCASTSS m0, m0    ; rdiv
>+VBROADCASTSS m1, m1    ; bias
>+pxor m2, m2    ; zero
>+movss m3, [half]
>+VBROADCASTSS m3, m3    ; 0.5

>+movdqu m8, [shuf_init]      ; shuffle initialization
TBD


>+movdqu m9, [shuf_step]    ; shuffle step
>+
>+xor ystrideq, ystrideq    ; y*stride
>+
>+cmp widthq, mmsize    ;if width<16 run loopr, width=16 run 16 parallel
>+jl .less16
>+
>+.equal16:
>+    pxor m10, m10
>+    pxor m11, m11
>+    pxor m12, m12
>+    pxor m13, m13

>+    ; m10-13 hold sums
Not an error; however, using m0-m7 saves a 1-byte instruction prefix, which gives a small performance improvement in the inner loop.


>+
>+    lea iq, [radq - 1]
>+    .loopi:
>+        movd m5, [matrixq + 4*iq]    ; matrix[i]

>+        VBROADCASTSS m5, m5
Since you target SSE4, PSHUFD may be better; however, this is not a problem if you want to upgrade to AVX and above.


>+        mov ciq, [ptrq + iq * gprsize]

>+        movdqu m6, [ciq + ystrideq]    ; c[i][y*stride] 16 uint8s
SSE4 provides PMOVZXBD, which lets you drop the PSHUFB above and the series of constant loads.


>+
>+        ;m4 controls shuffle
>+        movdqa m4, m8
>+        COMPUTE_4COL 0    ; process 0-3 cols, sum in m10
>+        paddd m4, m9
>+        COMPUTE_4COL 1    ; process 4-7 cols, sum in m11
>+        paddd m4, m9
>+        COMPUTE_4COL 2    ; process 8-11 cols, sum in m12
>+        paddd m4, m9
>+        COMPUTE_4COL 3    ; process 12-15 cols, sum in m13
>+
>+        sub iq, 1
>+        jns .loopi
>+
>+    CVT_PACK_COL 0    ; process 0-3 cols, result in m10's low 32bit
>+    CVT_PACK_COL 1    ; process 4-7 cols, result in m11's low 32bit
>+    CVT_PACK_COL 2    ; process 8-11 cols, result in m12's low 32bit
>+    CVT_PACK_COL 3    ; process 12-15 cols, result in m13's low 32bit
>+    punpckldq m10, m11
>+    punpckldq m12, m13
>+    punpcklqdq m10, m12    ; pack 16 results in m10
>+    movdqu [dstq], m10
>+
>+    add dstq, dstrideq
>+    add ystrideq, strideq
>+    sub heightq, 1
>+    jnz .equal16
>+    jmp .end
>+
>+.less16:
>+    xor off16q, off16q
>+    cmp widthq, mmsize/4
>+    jl .loopr
>+
>+    mov   rq, widthq
>+    and   rq, mmsize/4-1
>+    sub   widthq, rq
>+
>+    pxor m10, m10
>+    pxor m11, m11
>+    pxor m12, m12
>+
>+    lea iq, [radq - 1]
>+    .loopi_4:
>+        movd m5, [matrixq + 4*iq]    ; matrix[i]
>+        VBROADCASTSS m5, m5
>+        mov ciq, [ptrq + iq * gprsize]
>+        movdqu m6, [ciq + ystrideq]    ; c[i][y*stride] 16 uint8s
>+
>+        ;m4 controls shuffle
>+        movdqa m4, m8
>+        COMPUTE_4COL 0    ; process 0-3 cols, sum in m10
>+        cmp widthq, mmsize/4 ; width = 4
>+        je .i4_end
>+
>+        paddd m4, m9
>+        COMPUTE_4COL 1    ; process 4-7 cols, sum in m11
>+        cmp widthq, mmsize/2 ; width = 8
>+        je .i4_end
>+
>+        paddd m4, m9
>+        COMPUTE_4COL 2    ; process 8-11 cols, sum in m12
>+
>+        .i4_end:
>+        sub iq, 1
>+        jns .loopi_4
>+
>+    CVT_PACK_COL 0    ; process 0-3 cols, result in m10's low 32bit
>+    movd [dstq], m10
>+    cmp widthq, mmsize/4 ; width = 4
>+    je .cvt_end
>+
>+    CVT_PACK_COL 1    ; process 4-7 cols, result in m11's low 32bit
>+    movd [dstq + mmsize/4], m11
>+    cmp widthq, mmsize/2 ; width = 8
>+    je .cvt_end
>+
>+    CVT_PACK_COL 2    ; process 8-11 cols, result in m12's low 32bit
>+    movd [dstq + mmsize/2], m12
>+
>+    .cvt_end:
>+    cmp rq, 0
>+    je .loopr_end
>+    mov off16q, widthq
>+    add widthq, rq
>+
>+    .loopr:
>+        xor sumq, sumq
>+        lea iq, [radq - 1]
>+        .loopr_i:
>+            mov ciq, [ptrq + iq * gprsize]
>+            add ciq, ystrideq
>+            movzx rd, byte [ciq + off16q]
>+            imul rd, [matrixq + 4*iq]
>+            add sumd, rd
>+
>+            sub iq, 1
>+            jns .loopr_i
>+
>+        pxor m7, m7
>+        cvtsi2ss m7, sumd
>+        mulss m7, m0     ; sum *= rdiv
>+        addss m7, m1     ; sum += bias
>+        addss m7, m3     ; sum += 0.5
>+        cvttps2dq m7, m7
>+        packssdw m7, m7
>+        packuswb m7, m7
>+        movd sumd, m7

>+        mov [dstq + off16q], sumb
SSE4 provides PEXTRB.


>+        add off16q, 1
>+        cmp off16q, widthq
>+        jl .loopr
>+
>+    .loopr_end:
>+    add dstq, dstrideq
>+    add ystrideq, strideq
>+    sub heightq, 1

>+    jnz .less16
JNZ is not a problem, but I would prefer JG; it avoids the risk if the value goes negative.
Moritz Barsnick Jan. 1, 2020, 10:45 p.m. UTC | #2
On Sun, Dec 22, 2019 at 16:37:03 +0800, xujunzz@sjtu.edu.cn wrote:
> +        if (s->mode[i] == MATRIX_COLUMN) {
> +                if (EXTERNAL_SSE4(cpu_flags))
> +                    s->filter[i] = ff_filter_column_sse4;
> +        }

Incorrect indentation.

Moritz
Paul B Mahol Jan. 18, 2020, 5:12 p.m. UTC | #3
What is the status of this?

On 12/22/19, xujunzz@sjtu.edu.cn <xujunzz@sjtu.edu.cn> wrote:
> From: Xu Jun <xujunzz@sjtu.edu.cn>
>
> Performance improves about 10% compared to v1.
>
> Tested using this command:
> ./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4
> 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8
> 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 5000
> -f null /dev/null -benchmark
>
> after patch:
> frame= 4317 fps=600 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=
> 24x
> video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB
> muxing overhead: unknown
> bench: utime=21.540s stime=2.091s rtime=7.197s
>
> before patch:
> frame= 4317 fps=263 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A
> speed=10.5x
> video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB
> muxing overhead: unknown
> bench: utime=74.377s stime=1.880s rtime=16.420s
>
> Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
> ---
>  libavfilter/x86/vf_convolution.asm    | 202 ++++++++++++++++++++++++++
>  libavfilter/x86/vf_convolution_init.c |   9 ++
>  2 files changed, 211 insertions(+)
diff mbox

Patch

diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm
index 2a09374b00..4c700656d6 100755
--- a/libavfilter/x86/vf_convolution.asm
+++ b/libavfilter/x86/vf_convolution.asm
@@ -22,6 +22,8 @@ 
 
 SECTION_RODATA
 half:   dd 0.5
+shuf_init:   ddq 0x80808003808080028080800180808000
+shuf_step: ddq 0x00000004000000040000000400000004
 
 SECTION .text
 
@@ -285,3 +287,203 @@  sub widthq, rq
 .end:
     RET
 %endif
+
+; void filter_column(uint8_t *dst, int height,
+;                         float rdiv, float bias, const int *const matrix,
+;                         const uint8_t *c[], int length, int radius,
+;                         int dstride, int stride);
+
+%macro COMPUTE_4COL 1    ; %1 = accumulator digit: the sums are added into m1%1 (m10..m13)
+    pshufb m7, m6, m4    ; pick 4 uint8s out of the 16 in m6 via mask m4; 0x80 mask bytes zero the rest, giving 4 zero-extended dwords
+    pmulld m7, m5    ; multiply each dword by the coefficient the caller placed in m5
+    paddd m1%1, m7    ; accumulate into m1%1; clobbers m7
+%endmacro
+
+%macro CVT_PACK_COL 1    ; %1 = accumulator digit: m1%1 is converted in place (expects m0 = rdiv, m1 = bias, m3 = 0.5)
+    cvtdq2ps m1%1, m1%1    ; 4 int32 sums -> 4 floats
+    mulps m1%1, m0     ; sum *= rdiv
+    addps m1%1, m1     ; sum += bias
+    addps m1%1, m3     ; sum += 0.5
+    cvttps2dq m1%1, m1%1    ; float -> int32 with truncation
+    packssdw m1%1, m1%1    ; int32 -> int16 with signed saturation
+    packuswb m1%1, m1%1    ; int16 -> uint8 with unsigned saturation; the 4 result bytes end up in the low 32 bits
+%endmacro
+
+%if ARCH_X86_64
+INIT_XMM sse4
+%if UNIX64
+cglobal filter_column, 8, 14, 14, dst, height, matrix, ptr, width, rad, dstride, stride, \
+i, ci, ystride, sum, r, off16
+%else
+cglobal filter_column, 8, 14, 14, dst, height, rdiv, bias, matrix, ptr, width, rad, dstride, stride, \
+i, ci, ystride, sum, r, off16
+%endif
+
+%if WIN64
+    SWAP m0, m2    ; rdiv is the 3rd argument, so on WIN64 it is in xmm2; make it m0 like on UNIX64
+    SWAP m1, m3    ; bias is the 4th argument, so on WIN64 it is in xmm3; make it m1 like on UNIX64
+    mov r2q, matrixmp    ; move the integer arguments into r2..r7 so both ABIs share one register layout
+    mov r3q, ptrmp
+    mov r4q, widthmp
+    mov r5q, radmp
+    mov r6q, dstridemp
+    mov r7q, stridemp
+    DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, \
+    i, ci, ystride, sum, r, off16
+%endif
+
+movsxdifnidn widthq, widthd    ; NOTE(review): x86inc helper, presumably sign-extends the int argument to 64 bits - confirm
+movsxdifnidn radq, radd
+lea radq, [radq * 2 + 1]    ; radq = 2 * radius + 1 = number of matrix[] / c[] entries walked by the i loops
+movsxdifnidn dstrideq, dstrided
+movsxdifnidn strideq, strided
+movsxdifnidn heightq, heightd
+
+VBROADCASTSS m0, m0    ; rdiv in every lane
+VBROADCASTSS m1, m1    ; bias in every lane
+pxor m2, m2    ; zero; NOTE(review): m2 is not read again anywhere in this function
+movss m3, [half]
+VBROADCASTSS m3, m3    ; 0.5 in every lane
+movdqu m8, [shuf_init]      ; initial pshufb mask: source bytes 0..3, one per dword
+movdqu m9, [shuf_step]    ; added to the mask to advance it by 4 source bytes
+
+xor ystrideq, ystrideq    ; y*stride, starts at row 0
+
+cmp widthq, mmsize    ; width < 16: go to .less16; otherwise .equal16 handles exactly 16 columns per row
+jl .less16
+
+.equal16:    ; per-row loop, 16 columns at a time
+    pxor m10, m10
+    pxor m11, m11
+    pxor m12, m12
+    pxor m13, m13
+    ; m10-13 hold the int32 sums for columns 0-3, 4-7, 8-11, 12-15 (cleared above for each row)
+
+    lea iq, [radq - 1]    ; i = last index; the loop counts down to 0
+    .loopi:
+        movd m5, [matrixq + 4*iq]    ; matrix[i]
+        VBROADCASTSS m5, m5    ; matrix[i] in every dword lane
+        mov ciq, [ptrq + iq * gprsize]    ; ci = c[i]
+        movdqu m6, [ciq + ystrideq]    ; c[i][y*stride] 16 uint8s
+
+        ; m4 is the pshufb mask consumed by COMPUTE_4COL; adding m9 moves it to the next 4 bytes
+        movdqa m4, m8
+        COMPUTE_4COL 0    ; process 0-3 cols, sum in m10
+        paddd m4, m9
+        COMPUTE_4COL 1    ; process 4-7 cols, sum in m11
+        paddd m4, m9
+        COMPUTE_4COL 2    ; process 8-11 cols, sum in m12
+        paddd m4, m9
+        COMPUTE_4COL 3    ; process 12-15 cols, sum in m13
+
+        sub iq, 1
+        jns .loopi    ; continue while i >= 0
+
+    CVT_PACK_COL 0    ; process 0-3 cols, result in m10's low 32bit
+    CVT_PACK_COL 1    ; process 4-7 cols, result in m11's low 32bit
+    CVT_PACK_COL 2    ; process 8-11 cols, result in m12's low 32bit
+    CVT_PACK_COL 3    ; process 12-15 cols, result in m13's low 32bit
+    punpckldq m10, m11    ; low 64 bits of m10 = results for cols 0-7
+    punpckldq m12, m13    ; low 64 bits of m12 = results for cols 8-15
+    punpcklqdq m10, m12    ; pack 16 results in m10
+    movdqu [dstq], m10    ; store 16 output bytes for this row
+
+    add dstq, dstrideq    ; next output row
+    add ystrideq, strideq    ; next input row offset
+    sub heightq, 1
+    jnz .equal16
+    jmp .end
+
+.less16:    ; per-row loop for width < 16: groups of 4 columns with SIMD, leftover columns in .loopr
+    xor off16q, off16q    ; off16 = 0, column index used by the scalar loop
+    cmp widthq, mmsize/4
+    jl .loopr    ; width < 4: every column is handled by the scalar loop
+
+    mov   rq, widthq
+    and   rq, mmsize/4-1    ; r = width % 4, columns left for the scalar loop
+    sub   widthq, rq    ; width rounded down to a multiple of 4 (r is added back after .cvt_end)
+
+    pxor m10, m10
+    pxor m11, m11
+    pxor m12, m12
+
+    lea iq, [radq - 1]    ; i = last index; the loop counts down to 0
+    .loopi_4:
+        movd m5, [matrixq + 4*iq]    ; matrix[i]
+        VBROADCASTSS m5, m5    ; matrix[i] in every dword lane
+        mov ciq, [ptrq + iq * gprsize]    ; ci = c[i]
+        movdqu m6, [ciq + ystrideq]    ; c[i][y*stride] 16 uint8s; NOTE(review): loads 16 bytes although width < 16 on this path
+
+        ; m4 is the pshufb mask consumed by COMPUTE_4COL; adding m9 moves it to the next 4 bytes
+        movdqa m4, m8
+        COMPUTE_4COL 0    ; process 0-3 cols, sum in m10
+        cmp widthq, mmsize/4 ; width = 4
+        je .i4_end
+
+        paddd m4, m9
+        COMPUTE_4COL 1    ; process 4-7 cols, sum in m11
+        cmp widthq, mmsize/2 ; width = 8
+        je .i4_end
+
+        paddd m4, m9
+        COMPUTE_4COL 2    ; process 8-11 cols, sum in m12
+
+        .i4_end:
+        sub iq, 1
+        jns .loopi_4    ; continue while i >= 0
+
+    CVT_PACK_COL 0    ; process 0-3 cols, result in m10's low 32bit
+    movd [dstq], m10    ; store output bytes 0-3
+    cmp widthq, mmsize/4 ; width = 4
+    je .cvt_end
+
+    CVT_PACK_COL 1    ; process 4-7 cols, result in m11's low 32bit
+    movd [dstq + mmsize/4], m11    ; store output bytes 4-7
+    cmp widthq, mmsize/2 ; width = 8
+    je .cvt_end
+
+    CVT_PACK_COL 2    ; process 8-11 cols, result in m12's low 32bit
+    movd [dstq + mmsize/2], m12    ; store output bytes 8-11
+
+    .cvt_end:
+    cmp rq, 0
+    je .loopr_end    ; no leftover columns on this row
+    mov off16q, widthq    ; the scalar loop starts at the first column the SIMD part did not cover
+    add widthq, rq    ; restore the full width
+
+    .loopr:    ; scalar loop: one output column per iteration
+        xor sumq, sumq    ; sum = 0
+        lea iq, [radq - 1]    ; i = last index; the loop counts down to 0
+        .loopr_i:
+            mov ciq, [ptrq + iq * gprsize]    ; ci = c[i]
+            add ciq, ystrideq
+            movzx rd, byte [ciq + off16q]    ; r = c[i][y*stride + off16]
+            imul rd, [matrixq + 4*iq]    ; r *= matrix[i]
+            add sumd, rd
+
+            sub iq, 1
+            jns .loopr_i    ; continue while i >= 0
+
+        pxor m7, m7
+        cvtsi2ss m7, sumd    ; int32 sum -> float in the low lane
+        mulss m7, m0     ; sum *= rdiv
+        addss m7, m1     ; sum += bias
+        addss m7, m3     ; sum += 0.5
+        cvttps2dq m7, m7    ; float -> int32 with truncation
+        packssdw m7, m7    ; int32 -> int16 with signed saturation
+        packuswb m7, m7    ; int16 -> uint8 with unsigned saturation
+        movd sumd, m7
+        mov [dstq + off16q], sumb    ; store one output byte
+        add off16q, 1
+        cmp off16q, widthq
+        jl .loopr    ; continue while off16 < width
+
+    .loopr_end:
+    add dstq, dstrideq    ; next output row
+    add ystrideq, strideq    ; next input row offset
+    sub heightq, 1
+    jnz .less16
+
+.end:
+    RET
+%endif
diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c
index 5eb3b3bee1..da39b8a400 100644
--- a/libavfilter/x86/vf_convolution_init.c
+++ b/libavfilter/x86/vf_convolution_init.c
@@ -34,6 +34,11 @@  void ff_filter_row_sse4(uint8_t *dst, int width,
                         const uint8_t *c[], int peak, int radius,
                         int dstride, int stride);
 
+void ff_filter_column_sse4(uint8_t *dst, int height,
+                        float rdiv, float bias, const int *const matrix,
+                        const uint8_t *c[], int length, int radius,
+                        int dstride, int stride);
+
 av_cold void ff_convolution_init_x86(ConvolutionContext *s)
 {
 #if ARCH_X86_64
@@ -50,6 +55,10 @@  av_cold void ff_convolution_init_x86(ConvolutionContext *s)
                 if (EXTERNAL_SSE4(cpu_flags))
                     s->filter[i] = ff_filter_row_sse4;
         }
+        if (s->mode[i] == MATRIX_COLUMN) {
+                if (EXTERNAL_SSE4(cpu_flags))
+                    s->filter[i] = ff_filter_column_sse4;
+        }
     }
 #endif
 }