diff mbox

[FFmpeg-devel,3/3] avfilter/vf_convolution: add X86 SIMD for filter_column()

Message ID 20191203075207.26243-3-xujunzz@sjtu.edu.cn
State New
Headers show

Commit Message

Xu Jun Dec. 3, 2019, 7:52 a.m. UTC
From: Xu Jun <xujunzz@sjtu.edu.cn>

Tested using this command:
./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 5000 -f null /dev/null -benchmark

after patch:
frame= 4317 fps=464 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=18.6x
video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=30.556s stime=2.361s rtime=9.307s
bench: maxrss=15156kB

before patch:
frame= 4317 fps=271 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=10.8x
video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=76.097s stime=1.676s rtime=15.929s
bench: maxrss=15160kB

Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
---
 libavfilter/x86/vf_convolution.asm    | 129 ++++++++++++++++++++++++++
 libavfilter/x86/vf_convolution_init.c |   9 ++
 2 files changed, 138 insertions(+)

Comments

chen Dec. 3, 2019, 8:59 a.m. UTC | #1
comments inline in code


At 2019-12-03 15:52:07, xujunzz@sjtu.edu.cn wrote:
>From: Xu Jun <xujunzz@sjtu.edu.cn>
>
>+; void filter_column(uint8_t *dst, int height,
>+;                         float rdiv, float bias, const int *const matrix,
>+;                         const uint8_t *c[], int length, int radius,
>+;                         int dstride, int stride);
>+
>+%if ARCH_X86_64
>+INIT_XMM sse4
>+%if UNIX64
>+cglobal filter_column, 8, 15, 7, dst, height, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r
>+%else
>+cglobal filter_column, 8, 15, 7, dst, height, rdiv, bias, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r

>+%endif
no idea, these are difficult to read and understand




>+
>+%if WIN64
>+    SWAP m0, m2
>+    SWAP m1, m3
>+    mov r2q, matrixmp
>+    mov r3q, ptrmp
>+    mov r4q, widthmp
>+    mov r5q, radmp
>+    mov r6q, dstridemp
>+    mov r7q, stridemp
>+    DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r
>+%endif
>+
>+movsxdifnidn widthq, widthd
>+movsxdifnidn radq, radd
>+movsxdifnidn dstrideq, dstrided
>+movsxdifnidn strideq, strided
>+sal radq, 1

>+add radq, 1     ;2*radius+1
I don't know how about compare to "LEA x,[y*2+1]"
And....I want not discuss in between SAL and SHL


>+movsxdifnidn heightq, heightd
>+VBROADCASTSS m0, m0
>+VBROADCASTSS m1, m1
>+pxor m6, m6
>+movss m5, [half]
>+VBROADCASTSS m5, m5
>+
>+xor dst_offq, dst_offq
>+xor c_offq, c_offq
>+
>+.loopy:
>+    xor off16q, off16q
>+    cmp widthq, mmsize/4
>+    jl .loopr
>+
>+    mov rq, widthq
>+    and rq, mmsize/4-1
>+    sub widthq, rq
>+

>+    .loop16: ;parallel process 16 elements in a row
Processing 4 column per loop, are you means, we want to save lots of unused register?
We claim X64, so we have 16 of XMMs


>+        pxor m4, m4
>+        xor iq, iq
>+        .loopi:

>+            movss m2, [matrixq + 4*iq]
no idea that you working on Float data path, we are lucky, Intel CPU sounds not penalty in here.


>+            VBROADCASTSS m2, m2
>+            mov ciq, [ptrq + iq * gprsize]
>+            movss m3, [ciq + c_offq] ;c[i][y*stride + off16]
>+            punpcklbw m3, m6

>+            punpcklwd m3, m6
Since you claim SSE4, the instruction PMOVZXBD available, moreover, SSE4 register can be full fill 16 of uint8, but load 4 of them only.


>+            pmulld m2, m3
>+            paddd m4, m2
>+
>+            add iq, 1

>+            cmp iq, radq
When you initial iq to radq and decrement per loop, you can reduce one instruction
I know iq is work as index in the loop, but we can found some trick over there.
>+            jl .loopi
>+
>+        cvtdq2ps m4, m4
>+        mulps m4, m0     ; sum *= rdiv
>+        addps m4, m1     ; sum += bias

>+        addps m4, m5     ; sum += 0.5
I don't know how about precision mismatch if we pre-compute (bias+0.5)


>+        cvttps2dq m4, m4
>+        packssdw m4, m4
>+        packuswb m4, m4
>+        movss [dstq + dst_offq], m4
>+        add c_offq, mmsize/4
>+        add dst_offq, mmsize/4
>+
>+        add off16q, mmsize/4
>+        cmp off16q, widthq
>+        jl .loop16
>+
>+    add widthq, rq
>+    cmp off16q, widthq
>+    jge .paraend
>+

>+    .loopr:
no idea about this loop, if we can read beyond, we can reuse above SIMD code


>+        xor sumd, sumd
>+        xor iq, iq
>+        .loopr_i:
>+            mov ciq, [ptrq + iq * gprsize]
>+            movzx rd, byte [ciq + c_offq]
>+            imul rd, [matrixq + 4*iq]
>+            add sumd, rd
>+
>+            add iq, 1
>+            cmp iq, radq
>+            jl .loopr_i
>+
>+        pxor m4, m4
>+        cvtsi2ss m4, sumd
>+        mulss m4, m0     ; sum *= rdiv
>+        addss m4, m1     ; sum += bias
>+        addss m4, m5     ; sum += 0.5
>+        cvttps2dq m4, m4
>+        packssdw m4, m4
>+        packuswb m4, m4
>+        movd sumd, m4
>+        mov [dstq + dst_offq], sumb
>+        add c_offq, 1
>+        add dst_offq, 1
>+        add off16q, 1
>+        cmp off16q, widthq
>+        jl .loopr
>+
>+    .paraend:
>+    sub c_offq, widthq
>+    sub dst_offq, widthq
>+    add c_offq, strideq
>+    add dst_offq, dstrideq
>+
>+    sub heightq, 1
>+    cmp heightq, 0
>+    jg .loopy
>+
>+.end:
>+    RET
Ruiling Song Dec. 4, 2019, 12:59 a.m. UTC | #2
> -----Original Message-----

> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of

> chen

> Sent: Tuesday, December 3, 2019 4:59 PM

> To: FFmpeg development discussions and patches <ffmpeg-

> devel@ffmpeg.org>

> Subject: Re: [FFmpeg-devel] [PATCH 3/3] avfilter/vf_convolution: add X86

> SIMD for filter_column()

> 

> comments inline in code

> 

> 

> At 2019-12-03 15:52:07, xujunzz@sjtu.edu.cn wrote:

> >From: Xu Jun <xujunzz@sjtu.edu.cn>

[...]
> >+

> >+        cvtdq2ps m4, m4

> >+        mulps m4, m0     ; sum *= rdiv

> >+        addps m4, m1     ; sum += bias

> 

> >+        addps m4, m5     ; sum += 0.5

> I don't know how about precision mismatch if we pre-compute (bias+0.5)

I think it is hard to prove it is safe to do pre-compute.

> 

> 

> >+        cvttps2dq m4, m4

> >+        packssdw m4, m4

> >+        packuswb m4, m4

> >+        movss [dstq + dst_offq], m4

> >+        add c_offq, mmsize/4

> >+        add dst_offq, mmsize/4

> >+

> >+        add off16q, mmsize/4

> >+        cmp off16q, widthq

> >+        jl .loop16

> >+

> >+    add widthq, rq

> >+    cmp off16q, widthq

> >+    jge .paraend

> >+

> 

> >+    .loopr:

> no idea about this loop, if we can read beyond, we can reuse above SIMD

> code

Reuse above SIMD code may write to the memory that does not belong to this slice-thread.
IMO, the code to handle remainder columns is still necessary.

Ruiling
> 

> 

> >+        xor sumd, sumd

> >+        xor iq, iq

> >+        .loopr_i:

> >+            mov ciq, [ptrq + iq * gprsize]

> >+            movzx rd, byte [ciq + c_offq]

> >+            imul rd, [matrixq + 4*iq]

> >+            add sumd, rd

> >+

> >+            add iq, 1

> >+            cmp iq, radq

> >+            jl .loopr_i

> >+

> >+        pxor m4, m4

> >+        cvtsi2ss m4, sumd

> >+        mulss m4, m0     ; sum *= rdiv

> >+        addss m4, m1     ; sum += bias

> >+        addss m4, m5     ; sum += 0.5

> >+        cvttps2dq m4, m4

> >+        packssdw m4, m4

> >+        packuswb m4, m4

> >+        movd sumd, m4

> >+        mov [dstq + dst_offq], sumb

> >+        add c_offq, 1

> >+        add dst_offq, 1

> >+        add off16q, 1

> >+        cmp off16q, widthq

> >+        jl .loopr

> >+

> >+    .paraend:

> >+    sub c_offq, widthq

> >+    sub dst_offq, widthq

> >+    add c_offq, strideq

> >+    add dst_offq, dstrideq

> >+

> >+    sub heightq, 1

> >+    cmp heightq, 0

> >+    jg .loopy

> >+

> >+.end:

> >+    RET

> 

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> 

> To unsubscribe, visit link above, or email

> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
chen Dec. 4, 2019, 1:36 a.m. UTC | #3
At 2019-12-04 08:59:08, "Song, Ruiling" <ruiling.song@intel.com> wrote:
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>> chen
>> Sent: Tuesday, December 3, 2019 4:59 PM
>> To: FFmpeg development discussions and patches <ffmpeg-
>> devel@ffmpeg.org>
>> Subject: Re: [FFmpeg-devel] [PATCH 3/3] avfilter/vf_convolution: add X86
>> SIMD for filter_column()
>> 
>> comments inline in code
>> 
>> 
>> At 2019-12-03 15:52:07, xujunzz@sjtu.edu.cn wrote:
>> >From: Xu Jun <xujunzz@sjtu.edu.cn>
>[...]
>> >+
>> >+        cvtdq2ps m4, m4
>> >+        mulps m4, m0     ; sum *= rdiv
>> >+        addps m4, m1     ; sum += bias
>> 
>> >+        addps m4, m5     ; sum += 0.5
>> I don't know how about precision mismatch if we pre-compute (bias+0.5)

>I think it is hard to prove it is safe to do pre-compute.
Agree, I also worried precision issue since float operator is execute order dependent.
How about ROUNDPS?


>
>> 
>> 
>> >+        cvttps2dq m4, m4
>> >+        packssdw m4, m4
>> >+        packuswb m4, m4
>> >+        movss [dstq + dst_offq], m4
>> >+        add c_offq, mmsize/4
>> >+        add dst_offq, mmsize/4
>> >+
>> >+        add off16q, mmsize/4
>> >+        cmp off16q, widthq
>> >+        jl .loop16
>> >+
>> >+    add widthq, rq
>> >+    cmp off16q, widthq
>> >+    jge .paraend
>> >+
>> 
>> >+    .loopr:
>> no idea about this loop, if we can read beyond, we can reuse above SIMD
>> code
>Reuse above SIMD code may write to the memory that does not belong to this slice-thread.

>IMO, the code to handle remainder columns is still necessary.


Depends on algorithm & size,
For example width=23
Process #0 [0:15]
Process #1 [7:22]
Both of them is multiple of 16
Ruiling Song Dec. 4, 2019, 2:28 a.m. UTC | #4
> -----Original Message-----

> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of

> chen

> Sent: Wednesday, December 4, 2019 9:36 AM

> To: FFmpeg development discussions and patches <ffmpeg-

> devel@ffmpeg.org>

> Subject: Re: [FFmpeg-devel] [PATCH 3/3] avfilter/vf_convolution: add X86

> SIMD for filter_column()

> 

> 

> 

> At 2019-12-04 08:59:08, "Song, Ruiling" <ruiling.song@intel.com> wrote:

> >> -----Original Message-----

> >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of

> >> chen

> >> Sent: Tuesday, December 3, 2019 4:59 PM

> >> To: FFmpeg development discussions and patches <ffmpeg-

> >> devel@ffmpeg.org>

> >> Subject: Re: [FFmpeg-devel] [PATCH 3/3] avfilter/vf_convolution: add X86

> >> SIMD for filter_column()

> >>

> >> comments inline in code

> >>

> >>

> >> At 2019-12-03 15:52:07, xujunzz@sjtu.edu.cn wrote:

> >> >From: Xu Jun <xujunzz@sjtu.edu.cn>

> >[...]

> >> >+

> >> >+        cvtdq2ps m4, m4

> >> >+        mulps m4, m0     ; sum *= rdiv

> >> >+        addps m4, m1     ; sum += bias

> >>

> >> >+        addps m4, m5     ; sum += 0.5

> >> I don't know how about precision mismatch if we pre-compute (bias+0.5)

> 

> >I think it is hard to prove it is safe to do pre-compute.

> Agree, I also worried precision issue since float operator is execute order

> dependent.

> How about ROUNDPS?

Seems no exactly match.
> 

> 

> >

> >>

> >>

> >> >+        cvttps2dq m4, m4

> >> >+        packssdw m4, m4

> >> >+        packuswb m4, m4

> >> >+        movss [dstq + dst_offq], m4

> >> >+        add c_offq, mmsize/4

> >> >+        add dst_offq, mmsize/4

> >> >+

> >> >+        add off16q, mmsize/4

> >> >+        cmp off16q, widthq

> >> >+        jl .loop16

> >> >+

> >> >+    add widthq, rq

> >> >+    cmp off16q, widthq

> >> >+    jge .paraend

> >> >+

> >>

> >> >+    .loopr:

> >> no idea about this loop, if we can read beyond, we can reuse above SIMD

> >> code

> >Reuse above SIMD code may write to the memory that does not belong to

> this slice-thread.

> 

> >IMO, the code to handle remainder columns is still necessary.

> 

> 

> Depends on algorithm & size,

> For example width=23

> Process #0 [0:15]

> Process #1 [7:22]

> Both of them is multiple of 16

Sounds interesting. But FFmpeg does not do like this now.
One question is will this get a penalty for writing to same address of memory (both are writing to 7-15) from different threads?

> 

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> 

> To unsubscribe, visit link above, or email

> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Paul B Mahol Dec. 4, 2019, 8:51 a.m. UTC | #5
On 12/4/19, Song, Ruiling <ruiling.song@intel.com> wrote:
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>> chen
>> Sent: Wednesday, December 4, 2019 9:36 AM
>> To: FFmpeg development discussions and patches <ffmpeg-
>> devel@ffmpeg.org>
>> Subject: Re: [FFmpeg-devel] [PATCH 3/3] avfilter/vf_convolution: add X86
>> SIMD for filter_column()
>>
>>
>>
>> At 2019-12-04 08:59:08, "Song, Ruiling" <ruiling.song@intel.com> wrote:
>> >> -----Original Message-----
>> >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>> >> chen
>> >> Sent: Tuesday, December 3, 2019 4:59 PM
>> >> To: FFmpeg development discussions and patches <ffmpeg-
>> >> devel@ffmpeg.org>
>> >> Subject: Re: [FFmpeg-devel] [PATCH 3/3] avfilter/vf_convolution: add
>> >> X86
>> >> SIMD for filter_column()
>> >>
>> >> comments inline in code
>> >>
>> >>
>> >> At 2019-12-03 15:52:07, xujunzz@sjtu.edu.cn wrote:
>> >> >From: Xu Jun <xujunzz@sjtu.edu.cn>
>> >[...]
>> >> >+
>> >> >+        cvtdq2ps m4, m4
>> >> >+        mulps m4, m0     ; sum *= rdiv
>> >> >+        addps m4, m1     ; sum += bias
>> >>
>> >> >+        addps m4, m5     ; sum += 0.5
>> >> I don't know how about precision mismatch if we pre-compute (bias+0.5)
>>
>> >I think it is hard to prove it is safe to do pre-compute.
>> Agree, I also worried precision issue since float operator is execute
>> order
>> dependent.
>> How about ROUNDPS?
> Seems no exactly match.
>>
>>
>> >
>> >>
>> >>
>> >> >+        cvttps2dq m4, m4
>> >> >+        packssdw m4, m4
>> >> >+        packuswb m4, m4
>> >> >+        movss [dstq + dst_offq], m4
>> >> >+        add c_offq, mmsize/4
>> >> >+        add dst_offq, mmsize/4
>> >> >+
>> >> >+        add off16q, mmsize/4
>> >> >+        cmp off16q, widthq
>> >> >+        jl .loop16
>> >> >+
>> >> >+    add widthq, rq
>> >> >+    cmp off16q, widthq
>> >> >+    jge .paraend
>> >> >+
>> >>
>> >> >+    .loopr:
>> >> no idea about this loop, if we can read beyond, we can reuse above
>> >> SIMD
>> >> code
>> >Reuse above SIMD code may write to the memory that does not belong to
>> this slice-thread.
>>
>> >IMO, the code to handle remainder columns is still necessary.
>>
>>
>> Depends on algorithm & size,
>> For example width=23
>> Process #0 [0:15]
>> Process #1 [7:22]
>> Both of them is multiple of 16
> Sounds interesting. But FFmpeg does not do like this now.
> One question is will this get a penalty for writing to same address of
> memory (both are writing to 7-15) from different threads?

Yes, and even bad results may happen.

>
>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
chen Dec. 4, 2019, 11:41 a.m. UTC | #6
At 2019-12-04 16:51:52, "Paul B Mahol" <onemda@gmail.com> wrote:
>On 12/4/19, Song, Ruiling <ruiling.song@intel.com> wrote:
>>> -----Original Message-----
>>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>>> chen

>>> >> At 2019-12-03 15:52:07, xujunzz@sjtu.edu.cn wrote:
>>> >> >From: Xu Jun <xujunzz@sjtu.edu.cn>
>>> >[...]
>>> >> >+
>>> >> >+        cvtdq2ps m4, m4
>>> >> >+        mulps m4, m0     ; sum *= rdiv
>>> >> >+        addps m4, m1     ; sum += bias
>>> >>
>>> >> >+        addps m4, m5     ; sum += 0.5
>>> >> I don't know how about precision mismatch if we pre-compute (bias+0.5)
>>>
>>> >I think it is hard to prove it is safe to do pre-compute.
>>> Agree, I also worried precision issue since float operator is execute
>>> order
>>> dependent.
>>> How about ROUNDPS?

>> Seems no exactly match.
Funny, I guess it is other issue, such as mistake on instruction's imm field.


>>> >> >+        cvttps2dq m4, m4
>>> >> >+        packssdw m4, m4
>>> >> >+        packuswb m4, m4
>>> >> >+        movss [dstq + dst_offq], m4
>>> >> >+        add c_offq, mmsize/4
>>> >> >+        add dst_offq, mmsize/4
>>> >> >+
>>> >> >+        add off16q, mmsize/4
>>> >> >+        cmp off16q, widthq
>>> >> >+        jl .loop16
>>> >> >+
>>> >> >+    add widthq, rq
>>> >> >+    cmp off16q, widthq
>>> >> >+    jge .paraend
>>> >> >+
>>> >>
>>> >> >+    .loopr:
>>> >> no idea about this loop, if we can read beyond, we can reuse above
>>> >> SIMD
>>> >> code
>>> >Reuse above SIMD code may write to the memory that does not belong to
>>> this slice-thread.
>>>
>>> >IMO, the code to handle remainder columns is still necessary.
>>>
>>>
>>> Depends on algorithm & size,
>>> For example width=23
>>> Process #0 [0:15]
>>> Process #1 [7:22]
>>> Both of them is multiple of 16
>> Sounds interesting. But FFmpeg does not do like this now.
>> One question is will this get a penalty for writing to same address of
>> memory (both are writing to 7-15) from different threads?
>
>Yes, and even bad results may happen.

>
This is my problem, I don't speak clean, the "Process #x" is one step of loops,
I guess the function must be atomic, we can place any threading that work on same address area.
Xu Jun Dec. 5, 2019, 5:49 a.m. UTC | #7
Hi, chen

----- 原始邮件 -----
> 发件人: "chen" <chenm003@163.com>
> 收件人: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
> 发送时间: 星期二, 2019年 12 月 03日 下午 4:59:06
> 主题: Re: [FFmpeg-devel] [PATCH 3/3] avfilter/vf_convolution: add X86 SIMD for filter_column()

> comments inline in code
> 
> 
> At 2019-12-03 15:52:07, xujunzz@sjtu.edu.cn wrote:
>>From: Xu Jun <xujunzz@sjtu.edu.cn>
>>
>>+; void filter_column(uint8_t *dst, int height,
>>+;                         float rdiv, float bias, const int *const matrix,
>>+;                         const uint8_t *c[], int length, int radius,
>>+;                         int dstride, int stride);
>>+
>>+%if ARCH_X86_64
>>+INIT_XMM sse4
>>+%if UNIX64
>>+cglobal filter_column, 8, 15, 7, dst, height, matrix, ptr, width, rad, dstride,
>>stride, i, ci, dst_off, off16, c_off, sum, r
>>+%else
>>+cglobal filter_column, 8, 15, 7, dst, height, rdiv, bias, matrix, ptr, width,
>>rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r
> 
>>+%endif
> no idea, these are difficult to read and understand

I will rename some variables to make it more readable. Do I need to add some notes here?

> 
> 
> 
> 
>>+
>>+%if WIN64
>>+    SWAP m0, m2
>>+    SWAP m1, m3
>>+    mov r2q, matrixmp
>>+    mov r3q, ptrmp
>>+    mov r4q, widthmp
>>+    mov r5q, radmp
>>+    mov r6q, dstridemp
>>+    mov r7q, stridemp
>>+    DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, i, ci,
>>dst_off, off16, c_off, sum, r
>>+%endif
>>+
>>+movsxdifnidn widthq, widthd
>>+movsxdifnidn radq, radd
>>+movsxdifnidn dstrideq, dstrided
>>+movsxdifnidn strideq, strided
>>+sal radq, 1
> 
>>+add radq, 1     ;2*radius+1
> I don't know how about compare to "LEA x,[y*2+1]"
> And....I want not discuss in between SAL and SHL
> 

I think lea is better and I will change in the next version.

> 
>>+movsxdifnidn heightq, heightd
>>+VBROADCASTSS m0, m0
>>+VBROADCASTSS m1, m1
>>+pxor m6, m6
>>+movss m5, [half]
>>+VBROADCASTSS m5, m5
>>+
>>+xor dst_offq, dst_offq
>>+xor c_offq, c_offq
>>+
>>+.loopy:
>>+    xor off16q, off16q
>>+    cmp widthq, mmsize/4
>>+    jl .loopr
>>+
>>+    mov rq, widthq
>>+    and rq, mmsize/4-1
>>+    sub widthq, rq
>>+
> 
>>+    .loop16: ;parallel process 16 elements in a row
> Processing 4 column per loop, are you means, we want to save lots of unused
> register?
> We claim X64, so we have 16 of XMMs

Will use more XMMs and process 16 column at a time.

> 
> 
>>+        pxor m4, m4
>>+        xor iq, iq
>>+        .loopi:
> 
>>+            movss m2, [matrixq + 4*iq]
> no idea that you working on Float data path, we are lucky, Intel CPU sounds not
> penalty in here.

Will change to Interger data path using movd.
And movd seems to have less CPI than movss.

> 
> 
>>+            VBROADCASTSS m2, m2
>>+            mov ciq, [ptrq + iq * gprsize]
>>+            movss m3, [ciq + c_offq] ;c[i][y*stride + off16]
>>+            punpcklbw m3, m6
> 
>>+            punpcklwd m3, m6
> Since you claim SSE4, the instruction PMOVZXBD available, moreover, SSE4
> register can be full fill 16 of uint8, but load 4 of them only.

I thought that since I would multiply 4 ints, loading 4 uint8s per loop is OK.
Now I know that read 16 uint8s and shuffle them is faster.
Will change in next version.

> 
>>+            pmulld m2, m3
>>+            paddd m4, m2
>>+
>>+            add iq, 1
> 
>>+            cmp iq, radq
> When you initial iq to radq and decrement per loop, you can reduce one
> instruction
> I know iq is work as index in the loop, but we can found some trick over there.

Will change in next V.

>>+            jl .loopi
>>+
>>+        cvtdq2ps m4, m4
>>+        mulps m4, m0     ; sum *= rdiv
>>+        addps m4, m1     ; sum += bias
> 
>>+        addps m4, m5     ; sum += 0.5
> I don't know how about precision mismatch if we pre-compute (bias+0.5)

Here may not be modified after discussions.

> 
> 
>>+        cvttps2dq m4, m4
>>+        packssdw m4, m4
>>+        packuswb m4, m4
>>+        movss [dstq + dst_offq], m4
>>+        add c_offq, mmsize/4
>>+        add dst_offq, mmsize/4
>>+
>>+        add off16q, mmsize/4
>>+        cmp off16q, widthq
>>+        jl .loop16
>>+
>>+    add widthq, rq
>>+    cmp off16q, widthq
>>+    jge .paraend
>>+
> 
>>+    .loopr:
> no idea about this loop, if we can read beyond, we can reuse above SIMD code

Here may not be modified too.

Xu Jun

> 
> 
>>+        xor sumd, sumd
>>+        xor iq, iq
>>+        .loopr_i:
>>+            mov ciq, [ptrq + iq * gprsize]
>>+            movzx rd, byte [ciq + c_offq]
>>+            imul rd, [matrixq + 4*iq]
>>+            add sumd, rd
>>+
>>+            add iq, 1
>>+            cmp iq, radq
>>+            jl .loopr_i
>>+
>>+        pxor m4, m4
>>+        cvtsi2ss m4, sumd
>>+        mulss m4, m0     ; sum *= rdiv
>>+        addss m4, m1     ; sum += bias
>>+        addss m4, m5     ; sum += 0.5
>>+        cvttps2dq m4, m4
>>+        packssdw m4, m4
>>+        packuswb m4, m4
>>+        movd sumd, m4
>>+        mov [dstq + dst_offq], sumb
>>+        add c_offq, 1
>>+        add dst_offq, 1
>>+        add off16q, 1
>>+        cmp off16q, widthq
>>+        jl .loopr
>>+
>>+    .paraend:
>>+    sub c_offq, widthq
>>+    sub dst_offq, widthq
>>+    add c_offq, strideq
>>+    add dst_offq, dstrideq
>>+
>>+    sub heightq, 1
>>+    cmp heightq, 0
>>+    jg .loopy
>>+
>>+.end:
>>+    RET
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff mbox

Patch

diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm
index b71e9720fb..3dbfb26978 100755
--- a/libavfilter/x86/vf_convolution.asm
+++ b/libavfilter/x86/vf_convolution.asm
@@ -258,3 +258,132 @@  sub widthq, rq
 .end:
     RET
 %endif
+
+; void filter_column(uint8_t *dst, int height,
+;                         float rdiv, float bias, const int *const matrix,
+;                         const uint8_t *c[], int length, int radius,
+;                         int dstride, int stride);
+
+%if ARCH_X86_64
+INIT_XMM sse4
+%if UNIX64
+cglobal filter_column, 8, 15, 7, dst, height, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r
+%else
+cglobal filter_column, 8, 15, 7, dst, height, rdiv, bias, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r
+%endif
+
+%if WIN64
+    SWAP m0, m2
+    SWAP m1, m3
+    mov r2q, matrixmp
+    mov r3q, ptrmp
+    mov r4q, widthmp
+    mov r5q, radmp
+    mov r6q, dstridemp
+    mov r7q, stridemp
+    DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r
+%endif
+
+movsxdifnidn widthq, widthd
+movsxdifnidn radq, radd
+movsxdifnidn dstrideq, dstrided
+movsxdifnidn strideq, strided
+sal radq, 1
+add radq, 1     ;2*radius+1
+movsxdifnidn heightq, heightd
+VBROADCASTSS m0, m0
+VBROADCASTSS m1, m1
+pxor m6, m6
+movss m5, [half]
+VBROADCASTSS m5, m5
+
+xor dst_offq, dst_offq
+xor c_offq, c_offq
+
+.loopy:
+    xor off16q, off16q
+    cmp widthq, mmsize/4
+    jl .loopr
+
+    mov rq, widthq
+    and rq, mmsize/4-1
+    sub widthq, rq
+
+    .loop16: ;parallel process 16 elements in a row
+        pxor m4, m4
+        xor iq, iq
+        .loopi:
+            movss m2, [matrixq + 4*iq]
+            VBROADCASTSS m2, m2
+            mov ciq, [ptrq + iq * gprsize]
+            movss m3, [ciq + c_offq] ;c[i][y*stride + off16]
+            punpcklbw m3, m6
+            punpcklwd m3, m6
+            pmulld m2, m3
+            paddd m4, m2
+
+            add iq, 1
+            cmp iq, radq
+            jl .loopi
+
+        cvtdq2ps m4, m4
+        mulps m4, m0     ; sum *= rdiv
+        addps m4, m1     ; sum += bias
+        addps m4, m5     ; sum += 0.5
+        cvttps2dq m4, m4
+        packssdw m4, m4
+        packuswb m4, m4
+        movss [dstq + dst_offq], m4
+        add c_offq, mmsize/4
+        add dst_offq, mmsize/4
+
+        add off16q, mmsize/4
+        cmp off16q, widthq
+        jl .loop16
+
+    add widthq, rq
+    cmp off16q, widthq
+    jge .paraend
+
+    .loopr:
+        xor sumd, sumd
+        xor iq, iq
+        .loopr_i:
+            mov ciq, [ptrq + iq * gprsize]
+            movzx rd, byte [ciq + c_offq]
+            imul rd, [matrixq + 4*iq]
+            add sumd, rd
+
+            add iq, 1
+            cmp iq, radq
+            jl .loopr_i
+
+        pxor m4, m4
+        cvtsi2ss m4, sumd
+        mulss m4, m0     ; sum *= rdiv
+        addss m4, m1     ; sum += bias
+        addss m4, m5     ; sum += 0.5
+        cvttps2dq m4, m4
+        packssdw m4, m4
+        packuswb m4, m4
+        movd sumd, m4
+        mov [dstq + dst_offq], sumb
+        add c_offq, 1
+        add dst_offq, 1
+        add off16q, 1
+        cmp off16q, widthq
+        jl .loopr
+
+    .paraend:
+    sub c_offq, widthq
+    sub dst_offq, widthq
+    add c_offq, strideq
+    add dst_offq, dstrideq
+
+    sub heightq, 1
+    cmp heightq, 0
+    jg .loopy
+
+.end:
+    RET
+%endif
diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c
index 5eb3b3bee1..da39b8a400 100644
--- a/libavfilter/x86/vf_convolution_init.c
+++ b/libavfilter/x86/vf_convolution_init.c
@@ -34,6 +34,11 @@  void ff_filter_row_sse4(uint8_t *dst, int width,
                         const uint8_t *c[], int peak, int radius,
                         int dstride, int stride);
 
+void ff_filter_column_sse4(uint8_t *dst, int height,
+                        float rdiv, float bias, const int *const matrix,
+                        const uint8_t *c[], int length, int radius,
+                        int dstride, int stride);
+
 av_cold void ff_convolution_init_x86(ConvolutionContext *s)
 {
 #if ARCH_X86_64
@@ -50,6 +55,10 @@  av_cold void ff_convolution_init_x86(ConvolutionContext *s)
                 if (EXTERNAL_SSE4(cpu_flags))
                     s->filter[i] = ff_filter_row_sse4;
         }
+        if (s->mode[i] == MATRIX_COLUMN) {
+                if (EXTERNAL_SSE4(cpu_flags))
+                    s->filter[i] = ff_filter_column_sse4;
+        }
     }
 #endif
 }