Message ID | 20191222083703.3080-3-xujunzz@sjtu.edu.cn |
---|---|
State | New |
Headers | show |
comments inlined At 2019-12-22 16:37:03, xujunzz@sjtu.edu.cn wrote: >From: Xu Jun <xujunzz@sjtu.edu.cn> > >Performance improves about 10% compared to v1. > >Tested using this command: >./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 5000 -f null /dev/null -benchmark > >after patch: >frame= 4317 fps=600 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed= 24x >video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown >bench: utime=21.540s stime=2.091s rtime=7.197s > >before patch: >frame= 4317 fps=263 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=10.5x >video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown >bench: utime=74.377s stime=1.880s rtime=16.420s > >Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn> >--- > libavfilter/x86/vf_convolution.asm | 202 ++++++++++++++++++++++++++ > libavfilter/x86/vf_convolution_init.c | 9 ++ > 2 files changed, 211 insertions(+) > >diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm >index 2a09374b00..4c700656d6 100755 >--- a/libavfilter/x86/vf_convolution.asm >+++ b/libavfilter/x86/vf_convolution.asm >@@ -22,6 +22,8 @@ > > SECTION_RODATA > half: dd 0.5 >+shuf_init: ddq 0x80808003808080028080800180808000 TBD ps: defining this constant as bytes (db) or words (dw) would be more readable; in this case you use it with pshufb, so bytes (db). 
>+shuf_step: ddq 0x00000004000000040000000400000004 > > SECTION .text > >@@ -285,3 +287,203 @@ sub widthq, rq > .end: > RET > %endif >+ >+; void filter_column(uint8_t *dst, int height, >+; float rdiv, float bias, const int *const matrix, >+; const uint8_t *c[], int length, int radius, >+; int dstride, int stride); >+ >+%macro COMPUTE_4COL 1 >+ pshufb m7, m6, m4 ; get 4 uint8s from the 16 uint8s Unnecessary; see the comment below. >+ pmulld m7, m5 >+ paddd m1%1, m7 Not an error, but in general this accumulation creates a serial dependency chain that may stall the pipeline; I suggest summing into 4 registers in parallel. In this case I am not sure about the dynamic range of the matrix, so I cannot tell whether it is safe, or whether it overflows, when summing (2 * radius + 1) elements. >+%endmacro >+ >+%macro CVT_PACK_COL 1 >+ cvtdq2ps m1%1, m1%1 >+ mulps m1%1, m0 ; sum *= rdiv >+ addps m1%1, m1 ; sum += bias >+ addps m1%1, m3 ; sum += 0.5 >+ cvttps2dq m1%1, m1%1 >+ packssdw m1%1, m1%1 >+ packuswb m1%1, m1%1 >+%endmacro >+ >+%if ARCH_X86_64 >+INIT_XMM sse4 >+%if UNIX64 >+cglobal filter_column, 8, 14, 14, dst, height, matrix, ptr, width, rad, dstride, stride, \ >+i, ci, ystride, sum, r, off16 >+%else >+cglobal filter_column, 8, 14, 14, dst, height, rdiv, bias, matrix, ptr, width, rad, dstride, stride, \ >+i, ci, ystride, sum, r, off16 >+%endif >+ >+%if WIN64 >+ SWAP m0, m2 >+ SWAP m1, m3 >+ mov r2q, matrixmp >+ mov r3q, ptrmp >+ mov r4q, widthmp >+ mov r5q, radmp >+ mov r6q, dstridemp >+ mov r7q, stridemp >+ DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, \ >+ i, ci, ystride, sum, r, off16 >+%endif >+ >+movsxdifnidn widthq, widthd >+movsxdifnidn radq, radd >+lea radq, [radq * 2 + 1] >+movsxdifnidn dstrideq, dstrided >+movsxdifnidn strideq, strided >+movsxdifnidn heightq, heightd >+ >+VBROADCASTSS m0, m0 ; rdiv >+VBROADCASTSS m1, m1 ; bias >+pxor m2, m2 ; zero >+movss m3, [half] >+VBROADCASTSS m3, m3 ; 0.5 >+movdqu m8, [shuf_init] ; shuffle initialization TBD >+movdqu m9, [shuf_step] ; shuffle step >+ >+xor 
ystrideq, ystrideq ; y*stride >+ >+cmp widthq, mmsize ;if width<16 run loopr, width=16 run 16 parallel >+jl .less16 >+ >+.equal16: >+ pxor m10, m10 >+ pxor m11, m11 >+ pxor m12, m12 >+ pxor m13, m13 >+ ; m10-13 hold sums Not an error; however, using m0-m7 instead saves a 1-byte instruction prefix, which gives a small performance improvement in the inner loop. >+ >+ lea iq, [radq - 1] >+ .loopi: >+ movd m5, [matrixq + 4*iq] ; matrix[i] >+ VBROADCASTSS m5, m5 Since you require SSE4, PSHUFD may be better; however, it is not a problem if you intend to upgrade to AVX and above. >+ mov ciq, [ptrq + iq * gprsize] >+ movdqu m6, [ciq + ystrideq] ; c[i][y*stride] 16 uint8s SSE4 provides PMOVZXBD, which would let you drop the PSHUFB above and the series of shuffle-constant loads. >+ >+ ;m4 controls shuffle >+ movdqa m4, m8 >+ COMPUTE_4COL 0 ; process 0-3 cols, sum in m10 >+ paddd m4, m9 >+ COMPUTE_4COL 1 ; process 4-7 cols, sum in m11 >+ paddd m4, m9 >+ COMPUTE_4COL 2 ; process 8-11 cols, sum in m12 >+ paddd m4, m9 >+ COMPUTE_4COL 3 ; process 12-15 cols, sum in m13 >+ >+ sub iq, 1 >+ jns .loopi >+ >+ CVT_PACK_COL 0 ; process 0-3 cols, result in m10's low 32bit >+ CVT_PACK_COL 1 ; process 4-7 cols, result in m11's low 32bit >+ CVT_PACK_COL 2 ; process 8-11 cols, result in m12's low 32bit >+ CVT_PACK_COL 3 ; process 12-15 cols, result in m13's low 32bit >+ punpckldq m10, m11 >+ punpckldq m12, m13 >+ punpcklqdq m10, m12 ; pack 16 results in m10 >+ movdqu [dstq], m10 >+ >+ add dstq, dstrideq >+ add ystrideq, strideq >+ sub heightq, 1 >+ jnz .equal16 >+ jmp .end >+ >+.less16: >+ xor off16q, off16q >+ cmp widthq, mmsize/4 >+ jl .loopr >+ >+ mov rq, widthq >+ and rq, mmsize/4-1 >+ sub widthq, rq >+ >+ pxor m10, m10 >+ pxor m11, m11 >+ pxor m12, m12 >+ >+ lea iq, [radq - 1] >+ .loopi_4: >+ movd m5, [matrixq + 4*iq] ; matrix[i] >+ VBROADCASTSS m5, m5 >+ mov ciq, [ptrq + iq * gprsize] >+ movdqu m6, [ciq + ystrideq] ; c[i][y*stride] 16 uint8s >+ >+ ;m4 controls shuffle >+ movdqa m4, m8 >+ COMPUTE_4COL 0 ; process 0-3 cols, sum in 
m10 >+ cmp widthq, mmsize/4 ; width = 4 >+ je .i4_end >+ >+ paddd m4, m9 >+ COMPUTE_4COL 1 ; process 4-7 cols, sum in m11 >+ cmp widthq, mmsize/2 ; width = 8 >+ je .i4_end >+ >+ paddd m4, m9 >+ COMPUTE_4COL 2 ; process 8-11 cols, sum in m12 >+ >+ .i4_end: >+ sub iq, 1 >+ jns .loopi_4 >+ >+ CVT_PACK_COL 0 ; process 0-3 cols, result in m10's low 32bit >+ movd [dstq], m10 >+ cmp widthq, mmsize/4 ; width = 4 >+ je .cvt_end >+ >+ CVT_PACK_COL 1 ; process 4-7 cols, result in m11's low 32bit >+ movd [dstq + mmsize/4], m11 >+ cmp widthq, mmsize/2 ; width = 8 >+ je .cvt_end >+ >+ CVT_PACK_COL 2 ; process 8-11 cols, result in m12's low 32bit >+ movd [dstq + mmsize/2], m12 >+ >+ .cvt_end: >+ cmp rq, 0 >+ je .loopr_end >+ mov off16q, widthq >+ add widthq, rq >+ >+ .loopr: >+ xor sumq, sumq >+ lea iq, [radq - 1] >+ .loopr_i: >+ mov ciq, [ptrq + iq * gprsize] >+ add ciq, ystrideq >+ movzx rd, byte [ciq + off16q] >+ imul rd, [matrixq + 4*iq] >+ add sumd, rd >+ >+ sub iq, 1 >+ jns .loopr_i >+ >+ pxor m7, m7 >+ cvtsi2ss m7, sumd >+ mulss m7, m0 ; sum *= rdiv >+ addss m7, m1 ; sum += bias >+ addss m7, m3 ; sum += 0.5 >+ cvttps2dq m7, m7 >+ packssdw m7, m7 >+ packuswb m7, m7 >+ movd sumd, m7 >+ mov [dstq + off16q], sumb SSE4 provides PEXTRB for this. >+ add off16q, 1 >+ cmp off16q, widthq >+ jl .loopr >+ >+ .loopr_end: >+ add dstq, dstrideq >+ add ystrideq, strideq >+ sub heightq, 1 >+ jnz .less16 JNZ is not a problem, but I would prefer JG (signed greater-than), which avoids any risk if the value ever goes negative.
On Sun, Dec 22, 2019 at 16:37:03 +0800, xujunzz@sjtu.edu.cn wrote: > + if (s->mode[i] == MATRIX_COLUMN) { > + if (EXTERNAL_SSE4(cpu_flags)) > + s->filter[i] = ff_filter_column_sse4; > + } Incorrect indentation. Moritz
What is status of this? On 12/22/19, xujunzz@sjtu.edu.cn <xujunzz@sjtu.edu.cn> wrote: > From: Xu Jun <xujunzz@sjtu.edu.cn> > > Performance improves about 10% compared to v1. > > Tested using this command: > ./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 > 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 > 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 5000 > -f null /dev/null -benchmark > > after patch: > frame= 4317 fps=600 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed= > 24x > video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB > muxing overhead: unknown > bench: utime=21.540s stime=2.091s rtime=7.197s > > before patch: > frame= 4317 fps=263 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A > speed=10.5x > video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB > muxing overhead: unknown > bench: utime=74.377s stime=1.880s rtime=16.420s > > Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn> > --- > libavfilter/x86/vf_convolution.asm | 202 ++++++++++++++++++++++++++ > libavfilter/x86/vf_convolution_init.c | 9 ++ > 2 files changed, 211 insertions(+)
diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm index 2a09374b00..4c700656d6 100755 --- a/libavfilter/x86/vf_convolution.asm +++ b/libavfilter/x86/vf_convolution.asm @@ -22,6 +22,8 @@ SECTION_RODATA half: dd 0.5 +shuf_init: ddq 0x80808003808080028080800180808000 +shuf_step: ddq 0x00000004000000040000000400000004 SECTION .text @@ -285,3 +287,203 @@ sub widthq, rq .end: RET %endif + +; void filter_column(uint8_t *dst, int height, +; float rdiv, float bias, const int *const matrix, +; const uint8_t *c[], int length, int radius, +; int dstride, int stride); + +%macro COMPUTE_4COL 1 + pshufb m7, m6, m4 ; get 4 uint8s from the 16 uint8s + pmulld m7, m5 + paddd m1%1, m7 +%endmacro + +%macro CVT_PACK_COL 1 + cvtdq2ps m1%1, m1%1 + mulps m1%1, m0 ; sum *= rdiv + addps m1%1, m1 ; sum += bias + addps m1%1, m3 ; sum += 0.5 + cvttps2dq m1%1, m1%1 + packssdw m1%1, m1%1 + packuswb m1%1, m1%1 +%endmacro + +%if ARCH_X86_64 +INIT_XMM sse4 +%if UNIX64 +cglobal filter_column, 8, 14, 14, dst, height, matrix, ptr, width, rad, dstride, stride, \ +i, ci, ystride, sum, r, off16 +%else +cglobal filter_column, 8, 14, 14, dst, height, rdiv, bias, matrix, ptr, width, rad, dstride, stride, \ +i, ci, ystride, sum, r, off16 +%endif + +%if WIN64 + SWAP m0, m2 + SWAP m1, m3 + mov r2q, matrixmp + mov r3q, ptrmp + mov r4q, widthmp + mov r5q, radmp + mov r6q, dstridemp + mov r7q, stridemp + DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, \ + i, ci, ystride, sum, r, off16 +%endif + +movsxdifnidn widthq, widthd +movsxdifnidn radq, radd +lea radq, [radq * 2 + 1] +movsxdifnidn dstrideq, dstrided +movsxdifnidn strideq, strided +movsxdifnidn heightq, heightd + +VBROADCASTSS m0, m0 ; rdiv +VBROADCASTSS m1, m1 ; bias +pxor m2, m2 ; zero +movss m3, [half] +VBROADCASTSS m3, m3 ; 0.5 +movdqu m8, [shuf_init] ; shuffle initialization +movdqu m9, [shuf_step] ; shuffle step + +xor ystrideq, ystrideq ; y*stride + +cmp widthq, mmsize ;if width<16 run loopr, 
width=16 run 16 parallel +jl .less16 + +.equal16: + pxor m10, m10 + pxor m11, m11 + pxor m12, m12 + pxor m13, m13 + ; m10-13 hold sums + + lea iq, [radq - 1] + .loopi: + movd m5, [matrixq + 4*iq] ; matrix[i] + VBROADCASTSS m5, m5 + mov ciq, [ptrq + iq * gprsize] + movdqu m6, [ciq + ystrideq] ; c[i][y*stride] 16 uint8s + + ;m4 controls shuffle + movdqa m4, m8 + COMPUTE_4COL 0 ; process 0-3 cols, sum in m10 + paddd m4, m9 + COMPUTE_4COL 1 ; process 4-7 cols, sum in m11 + paddd m4, m9 + COMPUTE_4COL 2 ; process 8-11 cols, sum in m12 + paddd m4, m9 + COMPUTE_4COL 3 ; process 12-15 cols, sum in m13 + + sub iq, 1 + jns .loopi + + CVT_PACK_COL 0 ; process 0-3 cols, result in m10's low 32bit + CVT_PACK_COL 1 ; process 4-7 cols, result in m11's low 32bit + CVT_PACK_COL 2 ; process 8-11 cols, result in m12's low 32bit + CVT_PACK_COL 3 ; process 12-15 cols, result in m13's low 32bit + punpckldq m10, m11 + punpckldq m12, m13 + punpcklqdq m10, m12 ; pack 16 results in m10 + movdqu [dstq], m10 + + add dstq, dstrideq + add ystrideq, strideq + sub heightq, 1 + jnz .equal16 + jmp .end + +.less16: + xor off16q, off16q + cmp widthq, mmsize/4 + jl .loopr + + mov rq, widthq + and rq, mmsize/4-1 + sub widthq, rq + + pxor m10, m10 + pxor m11, m11 + pxor m12, m12 + + lea iq, [radq - 1] + .loopi_4: + movd m5, [matrixq + 4*iq] ; matrix[i] + VBROADCASTSS m5, m5 + mov ciq, [ptrq + iq * gprsize] + movdqu m6, [ciq + ystrideq] ; c[i][y*stride] 16 uint8s + + ;m4 controls shuffle + movdqa m4, m8 + COMPUTE_4COL 0 ; process 0-3 cols, sum in m10 + cmp widthq, mmsize/4 ; width = 4 + je .i4_end + + paddd m4, m9 + COMPUTE_4COL 1 ; process 4-7 cols, sum in m11 + cmp widthq, mmsize/2 ; width = 8 + je .i4_end + + paddd m4, m9 + COMPUTE_4COL 2 ; process 8-11 cols, sum in m12 + + .i4_end: + sub iq, 1 + jns .loopi_4 + + CVT_PACK_COL 0 ; process 0-3 cols, result in m10's low 32bit + movd [dstq], m10 + cmp widthq, mmsize/4 ; width = 4 + je .cvt_end + + CVT_PACK_COL 1 ; process 4-7 cols, result in m11's low 
32bit + movd [dstq + mmsize/4], m11 + cmp widthq, mmsize/2 ; width = 8 + je .cvt_end + + CVT_PACK_COL 2 ; process 8-11 cols, result in m12's low 32bit + movd [dstq + mmsize/2], m12 + + .cvt_end: + cmp rq, 0 + je .loopr_end + mov off16q, widthq + add widthq, rq + + .loopr: + xor sumq, sumq + lea iq, [radq - 1] + .loopr_i: + mov ciq, [ptrq + iq * gprsize] + add ciq, ystrideq + movzx rd, byte [ciq + off16q] + imul rd, [matrixq + 4*iq] + add sumd, rd + + sub iq, 1 + jns .loopr_i + + pxor m7, m7 + cvtsi2ss m7, sumd + mulss m7, m0 ; sum *= rdiv + addss m7, m1 ; sum += bias + addss m7, m3 ; sum += 0.5 + cvttps2dq m7, m7 + packssdw m7, m7 + packuswb m7, m7 + movd sumd, m7 + mov [dstq + off16q], sumb + add off16q, 1 + cmp off16q, widthq + jl .loopr + + .loopr_end: + add dstq, dstrideq + add ystrideq, strideq + sub heightq, 1 + jnz .less16 + +.end: + RET +%endif diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c index 5eb3b3bee1..da39b8a400 100644 --- a/libavfilter/x86/vf_convolution_init.c +++ b/libavfilter/x86/vf_convolution_init.c @@ -34,6 +34,11 @@ void ff_filter_row_sse4(uint8_t *dst, int width, const uint8_t *c[], int peak, int radius, int dstride, int stride); +void ff_filter_column_sse4(uint8_t *dst, int height, + float rdiv, float bias, const int *const matrix, + const uint8_t *c[], int length, int radius, + int dstride, int stride); + av_cold void ff_convolution_init_x86(ConvolutionContext *s) { #if ARCH_X86_64 @@ -50,6 +55,10 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s) if (EXTERNAL_SSE4(cpu_flags)) s->filter[i] = ff_filter_row_sse4; } + if (s->mode[i] == MATRIX_COLUMN) { + if (EXTERNAL_SSE4(cpu_flags)) + s->filter[i] = ff_filter_column_sse4; + } } #endif }