Message ID | 20191222083703.3080-2-xujunzz@sjtu.edu.cn |
---|---|
State | New |
Headers | show |
Xu, On Sun, 22. Dec 16:37, xujunzz@sjtu.edu.cn wrote: > From: Xu Jun <xujunzz@sjtu.edu.cn> > > Read 16 elements from memory, shuffle and parallelly compute 4 rows at a time, shuffle and parallelly write 16 results to memory. > Performance improves about 15% compared to v1. > > Tested using this command: > ./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:row:row:row:row" -an -vframes 5000 -f null /dev/null -benchmark > > after patch: > frame= 4317 fps=622 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=24.9x > video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown > bench: utime=20.539s stime=1.834s rtime=6.943s > > before patch(c version): > frame= 4317 fps=306 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=12.2x > video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown > bench: utime=60.591s stime=1.787s rtime=14.100s > > Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn> > --- > libavfilter/x86/vf_convolution.asm | 131 ++++++++++++++++++++++++++ > libavfilter/x86/vf_convolution_init.c | 9 ++ > 2 files changed, 140 insertions(+) > mode change 100644 => 100755 libavfilter/x86/vf_convolution.asm > > diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm > old mode 100644 > new mode 100755 > index 754d4d1064..2a09374b00 > --- a/libavfilter/x86/vf_convolution.asm > +++ b/libavfilter/x86/vf_convolution.asm > @@ -154,3 +154,134 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c > INIT_XMM sse4 > FILTER_3X3 > %endif > + Patch 2-3 are failing to build: https://unofficial.patchwork-ffmpeg.org/project/FFmpeg/list/?series=26
Hi, Andriy ----- Original Message ----- > From: "Andriy Gelman" <andriy.gelman@gmail.com> > To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org> > Cc: xujunzz@sjtu.edu.cn > Sent: Monday, December 23, 2019 12:50:48 AM > Subject: Re: [FFmpeg-devel] [PATCH v2 2/3] avfilter/vf_convolution: Add x86 SIMD optimizations for filter_row() > Xu, > > On Sun, 22. Dec 16:37, xujunzz@sjtu.edu.cn wrote: >> From: Xu Jun <xujunzz@sjtu.edu.cn> >> >> Read 16 elements from memory, shuffle and parallelly compute 4 rows at a time, >> shuffle and parallelly write 16 results to memory. >> Performance improves about 15% compared to v1. >> >> Tested using this command: >> ./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 >> 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 >> 9:1/45:1/45:1/45:1/45:1:2:3:4:row:row:row:row" -an -vframes 5000 -f null >> /dev/null -benchmark >> >> after patch: >> frame= 4317 fps=622 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=24.9x >> video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing >> overhead: unknown >> bench: utime=20.539s stime=1.834s rtime=6.943s >> >> before patch(c version): >> frame= 4317 fps=306 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=12.2x >> video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing >> overhead: unknown >> bench: utime=60.591s stime=1.787s rtime=14.100s >> >> Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn> >> --- >> libavfilter/x86/vf_convolution.asm | 131 ++++++++++++++++++++++++++ >> libavfilter/x86/vf_convolution_init.c | 9 ++ >> 2 files changed, 140 insertions(+) >> mode change 100644 => 100755 libavfilter/x86/vf_convolution.asm >> >> diff --git a/libavfilter/x86/vf_convolution.asm >> b/libavfilter/x86/vf_convolution.asm >> old mode 100644 >> new mode 100755 >> index 754d4d1064..2a09374b00 >> --- a/libavfilter/x86/vf_convolution.asm >> +++ b/libavfilter/x86/vf_convolution.asm >> @@ -154,3 +154,134 @@ 
cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, >> matrix, ptr, c0, c1, c2, c >> INIT_XMM sse4 >> FILTER_3X3 >> %endif >> + > > Patch 2-3 are failing to build: > https://unofficial.patchwork-ffmpeg.org/project/FFmpeg/list/?series=26 > > -- > Andriy I'm sorry, I did not build each patch independently. There seem to be some bugs in the dependencies between the patches. I'll fix them in v3. Xu Jun
diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm old mode 100644 new mode 100755 index 754d4d1064..2a09374b00 --- a/libavfilter/x86/vf_convolution.asm +++ b/libavfilter/x86/vf_convolution.asm @@ -154,3 +154,134 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c INIT_XMM sse4 FILTER_3X3 %endif + +; void filter_row_sse4(uint8_t *dst, int width, +; float rdiv, float bias, const int *const matrix, +; const uint8_t *c[], int peak, int radius, +; int dstride, int stride) + +%macro COMPUTE_4ROW 1 +pshufb m7, m6, m4 ; get 4 uint8s from the 16 uint8s +pmulld m7, m5 +paddd m1%1, m7 +%endmacro + +%macro CVT_PACK_ROW 1 +cvtdq2ps m1%1, m1%1 +mulps m1%1, m0 ; sum *= rdiv +addps m1%1, m1 ; sum += bias +addps m1%1, m3 ; sum += 0.5 +cvttps2dq m1%1, m1%1 +packssdw m1%1, m1%1 +packuswb m1%1, m1%1 +%endmacro + +%if ARCH_X86_64 +INIT_XMM sse4 +%if UNIX64 +cglobal filter_row, 6, 10, 14, dst, width, matrix, ptr, mult, rad, r, x, i, ci +%else +cglobal filter_row, 4, 10, 14, dst, width, rdiv, bias, matrix, ptr, mult, rad, r, x, i, ci +%endif + +%if WIN64 + SWAP m0, m2 + SWAP m1, m3 + mov r2q, matrixmp + mov r3q, ptrmp + mov r5q, radmp + DEFINE_ARGS dst, width, matrix, ptr, mult, rad, r, x, i, ci +%endif + +movsxdifnidn widthq, widthd +movsxdifnidn radq, radd +lea radq, [radq * 2 + 1] +VBROADCASTSS m0, m0 +VBROADCASTSS m1, m1 +pxor m2, m2 ; zero +movss m3, [half] +VBROADCASTSS m3, m3 ; 0.5 +movdqu m8, [shuf_init] ; shuffle initialization +movdqu m9, [shuf_step] ; shuffle step + +xor xq, xq +cmp widthq, mmsize +jl .less16 + +mov rq, widthq +and rq, mmsize-1 +sub widthq, rq + +.equal16: + pxor m10, m10 + pxor m11, m11 + pxor m12, m12 + pxor m13, m13 + ; m10-13 hold sums + + lea iq, [radq - 1] + .loopi: + movd m5, [matrixq + 4*iq] ; matrix[i] + VBROADCASTSS m5, m5 + mov ciq, [ptrq + iq * gprsize] + movdqu m6, [ciq + xq] ; c[i][y*stride] 16 uint8s + + ;m4 controls shuffle + movdqa m4, m8 + COMPUTE_4ROW 0 ; process 0-3 rows, 
sum in m10 + paddd m4, m9 + COMPUTE_4ROW 1 ; process 4-7 rows, sum in m11 + paddd m4, m9 + COMPUTE_4ROW 2 ; process 8-11 rows, sum in m12 + paddd m4, m9 + COMPUTE_4ROW 3 ; process 12-15 rows, sum in m13 + + sub iq, 1 + jns .loopi + + CVT_PACK_ROW 0 ; process 0-3 rows, result in m10's low 32bit + CVT_PACK_ROW 1 ; process 4-7 rows, result in m11's low 32bit + CVT_PACK_ROW 2 ; process 8-11 rows, result in m12's low 32bit + CVT_PACK_ROW 3 ; process 12-15 rows, result in m13's low 32bit + punpckldq m10, m11 + punpckldq m12, m13 + punpcklqdq m10, m12 ; pack 16 results in m10 + movdqu [dstq+xq], m10 + + add xq, mmsize + cmp xq, widthq + jl .equal16 + + add widthq, rq + cmp xq, widthq + jge .end + +.less16: + xor rd, rd + lea iq, [radq - 1] + .loopr_i: + mov ciq, [ptrq + iq * gprsize] + movzx multd, byte [ciq + xq] + imul multd, [matrixq + 4*iq] + add rd, multd + + sub iq, 1 + jns .loopr_i + + pxor m7, m7 + cvtsi2ss m7, rd + mulss m7, m0 ; sum *= rdiv + addss m7, m1 ; sum += bias + addss m7, m3 ; sum += 0.5 + cvttps2dq m7, m7 + packssdw m7, m7 + packuswb m7, m7 + movd rd, m7 + mov [dstq + xq], rb + + add xq, 1 + cmp xq, widthq + jl .less16 +.end: + RET +%endif diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c index 51432406ed..5eb3b3bee1 100644 --- a/libavfilter/x86/vf_convolution_init.c +++ b/libavfilter/x86/vf_convolution_init.c @@ -29,6 +29,11 @@ void ff_filter_3x3_sse4(uint8_t *dst, int width, const uint8_t *c[], int peak, int radius, int dstride, int stride); +void ff_filter_row_sse4(uint8_t *dst, int width, + float rdiv, float bias, const int *const matrix, + const uint8_t *c[], int peak, int radius, + int dstride, int stride); + av_cold void ff_convolution_init_x86(ConvolutionContext *s) { #if ARCH_X86_64 @@ -41,6 +46,10 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s) s->filter[i] = ff_filter_3x3_sse4; } } + if (s->mode[i] == MATRIX_ROW) { + if (EXTERNAL_SSE4(cpu_flags)) + s->filter[i] = ff_filter_row_sse4; + } 
} #endif }