| Message ID | 20210213111038.14840-2-onemda@gmail.com |
| --- | --- |
| State | Superseded |
| Headers | show |
| Series | [FFmpeg-devel,1/2] avfilter/vf_gblur: factor out postscale function (expand) |
| Context | Check | Description |
| --- | --- | --- |
| andriy/x86_make | success | Make finished |
| andriy/x86_make_fate | success | Make fate finished |
| andriy/PPC64_make | success | Make finished |
| andriy/PPC64_make_fate | success | Make fate finished |
On 2/13/2021 8:10 AM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>  libavfilter/x86/vf_gblur.asm    | 46 +++++++++++++++++++++++++++++++++
>  libavfilter/x86/vf_gblur_init.c | 11 ++++++--
>  2 files changed, 55 insertions(+), 2 deletions(-)
>
> diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
> index a25b1659f5..8fea6d2a61 100644
> --- a/libavfilter/x86/vf_gblur.asm
> +++ b/libavfilter/x86/vf_gblur.asm
> @@ -183,3 +183,49 @@ HORIZ_SLICE
>  INIT_XMM avx2
>  HORIZ_SLICE
>  %endif
> +
> +%macro POSTSCALE_SLICE 0
> +%if UNIX64
> +cglobal postscale_slice, 2, 6, 4, ptr, length, postscale, min, max, x

On UNIX64 the three float arguments are passed in xmm registers, so fewer
GPRs are needed here (NOTE: this suggestion was fused onto the quoted line
in the archive — verify against the original email):

    cglobal postscale_slice, 2, 3, 4, ptr, length, x

> +%else
> +cglobal postscale_slice, 5, 6, 4, ptr, length, postscale, min, max, x
> +%endif
> + shl lengthd, 2
> +%if WIN64
> + SWAP 0, 2
> + SWAP 1, 3
> + SWAP 2, 4
> +%endif
> + shufps xm0, xm0, 0
> + shufps xm1, xm1, 0
> + shufps xm2, xm2, 0
> +%if cpuflag(avx2)
> + vinsertf128 m0, m0, xm0, 1
> + vinsertf128 m1, m1, xm1, 1
> + vinsertf128 m2, m2, xm2, 1

You can use vbroadcastss ymm, xmm with AVX2, which combines both the
shufps and vinsertf128 into one instruction.

As is, this function is base AVX. So if you can't measure any performance
gain with vbroadcastss, then just mark the function as AVX.

> +%endif
> + xor xq, xq
> +
> + .loop:
> + movu m3, [ptrq + xq]
> + mulps m3, m0

AVX can use unaligned memory operands, so just do

    mulps m3, m0, [ptrq + xq]

But keep the explicit movu + mulps for the SSE version, otherwise x86inc
will expand it into a mova.

> + maxps m3, m1
> + minps m3, m2
> + movu [ptrq+xq], m3
> +
> + add xq, mmsize
> + cmp xd, lengthd

Can't you use the neg trick? It should let you reuse length instead of x.

> + jl .loop
> +
> + RET
> +%endmacro
> +
> +%if ARCH_X86_64

Nothing in this function seems to require x86_64.

> +INIT_XMM sse4

No instruction is SSE4 here. It's all base SSE.

> +POSTSCALE_SLICE
> +
> +%if HAVE_AVX_EXTERNAL

Wrong check.
> +INIT_YMM avx2
> +POSTSCALE_SLICE
> +%endif
> +%endif
> diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
> index e63e59fe23..7a9b40b0ad 100644
> --- a/libavfilter/x86/vf_gblur_init.c
> +++ b/libavfilter/x86/vf_gblur_init.c
> @@ -27,14 +27,21 @@
>  void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
>  void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
>
> +void ff_postscale_slice_sse4(float *ptr, int length, float postscale, float min, float max);
> +void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
> +
>  av_cold void ff_gblur_init_x86(GBlurContext *s)
>  {
>  #if ARCH_X86_64
>      int cpu_flags = av_get_cpu_flags();
>
> -    if (EXTERNAL_SSE4(cpu_flags))
> +    if (EXTERNAL_SSE4(cpu_flags)) {
>          s->horiz_slice = ff_horiz_slice_sse4;
> -    if (EXTERNAL_AVX2(cpu_flags))
> +        s->postscale_slice = ff_postscale_slice_sse4;
> +    }
> +    if (EXTERNAL_AVX2(cpu_flags)) {
>          s->horiz_slice = ff_horiz_slice_avx2;
> +        s->postscale_slice = ff_postscale_slice_avx2;

Needs to be EXTERNAL_AVX2_FAST. You're using ymm regs, unlike in
ff_horiz_slice_avx2.

> +    }
>  #endif
>  }
>
diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
index a25b1659f5..8fea6d2a61 100644
--- a/libavfilter/x86/vf_gblur.asm
+++ b/libavfilter/x86/vf_gblur.asm
@@ -183,3 +183,49 @@ HORIZ_SLICE
 INIT_XMM avx2
 HORIZ_SLICE
 %endif
+
+%macro POSTSCALE_SLICE 0
+%if UNIX64
+cglobal postscale_slice, 2, 6, 4, ptr, length, postscale, min, max, x
+%else
+cglobal postscale_slice, 5, 6, 4, ptr, length, postscale, min, max, x
+%endif
+    shl lengthd, 2
+%if WIN64
+    SWAP 0, 2
+    SWAP 1, 3
+    SWAP 2, 4
+%endif
+    shufps xm0, xm0, 0
+    shufps xm1, xm1, 0
+    shufps xm2, xm2, 0
+%if cpuflag(avx2)
+    vinsertf128 m0, m0, xm0, 1
+    vinsertf128 m1, m1, xm1, 1
+    vinsertf128 m2, m2, xm2, 1
+%endif
+    xor xq, xq
+
+    .loop:
+        movu m3, [ptrq + xq]
+        mulps m3, m0
+        maxps m3, m1
+        minps m3, m2
+        movu [ptrq+xq], m3
+
+        add xq, mmsize
+        cmp xd, lengthd
+        jl .loop
+
+    RET
+%endmacro
+
+%if ARCH_X86_64
+INIT_XMM sse4
+POSTSCALE_SLICE
+
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx2
+POSTSCALE_SLICE
+%endif
+%endif
diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
index e63e59fe23..7a9b40b0ad 100644
--- a/libavfilter/x86/vf_gblur_init.c
+++ b/libavfilter/x86/vf_gblur_init.c
@@ -27,14 +27,21 @@
 void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
 void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
 
+void ff_postscale_slice_sse4(float *ptr, int length, float postscale, float min, float max);
+void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
+
 av_cold void ff_gblur_init_x86(GBlurContext *s)
 {
 #if ARCH_X86_64
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_SSE4(cpu_flags))
+    if (EXTERNAL_SSE4(cpu_flags)) {
         s->horiz_slice = ff_horiz_slice_sse4;
-    if (EXTERNAL_AVX2(cpu_flags))
+        s->postscale_slice = ff_postscale_slice_sse4;
+    }
+    if (EXTERNAL_AVX2(cpu_flags)) {
         s->horiz_slice = ff_horiz_slice_avx2;
+        s->postscale_slice = ff_postscale_slice_avx2;
+    }
 #endif
 }
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavfilter/x86/vf_gblur.asm    | 46 +++++++++++++++++++++++++++++++++
 libavfilter/x86/vf_gblur_init.c | 11 ++++++--
 2 files changed, 55 insertions(+), 2 deletions(-)