Message ID | 20211104041841.95318-2-jianhua.wu@intel.com |
---|---|
State | New |
Series | [FFmpeg-devel,v2,1/3] avfilter/x86/vf_exposure: add x86 SIMD optimization |
Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
andriy/make_ppc | success | Make finished |
andriy/make_fate_ppc | success | Make fate finished |
On 11/4/2021 1:18 AM, Wu Jianhua wrote:
> Performance data(Less is better):
> exposure_sse:  500491

You reported a better result in the first patch.

> exposure_avx2: 449122

This looks like a really low speed up for a function that processes
twice the amount of floats per loop.

>
> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
> ---
>  libavfilter/x86/vf_exposure.asm    | 15 +++++++++++++++
>  libavfilter/x86/vf_exposure_init.c |  6 ++++++
>  2 files changed, 21 insertions(+)
>
> diff --git a/libavfilter/x86/vf_exposure.asm b/libavfilter/x86/vf_exposure.asm
> index 3351c6fb3b..f271167805 100644
> --- a/libavfilter/x86/vf_exposure.asm
> +++ b/libavfilter/x86/vf_exposure.asm
> @@ -36,11 +36,21 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
>      VBROADCASTSS m1, xmm1
>  %endif
>
> +%if cpuflag(fma3) || cpuflag(fma4)

Remove the fma4 check if you're not using it.

> +    mulps        m0, m0, m1      ; black * scale
> +%endif
> +
>  .loop:
> +%if cpuflag(fma3) || cpuflag(fma4)
> +    mova         m2, m0
> +    vfmsub231ps  m2, m1, [ptrq]
> +    movu         [ptrq], m2

Have you tried to not use FMA for this and just keep the sub + mul even
for AVX2 and see how it performs?

> +%else
>      movu         m2, [ptrq]
>      subps        m2, m2, m0
>      mulps        m2, m2, m1
>      movu         [ptrq], m2
> +%endif
>      add          ptrq, mmsize
>      sub          lengthq, mmsize/4
>
> @@ -52,4 +62,9 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
>  %if ARCH_X86_64
>  INIT_XMM sse
>  EXPOSURE
> +
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +EXPOSURE
> +%endif
>  %endif
> diff --git a/libavfilter/x86/vf_exposure_init.c b/libavfilter/x86/vf_exposure_init.c
> index de1b360f6c..80dae6164e 100644
> --- a/libavfilter/x86/vf_exposure_init.c
> +++ b/libavfilter/x86/vf_exposure_init.c
> @@ -24,6 +24,7 @@
>  #include "libavfilter/exposure.h"
>
>  void ff_exposure_sse(float *ptr, int length, float black, float scale);
> +void ff_exposure_avx2(float *ptr, int length, float black, float scale);
>
>  av_cold void ff_exposure_init_x86(ExposureContext *s)
>  {
> @@ -32,5 +33,10 @@ av_cold void ff_exposure_init_x86(ExposureContext *s)
>  #if ARCH_X86_64
>      if (EXTERNAL_SSE(cpu_flags))
>          s->exposure_func = ff_exposure_sse;
> +
> +#if HAVE_AVX2_EXTERNAL

No need for this preprocessor check.

> +    if (EXTERNAL_AVX2_FAST(cpu_flags))
> +        s->exposure_func = ff_exposure_avx2;
> +#endif
>  #endif
>  }
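For context on the code under review: the loop transforms each float as (value - black) * scale, which the SSE path implements literally (subps followed by mulps), while the FMA path rewrites it as value * scale - black * scale, hoisting black * scale out of the loop so each iteration becomes a single vfmsub231ps. A minimal scalar C sketch of both formulations (the function names and the small test harness are illustrative, not FFmpeg's actual C code):

```c
#include <stdio.h>

/* Scalar form of the per-sample operation the SSE/AVX2 code vectorizes.
 * The prototype mirrors the declaration in vf_exposure_init.c:
 *   void ff_exposure_sse(float *ptr, int length, float black, float scale); */
static void exposure_scalar(float *ptr, int length, float black, float scale)
{
    for (int i = 0; i < length; i++)
        ptr[i] = (ptr[i] - black) * scale;      /* subps + mulps, one lane at a time */
}

/* The FMA path relies on (x - black) * scale == x * scale - black * scale,
 * computing black * scale once before the loop ("mulps m0, m0, m1") so each
 * iteration is a single fused multiply-subtract (vfmsub231ps). */
static void exposure_scalar_fma_form(float *ptr, int length, float black, float scale)
{
    const float black_scale = black * scale;
    for (int i = 0; i < length; i++)
        ptr[i] = ptr[i] * scale - black_scale;  /* fused into one instruction in the asm */
}

int main(void)
{
    float a[4] = { 0.25f, 0.5f, 0.75f, 1.0f };
    float b[4] = { 0.25f, 0.5f, 0.75f, 1.0f };
    exposure_scalar(a, 4, 0.1f, 1.7f);
    exposure_scalar_fma_form(b, 4, 0.1f, 1.7f);
    for (int i = 0; i < 4; i++)
        printf("%f %f\n", a[i], b[i]);
    return 0;
}
```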
James Almer <jamrial@gmail.com>:
On 11/4/2021 1:18 AM, Wu Jianhua wrote:
>> Performance data(Less is better):
>> exposure_sse:  500491
>
> You reported a better result in the first patch.

Since they are tested against different baselines, I think it might be better to only compare these two values.

>> exposure_avx2: 449122
>
> This looks like a really low speed up for a function that processes
> twice the amount of floats per loop.
>
>>
>> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
>> ---
>>  libavfilter/x86/vf_exposure.asm    | 15 +++++++++++++++
>>  libavfilter/x86/vf_exposure_init.c |  6 ++++++
>>  2 files changed, 21 insertions(+)
>>
>> diff --git a/libavfilter/x86/vf_exposure.asm b/libavfilter/x86/vf_exposure.asm
>> index 3351c6fb3b..f271167805 100644
>> --- a/libavfilter/x86/vf_exposure.asm
>> +++ b/libavfilter/x86/vf_exposure.asm
>> @@ -36,11 +36,21 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
>>      VBROADCASTSS m1, xmm1
>>  %endif
>>
>> +%if cpuflag(fma3) || cpuflag(fma4)
>
> Remove the fma4 check if you're not using it.

No problem. The AVX2 flag is only set together with FMA3, so the FMA4 check is indeed redundant.

>> +    mulps        m0, m0, m1      ; black * scale
>> +%endif
>> +
>>  .loop:
>> +%if cpuflag(fma3) || cpuflag(fma4)
>> +    mova         m2, m0
>> +    vfmsub231ps  m2, m1, [ptrq]
>> +    movu         [ptrq], m2
>
> Have you tried to not use FMA for this and just keep the sub + mul even
> for AVX2 and see how it performs?

Yeah, definitely. I ran extensive tests before. The first version kept sub + mul
for AVX2. After that, I kept trying to find a way to speed it up further. Using FMA
here is indeed faster than sub + mul, improving performance by roughly 4%-10%.
Not that much better, but still the best approach I have found so far.

>> +%else
>>      movu         m2, [ptrq]
>>      subps        m2, m2, m0
>>      mulps        m2, m2, m1
>>      movu         [ptrq], m2
>> +%endif
>>      add          ptrq, mmsize
>>      sub          lengthq, mmsize/4
>>
>> @@ -52,4 +62,9 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
>>  %if ARCH_X86_64
>>  INIT_XMM sse
>>  EXPOSURE
>> +
>> +%if HAVE_AVX2_EXTERNAL
>> +INIT_YMM avx2
>> +EXPOSURE
>> +%endif
>>  %endif
>> diff --git a/libavfilter/x86/vf_exposure_init.c b/libavfilter/x86/vf_exposure_init.c
>> index de1b360f6c..80dae6164e 100644
>> --- a/libavfilter/x86/vf_exposure_init.c
>> +++ b/libavfilter/x86/vf_exposure_init.c
>> @@ -24,6 +24,7 @@
>>  #include "libavfilter/exposure.h"
>>
>>  void ff_exposure_sse(float *ptr, int length, float black, float scale);
>> +void ff_exposure_avx2(float *ptr, int length, float black, float scale);
>>
>>  av_cold void ff_exposure_init_x86(ExposureContext *s)
>>  {
>> @@ -32,5 +33,10 @@ av_cold void ff_exposure_init_x86(ExposureContext *s)
>>  #if ARCH_X86_64
>>      if (EXTERNAL_SSE(cpu_flags))
>>          s->exposure_func = ff_exposure_sse;
>> +
>> +#if HAVE_AVX2_EXTERNAL
>
> No need for this preprocessor check.

Got it. I'll update it.

Thanks for your review.

Jianhua
On 11/20/2021 5:42 PM, Wu Jianhua wrote:
> James Almer <jamrial@gmail.com>:
> On 11/4/2021 1:18 AM, Wu Jianhua wrote:
>>> Performance data(Less is better):
>>> exposure_sse:  500491
>
>> You reported a better result in the first patch.
>
> Since they are tested against different baselines, I think it might be better to only compare these two values.
>
>>> exposure_avx2: 449122
>
>> This looks like a really low speed up for a function that processes
>> twice the amount of floats per loop.
>
>>>
>>> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
>>> ---
>>>  libavfilter/x86/vf_exposure.asm    | 15 +++++++++++++++
>>>  libavfilter/x86/vf_exposure_init.c |  6 ++++++
>>>  2 files changed, 21 insertions(+)
>>>
>>> diff --git a/libavfilter/x86/vf_exposure.asm b/libavfilter/x86/vf_exposure.asm
>>> index 3351c6fb3b..f271167805 100644
>>> --- a/libavfilter/x86/vf_exposure.asm
>>> +++ b/libavfilter/x86/vf_exposure.asm
>>> @@ -36,11 +36,21 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
>>>      VBROADCASTSS m1, xmm1
>>>  %endif
>>>
>>> +%if cpuflag(fma3) || cpuflag(fma4)
>
>> Remove the fma4 check if you're not using it.
>
> No problem. The AVX2 flag is only set together with FMA3, so the FMA4 check is indeed redundant.
>
>>> +    mulps        m0, m0, m1      ; black * scale
>>> +%endif
>>> +
>>>  .loop:
>>> +%if cpuflag(fma3) || cpuflag(fma4)
>>> +    mova         m2, m0
>>> +    vfmsub231ps  m2, m1, [ptrq]
>>> +    movu         [ptrq], m2
>
>> Have you tried to not use FMA for this and just keep the sub + mul even
>> for AVX2 and see how it performs?
>
> Yeah, definitely. I ran extensive tests before. The first version kept sub + mul
> for AVX2. After that, I kept trying to find a way to speed it up further. Using FMA
> here is indeed faster than sub + mul, improving performance by roughly 4%-10%.
> Not that much better, but still the best approach I have found so far.

I tried the checkasm test you wrote, and when I made the AVX2 version use
sub + mul instead of vfmsub231ps, I noticed that I could change the
epsilon value to FLT_EPSILON instead of 0.01f and the test would still
succeed, meaning the output of the version using vfmsub231ps deviates a
bit from the normal sub + mul one.
The speed up is pretty small, so it may be worth just using the sub +
mul version instead.

>
>>> +%else
>>>      movu         m2, [ptrq]
>>>      subps        m2, m2, m0
>>>      mulps        m2, m2, m1
>>>      movu         [ptrq], m2
>>> +%endif
>>>      add          ptrq, mmsize
>>>      sub          lengthq, mmsize/4
>>>
>>> @@ -52,4 +62,9 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
>>>  %if ARCH_X86_64
>>>  INIT_XMM sse
>>>  EXPOSURE
>>> +
>>> +%if HAVE_AVX2_EXTERNAL
>>> +INIT_YMM avx2
>>> +EXPOSURE
>>> +%endif
>>>  %endif
>>> diff --git a/libavfilter/x86/vf_exposure_init.c b/libavfilter/x86/vf_exposure_init.c
>>> index de1b360f6c..80dae6164e 100644
>>> --- a/libavfilter/x86/vf_exposure_init.c
>>> +++ b/libavfilter/x86/vf_exposure_init.c
>>> @@ -24,6 +24,7 @@
>>>  #include "libavfilter/exposure.h"
>>>
>>>  void ff_exposure_sse(float *ptr, int length, float black, float scale);
>>> +void ff_exposure_avx2(float *ptr, int length, float black, float scale);
>>>
>>>  av_cold void ff_exposure_init_x86(ExposureContext *s)
>>>  {
>>> @@ -32,5 +33,10 @@ av_cold void ff_exposure_init_x86(ExposureContext *s)
>>>  #if ARCH_X86_64
>>>      if (EXTERNAL_SSE(cpu_flags))
>>>          s->exposure_func = ff_exposure_sse;
>>> +
>>> +#if HAVE_AVX2_EXTERNAL
>
>> No need for this preprocessor check.
>
> Got it. I'll update it.
>
> Thanks for your review.
>
> Jianhua
James Almer <jamrial@gmail.com>:
> On 11/20/2021 5:42 PM, Wu Jianhua wrote:
>> James Almer <jamrial@gmail.com>:
>> On 11/4/2021 1:18 AM, Wu Jianhua wrote:
>>>> Performance data(Less is better):
>>>> exposure_sse:  500491
>>
>>> You reported a better result in the first patch.
>>
>> Since they are tested against different baselines, I think it might be better to only compare these two values.
>>
>>>> exposure_avx2: 449122
>>
>>> This looks like a really low speed up for a function that processes
>>> twice the amount of floats per loop.
>>
>>>>
>>>> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
>>>> ---
>>>>  libavfilter/x86/vf_exposure.asm    | 15 +++++++++++++++
>>>>  libavfilter/x86/vf_exposure_init.c |  6 ++++++
>>>>  2 files changed, 21 insertions(+)
>>>>
>>>> diff --git a/libavfilter/x86/vf_exposure.asm b/libavfilter/x86/vf_exposure.asm
>>>> index 3351c6fb3b..f271167805 100644
>>>> --- a/libavfilter/x86/vf_exposure.asm
>>>> +++ b/libavfilter/x86/vf_exposure.asm
>>>> @@ -36,11 +36,21 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
>>>>      VBROADCASTSS m1, xmm1
>>>>  %endif
>>>>
>>>> +%if cpuflag(fma3) || cpuflag(fma4)
>>
>>> Remove the fma4 check if you're not using it.
>>
>> No problem. The AVX2 flag is only set together with FMA3, so the FMA4 check is indeed redundant.
>>
>>>> +    mulps        m0, m0, m1      ; black * scale
>>>> +%endif
>>>> +
>>>>  .loop:
>>>> +%if cpuflag(fma3) || cpuflag(fma4)
>>>> +    mova         m2, m0
>>>> +    vfmsub231ps  m2, m1, [ptrq]
>>>> +    movu         [ptrq], m2
>>
>>> Have you tried to not use FMA for this and just keep the sub + mul even
>>> for AVX2 and see how it performs?
>>
>> Yeah, definitely. I ran extensive tests before. The first version kept sub + mul
>> for AVX2. After that, I kept trying to find a way to speed it up further. Using FMA
>> here is indeed faster than sub + mul, improving performance by roughly 4%-10%.
>> Not that much better, but still the best approach I have found so far.
>
> I tried the checkasm test you wrote, and when I made the AVX2 version use
> sub + mul instead of vfmsub231ps, I noticed that I could change the
> epsilon value to FLT_EPSILON instead of 0.01f and the test would still
> succeed, meaning the output of the version using vfmsub231ps deviates a
> bit from the normal sub + mul one.
> The speed up is pretty small, so it may be worth just using the sub +
> mul version instead.

Yeah. It's small, but the function isn't called just once, and many a little
makes a mickle, doesn't it? I'd prefer to keep the FMA version.
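The precision point above comes down to rounding order: the sub + mul path rounds (x - black) and then the product, while the FMA path rounds black * scale once up front and then evaluates x * scale - black_scale with a single fused rounding, so the two formulations can differ by a few ULPs. A small standalone C sketch (not the checkasm test from this series; the constants and harness are illustrative) that measures the difference between the two forms:

```c
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    /* Illustrative parameters, not taken from the filter. */
    const float black = 0.1f, scale = 1.7f;
    const float black_scale = black * scale;        /* hoisted and rounded once, as in the patch */
    float max_abs_diff = 0.0f;

    for (int i = 0; i < 1000000; i++) {
        float x     = (float)rand() / (float)RAND_MAX;
        float ref   = (x - black) * scale;           /* subps + mulps formulation */
        float fused = fmaf(x, scale, -black_scale);  /* vfmsub231ps formulation   */
        float d     = fabsf(fused - ref);
        if (d > max_abs_diff)
            max_abs_diff = d;
    }
    /* The maximum difference is generally nonzero (a few ULPs of the operands),
     * which is consistent with the strict FLT_EPSILON comparison failing for
     * the FMA path while a looser tolerance passes. */
    printf("max abs diff: %g (FLT_EPSILON = %g)\n", max_abs_diff, FLT_EPSILON);
    return 0;
}
```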
diff --git a/libavfilter/x86/vf_exposure.asm b/libavfilter/x86/vf_exposure.asm
index 3351c6fb3b..f271167805 100644
--- a/libavfilter/x86/vf_exposure.asm
+++ b/libavfilter/x86/vf_exposure.asm
@@ -36,11 +36,21 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
     VBROADCASTSS m1, xmm1
 %endif
 
+%if cpuflag(fma3) || cpuflag(fma4)
+    mulps        m0, m0, m1      ; black * scale
+%endif
+
 .loop:
+%if cpuflag(fma3) || cpuflag(fma4)
+    mova         m2, m0
+    vfmsub231ps  m2, m1, [ptrq]
+    movu         [ptrq], m2
+%else
     movu         m2, [ptrq]
     subps        m2, m2, m0
     mulps        m2, m2, m1
     movu         [ptrq], m2
+%endif
     add          ptrq, mmsize
     sub          lengthq, mmsize/4
 
@@ -52,4 +62,9 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
 %if ARCH_X86_64
 INIT_XMM sse
 EXPOSURE
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+EXPOSURE
+%endif
 %endif
diff --git a/libavfilter/x86/vf_exposure_init.c b/libavfilter/x86/vf_exposure_init.c
index de1b360f6c..80dae6164e 100644
--- a/libavfilter/x86/vf_exposure_init.c
+++ b/libavfilter/x86/vf_exposure_init.c
@@ -24,6 +24,7 @@
 #include "libavfilter/exposure.h"
 
 void ff_exposure_sse(float *ptr, int length, float black, float scale);
+void ff_exposure_avx2(float *ptr, int length, float black, float scale);
 
 av_cold void ff_exposure_init_x86(ExposureContext *s)
 {
@@ -32,5 +33,10 @@ av_cold void ff_exposure_init_x86(ExposureContext *s)
 #if ARCH_X86_64
     if (EXTERNAL_SSE(cpu_flags))
         s->exposure_func = ff_exposure_sse;
+
+#if HAVE_AVX2_EXTERNAL
+    if (EXTERNAL_AVX2_FAST(cpu_flags))
+        s->exposure_func = ff_exposure_avx2;
+#endif
 #endif
 }
Performance data(Less is better):
exposure_sse:  500491
exposure_avx2: 449122

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libavfilter/x86/vf_exposure.asm    | 15 +++++++++++++++
 libavfilter/x86/vf_exposure_init.c |  6 ++++++
 2 files changed, 21 insertions(+)