diff mbox series

[FFmpeg-devel,v2,2/3] avfilter/x86/vf_exposure: add ff_exposure_avx2

Message ID 20211104041841.95318-2-jianhua.wu@intel.com
State New
Headers show
Series [FFmpeg-devel,v2,1/3] avfilter/x86/vf_exposure: add x86 SIMD optimization
Related show

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Wu Jianhua Nov. 4, 2021, 4:18 a.m. UTC
Performance data(Less is better):
    exposure_sse:   500491
    exposure_avx2:  449122

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libavfilter/x86/vf_exposure.asm    | 15 +++++++++++++++
 libavfilter/x86/vf_exposure_init.c |  6 ++++++
 2 files changed, 21 insertions(+)

Comments

James Almer Nov. 20, 2021, 4:53 p.m. UTC | #1
On 11/4/2021 1:18 AM, Wu Jianhua wrote:
> Performance data(Less is better):
>      exposure_sse:   500491

You reported a better result in the first patch.

>      exposure_avx2:  449122

This looks like a really low speed up for a function that processes 
twice the amount of floats per loop.

> 
> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
> ---
>   libavfilter/x86/vf_exposure.asm    | 15 +++++++++++++++
>   libavfilter/x86/vf_exposure_init.c |  6 ++++++
>   2 files changed, 21 insertions(+)
> 
> diff --git a/libavfilter/x86/vf_exposure.asm b/libavfilter/x86/vf_exposure.asm
> index 3351c6fb3b..f271167805 100644
> --- a/libavfilter/x86/vf_exposure.asm
> +++ b/libavfilter/x86/vf_exposure.asm
> @@ -36,11 +36,21 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
>       VBROADCASTSS m1, xmm1
>   %endif
>   
> +%if cpuflag(fma3) || cpuflag(fma4)

Remove the fma4 check if you're not using it.

> +    mulps       m0, m0, m1 ; black * scale
> +%endif
> +
>   .loop:
> +%if cpuflag(fma3) || cpuflag(fma4)
> +    mova        m2, m0
> +    vfmsub231ps m2, m1, [ptrq]
> +    movu    [ptrq], m2

Have you tried to not use FMA for this and just keep the sub + mul even 
for AVX2 and see how it performs?

> +%else
>       movu        m2, [ptrq]
>       subps       m2, m2, m0
>       mulps       m2, m2, m1
>       movu    [ptrq], m2
> +%endif
>       add       ptrq, mmsize
>       sub    lengthq, mmsize/4
>   
> @@ -52,4 +62,9 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
>   %if ARCH_X86_64
>   INIT_XMM sse
>   EXPOSURE
> +
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +EXPOSURE
> +%endif
>   %endif
> diff --git a/libavfilter/x86/vf_exposure_init.c b/libavfilter/x86/vf_exposure_init.c
> index de1b360f6c..80dae6164e 100644
> --- a/libavfilter/x86/vf_exposure_init.c
> +++ b/libavfilter/x86/vf_exposure_init.c
> @@ -24,6 +24,7 @@
>   #include "libavfilter/exposure.h"
>   
>   void ff_exposure_sse(float *ptr, int length, float black, float scale);
> +void ff_exposure_avx2(float *ptr, int length, float black, float scale);
>   
>   av_cold void ff_exposure_init_x86(ExposureContext *s)
>   {
> @@ -32,5 +33,10 @@ av_cold void ff_exposure_init_x86(ExposureContext *s)
>   #if ARCH_X86_64
>       if (EXTERNAL_SSE(cpu_flags))
>           s->exposure_func = ff_exposure_sse;
> +
> +#if HAVE_AVX2_EXTERNAL

No need for this preprocessor check.

> +    if (EXTERNAL_AVX2_FAST(cpu_flags))
> +        s->exposure_func = ff_exposure_avx2;
> +#endif
>   #endif
>   }
>
Wu Jianhua Nov. 20, 2021, 8:42 p.m. UTC | #2
James Almer<mailto:jamrial@gmail.com>:
On 11/4/2021 1:18 AM, Wu Jianhua wrote:
>> Performance data(Less is better):
>>      exposure_sse:   500491

>You reported a better result in the first patch.

For they are tested on different baseline, I think it might be better to only compare these two values.

>>      exposure_avx2:  449122

> This looks like a really low speed up for a function that processes
>  twice the amount of floats per loop.

> >
> > Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
> > ---
> >   libavfilter/x86/vf_exposure.asm    | 15 +++++++++++++++
> >   libavfilter/x86/vf_exposure_init.c |  6 ++++++
> >   2 files changed, 21 insertions(+)
> >
> > diff --git a/libavfilter/x86/vf_exposure.asm b/libavfilter/x86/vf_exposure.asm
> > index 3351c6fb3b..f271167805 100644
> > --- a/libavfilter/x86/vf_exposure.asm
> > +++ b/libavfilter/x86/vf_exposure.asm
> > @@ -36,11 +36,21 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
> >       VBROADCASTSS m1, xmm1
> >   %endif
> >
> > +%if cpuflag(fma3) || cpuflag(fma4)

> Remove the fma4 check if you're not using it.

No problem. Avx2 flag is only initialized with fma3, so the fma4 is redundant indeed.

> > +    mulps       m0, m0, m1 ; black * scale
> > +%endif
> > +
> >   .loop:
> > +%if cpuflag(fma3) || cpuflag(fma4)
> > +    mova        m2, m0
> > +    vfmsub231ps m2, m1, [ptrq]
> > +    movu    [ptrq], m2

> Have you tried to not use FMA for this and just kept the sub + mul even
> for AVX2 and see how it performs?

Yeah. Definitely. I have had sufficient tests before. The first version is kept sub + mul
for AVX2. After that, I keep trying to find a way out to speed up it further. Using FMA
here would be faster than sub + mul indeed, precisely, improving by 4%-10% approximately.
Not that much better, but still an optimal way I found at the present.

>>  +%else
>>       movu        m2, [ptrq]
>>       subps       m2, m2, m0
>>       mulps       m2, m2, m1
>>       movu    [ptrq], m2
>>  +%endif
>>       add       ptrq, mmsize
>>       sub    lengthq, mmsize/4
>>
>>  @@ -52,4 +62,9 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
>>   %if ARCH_X86_64
>>   INIT_XMM sse
>>   EXPOSURE
>>  +
>>  +%if HAVE_AVX2_EXTERNAL
>>  +INIT_YMM avx2
>>  +EXPOSURE
>>  +%endif
>>   %endif
>>  diff --git a/libavfilter/x86/vf_exposure_init.c b/libavfilter/x86/vf_exposure_init.c
>>  index de1b360f6c..80dae6164e 100644
>>  --- a/libavfilter/x86/vf_exposure_init.c
>>  +++ b/libavfilter/x86/vf_exposure_init.c
>>  @@ -24,6 +24,7 @@
>>   #include "libavfilter/exposure.h"
>>
>>   void ff_exposure_sse(float *ptr, int length, float black, float scale);
>>  +void ff_exposure_avx2(float *ptr, int length, float black, float scale);
>>
>>   av_cold void ff_exposure_init_x86(ExposureContext *s)
> >  {
>>  @@ -32,5 +33,10 @@ av_cold void ff_exposure_init_x86(ExposureContext *s)
>>   #if ARCH_X86_64
>>       if (EXTERNAL_SSE(cpu_flags))
>>           s->exposure_func = ff_exposure_sse;
>>  +
>>  +#if HAVE_AVX2_EXTERNAL

> No need for this preprocessor check.

Got it. I’ll update it.

Thanks for your review.
Jianhua
James Almer Nov. 20, 2021, 9:28 p.m. UTC | #3
On 11/20/2021 5:42 PM, Wu Jianhua wrote:
> James Almer<mailto:jamrial@gmail.com>:
> On 11/4/2021 1:18 AM, Wu Jianhua wrote:
>>> Performance data(Less is better):
>>>       exposure_sse:   500491
> 
>> You reported a better result in the first patch.
> 
> For they are tested on different baseline, I think it might be better to only compare these two values.
> 
>>>       exposure_avx2:  449122
> 
>> This looks like a really low speed up for a function that processes
>>   twice the amount of floats per loop.
> 
>>>
>>> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
>>> ---
>>>    libavfilter/x86/vf_exposure.asm    | 15 +++++++++++++++
>>>    libavfilter/x86/vf_exposure_init.c |  6 ++++++
>>>    2 files changed, 21 insertions(+)
>>>
>>> diff --git a/libavfilter/x86/vf_exposure.asm b/libavfilter/x86/vf_exposure.asm
>>> index 3351c6fb3b..f271167805 100644
>>> --- a/libavfilter/x86/vf_exposure.asm
>>> +++ b/libavfilter/x86/vf_exposure.asm
>>> @@ -36,11 +36,21 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
>>>        VBROADCASTSS m1, xmm1
>>>    %endif
>>>
>>> +%if cpuflag(fma3) || cpuflag(fma4)
> 
>> Remove the fma4 check if you're not using it.
> 
> No problem. Avx2 flag is only initialized with fma3, so the fma4 is redundant indeed.
> 
>>> +    mulps       m0, m0, m1 ; black * scale
>>> +%endif
>>> +
>>>    .loop:
>>> +%if cpuflag(fma3) || cpuflag(fma4)
>>> +    mova        m2, m0
>>> +    vfmsub231ps m2, m1, [ptrq]
>>> +    movu    [ptrq], m2
> 
>> Have you tried to not use FMA for this and just kept the sub + mul even
>> for AVX2 and see how it performs?
> 
> Yeah. Definitely. I have had sufficient tests before. The first version is kept sub + mul
> for AVX2. After that, I keep trying to find a way out to speed up it further. Using FMA
> here would be faster than sub + mul indeed, precisely, improving by 4%-10% approximately.
> Not that much better, but still an optimal way I found at the present.

I tried the checkasm test you wrote and when i made the AVX2 version use 
sub + mul instead of vfmsub231ps i noticed that i could change the 
epsilon value to FLT_EPSILON instead of 0.01f and the test would still 
succeed, meaning the output of the version using vfmsub231ps deviates a 
bit from the normal sub + mul one.

The speed up is pretty small, so it may be worth just using the sub + 
mul version instead.

> 
>>>   +%else
>>>        movu        m2, [ptrq]
>>>        subps       m2, m2, m0
>>>        mulps       m2, m2, m1
>>>        movu    [ptrq], m2
>>>   +%endif
>>>        add       ptrq, mmsize
>>>        sub    lengthq, mmsize/4
>>>
>>>   @@ -52,4 +62,9 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
>>>    %if ARCH_X86_64
>>>    INIT_XMM sse
>>>    EXPOSURE
>>>   +
>>>   +%if HAVE_AVX2_EXTERNAL
>>>   +INIT_YMM avx2
>>>   +EXPOSURE
>>>   +%endif
>>>    %endif
>>>   diff --git a/libavfilter/x86/vf_exposure_init.c b/libavfilter/x86/vf_exposure_init.c
>>>   index de1b360f6c..80dae6164e 100644
>>>   --- a/libavfilter/x86/vf_exposure_init.c
>>>   +++ b/libavfilter/x86/vf_exposure_init.c
>>>   @@ -24,6 +24,7 @@
>>>    #include "libavfilter/exposure.h"
>>>
>>>    void ff_exposure_sse(float *ptr, int length, float black, float scale);
>>>   +void ff_exposure_avx2(float *ptr, int length, float black, float scale);
>>>
>>>    av_cold void ff_exposure_init_x86(ExposureContext *s)
>>>   {
>>>   @@ -32,5 +33,10 @@ av_cold void ff_exposure_init_x86(ExposureContext *s)
>>>    #if ARCH_X86_64
>>>        if (EXTERNAL_SSE(cpu_flags))
>>>            s->exposure_func = ff_exposure_sse;
>>>   +
>>>   +#if HAVE_AVX2_EXTERNAL
> 
>> No need for this preprocessor check.
> 
> Got it. I’ll update it.
> 
> Thanks for your review.
> Jianhua
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Wu Jianhua Nov. 21, 2021, 6:09 a.m. UTC | #4
James Almer<mailto:jamrial@gmail.com>:
>On 11/20/2021 5:42 PM, Wu Jianhua wrote:
>> James Almer<mailto:jamrial@gmail.com>:
>> On 11/4/2021 1:18 AM, Wu Jianhua wrote:
>>>> Performance data(Less is better):
>>>>       exposure_sse:   500491
>>
>>> You reported a better result in the first patch.
>>
>> For they are tested on different baseline, I think it might be better to only compare these two values.
>>
>>>>       exposure_avx2:  449122
>>
>>> This looks like a really low speed up for a function that processes
>>>   twice the amount of floats per loop.
>>
>>>>
>>>> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
>>>> ---
>>>>    libavfilter/x86/vf_exposure.asm    | 15 +++++++++++++++
>>>>    libavfilter/x86/vf_exposure_init.c |  6 ++++++
>>>>    2 files changed, 21 insertions(+)
>>>>
>>>> diff --git a/libavfilter/x86/vf_exposure.asm b/libavfilter/x86/vf_exposure.asm
>>>> index 3351c6fb3b..f271167805 100644
>>>> --- a/libavfilter/x86/vf_exposure.asm
>>>> +++ b/libavfilter/x86/vf_exposure.asm
>>>> @@ -36,11 +36,21 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
>>>>        VBROADCASTSS m1, xmm1
>>>>    %endif
>>>>
>>>> +%if cpuflag(fma3) || cpuflag(fma4)
>>
>>> Remove the fma4 check if you're not using it.
>>
>> No problem. Avx2 flag is only initialized with fma3, so the fma4 is redundant indeed.
>>
>>>> +    mulps       m0, m0, m1 ; black * scale
>>>> +%endif
>>>> +
>>>>    .loop:
>>>> +%if cpuflag(fma3) || cpuflag(fma4)
>>>> +    mova        m2, m0
>>>> +    vfmsub231ps m2, m1, [ptrq]
>>>> +    movu    [ptrq], m2
> >
>>> Have you tried to not use FMA for this and just kept the sub + mul even
>>> for AVX2 and see how it performs?
>>
>> Yeah. Definitely. I have had sufficient tests before. The first version is kept sub + mul
>> for AVX2. After that, I keep trying to find a way out to speed up it further. Using FMA
>> here would be faster than sub + mul indeed, precisely, improving by 4%-10% approximately.
>> Not that much better, but still an optimal way I found at the present.

> I tried the checkasm test you wrote and when i made the AVX2 version use
> sub + mul instead of vfmsub231ps i noticed that i could change the
> epsilon value to FLT_EPSILON instead of 0.01f and the test would still
> succeed, meaning the output of the version using vfmsub231ps deviates a
> bit from the normal sub + mul one.

> The speed up is pretty small, so it may be worth just using the sub +
> mul version instead.

Yeah. Small, but it’s not called just one time. Many a little makes a mickle, isn’t it?
I might be more prefer to keep this.
diff mbox series

Patch

diff --git a/libavfilter/x86/vf_exposure.asm b/libavfilter/x86/vf_exposure.asm
index 3351c6fb3b..f271167805 100644
--- a/libavfilter/x86/vf_exposure.asm
+++ b/libavfilter/x86/vf_exposure.asm
@@ -36,11 +36,21 @@  cglobal exposure, 2, 2, 4, ptr, length, black, scale
     VBROADCASTSS m1, xmm1
 %endif
 
+%if cpuflag(fma3) || cpuflag(fma4)
+    mulps       m0, m0, m1 ; black * scale
+%endif
+
 .loop:
+%if cpuflag(fma3) || cpuflag(fma4)
+    mova        m2, m0
+    vfmsub231ps m2, m1, [ptrq]
+    movu    [ptrq], m2
+%else
     movu        m2, [ptrq]
     subps       m2, m2, m0
     mulps       m2, m2, m1
     movu    [ptrq], m2
+%endif
     add       ptrq, mmsize
     sub    lengthq, mmsize/4
 
@@ -52,4 +62,9 @@  cglobal exposure, 2, 2, 4, ptr, length, black, scale
 %if ARCH_X86_64
 INIT_XMM sse
 EXPOSURE
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+EXPOSURE
+%endif
 %endif
diff --git a/libavfilter/x86/vf_exposure_init.c b/libavfilter/x86/vf_exposure_init.c
index de1b360f6c..80dae6164e 100644
--- a/libavfilter/x86/vf_exposure_init.c
+++ b/libavfilter/x86/vf_exposure_init.c
@@ -24,6 +24,7 @@ 
 #include "libavfilter/exposure.h"
 
 void ff_exposure_sse(float *ptr, int length, float black, float scale);
+void ff_exposure_avx2(float *ptr, int length, float black, float scale);
 
 av_cold void ff_exposure_init_x86(ExposureContext *s)
 {
@@ -32,5 +33,10 @@  av_cold void ff_exposure_init_x86(ExposureContext *s)
 #if ARCH_X86_64
     if (EXTERNAL_SSE(cpu_flags))
         s->exposure_func = ff_exposure_sse;
+
+#if HAVE_AVX2_EXTERNAL
+    if (EXTERNAL_AVX2_FAST(cpu_flags))
+        s->exposure_func = ff_exposure_avx2;
+#endif
 #endif
 }