diff mbox series

[FFmpeg-devel,v2,1/5] libavfilter/x86/vf_gblur: add ff_postscale_slice_avx512()

Message ID 20210804020616.82866-1-jianhua.wu@intel.com
State New
Headers show
Series [FFmpeg-devel,v2,1/5] libavfilter/x86/vf_gblur: add ff_postscale_slice_avx512()
Related show

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Wu Jianhua Aug. 4, 2021, 2:06 a.m. UTC
Co-authored-by: Cheng Yanfei <yanfei.cheng@intel.com>
Co-authored-by: Jin Jun <jun.i.jin@intel.com>
Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libavfilter/x86/vf_gblur.asm    | 21 ++++++++++++---------
 libavfilter/x86/vf_gblur_init.c |  4 ++++
 2 files changed, 16 insertions(+), 9 deletions(-)

Comments

Wu Jianhua Aug. 11, 2021, 1:13 a.m. UTC | #1
Ping

> -----Original Message-----
> From: Wu, Jianhua <jianhua.wu@intel.com>
> Sent: Wednesday, August 4, 2021 10:06 AM
> To: ffmpeg-devel@ffmpeg.org
> Cc: Wu, Jianhua <jianhua.wu@intel.com>
> Subject: [PATCH v2 1/5] libavfilter/x86/vf_gblur: add
> ff_postscale_slice_avx512()
> 
> Co-authored-by: Cheng Yanfei <yanfei.cheng@intel.com>
> Co-authored-by: Jin Jun <jun.i.jin@intel.com>
> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
> ---
>  libavfilter/x86/vf_gblur.asm    | 21 ++++++++++++---------
>  libavfilter/x86/vf_gblur_init.c |  4 ++++
>  2 files changed, 16 insertions(+), 9 deletions(-)
> 
> diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm index
> 4d84e6d011..276fe347f5 100644
> --- a/libavfilter/x86/vf_gblur.asm
> +++ b/libavfilter/x86/vf_gblur.asm
> @@ -194,19 +194,17 @@ cglobal postscale_slice, 2, 2, 4, ptr, length, postscale,
> min, max
>      VBROADCASTSS m1, minm
>      VBROADCASTSS m2, maxm
>  %elif WIN64
> -    SWAP 0, 2
> -    SWAP 1, 3
> -    VBROADCASTSS m0, xm0
> -    VBROADCASTSS m1, xm1
> +    VBROADCASTSS m0, xmm2
> +    VBROADCASTSS m1, xmm3
>      VBROADCASTSS m2, maxm
> -%else ; UNIX64
> -    VBROADCASTSS m0, xm0
> -    VBROADCASTSS m1, xm1
> -    VBROADCASTSS m2, xm2
> +%else ; UNIX
> +    VBROADCASTSS m0, xmm0
> +    VBROADCASTSS m1, xmm1
> +    VBROADCASTSS m2, xmm2
>  %endif
> 
>      .loop:
> -%if cpuflag(avx2)
> +%if cpuflag(avx2) || cpuflag(avx512)
>      mulps         m3, m0, [ptrq + lengthq]
>  %else
>      movu          m3, [ptrq + lengthq]
> @@ -229,3 +227,8 @@ POSTSCALE_SLICE
>  INIT_YMM avx2
>  POSTSCALE_SLICE
>  %endif
> +
> +%if HAVE_AVX512_EXTERNAL
> +INIT_ZMM avx512
> +POSTSCALE_SLICE
> +%endif
> diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
> index d80fb46fe4..34aba4ca6e 100644
> --- a/libavfilter/x86/vf_gblur_init.c
> +++ b/libavfilter/x86/vf_gblur_init.c
> @@ -29,6 +29,7 @@ void ff_horiz_slice_avx2(float *ptr, int width, int height,
> int steps, float nu,
> 
>  void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min,
> float max);  void ff_postscale_slice_avx2(float *ptr, int length, float postscale,
> float min, float max);
> +void ff_postscale_slice_avx512(float *ptr, int length, float postscale,
> +float min, float max);
> 
>  av_cold void ff_gblur_init_x86(GBlurContext *s)  { @@ -47,5 +48,8 @@
> av_cold void ff_gblur_init_x86(GBlurContext *s)
>      if (EXTERNAL_AVX2(cpu_flags)) {
>          s->horiz_slice = ff_horiz_slice_avx2;
>      }
> +    if (EXTERNAL_AVX512(cpu_flags)) {
> +        s->postscale_slice = ff_postscale_slice_avx512;
> +    }
>  #endif
>  }
> --
> 2.17.1
Paul B Mahol Aug. 11, 2021, 7:22 a.m. UTC | #2
will apply if nobody complains.
Kieran Kunhya Aug. 11, 2021, 9:43 a.m. UTC | #3
On Wed, 11 Aug 2021 at 08:23, Paul B Mahol <onemda@gmail.com> wrote:

> will apply if nobody complains.
>

Is this really a good idea considering the heavy downclocking of avx512 on
server SKUs?
James Almer Aug. 11, 2021, 12:30 p.m. UTC | #4
On 8/11/2021 6:43 AM, Kieran Kunhya wrote:
> On Wed, 11 Aug 2021 at 08:23, Paul B Mahol <onemda@gmail.com> wrote:
> 
>> will apply if nobody complains.
>>
> 
> Is this really a good idea considering the heavy downclocking of avx512 on
> server SKUs?

You can disable AVX512 at both runtime and compile time. I don't think 
that because there's one CPU arch out there that sees a hit in 
performance for one instruction set we should stop applying code other 
CPUs will benefit from.
Kieran Kunhya Aug. 11, 2021, 1:11 p.m. UTC | #5
On Wed, 11 Aug 2021 at 13:31, James Almer <jamrial@gmail.com> wrote:

> You can disable AVX512 at both runtime and compile time. I don't think
> that because there's one CPU arch out there that sees a hit in
> performance for one instruction set we should stop applying code other
> CPUs will benefit from.
>

Gramner suggests using the ice lake avx-512 subset as the minimum baseline
which I think is a good idea.

Kieran
James Almer Aug. 11, 2021, 1:19 p.m. UTC | #6
On 8/11/2021 10:11 AM, Kieran Kunhya wrote:
> On Wed, 11 Aug 2021 at 13:31, James Almer <jamrial@gmail.com> wrote:
> 
>> You can disable AVX512 at both runtime and compile time. I don't think
>> that because there's one CPU arch out there that sees a hit in
>> performance for one instruction set we should stop applying code other
>> CPUs will benefit from.
>>
> 
> Gramner suggests using the ice lake avx-512 subset as the minimum baseline
> which I think is a good idea.
> 
> Kieran

I'm fine with that, yes.
Wu Jianhua Aug. 12, 2021, 8:41 a.m. UTC | #7
James Wrote: 
> On 8/11/2021 10:11 AM, Kieran Kunhya wrote:
> > On Wed, 11 Aug 2021 at 13:31, James Almer <jamrial@gmail.com> wrote:
> >
> >> You can disable AVX512 at both runtime and compile time. I don't
> >> think that because there's one CPU arch out there that sees a hit in
> >> performance for one instruction set we should stop applying code
> >> other CPUs will benefit from.
> >>
> >
> > Gramner suggests using the ice lake avx-512 subset as the minimum
> > baseline which I think is a good idea.
> >
> > Kieran
> 
> I'm fine with that, yes.
> 

Ice Lake is the arch that almost supports the most AVX512 subsets except
Tiger Lake for now. Actually, there are quite a few projects that have already
chosen Ice Lake as the minimum baseline. But on my mind, the subsets of 
AVX512, like F,  VL, DQ,	BW, was supported by a number of architectures that
have already been released, and they could help us improve performance in
most cases. Keep the quo and more CPUs would benefit from it.

I agree with James. The usage of AVX512 is not mandatory. Use whatever you prefer.

By the way, after I implement the algorithm newly designed for the gblur filter
with AVX512, which proved that it could definitely improve the performance,
I provided a version of AVX2, a instructions set supported by more CPUs.

Here let me show you some performance comparisons. It's better to care about 
the ratio between two fps, different CPUs could change the fps but the ratio 
would be the same approximately.

1. 1080p sigma=10:step=1:
        old gblur with avx2: 45 fps
        new gblur with avx512: 109 fps
2: 1080p sigma=10:steps=3:
        old gblur with avx2: 19 fps
        new gblur with avx512: 84 fps

Hopefully the info above is helpful.

Best regards,
Jianhua
Wu Jianhua Aug. 16, 2021, 1:26 a.m. UTC | #8
Paul B Mahol Wrote:
> will apply if nobody complains.

Hi Paul,

It seemed that there is no more objection to the patches. Is it able to get to the next step? 
Feel free to let me know if there is any other problem.

Best regards.
Jianhua
Kieran Kunhya Aug. 16, 2021, 8:43 a.m. UTC | #9
On Mon, 16 Aug 2021, 02:26 Wu, Jianhua, <jianhua.wu@intel.com> wrote:

> Paul B Mahol Wrote:
> > will apply if nobody complains.
>
> Hi Paul,
>
> It seemed that there is no more objection to the patches. Is it able to
> get to the next step?
> Feel free to let me know if there is any other problem.
>
> Best regards.
> Jianhua


Hi Jianhua,

Can you make the avx512 code only be run for ice lake processors?

Kieran
Wu Jianhua Aug. 16, 2021, 9:14 a.m. UTC | #10
Kieran Wrote:
> Hi Jianhua,
> 
> Can you make the avx512 code only be run for ice lake processors?
> 

Hi Kieran,

As I see the macro, EXTERNAL_AVX512, located in libavutil/cpu.h, only an
EXTERNAL_AVX512  is used to check the state of AVX512, which did not
district the Icelake-AVX512 or Skylake-AVX512. If I have the approval from
maintainers, I'm supposed to be able to change the configure file and CPU.h
to add a feature that the configuration only enables AVX512 when the
architecture of a running CPU is satisfied to the IceLake. Or we could keep
the EXTERNAL_AVX512 and add a new macro, EXTERNAL_AVX512ICL. How
about your opinion?

By the way, these patches only used the minimum subsets of AVX512 and
have higher compatibility. I think it's okay to use it in any arch that supports
AVX512.

Feel free to let me know if you have different perspectives or if I neglected something.

Best regards.
Jianhua
Kieran Kunhya Aug. 16, 2021, 9:23 a.m. UTC | #11
>
> Feel free to let me know if you have different perspectives or if I
> neglected something.
>

Yes, but we don't want to use AVX512 on hardware that downclocks heavily.

Kieran
Wu Jianhua Aug. 16, 2021, 9:27 a.m. UTC | #12
> 
> Yes, but we don't want to use AVX512 on hardware that downclocks heavily.
> 

It's okay. Should I updated the configure file to let the IceLake-AVX512 to be the minimum baseline or add a new macro name AVX512ICL?

Jianhua
James Almer Aug. 16, 2021, 11:49 a.m. UTC | #13
On 8/16/2021 6:27 AM, Wu, Jianhua wrote:
>>
>> Yes, but we don't want to use AVX512 on hardware that downclocks heavily.
>>
> 
> It's okay. Should I updated the configure file to let the IceLake-AVX512 to be the minimum baseline or add a new macro name AVX512ICL?

Update the test in configure to look for an instruction available on Ice 
Lake (like VL), then update the check in cpu.c to make sure AVX2 is only 
enabled if AVX2-related flags Ice Lake and newer CPUs have in common are 
signaled.

> 
> Jianhua
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Wu Jianhua Aug. 19, 2021, 2:01 a.m. UTC | #14
> James Almer Wrote: 
> >>
> >> Yes, but we don't want to use AVX512 on hardware that downclocks
> heavily.
> >>
> >
> > It's okay. Should I updated the configure file to let the IceLake-AVX512 to
> be the minimum baseline or add a new macro name AVX512ICL?
> 
> Update the test in configure to look for an instruction available on Ice Lake
> (like VL), then update the check in cpu.c to make sure AVX2 is only enabled if
> AVX2-related flags Ice Lake and newer CPUs have in common are signaled.
> 

Hi there,

For this patchset, I receive two suggestions from you.

one is about to make avx512 codes only run for icelake
and newer architectures. So I submit a patchset for icelake-avx512
minimums baseline. Please check this link for more details:
http://ffmpeg.org/pipermail/ffmpeg-devel/2021-August/283973.html

The second is about giving the choice to users and developers.
Check this link for more details:
http://ffmpeg.org/pipermail/ffmpeg-devel/2021-August/283984.html

Because this patchset was developed based on the configurations
of AVX512 that FFmpeg already offered, It was totally independent
and compatible whatever the baseline of avx512 is, skylake-avx512,
icelake-avx512, or newer architectures. So whatever add a new avx512
flag for icelake or not, it would not affect the functionality of this patchset.

Could you help review the chat history and send me the modification plan?
I am glad to update and improve with whatever I am able to do. 

Best regards.
Jianhua
Paul B Mahol Aug. 19, 2021, 6:58 a.m. UTC | #15
On Mon, Aug 16, 2021 at 1:49 PM James Almer <jamrial@gmail.com> wrote:

> On 8/16/2021 6:27 AM, Wu, Jianhua wrote:
> >>
> >> Yes, but we don't want to use AVX512 on hardware that downclocks
> heavily.
> >>
> >
> > It's okay. Should I updated the configure file to let the IceLake-AVX512
> to be the minimum baseline or add a new macro name AVX512ICL?
>
> Update the test in configure to look for an instruction available on Ice
> Lake (like VL), then update the check in cpu.c to make sure AVX2 is only
> enabled if AVX2-related flags Ice Lake and newer CPUs have in common are
> signaled.
>

I think you are confused. AVX2 is doing fine.

>
> >
> > Jianhua
> >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> >
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Paul B Mahol Aug. 25, 2021, 1:48 p.m. UTC | #16
will apply shortly
James Almer Aug. 25, 2021, 1:51 p.m. UTC | #17
On 8/19/2021 3:58 AM, Paul B Mahol wrote:
> On Mon, Aug 16, 2021 at 1:49 PM James Almer <jamrial@gmail.com> wrote:
> 
>> On 8/16/2021 6:27 AM, Wu, Jianhua wrote:
>>>>
>>>> Yes, but we don't want to use AVX512 on hardware that downclocks
>> heavily.
>>>>
>>>
>>> It's okay. Should I updated the configure file to let the IceLake-AVX512
>> to be the minimum baseline or add a new macro name AVX512ICL?
>>
>> Update the test in configure to look for an instruction available on Ice
>> Lake (like VL), then update the check in cpu.c to make sure AVX2 is only
>> enabled if AVX2-related flags Ice Lake and newer CPUs have in common are
>> signaled.
>>
> 
> I think you are confused. AVX2 is doing fine.

Meant to say AVX512, yes.

What are you planning to push? Was a patch bumping the requirements for 
the current AVX512 flag as discussed submitted or not?

> 
>>
>>>
>>> Jianhua
>>>
>>> _______________________________________________
>>> ffmpeg-devel mailing list
>>> ffmpeg-devel@ffmpeg.org
>>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>>
>>> To unsubscribe, visit link above, or email
>>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>>
>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Paul B Mahol Aug. 25, 2021, 1:54 p.m. UTC | #18
On Wed, Aug 25, 2021 at 3:51 PM James Almer <jamrial@gmail.com> wrote:

> On 8/19/2021 3:58 AM, Paul B Mahol wrote:
> > On Mon, Aug 16, 2021 at 1:49 PM James Almer <jamrial@gmail.com> wrote:
> >
> >> On 8/16/2021 6:27 AM, Wu, Jianhua wrote:
> >>>>
> >>>> Yes, but we don't want to use AVX512 on hardware that downclocks
> >> heavily.
> >>>>
> >>>
> >>> It's okay. Should I updated the configure file to let the
> IceLake-AVX512
> >> to be the minimum baseline or add a new macro name AVX512ICL?
> >>
> >> Update the test in configure to look for an instruction available on Ice
> >> Lake (like VL), then update the check in cpu.c to make sure AVX2 is only
> >> enabled if AVX2-related flags Ice Lake and newer CPUs have in common are
> >> signaled.
> >>
> >
> > I think you are confused. AVX2 is doing fine.
>
> Meant to say AVX512, yes.
>
> What are you planning to push? Was a patch bumping the requirements for
> the current AVX512 flag as discussed submitted or not?
>

Everything. You can add additional commits later when you all finally
decide what to actually do.

>
> >
> >>
> >>>
> >>> Jianhua
> >>>
> >>> _______________________________________________
> >>> ffmpeg-devel mailing list
> >>> ffmpeg-devel@ffmpeg.org
> >>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >>>
> >>> To unsubscribe, visit link above, or email
> >>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> >>>
> >>
> >> _______________________________________________
> >> ffmpeg-devel mailing list
> >> ffmpeg-devel@ffmpeg.org
> >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >>
> >> To unsubscribe, visit link above, or email
> >> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> >>
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> >
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Wu Jianhua Aug. 26, 2021, 9:19 a.m. UTC | #19
Paul B Mahol Wrote:
>
> will apply shortly
> 

Gotcha. Much appreciated.

Jianhua
Wu Jianhua Aug. 28, 2021, 9:15 a.m. UTC | #20
Paul B Mahol Wrote:
> will apply shortly

Hi there,

May I know when this patches will be applied?

Thanks,
Jianhua
diff mbox series

Patch

diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
index 4d84e6d011..276fe347f5 100644
--- a/libavfilter/x86/vf_gblur.asm
+++ b/libavfilter/x86/vf_gblur.asm
@@ -194,19 +194,17 @@  cglobal postscale_slice, 2, 2, 4, ptr, length, postscale, min, max
     VBROADCASTSS m1, minm
     VBROADCASTSS m2, maxm
 %elif WIN64
-    SWAP 0, 2
-    SWAP 1, 3
-    VBROADCASTSS m0, xm0
-    VBROADCASTSS m1, xm1
+    VBROADCASTSS m0, xmm2
+    VBROADCASTSS m1, xmm3
     VBROADCASTSS m2, maxm
-%else ; UNIX64
-    VBROADCASTSS m0, xm0
-    VBROADCASTSS m1, xm1
-    VBROADCASTSS m2, xm2
+%else ; UNIX
+    VBROADCASTSS m0, xmm0
+    VBROADCASTSS m1, xmm1
+    VBROADCASTSS m2, xmm2
 %endif
 
     .loop:
-%if cpuflag(avx2)
+%if cpuflag(avx2) || cpuflag(avx512)
     mulps         m3, m0, [ptrq + lengthq]
 %else
     movu          m3, [ptrq + lengthq]
@@ -229,3 +227,8 @@  POSTSCALE_SLICE
 INIT_YMM avx2
 POSTSCALE_SLICE
 %endif
+
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+POSTSCALE_SLICE
+%endif
diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
index d80fb46fe4..34aba4ca6e 100644
--- a/libavfilter/x86/vf_gblur_init.c
+++ b/libavfilter/x86/vf_gblur_init.c
@@ -29,6 +29,7 @@  void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu,
 
 void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min, float max);
 void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
+void ff_postscale_slice_avx512(float *ptr, int length, float postscale, float min, float max);
 
 av_cold void ff_gblur_init_x86(GBlurContext *s)
 {
@@ -47,5 +48,8 @@  av_cold void ff_gblur_init_x86(GBlurContext *s)
     if (EXTERNAL_AVX2(cpu_flags)) {
         s->horiz_slice = ff_horiz_slice_avx2;
     }
+    if (EXTERNAL_AVX512(cpu_flags)) {
+        s->postscale_slice = ff_postscale_slice_avx512;
+    }
 #endif
 }