diff mbox series

[FFmpeg-devel,1/2] libavutil/cpu: Adds av_cpu_has_fast_gather to detect cpus with avx fast gather instruction

Message ID 20210614111407.1897690-1-alankelly@google.com
State New
Headers show
Series [FFmpeg-devel,1/2] libavutil/cpu: Adds av_cpu_has_fast_gather to detect cpus with avx fast gather instruction
Related show

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Alan Kelly June 14, 2021, 11:14 a.m. UTC
Broadwell and later have fast gather instructions.
---
 This is so that the avx2 version of ff_hscale8to15X which uses gather
 instructions is only selected on machines where it will actually be
 faster.
 libavutil/cpu.c          |  6 ++++++
 libavutil/cpu.h          |  6 ++++++
 libavutil/cpu_internal.h |  1 +
 libavutil/x86/cpu.c      | 18 ++++++++++++++++++
 4 files changed, 31 insertions(+)

Comments

Andreas Rheinhardt June 14, 2021, 11:41 a.m. UTC | #1
Alan Kelly:
> Broadwell and later have fast gather instructions.
> ---
>  This is so that the avx2 version of ff_hscale8to15X which uses gather
>  instructions is only selected on machines where it will actually be
>  faster.
>  libavutil/cpu.c          |  6 ++++++
>  libavutil/cpu.h          |  6 ++++++
>  libavutil/cpu_internal.h |  1 +
>  libavutil/x86/cpu.c      | 18 ++++++++++++++++++
>  4 files changed, 31 insertions(+)
> 
> diff --git a/libavutil/cpu.c b/libavutil/cpu.c
> index 8960415d00..0a723eeb7a 100644
> --- a/libavutil/cpu.c
> +++ b/libavutil/cpu.c
> @@ -49,6 +49,12 @@
>  
>  static atomic_int cpu_flags = ATOMIC_VAR_INIT(-1);
>  
> +int av_cpu_has_fast_gather(void){
> +    if (ARCH_X86)
> +        return ff_cpu_has_fast_gather();
> +    return 0;
> +}
> +
>  static int get_cpu_flags(void)
>  {
>      if (ARCH_MIPS)
> diff --git a/libavutil/cpu.h b/libavutil/cpu.h
> index b555422dae..faf3a221f4 100644
> --- a/libavutil/cpu.h
> +++ b/libavutil/cpu.h
> @@ -72,6 +72,7 @@
>  #define AV_CPU_FLAG_MMI          (1 << 0)
>  #define AV_CPU_FLAG_MSA          (1 << 1)
>  
> +int av_cpu_has_fast_gather(void);
>  /**
>   * Return the flags which specify extensions supported by the CPU.
>   * The returned value is affected by av_force_cpu_flags() if that was used
> @@ -107,6 +108,11 @@ int av_cpu_count(void);
>   *  av_set_cpu_flags_mask(), then this function will behave as if AVX is not
>   *  present.
>   */
> +
> +/**
> + * Returns true if the cpu has fast gather instructions.
> + * Broadwell and later cpus have fast gather
> + */

You added the documentation to av_cpu_max_align(), not
av_cpu_has_fast_gather().

>  size_t av_cpu_max_align(void);
>  
>  #endif /* AVUTIL_CPU_H */
> diff --git a/libavutil/cpu_internal.h b/libavutil/cpu_internal.h
> index 889764320b..92525df0c1 100644
> --- a/libavutil/cpu_internal.h
> +++ b/libavutil/cpu_internal.h
> @@ -46,6 +46,7 @@ int ff_get_cpu_flags_aarch64(void);
>  int ff_get_cpu_flags_arm(void);
>  int ff_get_cpu_flags_ppc(void);
>  int ff_get_cpu_flags_x86(void);
> +int ff_cpu_has_fast_gather(void);
>  
>  size_t ff_get_cpu_max_align_mips(void);
>  size_t ff_get_cpu_max_align_aarch64(void);
> diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
> index bcd41a50a2..9724e0017b 100644
> --- a/libavutil/x86/cpu.c
> +++ b/libavutil/x86/cpu.c
> @@ -270,3 +270,21 @@ size_t ff_get_cpu_max_align_x86(void)
>  
>      return 8;
>  }
> +
> +int ff_cpu_has_fast_gather(void){
> +    int eax, ebx, ecx;
> +    int max_std_level, std_caps = 0;
> +    int family = 0, model = 0;
> +    cpuid(0, max_std_level, ebx, ecx, std_caps);
> +
> +    if (max_std_level >= 1) {
> +        cpuid(1, eax, ebx, ecx, std_caps);
> +        family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
> +        model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
> +        // Broadwell and later
> +        if(family == 6 && model >= 70){
> +          return 1;
> +        }
> +    }
> +    return 0;
> +}
> 

The usual way to signal things that a processor supports even if slow is
by a CPU flag; see AV_CPU_FLAG_(AVX|SSE2|SSE3)SLOW. That way one also
avoids adding a new public function that is completely useless when not
on X86.

- Andreas
Ronald S. Bultje June 14, 2021, 11:53 a.m. UTC | #2
Hi Alan,

On Mon, Jun 14, 2021 at 7:20 AM Alan Kelly <
alankelly-at-google.com@ffmpeg.org> wrote:

> Broadwell and later have fast gather instructions.
> ---
>  This is so that the avx2 version of ff_hscale8to15X which uses gather
>  instructions is only selected on machines where it will actually be
>  faster.
>

We've in the past typically done this with a bit in the cpuflags return
value. Can this be added there instead of being its own function?

Also, what is the cycle count of ssse3/avx2 implementation for this
specific function on Haswell? It would be good to note that in the
respective patch so that we understand why the check was added.

Ronald
James Almer June 14, 2021, 12:17 p.m. UTC | #3
On 6/14/2021 8:53 AM, Ronald S. Bultje wrote:
> Hi Alan,
> 
> On Mon, Jun 14, 2021 at 7:20 AM Alan Kelly <
> alankelly-at-google.com@ffmpeg.org> wrote:
> 
>> Broadwell and later have fast gather instructions.
>> ---
>>   This is so that the avx2 version of ff_hscale8to15X which uses gather
>>   instructions is only selected on machines where it will actually be
>>   faster.
>>
> 
> We've in the past typically done this with a bit in the cpuflags return
> value. Can this be added there instead of being its own function?
> 
> Also, what is the cycle count of ssse3/avx2 implementation for this
> specific function on Haswell? It would be good to note that in the
> respective patch so that we understand why the check was added.

Between 9 and 12 on Haswell, 5 to 7 on Broadwell, and about 2 to 5 on 
Skylake and newer, acording to Agner's pdf if i'm reading it right. It's 
also slow on AMD before Zen 3.

And yes, this should if anything be a new cpu flag and not a new function.
Alan Kelly June 24, 2021, 1:30 p.m. UTC | #4
Hi,

Sorry for the late reply, busy oncall week. Thanks for your responses. I
have looked at the code for cpuflags and what you suggested makes sense. I
just have a question about naming. EXTERNAL_AVX2_FAST is already used in
many places - it checks whether the flag AV_CPU_FLAG_AVXSLOW is set so I
can't use this as it would change the meaning of it. Could I define a flag
like for AV_CPU_FLAG_CMOV? AV_CPU_FLAG_FAST_GATHER or similar? Or could you
please suggest a better solution.

Thanks

On Mon, Jun 14, 2021 at 2:17 PM James Almer <jamrial@gmail.com> wrote:

> On 6/14/2021 8:53 AM, Ronald S. Bultje wrote:
> > Hi Alan,
> >
> > On Mon, Jun 14, 2021 at 7:20 AM Alan Kelly <
> > alankelly-at-google.com@ffmpeg.org> wrote:
> >
> >> Broadwell and later have fast gather instructions.
> >> ---
> >>   This is so that the avx2 version of ff_hscale8to15X which uses gather
> >>   instructions is only selected on machines where it will actually be
> >>   faster.
> >>
> >
> > We've in the past typically done this with a bit in the cpuflags return
> > value. Can this be added there instead of being its own function?
> >
> > Also, what is the cycle count of ssse3/avx2 implementation for this
> > specific function on Haswell? It would be good to note that in the
> > respective patch so that we understand why the check was added.
>
> Between 9 and 12 on Haswell, 5 to 7 on Broadwell, and about 2 to 5 on
> Skylake and newer, acording to Agner's pdf if i'm reading it right. It's
> also slow on AMD before Zen 3.
>
> And yes, this should if anything be a new cpu flag and not a new function.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
James Almer June 24, 2021, 1:49 p.m. UTC | #5
On 6/24/2021 10:30 AM, Alan Kelly wrote:
> Hi,
> 
> Sorry for the late reply, busy oncall week. Thanks for your responses. I
> have looked at the code for cpuflags and what you suggested makes sense. I
> just have a question about naming. EXTERNAL_AVX2_FAST is already used in
> many places - it checks whether the flag AV_CPU_FLAG_AVXSLOW is set so I
> can't use this as it would change the meaning of it. Could I define a flag
> like for AV_CPU_FLAG_CMOV? AV_CPU_FLAG_FAST_GATHER or similar? Or could you
> please suggest a better solution.

Add a new AV_CPU_FLAG_AVX2SLOW public define (use the available value 
0x2000000), then maybe add an internal EXTERNAL_AVX2_FAST_GATHER() 
helper macro that expands to CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, AVX2).

The AV_CPU_FLAG_AVX2SLOW flag should be set for all cpus currently being 
flagged as AV_CPU_FLAG_AVXSLOW, plus Haswell and all AMD cpus prior to 
Zen 3.

Also, please don't top post when replying to an email.

> 
> Thanks
> 
> On Mon, Jun 14, 2021 at 2:17 PM James Almer <jamrial@gmail.com> wrote:
> 
>> On 6/14/2021 8:53 AM, Ronald S. Bultje wrote:
>>> Hi Alan,
>>>
>>> On Mon, Jun 14, 2021 at 7:20 AM Alan Kelly <
>>> alankelly-at-google.com@ffmpeg.org> wrote:
>>>
>>>> Broadwell and later have fast gather instructions.
>>>> ---
>>>>    This is so that the avx2 version of ff_hscale8to15X which uses gather
>>>>    instructions is only selected on machines where it will actually be
>>>>    faster.
>>>>
>>>
>>> We've in the past typically done this with a bit in the cpuflags return
>>> value. Can this be added there instead of being its own function?
>>>
>>> Also, what is the cycle count of ssse3/avx2 implementation for this
>>> specific function on Haswell? It would be good to note that in the
>>> respective patch so that we understand why the check was added.
>>
>> Between 9 and 12 on Haswell, 5 to 7 on Broadwell, and about 2 to 5 on
>> Skylake and newer, acording to Agner's pdf if i'm reading it right. It's
>> also slow on AMD before Zen 3.
>>
>> And yes, this should if anything be a new cpu flag and not a new function.
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
diff mbox series

Patch

diff --git a/libavutil/cpu.c b/libavutil/cpu.c
index 8960415d00..0a723eeb7a 100644
--- a/libavutil/cpu.c
+++ b/libavutil/cpu.c
@@ -49,6 +49,12 @@ 
 
 static atomic_int cpu_flags = ATOMIC_VAR_INIT(-1);
 
+int av_cpu_has_fast_gather(void){
+    if (ARCH_X86)
+        return ff_cpu_has_fast_gather();
+    return 0;
+}
+
 static int get_cpu_flags(void)
 {
     if (ARCH_MIPS)
diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index b555422dae..faf3a221f4 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -72,6 +72,7 @@ 
 #define AV_CPU_FLAG_MMI          (1 << 0)
 #define AV_CPU_FLAG_MSA          (1 << 1)
 
+int av_cpu_has_fast_gather(void);
 /**
  * Return the flags which specify extensions supported by the CPU.
  * The returned value is affected by av_force_cpu_flags() if that was used
@@ -107,6 +108,11 @@  int av_cpu_count(void);
  *  av_set_cpu_flags_mask(), then this function will behave as if AVX is not
  *  present.
  */
+
+/**
+ * Returns true if the cpu has fast gather instructions.
+ * Broadwell and later cpus have fast gather
+ */
 size_t av_cpu_max_align(void);
 
 #endif /* AVUTIL_CPU_H */
diff --git a/libavutil/cpu_internal.h b/libavutil/cpu_internal.h
index 889764320b..92525df0c1 100644
--- a/libavutil/cpu_internal.h
+++ b/libavutil/cpu_internal.h
@@ -46,6 +46,7 @@  int ff_get_cpu_flags_aarch64(void);
 int ff_get_cpu_flags_arm(void);
 int ff_get_cpu_flags_ppc(void);
 int ff_get_cpu_flags_x86(void);
+int ff_cpu_has_fast_gather(void);
 
 size_t ff_get_cpu_max_align_mips(void);
 size_t ff_get_cpu_max_align_aarch64(void);
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index bcd41a50a2..9724e0017b 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -270,3 +270,21 @@  size_t ff_get_cpu_max_align_x86(void)
 
     return 8;
 }
+
+int ff_cpu_has_fast_gather(void){
+    int eax, ebx, ecx;
+    int max_std_level, std_caps = 0;
+    int family = 0, model = 0;
+    cpuid(0, max_std_level, ebx, ecx, std_caps);
+
+    if (max_std_level >= 1) {
+        cpuid(1, eax, ebx, ecx, std_caps);
+        family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+        model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+        // Broadwell and later
+        if(family == 6 && model >= 70){
+          return 1;
+        }
+    }
+    return 0;
+}