diff mbox series

[FFmpeg-devel,1/4] libavfilter/x86/vf_hflip: add ff_flip_byte/short_avx512()

Message ID 20210827045144.73794-1-jianhua.wu@intel.com
State New
Headers show
Series [FFmpeg-devel,1/4] libavfilter/x86/vf_hflip: add ff_flip_byte/short_avx512() | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Wu Jianhua Aug. 27, 2021, 4:51 a.m. UTC
Performance (lower is better):
8bit:
    ff_hflip_byte_ssse3   0.61
    ff_hflip_byte_avx2    0.37
    ff_hflip_byte_avx512  0.19
16bit:
    ff_hflip_short_ssse3  1.27
    ff_hflip_short_avx2   0.76
    ff_hflip_short_avx512 0.40

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libavfilter/x86/vf_hflip.asm    | 23 ++++++++++++++++++-----
 libavfilter/x86/vf_hflip_init.c |  8 ++++++++
 2 files changed, 26 insertions(+), 5 deletions(-)

Comments

Wu Jianhua Sept. 6, 2021, 2:10 a.m. UTC | #1
Ping.

> -----Original Message-----
> From: Wu, Jianhua <jianhua.wu@intel.com>
> Sent: Friday, August 27, 2021 12:52 PM
> To: ffmpeg-devel@ffmpeg.org
> Cc: Wu, Jianhua <jianhua.wu@intel.com>
> Subject: [PATCH 1/4] libavfilter/x86/vf_hflip: add ff_flip_byte/short_avx512()
> 
> Performance(Less is better):
> 8bit:
>     ff_hflip_byte_ssse3   0.61
>     ff_hflip_byte_avx2    0.37
>     ff_hflip_byte_avx512  0.19
> 16bit:
>     ff_hflip_short_ssse3  1.27
>     ff_hflip_short_avx2   0.76
>     ff_hflip_short_avx512 0.40
> 
> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
> ---
>  libavfilter/x86/vf_hflip.asm    | 23 ++++++++++++++++++-----
>  libavfilter/x86/vf_hflip_init.c |  8 ++++++++
>  2 files changed, 26 insertions(+), 5 deletions(-)
> 
> diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm
> index 285618954f..c2237217f7 100644
> --- a/libavfilter/x86/vf_hflip.asm
> +++ b/libavfilter/x86/vf_hflip.asm
> @@ -26,12 +26,16 @@ SECTION_RODATA
> 
>  pb_flip_byte:  db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
>  pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
> +pd_flip_indicies: dd 12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3
> 
>  SECTION .text
> 
>  ;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short)
>  %macro HFLIP 3
>  cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
> +%if mmsize == 64
> +    movu              m3, [pd_flip_indicies]
> +%endif
>      VBROADCASTI128    m0, [pb_flip_%1]
>      xor               xq, xq
>  %if %3 == 1
> @@ -47,12 +51,15 @@ cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
> 
>      .loop0:
>          neg     xq
> -%if mmsize == 32
> -        vpermq  m1, [srcq + xq -     mmsize + %3], 0x4e; flip each lane at load
> -        vpermq  m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
> +%if   mmsize == 64
> +        vpermd  m1, m3, [srcq + xq -     mmsize + %3]
> +        vpermd  m2, m3, [srcq + xq - 2 * mmsize + %3]
> +%elif mmsize == 32
> +        vpermq      m1, [srcq + xq -     mmsize + %3], 0x4e; flip each lane at load
> +        vpermq      m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
>  %else
> -        movu    m1, [srcq + xq -     mmsize + %3]
> -        movu    m2, [srcq + xq - 2 * mmsize + %3]
> +        movu        m1, [srcq + xq -     mmsize + %3]
> +        movu        m2, [srcq + xq - 2 * mmsize + %3]
>  %endif
>          pshufb  m1, m0
>          pshufb  m2, m0
> @@ -88,3 +95,9 @@ INIT_YMM avx2
>  HFLIP byte, b, 1
>  HFLIP short, w, 2
>  %endif
> +
> +%if HAVE_AVX512_EXTERNAL
> +INIT_ZMM avx512
> +HFLIP byte, b, 1
> +HFLIP short, w, 2
> +%endif
> diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c
> index 0ac399b0d4..25fc40f7b0 100644
> --- a/libavfilter/x86/vf_hflip_init.c
> +++ b/libavfilter/x86/vf_hflip_init.c
> @@ -25,8 +25,10 @@
> 
>  void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w);
>  void ff_hflip_byte_avx2(const uint8_t *src, uint8_t *dst, int w);
> +void ff_hflip_byte_avx512(const uint8_t *src, uint8_t *dst, int w);
>  void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
>  void ff_hflip_short_avx2(const uint8_t *src, uint8_t *dst, int w);
> +void ff_hflip_short_avx512(const uint8_t *src, uint8_t *dst, int w);
> 
>  av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
>  {
> @@ -41,6 +43,9 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
>              if (EXTERNAL_AVX2_FAST(cpu_flags)) {
>                  s->flip_line[i] = ff_hflip_byte_avx2;
>              }
> +            if (EXTERNAL_AVX512(cpu_flags)) {
> +                s->flip_line[i] = ff_hflip_byte_avx512;
> +            }
>          } else if (step[i] == 2) {
>              if (EXTERNAL_SSSE3(cpu_flags)) {
>                  s->flip_line[i] = ff_hflip_short_ssse3;
> @@ -48,6 +53,9 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
>              if (EXTERNAL_AVX2_FAST(cpu_flags)) {
>                  s->flip_line[i] = ff_hflip_short_avx2;
>              }
> +            if (EXTERNAL_AVX512(cpu_flags)) {
> +                s->flip_line[i] = ff_hflip_short_avx512;
> +            }
>          }
>      }
>  }
> --
> 2.17.1
Paul B Mahol Sept. 7, 2021, 4:39 p.m. UTC | #2
will apply soon if nobody objects
Wu Jianhua Sept. 14, 2021, 5:02 a.m. UTC | #3
Paul B Mahol wrote:
> 
> will apply soon if nobody objects
> 

Hi Paul,

It seems that nobody has objected over the past two weeks. Could the patches be applied?

Best regards,
Jianhua
Wu Jianhua Sept. 23, 2021, 2:41 a.m. UTC | #4
Ping.

Jianhua wrote:
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Wu,
> Jianhua
> Sent: Tuesday, September 14, 2021 1:02 PM
> To: FFmpeg development discussions and patches <ffmpeg-
> devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH 1/4] libavfilter/x86/vf_hflip: add
> ff_flip_byte/short_avx512()
> 
> 
> It seemed that there is no one with objection over the past two weeks. Are
> the patches able to be applied?
>
Wu Jianhua Sept. 27, 2021, 6:47 a.m. UTC | #5
Ping.
Jianhua wrote:
> 
> Ping.
> 
> Jianhua wrote:
> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Wu,
> > Jianhua
> > Sent: Tuesday, September 14, 2021 1:02 PM
> > To: FFmpeg development discussions and patches <ffmpeg-
> > devel@ffmpeg.org>
> > Subject: Re: [FFmpeg-devel] [PATCH 1/4] libavfilter/x86/vf_hflip: add
> > ff_flip_byte/short_avx512()
> >
> >
> > It seemed that there is no one with objection over the past two weeks.
> > Are the patches able to be applied?
> >
> 
Hi there,

It looks like one month has elapsed. Is there any update?

Thanks,
Jianhua
Paul B Mahol Sept. 27, 2021, 6:53 a.m. UTC | #6
On Mon, Sep 27, 2021 at 8:48 AM Wu, Jianhua <jianhua.wu@intel.com> wrote:

> Ping.
> Jianhua wrote:
> >
> > Ping.
> >
> > Jianhua wrote:
> > > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> > Wu,
> > > Jianhua
> > > Sent: Tuesday, September 14, 2021 1:02 PM
> > > To: FFmpeg development discussions and patches <ffmpeg-
> > > devel@ffmpeg.org>
> > > Subject: Re: [FFmpeg-devel] [PATCH 1/4] libavfilter/x86/vf_hflip: add
> > > ff_flip_byte/short_avx512()
> > >
> > >
> > > It seemed that there is no one with objection over the past two weeks.
> > > Are the patches able to be applied?
> > >
> >
> Hi there,
>
> Looks like one month elapsed. Any update?
>

No rushing needed.


> Thanks,
> Jianhua
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
diff mbox series

Patch

diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm
index 285618954f..c2237217f7 100644
--- a/libavfilter/x86/vf_hflip.asm
+++ b/libavfilter/x86/vf_hflip.asm
@@ -26,12 +26,16 @@  SECTION_RODATA
 
 pb_flip_byte:  db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
+pd_flip_indicies: dd 12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3
 
 SECTION .text
 
 ;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short)
 %macro HFLIP 3
 cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
+%if mmsize == 64
+    movu              m3, [pd_flip_indicies]
+%endif
     VBROADCASTI128    m0, [pb_flip_%1]
     xor               xq, xq
 %if %3 == 1
@@ -47,12 +51,15 @@  cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
 
     .loop0:
         neg     xq
-%if mmsize == 32
-        vpermq  m1, [srcq + xq -     mmsize + %3], 0x4e; flip each lane at load
-        vpermq  m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
+%if   mmsize == 64
+        vpermd  m1, m3, [srcq + xq -     mmsize + %3]
+        vpermd  m2, m3, [srcq + xq - 2 * mmsize + %3]
+%elif mmsize == 32
+        vpermq      m1, [srcq + xq -     mmsize + %3], 0x4e; flip each lane at load
+        vpermq      m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
 %else
-        movu    m1, [srcq + xq -     mmsize + %3]
-        movu    m2, [srcq + xq - 2 * mmsize + %3]
+        movu        m1, [srcq + xq -     mmsize + %3]
+        movu        m2, [srcq + xq - 2 * mmsize + %3]
 %endif
         pshufb  m1, m0
         pshufb  m2, m0
@@ -88,3 +95,9 @@  INIT_YMM avx2
 HFLIP byte, b, 1
 HFLIP short, w, 2
 %endif
+
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+HFLIP byte, b, 1
+HFLIP short, w, 2
+%endif
diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c
index 0ac399b0d4..25fc40f7b0 100644
--- a/libavfilter/x86/vf_hflip_init.c
+++ b/libavfilter/x86/vf_hflip_init.c
@@ -25,8 +25,10 @@ 
 
 void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w);
 void ff_hflip_byte_avx2(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_byte_avx512(const uint8_t *src, uint8_t *dst, int w);
 void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
 void ff_hflip_short_avx2(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_short_avx512(const uint8_t *src, uint8_t *dst, int w);
 
 av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
 {
@@ -41,6 +43,9 @@  av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
             if (EXTERNAL_AVX2_FAST(cpu_flags)) {
                 s->flip_line[i] = ff_hflip_byte_avx2;
             }
+            if (EXTERNAL_AVX512(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_byte_avx512;
+            }
         } else if (step[i] == 2) {
             if (EXTERNAL_SSSE3(cpu_flags)) {
                 s->flip_line[i] = ff_hflip_short_ssse3;
@@ -48,6 +53,9 @@  av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
             if (EXTERNAL_AVX2_FAST(cpu_flags)) {
                 s->flip_line[i] = ff_hflip_short_avx2;
             }
+            if (EXTERNAL_AVX512(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_short_avx512;
+            }
         }
     }
 }