Message ID | 20210827045144.73794-1-jianhua.wu@intel.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,1/4] libavfilter/x86/vf_hflip: add ff_flip_byte/short_avx512() | expand |
Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
andriy/make_ppc | success | Make finished |
andriy/make_fate_ppc | success | Make fate finished |
Ping. > -----Original Message----- > From: Wu, Jianhua <jianhua.wu@intel.com> > Sent: Friday, August 27, 2021 12:52 PM > To: ffmpeg-devel@ffmpeg.org > Cc: Wu, Jianhua <jianhua.wu@intel.com> > Subject: [PATCH 1/4] libavfilter/x86/vf_hflip: add ff_flip_byte/short_avx512() > > Performance(Less is better): > 8bit: > ff_hflip_byte_ssse3 0.61 > ff_hflip_byte_avx2 0.37 > ff_hflip_byte_avx512 0.19 > 16bit: > ff_hflip_short_ssse3 1.27 > ff_hflip_short_avx2 0.76 > ff_hflip_short_avx512 0.40 > > Signed-off-by: Wu Jianhua <jianhua.wu@intel.com> > --- > libavfilter/x86/vf_hflip.asm | 23 ++++++++++++++++++----- > libavfilter/x86/vf_hflip_init.c | 8 ++++++++ > 2 files changed, 26 insertions(+), 5 deletions(-) > > diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm index > 285618954f..c2237217f7 100644 > --- a/libavfilter/x86/vf_hflip.asm > +++ b/libavfilter/x86/vf_hflip.asm > @@ -26,12 +26,16 @@ SECTION_RODATA > > pb_flip_byte: db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 > pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 > +pd_flip_indicies: dd 12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3 > > SECTION .text > > ;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short) %macro > HFLIP 3 cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x > +%if mmsize == 64 > + movu m3, [pd_flip_indicies] > +%endif > VBROADCASTI128 m0, [pb_flip_%1] > xor xq, xq > %if %3 == 1 > @@ -47,12 +51,15 @@ cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x > > .loop0: > neg xq > -%if mmsize == 32 > - vpermq m1, [srcq + xq - mmsize + %3], 0x4e; flip each lane at load > - vpermq m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load > +%if mmsize == 64 > + vpermd m1, m3, [srcq + xq - mmsize + %3] > + vpermd m2, m3, [srcq + xq - 2 * mmsize + %3] %elif mmsize == > +32 > + vpermq m1, [srcq + xq - mmsize + %3], 0x4e; flip each lane at load > + vpermq m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load > %else > - movu m1, [srcq + xq - mmsize + %3] > - movu m2, [srcq + xq - 2 
* mmsize + %3] > + movu m1, [srcq + xq - mmsize + %3] > + movu m2, [srcq + xq - 2 * mmsize + %3] > %endif > pshufb m1, m0 > pshufb m2, m0 > @@ -88,3 +95,9 @@ INIT_YMM avx2 > HFLIP byte, b, 1 > HFLIP short, w, 2 > %endif > + > +%if HAVE_AVX512_EXTERNAL > +INIT_ZMM avx512 > +HFLIP byte, b, 1 > +HFLIP short, w, 2 > +%endif > diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c index > 0ac399b0d4..25fc40f7b0 100644 > --- a/libavfilter/x86/vf_hflip_init.c > +++ b/libavfilter/x86/vf_hflip_init.c > @@ -25,8 +25,10 @@ > > void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w); void > ff_hflip_byte_avx2(const uint8_t *src, uint8_t *dst, int w); > +void ff_hflip_byte_avx512(const uint8_t *src, uint8_t *dst, int w); > void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w); void > ff_hflip_short_avx2(const uint8_t *src, uint8_t *dst, int w); > +void ff_hflip_short_avx512(const uint8_t *src, uint8_t *dst, int w); > > av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes) > { @@ -41,6 +43,9 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int > step[4], int nb_planes) > if (EXTERNAL_AVX2_FAST(cpu_flags)) { > s->flip_line[i] = ff_hflip_byte_avx2; > } > + if (EXTERNAL_AVX512(cpu_flags)) { > + s->flip_line[i] = ff_hflip_byte_avx512; > + } > } else if (step[i] == 2) { > if (EXTERNAL_SSSE3(cpu_flags)) { > s->flip_line[i] = ff_hflip_short_ssse3; @@ -48,6 +53,9 @@ av_cold > void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes) > if (EXTERNAL_AVX2_FAST(cpu_flags)) { > s->flip_line[i] = ff_hflip_short_avx2; > } > + if (EXTERNAL_AVX512(cpu_flags)) { > + s->flip_line[i] = ff_hflip_short_avx512; > + } > } > } > } > -- > 2.17.1
will apply soon if nobody objects
Paul B Mahol wrote: > > will apply soon if nobody objects > Hi Paul, It seems that nobody has objected over the past two weeks. Could the patches be applied? Best regards, Jianhua
Ping. Jianhua wrote: > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Wu, > Jianhua > Sent: Tuesday, September 14, 2021 1:02 PM > To: FFmpeg development discussions and patches <ffmpeg- > devel@ffmpeg.org> > Subject: Re: [FFmpeg-devel] [PATCH 1/4] libavfilter/x86/vf_hflip: add > ff_flip_byte/short_avx512() > > > It seemed that there is no one with objection over the past two weeks. Are > the patches able to be applied? >
Ping. Jianhua wrote: > > Ping. > > Jianhua wrote: > > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of > Wu, > > Jianhua > > Sent: Tuesday, September 14, 2021 1:02 PM > > To: FFmpeg development discussions and patches <ffmpeg- > > devel@ffmpeg.org> > > Subject: Re: [FFmpeg-devel] [PATCH 1/4] libavfilter/x86/vf_hflip: add > > ff_flip_byte/short_avx512() > > > > > > It seemed that there is no one with objection over the past two weeks. > > Are the patches able to be applied? > > > Hi there, It looks like a month has passed. Is there any update? Thanks, Jianhua
On Mon, Sep 27, 2021 at 8:48 AM Wu, Jianhua <jianhua.wu@intel.com> wrote: > Ping. > Jianhua wrote: > > > > Ping. > > > > Jianhua wrote: > > > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of > > Wu, > > > Jianhua > > > Sent: Tuesday, September 14, 2021 1:02 PM > > > To: FFmpeg development discussions and patches <ffmpeg- > > > devel@ffmpeg.org> > > > Subject: Re: [FFmpeg-devel] [PATCH 1/4] libavfilter/x86/vf_hflip: add > > > ff_flip_byte/short_avx512() > > > > > > > > > It seemed that there is no one with objection over the past two weeks. > > > Are the patches able to be applied? > > > > > > Hi there, > > Looks like one month elapsed. Any update? > No rushing needed. > Thanks, > Jianhua > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm index 285618954f..c2237217f7 100644 --- a/libavfilter/x86/vf_hflip.asm +++ b/libavfilter/x86/vf_hflip.asm @@ -26,12 +26,16 @@ SECTION_RODATA pb_flip_byte: db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 +pd_flip_indicies: dd 12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3 SECTION .text ;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short) %macro HFLIP 3 cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x +%if mmsize == 64 + movu m3, [pd_flip_indicies] +%endif VBROADCASTI128 m0, [pb_flip_%1] xor xq, xq %if %3 == 1 @@ -47,12 +51,15 @@ cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x .loop0: neg xq -%if mmsize == 32 - vpermq m1, [srcq + xq - mmsize + %3], 0x4e; flip each lane at load - vpermq m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load +%if mmsize == 64 + vpermd m1, m3, [srcq + xq - mmsize + %3] + vpermd m2, m3, [srcq + xq - 2 * mmsize + %3] +%elif mmsize == 32 + vpermq m1, [srcq + xq - mmsize + %3], 0x4e; flip each lane at load + vpermq m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load %else - movu m1, [srcq + xq - mmsize + %3] - movu m2, [srcq + xq - 2 * mmsize + %3] + movu m1, [srcq + xq - mmsize + %3] + movu m2, [srcq + xq - 2 * mmsize + %3] %endif pshufb m1, m0 pshufb m2, m0 @@ -88,3 +95,9 @@ INIT_YMM avx2 HFLIP byte, b, 1 HFLIP short, w, 2 %endif + +%if HAVE_AVX512_EXTERNAL +INIT_ZMM avx512 +HFLIP byte, b, 1 +HFLIP short, w, 2 +%endif diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c index 0ac399b0d4..25fc40f7b0 100644 --- a/libavfilter/x86/vf_hflip_init.c +++ b/libavfilter/x86/vf_hflip_init.c @@ -25,8 +25,10 @@ void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w); void ff_hflip_byte_avx2(const uint8_t *src, uint8_t *dst, int w); +void ff_hflip_byte_avx512(const uint8_t *src, uint8_t *dst, int w); void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w); void 
ff_hflip_short_avx2(const uint8_t *src, uint8_t *dst, int w); +void ff_hflip_short_avx512(const uint8_t *src, uint8_t *dst, int w); av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes) { @@ -41,6 +43,9 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes) if (EXTERNAL_AVX2_FAST(cpu_flags)) { s->flip_line[i] = ff_hflip_byte_avx2; } + if (EXTERNAL_AVX512(cpu_flags)) { + s->flip_line[i] = ff_hflip_byte_avx512; + } } else if (step[i] == 2) { if (EXTERNAL_SSSE3(cpu_flags)) { s->flip_line[i] = ff_hflip_short_ssse3; @@ -48,6 +53,9 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes) if (EXTERNAL_AVX2_FAST(cpu_flags)) { s->flip_line[i] = ff_hflip_short_avx2; } + if (EXTERNAL_AVX512(cpu_flags)) { + s->flip_line[i] = ff_hflip_short_avx512; + } } } }
Performance(Less is better): 8bit: ff_hflip_byte_ssse3 0.61 ff_hflip_byte_avx2 0.37 ff_hflip_byte_avx512 0.19 16bit: ff_hflip_short_ssse3 1.27 ff_hflip_short_avx2 0.76 ff_hflip_short_avx512 0.40 Signed-off-by: Wu Jianhua <jianhua.wu@intel.com> --- libavfilter/x86/vf_hflip.asm | 23 ++++++++++++++++++----- libavfilter/x86/vf_hflip_init.c | 8 ++++++++ 2 files changed, 26 insertions(+), 5 deletions(-)