diff mbox series

[FFmpeg-devel,2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.

Message ID 20210625075943.79619-1-alankelly@google.com
State Superseded
Headers show
Series [FFmpeg-devel,1/2] libavutil/cpu: Adds fast gather detection. | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Alan Kelly June 25, 2021, 7:59 a.m. UTC
These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.
---
 libswscale/swscale_internal.h |   2 +
 libswscale/utils.c            |  37 +++++++++++
 libswscale/x86/Makefile       |   1 +
 libswscale/x86/scale_avx2.asm | 112 ++++++++++++++++++++++++++++++++++
 libswscale/x86/swscale.c      |  19 ++++++
 tests/checkasm/sw_scale.c     |  21 +++++--
 6 files changed, 187 insertions(+), 5 deletions(-)
 create mode 100644 libswscale/x86/scale_avx2.asm

Comments

Ronald S. Bultje June 25, 2021, 11:25 a.m. UTC | #1
Hi Alan,

On Fri, Jun 25, 2021 at 3:59 AM Alan Kelly <
alankelly-at-google.com@ffmpeg.org> wrote:

> These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.
>

Re-asking a question I asked before in the other thread:

Also, what is the cycle count of ssse3/avx2 implementation for this
specific function on Haswell? It would be good to note that in the
respective patch so that we understand why the check was added.

You should be able to find this in the checkasm --bench --test=X numbers
for this relevant function.

Ronald
Alan Kelly June 25, 2021, 11:52 a.m. UTC | #2
On Fri, Jun 25, 2021 at 1:26 PM Ronald S. Bultje <rsbultje@gmail.com> wrote:

> Hi Alan,
>
> On Fri, Jun 25, 2021 at 3:59 AM Alan Kelly <
> alankelly-at-google.com@ffmpeg.org> wrote:
>
>> These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.
>>
>
> Re-asking a question I asked before in the other thread:
>
> Also, what is the cycle count of ssse3/avx2 implementation for this
> specific function on Haswell? It would be good to note that in the
> respective patch so that we understand why the check was added.
>
> You should be able to find this in the checkasm --bench --test=X numbers
> for this relevant function.
>
> Ronald
>

Hi Ronald,

Skylake Haswell
hscale_8_to_15_width4_ssse3 761.2 760
hscale_8_to_15_width4_avx2 468.7 957
hscale_8_to_15_width8_ssse3 1170.7 1032
hscale_8_to_15_width8_avx2 865.7 1979
hscale_8_to_15_width12_ssse3 2172.2 2472
hscale_8_to_15_width12_avx2 1245.7 2901
hscale_8_to_15_width16_ssse3 2244.2 2400
hscale_8_to_15_width16_avx2 1647.2 3681

As you can see, it is catastrophic on Haswell. In the next iteration of the
patch, I will update the description with these numbers.

Thanks
Ronald S. Bultje June 25, 2021, 12:24 p.m. UTC | #3
Hi Alan,

On Fri, Jun 25, 2021 at 7:53 AM Alan Kelly <alankelly@google.com> wrote:

>
>
> On Fri, Jun 25, 2021 at 1:26 PM Ronald S. Bultje <rsbultje@gmail.com>
> wrote:
>
>> Hi Alan,
>>
>> On Fri, Jun 25, 2021 at 3:59 AM Alan Kelly <
>> alankelly-at-google.com@ffmpeg.org> wrote:
>>
>>> These functions replace all ff_hscale8to15_*_ssse3 when avx2 is
>>> available.
>>>
>>
>> Re-asking a question I asked before in the other thread:
>>
>> Also, what is the cycle count of ssse3/avx2 implementation for this
>> specific function on Haswell? It would be good to note that in the
>> respective patch so that we understand why the check was added.
>>
>> You should be able to find this in the checkasm --bench --test=X numbers
>> for this relevant function.
>>
>> Ronald
>>
>
> Hi Ronald,
>
> Skylake Haswell
> hscale_8_to_15_width4_ssse3 761.2 760
> hscale_8_to_15_width4_avx2 468.7 957
> hscale_8_to_15_width8_ssse3 1170.7 1032
> hscale_8_to_15_width8_avx2 865.7 1979
> hscale_8_to_15_width12_ssse3 2172.2 2472
> hscale_8_to_15_width12_avx2 1245.7 2901
> hscale_8_to_15_width16_ssse3 2244.2 2400
> hscale_8_to_15_width16_avx2 1647.2 3681
>
> As you can see, it is catastrophic on Haswell. In the next iteration of
> the patch, I will update the description with these numbers.
>

Thanks, that's very helpful. No further comments from me.

Ronald
diff mbox series

Patch

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index a1de95cee0..45ef657cd4 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1056,4 +1056,6 @@  void ff_init_vscale_pfn(SwsContext *c, yuv2planar1_fn yuv2plane1, yuv2planarX_fn
 //number of extra lines to process
 #define MAX_LINES_AHEAD 4
 
+//reorder filter and filterPos in place for the avx2 hyScale and hcScale functions
+void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW);
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 6bac7b658d..07c4d2f741 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -267,6 +267,41 @@  static const FormatEntry format_entries[] = {
     [AV_PIX_FMT_X2RGB10LE]   = { 1, 1 },
 };
 
+void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ // reorders filterPos and filter in place
+#if ARCH_X86_64 // on other targets the body is compiled out and the call does nothing
+    int i, j, k, l;
+    int cpu_flags = av_get_cpu_flags();
+    if (EXTERNAL_AVX2_FAST_GATHER(cpu_flags)){
+        if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
+            if (dstW % 16 == 0){
+                if (filter != NULL){
+                    for (i = 0; i < dstW; i += 8){ // each group of 8 positions 0..7 becomes 0,1,4,5,2,3,6,7
+                        FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
+                        FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
+                    }
+                    if (filterSize > 4){ // the coefficient array itself is only rearranged for more than 4 taps
+                        int16_t *tmp2 = av_malloc(dstW * filterSize * 2); // NOTE(review): result is not checked for NULL before the memcpy below
+                        memcpy(tmp2, filter, dstW * filterSize * 2); // scratch copy of the original order; size is in bytes (2 per int16_t)
+                        for (i = 0; i < dstW; i += 16){ // i: first pixel of a block of 16 output pixels
+                            for (k = 0; k < filterSize / 4; ++k){ // k: group of 4 consecutive coefficients
+                                for (j = 0; j < 16; ++j){ // j: pixel inside the current block
+                                    for (l = 0; l < 4; ++l){ // l: coefficient inside the current group
+                                        int from = i * filterSize + j * filterSize + k * 4 + l; // original layout: pixel-major
+                                        int to = (i) * filterSize + j * 4 + l + k * 64; // new layout: per block, 64 values (16 pixels x 4) per group k
+                                        filter[to] = tmp2[from];
+                                    }
+                                }
+                            }
+                        }
+                        av_free(tmp2);
+                    }
+                }
+            }
+        }
+    }
+#endif
+}
+
 int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
 {
     return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ?
@@ -1697,6 +1732,7 @@  av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
                            get_local_pos(c, 0, 0, 0),
                            get_local_pos(c, 0, 0, 0))) < 0)
                 goto fail;
+            ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW);
             if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
                            &c->hChrFilterSize, c->chrXInc,
                            c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
@@ -1706,6 +1742,7 @@  av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
                            get_local_pos(c, c->chrSrcHSubSample, c->src_h_chr_pos, 0),
                            get_local_pos(c, c->chrDstHSubSample, c->dst_h_chr_pos, 0))) < 0)
                 goto fail;
+            ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW);
         }
     } // initialize horizontal stuff
 
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index bfe383364e..68391494be 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -11,6 +11,7 @@  OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
 X86ASM-OBJS                     += x86/input.o                          \
                                    x86/output.o                         \
                                    x86/scale.o                          \
+                                   x86/scale_avx2.o                          \
                                    x86/rgb_2_rgb.o                      \
                                    x86/yuv_2_rgb.o                      \
                                    x86/yuv2yuvX.o                       \
diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
new file mode 100644
index 0000000000..d90fd2d791
--- /dev/null
+++ b/libswscale/x86/scale_avx2.asm
@@ -0,0 +1,112 @@ 
+;******************************************************************************
+;* x86-optimized horizontal line scaling functions
+;* Copyright 2020 Google LLC
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+swizzle: dd 0, 4, 1, 5, 2, 6, 3, 7
+four: times 8 dd 4
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; horizontal line scaling
+;
+; void hscale8to15_<filterSize>_<opt>
+;                   (SwsContext *c, int16_t *dst,
+;                    int dstW, const uint8_t *src,
+;                    const int16_t *filter,
+;                    const int32_t *filterPos, int filterSize);
+;
+; Scale one horizontal line. Input is 8-bit width. Filter is 14 bits. Output is
+; 15 bits (in int16_t). Each output pixel is generated from $filterSize input
+; pixels, the position of the first pixel is given in filterPos[nOutputPixel].
+;-----------------------------------------------------------------------------
+
+%macro SCALE_FUNC 1
+cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, count, inner ; 16 vector regs: m0-m15 are all used
+  pxor m0, m0              ; m0 = 0, interleaved below to widen bytes to words
+  movu m15, [swizzle]      ; dword permutation applied before every store
+  mov countq, $0           ; count = index of the next output pixel
+%ifidn %1, X4
+  movu m14, [four]         ; per-pass advance of the source offsets
+  movsxd fltsizeq, fltsized
+  shr fltsizeq, 2          ; fltsize = number of 4-coefficient groups
+%endif
+.loop:
+  movu m1, [fltposq]       ; source offsets for 8 output pixels
+  movu m2, [fltposq+32]    ; source offsets for the following 8
+%ifidn %1, X4
+  pxor m9, m9              ; m9-m12: running sums over the coefficient groups
+  pxor m10, m10
+  pxor m11, m11
+  pxor m12, m12
+  mov innerq, $0
+.innerloop:
+%endif
+  vpcmpeqd  m13, m13       ; all-ones gather mask
+  vpgatherdd m3,[srcmemq + m1], m13 ; load 4 source bytes per offset in m1
+  vpcmpeqd  m13, m13       ; the gather clears its mask, so rebuild it
+  vpgatherdd m4,[srcmemq + m2], m13 ; load 4 source bytes per offset in m2
+  vpunpcklbw m5, m3, m0    ; widen the gathered bytes to words
+  vpunpckhbw m6, m3, m0
+  vpunpcklbw m7, m4, m0
+  vpunpckhbw m8, m4, m0
+  vpmaddwd m5, m5, [filterq] ; multiply by coefficients, add adjacent pairs
+  vpmaddwd m6, m6, [filterq + 32]
+  vpmaddwd m7, m7, [filterq + 64]
+  vpmaddwd m8, m8, [filterq + 96]
+  add filterq, $80         ; 0x80 = 128 bytes of coefficients consumed
+%ifidn %1, X4
+  paddd m9, m5
+  paddd m10, m6
+  paddd m11, m7
+  paddd m12, m8
+  paddd m1, m14            ; step every source offset to the next group
+  paddd m2, m14
+  add innerq, $1
+  cmp innerq, fltsizeq
+  jl .innerloop
+  vphaddd m5, m9, m10      ; add adjacent dword sums of the accumulators
+  vphaddd m6, m11, m12
+%else
+  vphaddd m5, m5, m6       ; add adjacent dword sums
+  vphaddd m6, m7, m8
+%endif
+  vpsrad  m5, 7            ; arithmetic shift right by 7
+  vpsrad  m6, 7
+  vpackssdw m5, m5, m6     ; saturating pack to int16
+  vpermd m5, m15, m5       ; reorder dwords with the swizzle table
+  vmovdqu [dstq + countq * 2], m5 ; store 16 int16 results
+  add fltposq, $40         ; 0x40 = 64 bytes = 16 int32 positions
+  add countq, $10          ; 0x10 = 16 output pixels per iteration
+  cmp countd, wd           ; w is a 32-bit int: do not read the unextended upper half
+  jl .loop
+REP_RET
+%endmacro
+
+%if ARCH_X86_64
+INIT_YMM avx2
+SCALE_FUNC 4
+SCALE_FUNC X4
+%endif
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 0848a31461..4412ff5f92 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -276,6 +276,9 @@  SCALE_FUNCS_SSE(sse2);
 SCALE_FUNCS_SSE(ssse3);
 SCALE_FUNCS_SSE(sse4);
 
+SCALE_FUNC(4, 8, 15, avx2);
+SCALE_FUNC(X4, 8, 15, avx2);
+
 #define VSCALEX_FUNC(size, opt) \
 void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
                                         const int16_t **src, uint8_t *dest, int dstW, \
@@ -568,6 +571,22 @@  switch(c->dstBpc){ \
     }
 
 #if ARCH_X86_64
+#define ASSIGN_AVX2_SCALE_FUNC(hscalefn, filtersize) \
+    switch (filtersize) { \
+    case 4:  hscalefn = ff_hscale8to15_4_avx2; break; \
+    default:  hscalefn = ff_hscale8to15_X4_avx2; break; \
+             break; \
+    }
+
+    if (EXTERNAL_AVX2_FAST_GATHER(cpu_flags)){
+      if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
+        if(c->chrDstW % 16 == 0)
+          ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
+        if(c->dstW % 16 == 0)
+          ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
+      }
+    }
+
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         switch (c->dstFormat) {
         case AV_PIX_FMT_NV12:
diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 3ac0f9082f..177f9df3c4 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -135,13 +135,13 @@  static void check_yuv2yuvX(void)
 }
 
 #undef SRC_PIXELS
-#define SRC_PIXELS 128
+#define SRC_PIXELS 512
 
 static void check_hscale(void)
 {
 #define MAX_FILTER_WIDTH 40
-#define FILTER_SIZES 5
-    static const int filter_sizes[FILTER_SIZES] = { 4, 8, 16, 32, 40 };
+#define FILTER_SIZES 6
+    static const int filter_sizes[FILTER_SIZES] = { 4, 8, 12, 16, 32, 40 };
 
 #define HSCALE_PAIRS 2
     static const int hscale_pairs[HSCALE_PAIRS][2] = {
@@ -160,6 +160,8 @@  static void check_hscale(void)
     // padded
     LOCAL_ALIGNED_32(int16_t, filter, [SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH]);
     LOCAL_ALIGNED_32(int32_t, filterPos, [SRC_PIXELS]);
+    LOCAL_ALIGNED_32(int16_t, filterAvx2, [SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH]);
+    LOCAL_ALIGNED_32(int32_t, filterPosAvx, [SRC_PIXELS]);
 
     // The dst parameter here is either int16_t or int32_t but we use void* to
     // just cover both cases.
@@ -167,6 +169,8 @@  static void check_hscale(void)
                       const uint8_t *src, const int16_t *filter,
                       const int32_t *filterPos, int filterSize);
 
+    int cpu_flags = av_get_cpu_flags();
+
     ctx = sws_alloc_context();
     if (sws_init_context(ctx, NULL, NULL) < 0)
         fail();
@@ -180,9 +184,11 @@  static void check_hscale(void)
             ctx->srcBpc = hscale_pairs[hpi][0];
             ctx->dstBpc = hscale_pairs[hpi][1];
             ctx->hLumFilterSize = ctx->hChrFilterSize = width;
+            ctx->dstW = ctx->chrDstW = SRC_PIXELS;
 
             for (i = 0; i < SRC_PIXELS; i++) {
                 filterPos[i] = i;
+                filterPosAvx[i] = i;
 
                 // These filter cofficients are chosen to try break two corner
                 // cases, namely:
@@ -210,6 +216,11 @@  static void check_hscale(void)
 
                 filter[SRC_PIXELS * width + i] = rnd();
             }
+            memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
+            if (cpu_flags & AV_CPU_FLAG_AVX2){
+                ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS);
+            }
+
             ff_getSwsFunc(ctx);
 
             if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", ctx->srcBpc, ctx->dstBpc + 1, width)) {
@@ -217,10 +228,10 @@  static void check_hscale(void)
                 memset(dst1, 0, SRC_PIXELS * sizeof(dst1[0]));
 
                 call_ref(NULL, dst0, SRC_PIXELS, src, filter, filterPos, width);
-                call_new(NULL, dst1, SRC_PIXELS, src, filter, filterPos, width);
+                call_new(NULL, dst1, SRC_PIXELS, src, filterAvx2, filterPosAvx, width);
                 if (memcmp(dst0, dst1, SRC_PIXELS * sizeof(dst0[0])))
                     fail();
-                bench_new(NULL, dst0, SRC_PIXELS, src, filter, filterPos, width);
+                bench_new(NULL, dst0, SRC_PIXELS, src, filter, filterPosAvx, width);
             }
         }
     }