diff mbox series

[FFmpeg-devel,2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.

Message ID 20210716134818.1127438-1-alankelly@google.com
State New
Headers show
Series [FFmpeg-devel,1/2] libavutil/cpu: Adds fast gather detection.
Related show

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Alan Kelly July 16, 2021, 1:48 p.m. UTC
These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.
---
EXTERNAL_AVX2_FAST is now used instead of EXTERNAL_AVX2_FAST_GATHER as
discussed in the email thread for part 1 of this patch.

Benchmark results on Skylake and Haswell:

                                Skylake	Haswell
hscale_8_to_15_width4_ssse3	761.2	760
hscale_8_to_15_width4_avx2	468.7	957
hscale_8_to_15_width8_ssse3	1170.7	1032
hscale_8_to_15_width8_avx2	865.7	1979
hscale_8_to_15_width12_ssse3	2172.2	2472
hscale_8_to_15_width12_avx2	1245.7	2901
hscale_8_to_15_width16_ssse3	2244.2	2400
hscale_8_to_15_width16_avx2	1647.2	3681

 libswscale/swscale_internal.h |   2 +
 libswscale/utils.c            |  37 +++++++++++
 libswscale/x86/Makefile       |   1 +
 libswscale/x86/scale_avx2.asm | 112 ++++++++++++++++++++++++++++++++++
 libswscale/x86/swscale.c      |  19 ++++++
 tests/checkasm/sw_scale.c     |  20 ++++--
 6 files changed, 186 insertions(+), 5 deletions(-)
 create mode 100644 libswscale/x86/scale_avx2.asm

Comments

Alan Kelly July 21, 2021, 9:11 a.m. UTC | #1
On Fri, Jul 16, 2021 at 3:48 PM Alan Kelly <alankelly@google.com> wrote:

> These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.
> ---
> EXTERNAL_AVX2_FAST is now used instead of EXTERNAL_AVX2_FAST_GATHER as
> discussed in the email thread for part 1 of this patch.
>
> Benchmark results on Skylake and Haswell:
>
>                                 Skylake Haswell
> hscale_8_to_15_width4_ssse3     761.2   760
> hscale_8_to_15_width4_avx2      468.7   957
> hscale_8_to_15_width8_ssse3     1170.7  1032
> hscale_8_to_15_width8_avx2      865.7   1979
> hscale_8_to_15_width12_ssse3    2172.2  2472
> hscale_8_to_15_width12_avx2     1245.7  2901
> hscale_8_to_15_width16_ssse3    2244.2  2400
> hscale_8_to_15_width16_avx2     1647.2  3681
>
>  libswscale/swscale_internal.h |   2 +
>  libswscale/utils.c            |  37 +++++++++++
>  libswscale/x86/Makefile       |   1 +
>  libswscale/x86/scale_avx2.asm | 112 ++++++++++++++++++++++++++++++++++
>  libswscale/x86/swscale.c      |  19 ++++++
>  tests/checkasm/sw_scale.c     |  20 ++++--
>  6 files changed, 186 insertions(+), 5 deletions(-)
>  create mode 100644 libswscale/x86/scale_avx2.asm
>
> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
> index 673407636a..fba3dabe5b 100644
> --- a/libswscale/swscale_internal.h
> +++ b/libswscale/swscale_internal.h
> @@ -1064,4 +1064,6 @@ void ff_init_vscale_pfn(SwsContext *c,
> yuv2planar1_fn yuv2plane1, yuv2planarX_fn
>  //number of extra lines to process
>  #define MAX_LINES_AHEAD 4
>
> +//shuffle filter and filterPos for hyScale and hcScale filters in avx2
> +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int
> filterSize, int16_t *filter, int dstW);
>  #endif /* SWSCALE_SWSCALE_INTERNAL_H */
> diff --git a/libswscale/utils.c b/libswscale/utils.c
> index 176fc6fd63..0577fd5490 100644
> --- a/libswscale/utils.c
> +++ b/libswscale/utils.c
> @@ -268,6 +268,41 @@ static const FormatEntry format_entries[] = {
>      [AV_PIX_FMT_X2RGB10LE]   = { 1, 1 },
>  };
>
> +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int
> filterSize, int16_t *filter, int dstW){
> +#if ARCH_X86_64
> +    int i, j, k, l;
> +    int cpu_flags = av_get_cpu_flags();
> +    if (EXTERNAL_AVX2_FAST(cpu_flags)){
> +        if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
> +            if (dstW % 16 == 0){
> +                if (filter != NULL){
> +                    for (i = 0; i < dstW; i += 8){
> +                        FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
> +                        FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
> +                    }
> +                    if (filterSize > 4){
> +                        int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
> +                        memcpy(tmp2, filter, dstW * filterSize * 2);
> +                        for (i = 0; i < dstW; i += 16){//pixel
> +                            for (k = 0; k < filterSize / 4; ++k){//fcoeff
> +                                for (j = 0; j < 16; ++j){//inner pixel
> +                                    for (l = 0; l < 4; ++l){//coeff
> +                                        int from = i * filterSize + j *
> filterSize + k * 4 + l;
> +                                        int to = (i) * filterSize + j * 4
> + l + k * 64;
> +                                        filter[to] = tmp2[from];
> +                                    }
> +                                }
> +                            }
> +                        }
> +                        av_free(tmp2);
> +                    }
> +                }
> +            }
> +        }
> +    }
> +#endif
> +}
> +
>  int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
>  {
>      return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ?
> @@ -1699,6 +1734,7 @@ av_cold int sws_init_context(SwsContext *c,
> SwsFilter *srcFilter,
>                             get_local_pos(c, 0, 0, 0),
>                             get_local_pos(c, 0, 0, 0))) < 0)
>                  goto fail;
> +            ff_shuffle_filter_coefficients(c, c->hLumFilterPos,
> c->hLumFilterSize, c->hLumFilter, dstW);
>              if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
>                             &c->hChrFilterSize, c->chrXInc,
>                             c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
> @@ -1708,6 +1744,7 @@ av_cold int sws_init_context(SwsContext *c,
> SwsFilter *srcFilter,
>                             get_local_pos(c, c->chrSrcHSubSample,
> c->src_h_chr_pos, 0),
>                             get_local_pos(c, c->chrDstHSubSample,
> c->dst_h_chr_pos, 0))) < 0)
>                  goto fail;
> +            ff_shuffle_filter_coefficients(c, c->hChrFilterPos,
> c->hChrFilterSize, c->hChrFilter, c->chrDstW);
>          }
>      } // initialize horizontal stuff
>
> diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
> index bfe383364e..68391494be 100644
> --- a/libswscale/x86/Makefile
> +++ b/libswscale/x86/Makefile
> @@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
>  X86ASM-OBJS                     += x86/input.o                          \
>                                     x86/output.o                         \
>                                     x86/scale.o                          \
> +                                   x86/scale_avx2.o
>     \
>                                     x86/rgb_2_rgb.o                      \
>                                     x86/yuv_2_rgb.o                      \
>                                     x86/yuv2yuvX.o                       \
> diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
> new file mode 100644
> index 0000000000..d90fd2d791
> --- /dev/null
> +++ b/libswscale/x86/scale_avx2.asm
> @@ -0,0 +1,112 @@
>
> +;******************************************************************************
> +;* x86-optimized horizontal line scaling functions
> +;* Copyright 2020 Google LLC
> +;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
>
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +swizzle: dd 0, 4, 1, 5, 2, 6, 3, 7
> +four: times 8 dd 4
> +
> +SECTION .text
> +
>
> +;-----------------------------------------------------------------------------
> +; horizontal line scaling
> +;
> +; void hscale8to15_<filterSize>_<opt>
> +;                   (SwsContext *c, int16_t *dst,
> +;                    int dstW, const uint8_t *src,
> +;                    const int16_t *filter,
> +;                    const int32_t *filterPos, int filterSize);
> +;
> +; Scale one horizontal line. Input is 8-bit width Filter is 14 bits.
> Output is
> +; 15 bits (in int16_t). Each output pixel is generated from $filterSize
> input
> +; pixels, the position of the first pixel is given in
> filterPos[nOutputPixel].
>
> +;-----------------------------------------------------------------------------
> +
> +%macro SCALE_FUNC 1
> +cglobal hscale8to15_%1, 7, 9, 15, pos0, dst, w, srcmem, filter, fltpos,
> fltsize, count, inner
> +  pxor m0, m0
> +  movu m15, [swizzle]
> +  mov countq, $0
> +%ifidn %1, X4
> +  movu m14, [four]
> +  movsxd fltsizeq, fltsized
> +  shr fltsizeq, 2
> +%endif
> +.loop:
> +  movu m1, [fltposq]
> +  movu m2, [fltposq+32]
> +%ifidn %1, X4
> +  pxor m9, m9
> +  pxor m10, m10
> +  pxor m11, m11
> +  pxor m12, m12
> +  mov innerq, $0
> +.innerloop:
> +%endif
> +  vpcmpeqd  m13, m13
> +  vpgatherdd m3,[srcmemq + m1], m13
> +  vpcmpeqd  m13, m13
> +  vpgatherdd m4,[srcmemq + m2], m13
> +  vpunpcklbw m5, m3, m0
> +  vpunpckhbw m6, m3, m0
> +  vpunpcklbw m7, m4, m0
> +  vpunpckhbw m8, m4, m0
> +  vpmaddwd m5, m5, [filterq]
> +  vpmaddwd m6, m6, [filterq + 32]
> +  vpmaddwd m7, m7, [filterq + 64]
> +  vpmaddwd m8, m8, [filterq + 96]
> +  add filterq, $80
> +%ifidn %1, X4
> +  paddd m9, m5
> +  paddd m10, m6
> +  paddd m11, m7
> +  paddd m12, m8
> +  paddd m1, m14
> +  paddd m2, m14
> +  add innerq, $1
> +  cmp innerq, fltsizeq
> +  jl .innerloop
> +  vphaddd m5, m9, m10
> +  vphaddd m6, m11, m12
> +%else
> +  vphaddd m5, m5, m6
> +  vphaddd m6, m7, m8
> +%endif
> +  vpsrad  m5, 7
> +  vpsrad  m6, 7
> +  vpackssdw m5, m5, m6
> +  vpermd m5, m15, m5
> +  vmovdqu [dstq + countq * 2], m5
> +  add fltposq, $40
> +  add countq, $10
> +  cmp countq, wq
> +  jl .loop
> +REP_RET
> +%endmacro
> +
> +%if ARCH_X86_64
> +INIT_YMM avx2
> +SCALE_FUNC 4
> +SCALE_FUNC X4
> +%endif
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 0848a31461..164b06d6ba 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -276,6 +276,9 @@ SCALE_FUNCS_SSE(sse2);
>  SCALE_FUNCS_SSE(ssse3);
>  SCALE_FUNCS_SSE(sse4);
>
> +SCALE_FUNC(4, 8, 15, avx2);
> +SCALE_FUNC(X4, 8, 15, avx2);
> +
>  #define VSCALEX_FUNC(size, opt) \
>  void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int
> filterSize, \
>                                          const int16_t **src, uint8_t
> *dest, int dstW, \
> @@ -568,6 +571,22 @@ switch(c->dstBpc){ \
>      }
>
>  #if ARCH_X86_64
> +#define ASSIGN_AVX2_SCALE_FUNC(hscalefn, filtersize) \
> +    switch (filtersize) { \
> +    case 4:  hscalefn = ff_hscale8to15_4_avx2; break; \
> +    default:  hscalefn = ff_hscale8to15_X4_avx2; break; \
> +             break; \
> +    }
> +
> +    if (EXTERNAL_AVX2_FAST(cpu_flags)){
> +      if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
> +        if(c->chrDstW % 16 == 0)
> +          ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
> +        if(c->dstW % 16 == 0)
> +          ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
> +      }
> +    }
> +
>      if (EXTERNAL_AVX2_FAST(cpu_flags)) {
>          switch (c->dstFormat) {
>          case AV_PIX_FMT_NV12:
> diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
> index 40c5eb3aa8..103b1aa5da 100644
> --- a/tests/checkasm/sw_scale.c
> +++ b/tests/checkasm/sw_scale.c
> @@ -135,13 +135,13 @@ static void check_yuv2yuvX(void)
>  }
>
>  #undef SRC_PIXELS
> -#define SRC_PIXELS 128
> +#define SRC_PIXELS 512
>
>  static void check_hscale(void)
>  {
>  #define MAX_FILTER_WIDTH 40
> -#define FILTER_SIZES 5
> -    static const int filter_sizes[FILTER_SIZES] = { 4, 8, 16, 32, 40 };
> +#define FILTER_SIZES 6
> +    static const int filter_sizes[FILTER_SIZES] = { 4, 8, 12, 16, 32, 40
> };
>
>  #define HSCALE_PAIRS 2
>      static const int hscale_pairs[HSCALE_PAIRS][2] = {
> @@ -160,6 +160,8 @@ static void check_hscale(void)
>      // padded
>      LOCAL_ALIGNED_32(int16_t, filter, [SRC_PIXELS * MAX_FILTER_WIDTH +
> MAX_FILTER_WIDTH]);
>      LOCAL_ALIGNED_32(int32_t, filterPos, [SRC_PIXELS]);
> +    LOCAL_ALIGNED_32(int16_t, filterAvx2, [SRC_PIXELS * MAX_FILTER_WIDTH
> + MAX_FILTER_WIDTH]);
> +    LOCAL_ALIGNED_32(int32_t, filterPosAvx, [SRC_PIXELS]);
>
>      // The dst parameter here is either int16_t or int32_t but we use
> void* to
>      // just cover both cases.
> @@ -167,6 +169,8 @@ static void check_hscale(void)
>                        const uint8_t *src, const int16_t *filter,
>                        const int32_t *filterPos, int filterSize);
>
> +    int cpu_flags = av_get_cpu_flags();
> +
>      ctx = sws_alloc_context();
>      if (sws_init_context(ctx, NULL, NULL) < 0)
>          fail();
> @@ -180,9 +184,11 @@ static void check_hscale(void)
>              ctx->srcBpc = hscale_pairs[hpi][0];
>              ctx->dstBpc = hscale_pairs[hpi][1];
>              ctx->hLumFilterSize = ctx->hChrFilterSize = width;
> +            ctx->dstW = ctx->chrDstW = SRC_PIXELS;
>
>              for (i = 0; i < SRC_PIXELS; i++) {
>                  filterPos[i] = i;
> +                filterPosAvx[i] = i;
>
>                  // These filter cofficients are chosen to try break two
> corner
>                  // cases, namely:
> @@ -211,16 +217,20 @@ static void check_hscale(void)
>                  filter[SRC_PIXELS * width + i] = rnd();
>              }
>              ff_sws_init_scale(ctx);
> +            memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS *
> MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
> +            if (cpu_flags & AV_CPU_FLAG_AVX2){
> +                ff_shuffle_filter_coefficients(ctx, filterPosAvx, width,
> filterAvx2, SRC_PIXELS);
> +            }
>
>              if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d",
> ctx->srcBpc, ctx->dstBpc + 1, width)) {
>                  memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));
>                  memset(dst1, 0, SRC_PIXELS * sizeof(dst1[0]));
>
>                  call_ref(NULL, dst0, SRC_PIXELS, src, filter, filterPos,
> width);
> -                call_new(NULL, dst1, SRC_PIXELS, src, filter, filterPos,
> width);
> +                call_new(NULL, dst1, SRC_PIXELS, src, filterAvx2,
> filterPosAvx, width);
>                  if (memcmp(dst0, dst1, SRC_PIXELS * sizeof(dst0[0])))
>                      fail();
> -                bench_new(NULL, dst0, SRC_PIXELS, src, filter, filterPos,
> width);
> +                bench_new(NULL, dst0, SRC_PIXELS, src, filter,
> filterPosAvx, width);
>              }
>          }
>      }
> --
> 2.32.0.402.g57bb445576-goog
>
>
Part 1 of this patch has been abandoned as it is no longer required. Are
there any further comments on this patch or can it be merged?

Thanks
Alan Kelly July 26, 2021, 11:44 a.m. UTC | #2
On Wed, Jul 21, 2021 at 11:11 AM Alan Kelly <alankelly@google.com> wrote:

>
>
> On Fri, Jul 16, 2021 at 3:48 PM Alan Kelly <alankelly@google.com> wrote:
>
>> These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.
>> ---
>> EXTERNAL_AVX2_FAST is now used instead of EXTERNAL_AVX2_FAST_GATHER as
>> discussed in the email thread for part 1 of this patch.
>>
>> Benchmark results on Skylake and Haswell:
>>
>>                                 Skylake Haswell
>> hscale_8_to_15_width4_ssse3     761.2   760
>> hscale_8_to_15_width4_avx2      468.7   957
>> hscale_8_to_15_width8_ssse3     1170.7  1032
>> hscale_8_to_15_width8_avx2      865.7   1979
>> hscale_8_to_15_width12_ssse3    2172.2  2472
>> hscale_8_to_15_width12_avx2     1245.7  2901
>> hscale_8_to_15_width16_ssse3    2244.2  2400
>> hscale_8_to_15_width16_avx2     1647.2  3681
>>
>>  libswscale/swscale_internal.h |   2 +
>>  libswscale/utils.c            |  37 +++++++++++
>>  libswscale/x86/Makefile       |   1 +
>>  libswscale/x86/scale_avx2.asm | 112 ++++++++++++++++++++++++++++++++++
>>  libswscale/x86/swscale.c      |  19 ++++++
>>  tests/checkasm/sw_scale.c     |  20 ++++--
>>  6 files changed, 186 insertions(+), 5 deletions(-)
>>  create mode 100644 libswscale/x86/scale_avx2.asm
>>
>> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
>> index 673407636a..fba3dabe5b 100644
>> --- a/libswscale/swscale_internal.h
>> +++ b/libswscale/swscale_internal.h
>> @@ -1064,4 +1064,6 @@ void ff_init_vscale_pfn(SwsContext *c,
>> yuv2planar1_fn yuv2plane1, yuv2planarX_fn
>>  //number of extra lines to process
>>  #define MAX_LINES_AHEAD 4
>>
>> +//shuffle filter and filterPos for hyScale and hcScale filters in avx2
>> +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int
>> filterSize, int16_t *filter, int dstW);
>>  #endif /* SWSCALE_SWSCALE_INTERNAL_H */
>> diff --git a/libswscale/utils.c b/libswscale/utils.c
>> index 176fc6fd63..0577fd5490 100644
>> --- a/libswscale/utils.c
>> +++ b/libswscale/utils.c
>> @@ -268,6 +268,41 @@ static const FormatEntry format_entries[] = {
>>      [AV_PIX_FMT_X2RGB10LE]   = { 1, 1 },
>>  };
>>
>> +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int
>> filterSize, int16_t *filter, int dstW){
>> +#if ARCH_X86_64
>> +    int i, j, k, l;
>> +    int cpu_flags = av_get_cpu_flags();
>> +    if (EXTERNAL_AVX2_FAST(cpu_flags)){
>> +        if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
>> +            if (dstW % 16 == 0){
>> +                if (filter != NULL){
>> +                    for (i = 0; i < dstW; i += 8){
>> +                        FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
>> +                        FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
>> +                    }
>> +                    if (filterSize > 4){
>> +                        int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
>> +                        memcpy(tmp2, filter, dstW * filterSize * 2);
>> +                        for (i = 0; i < dstW; i += 16){//pixel
>> +                            for (k = 0; k < filterSize / 4; ++k){//fcoeff
>> +                                for (j = 0; j < 16; ++j){//inner pixel
>> +                                    for (l = 0; l < 4; ++l){//coeff
>> +                                        int from = i * filterSize + j *
>> filterSize + k * 4 + l;
>> +                                        int to = (i) * filterSize + j *
>> 4 + l + k * 64;
>> +                                        filter[to] = tmp2[from];
>> +                                    }
>> +                                }
>> +                            }
>> +                        }
>> +                        av_free(tmp2);
>> +                    }
>> +                }
>> +            }
>> +        }
>> +    }
>> +#endif
>> +}
>> +
>>  int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
>>  {
>>      return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ?
>> @@ -1699,6 +1734,7 @@ av_cold int sws_init_context(SwsContext *c,
>> SwsFilter *srcFilter,
>>                             get_local_pos(c, 0, 0, 0),
>>                             get_local_pos(c, 0, 0, 0))) < 0)
>>                  goto fail;
>> +            ff_shuffle_filter_coefficients(c, c->hLumFilterPos,
>> c->hLumFilterSize, c->hLumFilter, dstW);
>>              if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
>>                             &c->hChrFilterSize, c->chrXInc,
>>                             c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
>> @@ -1708,6 +1744,7 @@ av_cold int sws_init_context(SwsContext *c,
>> SwsFilter *srcFilter,
>>                             get_local_pos(c, c->chrSrcHSubSample,
>> c->src_h_chr_pos, 0),
>>                             get_local_pos(c, c->chrDstHSubSample,
>> c->dst_h_chr_pos, 0))) < 0)
>>                  goto fail;
>> +            ff_shuffle_filter_coefficients(c, c->hChrFilterPos,
>> c->hChrFilterSize, c->hChrFilter, c->chrDstW);
>>          }
>>      } // initialize horizontal stuff
>>
>> diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
>> index bfe383364e..68391494be 100644
>> --- a/libswscale/x86/Makefile
>> +++ b/libswscale/x86/Makefile
>> @@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
>>  X86ASM-OBJS                     += x86/input.o                          \
>>                                     x86/output.o                         \
>>                                     x86/scale.o                          \
>> +                                   x86/scale_avx2.o
>>     \
>>                                     x86/rgb_2_rgb.o                      \
>>                                     x86/yuv_2_rgb.o                      \
>>                                     x86/yuv2yuvX.o                       \
>> diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
>> new file mode 100644
>> index 0000000000..d90fd2d791
>> --- /dev/null
>> +++ b/libswscale/x86/scale_avx2.asm
>> @@ -0,0 +1,112 @@
>>
>> +;******************************************************************************
>> +;* x86-optimized horizontal line scaling functions
>> +;* Copyright 2020 Google LLC
>> +;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
>> +;*
>> +;* This file is part of FFmpeg.
>> +;*
>> +;* FFmpeg is free software; you can redistribute it and/or
>> +;* modify it under the terms of the GNU Lesser General Public
>> +;* License as published by the Free Software Foundation; either
>> +;* version 2.1 of the License, or (at your option) any later version.
>> +;*
>> +;* FFmpeg is distributed in the hope that it will be useful,
>> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +;* Lesser General Public License for more details.
>> +;*
>> +;* You should have received a copy of the GNU Lesser General Public
>> +;* License along with FFmpeg; if not, write to the Free Software
>> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>>
>> +;******************************************************************************
>> +
>> +%include "libavutil/x86/x86util.asm"
>> +
>> +SECTION_RODATA
>> +
>> +swizzle: dd 0, 4, 1, 5, 2, 6, 3, 7
>> +four: times 8 dd 4
>> +
>> +SECTION .text
>> +
>>
>> +;-----------------------------------------------------------------------------
>> +; horizontal line scaling
>> +;
>> +; void hscale8to15_<filterSize>_<opt>
>> +;                   (SwsContext *c, int16_t *dst,
>> +;                    int dstW, const uint8_t *src,
>> +;                    const int16_t *filter,
>> +;                    const int32_t *filterPos, int filterSize);
>> +;
>> +; Scale one horizontal line. Input is 8-bit width Filter is 14 bits.
>> Output is
>> +; 15 bits (in int16_t). Each output pixel is generated from $filterSize
>> input
>> +; pixels, the position of the first pixel is given in
>> filterPos[nOutputPixel].
>>
>> +;-----------------------------------------------------------------------------
>> +
>> +%macro SCALE_FUNC 1
>> +cglobal hscale8to15_%1, 7, 9, 15, pos0, dst, w, srcmem, filter, fltpos,
>> fltsize, count, inner
>> +  pxor m0, m0
>> +  movu m15, [swizzle]
>> +  mov countq, $0
>> +%ifidn %1, X4
>> +  movu m14, [four]
>> +  movsxd fltsizeq, fltsized
>> +  shr fltsizeq, 2
>> +%endif
>> +.loop:
>> +  movu m1, [fltposq]
>> +  movu m2, [fltposq+32]
>> +%ifidn %1, X4
>> +  pxor m9, m9
>> +  pxor m10, m10
>> +  pxor m11, m11
>> +  pxor m12, m12
>> +  mov innerq, $0
>> +.innerloop:
>> +%endif
>> +  vpcmpeqd  m13, m13
>> +  vpgatherdd m3,[srcmemq + m1], m13
>> +  vpcmpeqd  m13, m13
>> +  vpgatherdd m4,[srcmemq + m2], m13
>> +  vpunpcklbw m5, m3, m0
>> +  vpunpckhbw m6, m3, m0
>> +  vpunpcklbw m7, m4, m0
>> +  vpunpckhbw m8, m4, m0
>> +  vpmaddwd m5, m5, [filterq]
>> +  vpmaddwd m6, m6, [filterq + 32]
>> +  vpmaddwd m7, m7, [filterq + 64]
>> +  vpmaddwd m8, m8, [filterq + 96]
>> +  add filterq, $80
>> +%ifidn %1, X4
>> +  paddd m9, m5
>> +  paddd m10, m6
>> +  paddd m11, m7
>> +  paddd m12, m8
>> +  paddd m1, m14
>> +  paddd m2, m14
>> +  add innerq, $1
>> +  cmp innerq, fltsizeq
>> +  jl .innerloop
>> +  vphaddd m5, m9, m10
>> +  vphaddd m6, m11, m12
>> +%else
>> +  vphaddd m5, m5, m6
>> +  vphaddd m6, m7, m8
>> +%endif
>> +  vpsrad  m5, 7
>> +  vpsrad  m6, 7
>> +  vpackssdw m5, m5, m6
>> +  vpermd m5, m15, m5
>> +  vmovdqu [dstq + countq * 2], m5
>> +  add fltposq, $40
>> +  add countq, $10
>> +  cmp countq, wq
>> +  jl .loop
>> +REP_RET
>> +%endmacro
>> +
>> +%if ARCH_X86_64
>> +INIT_YMM avx2
>> +SCALE_FUNC 4
>> +SCALE_FUNC X4
>> +%endif
>> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
>> index 0848a31461..164b06d6ba 100644
>> --- a/libswscale/x86/swscale.c
>> +++ b/libswscale/x86/swscale.c
>> @@ -276,6 +276,9 @@ SCALE_FUNCS_SSE(sse2);
>>  SCALE_FUNCS_SSE(ssse3);
>>  SCALE_FUNCS_SSE(sse4);
>>
>> +SCALE_FUNC(4, 8, 15, avx2);
>> +SCALE_FUNC(X4, 8, 15, avx2);
>> +
>>  #define VSCALEX_FUNC(size, opt) \
>>  void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int
>> filterSize, \
>>                                          const int16_t **src, uint8_t
>> *dest, int dstW, \
>> @@ -568,6 +571,22 @@ switch(c->dstBpc){ \
>>      }
>>
>>  #if ARCH_X86_64
>> +#define ASSIGN_AVX2_SCALE_FUNC(hscalefn, filtersize) \
>> +    switch (filtersize) { \
>> +    case 4:  hscalefn = ff_hscale8to15_4_avx2; break; \
>> +    default:  hscalefn = ff_hscale8to15_X4_avx2; break; \
>> +             break; \
>> +    }
>> +
>> +    if (EXTERNAL_AVX2_FAST(cpu_flags)){
>> +      if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
>> +        if(c->chrDstW % 16 == 0)
>> +          ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
>> +        if(c->dstW % 16 == 0)
>> +          ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
>> +      }
>> +    }
>> +
>>      if (EXTERNAL_AVX2_FAST(cpu_flags)) {
>>          switch (c->dstFormat) {
>>          case AV_PIX_FMT_NV12:
>> diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
>> index 40c5eb3aa8..103b1aa5da 100644
>> --- a/tests/checkasm/sw_scale.c
>> +++ b/tests/checkasm/sw_scale.c
>> @@ -135,13 +135,13 @@ static void check_yuv2yuvX(void)
>>  }
>>
>>  #undef SRC_PIXELS
>> -#define SRC_PIXELS 128
>> +#define SRC_PIXELS 512
>>
>>  static void check_hscale(void)
>>  {
>>  #define MAX_FILTER_WIDTH 40
>> -#define FILTER_SIZES 5
>> -    static const int filter_sizes[FILTER_SIZES] = { 4, 8, 16, 32, 40 };
>> +#define FILTER_SIZES 6
>> +    static const int filter_sizes[FILTER_SIZES] = { 4, 8, 12, 16, 32, 40
>> };
>>
>>  #define HSCALE_PAIRS 2
>>      static const int hscale_pairs[HSCALE_PAIRS][2] = {
>> @@ -160,6 +160,8 @@ static void check_hscale(void)
>>      // padded
>>      LOCAL_ALIGNED_32(int16_t, filter, [SRC_PIXELS * MAX_FILTER_WIDTH +
>> MAX_FILTER_WIDTH]);
>>      LOCAL_ALIGNED_32(int32_t, filterPos, [SRC_PIXELS]);
>> +    LOCAL_ALIGNED_32(int16_t, filterAvx2, [SRC_PIXELS * MAX_FILTER_WIDTH
>> + MAX_FILTER_WIDTH]);
>> +    LOCAL_ALIGNED_32(int32_t, filterPosAvx, [SRC_PIXELS]);
>>
>>      // The dst parameter here is either int16_t or int32_t but we use
>> void* to
>>      // just cover both cases.
>> @@ -167,6 +169,8 @@ static void check_hscale(void)
>>                        const uint8_t *src, const int16_t *filter,
>>                        const int32_t *filterPos, int filterSize);
>>
>> +    int cpu_flags = av_get_cpu_flags();
>> +
>>      ctx = sws_alloc_context();
>>      if (sws_init_context(ctx, NULL, NULL) < 0)
>>          fail();
>> @@ -180,9 +184,11 @@ static void check_hscale(void)
>>              ctx->srcBpc = hscale_pairs[hpi][0];
>>              ctx->dstBpc = hscale_pairs[hpi][1];
>>              ctx->hLumFilterSize = ctx->hChrFilterSize = width;
>> +            ctx->dstW = ctx->chrDstW = SRC_PIXELS;
>>
>>              for (i = 0; i < SRC_PIXELS; i++) {
>>                  filterPos[i] = i;
>> +                filterPosAvx[i] = i;
>>
>>                  // These filter cofficients are chosen to try break two
>> corner
>>                  // cases, namely:
>> @@ -211,16 +217,20 @@ static void check_hscale(void)
>>                  filter[SRC_PIXELS * width + i] = rnd();
>>              }
>>              ff_sws_init_scale(ctx);
>> +            memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS *
>> MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
>> +            if (cpu_flags & AV_CPU_FLAG_AVX2){
>> +                ff_shuffle_filter_coefficients(ctx, filterPosAvx, width,
>> filterAvx2, SRC_PIXELS);
>> +            }
>>
>>              if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d",
>> ctx->srcBpc, ctx->dstBpc + 1, width)) {
>>                  memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));
>>                  memset(dst1, 0, SRC_PIXELS * sizeof(dst1[0]));
>>
>>                  call_ref(NULL, dst0, SRC_PIXELS, src, filter, filterPos,
>> width);
>> -                call_new(NULL, dst1, SRC_PIXELS, src, filter, filterPos,
>> width);
>> +                call_new(NULL, dst1, SRC_PIXELS, src, filterAvx2,
>> filterPosAvx, width);
>>                  if (memcmp(dst0, dst1, SRC_PIXELS * sizeof(dst0[0])))
>>                      fail();
>> -                bench_new(NULL, dst0, SRC_PIXELS, src, filter,
>> filterPos, width);
>> +                bench_new(NULL, dst0, SRC_PIXELS, src, filter,
>> filterPosAvx, width);
>>              }
>>          }
>>      }
>> --
>> 2.32.0.402.g57bb445576-goog
>>
>>
> Part 1 of this patch has been abandoned as it is no longer required. Are
> there any further comments on this patch or can it be merged?
>
> Thanks
>

Hi,

Ping! Are there any objections/comments on this patch?

Thanks
diff mbox series

Patch

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 673407636a..fba3dabe5b 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1064,4 +1064,6 @@  void ff_init_vscale_pfn(SwsContext *c, yuv2planar1_fn yuv2plane1, yuv2planarX_fn
 //number of extra lines to process
 #define MAX_LINES_AHEAD 4
 
+// Reorder filter and filterPos in place into the layout used by the avx2 hyScale/hcScale functions; leaves them untouched when the avx2 path does not apply.
+void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW);
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 176fc6fd63..0577fd5490 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -268,6 +268,41 @@  static const FormatEntry format_entries[] = {
     [AV_PIX_FMT_X2RGB10LE]   = { 1, 1 },
 };
 
+void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ // reorders filterPos and filter in place for the AVX2 hscale kernels; returns nothing
+#if ARCH_X86_64 // the whole body is compiled out (function becomes a no-op) on other targets
+    int i, j, k, l;
+    int cpu_flags = av_get_cpu_flags();
+    if (EXTERNAL_AVX2_FAST(cpu_flags)){ // nothing is touched unless every nested condition below holds
+        if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
+            if (dstW % 16 == 0){
+                if (filter != NULL){
+                    for (i = 0; i < dstW; i += 8){ // per group of 8 positions: order 0,1,2,3,4,5,6,7 becomes 0,1,4,5,2,3,6,7
+                        FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
+                        FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
+                    }
+                    if (filterSize > 4){ // coefficients are only regrouped for filters longer than 4 taps
+                        int16_t *tmp2 = av_malloc(dstW * filterSize * 2); // NOTE(review): result is not checked before the memcpy below; the 2 is presumably sizeof(int16_t)
+                        memcpy(tmp2, filter, dstW * filterSize * 2); // snapshot of the original order, used as the read side of the permutation
+                        for (i = 0; i < dstW; i += 16){// first pixel of each 16-pixel group
+                            for (k = 0; k < filterSize / 4; ++k){// chunk of 4 taps; NOTE(review): taps past the last full chunk are not moved if filterSize is not a multiple of 4
+                                for (j = 0; j < 16; ++j){// pixel inside the current group
+                                    for (l = 0; l < 4; ++l){// tap inside the current chunk
+                                        int from = i * filterSize + j * filterSize + k * 4 + l; // source: all taps of one pixel are contiguous
+                                        int to = (i) * filterSize + j * 4 + l + k * 64; // destination: chunk k of all 16 pixels is contiguous (16 * 4 = 64 entries)
+                                        filter[to] = tmp2[from];
+                                    }
+                                }
+                            }
+                        }
+                        av_free(tmp2);
+                    }
+                }
+            }
+        }
+    }
+#endif
+}
+
 int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
 {
     return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ?
@@ -1699,6 +1734,7 @@  av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
                            get_local_pos(c, 0, 0, 0),
                            get_local_pos(c, 0, 0, 0))) < 0)
                 goto fail;
+            ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW);
             if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
                            &c->hChrFilterSize, c->chrXInc,
                            c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
@@ -1708,6 +1744,7 @@  av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
                            get_local_pos(c, c->chrSrcHSubSample, c->src_h_chr_pos, 0),
                            get_local_pos(c, c->chrDstHSubSample, c->dst_h_chr_pos, 0))) < 0)
                 goto fail;
+            ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW);
         }
     } // initialize horizontal stuff
 
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index bfe383364e..68391494be 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -11,6 +11,7 @@  OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
 X86ASM-OBJS                     += x86/input.o                          \
                                    x86/output.o                         \
                                    x86/scale.o                          \
+                                   x86/scale_avx2.o                          \
                                    x86/rgb_2_rgb.o                      \
                                    x86/yuv_2_rgb.o                      \
                                    x86/yuv2yuvX.o                       \
diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
new file mode 100644
index 0000000000..d90fd2d791
--- /dev/null
+++ b/libswscale/x86/scale_avx2.asm
@@ -0,0 +1,112 @@ 
+;******************************************************************************
+;* x86-optimized horizontal line scaling functions
+;* Copyright 2020 Google LLC
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+swizzle: dd 0, 4, 1, 5, 2, 6, 3, 7
+four: times 8 dd 4
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; horizontal line scaling
+;
+; void hscale8to15_<filterSize>_<opt>
+;                   (SwsContext *c, int16_t *dst,
+;                    int dstW, const uint8_t *src,
+;                    const int16_t *filter,
+;                    const int32_t *filterPos, int filterSize);
+;
+; Scale one horizontal line. Input is 8 bits wide. Filter is 14 bits. Output is
+; 15 bits (in int16_t). Each output pixel is generated from $filterSize input
+; pixels, the position of the first pixel is given in filterPos[nOutputPixel].
+;-----------------------------------------------------------------------------
+
+%macro SCALE_FUNC 1
+cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, count, inner
+  pxor m0, m0                        ; m0 = 0, interleaved below to zero-extend bytes to words
+  movu m15, [swizzle]                ; m15 is the highest register used, hence 16 vector regs are declared in cglobal
+  mov countq, $0                     ; count = index of the next output pixel
+%ifidn %1, X4
+  movu m14, [four]                   ; per-pass increment applied to the source positions
+  movsxd fltsizeq, fltsized          ; filterSize is a 32-bit int, widen it before 64-bit use
+  shr fltsizeq, 2                    ; number of 4-tap passes
+%endif
+.loop:                               ; one iteration produces 16 output pixels
+  movu m1, [fltposq]                 ; source positions, first 8 entries
+  movu m2, [fltposq+32]              ; source positions, next 8 entries
+%ifidn %1, X4
+  pxor m9, m9                        ; m9-m12 accumulate the partial sums over all passes
+  pxor m10, m10
+  pxor m11, m11
+  pxor m12, m12
+  mov innerq, $0                     ; inner = current 4-tap pass
+.innerloop:
+%endif
+  vpcmpeqd  m13, m13                 ; all-ones gather mask (the gather clears it, so it is rebuilt each time)
+  vpgatherdd m3,[srcmemq + m1], m13  ; 4 source bytes for each of 8 positions
+  vpcmpeqd  m13, m13
+  vpgatherdd m4,[srcmemq + m2], m13
+  vpunpcklbw m5, m3, m0              ; widen the gathered bytes to 16-bit words
+  vpunpckhbw m6, m3, m0
+  vpunpcklbw m7, m4, m0
+  vpunpckhbw m8, m4, m0
+  vpmaddwd m5, m5, [filterq]         ; multiply by coefficients and add adjacent pairs
+  vpmaddwd m6, m6, [filterq + 32]
+  vpmaddwd m7, m7, [filterq + 64]
+  vpmaddwd m8, m8, [filterq + 96]
+  add filterq, $80                   ; $80 is hex: 128 bytes of coefficients were consumed
+%ifidn %1, X4
+  paddd m9, m5
+  paddd m10, m6
+  paddd m11, m7
+  paddd m12, m8
+  paddd m1, m14                      ; advance every source position by 4 for the next pass
+  paddd m2, m14
+  add innerq, $1
+  cmp innerq, fltsizeq
+  jl .innerloop
+  vphaddd m5, m9, m10                ; add the remaining pairs to finish each pixel's sum
+  vphaddd m6, m11, m12
+%else
+  vphaddd m5, m5, m6                 ; add the remaining pairs to finish each pixel's sum
+  vphaddd m6, m7, m8
+%endif
+  vpsrad  m5, 7                      ; arithmetic shift of the 32-bit sums down by 7 bits
+  vpsrad  m6, 7
+  vpackssdw m5, m5, m6               ; saturate to int16
+  vpermd m5, m15, m5                 ; reorder the dwords with the swizzle table before storing
+  vmovdqu [dstq + countq * 2], m5    ; store 16 int16 results
+  add fltposq, $40                   ; hex: 64 bytes = 16 int32 positions consumed
+  add countq, $10                    ; hex: 16 pixels done
+  cmp countd, wd                     ; dstW is a 32-bit int: compare 32-bit halves, the upper half of wq is not guaranteed to be extended
+  jl .loop
+REP_RET
+%endmacro
+
+%if ARCH_X86_64
+INIT_YMM avx2
+SCALE_FUNC 4
+SCALE_FUNC X4
+%endif
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 0848a31461..164b06d6ba 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -276,6 +276,9 @@  SCALE_FUNCS_SSE(sse2);
 SCALE_FUNCS_SSE(ssse3);
 SCALE_FUNCS_SSE(sse4);
 
+SCALE_FUNC(4, 8, 15, avx2);
+SCALE_FUNC(X4, 8, 15, avx2);
+
 #define VSCALEX_FUNC(size, opt) \
 void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
                                         const int16_t **src, uint8_t *dest, int dstW, \
@@ -568,6 +571,22 @@  switch(c->dstBpc){ \
     }
 
 #if ARCH_X86_64
+#define ASSIGN_AVX2_SCALE_FUNC(hscalefn, filtersize) /* select the AVX2 8->15 bit hscale kernel by filter size */ \
+    switch (filtersize) { \
+    case 4:  hscalefn = ff_hscale8to15_4_avx2; break; /* dedicated 4-tap kernel */ \
+    default: hscalefn = ff_hscale8to15_X4_avx2; /* every other size uses the X4 kernel */ \
+             break; \
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags)){
+      if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
+        if(c->chrDstW % 16 == 0)
+          ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
+        if(c->dstW % 16 == 0)
+          ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
+      }
+    }
+
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         switch (c->dstFormat) {
         case AV_PIX_FMT_NV12:
diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 40c5eb3aa8..103b1aa5da 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -135,13 +135,13 @@  static void check_yuv2yuvX(void)
 }
 
 #undef SRC_PIXELS
-#define SRC_PIXELS 128
+#define SRC_PIXELS 512
 
 static void check_hscale(void)
 {
 #define MAX_FILTER_WIDTH 40
-#define FILTER_SIZES 5
-    static const int filter_sizes[FILTER_SIZES] = { 4, 8, 16, 32, 40 };
+#define FILTER_SIZES 6
+    static const int filter_sizes[FILTER_SIZES] = { 4, 8, 12, 16, 32, 40 };
 
 #define HSCALE_PAIRS 2
     static const int hscale_pairs[HSCALE_PAIRS][2] = {
@@ -160,6 +160,8 @@  static void check_hscale(void)
     // padded
     LOCAL_ALIGNED_32(int16_t, filter, [SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH]);
     LOCAL_ALIGNED_32(int32_t, filterPos, [SRC_PIXELS]);
+    LOCAL_ALIGNED_32(int16_t, filterAvx2, [SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH]);
+    LOCAL_ALIGNED_32(int32_t, filterPosAvx, [SRC_PIXELS]);
 
     // The dst parameter here is either int16_t or int32_t but we use void* to
     // just cover both cases.
@@ -167,6 +169,8 @@  static void check_hscale(void)
                       const uint8_t *src, const int16_t *filter,
                       const int32_t *filterPos, int filterSize);
 
+    int cpu_flags = av_get_cpu_flags();
+
     ctx = sws_alloc_context();
     if (sws_init_context(ctx, NULL, NULL) < 0)
         fail();
@@ -180,9 +184,11 @@  static void check_hscale(void)
             ctx->srcBpc = hscale_pairs[hpi][0];
             ctx->dstBpc = hscale_pairs[hpi][1];
             ctx->hLumFilterSize = ctx->hChrFilterSize = width;
+            ctx->dstW = ctx->chrDstW = SRC_PIXELS;
 
             for (i = 0; i < SRC_PIXELS; i++) {
                 filterPos[i] = i;
+                filterPosAvx[i] = i;
 
                 // These filter cofficients are chosen to try break two corner
                 // cases, namely:
@@ -211,16 +217,20 @@  static void check_hscale(void)
                 filter[SRC_PIXELS * width + i] = rnd();
             }
             ff_sws_init_scale(ctx);
+            memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
+            if (cpu_flags & AV_CPU_FLAG_AVX2){
+                ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS);
+            }
 
             if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", ctx->srcBpc, ctx->dstBpc + 1, width)) {
                 memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));
                 memset(dst1, 0, SRC_PIXELS * sizeof(dst1[0]));
 
                 call_ref(NULL, dst0, SRC_PIXELS, src, filter, filterPos, width);
-                call_new(NULL, dst1, SRC_PIXELS, src, filter, filterPos, width);
+                call_new(NULL, dst1, SRC_PIXELS, src, filterAvx2, filterPosAvx, width);
                 if (memcmp(dst0, dst1, SRC_PIXELS * sizeof(dst0[0])))
                     fail();
-                bench_new(NULL, dst0, SRC_PIXELS, src, filter, filterPos, width);
+                bench_new(NULL, dst0, SRC_PIXELS, src, filter, filterPosAvx, width);
             }
         }
     }