Message ID | 20210716134818.1127438-1-alankelly@google.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,1/2] libavutil/cpu: Adds fast gather detection. | expand |
Context | Check | Description |
---|---|---|
andriy/x86_make | success | Make finished |
andriy/x86_make_fate | success | Make fate finished |
andriy/PPC64_make | success | Make finished |
andriy/PPC64_make_fate | success | Make fate finished |
On Fri, Jul 16, 2021 at 3:48 PM Alan Kelly <alankelly@google.com> wrote: > These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. > --- > EXTERNAL_AVX2_FAST is now used instead of EXTERNAL_AVX2_FAST_GATHER as > discussed in the email thread for part 1 of this patch. > > Benchmark results on Skylake and Haswell: > > Skylake Haswell > hscale_8_to_15_width4_ssse3 761.2 760 > hscale_8_to_15_width4_avx2 468.7 957 > hscale_8_to_15_width8_ssse3 1170.7 1032 > hscale_8_to_15_width8_avx2 865.7 1979 > hscale_8_to_15_width12_ssse3 2172.2 2472 > hscale_8_to_15_width12_avx2 1245.7 2901 > hscale_8_to_15_width16_ssse3 2244.2 2400 > hscale_8_to_15_width16_avx2 1647.2 3681 > > libswscale/swscale_internal.h | 2 + > libswscale/utils.c | 37 +++++++++++ > libswscale/x86/Makefile | 1 + > libswscale/x86/scale_avx2.asm | 112 ++++++++++++++++++++++++++++++++++ > libswscale/x86/swscale.c | 19 ++++++ > tests/checkasm/sw_scale.c | 20 ++++-- > 6 files changed, 186 insertions(+), 5 deletions(-) > create mode 100644 libswscale/x86/scale_avx2.asm > > diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h > index 673407636a..fba3dabe5b 100644 > --- a/libswscale/swscale_internal.h > +++ b/libswscale/swscale_internal.h > @@ -1064,4 +1064,6 @@ void ff_init_vscale_pfn(SwsContext *c, > yuv2planar1_fn yuv2plane1, yuv2planarX_fn > //number of extra lines to process > #define MAX_LINES_AHEAD 4 > > +//shuffle filter and filterPos for hyScale and hcScale filters in avx2 > +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int > filterSize, int16_t *filter, int dstW); > #endif /* SWSCALE_SWSCALE_INTERNAL_H */ > diff --git a/libswscale/utils.c b/libswscale/utils.c > index 176fc6fd63..0577fd5490 100644 > --- a/libswscale/utils.c > +++ b/libswscale/utils.c > @@ -268,6 +268,41 @@ static const FormatEntry format_entries[] = { > [AV_PIX_FMT_X2RGB10LE] = { 1, 1 }, > }; > > +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int > filterSize, int16_t *filter, int dstW){ > +#if ARCH_X86_64 > + int i, j, k, l; > + int cpu_flags = av_get_cpu_flags(); > + if (EXTERNAL_AVX2_FAST(cpu_flags)){ > + if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ > + if (dstW % 16 == 0){ > + if (filter != NULL){ > + for (i = 0; i < dstW; i += 8){ > + FFSWAP(int, filterPos[i + 2], filterPos[i+4]); > + FFSWAP(int, filterPos[i + 3], filterPos[i+5]); > + } > + if (filterSize > 4){ > + int16_t *tmp2 = av_malloc(dstW * filterSize * 2); > + memcpy(tmp2, filter, dstW * filterSize * 2); > + for (i = 0; i < dstW; i += 16){//pixel > + for (k = 0; k < filterSize / 4; ++k){//fcoeff > + for (j = 0; j < 16; ++j){//inner pixel > + for (l = 0; l < 4; ++l){//coeff > + int from = i * filterSize + j * > filterSize + k * 4 + l; > + int to = (i) * filterSize + j * 4 > + l + k * 64; > + filter[to] = tmp2[from]; > + } > + } > + } > + } > + av_free(tmp2); > + } > + } > + } > + } > + } > +#endif > +} > + > int sws_isSupportedInput(enum AVPixelFormat pix_fmt) > { > return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ? > @@ -1699,6 +1734,7 @@ av_cold int sws_init_context(SwsContext *c, > SwsFilter *srcFilter, > get_local_pos(c, 0, 0, 0), > get_local_pos(c, 0, 0, 0))) < 0) > goto fail; > + ff_shuffle_filter_coefficients(c, c->hLumFilterPos, > c->hLumFilterSize, c->hLumFilter, dstW); > if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos, > &c->hChrFilterSize, c->chrXInc, > c->chrSrcW, c->chrDstW, filterAlign, 1 << 14, > @@ -1708,6 +1744,7 @@ av_cold int sws_init_context(SwsContext *c, > SwsFilter *srcFilter, > get_local_pos(c, c->chrSrcHSubSample, > c->src_h_chr_pos, 0), > get_local_pos(c, c->chrDstHSubSample, > c->dst_h_chr_pos, 0))) < 0) > goto fail; > + ff_shuffle_filter_coefficients(c, c->hChrFilterPos, > c->hChrFilterSize, c->hChrFilter, c->chrDstW); > } > } // initialize horizontal stuff > > diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile > index bfe383364e..68391494be 100644 > --- a/libswscale/x86/Makefile > +++ b/libswscale/x86/Makefile > @@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o > X86ASM-OBJS += x86/input.o \ > x86/output.o \ > x86/scale.o \ > + x86/scale_avx2.o > \ > x86/rgb_2_rgb.o \ > x86/yuv_2_rgb.o \ > x86/yuv2yuvX.o \ > diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm > new file mode 100644 > index 0000000000..d90fd2d791 > --- /dev/null > +++ b/libswscale/x86/scale_avx2.asm > @@ -0,0 +1,112 @@ > > +;****************************************************************************** > +;* x86-optimized horizontal line scaling functions > +;* Copyright 2020 Google LLC > +;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_RODATA > + > +swizzle: dd 0, 4, 1, 5, 2, 6, 3, 7 > +four: times 8 dd 4 > + > +SECTION .text > + > > +;----------------------------------------------------------------------------- > +; horizontal line scaling > +; > +; void hscale8to15_<filterSize>_<opt> > +; (SwsContext *c, int16_t *dst, > +; int dstW, const uint8_t *src, > +; const int16_t *filter, > +; const int32_t *filterPos, int filterSize); > +; > +; Scale one horizontal line. Input is 8-bit width Filter is 14 bits. > Output is > +; 15 bits (in int16_t). Each output pixel is generated from $filterSize > input > +; pixels, the position of the first pixel is given in > filterPos[nOutputPixel]. > > +;----------------------------------------------------------------------------- > + > +%macro SCALE_FUNC 1 > +cglobal hscale8to15_%1, 7, 9, 15, pos0, dst, w, srcmem, filter, fltpos, > fltsize, count, inner > + pxor m0, m0 > + movu m15, [swizzle] > + mov countq, $0 > +%ifidn %1, X4 > + movu m14, [four] > + movsxd fltsizeq, fltsized > + shr fltsizeq, 2 > +%endif > +.loop: > + movu m1, [fltposq] > + movu m2, [fltposq+32] > +%ifidn %1, X4 > + pxor m9, m9 > + pxor m10, m10 > + pxor m11, m11 > + pxor m12, m12 > + mov innerq, $0 > +.innerloop: > +%endif > + vpcmpeqd m13, m13 > + vpgatherdd m3,[srcmemq + m1], m13 > + vpcmpeqd m13, m13 > + vpgatherdd m4,[srcmemq + m2], m13 > + vpunpcklbw m5, m3, m0 > + vpunpckhbw m6, m3, m0 > + vpunpcklbw m7, m4, m0 > + vpunpckhbw m8, m4, m0 > + vpmaddwd m5, m5, [filterq] > + vpmaddwd m6, m6, [filterq + 32] > + vpmaddwd m7, m7, [filterq + 64] > + vpmaddwd m8, m8, [filterq + 96] > + add filterq, $80 > +%ifidn %1, X4 > + paddd m9, m5 > + paddd m10, m6 > + paddd m11, m7 > + paddd m12, m8 > + paddd m1, m14 > + paddd m2, m14 > + add innerq, $1 > + cmp innerq, fltsizeq > + jl .innerloop > + vphaddd m5, m9, m10 > + vphaddd m6, m11, m12 > +%else > + vphaddd m5, m5, m6 > + vphaddd m6, m7, m8 > +%endif > + vpsrad m5, 7 > + vpsrad m6, 7 > + vpackssdw m5, m5, m6 > + vpermd m5, m15, m5 > + vmovdqu [dstq + countq * 2], m5 > + add fltposq, $40 > + add countq, $10 > + cmp countq, wq > + jl .loop > +REP_RET > +%endmacro > + > +%if ARCH_X86_64 > +INIT_YMM avx2 > +SCALE_FUNC 4 > +SCALE_FUNC X4 > +%endif > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > index 0848a31461..164b06d6ba 100644 > --- a/libswscale/x86/swscale.c > +++ b/libswscale/x86/swscale.c > @@ -276,6 +276,9 @@ SCALE_FUNCS_SSE(sse2); > SCALE_FUNCS_SSE(ssse3); > SCALE_FUNCS_SSE(sse4); > > +SCALE_FUNC(4, 8, 15, avx2); > +SCALE_FUNC(X4, 8, 15, avx2); > + > #define VSCALEX_FUNC(size, opt) \ > void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int > filterSize, \ > const int16_t **src, uint8_t > *dest, int dstW, \ > @@ -568,6 +571,22 @@ switch(c->dstBpc){ \ > } > > #if ARCH_X86_64 > +#define ASSIGN_AVX2_SCALE_FUNC(hscalefn, filtersize) \ > + switch (filtersize) { \ > + case 4: hscalefn = ff_hscale8to15_4_avx2; break; \ > + default: hscalefn = ff_hscale8to15_X4_avx2; break; \ > + break; \ > + } > + > + if (EXTERNAL_AVX2_FAST(cpu_flags)){ > + if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ > + if(c->chrDstW % 16 == 0) > + ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); > + if(c->dstW % 16 == 0) > + ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); > + } > + } > + > if (EXTERNAL_AVX2_FAST(cpu_flags)) { > switch (c->dstFormat) { > case AV_PIX_FMT_NV12: > diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c > index 40c5eb3aa8..103b1aa5da 100644 > --- a/tests/checkasm/sw_scale.c > +++ b/tests/checkasm/sw_scale.c > @@ -135,13 +135,13 @@ static void check_yuv2yuvX(void) > } > > #undef SRC_PIXELS > -#define SRC_PIXELS 128 > +#define SRC_PIXELS 512 > > static void check_hscale(void) > { > #define MAX_FILTER_WIDTH 40 > -#define FILTER_SIZES 5 > - static const int filter_sizes[FILTER_SIZES] = { 4, 8, 16, 32, 40 }; > +#define FILTER_SIZES 6 > + static const int filter_sizes[FILTER_SIZES] = { 4, 8, 12, 16, 32, 40 > }; > > #define HSCALE_PAIRS 2 > static const int hscale_pairs[HSCALE_PAIRS][2] = { > @@ -160,6 +160,8 @@ static void check_hscale(void) > // padded > LOCAL_ALIGNED_32(int16_t, filter, [SRC_PIXELS * MAX_FILTER_WIDTH + > MAX_FILTER_WIDTH]); > LOCAL_ALIGNED_32(int32_t, filterPos, [SRC_PIXELS]); > + LOCAL_ALIGNED_32(int16_t, filterAvx2, [SRC_PIXELS * MAX_FILTER_WIDTH > + MAX_FILTER_WIDTH]); > + LOCAL_ALIGNED_32(int32_t, filterPosAvx, [SRC_PIXELS]); > > // The dst parameter here is either int16_t or int32_t but we use > void* to > // just cover both cases. > @@ -167,6 +169,8 @@ static void check_hscale(void) > const uint8_t *src, const int16_t *filter, > const int32_t *filterPos, int filterSize); > > + int cpu_flags = av_get_cpu_flags(); > + > ctx = sws_alloc_context(); > if (sws_init_context(ctx, NULL, NULL) < 0) > fail(); > @@ -180,9 +184,11 @@ static void check_hscale(void) > ctx->srcBpc = hscale_pairs[hpi][0]; > ctx->dstBpc = hscale_pairs[hpi][1]; > ctx->hLumFilterSize = ctx->hChrFilterSize = width; > + ctx->dstW = ctx->chrDstW = SRC_PIXELS; > > for (i = 0; i < SRC_PIXELS; i++) { > filterPos[i] = i; > + filterPosAvx[i] = i; > > // These filter cofficients are chosen to try break two > corner > // cases, namely: > @@ -211,16 +217,20 @@ static void check_hscale(void) > filter[SRC_PIXELS * width + i] = rnd(); > } > ff_sws_init_scale(ctx); > + memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * > MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); > + if (cpu_flags & AV_CPU_FLAG_AVX2){ > + ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, > filterAvx2, SRC_PIXELS); > + } > > if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", > ctx->srcBpc, ctx->dstBpc + 1, width)) { > memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); > memset(dst1, 0, SRC_PIXELS * sizeof(dst1[0])); > > call_ref(NULL, dst0, SRC_PIXELS, src, filter, filterPos, > width); > - call_new(NULL, dst1, SRC_PIXELS, src, filter, filterPos, > width); > + call_new(NULL, dst1, SRC_PIXELS, src, filterAvx2, > filterPosAvx, width); > if (memcmp(dst0, dst1, SRC_PIXELS * sizeof(dst0[0]))) > fail(); > - bench_new(NULL, dst0, SRC_PIXELS, src, filter, filterPos, > width); > + bench_new(NULL, dst0, SRC_PIXELS, src, filter, > filterPosAvx, width); > } > } > } > -- > 2.32.0.402.g57bb445576-goog > > Part 1 of this patch has been abandoned as it is no longer required. Are there any further comments on this patch or can it be merged? Thanks
On Wed, Jul 21, 2021 at 11:11 AM Alan Kelly <alankelly@google.com> wrote: > > > On Fri, Jul 16, 2021 at 3:48 PM Alan Kelly <alankelly@google.com> wrote: > >> These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. >> --- >> EXTERNAL_AVX2_FAST is now used instead of EXTERNAL_AVX2_FAST_GATHER as >> discussed in the email thread for part 1 of this patch. >> >> Benchmark results on Skylake and Haswell: >> >> Skylake Haswell >> hscale_8_to_15_width4_ssse3 761.2 760 >> hscale_8_to_15_width4_avx2 468.7 957 >> hscale_8_to_15_width8_ssse3 1170.7 1032 >> hscale_8_to_15_width8_avx2 865.7 1979 >> hscale_8_to_15_width12_ssse3 2172.2 2472 >> hscale_8_to_15_width12_avx2 1245.7 2901 >> hscale_8_to_15_width16_ssse3 2244.2 2400 >> hscale_8_to_15_width16_avx2 1647.2 3681 >> >> libswscale/swscale_internal.h | 2 + >> libswscale/utils.c | 37 +++++++++++ >> libswscale/x86/Makefile | 1 + >> libswscale/x86/scale_avx2.asm | 112 ++++++++++++++++++++++++++++++++++ >> libswscale/x86/swscale.c | 19 ++++++ >> tests/checkasm/sw_scale.c | 20 ++++-- >> 6 files changed, 186 insertions(+), 5 deletions(-) >> create mode 100644 libswscale/x86/scale_avx2.asm >> >> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h >> index 673407636a..fba3dabe5b 100644 >> --- a/libswscale/swscale_internal.h >> +++ b/libswscale/swscale_internal.h >> @@ -1064,4 +1064,6 @@ void ff_init_vscale_pfn(SwsContext *c, >> yuv2planar1_fn yuv2plane1, yuv2planarX_fn >> //number of extra lines to process >> #define MAX_LINES_AHEAD 4 >> >> +//shuffle filter and filterPos for hyScale and hcScale filters in avx2 >> +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int >> filterSize, int16_t *filter, int dstW); >> #endif /* SWSCALE_SWSCALE_INTERNAL_H */ >> diff --git a/libswscale/utils.c b/libswscale/utils.c >> index 176fc6fd63..0577fd5490 100644 >> --- a/libswscale/utils.c >> +++ b/libswscale/utils.c >> @@ -268,6 +268,41 @@ static const FormatEntry format_entries[] = { >> [AV_PIX_FMT_X2RGB10LE] = { 1, 1 }, >> }; >> >> +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int >> filterSize, int16_t *filter, int dstW){ >> +#if ARCH_X86_64 >> + int i, j, k, l; >> + int cpu_flags = av_get_cpu_flags(); >> + if (EXTERNAL_AVX2_FAST(cpu_flags)){ >> + if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ >> + if (dstW % 16 == 0){ >> + if (filter != NULL){ >> + for (i = 0; i < dstW; i += 8){ >> + FFSWAP(int, filterPos[i + 2], filterPos[i+4]); >> + FFSWAP(int, filterPos[i + 3], filterPos[i+5]); >> + } >> + if (filterSize > 4){ >> + int16_t *tmp2 = av_malloc(dstW * filterSize * 2); >> + memcpy(tmp2, filter, dstW * filterSize * 2); >> + for (i = 0; i < dstW; i += 16){//pixel >> + for (k = 0; k < filterSize / 4; ++k){//fcoeff >> + for (j = 0; j < 16; ++j){//inner pixel >> + for (l = 0; l < 4; ++l){//coeff >> + int from = i * filterSize + j * >> filterSize + k * 4 + l; >> + int to = (i) * filterSize + j * >> 4 + l + k * 64; >> + filter[to] = tmp2[from]; >> + } >> + } >> + } >> + } >> + av_free(tmp2); >> + } >> + } >> + } >> + } >> + } >> +#endif >> +} >> + >> int sws_isSupportedInput(enum AVPixelFormat pix_fmt) >> { >> return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ? >> @@ -1699,6 +1734,7 @@ av_cold int sws_init_context(SwsContext *c, >> SwsFilter *srcFilter, >> get_local_pos(c, 0, 0, 0), >> get_local_pos(c, 0, 0, 0))) < 0) >> goto fail; >> + ff_shuffle_filter_coefficients(c, c->hLumFilterPos, >> c->hLumFilterSize, c->hLumFilter, dstW); >> if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos, >> &c->hChrFilterSize, c->chrXInc, >> c->chrSrcW, c->chrDstW, filterAlign, 1 << 14, >> @@ -1708,6 +1744,7 @@ av_cold int sws_init_context(SwsContext *c, >> SwsFilter *srcFilter, >> get_local_pos(c, c->chrSrcHSubSample, >> c->src_h_chr_pos, 0), >> get_local_pos(c, c->chrDstHSubSample, >> c->dst_h_chr_pos, 0))) < 0) >> goto fail; >> + ff_shuffle_filter_coefficients(c, c->hChrFilterPos, >> c->hChrFilterSize, c->hChrFilter, c->chrDstW); >> } >> } // initialize horizontal stuff >> >> diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile >> index bfe383364e..68391494be 100644 >> --- a/libswscale/x86/Makefile >> +++ b/libswscale/x86/Makefile >> @@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o >> X86ASM-OBJS += x86/input.o \ >> x86/output.o \ >> x86/scale.o \ >> + x86/scale_avx2.o >> \ >> x86/rgb_2_rgb.o \ >> x86/yuv_2_rgb.o \ >> x86/yuv2yuvX.o \ >> diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm >> new file mode 100644 >> index 0000000000..d90fd2d791 >> --- /dev/null >> +++ b/libswscale/x86/scale_avx2.asm >> @@ -0,0 +1,112 @@ >> >> +;****************************************************************************** >> +;* x86-optimized horizontal line scaling functions >> +;* Copyright 2020 Google LLC >> +;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> >> +;* >> +;* This file is part of FFmpeg. >> +;* >> +;* FFmpeg is free software; you can redistribute it and/or >> +;* modify it under the terms of the GNU Lesser General Public >> +;* License as published by the Free Software Foundation; either >> +;* version 2.1 of the License, or (at your option) any later version. >> +;* >> +;* FFmpeg is distributed in the hope that it will be useful, >> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of >> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> +;* Lesser General Public License for more details. >> +;* >> +;* You should have received a copy of the GNU Lesser General Public >> +;* License along with FFmpeg; if not, write to the Free Software >> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA >> 02110-1301 USA >> >> +;****************************************************************************** >> + >> +%include "libavutil/x86/x86util.asm" >> + >> +SECTION_RODATA >> + >> +swizzle: dd 0, 4, 1, 5, 2, 6, 3, 7 >> +four: times 8 dd 4 >> + >> +SECTION .text >> + >> >> +;----------------------------------------------------------------------------- >> +; horizontal line scaling >> +; >> +; void hscale8to15_<filterSize>_<opt> >> +; (SwsContext *c, int16_t *dst, >> +; int dstW, const uint8_t *src, >> +; const int16_t *filter, >> +; const int32_t *filterPos, int filterSize); >> +; >> +; Scale one horizontal line. Input is 8-bit width Filter is 14 bits. >> Output is >> +; 15 bits (in int16_t). Each output pixel is generated from $filterSize >> input >> +; pixels, the position of the first pixel is given in >> filterPos[nOutputPixel]. >> >> +;----------------------------------------------------------------------------- >> + >> +%macro SCALE_FUNC 1 >> +cglobal hscale8to15_%1, 7, 9, 15, pos0, dst, w, srcmem, filter, fltpos, >> fltsize, count, inner >> + pxor m0, m0 >> + movu m15, [swizzle] >> + mov countq, $0 >> +%ifidn %1, X4 >> + movu m14, [four] >> + movsxd fltsizeq, fltsized >> + shr fltsizeq, 2 >> +%endif >> +.loop: >> + movu m1, [fltposq] >> + movu m2, [fltposq+32] >> +%ifidn %1, X4 >> + pxor m9, m9 >> + pxor m10, m10 >> + pxor m11, m11 >> + pxor m12, m12 >> + mov innerq, $0 >> +.innerloop: >> +%endif >> + vpcmpeqd m13, m13 >> + vpgatherdd m3,[srcmemq + m1], m13 >> + vpcmpeqd m13, m13 >> + vpgatherdd m4,[srcmemq + m2], m13 >> + vpunpcklbw m5, m3, m0 >> + vpunpckhbw m6, m3, m0 >> + vpunpcklbw m7, m4, m0 >> + vpunpckhbw m8, m4, m0 >> + vpmaddwd m5, m5, [filterq] >> + vpmaddwd m6, m6, [filterq + 32] >> + vpmaddwd m7, m7, [filterq + 64] >> + vpmaddwd m8, m8, [filterq + 96] >> + add filterq, $80 >> +%ifidn %1, X4 >> + paddd m9, m5 >> + paddd m10, m6 >> + paddd m11, m7 >> + paddd m12, m8 >> + paddd m1, m14 >> + paddd m2, m14 >> + add innerq, $1 >> + cmp innerq, fltsizeq >> + jl .innerloop >> + vphaddd m5, m9, m10 >> + vphaddd m6, m11, m12 >> +%else >> + vphaddd m5, m5, m6 >> + vphaddd m6, m7, m8 >> +%endif >> + vpsrad m5, 7 >> + vpsrad m6, 7 >> + vpackssdw m5, m5, m6 >> + vpermd m5, m15, m5 >> + vmovdqu [dstq + countq * 2], m5 >> + add fltposq, $40 >> + add countq, $10 >> + cmp countq, wq >> + jl .loop >> +REP_RET >> +%endmacro >> + >> +%if ARCH_X86_64 >> +INIT_YMM avx2 >> +SCALE_FUNC 4 >> +SCALE_FUNC X4 >> +%endif >> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c >> index 0848a31461..164b06d6ba 100644 >> --- a/libswscale/x86/swscale.c >> +++ b/libswscale/x86/swscale.c >> @@ -276,6 +276,9 @@ SCALE_FUNCS_SSE(sse2); >> SCALE_FUNCS_SSE(ssse3); >> SCALE_FUNCS_SSE(sse4); >> >> +SCALE_FUNC(4, 8, 15, avx2); >> +SCALE_FUNC(X4, 8, 15, avx2); >> + >> #define VSCALEX_FUNC(size, opt) \ >> void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int >> filterSize, \ >> const int16_t **src, uint8_t >> *dest, int dstW, \ >> @@ -568,6 +571,22 @@ switch(c->dstBpc){ \ >> } >> >> #if ARCH_X86_64 >> +#define ASSIGN_AVX2_SCALE_FUNC(hscalefn, filtersize) \ >> + switch (filtersize) { \ >> + case 4: hscalefn = ff_hscale8to15_4_avx2; break; \ >> + default: hscalefn = ff_hscale8to15_X4_avx2; break; \ >> + break; \ >> + } >> + >> + if (EXTERNAL_AVX2_FAST(cpu_flags)){ >> + if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ >> + if(c->chrDstW % 16 == 0) >> + ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); >> + if(c->dstW % 16 == 0) >> + ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); >> + } >> + } >> + >> if (EXTERNAL_AVX2_FAST(cpu_flags)) { >> switch (c->dstFormat) { >> case AV_PIX_FMT_NV12: >> diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c >> index 40c5eb3aa8..103b1aa5da 100644 >> --- a/tests/checkasm/sw_scale.c >> +++ b/tests/checkasm/sw_scale.c >> @@ -135,13 +135,13 @@ static void check_yuv2yuvX(void) >> } >> >> #undef SRC_PIXELS >> -#define SRC_PIXELS 128 >> +#define SRC_PIXELS 512 >> >> static void check_hscale(void) >> { >> #define MAX_FILTER_WIDTH 40 >> -#define FILTER_SIZES 5 >> - static const int filter_sizes[FILTER_SIZES] = { 4, 8, 16, 32, 40 }; >> +#define FILTER_SIZES 6 >> + static const int filter_sizes[FILTER_SIZES] = { 4, 8, 12, 16, 32, 40 >> }; >> >> #define HSCALE_PAIRS 2 >> static const int hscale_pairs[HSCALE_PAIRS][2] = { >> @@ -160,6 +160,8 @@ static void check_hscale(void) >> // padded >> LOCAL_ALIGNED_32(int16_t, filter, [SRC_PIXELS * MAX_FILTER_WIDTH + >> MAX_FILTER_WIDTH]); >> LOCAL_ALIGNED_32(int32_t, filterPos, [SRC_PIXELS]); >> + LOCAL_ALIGNED_32(int16_t, filterAvx2, [SRC_PIXELS * MAX_FILTER_WIDTH >> + MAX_FILTER_WIDTH]); >> + LOCAL_ALIGNED_32(int32_t, filterPosAvx, [SRC_PIXELS]); >> >> // The dst parameter here is either int16_t or int32_t but we use >> void* to >> // just cover both cases. >> @@ -167,6 +169,8 @@ static void check_hscale(void) >> const uint8_t *src, const int16_t *filter, >> const int32_t *filterPos, int filterSize); >> >> + int cpu_flags = av_get_cpu_flags(); >> + >> ctx = sws_alloc_context(); >> if (sws_init_context(ctx, NULL, NULL) < 0) >> fail(); >> @@ -180,9 +184,11 @@ static void check_hscale(void) >> ctx->srcBpc = hscale_pairs[hpi][0]; >> ctx->dstBpc = hscale_pairs[hpi][1]; >> ctx->hLumFilterSize = ctx->hChrFilterSize = width; >> + ctx->dstW = ctx->chrDstW = SRC_PIXELS; >> >> for (i = 0; i < SRC_PIXELS; i++) { >> filterPos[i] = i; >> + filterPosAvx[i] = i; >> >> // These filter cofficients are chosen to try break two >> corner >> // cases, namely: >> @@ -211,16 +217,20 @@ static void check_hscale(void) >> filter[SRC_PIXELS * width + i] = rnd(); >> } >> ff_sws_init_scale(ctx); >> + memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * >> MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); >> + if (cpu_flags & AV_CPU_FLAG_AVX2){ >> + ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, >> filterAvx2, SRC_PIXELS); >> + } >> >> if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", >> ctx->srcBpc, ctx->dstBpc + 1, width)) { >> memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); >> memset(dst1, 0, SRC_PIXELS * sizeof(dst1[0])); >> >> call_ref(NULL, dst0, SRC_PIXELS, src, filter, filterPos, >> width); >> - call_new(NULL, dst1, SRC_PIXELS, src, filter, filterPos, >> width); >> + call_new(NULL, dst1, SRC_PIXELS, src, filterAvx2, >> filterPosAvx, width); >> if (memcmp(dst0, dst1, SRC_PIXELS * sizeof(dst0[0]))) >> fail(); >> - bench_new(NULL, dst0, SRC_PIXELS, src, filter, >> filterPos, width); >> + bench_new(NULL, dst0, SRC_PIXELS, src, filter, >> filterPosAvx, width); >> } >> } >> } >> -- >> 2.32.0.402.g57bb445576-goog >> >> > Part 1 of this patch has been abandoned as it is no longer required. Are > there any further comments on this patch or can it be merged? > > Thanks > Hi, Ping! Are there any objections/comments on this patch? Thanks
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 673407636a..fba3dabe5b 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -1064,4 +1064,6 @@ void ff_init_vscale_pfn(SwsContext *c, yuv2planar1_fn yuv2plane1, yuv2planarX_fn //number of extra lines to process #define MAX_LINES_AHEAD 4 +//shuffle filter and filterPos for hyScale and hcScale filters in avx2 +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW); #endif /* SWSCALE_SWSCALE_INTERNAL_H */ diff --git a/libswscale/utils.c b/libswscale/utils.c index 176fc6fd63..0577fd5490 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -268,6 +268,41 @@ static const FormatEntry format_entries[] = { [AV_PIX_FMT_X2RGB10LE] = { 1, 1 }, }; +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ +#if ARCH_X86_64 + int i, j, k, l; + int cpu_flags = av_get_cpu_flags(); + if (EXTERNAL_AVX2_FAST(cpu_flags)){ + if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ + if (dstW % 16 == 0){ + if (filter != NULL){ + for (i = 0; i < dstW; i += 8){ + FFSWAP(int, filterPos[i + 2], filterPos[i+4]); + FFSWAP(int, filterPos[i + 3], filterPos[i+5]); + } + if (filterSize > 4){ + int16_t *tmp2 = av_malloc(dstW * filterSize * 2); + memcpy(tmp2, filter, dstW * filterSize * 2); + for (i = 0; i < dstW; i += 16){//pixel + for (k = 0; k < filterSize / 4; ++k){//fcoeff + for (j = 0; j < 16; ++j){//inner pixel + for (l = 0; l < 4; ++l){//coeff + int from = i * filterSize + j * filterSize + k * 4 + l; + int to = (i) * filterSize + j * 4 + l + k * 64; + filter[to] = tmp2[from]; + } + } + } + } + av_free(tmp2); + } + } + } + } + } +#endif +} + int sws_isSupportedInput(enum AVPixelFormat pix_fmt) { return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ? @@ -1699,6 +1734,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, 0, 0, 0), get_local_pos(c, 0, 0, 0))) < 0) goto fail; + ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW); if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc, c->chrSrcW, c->chrDstW, filterAlign, 1 << 14, @@ -1708,6 +1744,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, c->chrSrcHSubSample, c->src_h_chr_pos, 0), get_local_pos(c, c->chrDstHSubSample, c->dst_h_chr_pos, 0))) < 0) goto fail; + ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW); } } // initialize horizontal stuff diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index bfe383364e..68391494be 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o X86ASM-OBJS += x86/input.o \ x86/output.o \ x86/scale.o \ + x86/scale_avx2.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ x86/yuv2yuvX.o \ diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm new file mode 100644 index 0000000000..d90fd2d791 --- /dev/null +++ b/libswscale/x86/scale_avx2.asm @@ -0,0 +1,112 @@ +;****************************************************************************** +;* x86-optimized horizontal line scaling functions +;* Copyright 2020 Google LLC +;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +swizzle: dd 0, 4, 1, 5, 2, 6, 3, 7 +four: times 8 dd 4 + +SECTION .text + +;----------------------------------------------------------------------------- +; horizontal line scaling +; +; void hscale8to15_<filterSize>_<opt> +; (SwsContext *c, int16_t *dst, +; int dstW, const uint8_t *src, +; const int16_t *filter, +; const int32_t *filterPos, int filterSize); +; +; Scale one horizontal line. Input is 8-bit width Filter is 14 bits. Output is +; 15 bits (in int16_t). Each output pixel is generated from $filterSize input +; pixels, the position of the first pixel is given in filterPos[nOutputPixel]. +;----------------------------------------------------------------------------- + +%macro SCALE_FUNC 1 +cglobal hscale8to15_%1, 7, 9, 15, pos0, dst, w, srcmem, filter, fltpos, fltsize, count, inner + pxor m0, m0 + movu m15, [swizzle] + mov countq, $0 +%ifidn %1, X4 + movu m14, [four] + movsxd fltsizeq, fltsized + shr fltsizeq, 2 +%endif +.loop: + movu m1, [fltposq] + movu m2, [fltposq+32] +%ifidn %1, X4 + pxor m9, m9 + pxor m10, m10 + pxor m11, m11 + pxor m12, m12 + mov innerq, $0 +.innerloop: +%endif + vpcmpeqd m13, m13 + vpgatherdd m3,[srcmemq + m1], m13 + vpcmpeqd m13, m13 + vpgatherdd m4,[srcmemq + m2], m13 + vpunpcklbw m5, m3, m0 + vpunpckhbw m6, m3, m0 + vpunpcklbw m7, m4, m0 + vpunpckhbw m8, m4, m0 + vpmaddwd m5, m5, [filterq] + vpmaddwd m6, m6, [filterq + 32] + vpmaddwd m7, m7, [filterq + 64] + vpmaddwd m8, m8, [filterq + 96] + add filterq, $80 +%ifidn %1, X4 + paddd m9, m5 + paddd m10, m6 + paddd m11, m7 + paddd m12, m8 + paddd m1, m14 + paddd m2, m14 + add innerq, $1 + cmp innerq, fltsizeq + jl .innerloop + vphaddd m5, m9, m10 + vphaddd m6, m11, m12 +%else + vphaddd m5, m5, m6 + vphaddd m6, m7, m8 +%endif + vpsrad m5, 7 + vpsrad m6, 7 + vpackssdw m5, m5, m6 + vpermd m5, m15, m5 + vmovdqu [dstq + countq * 2], m5 + add fltposq, $40 + add countq, $10 + cmp countq, wq + jl .loop +REP_RET +%endmacro + +%if ARCH_X86_64 +INIT_YMM avx2 +SCALE_FUNC 4 +SCALE_FUNC X4 +%endif diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 0848a31461..164b06d6ba 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -276,6 +276,9 @@ SCALE_FUNCS_SSE(sse2); SCALE_FUNCS_SSE(ssse3); SCALE_FUNCS_SSE(sse4); +SCALE_FUNC(4, 8, 15, avx2); +SCALE_FUNC(X4, 8, 15, avx2); + #define VSCALEX_FUNC(size, opt) \ void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \ const int16_t **src, uint8_t *dest, int dstW, \ @@ -568,6 +571,22 @@ switch(c->dstBpc){ \ } #if ARCH_X86_64 +#define ASSIGN_AVX2_SCALE_FUNC(hscalefn, filtersize) \ + switch (filtersize) { \ + case 4: hscalefn = ff_hscale8to15_4_avx2; break; \ + default: hscalefn = ff_hscale8to15_X4_avx2; break; \ + break; \ + } + + if (EXTERNAL_AVX2_FAST(cpu_flags)){ + if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ + if(c->chrDstW % 16 == 0) + ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); + if(c->dstW % 16 == 0) + ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); + } + } + if (EXTERNAL_AVX2_FAST(cpu_flags)) { switch (c->dstFormat) { case AV_PIX_FMT_NV12: diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index 40c5eb3aa8..103b1aa5da 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -135,13 +135,13 @@ static void check_yuv2yuvX(void) } #undef SRC_PIXELS -#define SRC_PIXELS 128 +#define SRC_PIXELS 512 static void check_hscale(void) { #define MAX_FILTER_WIDTH 40 -#define FILTER_SIZES 5 - static const int filter_sizes[FILTER_SIZES] = { 4, 8, 16, 32, 40 }; +#define FILTER_SIZES 6 + static const int filter_sizes[FILTER_SIZES] = { 4, 8, 12, 16, 32, 40 }; #define HSCALE_PAIRS 2 static const int hscale_pairs[HSCALE_PAIRS][2] = { @@ -160,6 +160,8 @@ static void check_hscale(void) // padded LOCAL_ALIGNED_32(int16_t, filter, [SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH]); LOCAL_ALIGNED_32(int32_t, filterPos, [SRC_PIXELS]); + LOCAL_ALIGNED_32(int16_t, filterAvx2, [SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH]); + LOCAL_ALIGNED_32(int32_t, filterPosAvx, [SRC_PIXELS]); // The dst parameter here is either int16_t or int32_t but we use void* to // just cover both cases. @@ -167,6 +169,8 @@ static void check_hscale(void) const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize); + int cpu_flags = av_get_cpu_flags(); + ctx = sws_alloc_context(); if (sws_init_context(ctx, NULL, NULL) < 0) fail(); @@ -180,9 +184,11 @@ static void check_hscale(void) ctx->srcBpc = hscale_pairs[hpi][0]; ctx->dstBpc = hscale_pairs[hpi][1]; ctx->hLumFilterSize = ctx->hChrFilterSize = width; + ctx->dstW = ctx->chrDstW = SRC_PIXELS; for (i = 0; i < SRC_PIXELS; i++) { filterPos[i] = i; + filterPosAvx[i] = i; // These filter cofficients are chosen to try break two corner // cases, namely: @@ -211,16 +217,20 @@ static void check_hscale(void) filter[SRC_PIXELS * width + i] = rnd(); } ff_sws_init_scale(ctx); + memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); + if (cpu_flags & AV_CPU_FLAG_AVX2){ + ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); + } if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", ctx->srcBpc, ctx->dstBpc + 1, width)) { memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); memset(dst1, 0, SRC_PIXELS * sizeof(dst1[0])); call_ref(NULL, dst0, SRC_PIXELS, src, filter, filterPos, width); - call_new(NULL, dst1, SRC_PIXELS, src, filter, filterPos, width); + call_new(NULL, dst1, SRC_PIXELS, src, filterAvx2, filterPosAvx, width); if (memcmp(dst0, dst1, SRC_PIXELS * sizeof(dst0[0]))) fail(); - bench_new(NULL, dst0, SRC_PIXELS, src, filter, filterPos, width); + bench_new(NULL, dst0, SRC_PIXELS, src, filter, filterPosAvx, width); } } }