Message ID | 20220715145943.3474147-1-alankelly@google.com |
---|---|
State | Accepted |
Commit | a38293e4448c9389e604af9858984361a5677a20 |
Headers | show |
Series | None | expand |
Hi Michael, Thanks for looking at this. I fixed the test issue. Alan On Fri, Jul 15, 2022 at 4:59 PM Alan Kelly <alankelly@google.com> wrote: > ff_shuffle_filter_coefficients shuffles the tail as required. > --- > libswscale/utils.c | 19 ++++++++++++++++--- > libswscale/x86/swscale.c | 6 ++---- > tests/checkasm/sw_scale.c | 2 +- > 3 files changed, 19 insertions(+), 8 deletions(-) > > diff --git a/libswscale/utils.c b/libswscale/utils.c > index cb4f5b521c..544b7fee96 100644 > --- a/libswscale/utils.c > +++ b/libswscale/utils.c > @@ -266,8 +266,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int > *filterPos, > #if ARCH_X86_64 > int i, j, k; > int cpu_flags = av_get_cpu_flags(); > - // avx2 hscale filter processes 16 pixel blocks. > - if (!filter || dstW % 16 != 0) > + if (!filter) > return 0; > if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & > AV_CPU_FLAG_SLOW_GATHER)) { > if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { > @@ -279,9 +278,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int > *filterPos, > } > // Do not swap filterPos for pixels which won't be processed by > // the main loop. > - for (i = 0; i + 8 <= dstW; i += 8) { > + for (i = 0; i + 16 <= dstW; i += 16) { > FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); > FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); > + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); > + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); > } > if (filterSize > 4) { > // 16 pixels are processed at a time. > @@ -295,6 +296,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int > *filterPos, > } > } > } > + // 4 pixels are processed at a time in the tail. > + for (; i < dstW; i += 4) { > + // 4 filter coeffs are processed at a time. > + int rem = dstW - i >= 4 ? 4 : dstW - i; > + for (k = 0; k + 4 <= filterSize; k += 4) { > + for (j = 0; j < rem; ++j) { > + int from = (i + j) * filterSize + k; > + int to = i * filterSize + j * 4 + k * 4; > + memcpy(&filter[to], &filterCopy[from], 4 * > sizeof(int16_t)); > + } > + } > + } > } > av_free(filterCopy); > } > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > index 628f12137c..f628c71bd4 100644 > --- a/libswscale/x86/swscale.c > +++ b/libswscale/x86/swscale.c > @@ -626,10 +626,8 @@ switch(c->dstBpc){ \ > > if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & > AV_CPU_FLAG_SLOW_GATHER)) { > if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { > - if (c->chrDstW % 16 == 0) > - ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); > - if (c->dstW % 16 == 0) > - ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); > + ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); > + ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); > } > } > > diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c > index b643a47c30..798990a6cf 100644 > --- a/tests/checkasm/sw_scale.c > +++ b/tests/checkasm/sw_scale.c > @@ -223,7 +223,7 @@ static void check_hscale(void) > ff_sws_init_scale(ctx); > memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS > * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); > if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & > AV_CPU_FLAG_SLOW_GATHER)) > - ff_shuffle_filter_coefficients(ctx, filterPosAvx, > width, filterAvx2, SRC_PIXELS); > + ff_shuffle_filter_coefficients(ctx, filterPosAvx, > width, filterAvx2, ctx->dstW); > > if (check_func(ctx->hcScale, > "hscale_%d_to_%d__fs_%d_dstW_%d", ctx->srcBpc, ctx->dstBpc + 1, width, > ctx->dstW)) { > memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); > -- > 2.37.0.170.g444d1eabd0-goog > >
On Fri, Jul 15, 2022 at 05:03:56PM +0200, Alan Kelly wrote: > Hi Michael, > > Thanks for looking at this. I fixed the test issue. seems to be still failing here: make distclean ; ./configure && make -j32 tests/checkasm/checkasm && tests/checkasm/checkasm --test=sw_scale checkasm: using random seed 1328711543 MMXEXT: - sw_scale.yuv2yuvX [OK] SSE2: - sw_scale.hscale [OK] SSE3: - sw_scale.yuv2yuvX [OK] SSSE3: - sw_scale.hscale [OK] SSE4.1: - sw_scale.hscale [OK] AVX2: hscale_8_to_15__fs_4_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_4_dstW_24_avx2 (sw_scale.c:235) hscale_8_to_15__fs_8_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_8_dstW_24_avx2 (sw_scale.c:235) hscale_8_to_15__fs_12_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_12_dstW_24_avx2 (sw_scale.c:235) hscale_8_to_15__fs_16_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_16_dstW_24_avx2 (sw_scale.c:235) hscale_8_to_15__fs_32_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_32_dstW_24_avx2 (sw_scale.c:235) hscale_8_to_15__fs_40_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_40_dstW_24_avx2 (sw_scale.c:235) - sw_scale.hscale [FAILED] - sw_scale.yuv2yuvX [OK] checkasm: 12 of 504 tests have failed [...]
Hi Michael, I have tried to recreate this locally in a clean client applying the patches as sent in the email thread. I have tried gcc and mingw and this passes for me. Are you sure you applied both patches 3 & 4? If only patch 4 is applied, then I get the error you have. Thanks, Alan On Sat, Jul 16, 2022 at 1:14 PM Michael Niedermayer <michael@niedermayer.cc> wrote: > On Fri, Jul 15, 2022 at 05:03:56PM +0200, Alan Kelly wrote: > > Hi Michael, > > > > Thanks for looking at this. I fixed the test issue. > > seems to be still failing here: > make distclean ; ./configure && make -j32 tests/checkasm/checkasm && > tests/checkasm/checkasm --test=sw_scale > checkasm: using random seed 1328711543 > MMXEXT: > - sw_scale.yuv2yuvX [OK] > SSE2: > - sw_scale.hscale [OK] > SSE3: > - sw_scale.yuv2yuvX [OK] > SSSE3: > - sw_scale.hscale [OK] > SSE4.1: > - sw_scale.hscale [OK] > AVX2: > hscale_8_to_15__fs_4_dstW_8_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_4_dstW_24_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_8_dstW_8_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_8_dstW_24_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_12_dstW_8_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_12_dstW_24_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_16_dstW_8_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_16_dstW_24_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_32_dstW_8_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_32_dstW_24_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_40_dstW_8_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_40_dstW_24_avx2 (sw_scale.c:235) > - sw_scale.hscale [FAILED] > - sw_scale.yuv2yuvX [OK] > checkasm: 12 of 504 tests have failed > > > [...] > -- > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB > > It is a danger to trust the dream we wish for rather than > the science we have, -- Dr. Kenneth Brown > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
On Mon, Jul 18, 2022 at 09:54:39AM +0200, Alan Kelly wrote: > Hi Michael, > > I have tried to recreate this locally in a clean client applying the > patches as sent in the email thread. I have tried gcc and mingw and this > passes for me. Are you sure you applied both patches 3 & 4? If only patch 4 > is applied, then I get the error you have. ive retested, and i cannot reproduce, i think i had #4 & #5 not #3 and #4 applied thx [...]
Hi Michael, Is there anything blocking this change being applied? Is there anything I can do to help? Thanks, Alan On Mon, Jul 18, 2022 at 6:49 PM Michael Niedermayer <michael@niedermayer.cc> wrote: > On Mon, Jul 18, 2022 at 09:54:39AM +0200, Alan Kelly wrote: > > Hi Michael, > > > > I have tried to recreate this locally in a clean client applying the > > patches as sent in the email thread. I have tried gcc and mingw and this > > passes for me. Are you sure you applied both patches 3 & 4? If only > patch 4 > > is applied, then I get the error you have. > > ive retested, and i cannot reproduce, i think i had #4 & #5 not #3 and #4 > applied > > thx > > [...] > -- > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB > > Those who are too smart to engage in politics are punished by being > governed by those who are dumber. -- Plato > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
diff --git a/libswscale/utils.c b/libswscale/utils.c index cb4f5b521c..544b7fee96 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -266,8 +266,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, #if ARCH_X86_64 int i, j, k; int cpu_flags = av_get_cpu_flags(); - // avx2 hscale filter processes 16 pixel blocks. - if (!filter || dstW % 16 != 0) + if (!filter) return 0; if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { @@ -279,9 +278,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } // Do not swap filterPos for pixels which won't be processed by // the main loop. - for (i = 0; i + 8 <= dstW; i += 8) { + for (i = 0; i + 16 <= dstW; i += 16) { FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); } if (filterSize > 4) { // 16 pixels are processed at a time. @@ -295,6 +296,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } } } + // 4 pixels are processed at a time in the tail. + for (; i < dstW; i += 4) { + // 4 filter coeffs are processed at a time. + int rem = dstW - i >= 4 ? 4 : dstW - i; + for (k = 0; k + 4 <= filterSize; k += 4) { + for (j = 0; j < rem; ++j) { + int from = (i + j) * filterSize + k; + int to = i * filterSize + j * 4 + k * 4; + memcpy(&filter[to], &filterCopy[from], 4 * sizeof(int16_t)); + } + } + } } av_free(filterCopy); } diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 628f12137c..f628c71bd4 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -626,10 +626,8 @@ switch(c->dstBpc){ \ if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { - if (c->chrDstW % 16 == 0) - ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); - if (c->dstW % 16 == 0) - ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); + ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); + ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); } } diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index b643a47c30..798990a6cf 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -223,7 +223,7 @@ static void check_hscale(void) ff_sws_init_scale(ctx); memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) - ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); + ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, ctx->dstW); if (check_func(ctx->hcScale, "hscale_%d_to_%d__fs_%d_dstW_%d", ctx->srcBpc, ctx->dstBpc + 1, width, ctx->dstW)) { memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));