diff mbox series

[FFmpeg-devel,1/2] checkasm: updated tests for sw_scale

Message ID 005de8b06dea40c4a60fdad9a084138f@EX13D07UWB004.ant.amazon.com
State Superseded
Headers show
Series checkasm: updated tests for sw_scale | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Swinney, Jonathan June 13, 2022, 4:36 p.m. UTC
- added a test for yuv2plane1 (currently disabled for x86_64)
- fixed test for yuv2planeX for aarch64 which was previously not working at all

Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
---
 tests/checkasm/sw_scale.c | 176 +++++++++++++++++++++++++++++++++-----
 1 file changed, 156 insertions(+), 20 deletions(-)

Comments

Martin Storsjö June 21, 2022, 8:16 p.m. UTC | #1
On Mon, 13 Jun 2022, Swinney, Jonathan wrote:

> - added a test for yuv2plane1 (currently disabled for x86_64)

What's the reason for having it disabled for x86 - is it another case 
where the current implementations there aren't bitexact? Could we avoid 
that by setting the bitexact flag for the new yuv2yuv1 test?

> - fixed test for yuv2planeX for aarch64 which was previously not working 
> at all

Could we make the test fuzzy and allow minor differences from the 
reference, when the bitexact flag isn't set, and separately test with the 
bitexact flag and require exact matches?

> @@ -95,7 +210,7 @@ static void check_yuv2yuvX(void)
>     ff_sws_init_scale(ctx);
>     for(isi = 0; isi < INPUT_SIZES; ++isi){
>         dstW = input_sizes[isi];
> -        for(osi = 0; osi < 64; osi += 16){
> +        for(osi = 0; osi < 1; osi += 16){

This looks like a stray leftover change?

// Martin
Martin Storsjö June 22, 2022, 9:22 a.m. UTC | #2
On Tue, 21 Jun 2022, Martin Storsjö wrote:

> On Mon, 13 Jun 2022, Swinney, Jonathan wrote:
>
>> - added a test for yuv2plane1 (currently disabled for x86_64)
>
> What's the reason for having it disabled for x86 - is it another case where 
> the current implementations there aren't bitexact? Could we avoid that by 
> setting the bitexact flag for the new yuv2yuv1 test?
>
>> - fixed test for yuv2planeX for aarch64 which was previously not working at 
>> all
>
> Could we make the test fuzzy and allow minor differences from the reference, 
> when the bitexact flag isn't set, and separately test with the bitexact flag 
> and require exact matches?
>
>> @@ -95,7 +210,7 @@ static void check_yuv2yuvX(void)
>>     ff_sws_init_scale(ctx);
>>     for(isi = 0; isi < INPUT_SIZES; ++isi){
>>         dstW = input_sizes[isi];
>> -        for(osi = 0; osi < 64; osi += 16){
>> +        for(osi = 0; osi < 1; osi += 16){
>
> This looks like a stray leftover change?

I had a look at this, trying to fix things up. This now passes tests on 
x86_32, x86_64 and aarch64. See the attached patch, which goes on top of 
yours.

It's not intended as a final version of how things should be necessarily, 
but as a more concrete pointer about how it could be done - it needs at 
least reindenting after adding the outer for loop.

I also had to skip the filter sizes 1 and 3 in check_yuv2yuvX, because 
ff_yuv2planeX_8_sse2 couldn't handle those. I presume that means that in 
practice, those aren't ever used?

// Martin
diff mbox series

Patch

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 31d9a525e9..537cbd3265 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -35,12 +35,13 @@ 
             AV_WN32(buf + j, rnd());      \
     } while (0)
 
-// This reference function is the same approximate algorithm employed by the
-// SIMD functions
-static void ref_function(const int16_t *filter, int filterSize,
-                                                 const int16_t **src, uint8_t *dest, int dstW,
-                                                 const uint8_t *dither, int offset)
+static void yuv2planeX_8_ref(const int16_t *filter, int filterSize,
+                             const int16_t **src, uint8_t *dest, int dstW,
+                             const uint8_t *dither, int offset)
 {
+#if ARCH_X86_64
+    // This reference function is the same approximate algorithm employed by the
+    // SIMD functions on x86.
     int i, d;
     d = ((filterSize - 1) * 8 + dither[0]) >> 4;
     for ( i = 0; i < dstW; i++) {
@@ -56,6 +57,120 @@  static void ref_function(const int16_t *filter, int filterSize,
         }
         dest[i]= av_clip_uint8(val>>3);
     }
+#else
+    // Other architectures use the default implementation as the reference.
+    int i;
+    for (i=0; i<dstW; i++) {
+        int val = dither[(i + offset) & 7] << 12;
+        int j;
+        for (j=0; j<filterSize; j++)
+            val += src[j][i] * filter[j];
+
+        dest[i]= av_clip_uint8(val>>19);
+    }
+#endif
+}
+// Scalar reference for yuv2plane1: add dither, shift right by 7, clip to 8 bit.
+static void yuv2plane1_8_ref(const int16_t *src, uint8_t *dest, int dstW,
+                             const uint8_t *dither, int offset)
+{
+    for (int x = 0; x < dstW; x++) {
+        const int d = dither[(x + offset) & 7];
+        dest[x] = av_clip_uint8((src[x] + d) >> 7);
+    }
+}
+
+// Hex-dump len bytes starting at p, eight per row; each row is labelled with
+// the index of its first byte plus offset.
+static void print_data(uint8_t *p, size_t len, size_t offset)
+{
+    size_t i;
+    for (i = 0; i < len; i++) {
+        // size_t takes the z length modifier; %lx mismatches it where long is narrower.
+        if (i % 8 == 0)
+            printf("0x%04zx: ", i + offset);
+        printf("0x%02x ", (unsigned) p[i]);
+        if (i % 8 == 7)
+            printf("\n");
+    }
+    if (i % 8 != 0) // terminate a partial final row
+        printf("\n");
+}
+
+// Find the first index where a and b differ and dump up to 32 bytes of both
+// buffers around it. Returns that index, or len when the buffers are equal.
+static size_t show_differences(uint8_t *a, uint8_t *b, size_t len)
+{
+    for (size_t i = 0; i < len; i++) {
+        if (a[i] != b[i]) {
+            size_t offset = (i >= 8 ? i - 8 : i) & ~(size_t)7; // row before the mismatch
+            size_t count = len - offset < 32 ? len - offset : 32; // stay inside the buffers
+            printf("test a:\n");
+            print_data(&a[offset], count, offset);
+            printf("\ntest b:\n");
+            print_data(&b[offset], count, offset);
+            printf("\n");
+            return i;
+        }
+    }
+    return len;
+}
+
+// Check the function selected via check_func(ctx->yuv2plane1, ...) against
+// yuv2plane1_8_ref for every width in input_sizes and dither offset in offsets.
+static void check_yuv2yuv1(void)
+{
+    struct SwsContext *ctx;
+    int osi, isi, dstW, offset;
+    size_t fail_offset;
+    const int input_sizes[] = {8, 24, 128, 144, 256, 512};
+    const int INPUT_SIZES = sizeof(input_sizes)/sizeof(input_sizes[0]);
+    #define LARGEST_INPUT_SIZE 512
+    const int offsets[] = {0, 3, 8, 11, 16, 19};
+    const int OFFSET_SIZES = sizeof(offsets)/sizeof(offsets[0]);
+
+    declare_func_emms(AV_CPU_FLAG_MMX, void,
+                      const int16_t *src, uint8_t *dest,
+                      int dstW, const uint8_t *dither, int offset);
+
+    LOCAL_ALIGNED_8(int16_t, src_pixels, [LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_8(uint8_t, dst0, [LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_8(uint8_t, dst1, [LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_8(uint8_t, dither, [8]);
+
+    randomize_buffers((uint8_t*)dither, 8);
+    randomize_buffers((uint8_t*)src_pixels, LARGEST_INPUT_SIZE * sizeof(int16_t));
+    ctx = sws_alloc_context();
+    if (sws_init_context(ctx, NULL, NULL) < 0)
+        fail();
+
+    ff_sws_init_scale(ctx);
+    for(isi = 0; isi < INPUT_SIZES; ++isi){
+        dstW = input_sizes[isi];
+        for(osi = 0; osi < OFFSET_SIZES; osi++){
+            offset = offsets[osi];
+            if (check_func(ctx->yuv2plane1, "yuv2yuv1_%d_%d", offset, dstW)){
+                memset(dst0, 0, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
+                memset(dst1, 0, LARGEST_INPUT_SIZE * sizeof(dst1[0]));
+                // Both buffers are zeroed and compared in full, so writes past dstW show up.
+                yuv2plane1_8_ref(src_pixels, dst0, dstW, dither, offset);
+                call_new(src_pixels, dst1, dstW, dither, offset);
+                if (memcmp(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]))) {
+                    fail();
+                    printf("failed: yuv2yuv1_%d_%d\n", offset, dstW);
+                    fail_offset = show_differences(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
+                    printf("failing values: src: 0x%04x dither: 0x%02x dst-c: %02x dst-asm: %02x\n",
+                            (int) src_pixels[fail_offset],
+                            (int) dither[(fail_offset + offset) & 7], // index the reference uses
+                            (int) dst0[fail_offset],
+                            (int) dst1[fail_offset]);
+                }
+                if(dstW == LARGEST_INPUT_SIZE)
+                    bench_new(src_pixels, dst1, dstW, dither, offset);
+            }
+        }
+    }
+    sws_freeContext(ctx);
 }
 
 static void check_yuv2yuvX(void)
@@ -64,11 +179,11 @@  static void check_yuv2yuvX(void)
     int fsi, osi, isi, i, j;
     int dstW;
 #define LARGEST_FILTER 16
-#define FILTER_SIZES 4
-    static const int filter_sizes[FILTER_SIZES] = {1, 4, 8, 16};
+    const int filter_sizes[] = {1, 2, 3, 4, 8, 16};
+    const int FILTER_SIZES = sizeof(filter_sizes)/sizeof(filter_sizes[0]);
 #define LARGEST_INPUT_SIZE 512
-#define INPUT_SIZES 6
-    static const int input_sizes[INPUT_SIZES] = {8, 24, 128, 144, 256, 512};
+    static const int input_sizes[] = {8, 24, 128, 144, 256, 512};
+    const int INPUT_SIZES = sizeof(input_sizes)/sizeof(input_sizes[0]);
 
     declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *filter,
                       int filterSize, const int16_t **src, uint8_t *dest,
@@ -95,7 +210,7 @@  static void check_yuv2yuvX(void)
     ff_sws_init_scale(ctx);
     for(isi = 0; isi < INPUT_SIZES; ++isi){
         dstW = input_sizes[isi];
-        for(osi = 0; osi < 64; osi += 16){
+        for(osi = 0; osi < 1; osi += 16){
             for(fsi = 0; fsi < FILTER_SIZES; ++fsi){
                 src = av_malloc(sizeof(int16_t*) * filter_sizes[fsi]);
                 vFilterData = av_malloc((filter_sizes[fsi] + 2) * sizeof(union VFilterData));
@@ -110,18 +225,35 @@  static void check_yuv2yuvX(void)
                     memset(dst0, 0, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
                     memset(dst1, 0, LARGEST_INPUT_SIZE * sizeof(dst1[0]));
 
-                    // The reference function is not the scalar function selected when mmx
-                    // is deactivated as the SIMD functions do not give the same result as
-                    // the scalar ones due to rounding. The SIMD functions are activated by
-                    // the flag SWS_ACCURATE_RND
-                    ref_function(&filter_coeff[0], filter_sizes[fsi], src, dst0, dstW - osi, dither, osi);
-                    // There's no point in calling new for the reference function
-                    if(ctx->use_mmx_vfilter){
-                        call_new((const int16_t*)vFilterData, filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
-                        if (memcmp(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0])))
+                    if (ARCH_X86_64) {
+                        // The reference function is not the scalar function selected when mmx
+                        // is deactivated as the SIMD functions do not give the same result as
+                        // the scalar ones due to rounding. The SIMD functions are activated by
+                        // the flag SWS_ACCURATE_RND
+                        yuv2planeX_8_ref(&filter_coeff[0], filter_sizes[fsi], src, dst0, dstW - osi, dither, osi);
+                        // There's no point in calling new for the reference function
+                        if(ctx->use_mmx_vfilter) {
+                            call_new((const int16_t*)vFilterData, filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
+                            if (memcmp(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]))) {
+                                fail();
+                                printf("failed: yuv2yuvX_%d_%d_%d\n", filter_sizes[fsi], osi, dstW);
+                                show_differences(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
+                            }
+                            if(dstW == LARGEST_INPUT_SIZE)
+                                bench_new((const int16_t*)vFilterData, filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
+                        }
+                    }
+
+                    if (ARCH_AARCH64) {
+                        yuv2planeX_8_ref(&filter_coeff[0], filter_sizes[fsi], src, dst0, dstW - osi, dither, osi);
+                        call_new(&filter_coeff[0], filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
+                        if (memcmp(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]))) {
                             fail();
+                            printf("failed: yuv2yuvX_%d_%d_%d\n", filter_sizes[fsi], osi, dstW);
+                            show_differences(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
+                        }
                         if(dstW == LARGEST_INPUT_SIZE)
-                            bench_new((const int16_t*)vFilterData, filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
+                            bench_new(&filter_coeff[0], filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
                     }
                 }
                 av_freep(&src);
@@ -245,6 +377,10 @@  void checkasm_check_sw_scale(void)
 {
     check_hscale();
     report("hscale");
+    if (!ARCH_X86_64) {
+        check_yuv2yuv1();
+        report("yuv2yuv1");
+    }
     check_yuv2yuvX();
     report("yuv2yuvX");
 }