diff mbox series

[FFmpeg-devel,1/2] checkasm: sw_rgb: Add a test for interleaveBytes

Message ID 20200515091038.16743-1-martin@martin.st
State Superseded
Headers show
Series [FFmpeg-devel,1/2] checkasm: sw_rgb: Add a test for interleaveBytes | expand

Checks

Context Check Description
andriy/default pending
andriy/make success Make finished
andriy/make_fate fail Make fate failed

Commit Message

Martin Storsjö May 15, 2020, 9:10 a.m. UTC
---
This depends on "checkasm: Add functions for printing pixel buffers".

The existing x86 implementations of interleaveBytes seem to slow
down significantly for unaligned copies (GCC 7.5, Sandy Bridge):

interleave_bytes_c:      36251.6
interleave_bytes_mmx:    10038.8
interleave_bytes_mmxext: 58450.3
interleave_bytes_sse2:   57746.3

For the properly aligned case, they behave better:

interleave_bytes_aligned_c:     36109.8
interleave_bytes_aligned_mmx:    6033.8
interleave_bytes_aligned_mmxext: 6473.1
interleave_bytes_aligned_sse2:   6163.1

But with Clang (in Xcode 11.3, run on Kaby Lake), the (autovectorized?)
C version seems to beat all the asm implementations:

interleave_bytes_c:       9893.0
interleave_bytes_mmx:    23153.5
interleave_bytes_mmxext: 43693.8
interleave_bytes_sse2:   55894.8

interleave_bytes_aligned_c:      3456.0
interleave_bytes_aligned_mmx:    5780.0
interleave_bytes_aligned_mmxext: 4913.8
interleave_bytes_aligned_sse2:   4154.3
---
 tests/checkasm/sw_rgb.c | 53 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
diff mbox series

Patch

diff --git a/tests/checkasm/sw_rgb.c b/tests/checkasm/sw_rgb.c
index 000420d8f7..41c486a2d7 100644
--- a/tests/checkasm/sw_rgb.c
+++ b/tests/checkasm/sw_rgb.c
@@ -111,6 +111,56 @@  static void check_uyvy_to_422p(void)
     }
 }
 
+static void check_interleave_bytes(void)
+{
+    LOCAL_ALIGNED_16(uint8_t, src0_buf, [MAX_STRIDE*MAX_HEIGHT+1]);
+    LOCAL_ALIGNED_16(uint8_t, src1_buf, [MAX_STRIDE*MAX_HEIGHT+1]);
+    LOCAL_ALIGNED_16(uint8_t, dst0_buf, [2*MAX_STRIDE*MAX_HEIGHT+2]);
+    LOCAL_ALIGNED_16(uint8_t, dst1_buf, [2*MAX_STRIDE*MAX_HEIGHT+2]);
+    // Deliberately offset off the aligned base buffers: this function has no
+    // alignment requirements, so the test exercises unaligned pointers.
+    uint8_t *src0 = src0_buf + 1;
+    uint8_t *src1 = src1_buf + 1;
+    uint8_t *dst0 = dst0_buf + 2;
+    uint8_t *dst1 = dst1_buf + 2;
+
+    declare_func_emms(AV_CPU_FLAG_MMX, void, const uint8_t *, const uint8_t *,
+                                       uint8_t *, int, int, int, int, int);
+
+    randomize_buffers(src0, MAX_STRIDE * MAX_HEIGHT);
+    randomize_buffers(src1, MAX_STRIDE * MAX_HEIGHT);
+
+    if (check_func(interleaveBytes, "interleave_bytes")) {
+        for (int i = 0; i <= 16; i++) {
+            // i == 0 uses one random width; i in [1,16] uses exactly width i.
+
+            int w = i > 0 ? i : (1 + (rnd() % (MAX_STRIDE-2)));
+            int h = 1 + (rnd() % (MAX_HEIGHT-2));
+
+            memset(dst0, 0, 2 * MAX_STRIDE * MAX_HEIGHT);
+            memset(dst1, 0, 2 * MAX_STRIDE * MAX_HEIGHT);
+
+            call_ref(src0, src1, dst0, w, h,
+                     MAX_STRIDE, MAX_STRIDE, 2*MAX_STRIDE);
+            call_new(src0, src1, dst1, w, h,
+                     MAX_STRIDE, MAX_STRIDE, 2*MAX_STRIDE);
+            // Compare 2 extra bytes (one pixel pair) per row and one extra row
+            // past the destination area, to catch overwrites past the end.
+            checkasm_check(uint8_t, dst0, 2*MAX_STRIDE, dst1, 2*MAX_STRIDE,
+                           2 * w + 2, h + 1, "dst");
+        }
+
+        bench_new(src0, src1, dst1, 127, MAX_HEIGHT,
+                  MAX_STRIDE, MAX_STRIDE, 2*MAX_STRIDE);
+    }
+    if (check_func(interleaveBytes, "interleave_bytes_aligned")) {
+        // Bench the function in a more typical case, with the aligned base
+        // buffers and an aligned width; this name is only benchmarked.
+        bench_new(src0_buf, src1_buf, dst1_buf, 128, MAX_HEIGHT,
+                  MAX_STRIDE, MAX_STRIDE, 2*MAX_STRIDE);
+    }
+}
+
 void checkasm_check_sw_rgb(void)
 {
     ff_sws_rgb2rgb_init();
@@ -132,4 +182,7 @@  void checkasm_check_sw_rgb(void)
 
     check_uyvy_to_422p();
     report("uyvytoyuv422");
+
+    check_interleave_bytes();
+    report("interleave_bytes");
 }