diff mbox series

[FFmpeg-devel,v2] checkasm/hevc_deblock: add luma and chroma full

Message ID 20240124120634.84237-1-jdek@itanimul.li
State New
Headers show
Series [FFmpeg-devel,v2] checkasm/hevc_deblock: add luma and chroma full | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

J. Dekker Jan. 24, 2024, 12:06 p.m. UTC
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
 tests/checkasm/hevc_deblock.c | 225 +++++++++++++++++++++++++++++-----
 1 file changed, 195 insertions(+), 30 deletions(-)

- added luma 10/12 bit
- supporting full (*_c) luma & chroma functions
- dynamically generating all test data

Appears to work for me. Testing on x86, hits the filtering decisions correctly.
x86 doesn't have the full asm functions though, need to check a platform which
has them (though the difference is minor, not sure why it wouldn't work).

Comments

Martin Storsjö Jan. 24, 2024, 1:39 p.m. UTC | #1
On Wed, 24 Jan 2024, J. Dekker wrote:

> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> tests/checkasm/hevc_deblock.c | 225 +++++++++++++++++++++++++++++-----
> 1 file changed, 195 insertions(+), 30 deletions(-)
>
> - added luma 10/12 bit
> - supporting full (*_c) luma & chroma functions
> - dynamically generating all test data
>
> Appears to work for me. Testing on x86, hits the filtering decisions correctly.
> x86 doesn't have the full asm functions though, need to check a platform which
> has them (though the difference is minor, not sure why it wouldn't work).

Looks mostly good, although I didn't test it myself.

A couple of cosmetic comments below:

> +#define RNDDIFF(val, diff) av_clip(((SIZEOF_PIXEL == 1) ? \
> +    *(uint8_t*)(&val) : *(uint16_t*)(&val)) - (diff), 0, \
> +    (1 << (bit_depth)) - 1) + rnd() % FFMAX(2 * (diff), 1)

This macro is quite hard to read - can you indent it semantically based on 
nesting level or something like that?

> +#define TC25(x) ((tc[x] * 5 + 1) >> 1);
> +
> +static void randomize_luma_buffers(int type, int *beta, int32_t tc[2], uint8_t *buf, ptrdiff_t xstride, ptrdiff_t ystride, int bit_depth)
> +{

This one line is quite significantly longer than all the surrounding ones 
- can you wrap it? (Elsewhere, there are also a couple rather long lines, 
but they fit in better, but wrapping them is welcome too.)

// Martin
Michael Niedermayer Jan. 25, 2024, 12:04 a.m. UTC | #2
On Wed, Jan 24, 2024 at 01:06:34PM +0100, J. Dekker wrote:
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
>  tests/checkasm/hevc_deblock.c | 225 +++++++++++++++++++++++++++++-----
>  1 file changed, 195 insertions(+), 30 deletions(-)
> 
> - added luma 10/12 bit
> - supporting full (*_c) luma & chroma functions
> - dynamically generating all test data
> 
> Appears to work for me. Testing on x86, hits the filtering decisions correctly.
> x86 doesn't have the full asm functions though, need to check a platform which
> has them (though the difference is minor, not sure why it wouldn't work).

I see intermittend failures on x86-64 linux

make V=2 fate-checkasm-hevc_deblock

TEST    checkasm-hevc_deblock
./tests/fate-run.sh fate-checkasm-hevc_deblock "fate-suite/" "" "" 'run tests/checkasm/checkasm --test=hevc_deblock' 'null' '' '' '1' '' '' '' '' '' '' '' '' '' ''
 /tests/checkasm/checkasm --test=hevc_deblock

 $ make V=2 fate-checkasm-hevc_deblock
...
TEST    checkasm-hevc_deblock
./tests/fate-run.sh fate-checkasm-hevc_deblock "fate-suite/" "" "" 'run tests/checkasm/checkasm --test=hevc_deblock' 'null' '' '' '1' '' '' '' '' '' '' '' '' '' ''
 /tests/checkasm/checkasm --test=hevc_deblock
Test checkasm-hevc_deblock failed. Look at tests/data/fate/checkasm-hevc_deblock.err for details.
checkasm: using random seed 241122653
SSE2:
 - hevc_deblock.chroma [OK]
   hevc_h_loop_filter_luma12_weak_sse2 (hevc_deblock.c:230)
 - hevc_deblock.luma   [FAILED]
SSSE3:
 - hevc_deblock.luma   [OK]
AVX:
 - hevc_deblock.chroma [OK]
 - hevc_deblock.luma   [OK]
checkasm: 1 of 66 tests have failed
threads=1
tests/Makefile:318: recipe for target 'fate-checkasm-hevc_deblock' failed
make: *** [fate-checkasm-hevc_deblock] Error 1



[...]
diff mbox series

Patch

diff --git a/tests/checkasm/hevc_deblock.c b/tests/checkasm/hevc_deblock.c
index 66fc8d5646..dfe7fc8e97 100644
--- a/tests/checkasm/hevc_deblock.c
+++ b/tests/checkasm/hevc_deblock.c
@@ -19,6 +19,7 @@ 
 #include <string.h>
 
 #include "libavutil/intreadwrite.h"
+#include "libavutil/macros.h"
 #include "libavutil/mem_internal.h"
 
 #include "libavcodec/avcodec.h"
@@ -29,10 +30,11 @@ 
 static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
 
 #define SIZEOF_PIXEL ((bit_depth + 7) / 8)
-#define BUF_STRIDE (8 * 2)
-#define BUF_LINES (8)
-#define BUF_OFFSET (BUF_STRIDE * BUF_LINES)
-#define BUF_SIZE (BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2)
+#define BUF_STRIDE (16 * 2)
+#define BUF_LINES (16)
+// large buffer sizes based on high bit depth
+#define BUF_OFFSET (2 * BUF_STRIDE * BUF_LINES)
+#define BUF_SIZE (2 * BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2)
 
 #define randomize_buffers(buf0, buf1, size)                 \
     do {                                                    \
@@ -45,57 +47,220 @@  static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
         }                                                   \
     } while (0)
 
-static void check_deblock_chroma(HEVCDSPContext *h, int bit_depth)
+static void check_deblock_chroma(HEVCDSPContext *h, int bit_depth, int c)
 {
-    int32_t tc[2] = { 0, 0 };
+    // see tctable[] in hevc_filter.c, we check full range
+    int32_t tc[2] = { rnd() % 25, rnd() % 25 };
     // no_p, no_q can only be { 0,0 } for the simpler assembly (non *_c
     // variant) functions, see deblocking_filter_CTB() in hevc_filter.c
-    uint8_t no_p[2] = { 0, 0 };
-    uint8_t no_q[2] = { 0, 0 };
+    uint8_t no_p[2] = { rnd() & c, rnd() & c };
+    uint8_t no_q[2] = { rnd() & c, rnd() & c };
     LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
     LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
 
     declare_func(void, uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q);
 
-    if (check_func(h->hevc_h_loop_filter_chroma, "hevc_h_loop_filter_chroma%d", bit_depth)) {
-        for (int i = 0; i < 4; i++) {
-            randomize_buffers(buf0, buf1, BUF_SIZE);
-            // see betatable[] in hevc_filter.c
-            tc[0] = (rnd() & 63) + (rnd() & 1);
-            tc[1] = (rnd() & 63) + (rnd() & 1);
+    if (check_func(c ? h->hevc_h_loop_filter_chroma_c :
+                       h->hevc_h_loop_filter_chroma, "hevc_h_loop_filter_chroma%d%s", bit_depth, c ? "_full" : "")) {
+        randomize_buffers(buf0, buf1, BUF_SIZE);
 
-            call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
-            call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+        call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+        call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+        if (memcmp(buf0, buf1, BUF_SIZE))
+            fail();
+        bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+    }
+
+    if (check_func(c ? h->hevc_v_loop_filter_chroma_c :
+                       h->hevc_v_loop_filter_chroma, "hevc_v_loop_filter_chroma%d%s", bit_depth, c ? "_full" : "")) {
+        randomize_buffers(buf0, buf1, BUF_SIZE);
+
+        call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+        call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+        if (memcmp(buf0, buf1, BUF_SIZE))
+            fail();
+        bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+    }
+}
+
+#define P3 buf[-4 * xstride]
+#define P2 buf[-3 * xstride]
+#define P1 buf[-2 * xstride]
+#define P0 buf[-1 * xstride]
+#define Q0 buf[0 * xstride]
+#define Q1 buf[1 * xstride]
+#define Q2 buf[2 * xstride]
+#define Q3 buf[3 * xstride]
+
+#define EQU(x, y) do { \
+    uint16_t z = (uint16_t)(y & ((1 << (bit_depth)) - 1)); \
+    if (SIZEOF_PIXEL == 1) { \
+        *(uint8_t*)(&x) = (uint8_t)z; \
+    } else if (SIZEOF_PIXEL == 2) { \
+        *(uint16_t*)(&x) = z; \
+    } \
+} while(0)
+
+#define RNDDIFF(val, diff) av_clip(((SIZEOF_PIXEL == 1) ? \
+    *(uint8_t*)(&val) : *(uint16_t*)(&val)) - (diff), 0, \
+    (1 << (bit_depth)) - 1) + rnd() % FFMAX(2 * (diff), 1)
+
+#define TC25(x) ((tc[x] * 5 + 1) >> 1);
+
+static void randomize_luma_buffers(int type, int *beta, int32_t tc[2], uint8_t *buf, ptrdiff_t xstride, ptrdiff_t ystride, int bit_depth)
+{
+    int i, j, b3, tc25, tc25diff, b3diff;
+    // minimum useful value is 1, full range 0-24
+    tc[0] = ((rnd() % 25) + 1) << (bit_depth - 8);
+    tc[1] = ((rnd() % 25) + 1) << (bit_depth - 8);
+    // minimum useful value for 8bit is 8 since >> 8
+    *beta = ((rnd() % 57) + 8) << (bit_depth - 8);
+
+    switch (type) {
+    case 0: // strong
+        for (j = 0; j < 2; j++) {
+            tc25 = TC25(j);
+            tc25diff = FFMAX(tc25 - 1, 0);
+            // 4 lines per tc
+            for (i = 0; i < 4; i++) {
+                b3 = *beta >> 3;
+
+                EQU(P0, rnd() % (1 << bit_depth));
+                EQU(Q0, RNDDIFF(P0, tc25diff));
+
+                // p3 - p0 up to beta3 budget
+                b3diff = rnd() % b3;
+                EQU(P3, RNDDIFF(P0, b3diff));
+                // q3 - q0, reduced budget
+                b3diff = rnd() % FFMAX(b3 - b3diff, 1);
+                EQU(Q3, RNDDIFF(Q0, b3diff));
+
+                // same concept, budget across 4 pixels
+                b3 -= b3diff = rnd() % FFMAX(b3, 1);
+                EQU(P2, RNDDIFF(P0, b3diff));
+                b3 -= b3diff = rnd() % FFMAX(b3, 1);
+                EQU(Q2, RNDDIFF(Q0, b3diff));
+
+                // extra reduced budget for weighted pixels
+                b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
+                EQU(P1, RNDDIFF(P0, b3diff));
+                b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
+                EQU(Q1, RNDDIFF(Q0, b3diff));
+
+                buf += ystride;
+            }
+        }
+        break;
+    case 1: // weak
+        for (j = 0; j < 2; j++) {
+            tc25 = TC25(j);
+            tc25diff = FFMAX(tc25 - 1, 0);
+            // 4 lines per tc
+            for (i = 0; i < 4; i++) {
+                // Weak filtering is signficantly simpler to activate as
+                // we only need to satisfy d0 + d3 < beta, which
+                // can be simplified to d0 + d0 < beta. Using the above
+                // derivations but substiuting b3 for b1 and ensuring
+                // that P0/Q0 are at least 1/2 tc25diff apart (tending
+                // towards 1/2 range).
+                b3 = *beta >> 1;
+
+                EQU(P0, rnd() % (1 << bit_depth));
+                EQU(Q0, RNDDIFF(P0, tc25diff >> 1) +
+                    (tc25diff >> 1) * (P0 < (1 << (bit_depth - 1))) ? 1 : -1);
+
+                // p3 - p0 up to beta3 budget
+                b3diff = rnd() % b3;
+                EQU(P3, RNDDIFF(P0, b3diff));
+                // q3 - q0, reduced budget
+                b3diff = rnd() % FFMAX(b3 - b3diff, 1);
+                EQU(Q3, RNDDIFF(Q0, b3diff));
+
+                // same concept, budget across 4 pixels
+                b3 -= b3diff = rnd() % FFMAX(b3, 1);
+                EQU(P2, RNDDIFF(P0, b3diff));
+                b3 -= b3diff = rnd() % FFMAX(b3, 1);
+                EQU(Q2, RNDDIFF(Q0, b3diff));
+
+                // extra reduced budget for weighted pixels
+                b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
+                EQU(P1, RNDDIFF(P0, b3diff));
+                b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
+                EQU(Q1, RNDDIFF(Q0, b3diff));
+
+                buf += ystride;
+            }
+        }
+        break;
+    case 2: // none
+        *beta = 0; // ensure skip
+        for (i = 0; i < 8; i++) {
+            // we can just fill with completely random data, nothing should be touched.
+            EQU(P3, rnd()); EQU(P2, rnd()); EQU(P1, rnd()); EQU(P0, rnd());
+            EQU(Q0, rnd()); EQU(Q1, rnd()); EQU(Q2, rnd()); EQU(Q3, rnd());
+            buf += ystride;
+        }
+        break;
+    }
+}
+
+static void check_deblock_luma(HEVCDSPContext *h, int bit_depth, int c)
+{
+    const char *type;
+    const char *types[3] = { "strong", "weak", "skip" };
+    int beta;
+    int32_t tc[2] = {0};
+    uint8_t no_p[2] = { rnd() & c, rnd() & c };
+    uint8_t no_q[2] = { rnd() & c, rnd() & c };
+    LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
+
+    declare_func(void, uint8_t *pix, ptrdiff_t stride, int beta, int32_t *tc, uint8_t *no_p, uint8_t *no_q);
+
+    for (int j = 0; j < 3; j++) {
+        type = types[j];
+        if (check_func(c ? h->hevc_h_loop_filter_luma_c :
+                           h->hevc_h_loop_filter_luma, "hevc_h_loop_filter_luma%d_%s%s", bit_depth, type, c ? "_full" : "")) {
+            randomize_luma_buffers(j, &beta, tc, buf0 + BUF_OFFSET, 16 * SIZEOF_PIXEL, SIZEOF_PIXEL, bit_depth);
+            memcpy(buf1, buf0, BUF_SIZE);
+
+            call_ref(buf0 + BUF_OFFSET, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
+            call_new(buf1 + BUF_OFFSET, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
             if (memcmp(buf0, buf1, BUF_SIZE))
                 fail();
+            bench_new(buf1 + BUF_OFFSET, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
         }
-        bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
-    }
 
-    if (check_func(h->hevc_v_loop_filter_chroma, "hevc_v_loop_filter_chroma%d", bit_depth)) {
-        for (int i = 0; i < 4; i++) {
-            randomize_buffers(buf0, buf1, BUF_SIZE);
-            // see betatable[] in hevc_filter.c
-            tc[0] = (rnd() & 63) + (rnd() & 1);
-            tc[1] = (rnd() & 63) + (rnd() & 1);
+        if (check_func(c ? h->hevc_v_loop_filter_luma_c :
+                           h->hevc_v_loop_filter_luma, "hevc_v_loop_filter_luma%d_%s%s", bit_depth, type, c ? "_full" : "")) {
+            randomize_luma_buffers(j, &beta, tc, buf0 + BUF_OFFSET, SIZEOF_PIXEL, 16 * SIZEOF_PIXEL, bit_depth);
+            memcpy(buf1, buf0, BUF_SIZE);
 
-            call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
-            call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
+            call_ref(buf0 + BUF_OFFSET, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
+            call_new(buf1 + BUF_OFFSET, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
             if (memcmp(buf0, buf1, BUF_SIZE))
                 fail();
+            bench_new(buf1 + BUF_OFFSET, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
         }
-        bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
     }
 }
 
 void checkasm_check_hevc_deblock(void)
 {
+    HEVCDSPContext h;
     int bit_depth;
-
     for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
-        HEVCDSPContext h;
         ff_hevc_dsp_init(&h, bit_depth);
-        check_deblock_chroma(&h, bit_depth);
+        check_deblock_chroma(&h, bit_depth, 0);
+        // check _c variants (non-simplified asm which allows skipping p/q)
+        check_deblock_chroma(&h, bit_depth, 1);
     }
     report("chroma");
+    for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+        ff_hevc_dsp_init(&h, bit_depth);
+        check_deblock_luma(&h, bit_depth, 0);
+        // as above
+        check_deblock_luma(&h, bit_depth, 1);
+    }
+    report("luma");
 }