diff mbox series

[FFmpeg-devel,01/10] checkasm: Add vc1dsp in-loop deblocking filter tests

Message ID 20220325185257.513933-2-bavison@riscosopen.org
State New
Headers show
Series avcodec/vc1: Arm optimisations | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Ben Avison March 25, 2022, 6:52 p.m. UTC
Note that the benchmarking results for these functions are highly dependent
upon the input data. Therefore, each function is benchmarked twice,
corresponding to the best and worst case complexity of the reference C
implementation. The performance of a real stream decode will fall somewhere
between these two extremes.

Signed-off-by: Ben Avison <bavison@riscosopen.org>
---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/vc1dsp.c   | 94 +++++++++++++++++++++++++++++++++++++++
 tests/fate/checkasm.mak   |  1 +
 5 files changed, 100 insertions(+)
 create mode 100644 tests/checkasm/vc1dsp.c

Comments

Martin Storsjö March 25, 2022, 10:53 p.m. UTC | #1
On Fri, 25 Mar 2022, Ben Avison wrote:

> Note that the benchmarking results for these functions are highly dependent
> upon the input data. Therefore, each function is benchmarked twice,
> corresponding to the best and worst case complexity of the reference C
> implementation. The performance of a real stream decode will fall somewhere
> between these two extremes.
>
> Signed-off-by: Ben Avison <bavison@riscosopen.org>
> ---
> tests/checkasm/Makefile   |  1 +
> tests/checkasm/checkasm.c |  3 ++
> tests/checkasm/checkasm.h |  1 +
> tests/checkasm/vc1dsp.c   | 94 +++++++++++++++++++++++++++++++++++++++
> tests/fate/checkasm.mak   |  1 +
> 5 files changed, 100 insertions(+)
> create mode 100644 tests/checkasm/vc1dsp.c
>
> +#define CHECK_LOOP_FILTER(func)                                             \
> +    do {                                                                    \
> +        if (check_func(h.func, "vc1dsp." #func)) {                          \
> +            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);  \
> +            for (int count = 1000; count > 0; --count) {                    \
> +                int pq = rnd() % 31 + 1;                                    \
> +                RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 24);        \
> +                call_ref(filter_buf0 + 4 * 24 + 4, 24, pq);                 \
> +                call_new(filter_buf1 + 4 * 24 + 4, 24, pq);                 \
> +                if (memcmp(filter_buf0, filter_buf1, 24 * 24))              \
> +                    fail();                                                 \
> +            }                                                               \
> +        }                                                                   \
> +        for (int j = 0; j < 24; ++j)                                        \
> +            for (int i = 0; i < 24; ++i)                                    \
> +                filter_buf1[24*j + i] = 0x60 + 0x40 * (i >= 4 && j >= 4);   \
> +        if (check_func(h.func, "vc1dsp." #func "_bestcase")) {              \
> +            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);  \
> +            bench_new(filter_buf1 + 4 * 24 + 4, 24, 1);                     \
> +            (void) checked_call;                                            \
> +        }                                                                   \
> +        if (check_func(h.func, "vc1dsp." #func "_worstcase")) {             \
> +            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);  \
> +            bench_new(filter_buf1 + 4 * 24 + 4, 24, 31);                    \
> +            (void) checked_call;                                            \
> +        }                                                                   \

(not a full review, just something that cropped up in initial build 
testing)

Why do you have the "(void) checked_call;" here? The checked_call isn't 
something that is universally defined; its availability depends on the 
OS/arch combination. On other combinations, call_new/call_ref just call 
the function straight away without a wrapper. In particular, on macOS on 
arm64, we don't use checked_call, due to differences in how parameters are 
packed on the stack in the darwin ABI compared to AAPCS.

// Martin
Ben Avison March 28, 2022, 6:28 p.m. UTC | #2
On 25/03/2022 22:53, Martin Storsjö wrote:
> On Fri, 25 Mar 2022, Ben Avison wrote:
> 
>> +#define CHECK_LOOP_FILTER(func)                                             \
>> +    do {                                                                    \
>> +        if (check_func(h.func, "vc1dsp." #func)) {                          \
>> +            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);  \
>> +            for (int count = 1000; count > 0; --count) {                    \
>> +                int pq = rnd() % 31 + 1;                                    \
>> +                RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 24);        \
>> +                call_ref(filter_buf0 + 4 * 24 + 4, 24, pq);                 \
>> +                call_new(filter_buf1 + 4 * 24 + 4, 24, pq);                 \
>> +                if (memcmp(filter_buf0, filter_buf1, 24 * 24))              \
>> +                    fail();                                                 \
>> +            }                                                               \
>> +        }                                                                   \
>> +        for (int j = 0; j < 24; ++j)                                        \
>> +            for (int i = 0; i < 24; ++i)                                    \
>> +                filter_buf1[24*j + i] = 0x60 + 0x40 * (i >= 4 && j >= 4);   \
>> +        if (check_func(h.func, "vc1dsp." #func "_bestcase")) {              \
>> +            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);  \
>> +            bench_new(filter_buf1 + 4 * 24 + 4, 24, 1);                     \
>> +            (void) checked_call;                                            \
>> +        }                                                                   \
>> +        if (check_func(h.func, "vc1dsp." #func "_worstcase")) {             \
>> +            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);  \
>> +            bench_new(filter_buf1 + 4 * 24 + 4, 24, 31);                    \
>> +            (void) checked_call;                                            \
>> +        }                                                                   \
> 
> (not a full review, just something that cropped up in initial build 
> testing)
> 
> Why do you have the "(void) checked_call;" here? The checked_call isn't 
> something that is universally defined; its availability depends on the 
> OS/arch combinations, on other combinations, call_new/call_ref just call 
> the function straight away without a wrapper.

OK, I missed that subtlety. My aim was to avoid the "unused variable" 
compiler warnings generated as a result of there being twice as many 
benchmark tests as correctness tests. I believe we need separate calls 
of check_func() to initialise the cycle counts for each benchmark, and 
copying the sequence of macros from checkasm/blockdsp.c, I was placing 
the declare_func_emms() invocations inside the if block that used 
check_func(). That meant that checked_call was initialised, but since 
the correctness test (call_ref / call_new) was in a different block 
scope, this checked_call declaration was never used.

Upon further investigation, I think it's valid to move the 
declare_func_emms() invocation up to the next largest block scope. That 
means it would only appear once rather than 3 times, and it wouldn't 
need the cast-to-void any more. Please do correct me if I'm wrong.

Ben
Martin Storsjö March 29, 2022, 11:47 a.m. UTC | #3
On Mon, 28 Mar 2022, Ben Avison wrote:

> On 25/03/2022 22:53, Martin Storsjö wrote:
>> On Fri, 25 Mar 2022, Ben Avison wrote:
>> 
>>> +#define CHECK_LOOP_FILTER(func)                                             \
>>> +    do {                                                                    \
>>> +        if (check_func(h.func, "vc1dsp." #func)) {                          \
>>> +            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);  \
>>> +            for (int count = 1000; count > 0; --count) {                    \
>>> +                int pq = rnd() % 31 + 1;                                    \
>>> +                RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 24);        \
>>> +                call_ref(filter_buf0 + 4 * 24 + 4, 24, pq);                 \
>>> +                call_new(filter_buf1 + 4 * 24 + 4, 24, pq);                 \
>>> +                if (memcmp(filter_buf0, filter_buf1, 24 * 24))              \
>>> +                    fail();                                                 \
>>> +            }                                                               \
>>> +        }                                                                   \
>>> +        for (int j = 0; j < 24; ++j)                                        \
>>> +            for (int i = 0; i < 24; ++i)                                    \
>>> +                filter_buf1[24*j + i] = 0x60 + 0x40 * (i >= 4 && j >= 4);   \
>>> +        if (check_func(h.func, "vc1dsp." #func "_bestcase")) {              \
>>> +            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);  \
>>> +            bench_new(filter_buf1 + 4 * 24 + 4, 24, 1);                     \
>>> +            (void) checked_call;                                            \
>>> +        }                                                                   \
>>> +        if (check_func(h.func, "vc1dsp." #func "_worstcase")) {             \
>>> +            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);  \
>>> +            bench_new(filter_buf1 + 4 * 24 + 4, 24, 31);                    \
>>> +            (void) checked_call;                                            \
>>> +        }                                                                   \
>> 
>> (not a full review, just something that cropped up in initial build 
>> testing)
>> 
>> Why do you have the "(void) checked_call;" here? The checked_call isn't 
>> something that is universally defined; its availability depends on the 
>> OS/arch combinations, on other combinations, call_new/call_ref just call 
>> the function straight away without a wrapper.
>
> OK, I missed that subtlety. My aim was to avoid the "unused variable" 
> compiler warnings generated as a result of there being twice as many 
> benchmark tests as correctness tests.

Oh, I see. I just ran into it when trying to compile on macOS, then edited 
it out and saw that it built fine there, but didn't try building for other 
platforms with the same modification.

> I believe we need separate calls of check_func() to initialise the cycle 
> counts for each benchmark, and copying the sequence of macros from 
> checkasm/blockdsp.c,

FWIW I think blockdsp.c might have been a bad example in that regard, as 
it expands the whole testcase with macros. (I chose it mainly as it was 
one of the shortest testcases.)

I think e.g. vp8dsp would have been a better example - with the toplevel 
checkasm_check_*() function just calling individual functions for the 
tests for various function groups. As check_func() can take a format 
string, you don't usually need the macro expansion for filling that in.

> I was placing the declare_func_emms() invocations inside the if block 
> that used check_func(). That meant that checked_call was initialised, 
> but since the correctness test (call_ref / call_new) was in a different 
> block scope, this checked_call declaration was never used.
>
> Upon further investigation, I think it's valid to move the 
> declare_func_emms() invocation up to the next largest block scope. That 
> means it would only appear once rather than 3 times, and it wouldn't 
> need the cast-to-void any more. Please do correct me if I'm wrong.

Yes, that seems correct to do. And looking at other examples, e.g. vp8dsp, 
that also uses such a structure, with declare_func_*() outside of 
check_func() - in a function like check_loopfilter_simple().

// Martin
Martin Storsjö March 29, 2022, 12:24 p.m. UTC | #4
On Fri, 25 Mar 2022, Ben Avison wrote:

> Note that the benchmarking results for these functions are highly dependent
> upon the input data. Therefore, each function is benchmarked twice,
> corresponding to the best and worst case complexity of the reference C
> implementation. The performance of a real stream decode will fall somewhere
> between these two extremes.

Great idea to do separate benchmarking of the best/worst cases like this - 
that is usually a recurring issue in benchmarking loop filters.

(Another issue with benchmarking loop filters is that the same 
function is run repeatedly without resetting the input data in between - so 
depending on the exact setup, it's possible that the decision about 
whether to filter or not is taken differently in the first and last runs. 
But this implementation seems very good in that aspect!)

> +++ b/tests/checkasm/vc1dsp.c
> @@ -0,0 +1,94 @@
> +/*
> + * Copyright (c) 2022 Ben Avison
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +#include <string.h>
> +
> +#include "checkasm.h"
> +
> +#include "libavcodec/vc1dsp.h"
> +
> +#include "libavutil/common.h"
> +#include "libavutil/internal.h"
> +#include "libavutil/intreadwrite.h"
> +#include "libavutil/mem_internal.h"
> +
> +#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size)  \
> +    do {                                            \
> +        uint8_t *p##0 = name##0, *p##1 = name##1;   \
> +        int i = (size);                             \
> +        while (i-- > 0) {                           \
> +            int x = 0x80 | (rnd() & 0x7F);          \
> +            x >>= rnd() % 9;                        \
> +            if (rnd() & 1)                          \
> +                x = -x;                             \
> +            *p##1++ = *p##0++ = 0x80 + x;           \
> +        }                                           \
> +    } while (0)
> +
> +#define CHECK_LOOP_FILTER(func)                                             \
> +    do {                                                                    \
> +        if (check_func(h.func, "vc1dsp." #func)) {                          \
> +            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);  \
> +            for (int count = 1000; count > 0; --count) {                    \
> +                int pq = rnd() % 31 + 1;                                    \
> +                RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 24);        \
> +                call_ref(filter_buf0 + 4 * 24 + 4, 24, pq);                 \
> +                call_new(filter_buf1 + 4 * 24 + 4, 24, pq);                 \
> +                if (memcmp(filter_buf0, filter_buf1, 24 * 24))              \
> +                    fail();                                                 \
> +            }                                                               \
> +        }                                                                   \
> +        for (int j = 0; j < 24; ++j)                                        \
> +            for (int i = 0; i < 24; ++i)                                    \
> +                filter_buf1[24*j + i] = 0x60 + 0x40 * (i >= 4 && j >= 4);   \
> +        if (check_func(h.func, "vc1dsp." #func "_bestcase")) {              \
> +            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);  \
> +            bench_new(filter_buf1 + 4 * 24 + 4, 24, 1);                     \
> +            (void) checked_call;                                            \
> +        }                                                                   \
> +        if (check_func(h.func, "vc1dsp." #func "_worstcase")) {             \
> +            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);  \
> +            bench_new(filter_buf1 + 4 * 24 + 4, 24, 31);                    \
> +            (void) checked_call;                                            \
> +        }                                                                   \
> +    } while (0)
> +
> +void checkasm_check_vc1dsp(void)
> +{
> +    /* Deblocking filter buffers are big enough to hold a 16x16 block,
> +     * plus 4 rows/columns above/left to hold filter inputs (depending on
> +     * whether v or h neighbouring block edge) plus 4 rows/columns
> +     * right/below to catch write overflows */
> +    LOCAL_ALIGNED_4(uint8_t, filter_buf0, [24 * 24]);
> +    LOCAL_ALIGNED_4(uint8_t, filter_buf1, [24 * 24]);
> +
> +    VC1DSPContext h;
> +
> +    ff_vc1dsp_init(&h);
> +
> +    CHECK_LOOP_FILTER(vc1_v_loop_filter4);
> +    CHECK_LOOP_FILTER(vc1_h_loop_filter4);
> +    CHECK_LOOP_FILTER(vc1_v_loop_filter8);
> +    CHECK_LOOP_FILTER(vc1_h_loop_filter8);
> +    CHECK_LOOP_FILTER(vc1_v_loop_filter16);
> +    CHECK_LOOP_FILTER(vc1_h_loop_filter16);
> +
> +    report("loop_filter");
> +}

This looks great to me overall. I think it'd be nice to unmacro 
CHECK_LOOP_FILTER though and make a separate check_loopfilter() function 
like in vp8dsp.c instead, and move the declare_func_emms outside of 
check_func() as you concluded.

// Martin
Martin Storsjö March 29, 2022, 12:43 p.m. UTC | #5
On Fri, 25 Mar 2022, Ben Avison wrote:

> Note that the benchmarking results for these functions are highly dependent
> upon the input data. Therefore, each function is benchmarked twice,
> corresponding to the best and worst case complexity of the reference C
> implementation. The performance of a real stream decode will fall somewhere
> between these two extremes.
>
> Signed-off-by: Ben Avison <bavison@riscosopen.org>
> ---
> tests/checkasm/Makefile   |  1 +
> tests/checkasm/checkasm.c |  3 ++
> tests/checkasm/checkasm.h |  1 +
> tests/checkasm/vc1dsp.c   | 94 +++++++++++++++++++++++++++++++++++++++
> tests/fate/checkasm.mak   |  1 +
> 5 files changed, 100 insertions(+)
> create mode 100644 tests/checkasm/vc1dsp.c

Actually, this test already paid off - thanks! It caught a real issue with 
the existing x86 loopfilter assembly. The stride parameter is 'int', but 
the assembly uses it as a full register without clearing/sign extending 
the upper half.

Instead of complicating the assembly, the usual remedy is to change the 
parameter to ptrdiff_t, to avoid the issue altogether - I'll send a patch 
for that.

// Martin
diff mbox series

Patch

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index f768b1144e..7133a6ee66 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -11,6 +11,7 @@  AVCODECOBJS-$(CONFIG_H264PRED)          += h264pred.o
 AVCODECOBJS-$(CONFIG_H264QPEL)          += h264qpel.o
 AVCODECOBJS-$(CONFIG_LLVIDDSP)          += llviddsp.o
 AVCODECOBJS-$(CONFIG_LLVIDENCDSP)       += llviddspenc.o
+AVCODECOBJS-$(CONFIG_VC1DSP)            += vc1dsp.o
 AVCODECOBJS-$(CONFIG_VP8DSP)            += vp8dsp.o
 AVCODECOBJS-$(CONFIG_VIDEODSP)          += videodsp.o
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 748d6a9f3a..c2efd81b6d 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -147,6 +147,9 @@  static const struct {
     #if CONFIG_V210_ENCODER
         { "v210enc", checkasm_check_v210enc },
     #endif
+    #if CONFIG_VC1DSP
+        { "vc1dsp", checkasm_check_vc1dsp },
+    #endif
     #if CONFIG_VP8DSP
         { "vp8dsp", checkasm_check_vp8dsp },
     #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index c3192d8c23..52ab18a5b1 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -78,6 +78,7 @@  void checkasm_check_sw_scale(void);
 void checkasm_check_utvideodsp(void);
 void checkasm_check_v210dec(void);
 void checkasm_check_v210enc(void);
+void checkasm_check_vc1dsp(void);
 void checkasm_check_vf_eq(void);
 void checkasm_check_vf_gblur(void);
 void checkasm_check_vf_hflip(void);
diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c
new file mode 100644
index 0000000000..db916d08f9
--- /dev/null
+++ b/tests/checkasm/vc1dsp.c
@@ -0,0 +1,94 @@ 
+/*
+ * Copyright (c) 2022 Ben Avison
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "checkasm.h"
+
+#include "libavcodec/vc1dsp.h"
+
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size)  \
+    do {                                            \
+        uint8_t *p##0 = name##0, *p##1 = name##1;   \
+        int i = (size);                             \
+        while (i-- > 0) {                           \
+            int x = 0x80 | (rnd() & 0x7F);          \
+            x >>= rnd() % 9;                        \
+            if (rnd() & 1)                          \
+                x = -x;                             \
+            *p##1++ = *p##0++ = 0x80 + x;           \
+        }                                           \
+    } while (0)
+
+#define CHECK_LOOP_FILTER(func)                                             \
+    do {                                                                    \
+        if (check_func(h.func, "vc1dsp." #func)) {                          \
+            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);  \
+            for (int count = 1000; count > 0; --count) {                    \
+                int pq = rnd() % 31 + 1;                                    \
+                RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 24);        \
+                call_ref(filter_buf0 + 4 * 24 + 4, 24, pq);                 \
+                call_new(filter_buf1 + 4 * 24 + 4, 24, pq);                 \
+                if (memcmp(filter_buf0, filter_buf1, 24 * 24))              \
+                    fail();                                                 \
+            }                                                               \
+        }                                                                   \
+        for (int j = 0; j < 24; ++j)                                        \
+            for (int i = 0; i < 24; ++i)                                    \
+                filter_buf1[24*j + i] = 0x60 + 0x40 * (i >= 4 && j >= 4);   \
+        if (check_func(h.func, "vc1dsp." #func "_bestcase")) {              \
+            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);  \
+            bench_new(filter_buf1 + 4 * 24 + 4, 24, 1);                     \
+            (void) checked_call;                                            \
+        }                                                                   \
+        if (check_func(h.func, "vc1dsp." #func "_worstcase")) {             \
+            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, int, int);  \
+            bench_new(filter_buf1 + 4 * 24 + 4, 24, 31);                    \
+            (void) checked_call;                                            \
+        }                                                                   \
+    } while (0)
+
+void checkasm_check_vc1dsp(void)
+{
+    /* Deblocking filter buffers are big enough to hold a 16x16 block,
+     * plus 4 rows/columns above/left to hold filter inputs (depending on
+     * whether v or h neighbouring block edge) plus 4 rows/columns
+     * right/below to catch write overflows */
+    LOCAL_ALIGNED_4(uint8_t, filter_buf0, [24 * 24]);
+    LOCAL_ALIGNED_4(uint8_t, filter_buf1, [24 * 24]);
+
+    VC1DSPContext h;
+
+    ff_vc1dsp_init(&h);
+
+    CHECK_LOOP_FILTER(vc1_v_loop_filter4);
+    CHECK_LOOP_FILTER(vc1_h_loop_filter4);
+    CHECK_LOOP_FILTER(vc1_v_loop_filter8);
+    CHECK_LOOP_FILTER(vc1_h_loop_filter8);
+    CHECK_LOOP_FILTER(vc1_v_loop_filter16);
+    CHECK_LOOP_FILTER(vc1_h_loop_filter16);
+
+    report("loop_filter");
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index 6db8f09d12..99e6bb13c4 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -32,6 +32,7 @@  FATE_CHECKASM = fate-checkasm-aacpsdsp                                  \
                 fate-checkasm-utvideodsp                                \
                 fate-checkasm-v210dec                                   \
                 fate-checkasm-v210enc                                   \
+                fate-checkasm-vc1dsp                                    \
                 fate-checkasm-vf_blend                                  \
                 fate-checkasm-vf_colorspace                             \
                 fate-checkasm-vf_eq                                     \