diff mbox series

[FFmpeg-devel,04/10] avcodec/vc1: Introduce fast path for unescaping bitstream buffer

Message ID 20220325185257.513933-5-bavison@riscosopen.org
State New
Headers show
Series avcodec/vc1: Arm optimisations | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Ben Avison March 25, 2022, 6:52 p.m. UTC
Includes a checkasm test.

Signed-off-by: Ben Avison <bavison@riscosopen.org>
---
 libavcodec/vc1dec.c     | 20 +++++++-------
 libavcodec/vc1dsp.c     |  2 ++
 libavcodec/vc1dsp.h     |  3 +++
 tests/checkasm/vc1dsp.c | 59 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 74 insertions(+), 10 deletions(-)

Comments

Martin Storsjö March 29, 2022, 8:37 p.m. UTC | #1
On Fri, 25 Mar 2022, Ben Avison wrote:

> void ff_vc1dsp_init(VC1DSPContext* c);
> diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c
> index 0823ccad31..0ab5892403 100644
> --- a/tests/checkasm/vc1dsp.c
> +++ b/tests/checkasm/vc1dsp.c
> @@ -286,6 +286,20 @@ static matrix *generate_inverse_quantized_transform_coefficients(size_t width, s
>         }                                                                   \
>     } while (0)
>
> +#define TEST_UNESCAPE                                                                                   \
> +    do {                                                                                            \
> +        for (int count = 100; count > 0; --count) {                                                 \
> +            escaped_offset = rnd() & 7;                                                             \
> +            unescaped_offset = rnd() & 7;                                                           \
> +            escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7);                                    \
> +            RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE);                                        \

The output buffer will be overwritten in the end, but I guess this 
initialization is useful for making sure that the test doesn't 
accidentally rely on the output from the previous iteration, right?

> +            len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \
> +            len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \
> +            if (len0 != len1 || memcmp(unescaped0, unescaped1, len0))                               \

Don't you need to include unescaped_offset here too? Otherwise you're just 
checking areas of the buffer that weren't necessarily written.


> +                fail();                                                                             \
> +        }                                                                                           \
> +    } while (0)
> +

As with the rest of the checkasm tests - please unmacro most things where 
possible (except for the RANDOMIZE_* macros, those are ok to keep macroed 
if you want to). And sorry for leading you down a path with a bad example 
in that respect.

> void checkasm_check_vc1dsp(void)
> {
>     /* Inverse transform input coefficients are stored in a 16-bit buffer
> @@ -309,6 +323,14 @@ void checkasm_check_vc1dsp(void)
>     LOCAL_ALIGNED_4(uint8_t, filter_buf0, [24 * 24]);
>     LOCAL_ALIGNED_4(uint8_t, filter_buf1, [24 * 24]);
>
> +    /* This appears to be a typical length of buffer in use */
> +#define LOG2_UNESCAPE_BUF_SIZE 17
> +#define UNESCAPE_BUF_SIZE (1u<<LOG2_UNESCAPE_BUF_SIZE)
> +    LOCAL_ALIGNED_8(uint8_t, escaped0, [UNESCAPE_BUF_SIZE]);
> +    LOCAL_ALIGNED_8(uint8_t, escaped1, [UNESCAPE_BUF_SIZE]);
> +    LOCAL_ALIGNED_8(uint8_t, unescaped0, [UNESCAPE_BUF_SIZE]);
> +    LOCAL_ALIGNED_8(uint8_t, unescaped1, [UNESCAPE_BUF_SIZE]);
> +
>     VC1DSPContext h;
>
>     ff_vc1dsp_init(&h);
> @@ -349,4 +371,41 @@ void checkasm_check_vc1dsp(void)
>     CHECK_LOOP_FILTER(vc1_h_loop_filter16);
>
>     report("loop_filter");
> +
> +    if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) {
> +        int len0, len1, escaped_offset, unescaped_offset, escaped_len;
> +        declare_func_emms(AV_CPU_FLAG_MMX, int, const uint8_t *, int, uint8_t *);
> +
> +        /* Test data which consists of escape sequences packed as tightly as possible */
> +        for (int x = 0; x < UNESCAPE_BUF_SIZE; ++x)
> +            escaped1[x] = escaped0[x] = 3 * (x % 3 == 0);
> +        TEST_UNESCAPE;
> +
> +        /* Test random data */
> +        RANDOMIZE_BUFFER8(escaped, UNESCAPE_BUF_SIZE);
> +        TEST_UNESCAPE;
> +
> +        /* Test data with escape sequences at random intervals */
> +        for (int x = 0; x <= UNESCAPE_BUF_SIZE - 4;) {
> +            int gap, gap_msb;
> +            escaped1[x+0] = escaped0[x+0] = 0;
> +            escaped1[x+1] = escaped0[x+1] = 0;
> +            escaped1[x+2] = escaped0[x+2] = 3;
> +            escaped1[x+3] = escaped0[x+3] = rnd() & 3;
> +            gap_msb = 2u << (rnd() % 8);
> +            gap = (rnd() &~ -gap_msb) | gap_msb;
> +            x += gap;
> +        }
> +        TEST_UNESCAPE;
> +
> +        /* Test data which is known to contain no escape sequences */
> +        memset(escaped0, 0xFF, UNESCAPE_BUF_SIZE);
> +        memset(escaped1, 0xFF, UNESCAPE_BUF_SIZE);
> +        TEST_UNESCAPE;
> +
> +        /* Benchmark the no-escape-sequences case */
> +        bench_new(escaped1, UNESCAPE_BUF_SIZE, unescaped1);
> +    }
> +
> +    report("unescape_buffer");
> }

The test looks great otherwise! But please split the code for it into a 
standalone function, e.g. check_unescape(), so the main 
checkasm_check_vc1dsp() just is a list of calls to check_loopfilter(), 
check_idct(), check_unescape() etc.

// Martin
Ben Avison March 31, 2022, 1:58 p.m. UTC | #2
On 29/03/2022 21:37, Martin Storsjö wrote:
> On Fri, 25 Mar 2022, Ben Avison wrote:
>> +#define TEST_UNESCAPE                                                                                   \
>> +    do {                                                                                            \
>> +        for (int count = 100; count > 0; --count) {                                                 \
>> +            escaped_offset = rnd() & 7;                                                             \
>> +            unescaped_offset = rnd() & 7;                                                           \
>> +            escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7);                                    \
>> +            RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE);                                        \
> 
> The output buffer will be overwritten in the end, but I guess this 
> initialization is useful for making sure that the test doesn't 
> accidentally rely on the output from the previous iteration, right?

The main idea was to catch examples of writing to the buffer beyond the 
length reported (and less likely, writes before the start of the 
buffer). I suppose it's possible that someone might want to deliberately 
overwrite in specific conditions, but the test could always be loosened 
up at that point once those conditions become clearer.

>> +            len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \
>> +            len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \
>> +            if (len0 != len1 || memcmp(unescaped0, unescaped1, len0))                               \
> 
> Don't you need to include unescaped_offset here too? Otherwise you're 
> just checking areas of the buffer that weren't necessarily written.

I realise I should have made the memcmp length UNESCAPE_BUF_SIZE here to 
achieve what I intended. Testing len0 bytes from the start of the buffer 
neither checks all the written bytes nor checks the byte after those 
written :-$

> As with the rest of the checkasm tests - please unmacro most things 
> where possible (except for the RANDOMIZE_* macros, those are ok to keep 
> macroed if you want to).

In the case of TEST_UNESCAPE, I think it has to remain as a macro, 
otherwise the next function up ends up with a declare_func_emms() and a 
bench_new() but no call_ref() or call_new(), which means some builds end 
up with an unused function warning.

I can, however, split all the unescape tests out of 
checkasm_check_vc1dsp into a separate function (and separate functions 
for inverse-transform and deblocking tests).

Ben
Martin Storsjö March 31, 2022, 2:07 p.m. UTC | #3
On Thu, 31 Mar 2022, Ben Avison wrote:

> On 29/03/2022 21:37, Martin Storsjö wrote:
>> On Fri, 25 Mar 2022, Ben Avison wrote:
>> As with the rest of the checkasm tests - please unmacro most things where 
>> possible (except for the RANDOMIZE_* macros, those are ok to keep macroed 
>> if you want to).
>
> In the case of TEST_UNESCAPE, I think it has to remain as a macro, otherwise 
> the next function up ends up with a declare_func_emms() and a bench_new() but 
> no call_ref() or call_new(), which means some builds end up with an unused 
> function warning.

Oh, right - yes, call_ref and call_new need to be in the same scope as 
declare_func, yes.

> I can, however, split all the unescape tests out of checkasm_check_vc1dsp 
> into a separate function (and separate functions for inverse-transform and 
> deblocking tests).

Awesome, thanks!

// Martin
diff mbox series

Patch

diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index 1c92b9d401..6a30b5b664 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -490,7 +490,7 @@  static av_cold int vc1_decode_init(AVCodecContext *avctx)
             size = next - start - 4;
             if (size <= 0)
                 continue;
-            buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
+            buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
             init_get_bits(&gb, buf2, buf2_size * 8);
             switch (AV_RB32(start)) {
             case VC1_CODE_SEQHDR:
@@ -680,7 +680,7 @@  static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                 case VC1_CODE_FRAME:
                     if (avctx->hwaccel)
                         buf_start = start;
-                    buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
+                    buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
                     break;
                 case VC1_CODE_FIELD: {
                     int buf_size3;
@@ -697,8 +697,8 @@  static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                         ret = AVERROR(ENOMEM);
                         goto err;
                     }
-                    buf_size3 = vc1_unescape_buffer(start + 4, size,
-                                                    slices[n_slices].buf);
+                    buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
+                                                              slices[n_slices].buf);
                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                                   buf_size3 << 3);
                     slices[n_slices].mby_start = avctx->coded_height + 31 >> 5;
@@ -709,7 +709,7 @@  static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     break;
                 }
                 case VC1_CODE_ENTRYPOINT: /* it should be before frame data */
-                    buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
+                    buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
                     init_get_bits(&s->gb, buf2, buf_size2 * 8);
                     ff_vc1_decode_entry_point(avctx, v, &s->gb);
                     break;
@@ -726,8 +726,8 @@  static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                         ret = AVERROR(ENOMEM);
                         goto err;
                     }
-                    buf_size3 = vc1_unescape_buffer(start + 4, size,
-                                                    slices[n_slices].buf);
+                    buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
+                                                              slices[n_slices].buf);
                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                                   buf_size3 << 3);
                     slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9);
@@ -761,7 +761,7 @@  static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     ret = AVERROR(ENOMEM);
                     goto err;
                 }
-                buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
+                buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
                 init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                               buf_size3 << 3);
                 slices[n_slices].mby_start = s->mb_height + 1 >> 1;
@@ -770,9 +770,9 @@  static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                 n_slices1 = n_slices - 1;
                 n_slices++;
             }
-            buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2);
+            buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2);
         } else {
-            buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2);
+            buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2);
         }
         init_get_bits(&s->gb, buf2, buf_size2*8);
     } else{
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index a29b91bf3d..11d493f002 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -34,6 +34,7 @@ 
 #include "rnd_avg.h"
 #include "vc1dsp.h"
 #include "startcode.h"
+#include "vc1_common.h"
 
 /* Apply overlap transform to horizontal edge */
 static void vc1_v_overlap_c(uint8_t *src, int stride)
@@ -1030,6 +1031,7 @@  av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
 #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
 
     dsp->startcode_find_candidate = ff_startcode_find_candidate_c;
+    dsp->vc1_unescape_buffer      = vc1_unescape_buffer;
 
     if (ARCH_AARCH64)
         ff_vc1dsp_init_aarch64(dsp);
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
index c6443acb20..8be1198071 100644
--- a/libavcodec/vc1dsp.h
+++ b/libavcodec/vc1dsp.h
@@ -80,6 +80,9 @@  typedef struct VC1DSPContext {
      * one or more further zero bytes and a one byte.
      */
     int (*startcode_find_candidate)(const uint8_t *buf, int size);
+
+    /* Copy a buffer, removing startcode emulation escape bytes as we go */
+    int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst);
 } VC1DSPContext;
 
 void ff_vc1dsp_init(VC1DSPContext* c);
diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c
index 0823ccad31..0ab5892403 100644
--- a/tests/checkasm/vc1dsp.c
+++ b/tests/checkasm/vc1dsp.c
@@ -286,6 +286,20 @@  static matrix *generate_inverse_quantized_transform_coefficients(size_t width, s
         }                                                                   \
     } while (0)
 
+#define TEST_UNESCAPE                                                                                   \
+    do {                                                                                            \
+        for (int count = 100; count > 0; --count) {                                                 \
+            escaped_offset = rnd() & 7;                                                             \
+            unescaped_offset = rnd() & 7;                                                           \
+            escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7);                                    \
+            RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE);                                        \
+            len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \
+            len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \
+            if (len0 != len1 || memcmp(unescaped0, unescaped1, len0))                               \
+                fail();                                                                             \
+        }                                                                                           \
+    } while (0)
+
 void checkasm_check_vc1dsp(void)
 {
     /* Inverse transform input coefficients are stored in a 16-bit buffer
@@ -309,6 +323,14 @@  void checkasm_check_vc1dsp(void)
     LOCAL_ALIGNED_4(uint8_t, filter_buf0, [24 * 24]);
     LOCAL_ALIGNED_4(uint8_t, filter_buf1, [24 * 24]);
 
+    /* This appears to be a typical length of buffer in use */
+#define LOG2_UNESCAPE_BUF_SIZE 17
+#define UNESCAPE_BUF_SIZE (1u<<LOG2_UNESCAPE_BUF_SIZE)
+    LOCAL_ALIGNED_8(uint8_t, escaped0, [UNESCAPE_BUF_SIZE]);
+    LOCAL_ALIGNED_8(uint8_t, escaped1, [UNESCAPE_BUF_SIZE]);
+    LOCAL_ALIGNED_8(uint8_t, unescaped0, [UNESCAPE_BUF_SIZE]);
+    LOCAL_ALIGNED_8(uint8_t, unescaped1, [UNESCAPE_BUF_SIZE]);
+
     VC1DSPContext h;
 
     ff_vc1dsp_init(&h);
@@ -349,4 +371,41 @@  void checkasm_check_vc1dsp(void)
     CHECK_LOOP_FILTER(vc1_h_loop_filter16);
 
     report("loop_filter");
+
+    if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) {
+        int len0, len1, escaped_offset, unescaped_offset, escaped_len;
+        declare_func_emms(AV_CPU_FLAG_MMX, int, const uint8_t *, int, uint8_t *);
+
+        /* Test data which consists of escape sequences packed as tightly as possible */
+        for (int x = 0; x < UNESCAPE_BUF_SIZE; ++x)
+            escaped1[x] = escaped0[x] = 3 * (x % 3 == 0);
+        TEST_UNESCAPE;
+
+        /* Test random data */
+        RANDOMIZE_BUFFER8(escaped, UNESCAPE_BUF_SIZE);
+        TEST_UNESCAPE;
+
+        /* Test data with escape sequences at random intervals */
+        for (int x = 0; x <= UNESCAPE_BUF_SIZE - 4;) {
+            int gap, gap_msb;
+            escaped1[x+0] = escaped0[x+0] = 0;
+            escaped1[x+1] = escaped0[x+1] = 0;
+            escaped1[x+2] = escaped0[x+2] = 3;
+            escaped1[x+3] = escaped0[x+3] = rnd() & 3;
+            gap_msb = 2u << (rnd() % 8);
+            gap = (rnd() &~ -gap_msb) | gap_msb;
+            x += gap;
+        }
+        TEST_UNESCAPE;
+
+        /* Test data which is known to contain no escape sequences */
+        memset(escaped0, 0xFF, UNESCAPE_BUF_SIZE);
+        memset(escaped1, 0xFF, UNESCAPE_BUF_SIZE);
+        TEST_UNESCAPE;
+
+        /* Benchmark the no-escape-sequences case */
+        bench_new(escaped1, UNESCAPE_BUF_SIZE, unescaped1);
+    }
+
+    report("unescape_buffer");
 }