[FFmpeg-devel,6/6] avcodec/vc1: Introduce fast path for unescaping bitstream buffer

Message ID 20220317185819.466470-7-bavison@riscosopen.org
State New
Series avcodec/vc1: Arm optimisations

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_aarch64_jetson success Make finished
andriy/make_fate_aarch64_jetson success Make fate finished
andriy/make_armv7_RPi4 success Make finished
andriy/make_fate_armv7_RPi4 success Make fate finished

Commit Message

Ben Avison March 17, 2022, 6:58 p.m. UTC
Populate with implementations suitable for 32-bit and 64-bit Arm.

Signed-off-by: Ben Avison <bavison@riscosopen.org>
---
 libavcodec/aarch64/vc1dsp_init_aarch64.c |  60 ++++++++
 libavcodec/aarch64/vc1dsp_neon.S         | 176 +++++++++++++++++++++++
 libavcodec/arm/vc1dsp_init_neon.c        |  60 ++++++++
 libavcodec/arm/vc1dsp_neon.S             | 118 +++++++++++++++
 libavcodec/vc1dec.c                      |  20 +--
 libavcodec/vc1dsp.c                      |   2 +
 libavcodec/vc1dsp.h                      |   3 +
 7 files changed, 429 insertions(+), 10 deletions(-)

Comments

Andreas Rheinhardt March 18, 2022, 7:10 p.m. UTC | #1
Ben Avison:
> Populate with implementations suitable for 32-bit and 64-bit Arm.
> 
> Signed-off-by: Ben Avison <bavison@riscosopen.org>
> ---
[...]
> +int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
> +
> +static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
> +{
> +    /* Dealing with starting and stopping, and removing escape bytes, are
> +     * comparatively less time-sensitive, so are more clearly expressed using
> +     * a C wrapper around the assembly inner loop. Note that we assume a
> +     * little-endian machine that supports unaligned loads. */

You should nevertheless use AV_RL32 for your unaligned LE loads
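
For illustration, the wrapper's escape test could then be written roughly as
follows (a sketch only; AV_RL32 is the unaligned little-endian 32-bit load
macro from libavutil/intreadwrite.h):

    #include <stdint.h>
    #include "libavutil/intreadwrite.h"

    /* Sketch: the same escape test, with the unaligned little-endian load
     * expressed via AV_RL32 instead of a plain pointer cast. */
    static inline int vc1_is_escape(const uint8_t *src)
    {
        /* Matches bytes 0x00 0x00 0x03 0x0X (X <= 3) read as one LE word. */
        return (AV_RL32(src) & ~0x03000000u) == 0x00030000u;
    }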

[...]

1. You should add some benchmarks to the commit message.
2. The unescaping process for VC1 is basically the same as for H.264 and
HEVC* and for those we already have better optimized code in
libavcodec/h2645_parse.c. Can you check the performance of this code
here against (re)using the code from h2645_parse.c?
(3. Btw: The code in h2645_parse.c could even be optimized further along
the lines of
https://ffmpeg.org/pipermail/ffmpeg-devel/2019-June/245203.html (The
H.264 and VC1 parsers use a quite suboptimal startcode search; this
patch is part of a patchset I submitted ages ago to improve it.).)

- Andreas

*: Except for the fact that VC-1 seems to allow 0x00 0x00 0x03 0xXY with
0xXY > 3 (where the 0x03 is not escaped) to occur inside an EBDU; it also
allows 0x00 0x00 0x02 (while the informative process for encoders is the
same as for H.2645, it does not produce the byte sequences disallowed by
H.264).
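
For reference, the escape-removal rule being discussed amounts to roughly the
following scalar logic (an illustrative sketch, not the exact helper in
vc1_common.h):

    #include <stdint.h>

    /* Illustrative sketch of VC-1 startcode-emulation removal: a 0x03 byte
     * following two zero bytes is an escape byte and is dropped whenever the
     * byte after it is 0x00..0x03; every other byte is copied through. */
    static int vc1_unescape_sketch(const uint8_t *src, int size, uint8_t *dst)
    {
        int dsize = 0;
        for (int i = 0; i < size; i++) {
            if (i >= 2 && i + 1 < size &&
                src[i - 2] == 0x00 && src[i - 1] == 0x00 &&
                src[i]     == 0x03 && src[i + 1] <= 0x03)
                continue;           /* drop the emulation prevention byte */
            dst[dsize++] = src[i];
        }
        return dsize;
    }
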
Ben Avison March 21, 2022, 3:51 p.m. UTC | #2
On 18/03/2022 19:10, Andreas Rheinhardt wrote:
> Ben Avison:
>> +static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
>> +{
>> +    /* Dealing with starting and stopping, and removing escape bytes, are
>> +     * comparatively less time-sensitive, so are more clearly expressed using
>> +     * a C wrapper around the assembly inner loop. Note that we assume a
>> +     * little-endian machine that supports unaligned loads. */
> 
> You should nevertheless use AV_RL32 for your unaligned LE loads

Thanks - I wasn't aware of that. I'll add it in.

> 1. You should add some benchmarks to the commit message.

Do you mean for each commit, or this one in particular? Are there any 
particular standard files you'd expect to see benchmarked, or will the 
ones I used in the cover-letter do? (Those were just snippets from 
problematic Blu-ray rips, but that does mean I don't have the rights to 
redistribute them.) I believe there should be conformance bitstreams for 
VC-1 somewhere, but I wasn't able to locate them.

During development, I wrote a simple benchmarker for this particular 
patch, which measures the throughput of processing random data (which 
doesn't contain the escape sequence at any point). I've just pushed it 
here if anyone's interested:

https://github.com/bavison/test-unescape

The compile-time define VERSION there takes a few different values:
1: the original C implementation of vc1_unescape_buffer()
2: an early prototype version I wrote that uses unaligned 32-bit loads, 
again in pure C
3: the NEON assembly versions

The sorts of speeds this measures are:
             AArch32    AArch64
version 1   210 MB/s   292 MB/s
version 2   461 MB/s   435 MB/s
version 3  1294 MB/s  1554 MB/s

> 2. The unescaping process for VC1 is basically the same as for H.264 and
> HEVC* and for those we already have better optimized code in
> libavcodec/h2645_parse.c. Can you check the performance of this code
> here against (re)using the code from h2645_parse.c?

I've hacked that around a bit to match the calling conditions of 
vc1_unescape_buffer(), though not adapted it for the slightly different 
rules you noted for VC-1 as opposed to H.264/265. Hopefully it should 
still give some indication of the approximate performance that could be 
expected, but I didn't take time to fully understand everything it was 
doing, so do please say if I've messed something up.

This can be selected by #defining VERSION 4:

             AArch32    AArch64
version 4   737 MB/s  1286 MB/s

This suggests it's much better than the original C, but my NEON versions 
still have the edge, especially on AArch32. The NEON code is very much a 
brute force check, but it's effectively able to do the testing in 
parallel with the memcpy - each byte only gets loaded once.
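
In scalar terms, the per-block idea is roughly the following (illustrative
only; like the assembly, the tests near the end of a block read up to three
bytes into the following block):

    #include <string.h>
    #include <stdint.h>
    #include "libavutil/intreadwrite.h"

    /* Illustrative scalar equivalent of the vector inner loop: check the
     * escape pattern at every byte offset of a block, and only bulk-copy the
     * block if no offset can start an escape sequence. */
    static int copy_block_if_clean(const uint8_t *src, uint8_t *dst, int block)
    {
        int any = 0;
        for (int i = 0; i < block; i++)
            any |= (AV_RL32(src + i) & ~0x03000000u) == 0x00030000u;
        if (any)
            return 0;            /* possible escape: caller takes the slow path */
        memcpy(dst, src, block); /* no escape starts here: plain copy */
        return block;
    }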

Ben
Martin Storsjö March 21, 2022, 8:44 p.m. UTC | #3
On Mon, 21 Mar 2022, Ben Avison wrote:

> On 18/03/2022 19:10, Andreas Rheinhardt wrote:
>> Ben Avison:
>>> +static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t 
>>> *dst)
>>> +{
>>> +    /* Dealing with starting and stopping, and removing escape bytes, are
>>> +     * comparatively less time-sensitive, so are more clearly expressed 
>>> using
>>> +     * a C wrapper around the assembly inner loop. Note that we assume a
>>> +     * little-endian machine that supports unaligned loads. */
>> 
>> You should nevertheless use AV_RL32 for your unaligned LE loads
>
> Thanks - I wasn't aware of that. I'll add it in.
>
>> 1. You should add some benchmarks to the commit message.
>
> Do you mean for each commit, or this one in particular? Are there any 
> particular standard files you'd expect to see benchmarked, or will the ones I 
> used in the cover-letter do?

With checkasm tests available, it'd be nice to have per-function 
benchmarks in each of the patches that adds/tweaks a new function - so 
you can see e.g. that the NEON version of a function is 8x faster 
than the corresponding C function. That usually verifies that this 
particular assembly function is beneficial (there have been cases where 
people have contributed code which turned out to be slower than what the C 
compiler produces).
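
A per-function benchmark could be wired up as a checkasm test along these
lines (a hypothetical sketch assuming the usual checkasm macros - check_func,
call_ref, call_new, bench_new, rnd - and a VC1DSPContext initialised with
ff_vc1dsp_init(); the function name and buffer size here are illustrative):

    #include <string.h>
    #include "checkasm.h"
    #include "libavutil/mem_internal.h"
    #include "libavcodec/vc1dsp.h"

    #define UNESCAPE_BUF_SIZE 1024

    static void check_unescape_buffer(void)
    {
        /* Compare the reference (C) and new (NEON) outputs on random data,
         * then benchmark the new implementation. */
        LOCAL_ALIGNED_16(uint8_t, src,  [UNESCAPE_BUF_SIZE]);
        LOCAL_ALIGNED_16(uint8_t, dst0, [UNESCAPE_BUF_SIZE]);
        LOCAL_ALIGNED_16(uint8_t, dst1, [UNESCAPE_BUF_SIZE]);
        VC1DSPContext h;

        ff_vc1dsp_init(&h);
        declare_func(int, const uint8_t *src, int size, uint8_t *dst);

        if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) {
            int len0, len1;
            for (int i = 0; i < UNESCAPE_BUF_SIZE; i++)
                src[i] = rnd();
            len0 = call_ref(src, UNESCAPE_BUF_SIZE, dst0);
            len1 = call_new(src, UNESCAPE_BUF_SIZE, dst1);
            if (len0 != len1 || memcmp(dst0, dst1, len0))
                fail();
            bench_new(src, UNESCAPE_BUF_SIZE, dst1);
        }
        report("unescape_buffer");
    }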

Then overall, it would probably be nice to have a high-level benchmark in 
e.g. the cover letter, like "speeds up decoding <random clip> from xx fps 
to yy fps on hardware zz".

(I'll make a longer reply to the other mail.)

// Martin

Patch

diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index b672b2aa99..2fc2d5d1d3 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -51,6 +51,64 @@  void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
 
+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
+
+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
+{
+    /* Dealing with starting and stopping, and removing escape bytes, are
+     * comparatively less time-sensitive, so are more clearly expressed using
+     * a C wrapper around the assembly inner loop. Note that we assume a
+     * little-endian machine that supports unaligned loads. */
+    int dsize = 0;
+    while (size >= 4)
+    {
+        int found = 0;
+        while (!found && (((uintptr_t) dst) & 7) && size >= 4)
+        {
+            found = (*(uint32_t *)src &~ 0x03000000) == 0x00030000;
+            if (!found)
+            {
+                *dst++ = *src++;
+                --size;
+                ++dsize;
+            }
+        }
+        if (!found)
+        {
+            int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);
+            dst += skip;
+            src += skip;
+            size -= skip;
+            dsize += skip;
+            while (!found && size >= 4)
+            {
+                found = (*(uint32_t *)src &~ 0x03000000) == 0x00030000;
+                if (!found)
+                {
+                    *dst++ = *src++;
+                    --size;
+                    ++dsize;
+                }
+            }
+        }
+        if (found)
+        {
+            *dst++ = *src++;
+            *dst++ = *src++;
+            ++src;
+            size -= 3;
+            dsize += 2;
+        }
+    }
+    while (size > 0)
+    {
+        *dst++ = *src++;
+        --size;
+        ++dsize;
+    }
+    return dsize;
+}
+
 av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -76,5 +134,7 @@  av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
         dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
+
+        dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
     }
 }
diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S
index c3ca3eae1e..8bdeffab44 100644
--- a/libavcodec/aarch64/vc1dsp_neon.S
+++ b/libavcodec/aarch64/vc1dsp_neon.S
@@ -1374,3 +1374,179 @@  function ff_vc1_h_loop_filter16_neon, export=1
         st2     {v2.b, v3.b}[7], [x6]
 4:      ret
 endfunc
+
+// Copy at most the specified number of bytes from source to destination buffer,
+// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
+// On entry:
+//   x0 -> source buffer
+//   w1 = max number of bytes to copy
+//   x2 -> destination buffer, optimally 8-byte aligned
+// On exit:
+//   w0 = number of bytes not copied
+function ff_vc1_unescape_buffer_helper_neon, export=1
+        // Offset by 80 to screen out cases that are too short for us to handle,
+        // and also make it easy to test for loop termination, or to determine
+        // whether we need an odd number of half-iterations of the loop.
+        subs    w1, w1, #80
+        b.mi    90f
+
+        // Set up useful constants
+        movi    v20.4s, #3, lsl #24
+        movi    v21.4s, #3, lsl #16
+
+        tst     w1, #32
+        b.ne    1f
+
+          ld1     {v0.16b, v1.16b, v2.16b}, [x0], #48
+          ext     v25.16b, v0.16b, v1.16b, #1
+          ext     v26.16b, v0.16b, v1.16b, #2
+          ext     v27.16b, v0.16b, v1.16b, #3
+          ext     v29.16b, v1.16b, v2.16b, #1
+          ext     v30.16b, v1.16b, v2.16b, #2
+          ext     v31.16b, v1.16b, v2.16b, #3
+          bic     v24.16b, v0.16b, v20.16b
+          bic     v25.16b, v25.16b, v20.16b
+          bic     v26.16b, v26.16b, v20.16b
+          bic     v27.16b, v27.16b, v20.16b
+          bic     v28.16b, v1.16b, v20.16b
+          bic     v29.16b, v29.16b, v20.16b
+          bic     v30.16b, v30.16b, v20.16b
+          bic     v31.16b, v31.16b, v20.16b
+          eor     v24.16b, v24.16b, v21.16b
+          eor     v25.16b, v25.16b, v21.16b
+          eor     v26.16b, v26.16b, v21.16b
+          eor     v27.16b, v27.16b, v21.16b
+          eor     v28.16b, v28.16b, v21.16b
+          eor     v29.16b, v29.16b, v21.16b
+          eor     v30.16b, v30.16b, v21.16b
+          eor     v31.16b, v31.16b, v21.16b
+          cmeq    v24.4s, v24.4s, #0
+          cmeq    v25.4s, v25.4s, #0
+          cmeq    v26.4s, v26.4s, #0
+          cmeq    v27.4s, v27.4s, #0
+          add     w1, w1, #32
+          b       3f
+
+1:      ld1     {v3.16b, v4.16b, v5.16b}, [x0], #48
+        ext     v25.16b, v3.16b, v4.16b, #1
+        ext     v26.16b, v3.16b, v4.16b, #2
+        ext     v27.16b, v3.16b, v4.16b, #3
+        ext     v29.16b, v4.16b, v5.16b, #1
+        ext     v30.16b, v4.16b, v5.16b, #2
+        ext     v31.16b, v4.16b, v5.16b, #3
+        bic     v24.16b, v3.16b, v20.16b
+        bic     v25.16b, v25.16b, v20.16b
+        bic     v26.16b, v26.16b, v20.16b
+        bic     v27.16b, v27.16b, v20.16b
+        bic     v28.16b, v4.16b, v20.16b
+        bic     v29.16b, v29.16b, v20.16b
+        bic     v30.16b, v30.16b, v20.16b
+        bic     v31.16b, v31.16b, v20.16b
+        eor     v24.16b, v24.16b, v21.16b
+        eor     v25.16b, v25.16b, v21.16b
+        eor     v26.16b, v26.16b, v21.16b
+        eor     v27.16b, v27.16b, v21.16b
+        eor     v28.16b, v28.16b, v21.16b
+        eor     v29.16b, v29.16b, v21.16b
+        eor     v30.16b, v30.16b, v21.16b
+        eor     v31.16b, v31.16b, v21.16b
+        cmeq    v24.4s, v24.4s, #0
+        cmeq    v25.4s, v25.4s, #0
+        cmeq    v26.4s, v26.4s, #0
+        cmeq    v27.4s, v27.4s, #0
+        // Drop through...
+2:        mov     v0.16b, v5.16b
+          ld1     {v1.16b, v2.16b}, [x0], #32
+        cmeq    v28.4s, v28.4s, #0
+        cmeq    v29.4s, v29.4s, #0
+        cmeq    v30.4s, v30.4s, #0
+        cmeq    v31.4s, v31.4s, #0
+        orr     v24.16b, v24.16b, v25.16b
+        orr     v26.16b, v26.16b, v27.16b
+        orr     v28.16b, v28.16b, v29.16b
+        orr     v30.16b, v30.16b, v31.16b
+          ext     v25.16b, v0.16b, v1.16b, #1
+        orr     v22.16b, v24.16b, v26.16b
+          ext     v26.16b, v0.16b, v1.16b, #2
+          ext     v27.16b, v0.16b, v1.16b, #3
+          ext     v29.16b, v1.16b, v2.16b, #1
+        orr     v23.16b, v28.16b, v30.16b
+          ext     v30.16b, v1.16b, v2.16b, #2
+          ext     v31.16b, v1.16b, v2.16b, #3
+          bic     v24.16b, v0.16b, v20.16b
+          bic     v25.16b, v25.16b, v20.16b
+          bic     v26.16b, v26.16b, v20.16b
+        orr     v22.16b, v22.16b, v23.16b
+          bic     v27.16b, v27.16b, v20.16b
+          bic     v28.16b, v1.16b, v20.16b
+          bic     v29.16b, v29.16b, v20.16b
+          bic     v30.16b, v30.16b, v20.16b
+          bic     v31.16b, v31.16b, v20.16b
+        addv    s22, v22.4s
+          eor     v24.16b, v24.16b, v21.16b
+          eor     v25.16b, v25.16b, v21.16b
+          eor     v26.16b, v26.16b, v21.16b
+          eor     v27.16b, v27.16b, v21.16b
+          eor     v28.16b, v28.16b, v21.16b
+        mov     w3, v22.s[0]
+          eor     v29.16b, v29.16b, v21.16b
+          eor     v30.16b, v30.16b, v21.16b
+          eor     v31.16b, v31.16b, v21.16b
+          cmeq    v24.4s, v24.4s, #0
+          cmeq    v25.4s, v25.4s, #0
+          cmeq    v26.4s, v26.4s, #0
+          cmeq    v27.4s, v27.4s, #0
+        cbnz    w3, 90f
+        st1     {v3.16b, v4.16b}, [x2], #32
+3:          mov     v3.16b, v2.16b
+            ld1     {v4.16b, v5.16b}, [x0], #32
+          cmeq    v28.4s, v28.4s, #0
+          cmeq    v29.4s, v29.4s, #0
+          cmeq    v30.4s, v30.4s, #0
+          cmeq    v31.4s, v31.4s, #0
+          orr     v24.16b, v24.16b, v25.16b
+          orr     v26.16b, v26.16b, v27.16b
+          orr     v28.16b, v28.16b, v29.16b
+          orr     v30.16b, v30.16b, v31.16b
+            ext     v25.16b, v3.16b, v4.16b, #1
+          orr     v22.16b, v24.16b, v26.16b
+            ext     v26.16b, v3.16b, v4.16b, #2
+            ext     v27.16b, v3.16b, v4.16b, #3
+            ext     v29.16b, v4.16b, v5.16b, #1
+          orr     v23.16b, v28.16b, v30.16b
+            ext     v30.16b, v4.16b, v5.16b, #2
+            ext     v31.16b, v4.16b, v5.16b, #3
+            bic     v24.16b, v3.16b, v20.16b
+            bic     v25.16b, v25.16b, v20.16b
+            bic     v26.16b, v26.16b, v20.16b
+          orr     v22.16b, v22.16b, v23.16b
+            bic     v27.16b, v27.16b, v20.16b
+            bic     v28.16b, v4.16b, v20.16b
+            bic     v29.16b, v29.16b, v20.16b
+            bic     v30.16b, v30.16b, v20.16b
+            bic     v31.16b, v31.16b, v20.16b
+          addv    s22, v22.4s
+            eor     v24.16b, v24.16b, v21.16b
+            eor     v25.16b, v25.16b, v21.16b
+            eor     v26.16b, v26.16b, v21.16b
+            eor     v27.16b, v27.16b, v21.16b
+            eor     v28.16b, v28.16b, v21.16b
+          mov     w3, v22.s[0]
+            eor     v29.16b, v29.16b, v21.16b
+            eor     v30.16b, v30.16b, v21.16b
+            eor     v31.16b, v31.16b, v21.16b
+            cmeq    v24.4s, v24.4s, #0
+            cmeq    v25.4s, v25.4s, #0
+            cmeq    v26.4s, v26.4s, #0
+            cmeq    v27.4s, v27.4s, #0
+          cbnz    w3, 91f
+          st1     {v0.16b, v1.16b}, [x2], #32
+        subs    w1, w1, #64
+        b.pl    2b
+
+90:     add     w0, w1, #80
+        ret
+
+91:     sub     w1, w1, #32
+        b       90b
+endfunc
diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
index f5f5c702d7..3aefbcaf6d 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -84,6 +84,64 @@  void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
 
+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
+
+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
+{
+    /* Dealing with starting and stopping, and removing escape bytes, are
+     * comparatively less time-sensitive, so are more clearly expressed using
+     * a C wrapper around the assembly inner loop. Note that we assume a
+     * little-endian machine that supports unaligned loads. */
+    int dsize = 0;
+    while (size >= 4)
+    {
+        int found = 0;
+        while (!found && (((uintptr_t) dst) & 7) && size >= 4)
+        {
+            found = (*(uint32_t *)src &~ 0x03000000) == 0x00030000;
+            if (!found)
+            {
+                *dst++ = *src++;
+                --size;
+                ++dsize;
+            }
+        }
+        if (!found)
+        {
+            int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);
+            dst += skip;
+            src += skip;
+            size -= skip;
+            dsize += skip;
+            while (!found && size >= 4)
+            {
+                found = (*(uint32_t *)src &~ 0x03000000) == 0x00030000;
+                if (!found)
+                {
+                    *dst++ = *src++;
+                    --size;
+                    ++dsize;
+                }
+            }
+        }
+        if (found)
+        {
+            *dst++ = *src++;
+            *dst++ = *src++;
+            ++src;
+            size -= 3;
+            dsize += 2;
+        }
+    }
+    while (size > 0)
+    {
+        *dst++ = *src++;
+        --size;
+        ++dsize;
+    }
+    return dsize;
+}
+
 #define FN_ASSIGN(X, Y) \
     dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
     dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
@@ -130,4 +188,6 @@  av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
     dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
     dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
     dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
+
+    dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
 }
diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
index 4ef083102b..9d7333cf12 100644
--- a/libavcodec/arm/vc1dsp_neon.S
+++ b/libavcodec/arm/vc1dsp_neon.S
@@ -1804,3 +1804,121 @@  function ff_vc1_h_loop_filter16_neon, export=1
 4:      vpop            {d8-d15}
         pop             {r4-r6,pc}
 endfunc
+
+@ Copy at most the specified number of bytes from source to destination buffer,
+@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence
+@ On entry:
+@   r0 -> source buffer
+@   r1 = max number of bytes to copy
+@   r2 -> destination buffer, optimally 8-byte aligned
+@ On exit:
+@   r0 = number of bytes not copied
+function ff_vc1_unescape_buffer_helper_neon, export=1
+        @ Offset by 48 to screen out cases that are too short for us to handle,
+        @ and also make it easy to test for loop termination, or to determine
+        @ whether we need an odd number of half-iterations of the loop.
+        subs    r1, r1, #48
+        bmi     90f
+
+        @ Set up useful constants
+        vmov.i32        q0, #0x3000000
+        vmov.i32        q1, #0x30000
+
+        tst             r1, #16
+        bne             1f
+
+          vld1.8          {q8, q9}, [r0]!
+          vbic            q12, q8, q0
+          vext.8          q13, q8, q9, #1
+          vext.8          q14, q8, q9, #2
+          vext.8          q15, q8, q9, #3
+          veor            q12, q12, q1
+          vbic            q13, q13, q0
+          vbic            q14, q14, q0
+          vbic            q15, q15, q0
+          vceq.i32        q12, q12, #0
+          veor            q13, q13, q1
+          veor            q14, q14, q1
+          veor            q15, q15, q1
+          vceq.i32        q13, q13, #0
+          vceq.i32        q14, q14, #0
+          vceq.i32        q15, q15, #0
+          add             r1, r1, #16
+          b               3f
+
+1:      vld1.8          {q10, q11}, [r0]!
+        vbic            q12, q10, q0
+        vext.8          q13, q10, q11, #1
+        vext.8          q14, q10, q11, #2
+        vext.8          q15, q10, q11, #3
+        veor            q12, q12, q1
+        vbic            q13, q13, q0
+        vbic            q14, q14, q0
+        vbic            q15, q15, q0
+        vceq.i32        q12, q12, #0
+        veor            q13, q13, q1
+        veor            q14, q14, q1
+        veor            q15, q15, q1
+        vceq.i32        q13, q13, #0
+        vceq.i32        q14, q14, #0
+        vceq.i32        q15, q15, #0
+        @ Drop through...
+2:        vmov            q8, q11
+          vld1.8          {q9}, [r0]!
+        vorr            q13, q12, q13
+        vorr            q15, q14, q15
+          vbic            q12, q8, q0
+        vorr            q3, q13, q15
+          vext.8          q13, q8, q9, #1
+          vext.8          q14, q8, q9, #2
+          vext.8          q15, q8, q9, #3
+          veor            q12, q12, q1
+        vorr            d6, d6, d7
+          vbic            q13, q13, q0
+          vbic            q14, q14, q0
+          vbic            q15, q15, q0
+          vceq.i32        q12, q12, #0
+        vmov            r3, r12, d6
+          veor            q13, q13, q1
+          veor            q14, q14, q1
+          veor            q15, q15, q1
+          vceq.i32        q13, q13, #0
+          vceq.i32        q14, q14, #0
+          vceq.i32        q15, q15, #0
+        orrs            r3, r3, r12
+        bne             90f
+        vst1.64         {q10}, [r2]!
+3:          vmov            q10, q9
+            vld1.8          {q11}, [r0]!
+          vorr            q13, q12, q13
+          vorr            q15, q14, q15
+            vbic            q12, q10, q0
+          vorr            q3, q13, q15
+            vext.8          q13, q10, q11, #1
+            vext.8          q14, q10, q11, #2
+            vext.8          q15, q10, q11, #3
+            veor            q12, q12, q1
+          vorr            d6, d6, d7
+            vbic            q13, q13, q0
+            vbic            q14, q14, q0
+            vbic            q15, q15, q0
+            vceq.i32        q12, q12, #0
+          vmov            r3, r12, d6
+            veor            q13, q13, q1
+            veor            q14, q14, q1
+            veor            q15, q15, q1
+            vceq.i32        q13, q13, #0
+            vceq.i32        q14, q14, #0
+            vceq.i32        q15, q15, #0
+          orrs            r3, r3, r12
+          bne             91f
+          vst1.64         {q8}, [r2]!
+        subs            r1, r1, #32
+        bpl             2b
+
+90:     add             r0, r1, #48
+        bx              lr
+
+91:     sub             r1, r1, #16
+        b               90b
+endfunc
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index 1c92b9d401..6a30b5b664 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -490,7 +490,7 @@  static av_cold int vc1_decode_init(AVCodecContext *avctx)
             size = next - start - 4;
             if (size <= 0)
                 continue;
-            buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
+            buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
             init_get_bits(&gb, buf2, buf2_size * 8);
             switch (AV_RB32(start)) {
             case VC1_CODE_SEQHDR:
@@ -680,7 +680,7 @@  static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                 case VC1_CODE_FRAME:
                     if (avctx->hwaccel)
                         buf_start = start;
-                    buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
+                    buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
                     break;
                 case VC1_CODE_FIELD: {
                     int buf_size3;
@@ -697,8 +697,8 @@  static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                         ret = AVERROR(ENOMEM);
                         goto err;
                     }
-                    buf_size3 = vc1_unescape_buffer(start + 4, size,
-                                                    slices[n_slices].buf);
+                    buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
+                                                              slices[n_slices].buf);
                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                                   buf_size3 << 3);
                     slices[n_slices].mby_start = avctx->coded_height + 31 >> 5;
@@ -709,7 +709,7 @@  static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     break;
                 }
                 case VC1_CODE_ENTRYPOINT: /* it should be before frame data */
-                    buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
+                    buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
                     init_get_bits(&s->gb, buf2, buf_size2 * 8);
                     ff_vc1_decode_entry_point(avctx, v, &s->gb);
                     break;
@@ -726,8 +726,8 @@  static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                         ret = AVERROR(ENOMEM);
                         goto err;
                     }
-                    buf_size3 = vc1_unescape_buffer(start + 4, size,
-                                                    slices[n_slices].buf);
+                    buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
+                                                              slices[n_slices].buf);
                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                                   buf_size3 << 3);
                     slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9);
@@ -761,7 +761,7 @@  static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     ret = AVERROR(ENOMEM);
                     goto err;
                 }
-                buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
+                buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
                 init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                               buf_size3 << 3);
                 slices[n_slices].mby_start = s->mb_height + 1 >> 1;
@@ -770,9 +770,9 @@  static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                 n_slices1 = n_slices - 1;
                 n_slices++;
             }
-            buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2);
+            buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2);
         } else {
-            buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2);
+            buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2);
         }
         init_get_bits(&s->gb, buf2, buf_size2*8);
     } else{
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index a29b91bf3d..11d493f002 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -34,6 +34,7 @@ 
 #include "rnd_avg.h"
 #include "vc1dsp.h"
 #include "startcode.h"
+#include "vc1_common.h"
 
 /* Apply overlap transform to horizontal edge */
 static void vc1_v_overlap_c(uint8_t *src, int stride)
@@ -1030,6 +1031,7 @@  av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
 #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
 
     dsp->startcode_find_candidate = ff_startcode_find_candidate_c;
+    dsp->vc1_unescape_buffer      = vc1_unescape_buffer;
 
     if (ARCH_AARCH64)
         ff_vc1dsp_init_aarch64(dsp);
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
index c6443acb20..8be1198071 100644
--- a/libavcodec/vc1dsp.h
+++ b/libavcodec/vc1dsp.h
@@ -80,6 +80,9 @@  typedef struct VC1DSPContext {
      * one or more further zero bytes and a one byte.
      */
     int (*startcode_find_candidate)(const uint8_t *buf, int size);
+
+    /* Copy a buffer, removing startcode emulation escape bytes as we go */
+    int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst);
 } VC1DSPContext;
 
 void ff_vc1dsp_init(VC1DSPContext* c);