Message ID | 20220317185819.466470-7-bavison@riscosopen.org |
---|---|
State | New |
Series | avcodec/vc1: Arm optimisations |
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_aarch64_jetson | success | Make finished |
andriy/make_fate_aarch64_jetson | success | Make fate finished |
andriy/make_armv7_RPi4 | success | Make finished |
andriy/make_fate_armv7_RPi4 | success | Make fate finished |
Ben Avison:
> Populate with implementations suitable for 32-bit and 64-bit Arm.
>
> Signed-off-by: Ben Avison <bavison@riscosopen.org>
> ---
>  libavcodec/aarch64/vc1dsp_init_aarch64.c |  60 ++++++++
>  libavcodec/aarch64/vc1dsp_neon.S         | 176 +++++++++++++++++++++++
>  libavcodec/arm/vc1dsp_init_neon.c        |  60 ++++++++
>  libavcodec/arm/vc1dsp_neon.S             | 118 +++++++++++++++
>  libavcodec/vc1dec.c                      |  20 +--
>  libavcodec/vc1dsp.c                      |   2 +
>  libavcodec/vc1dsp.h                      |   3 +
>  7 files changed, 429 insertions(+), 10 deletions(-)
>
[...]
> +static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
> +{
> +    /* Dealing with starting and stopping, and removing escape bytes, are
> +     * comparatively less time-sensitive, so are more clearly expressed using
> +     * a C wrapper around the assembly inner loop. Note that we assume a
> +     * little-endian machine that supports unaligned loads. */

You should nevertheless use AV_RL32 for your unaligned LE loads.

[...]

1. You should add some benchmarks to the commit message.

2. The unescaping process for VC-1 is basically the same as for H.264 and
HEVC*, and for those we already have better-optimized code in
libavcodec/h2645_parse.c. Can you check the performance of this code here
against (re)using the code from h2645_parse.c?

(3. Btw: the code in h2645_parse.c could even be optimized further along the
lines of https://ffmpeg.org/pipermail/ffmpeg-devel/2019-June/245203.html.
The H.264 and VC-1 parsers use a quite suboptimal startcode search; this
patch is part of a patchset I submitted ages ago to improve it.)

- Andreas

*: Except for the fact that VC-1 seems to allow 0x00 0x00 0x03 0xXY with
0xXY > 3 (where the 0x03 is not escaped) to occur inside an EBDU; it also
allows 0x00 0x00 0x02. (While the informative process for encoders is the
same as for H.2645, it does not produce the byte sequences disallowed by
H.264.)
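As an aside, the escape test in the wrapper quoted above is easy to misread,
and it relies on exactly the unaligned little-endian load Andreas is talking
about: the four bytes 00 00 03 0X (X <= 3) load little-endian as 0x0X030000,
and clearing the low two bits of the top byte leaves 0x00030000. A minimal
sketch of the predicate rewritten with AV_RL32 as suggested (the helper name
is made up for illustration):

```c
#include <stdint.h>
#include "libavutil/intreadwrite.h"

/* Hypothetical helper: returns non-zero when src points at a VC-1 escape
 * sequence 00 00 03 0X with X <= 3. AV_RL32 is FFmpeg's portable unaligned
 * little-endian 32-bit load, so the bytes 00 00 03 0X become 0x0X030000;
 * masking off bits 24-25 (the low two bits of the top byte) with
 * ~0x03000000 leaves exactly 0x00030000 when X <= 3. */
static int vc1_is_escape(const uint8_t *src)
{
    return (AV_RL32(src) & ~0x03000000u) == 0x00030000u;
}
```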
On 18/03/2022 19:10, Andreas Rheinhardt wrote:
> Ben Avison:
>> +static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
>> +{
>> +    /* Dealing with starting and stopping, and removing escape bytes, are
>> +     * comparatively less time-sensitive, so are more clearly expressed using
>> +     * a C wrapper around the assembly inner loop. Note that we assume a
>> +     * little-endian machine that supports unaligned loads. */
>
> You should nevertheless use AV_RL32 for your unaligned LE loads

Thanks - I wasn't aware of that. I'll add it in.

> 1. You should add some benchmarks to the commit message.

Do you mean for each commit, or this one in particular? Are there any
particular standard files you'd expect to see benchmarked, or will the ones
I used in the cover letter do? (Those were just snippets from problematic
Blu-ray rips, but that does mean I don't have the rights to redistribute
them.) I believe there should be conformance bitstreams for VC-1 somewhere,
but I wasn't able to locate them.

During development, I wrote a simple benchmarker for this particular patch,
which measures the throughput of processing random data (which doesn't
contain the escape sequence at any point). I've just pushed it here if
anyone's interested:

https://github.com/bavison/test-unescape

The compile-time define VERSION there takes a few different values:

1: the original C implementation of vc1_unescape_buffer()
2: an early prototype version I wrote that uses unaligned 32-bit loads,
   again in pure C
3: the NEON assembly versions

The sort of speeds this measures are:

             AArch32      AArch64
version 1    210 MB/s     292 MB/s
version 2    461 MB/s     435 MB/s
version 3   1294 MB/s    1554 MB/s

> 2. The unescaping process for VC1 is basically the same as for H.264 and
> HEVC* and for those we already have better optimized code in
> libavcodec/h2645_parse.c. Can you check the performance of this code
> here against (re)using the code from h2645_parse.c?

I've hacked that around a bit to match the calling conditions of
vc1_unescape_buffer(), though not adapted it for the slightly different
rules you noted for VC-1 as opposed to H.264/265. Hopefully it should still
give some indication of the approximate performance that could be expected,
but I didn't take time to fully understand everything it was doing, so do
please say if I've messed something up. This can be selected by #defining
VERSION 4:

             AArch32      AArch64
version 4    737 MB/s    1286 MB/s

This suggests it's much better than the original C, but my NEON versions
still have the edge, especially on AArch32. The NEON code is very much a
brute-force check, but it's effectively able to do the testing in parallel
with the memcpy - each byte only gets loaded once.

Ben
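For readers who want to reproduce numbers of this kind without the linked
repository, a minimal throughput harness in the spirit Ben describes might
look like the following sketch. The function under test and the buffer sizes
are assumptions for illustration, not the actual test-unescape code:

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int vc1_unescape_buffer(const uint8_t *src, int size, uint8_t *dst); /* version under test */

int main(void)
{
    enum { SIZE = 1 << 20, REPS = 1000 };
    uint8_t *src = malloc(SIZE), *dst = malloc(SIZE);
    struct timespec t0, t1;
    double secs;

    if (!src || !dst)
        return 1;
    /* Force the top bits of every byte on, so the pattern 00 00 03 can
     * never occur: this measures the pure copy path, as in Ben's test. */
    for (int i = 0; i < SIZE; i++)
        src[i] = 0xC0 | (rand() & 0x3F);

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int r = 0; r < REPS; r++)
        vc1_unescape_buffer(src, SIZE, dst);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
    printf("%.0f MB/s\n", (double)SIZE * REPS / secs / 1e6);
    free(src);
    free(dst);
    return 0;
}
```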
On Mon, 21 Mar 2022, Ben Avison wrote:
> On 18/03/2022 19:10, Andreas Rheinhardt wrote:
>> You should nevertheless use AV_RL32 for your unaligned LE loads
>
> Thanks - I wasn't aware of that. I'll add it in.
>
>> 1. You should add some benchmarks to the commit message.
>
> Do you mean for each commit, or this one in particular? Are there any
> particular standard files you'd expect to see benchmarked, or will the
> ones I used in the cover letter do?

With checkasm tests available, it'd be nice to have per-function benchmarks
in each of the patches that adds/tweaks a function - so you can see e.g.
that the NEON version of a function is e.g. 8x faster than the corresponding
C function. That usually verifies that this particular assembly function is
beneficial (there have been cases where people have contributed code which
turned out to be slower than what the C compiler produces).

Then overall, it can probably be nice to have a high-level benchmark in e.g.
the cover letter, like "speeds up decoding <random clip> from xx fps to yy
fps on hardware zz".

(I'll make a longer reply to the other mail.)

// Martin
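A per-function benchmark of the kind Martin asks for would live in FFmpeg's
checkasm framework. A sketch along the usual tests/checkasm conventions
might look like this - illustrative only, since a real test would also need
inputs containing escape sequences, plus varied sizes and alignments:

```c
#include <string.h>
#include "checkasm.h"
#include "libavcodec/vc1dsp.h"

void checkasm_check_vc1dsp(void)
{
    VC1DSPContext h;
    uint8_t src[1024], dst0[1024], dst1[1024];

    ff_vc1dsp_init(&h);

    if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) {
        declare_func(int, const uint8_t *, int, uint8_t *);
        int len0, len1;

        for (int i = 0; i < 1024; i++)
            src[i] = rnd();               /* random data; escapes are rare */
        len0 = call_ref(src, 1024, dst0); /* C reference */
        len1 = call_new(src, 1024, dst1); /* NEON version under test */
        if (len0 != len1 || memcmp(dst0, dst1, len0))
            fail();
        bench_new(src, 1024, dst1);       /* per-call cycle counts */
    }
    report("unescape_buffer");
}
```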
diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index b672b2aa99..2fc2d5d1d3 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -51,6 +51,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
 
+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
+
+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
+{
+    /* Dealing with starting and stopping, and removing escape bytes, are
+     * comparatively less time-sensitive, so are more clearly expressed using
+     * a C wrapper around the assembly inner loop. Note that we assume a
+     * little-endian machine that supports unaligned loads. */
+    int dsize = 0;
+    while (size >= 4)
+    {
+        int found = 0;
+        while (!found && (((uintptr_t) dst) & 7) && size >= 4)
+        {
+            found = (*(uint32_t *)src &~ 0x03000000) == 0x00030000;
+            if (!found)
+            {
+                *dst++ = *src++;
+                --size;
+                ++dsize;
+            }
+        }
+        if (!found)
+        {
+            int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);
+            dst += skip;
+            src += skip;
+            size -= skip;
+            dsize += skip;
+            while (!found && size >= 4)
+            {
+                found = (*(uint32_t *)src &~ 0x03000000) == 0x00030000;
+                if (!found)
+                {
+                    *dst++ = *src++;
+                    --size;
+                    ++dsize;
+                }
+            }
+        }
+        if (found)
+        {
+            *dst++ = *src++;
+            *dst++ = *src++;
+            ++src;
+            size -= 3;
+            dsize += 2;
+        }
+    }
+    while (size > 0)
+    {
+        *dst++ = *src++;
+        --size;
+        ++dsize;
+    }
+    return dsize;
+}
+
 av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -76,5 +134,7 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
         dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
+
+        dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
     }
 }
diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S
index c3ca3eae1e..8bdeffab44 100644
--- a/libavcodec/aarch64/vc1dsp_neon.S
+++ b/libavcodec/aarch64/vc1dsp_neon.S
@@ -1374,3 +1374,179 @@ function ff_vc1_h_loop_filter16_neon, export=1
         st2             {v2.b, v3.b}[7], [x6]
 4:      ret
 endfunc
+
+// Copy at most the specified number of bytes from source to destination buffer,
+// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
+// On entry:
+//   x0 -> source buffer
+//   w1 = max number of bytes to copy
+//   x2 -> destination buffer, optimally 8-byte aligned
+// On exit:
+//   w0 = number of bytes not copied
+function ff_vc1_unescape_buffer_helper_neon, export=1
+        // Offset by 80 to screen out cases that are too short for us to handle,
+        // and also make it easy to test for loop termination, or to determine
+        // whether we need an odd number of half-iterations of the loop.
+        subs            w1, w1, #80
+        b.mi            90f
+
+        // Set up useful constants
+        movi            v20.4s, #3, lsl #24
+        movi            v21.4s, #3, lsl #16
+
+        tst             w1, #32
+        b.ne            1f
+
+        ld1             {v0.16b, v1.16b, v2.16b}, [x0], #48
+        ext             v25.16b, v0.16b, v1.16b, #1
+        ext             v26.16b, v0.16b, v1.16b, #2
+        ext             v27.16b, v0.16b, v1.16b, #3
+        ext             v29.16b, v1.16b, v2.16b, #1
+        ext             v30.16b, v1.16b, v2.16b, #2
+        ext             v31.16b, v1.16b, v2.16b, #3
+        bic             v24.16b, v0.16b, v20.16b
+        bic             v25.16b, v25.16b, v20.16b
+        bic             v26.16b, v26.16b, v20.16b
+        bic             v27.16b, v27.16b, v20.16b
+        bic             v28.16b, v1.16b, v20.16b
+        bic             v29.16b, v29.16b, v20.16b
+        bic             v30.16b, v30.16b, v20.16b
+        bic             v31.16b, v31.16b, v20.16b
+        eor             v24.16b, v24.16b, v21.16b
+        eor             v25.16b, v25.16b, v21.16b
+        eor             v26.16b, v26.16b, v21.16b
+        eor             v27.16b, v27.16b, v21.16b
+        eor             v28.16b, v28.16b, v21.16b
+        eor             v29.16b, v29.16b, v21.16b
+        eor             v30.16b, v30.16b, v21.16b
+        eor             v31.16b, v31.16b, v21.16b
+        cmeq            v24.4s, v24.4s, #0
+        cmeq            v25.4s, v25.4s, #0
+        cmeq            v26.4s, v26.4s, #0
+        cmeq            v27.4s, v27.4s, #0
+        add             w1, w1, #32
+        b               3f
+
+1:      ld1             {v3.16b, v4.16b, v5.16b}, [x0], #48
+        ext             v25.16b, v3.16b, v4.16b, #1
+        ext             v26.16b, v3.16b, v4.16b, #2
+        ext             v27.16b, v3.16b, v4.16b, #3
+        ext             v29.16b, v4.16b, v5.16b, #1
+        ext             v30.16b, v4.16b, v5.16b, #2
+        ext             v31.16b, v4.16b, v5.16b, #3
+        bic             v24.16b, v3.16b, v20.16b
+        bic             v25.16b, v25.16b, v20.16b
+        bic             v26.16b, v26.16b, v20.16b
+        bic             v27.16b, v27.16b, v20.16b
+        bic             v28.16b, v4.16b, v20.16b
+        bic             v29.16b, v29.16b, v20.16b
+        bic             v30.16b, v30.16b, v20.16b
+        bic             v31.16b, v31.16b, v20.16b
+        eor             v24.16b, v24.16b, v21.16b
+        eor             v25.16b, v25.16b, v21.16b
+        eor             v26.16b, v26.16b, v21.16b
+        eor             v27.16b, v27.16b, v21.16b
+        eor             v28.16b, v28.16b, v21.16b
+        eor             v29.16b, v29.16b, v21.16b
+        eor             v30.16b, v30.16b, v21.16b
+        eor             v31.16b, v31.16b, v21.16b
+        cmeq            v24.4s, v24.4s, #0
+        cmeq            v25.4s, v25.4s, #0
+        cmeq            v26.4s, v26.4s, #0
+        cmeq            v27.4s, v27.4s, #0
+        // Drop through...
+2:      mov             v0.16b, v5.16b
+        ld1             {v1.16b, v2.16b}, [x0], #32
+        cmeq            v28.4s, v28.4s, #0
+        cmeq            v29.4s, v29.4s, #0
+        cmeq            v30.4s, v30.4s, #0
+        cmeq            v31.4s, v31.4s, #0
+        orr             v24.16b, v24.16b, v25.16b
+        orr             v26.16b, v26.16b, v27.16b
+        orr             v28.16b, v28.16b, v29.16b
+        orr             v30.16b, v30.16b, v31.16b
+        ext             v25.16b, v0.16b, v1.16b, #1
+        orr             v22.16b, v24.16b, v26.16b
+        ext             v26.16b, v0.16b, v1.16b, #2
+        ext             v27.16b, v0.16b, v1.16b, #3
+        ext             v29.16b, v1.16b, v2.16b, #1
+        orr             v23.16b, v28.16b, v30.16b
+        ext             v30.16b, v1.16b, v2.16b, #2
+        ext             v31.16b, v1.16b, v2.16b, #3
+        bic             v24.16b, v0.16b, v20.16b
+        bic             v25.16b, v25.16b, v20.16b
+        bic             v26.16b, v26.16b, v20.16b
+        orr             v22.16b, v22.16b, v23.16b
+        bic             v27.16b, v27.16b, v20.16b
+        bic             v28.16b, v1.16b, v20.16b
+        bic             v29.16b, v29.16b, v20.16b
+        bic             v30.16b, v30.16b, v20.16b
+        bic             v31.16b, v31.16b, v20.16b
+        addv            s22, v22.4s
+        eor             v24.16b, v24.16b, v21.16b
+        eor             v25.16b, v25.16b, v21.16b
+        eor             v26.16b, v26.16b, v21.16b
+        eor             v27.16b, v27.16b, v21.16b
+        eor             v28.16b, v28.16b, v21.16b
+        mov             w3, v22.s[0]
+        eor             v29.16b, v29.16b, v21.16b
+        eor             v30.16b, v30.16b, v21.16b
+        eor             v31.16b, v31.16b, v21.16b
+        cmeq            v24.4s, v24.4s, #0
+        cmeq            v25.4s, v25.4s, #0
+        cmeq            v26.4s, v26.4s, #0
+        cmeq            v27.4s, v27.4s, #0
+        cbnz            w3, 90f
+        st1             {v3.16b, v4.16b}, [x2], #32
+3:      mov             v3.16b, v2.16b
+        ld1             {v4.16b, v5.16b}, [x0], #32
+        cmeq            v28.4s, v28.4s, #0
+        cmeq            v29.4s, v29.4s, #0
+        cmeq            v30.4s, v30.4s, #0
+        cmeq            v31.4s, v31.4s, #0
+        orr             v24.16b, v24.16b, v25.16b
+        orr             v26.16b, v26.16b, v27.16b
+        orr             v28.16b, v28.16b, v29.16b
+        orr             v30.16b, v30.16b, v31.16b
+        ext             v25.16b, v3.16b, v4.16b, #1
+        orr             v22.16b, v24.16b, v26.16b
+        ext             v26.16b, v3.16b, v4.16b, #2
+        ext             v27.16b, v3.16b, v4.16b, #3
+        ext             v29.16b, v4.16b, v5.16b, #1
+        orr             v23.16b, v28.16b, v30.16b
+        ext             v30.16b, v4.16b, v5.16b, #2
+        ext             v31.16b, v4.16b, v5.16b, #3
+        bic             v24.16b, v3.16b, v20.16b
+        bic             v25.16b, v25.16b, v20.16b
+        bic             v26.16b, v26.16b, v20.16b
+        orr             v22.16b, v22.16b, v23.16b
+        bic             v27.16b, v27.16b, v20.16b
+        bic             v28.16b, v4.16b, v20.16b
+        bic             v29.16b, v29.16b, v20.16b
+        bic             v30.16b, v30.16b, v20.16b
+        bic             v31.16b, v31.16b, v20.16b
+        addv            s22, v22.4s
+        eor             v24.16b, v24.16b, v21.16b
+        eor             v25.16b, v25.16b, v21.16b
+        eor             v26.16b, v26.16b, v21.16b
+        eor             v27.16b, v27.16b, v21.16b
+        eor             v28.16b, v28.16b, v21.16b
+        mov             w3, v22.s[0]
+        eor             v29.16b, v29.16b, v21.16b
+        eor             v30.16b, v30.16b, v21.16b
+        eor             v31.16b, v31.16b, v21.16b
+        cmeq            v24.4s, v24.4s, #0
+        cmeq            v25.4s, v25.4s, #0
+        cmeq            v26.4s, v26.4s, #0
+        cmeq            v27.4s, v27.4s, #0
+        cbnz            w3, 91f
+        st1             {v0.16b, v1.16b}, [x2], #32
+        subs            w1, w1, #64
+        b.pl            2b
+
+90:     add             w0, w1, #80
+        ret
+
+91:     sub             w1, w1, #32
+        b               90b
+endfunc
diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
index f5f5c702d7..3aefbcaf6d 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -84,6 +84,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
 
+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
+
+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
+{
+    /* Dealing with starting and stopping, and removing escape bytes, are
+     * comparatively less time-sensitive, so are more clearly expressed using
+     * a C wrapper around the assembly inner loop. Note that we assume a
+     * little-endian machine that supports unaligned loads. */
+    int dsize = 0;
+    while (size >= 4)
+    {
+        int found = 0;
+        while (!found && (((uintptr_t) dst) & 7) && size >= 4)
+        {
+            found = (*(uint32_t *)src &~ 0x03000000) == 0x00030000;
+            if (!found)
+            {
+                *dst++ = *src++;
+                --size;
+                ++dsize;
+            }
+        }
+        if (!found)
+        {
+            int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);
+            dst += skip;
+            src += skip;
+            size -= skip;
+            dsize += skip;
+            while (!found && size >= 4)
+            {
+                found = (*(uint32_t *)src &~ 0x03000000) == 0x00030000;
+                if (!found)
+                {
+                    *dst++ = *src++;
+                    --size;
+                    ++dsize;
+                }
+            }
+        }
+        if (found)
+        {
+            *dst++ = *src++;
+            *dst++ = *src++;
+            ++src;
+            size -= 3;
+            dsize += 2;
+        }
+    }
+    while (size > 0)
+    {
+        *dst++ = *src++;
+        --size;
+        ++dsize;
+    }
+    return dsize;
+}
+
 #define FN_ASSIGN(X, Y) \
     dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
     dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
@@ -130,4 +188,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
     dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
     dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
     dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
+
+    dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
 }
diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
index 4ef083102b..9d7333cf12 100644
--- a/libavcodec/arm/vc1dsp_neon.S
+++ b/libavcodec/arm/vc1dsp_neon.S
@@ -1804,3 +1804,121 @@ function ff_vc1_h_loop_filter16_neon, export=1
 4:      vpop            {d8-d15}
         pop             {r4-r6,pc}
 endfunc
+
+@ Copy at most the specified number of bytes from source to destination buffer,
+@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence
+@ On entry:
+@   r0 -> source buffer
+@   r1 = max number of bytes to copy
+@   r2 -> destination buffer, optimally 8-byte aligned
+@ On exit:
+@   r0 = number of bytes not copied
+function ff_vc1_unescape_buffer_helper_neon, export=1
+        @ Offset by 48 to screen out cases that are too short for us to handle,
+        @ and also make it easy to test for loop termination, or to determine
+        @ whether we need an odd number of half-iterations of the loop.
+        subs            r1, r1, #48
+        bmi             90f
+
+        @ Set up useful constants
+        vmov.i32        q0, #0x3000000
+        vmov.i32        q1, #0x30000
+
+        tst             r1, #16
+        bne             1f
+
+        vld1.8          {q8, q9}, [r0]!
+        vbic            q12, q8, q0
+        vext.8          q13, q8, q9, #1
+        vext.8          q14, q8, q9, #2
+        vext.8          q15, q8, q9, #3
+        veor            q12, q12, q1
+        vbic            q13, q13, q0
+        vbic            q14, q14, q0
+        vbic            q15, q15, q0
+        vceq.i32        q12, q12, #0
+        veor            q13, q13, q1
+        veor            q14, q14, q1
+        veor            q15, q15, q1
+        vceq.i32        q13, q13, #0
+        vceq.i32        q14, q14, #0
+        vceq.i32        q15, q15, #0
+        add             r1, r1, #16
+        b               3f
+
+1:      vld1.8          {q10, q11}, [r0]!
+        vbic            q12, q10, q0
+        vext.8          q13, q10, q11, #1
+        vext.8          q14, q10, q11, #2
+        vext.8          q15, q10, q11, #3
+        veor            q12, q12, q1
+        vbic            q13, q13, q0
+        vbic            q14, q14, q0
+        vbic            q15, q15, q0
+        vceq.i32        q12, q12, #0
+        veor            q13, q13, q1
+        veor            q14, q14, q1
+        veor            q15, q15, q1
+        vceq.i32        q13, q13, #0
+        vceq.i32        q14, q14, #0
+        vceq.i32        q15, q15, #0
+        @ Drop through...
+2:      vmov            q8, q11
+        vld1.8          {q9}, [r0]!
+        vorr            q13, q12, q13
+        vorr            q15, q14, q15
+        vbic            q12, q8, q0
+        vorr            q3, q13, q15
+        vext.8          q13, q8, q9, #1
+        vext.8          q14, q8, q9, #2
+        vext.8          q15, q8, q9, #3
+        veor            q12, q12, q1
+        vorr            d6, d6, d7
+        vbic            q13, q13, q0
+        vbic            q14, q14, q0
+        vbic            q15, q15, q0
+        vceq.i32        q12, q12, #0
+        vmov            r3, r12, d6
+        veor            q13, q13, q1
+        veor            q14, q14, q1
+        veor            q15, q15, q1
+        vceq.i32        q13, q13, #0
+        vceq.i32        q14, q14, #0
+        vceq.i32        q15, q15, #0
+        orrs            r3, r3, r12
+        bne             90f
+        vst1.64         {q10}, [r2]!
+3:      vmov            q10, q9
+        vld1.8          {q11}, [r0]!
+        vorr            q13, q12, q13
+        vorr            q15, q14, q15
+        vbic            q12, q10, q0
+        vorr            q3, q13, q15
+        vext.8          q13, q10, q11, #1
+        vext.8          q14, q10, q11, #2
+        vext.8          q15, q10, q11, #3
+        veor            q12, q12, q1
+        vorr            d6, d6, d7
+        vbic            q13, q13, q0
+        vbic            q14, q14, q0
+        vbic            q15, q15, q0
+        vceq.i32        q12, q12, #0
+        vmov            r3, r12, d6
+        veor            q13, q13, q1
+        veor            q14, q14, q1
+        veor            q15, q15, q1
+        vceq.i32        q13, q13, #0
+        vceq.i32        q14, q14, #0
+        vceq.i32        q15, q15, #0
+        orrs            r3, r3, r12
+        bne             91f
+        vst1.64         {q8}, [r2]!
+        subs            r1, r1, #32
+        bpl             2b
+
+90:     add             r0, r1, #48
+        bx              lr
+
+91:     sub             r1, r1, #16
+        b               90b
+endfunc
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index 1c92b9d401..6a30b5b664 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -490,7 +490,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
                 size = next - start - 4;
                 if (size <= 0)
                     continue;
-                buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
+                buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
                 init_get_bits(&gb, buf2, buf2_size * 8);
                 switch (AV_RB32(start)) {
                 case VC1_CODE_SEQHDR:
@@ -680,7 +680,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                 case VC1_CODE_FRAME:
                     if (avctx->hwaccel)
                         buf_start = start;
-                    buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
+                    buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
                     break;
                 case VC1_CODE_FIELD: {
                     int buf_size3;
@@ -697,8 +697,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                         ret = AVERROR(ENOMEM);
                         goto err;
                     }
-                    buf_size3 = vc1_unescape_buffer(start + 4, size,
-                                                    slices[n_slices].buf);
+                    buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
+                                                              slices[n_slices].buf);
                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                                   buf_size3 << 3);
                     slices[n_slices].mby_start = avctx->coded_height + 31 >> 5;
@@ -709,7 +709,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     break;
                 }
                 case VC1_CODE_ENTRYPOINT: /* it should be before frame data */
-                    buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
+                    buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
                     init_get_bits(&s->gb, buf2, buf_size2 * 8);
                     ff_vc1_decode_entry_point(avctx, v, &s->gb);
                     break;
@@ -726,8 +726,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                         ret = AVERROR(ENOMEM);
                         goto err;
                     }
-                    buf_size3 = vc1_unescape_buffer(start + 4, size,
-                                                    slices[n_slices].buf);
+                    buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
+                                                              slices[n_slices].buf);
                    init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                                   buf_size3 << 3);
                     slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9);
@@ -761,7 +761,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     ret = AVERROR(ENOMEM);
                     goto err;
                 }
-                buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
+                buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
                 init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                               buf_size3 << 3);
                 slices[n_slices].mby_start = s->mb_height + 1 >> 1;
@@ -770,9 +770,9 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                 n_slices1 = n_slices - 1;
                 n_slices++;
             }
-            buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2);
+            buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2);
         } else {
-            buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2);
+            buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2);
         }
         init_get_bits(&s->gb, buf2, buf_size2*8);
     } else{
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index a29b91bf3d..11d493f002 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -34,6 +34,7 @@
 #include "rnd_avg.h"
 #include "vc1dsp.h"
 #include "startcode.h"
+#include "vc1_common.h"
 
 /* Apply overlap transform to horizontal edge */
 static void vc1_v_overlap_c(uint8_t *src, int stride)
@@ -1030,6 +1031,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
 #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
 
     dsp->startcode_find_candidate = ff_startcode_find_candidate_c;
+    dsp->vc1_unescape_buffer = vc1_unescape_buffer;
 
     if (ARCH_AARCH64)
         ff_vc1dsp_init_aarch64(dsp);
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
index c6443acb20..8be1198071 100644
--- a/libavcodec/vc1dsp.h
+++ b/libavcodec/vc1dsp.h
@@ -80,6 +80,9 @@ typedef struct VC1DSPContext {
      * one or more further zero bytes and a one byte.
      */
     int (*startcode_find_candidate)(const uint8_t *buf, int size);
+
+    /* Copy a buffer, removing startcode emulation escape bytes as we go */
+    int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst);
 } VC1DSPContext;
 
 void ff_vc1dsp_init(VC1DSPContext* c);
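To make the contract of the vc1_unescape_buffer hook concrete: every
implementation must drop the 0x03 from any 00 00 03 0X sequence with X <= 3
and copy everything else through unchanged, returning the number of bytes
written. A scalar restatement of that rule, for readers following the NEON
code (illustrative only; not a copy of the existing vc1_unescape_buffer()
from vc1_common.h):

```c
#include <stdint.h>

/* Sketch of the VC-1 unescape rule. Returns the number of bytes written
 * to dst; dst must be at least size bytes. */
static int unescape_scalar(const uint8_t *src, int size, uint8_t *dst)
{
    int dsize = 0;
    for (int i = 0; i < size; i++) {
        if (i + 3 < size &&
            src[i] == 0 && src[i + 1] == 0 && src[i + 2] == 3 && src[i + 3] <= 3) {
            dst[dsize++] = 0;   /* keep the two zero bytes... */
            dst[dsize++] = 0;
            i += 2;             /* ...drop the 0x03; the 0X byte is copied next */
        } else {
            dst[dsize++] = src[i];
        }
    }
    return dsize;
}
```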
Populate with implementations suitable for 32-bit and 64-bit Arm.

Signed-off-by: Ben Avison <bavison@riscosopen.org>
---
 libavcodec/aarch64/vc1dsp_init_aarch64.c |  60 ++++++++
 libavcodec/aarch64/vc1dsp_neon.S         | 176 +++++++++++++++++++++++
 libavcodec/arm/vc1dsp_init_neon.c        |  60 ++++++++
 libavcodec/arm/vc1dsp_neon.S             | 118 +++++++++++++++
 libavcodec/vc1dec.c                      |  20 +--
 libavcodec/vc1dsp.c                      |   2 +
 libavcodec/vc1dsp.h                      |   3 +
 7 files changed, 429 insertions(+), 10 deletions(-)
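A note on how callers reach the new code path: the patch routes all
unescape calls through the VC1DSPContext function pointer, so the choice
between the C fallback and a NEON version is made once, at init time, by
CPU-flag detection. A minimal illustration of the dispatch (not how the
decoder literally structures it; vc1dec.c initialises the context once and
reuses it):

```c
#include <stdint.h>
#include "libavcodec/vc1dsp.h"

/* ff_vc1dsp_init() installs the C implementation, then (on Arm) the
 * per-arch init functions override vc1_unescape_buffer with the NEON
 * version when the runtime CPU flags allow. */
static int unescape_with_dsp(const uint8_t *src, int size, uint8_t *dst)
{
    VC1DSPContext dsp;
    ff_vc1dsp_init(&dsp);   /* picks C or NEON at runtime */
    return dsp.vc1_unescape_buffer(src, size, dst);
}
```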