diff mbox

[FFmpeg-devel,1/3] cbs_mpeg2: Improve performance of writing slices

Message ID 20181104044842.3092-2-andreas.rheinhardt@googlemail.com
State Accepted
Headers show

Commit Message

Andreas Rheinhardt Nov. 4, 2018, 4:48 a.m. UTC
Instead of using a combination of bitreader and -writer for copying data,
one can byte-align the (obsolete and removed) bitreader to improve performance.
One can even use memcpy in the normal case.
This improved the time needed for writing the slicedata from 33618 to
2370 decicycles when tested on a video originating from a DVD (4194394
runs).

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@googlemail.com>
---
 libavcodec/cbs_mpeg2.c | 39 +++++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 12 deletions(-)

Comments

Mark Thompson Nov. 11, 2018, 7:32 p.m. UTC | #1
On 04/11/18 04:48, Andreas Rheinhardt wrote:
> Instead of using a combination of bitreader and -writer for copying data,
> one can byte-align the (obsolete and removed) bitreader to improve performance.
> One can even use memcpy in the normal case.
> This improved the time needed for writing the slicedata from 33618 to
> 2370 decicycles when tested on a video originating from a DVD (4194394
> runs).
> 
> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@googlemail.com>
> ---
>  libavcodec/cbs_mpeg2.c | 39 +++++++++++++++++++++++++++------------
>  1 file changed, 27 insertions(+), 12 deletions(-)
> 
> diff --git a/libavcodec/cbs_mpeg2.c b/libavcodec/cbs_mpeg2.c
> index 0df4234b12..7161f1ee80 100644
> --- a/libavcodec/cbs_mpeg2.c
> +++ b/libavcodec/cbs_mpeg2.c
> @@ -264,8 +264,6 @@ static int cbs_mpeg2_write_slice(CodedBitstreamContext *ctx,
>                                   PutBitContext *pbc)
>  {
>      MPEG2RawSlice *slice = unit->content;
> -    GetBitContext gbc;
> -    size_t bits_left;
>      int err;
>  
>      err = cbs_mpeg2_write_slice_header(ctx, pbc, &slice->header);
> @@ -273,21 +271,38 @@ static int cbs_mpeg2_write_slice(CodedBitstreamContext *ctx,
>          return err;
>  
>      if (slice->data) {
> +        size_t rest = slice->data_size - (slice->data_bit_start + 7) / 8;
> +        uint8_t *pos = slice->data + slice->data_bit_start / 8;
> +
> +        av_assert0(slice->data_bit_start >= 0 &&
> +                   8* slice->data_size > slice->data_bit_start);
> +
>          if (slice->data_size * 8 + 8 > put_bits_left(pbc))
>              return AVERROR(ENOSPC);
>  
> -        init_get_bits(&gbc, slice->data, slice->data_size * 8);
> -        skip_bits_long(&gbc, slice->data_bit_start);
> -
> -        while (get_bits_left(&gbc) > 15)
> -            put_bits(pbc, 16, get_bits(&gbc, 16));
> +        // First copy the remaining bits of the first byte
> +        if (slice->data_bit_start % 8)
> +            put_bits(pbc, 8 - slice->data_bit_start % 8,
> +                    *pos++ & MAX_UINT_BITS(8 - slice->data_bit_start % 8));
> +
> +        if (put_bits_count(pbc) % 8 == 0) {
> +            // If the writer is aligned at this point,
> +            // memcpy can be used to improve performance.
> +            // This is the normal case.
> +            flush_put_bits(pbc);
> +            memcpy(put_bits_ptr(pbc), pos, rest);
> +            skip_put_bytes(pbc, rest);
> +        } else {
> +            // If not, we have to copy manually:
> +            for (; rest > 3; rest -= 4, pos += 4)
> +                put_bits32(pbc, AV_RB32(pos));
>  
> -        bits_left = get_bits_left(&gbc);
> -        put_bits(pbc, bits_left, get_bits(&gbc, bits_left));
> +            for (; rest; rest--, pos++)
> +                put_bits(pbc, 8, *pos);
>  
> -        // Align with zeroes.
> -        while (put_bits_count(pbc) % 8 != 0)
> -            put_bits(pbc, 1, 0);
> +            // Align with zeros
> +            put_bits(pbc, 8 - put_bits_count(pbc) % 8, 0U);
> +        }
>      }
>  
>      return 0;
> 

LGTM, tested and applied.

(I'm not sure that there is much value in handling the case of slice->data_bit_start >= 8 since that currently never happens, but I guess it worked for the old code so I don't mind the extra complexity.)

Thanks!

- Mark
diff mbox

Patch

diff --git a/libavcodec/cbs_mpeg2.c b/libavcodec/cbs_mpeg2.c
index 0df4234b12..7161f1ee80 100644
--- a/libavcodec/cbs_mpeg2.c
+++ b/libavcodec/cbs_mpeg2.c
@@ -264,8 +264,6 @@  static int cbs_mpeg2_write_slice(CodedBitstreamContext *ctx,
                                  PutBitContext *pbc)
 {
     MPEG2RawSlice *slice = unit->content;
-    GetBitContext gbc;
-    size_t bits_left;
     int err;
 
     err = cbs_mpeg2_write_slice_header(ctx, pbc, &slice->header);
@@ -273,21 +271,38 @@  static int cbs_mpeg2_write_slice(CodedBitstreamContext *ctx,
         return err;
 
     if (slice->data) {
+        size_t rest = slice->data_size - (slice->data_bit_start + 7) / 8;
+        uint8_t *pos = slice->data + slice->data_bit_start / 8;
+
+        av_assert0(slice->data_bit_start >= 0 &&
+                   8* slice->data_size > slice->data_bit_start);
+
         if (slice->data_size * 8 + 8 > put_bits_left(pbc))
             return AVERROR(ENOSPC);
 
-        init_get_bits(&gbc, slice->data, slice->data_size * 8);
-        skip_bits_long(&gbc, slice->data_bit_start);
-
-        while (get_bits_left(&gbc) > 15)
-            put_bits(pbc, 16, get_bits(&gbc, 16));
+        // First copy the remaining bits of the first byte
+        if (slice->data_bit_start % 8)
+            put_bits(pbc, 8 - slice->data_bit_start % 8,
+                    *pos++ & MAX_UINT_BITS(8 - slice->data_bit_start % 8));
+
+        if (put_bits_count(pbc) % 8 == 0) {
+            // If the writer is aligned at this point,
+            // memcpy can be used to improve performance.
+            // This is the normal case.
+            flush_put_bits(pbc);
+            memcpy(put_bits_ptr(pbc), pos, rest);
+            skip_put_bytes(pbc, rest);
+        } else {
+            // If not, we have to copy manually:
+            for (; rest > 3; rest -= 4, pos += 4)
+                put_bits32(pbc, AV_RB32(pos));
 
-        bits_left = get_bits_left(&gbc);
-        put_bits(pbc, bits_left, get_bits(&gbc, bits_left));
+            for (; rest; rest--, pos++)
+                put_bits(pbc, 8, *pos);
 
-        // Align with zeroes.
-        while (put_bits_count(pbc) % 8 != 0)
-            put_bits(pbc, 1, 0);
+            // Align with zeros
+            put_bits(pbc, 8 - put_bits_count(pbc) % 8, 0U);
+        }
     }
 
     return 0;