diff mbox

[FFmpeg-devel] cbs_h2645: Improve performance of writing slices

Message ID 20181111224305.4480-1-andreas.rheinhardt@googlemail.com
State Accepted
Headers show

Commit Message

Andreas Rheinhardt Nov. 11, 2018, 10:43 p.m. UTC
Instead of using a combination of a bitreader and a bitwriter for copying data,
one can byte-align the read position (which makes the bitreader obsolete, so it
is removed) to improve performance.
With the right alignment one can even use memcpy. The right alignment
normally exists for CABAC and hence for H.265 in general.
For aligned data this reduced the time to copy the slicedata from
776520 decicycles to 33889 with 262144 runs and a 6.5 Mb/s H.264 video.
For unaligned data the number went down from 279196 to 97739 decicycles.
---
 libavcodec/cbs_h2645.c | 119 ++++++++++++++++++++++++-----------------
 1 file changed, 69 insertions(+), 50 deletions(-)

Comments

Mark Thompson Nov. 12, 2018, 12:19 a.m. UTC | #1
On 11/11/18 22:43, Andreas Rheinhardt wrote:
> Instead of using a combination of bitreader and -writer for copying data,
> one can byte-align the (obsolete and removed) bitreader to improve performance.
> With the right alignment one can even use memcpy. The right alignment
> normally exists for CABAC and hence for H.265 in general.
> For aligned data this reduced the time to copy the slicedata from
> 776520 decicycles to 33889 with 262144 runs and a 6.5mb/s H.264 video.
> For unaligned data the number went down from 279196 to 97739 decicycles.
> ---
>  libavcodec/cbs_h2645.c | 119 ++++++++++++++++++++++++-----------------
>  1 file changed, 69 insertions(+), 50 deletions(-)
> 
> diff --git a/libavcodec/cbs_h2645.c b/libavcodec/cbs_h2645.c
> index e55bd00183..416d3fd32a 100644
> --- a/libavcodec/cbs_h2645.c
> +++ b/libavcodec/cbs_h2645.c
> @@ -1050,6 +1050,64 @@ static int cbs_h265_read_nal_unit(CodedBitstreamContext *ctx,
>      return 0;
>  }
>  
> +static int cbs_h2645_write_slice_data(CodedBitstreamContext *ctx,
> +                                      PutBitContext *pbc, const uint8_t *data,
> +                                      size_t data_size, int data_bit_start)
> +{
> +    size_t rest  = data_size - (data_bit_start + 7) / 8;
> +    const uint8_t *pos = data + data_bit_start / 8;
> +
> +    av_assert0(data_bit_start >= 0 &&
> +               8 * data_size > data_bit_start);
> +
> +    if (data_size * 8 + 8 > put_bits_left(pbc))
> +        return AVERROR(ENOSPC);
> +
> +    if (!rest)
> +        goto rbsp_stop_one_bit;
> +
> +    // First copy the remaining bits of the first byte
> +    // The above check ensures that we do not accidentally
> +    // copy beyond the rbsp_stop_one_bit.
> +    if (data_bit_start % 8)
> +        put_bits(pbc, 8 - data_bit_start % 8,
> +                *pos++ & MAX_UINT_BITS(8 - data_bit_start % 8));
> +
> +    if (put_bits_count(pbc) % 8 == 0) {
> +        // If the writer is aligned at this point,
> +        // memcpy can be used to improve performance.
> +        // This happens normally for CABAC.
> +        flush_put_bits(pbc);
> +        memcpy(put_bits_ptr(pbc), pos, rest);
> +        skip_put_bytes(pbc, rest);
> +    } else {
> +        // If not, we have to copy manually.
> +        // rbsp_stop_one_bit forces us to special-case
> +        // the last byte.
> +        uint8_t temp;
> +        int i;
> +
> +        for (; rest > 4; rest -= 4, pos += 4)
> +            put_bits32(pbc, AV_RB32(pos));
> +
> +        for (; rest > 1; rest--, pos++)
> +            put_bits(pbc, 8, *pos);
> +
> +    rbsp_stop_one_bit:
> +        temp = rest ? *pos : *pos & MAX_UINT_BITS(8 - data_bit_start % 8);
> +
> +        av_assert0(temp);
> +        i = ff_ctz(*pos);
> +        temp = temp >> i;
> +        i = rest ? (8 - i) : (8 - i - data_bit_start % 8);
> +        put_bits(pbc, i, temp);
> +        if (put_bits_count(pbc) % 8)
> +            put_bits(pbc, 8 - put_bits_count(pbc) % 8, 0U);
> +    }
> +
> +    return 0;
> +}
> +
>  static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx,
>                                     CodedBitstreamUnit *unit,
>                                     PutBitContext *pbc)
> @@ -1100,37 +1158,17 @@ static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx,
>      case H264_NAL_AUXILIARY_SLICE:
>          {
>              H264RawSlice *slice = unit->content;
> -            GetBitContext gbc;
> -            int bits_left, end, zeroes;
>  
>              err = cbs_h264_write_slice_header(ctx, pbc, &slice->header);
>              if (err < 0)
>                  return err;
>  
>              if (slice->data) {
> -                if (slice->data_size * 8 + 8 > put_bits_left(pbc))
> -                    return AVERROR(ENOSPC);
> -
> -                init_get_bits(&gbc, slice->data, slice->data_size * 8);
> -                skip_bits_long(&gbc, slice->data_bit_start);
> -
> -                // Copy in two-byte blocks, but stop before copying the
> -                // rbsp_stop_one_bit in the final byte.
> -                while (get_bits_left(&gbc) > 23)
> -                    put_bits(pbc, 16, get_bits(&gbc, 16));
> -
> -                bits_left = get_bits_left(&gbc);
> -                end = get_bits(&gbc, bits_left);
> -
> -                // rbsp_stop_one_bit must be present here.
> -                av_assert0(end);
> -                zeroes = ff_ctz(end);
> -                if (bits_left > zeroes + 1)
> -                    put_bits(pbc, bits_left - zeroes - 1,
> -                             end >> (zeroes + 1));
> -                put_bits(pbc, 1, 1);
> -                while (put_bits_count(pbc) % 8 != 0)
> -                    put_bits(pbc, 1, 0);
> +                err = cbs_h2645_write_slice_data(ctx, pbc, slice->data,
> +                                                 slice->data_size,
> +                                                 slice->data_bit_start);
> +                if (err < 0)
> +                    return err;
>              } else {
>                  // No slice data - that was just the header.
>                  // (Bitstream may be unaligned!)
> @@ -1254,39 +1292,20 @@ static int cbs_h265_write_nal_unit(CodedBitstreamContext *ctx,
>      case HEVC_NAL_CRA_NUT:
>          {
>              H265RawSlice *slice = unit->content;
> -            GetBitContext gbc;
> -            int bits_left, end, zeroes;
>  
>              err = cbs_h265_write_slice_segment_header(ctx, pbc, &slice->header);
>              if (err < 0)
>                  return err;
>  
>              if (slice->data) {
> -                if (slice->data_size * 8 + 8 > put_bits_left(pbc))
> -                    return AVERROR(ENOSPC);
> -
> -                init_get_bits(&gbc, slice->data, slice->data_size * 8);
> -                skip_bits_long(&gbc, slice->data_bit_start);
> -
> -                // Copy in two-byte blocks, but stop before copying the
> -                // rbsp_stop_one_bit in the final byte.
> -                while (get_bits_left(&gbc) > 23)
> -                    put_bits(pbc, 16, get_bits(&gbc, 16));
> -
> -                bits_left = get_bits_left(&gbc);
> -                end = get_bits(&gbc, bits_left);
> -
> -                // rbsp_stop_one_bit must be present here.
> -                av_assert0(end);
> -                zeroes = ff_ctz(end);
> -                if (bits_left > zeroes + 1)
> -                    put_bits(pbc, bits_left - zeroes - 1,
> -                             end >> (zeroes + 1));
> -                put_bits(pbc, 1, 1);
> -                while (put_bits_count(pbc) % 8 != 0)
> -                    put_bits(pbc, 1, 0);
> +                err = cbs_h2645_write_slice_data(ctx, pbc, slice->data,
> +                                                 slice->data_size,
> +                                                 slice->data_bit_start);
> +                if (err < 0)
> +                    return err;
>              } else {
>                  // No slice data - that was just the header.
> +                // (Bitstream may be unaligned!)

This comment change isn't accurate - the bitstream will always be aligned for H.265 in this case because byte alignment is included at the end of the slice segment header.  (I've just removed it.)

>              }
>          }
>          break;
> 

LGTM, tested, applied.

Thanks!

- Mark


On 11/11/18 22:32, Andreas Rheinhardt wrote:
> ...  Btw: What was the normal speedup you got when copying in the aligned mode?

Macro-level tests come out very well here.

Test files:
A  900 frames of single-slice 4K H.265, 1.5GB (900 writes of slice data, averaging 1.6MB each).
B  38074 frames of 32-slice 1080p H.264, 1.5GB (1.2 million writes of slice data, averaging 1.2kB each).

In each case, get input file into memory, "./ffmpeg -i input-file -c:v copy -bsf:v hxxx_metadata -f null -", run five times and average the result.

Intel 8700 (Coffee Lake):
A    ~83fps ->  ~274fps
B  ~5150fps -> ~8160fps

Rockchip 3288 (Cortex A15):
A    ~31fps ->   ~48fps
B  ~1210fps -> ~1700fps

So, around 50% increase in hxxx_metadata throughput for these cases.  (And I guess Intel is very good at large memcpy for the first one.)
diff mbox

Patch

diff --git a/libavcodec/cbs_h2645.c b/libavcodec/cbs_h2645.c
index e55bd00183..416d3fd32a 100644
--- a/libavcodec/cbs_h2645.c
+++ b/libavcodec/cbs_h2645.c
@@ -1050,6 +1050,64 @@  static int cbs_h265_read_nal_unit(CodedBitstreamContext *ctx,
     return 0;
 }
 
+static int cbs_h2645_write_slice_data(CodedBitstreamContext *ctx,
+                                      PutBitContext *pbc, const uint8_t *data,
+                                      size_t data_size, int data_bit_start)
+{
+    size_t rest  = data_size - (data_bit_start + 7) / 8;
+    const uint8_t *pos = data + data_bit_start / 8;
+
+    av_assert0(data_bit_start >= 0 &&
+               8 * data_size > data_bit_start);
+
+    if (data_size * 8 + 8 > put_bits_left(pbc))
+        return AVERROR(ENOSPC);
+
+    if (!rest)
+        goto rbsp_stop_one_bit;
+
+    // First copy the remaining bits of the first byte
+    // The above check ensures that we do not accidentally
+    // copy beyond the rbsp_stop_one_bit.
+    if (data_bit_start % 8)
+        put_bits(pbc, 8 - data_bit_start % 8,
+                *pos++ & MAX_UINT_BITS(8 - data_bit_start % 8));
+
+    if (put_bits_count(pbc) % 8 == 0) {
+        // If the writer is aligned at this point,
+        // memcpy can be used to improve performance.
+        // This happens normally for CABAC.
+        flush_put_bits(pbc);
+        memcpy(put_bits_ptr(pbc), pos, rest);
+        skip_put_bytes(pbc, rest);
+    } else {
+        // If not, we have to copy manually.
+        // rbsp_stop_one_bit forces us to special-case
+        // the last byte.
+        uint8_t temp;
+        int i;
+
+        for (; rest > 4; rest -= 4, pos += 4)
+            put_bits32(pbc, AV_RB32(pos));
+
+        for (; rest > 1; rest--, pos++)
+            put_bits(pbc, 8, *pos);
+
+    rbsp_stop_one_bit:
+        temp = rest ? *pos : *pos & MAX_UINT_BITS(8 - data_bit_start % 8);
+
+        av_assert0(temp);
+        i = ff_ctz(*pos);
+        temp = temp >> i;
+        i = rest ? (8 - i) : (8 - i - data_bit_start % 8);
+        put_bits(pbc, i, temp);
+        if (put_bits_count(pbc) % 8)
+            put_bits(pbc, 8 - put_bits_count(pbc) % 8, 0U);
+    }
+
+    return 0;
+}
+
 static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx,
                                    CodedBitstreamUnit *unit,
                                    PutBitContext *pbc)
@@ -1100,37 +1158,17 @@  static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx,
     case H264_NAL_AUXILIARY_SLICE:
         {
             H264RawSlice *slice = unit->content;
-            GetBitContext gbc;
-            int bits_left, end, zeroes;
 
             err = cbs_h264_write_slice_header(ctx, pbc, &slice->header);
             if (err < 0)
                 return err;
 
             if (slice->data) {
-                if (slice->data_size * 8 + 8 > put_bits_left(pbc))
-                    return AVERROR(ENOSPC);
-
-                init_get_bits(&gbc, slice->data, slice->data_size * 8);
-                skip_bits_long(&gbc, slice->data_bit_start);
-
-                // Copy in two-byte blocks, but stop before copying the
-                // rbsp_stop_one_bit in the final byte.
-                while (get_bits_left(&gbc) > 23)
-                    put_bits(pbc, 16, get_bits(&gbc, 16));
-
-                bits_left = get_bits_left(&gbc);
-                end = get_bits(&gbc, bits_left);
-
-                // rbsp_stop_one_bit must be present here.
-                av_assert0(end);
-                zeroes = ff_ctz(end);
-                if (bits_left > zeroes + 1)
-                    put_bits(pbc, bits_left - zeroes - 1,
-                             end >> (zeroes + 1));
-                put_bits(pbc, 1, 1);
-                while (put_bits_count(pbc) % 8 != 0)
-                    put_bits(pbc, 1, 0);
+                err = cbs_h2645_write_slice_data(ctx, pbc, slice->data,
+                                                 slice->data_size,
+                                                 slice->data_bit_start);
+                if (err < 0)
+                    return err;
             } else {
                 // No slice data - that was just the header.
                 // (Bitstream may be unaligned!)
@@ -1254,39 +1292,20 @@  static int cbs_h265_write_nal_unit(CodedBitstreamContext *ctx,
     case HEVC_NAL_CRA_NUT:
         {
             H265RawSlice *slice = unit->content;
-            GetBitContext gbc;
-            int bits_left, end, zeroes;
 
             err = cbs_h265_write_slice_segment_header(ctx, pbc, &slice->header);
             if (err < 0)
                 return err;
 
             if (slice->data) {
-                if (slice->data_size * 8 + 8 > put_bits_left(pbc))
-                    return AVERROR(ENOSPC);
-
-                init_get_bits(&gbc, slice->data, slice->data_size * 8);
-                skip_bits_long(&gbc, slice->data_bit_start);
-
-                // Copy in two-byte blocks, but stop before copying the
-                // rbsp_stop_one_bit in the final byte.
-                while (get_bits_left(&gbc) > 23)
-                    put_bits(pbc, 16, get_bits(&gbc, 16));
-
-                bits_left = get_bits_left(&gbc);
-                end = get_bits(&gbc, bits_left);
-
-                // rbsp_stop_one_bit must be present here.
-                av_assert0(end);
-                zeroes = ff_ctz(end);
-                if (bits_left > zeroes + 1)
-                    put_bits(pbc, bits_left - zeroes - 1,
-                             end >> (zeroes + 1));
-                put_bits(pbc, 1, 1);
-                while (put_bits_count(pbc) % 8 != 0)
-                    put_bits(pbc, 1, 0);
+                err = cbs_h2645_write_slice_data(ctx, pbc, slice->data,
+                                                 slice->data_size,
+                                                 slice->data_bit_start);
+                if (err < 0)
+                    return err;
             } else {
                 // No slice data - that was just the header.
+                // (Bitstream may be unaligned!)
             }
         }
         break;