[FFmpeg-devel,2/3] cbs_h264: Improve performance of writing slices

Submitted by Andreas Rheinhardt on Nov. 4, 2018, 4:48 a.m.

Details

Message ID 20181104044842.3092-3-andreas.rheinhardt@googlemail.com
State New
Headers show

Commit Message

Andreas Rheinhardt Nov. 4, 2018, 4:48 a.m.
Instead of using a combination of bitreader and -writer for copying data,
one can byte-align the (obsolete and removed) bitreader to improve performance.
With the right alignment one can even use memcpy. The right alignment
normally exists for CABAC.
For aligned data this reduced the time to copy the slicedata from
776520 decicycles to 33889 with 262144 runs and a 6.5mb/s H.264 video.
For unaligned data the number went down from 279196 to 97739 decicycles.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@googlemail.com>
---
 libavcodec/cbs_h2645.c | 67 +++++++++++++++++++++++++++++-------------
 1 file changed, 46 insertions(+), 21 deletions(-)

Comments

Mark Thompson Nov. 11, 2018, 7:49 p.m.
On 04/11/18 04:48, Andreas Rheinhardt wrote:
> Instead of using a combination of bitreader and -writer for copying data,
> one can byte-align the (obsolete and removed) bitreader to improve performance.
> With the right alignment one can even use memcpy. The right alignment
> normally exists for CABAC.
> For aligned data this reduced the time to copy the slicedata from
> 776520 decicycles to 33889 with 262144 runs and a 6.5mb/s H.264 video.
> For unaligned data the number went down from 279196 to 97739 decicycles.
> 
> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@googlemail.com>
> ---
>  libavcodec/cbs_h2645.c | 67 +++++++++++++++++++++++++++++-------------
>  1 file changed, 46 insertions(+), 21 deletions(-)
> 
> diff --git a/libavcodec/cbs_h2645.c b/libavcodec/cbs_h2645.c
> index e55bd00183..d3a41fbdf0 100644
> --- a/libavcodec/cbs_h2645.c
> +++ b/libavcodec/cbs_h2645.c
> @@ -1100,37 +1100,62 @@ static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx,
>      case H264_NAL_AUXILIARY_SLICE:
>          {
>              H264RawSlice *slice = unit->content;
> -            GetBitContext gbc;
> -            int bits_left, end, zeroes;
>  
>              err = cbs_h264_write_slice_header(ctx, pbc, &slice->header);
>              if (err < 0)
>                  return err;
>  
>              if (slice->data) {
> -                if (slice->data_size * 8 + 8 > put_bits_left(pbc))
> -                    return AVERROR(ENOSPC);
> +                size_t rest = slice->data_size - (slice->data_bit_start + 7) / 8;
> +                uint8_t *pos = slice->data + slice->data_bit_start / 8;
>  
> -                init_get_bits(&gbc, slice->data, slice->data_size * 8);
> -                skip_bits_long(&gbc, slice->data_bit_start);
> +                av_assert0(slice->data_bit_start >= 0 &&
> +                           8 * slice->data_size > slice->data_bit_start);
>  
> -                // Copy in two-byte blocks, but stop before copying the
> -                // rbsp_stop_one_bit in the final byte.
> -                while (get_bits_left(&gbc) > 23)
> -                    put_bits(pbc, 16, get_bits(&gbc, 16));
> +                if (slice->data_size * 8 + 8 > put_bits_left(pbc))
> +                    return AVERROR(ENOSPC);
>  
> -                bits_left = get_bits_left(&gbc);
> -                end = get_bits(&gbc, bits_left);
> +                if (!rest)
> +                    goto rbsp_stop_one_bit;
> +
> +                // First copy the remaining bits of the first byte
> +                // The above check ensures that we do not accidentally
> +                // copy beyond the rbsp_stop_one_bit.
> +                if (slice->data_bit_start % 8)
> +                    put_bits(pbc, 8 - slice->data_bit_start % 8,
> +                            *pos++ & MAX_UINT_BITS(8 - slice->data_bit_start % 8));
> +
> +                if (put_bits_count(pbc) % 8 == 0) {
> +                    // If the writer is aligned at this point,
> +                    // memcpy can be used to improve performance.
> +                    // This happens normally for CABAC.
> +                    flush_put_bits(pbc);
> +                    memcpy(put_bits_ptr(pbc), pos, rest);
> +                    skip_put_bytes(pbc, rest);
> +                    break;

From reading the code I find this break slightly surprising, because it jumps out of the switch which is a layer above the code being changed.  I think a little rearrangement would avoid it and make the code simpler to understand?  (Alternatively, the change suggested below would have the same effect.)

> +                } else {
> +                    // If not, we have to copy manually.
> +                    // rbsp_stop_one_bit forces us to special-case
> +                    // the last byte.
> +                    for (; rest > 4; rest -= 4, pos += 4)
> +                        put_bits32(pbc, AV_RB32(pos));
> +
> +                    for (; rest > 1; rest--, pos++)
> +                        put_bits(pbc, 8, *pos);
> +                }
>  
> -                // rbsp_stop_one_bit must be present here.
> -                av_assert0(end);
> -                zeroes = ff_ctz(end);
> -                if (bits_left > zeroes + 1)
> -                    put_bits(pbc, bits_left - zeroes - 1,
> -                             end >> (zeroes + 1));
> -                put_bits(pbc, 1, 1);
> -                while (put_bits_count(pbc) % 8 != 0)
> -                    put_bits(pbc, 1, 0);
> +                rbsp_stop_one_bit: {
> +                    int i;
> +                    uint8_t temp = rest ? *pos : *pos & MAX_UINT_BITS(8 -
> +                                                 slice->data_bit_start % 8);
> +                    av_assert0(temp);
> +                    i = ff_ctz(*pos);
> +                    temp = temp >> i;
> +                    i = rest ? (8 - i) : (8 - i - slice->data_bit_start % 8);
> +                    put_bits(pbc, i, temp);
> +                    if (put_bits_count(pbc) % 8)
> +                        put_bits(pbc, 8 - put_bits_count(pbc) % 8, 0U);
> +                }
>              } else {
>                  // No slice data - that was just the header.
>                  // (Bitstream may be unaligned!)
> 

Looks good (and tested), but this code is now somewhat larger and the duplication between H.264 and H.265 feels worse.  Would it be sensible to extract it into a separate function something like

int cbs_h2645_write_slice_data(CodedBitstreamContext *ctx, PutBitContext *pbc, const uint8_t *data, size_t data_size, int data_bit_start)

to call in in both cases if slice->data is true?

Thanks,

- Mark
Andreas Rheinhardt Nov. 11, 2018, 10:32 p.m.
I have created a new version that addresses all your concerns. Will send it in a moment.
I actually wanted to make a separate function, but decided against it because the code already contained the duplication, so I thought that is how you liked it.
Thanks for the review. Btw: What was the normal speedup you got when copying in the aligned mode?

Patch hide | download patch | download mbox

diff --git a/libavcodec/cbs_h2645.c b/libavcodec/cbs_h2645.c
index e55bd00183..d3a41fbdf0 100644
--- a/libavcodec/cbs_h2645.c
+++ b/libavcodec/cbs_h2645.c
@@ -1100,37 +1100,62 @@  static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx,
     case H264_NAL_AUXILIARY_SLICE:
         {
             H264RawSlice *slice = unit->content;
-            GetBitContext gbc;
-            int bits_left, end, zeroes;
 
             err = cbs_h264_write_slice_header(ctx, pbc, &slice->header);
             if (err < 0)
                 return err;
 
             if (slice->data) {
-                if (slice->data_size * 8 + 8 > put_bits_left(pbc))
-                    return AVERROR(ENOSPC);
+                size_t rest = slice->data_size - (slice->data_bit_start + 7) / 8;
+                uint8_t *pos = slice->data + slice->data_bit_start / 8;
 
-                init_get_bits(&gbc, slice->data, slice->data_size * 8);
-                skip_bits_long(&gbc, slice->data_bit_start);
+                av_assert0(slice->data_bit_start >= 0 &&
+                           8 * slice->data_size > slice->data_bit_start);
 
-                // Copy in two-byte blocks, but stop before copying the
-                // rbsp_stop_one_bit in the final byte.
-                while (get_bits_left(&gbc) > 23)
-                    put_bits(pbc, 16, get_bits(&gbc, 16));
+                if (slice->data_size * 8 + 8 > put_bits_left(pbc))
+                    return AVERROR(ENOSPC);
 
-                bits_left = get_bits_left(&gbc);
-                end = get_bits(&gbc, bits_left);
+                if (!rest)
+                    goto rbsp_stop_one_bit;
+
+                // First copy the remaining bits of the first byte
+                // The above check ensures that we do not accidentally
+                // copy beyond the rbsp_stop_one_bit.
+                if (slice->data_bit_start % 8)
+                    put_bits(pbc, 8 - slice->data_bit_start % 8,
+                            *pos++ & MAX_UINT_BITS(8 - slice->data_bit_start % 8));
+
+                if (put_bits_count(pbc) % 8 == 0) {
+                    // If the writer is aligned at this point,
+                    // memcpy can be used to improve performance.
+                    // This happens normally for CABAC.
+                    flush_put_bits(pbc);
+                    memcpy(put_bits_ptr(pbc), pos, rest);
+                    skip_put_bytes(pbc, rest);
+                    break;
+                } else {
+                    // If not, we have to copy manually.
+                    // rbsp_stop_one_bit forces us to special-case
+                    // the last byte.
+                    for (; rest > 4; rest -= 4, pos += 4)
+                        put_bits32(pbc, AV_RB32(pos));
+
+                    for (; rest > 1; rest--, pos++)
+                        put_bits(pbc, 8, *pos);
+                }
 
-                // rbsp_stop_one_bit must be present here.
-                av_assert0(end);
-                zeroes = ff_ctz(end);
-                if (bits_left > zeroes + 1)
-                    put_bits(pbc, bits_left - zeroes - 1,
-                             end >> (zeroes + 1));
-                put_bits(pbc, 1, 1);
-                while (put_bits_count(pbc) % 8 != 0)
-                    put_bits(pbc, 1, 0);
+                rbsp_stop_one_bit: {
+                    int i;
+                    uint8_t temp = rest ? *pos : *pos & MAX_UINT_BITS(8 -
+                                                 slice->data_bit_start % 8);
+                    av_assert0(temp);
+                    i = ff_ctz(*pos);
+                    temp = temp >> i;
+                    i = rest ? (8 - i) : (8 - i - slice->data_bit_start % 8);
+                    put_bits(pbc, i, temp);
+                    if (put_bits_count(pbc) % 8)
+                        put_bits(pbc, 8 - put_bits_count(pbc) % 8, 0U);
+                }
             } else {
                 // No slice data - that was just the header.
                 // (Bitstream may be unaligned!)