[FFmpeg-devel,3/3] cbs_h265: Improve performance of writing slices

Submitted by Andreas Rheinhardt on Nov. 4, 2018, 4:48 a.m.

Details

Message ID 20181104044842.3092-4-andreas.rheinhardt@googlemail.com
State New
Headers show

Commit Message

Andreas Rheinhardt Nov. 4, 2018, 4:48 a.m.
Instead of using a combination of a bit reader and a bit writer for copying
the slice data, one can read the input at byte-aligned positions; this makes
the bit reader obsolete (it is removed) and improves performance.
Given that the H.265 slice segment header always occupies a whole number of
bytes, the writer is normally byte-aligned when the slice data begins, so
one can normally use memcpy.
With this patch the number of decicycles used to copy the slice data
went down from 181395 to 8672 for an 830 kb/s sample with 16384 runs.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@googlemail.com>
---
 libavcodec/cbs_h2645.c | 70 +++++++++++++++++++++++++++++-------------
 1 file changed, 48 insertions(+), 22 deletions(-)

Patch hide | download patch | download mbox

diff --git a/libavcodec/cbs_h2645.c b/libavcodec/cbs_h2645.c
index d3a41fbdf0..d9ea498faa 100644
--- a/libavcodec/cbs_h2645.c
+++ b/libavcodec/cbs_h2645.c
@@ -1279,39 +1279,65 @@  static int cbs_h265_write_nal_unit(CodedBitstreamContext *ctx,
     case HEVC_NAL_CRA_NUT:
         {
             H265RawSlice *slice = unit->content;
-            GetBitContext gbc;
-            int bits_left, end, zeroes;
 
             err = cbs_h265_write_slice_segment_header(ctx, pbc, &slice->header);
             if (err < 0)
                 return err;
 
             if (slice->data) {
+                size_t rest = slice->data_size - (slice->data_bit_start + 7) / 8;
+                uint8_t *pos = slice->data + slice->data_bit_start / 8;
+
+                av_assert0(slice->data_bit_start >= 0 &&
+                           8 * slice->data_size > slice->data_bit_start);
+
                 if (slice->data_size * 8 + 8 > put_bits_left(pbc))
                     return AVERROR(ENOSPC);
 
-                init_get_bits(&gbc, slice->data, slice->data_size * 8);
-                skip_bits_long(&gbc, slice->data_bit_start);
-
-                // Copy in two-byte blocks, but stop before copying the
-                // rbsp_stop_one_bit in the final byte.
-                while (get_bits_left(&gbc) > 23)
-                    put_bits(pbc, 16, get_bits(&gbc, 16));
-
-                bits_left = get_bits_left(&gbc);
-                end = get_bits(&gbc, bits_left);
-
-                // rbsp_stop_one_bit must be present here.
-                av_assert0(end);
-                zeroes = ff_ctz(end);
-                if (bits_left > zeroes + 1)
-                    put_bits(pbc, bits_left - zeroes - 1,
-                             end >> (zeroes + 1));
-                put_bits(pbc, 1, 1);
-                while (put_bits_count(pbc) % 8 != 0)
-                    put_bits(pbc, 1, 0);
+                if (!rest)
+                    goto rbsp_stop_one_bit;
+
+                // First copy the remaining bits of the first byte
+                // The above check ensures that we do not accidentally
+                // copy beyond the rbsp_stop_one_bit.
+                if (slice->data_bit_start % 8)
+                    put_bits(pbc, 8 - slice->data_bit_start % 8,
+                            *pos++ & MAX_UINT_BITS(8 - slice->data_bit_start % 8));
+
+                if (put_bits_count(pbc) % 8 == 0) {
+                    // If the writer is aligned at this point,
+                    // memcpy can be used to improve performance.
+                    // This is the normal case.
+                    flush_put_bits(pbc);
+                    memcpy(put_bits_ptr(pbc), pos, rest);
+                    skip_put_bytes(pbc, rest);
+                    break;
+                } else {
+                    // If not, we have to copy manually.
+                    // rbsp_stop_one_bit forces us to special-case
+                    // the last byte.
+                    for (; rest > 4; rest -= 4, pos += 4)
+                        put_bits32(pbc, AV_RB32(pos));
+
+                    for (; rest > 1; rest--, pos++)
+                        put_bits(pbc, 8, *pos);
+                }
+
+                rbsp_stop_one_bit: {
+                    int i;
+                    uint8_t temp = rest ? *pos : *pos & MAX_UINT_BITS(8 -
+                                                 slice->data_bit_start % 8);
+                    av_assert0(temp);
+                    i = ff_ctz(*pos);
+                    temp = temp >> i;
+                    i = rest ? (8 - i) : (8 - i - slice->data_bit_start % 8);
+                    put_bits(pbc, i, temp);
+                    if (put_bits_count(pbc) % 8)
+                        put_bits(pbc, 8 - put_bits_count(pbc) % 8, 0U);
+                }
             } else {
                 // No slice data - that was just the header.
+                // (Bitstream may be unaligned!)
             }
         }
         break;