diff mbox series

[FFmpeg-devel,v3,2/2] avcodec/put_bits: Make bit buffers 64-bit

Message ID 20200718145303.17059-2-steinar+ffmpeg@gunderson.no
State Accepted
Commit 88d80cb97528d52dac3178cf5393d6095eca6200
Headers show
Series [FFmpeg-devel,v3,1/2] avcodec/put_bits: Parametrize bit buffer type | expand

Checks

Context Check Description
andriy/default pending
andriy/make_warn warning New warnings during build
andriy/make success Make finished
andriy/make_fate success Make fate finished

Commit Message

Steinar H. Gunderson July 18, 2020, 2:53 p.m. UTC
Change BitBuf into uint64_t on 64-bit x86. This means we need to flush the
buffer less often, which is a significant speed win. All other platforms,
including all 32-bit ones, are unchanged. Output bitstream is the same.

All API constraints are kept in place, e.g., you still cannot put_bits()
more than 31 bits at a time. This is so that codecs cannot accidentally
become 64-bit-only or similar.

Benchmarking on transcoding to various formats shows consistently
positive results:

  dnxhd                 25.60 fps ->  26.26 fps ( +2.6%)
  dvvideo               24.88 fps ->  25.17 fps ( +1.2%)
  ffv1                  14.32 fps ->  14.58 fps ( +1.8%)
  huffyuv               58.75 fps ->  63.27 fps ( +7.7%)
  jpegls                 6.22 fps ->   6.34 fps ( +1.8%)
  magicyuv              57.10 fps ->  63.29 fps (+10.8%)
  mjpeg                 48.65 fps ->  49.01 fps ( +0.7%)
  mpeg1video            76.41 fps ->  77.01 fps ( +0.8%)
  mpeg2video            75.99 fps ->  77.43 fps ( +1.9%)
  mpeg4                 80.66 fps ->  81.37 fps ( +0.9%)
  prores                12.35 fps ->  12.88 fps ( +4.3%)
  prores_ks             16.20 fps ->  16.80 fps ( +3.7%)
  rv20                  62.80 fps ->  62.99 fps ( +0.3%)
  utvideo               68.41 fps ->  76.32 fps (+11.6%)

Note that these figures include video decoding and all other encoding
work, such as DCTs. If you isolate the actual bit-writing routines, the
speedup is likely to be much larger.

Benchmark details: Transcoding the first 30 seconds of Big Buck Bunny
in 1080p, Haswell 2.1 GHz, GCC 8.3, generally quantizer locked to
5.0. (Exceptions: DNxHD needs fixed bitrate, and JPEG-LS is so slow
that I only took the first 10 seconds, not 30.) Each benchmark was run
ten times, single-threaded; the top and bottom two results were discarded
to get rid of outliers, and the arithmetic mean of the remaining six is
reported.
---
 libavcodec/asvenc.c   |  1 +
 libavcodec/put_bits.h | 31 ++++++++++++++++++++++++-------
 2 files changed, 25 insertions(+), 7 deletions(-)
diff mbox series

Patch

diff --git a/libavcodec/asvenc.c b/libavcodec/asvenc.c
index c2c940f365..28f7a94071 100644
--- a/libavcodec/asvenc.c
+++ b/libavcodec/asvenc.c
@@ -295,6 +295,7 @@  static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     while (put_bits_count(&a->pb) & 31)
         put_bits(&a->pb, 8, 0);
 
+    flush_put_bits(&a->pb);
     size = put_bits_count(&a->pb) / 32;
 
     if (avctx->codec_id == AV_CODEC_ID_ASV1) {
diff --git a/libavcodec/put_bits.h b/libavcodec/put_bits.h
index c6a8f3ac14..ddd97906b2 100644
--- a/libavcodec/put_bits.h
+++ b/libavcodec/put_bits.h
@@ -29,12 +29,20 @@ 
 #include <stdint.h>
 #include <stddef.h>
 
+#include "config.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/avassert.h"
 
+#if ARCH_X86_64
+// TODO: Benchmark and optionally enable on other 64-bit architectures.
+typedef uint64_t BitBuf;
+#define AV_WBBUF AV_WB64
+#define AV_WLBUF AV_WL64
+#else
 typedef uint32_t BitBuf;
 #define AV_WBBUF AV_WB32
 #define AV_WLBUF AV_WL32
+#endif
 
 static const int BUF_BITS = 8 * sizeof(BitBuf);
 
@@ -163,17 +171,11 @@  void avpriv_put_string(PutBitContext *pb, const char *string,
 void avpriv_copy_bits(PutBitContext *pb, const uint8_t *src, int length);
 #endif
 
-/**
- * Write up to 31 bits into a bitstream.
- * Use put_bits32 to write 32 bits.
- */
-static inline void put_bits(PutBitContext *s, int n, BitBuf value)
+static inline void put_bits_no_assert(PutBitContext *s, int n, BitBuf value)
 {
     BitBuf bit_buf;
     int bit_left;
 
-    av_assert2(n <= 31 && value < (1UL << n));
-
     bit_buf  = s->bit_buf;
     bit_left = s->bit_left;
 
@@ -215,6 +217,16 @@  static inline void put_bits(PutBitContext *s, int n, BitBuf value)
     s->bit_left = bit_left;
 }
 
+/**
+ * Write up to 31 bits into a bitstream.
+ * Use put_bits32 to write 32 bits.
+ */
+static inline void put_bits(PutBitContext *s, int n, BitBuf value)
+{
+    av_assert2(n <= 31 && value < (1UL << n));
+    put_bits_no_assert(s, n, value);
+}
+
 static inline void put_bits_le(PutBitContext *s, int n, BitBuf value)
 {
     BitBuf bit_buf;
@@ -258,6 +270,11 @@  static void av_unused put_bits32(PutBitContext *s, uint32_t value)
     BitBuf bit_buf;
     int bit_left;
 
+    if (BUF_BITS > 32) {
+        put_bits_no_assert(s, 32, value);
+        return;
+    }
+
     bit_buf  = s->bit_buf;
     bit_left = s->bit_left;