diff mbox

[FFmpeg-devel,2/5] startcode: Switch to aligned reads

Message ID 20190609110053.4012-3-andreas.rheinhardt@gmail.com
State New
Headers show

Commit Message

Andreas Rheinhardt June 9, 2019, 11 a.m. UTC
ff_startcode_find_candidate_c already checked multiple bytes for zeros at
once if HAVE_FAST_UNALIGNED is true; up until now the other case checked
all bytes one by one. This has been modified: A few bytes are checked
until alignment is reached from which point on several bytes can be
checked at once via aligned reads.
This might cause a slight performance degradation if HAVE_FAST_UNALIGNED
is true, but this is only temporary, as this patch is preparatory for
further patches where benchmarks have shown aligned accesses to be faster.
On an x64 Haswell this led to a performance degradation of ca. 3% (from
411578 decicycles to 424503 decicycles based upon 10 iterations with 8192
runs each) when reading a 30.2 Mb/s H.264 stream from a transport stream;
for another file it was 4.9% (from 55476 to 58326 decicycles based on
10 iterations with 131072 runs each).

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@gmail.com>
---
 libavcodec/startcode.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)
diff mbox

Patch

diff --git a/libavcodec/startcode.c b/libavcodec/startcode.c
index a55a8fafa6..373572365b 100644
--- a/libavcodec/startcode.c
+++ b/libavcodec/startcode.c
@@ -33,8 +33,13 @@  int ff_startcode_find_candidate_c(const uint8_t *buf, int size)
 {
     const uint8_t *start = buf, *end = buf + size;
 
-#if HAVE_FAST_UNALIGNED
-#define READ(bitness) AV_RN ## bitness
+#define INITIALIZATION(mod) do {                                           \
+    for (; buf < end && (uintptr_t)buf % mod; buf++)                       \
+        if (!*buf)                                                         \
+            return buf - start;                                            \
+    } while (0)
+
+#define READ(bitness) AV_RN ## bitness ## A
 #define MAIN_LOOP(bitness, mask1, mask2) do {                              \
         /* we check p < end instead of p + 3 / 7 because it is
          * simpler and there must be AV_INPUT_BUFFER_PADDING_SIZE
@@ -46,10 +51,11 @@  int ff_startcode_find_candidate_c(const uint8_t *buf, int size)
     } while (0)
 
 #if HAVE_FAST_64BIT
+    INITIALIZATION(8);
     MAIN_LOOP(64, 0x0101010101010101ULL, 0x8080808080808080ULL);
 #else
+    INITIALIZATION(4);
     MAIN_LOOP(32, 0x01010101U, 0x80808080U);
-#endif
 #endif
     for (; buf < end; buf++)
         if (!*buf)