diff mbox

[FFmpeg-devel,21/37] avformat/matroskadec: Introduce a "last known good" position

Message ID 20190516223018.30827-22-andreas.rheinhardt@gmail.com
State Accepted
Commit a3db9f62a42a8e5365de67722ecfac7065a70699
Headers show

Commit Message

Andreas Rheinhardt May 16, 2019, 10:30 p.m. UTC
Currently, resyncing during reading packets works as follows:
The current position is recorded, then a call to matroska_parse_cluster
is made and if said call fails, the demuxer tries to resync from the
earlier position. If the call doesn't fail, but also doesn't deliver a
packet, then this is looped.

There are two problems with this approach:
1. The Matroska file format aims to be forward-compatible; to achieve
this, a demuxer should simply ignore and skip elements it doesn't
know about. But it is not possible to reliably distinguish unknown
elements from junk. If matroska_parse_cluster encounters an unknown
element, it can therefore not simply error out; instead it returns zero
and the loop is iterated which includes an update of the position that
is intended to be used in case of errors, i.e. the element that is
skipped is not searched for level 1 element ids to resync to at all if
later calls to matroska_parse_cluster return an error.
Notice that in case that sync has been lost there can be a chain of
several unknown/possibly junk elements before an error is detected.

2. Even if a call to matroska_parse_cluster delivers a packet, this does
not mean that everything is fine. E.g. it might be that some of the
block's data is missing and that the data that was presumed to be from
the block just read actually contains the beginning of the next element.
This will only be apparent at the next call of matroska_read_packet,
which uses the (false) end of the earlier block as resync position so
that in the (not unlikely) case that the call to matroska_parse_cluster
fails, the data believed to be part of the earlier block is not searched
for a level 1 element to resync to.

To counter this, a "last known good" position is introduced. When an
element id that is known to be allowed at this position in the hierarchy
(according to the syntax currently in use for parsing) is read and some
further checks (regarding the length of the element and its containing
master element) are passed, then the beginning of the current element is
treated as a "good" position and recorded as such in the
MatroskaDemuxContext. Because of 2., only the start of the element is
treated as a "good" position, not the whole element. If an error occurs
later during parsing of clusters, the resync process starts at the last
known good position.

Given that when the header is damaged the subsequent resync never skips over
data and is therefore unaffected by both issues, the "last known good"
concept is not used there.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@gmail.com>
 libavformat/matroskadec.c | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)
diff mbox


diff --git a/libavformat/matroskadec.c b/libavformat/matroskadec.c
index deb01578d5..949169d708 100644
--- a/libavformat/matroskadec.c
+++ b/libavformat/matroskadec.c
@@ -335,6 +335,7 @@  typedef struct MatroskaDemuxContext {
     int num_levels;
     MatroskaLevel levels[EBML_MAX_DEPTH];
     uint32_t current_id;
+    int64_t  resync_pos;
     uint64_t time_scale;
     double   duration;
@@ -754,6 +755,9 @@  static int matroska_reset_status(MatroskaDemuxContext *matroska,
     matroska->current_id = id;
     matroska->num_levels = 1;
     matroska->current_cluster.pos = 0;
+    matroska->resync_pos = avio_tell(matroska->ctx->pb);
+    if (id)
+        matroska->resync_pos -= (av_log2(id) + 7) / 8;
     return 0;
@@ -1164,7 +1168,8 @@  static int ebml_parse(MatroskaDemuxContext *matroska,
     AVIOContext *pb = matroska->ctx->pb;
     uint32_t id;
     uint64_t length;
-    int res;
+    int64_t pos = avio_tell(pb);
+    int res, update_pos = 1;
     void *newelem;
     MatroskaLevel1Element *level1_elem;
@@ -1177,7 +1182,8 @@  static int ebml_parse(MatroskaDemuxContext *matroska,
                     res == AVERROR_EOF) ? 1 : res;
         matroska->current_id = id | 1 << 7 * res;
-    }
+    } else
+        pos -= (av_log2(matroska->current_id) + 7) / 8;
     id = matroska->current_id;
@@ -1188,6 +1194,7 @@  static int ebml_parse(MatroskaDemuxContext *matroska,
         return 0;  // we reached the end of an unknown size cluster
     if (!syntax->id && id != EBML_ID_VOID && id != EBML_ID_CRC32) {
         av_log(matroska->ctx, AV_LOG_DEBUG, "Unknown entry 0x%"PRIX32"\n", id);
+        update_pos = 0;
     data = (char *) data + syntax->data_offset;
@@ -1242,6 +1249,13 @@  static int ebml_parse(MatroskaDemuxContext *matroska,
                 return AVERROR_INVALIDDATA;
+        if (update_pos) {
+            // We have found an element that is allowed at this place
+            // in the hierarchy and it passed all checks, so treat the beginning
+            // of the element as the "last known good" position.
+            matroska->resync_pos = pos;
+        }
     switch (syntax->type) {
@@ -3562,12 +3576,16 @@  static int matroska_read_packet(AVFormatContext *s, AVPacket *pkt)
     MatroskaDemuxContext *matroska = s->priv_data;
     int ret = 0;
+    if (matroska->resync_pos == -1) {
+        // This can only happen if generic seeking has been used.
+        matroska->resync_pos = avio_tell(s->pb);
+    }
     while (matroska_deliver_packet(matroska, pkt)) {
-        int64_t pos = avio_tell(matroska->ctx->pb);
         if (matroska->done)
             return (ret < 0) ? ret : AVERROR_EOF;
         if (matroska_parse_cluster(matroska) < 0)
-            ret = matroska_resync(matroska, pos);
+            ret = matroska_resync(matroska, matroska->resync_pos);
     return 0;
@@ -3629,6 +3647,7 @@  err:
     // slightly hackish but allows proper fallback to
     // the generic seeking code.
     matroska_reset_status(matroska, 0, -1);
+    matroska->resync_pos = -1;
     st->skip_to_keyframe =
     matroska->skip_to_keyframe = 0;