diff mbox series

[FFmpeg-devel,3/5] lavf/matroskadec: support standard (non-WebM) WebVTT formatting

Message ID 20200618004743.95896-3-rcombs@rcombs.me
State New
Headers show
Series [FFmpeg-devel,1/5] lavf: matroska subtitle muxer
Related show

Checks

Context Check Description
andriy/default pending
andriy/make success Make finished
andriy/make_fate success Make fate finished

Commit Message

Ridley Combs June 18, 2020, 12:47 a.m. UTC
Fixes ticket #5641
---
 libavformat/matroska.c    |  11 ++--
 libavformat/matroskadec.c | 111 ++++++++++++++++++++++++--------------
 2 files changed, 77 insertions(+), 45 deletions(-)

Comments

Andreas Rheinhardt June 20, 2020, 9:14 a.m. UTC | #1
rcombs:
> Fixes ticket #5641
> ---
>  libavformat/matroska.c    |  11 ++--
>  libavformat/matroskadec.c | 111 ++++++++++++++++++++++++--------------
>  2 files changed, 77 insertions(+), 45 deletions(-)
> 
> diff --git a/libavformat/matroska.c b/libavformat/matroska.c
> index 7c56aba403..962fa496f4 100644
> --- a/libavformat/matroska.c
> +++ b/libavformat/matroska.c
> @@ -60,16 +60,12 @@ const CodecTags ff_mkv_codec_tags[]={
>      {"A_VORBIS"         , AV_CODEC_ID_VORBIS},
>      {"A_WAVPACK4"       , AV_CODEC_ID_WAVPACK},
>  
> -    {"D_WEBVTT/SUBTITLES"   , AV_CODEC_ID_WEBVTT},
> -    {"D_WEBVTT/CAPTIONS"    , AV_CODEC_ID_WEBVTT},
> -    {"D_WEBVTT/DESCRIPTIONS", AV_CODEC_ID_WEBVTT},
> -    {"D_WEBVTT/METADATA"    , AV_CODEC_ID_WEBVTT},
> -
>      {"S_TEXT/UTF8"      , AV_CODEC_ID_SUBRIP},
>      {"S_TEXT/UTF8"      , AV_CODEC_ID_TEXT},
>      {"S_TEXT/ASCII"     , AV_CODEC_ID_TEXT},
>      {"S_TEXT/ASS"       , AV_CODEC_ID_ASS},
>      {"S_TEXT/SSA"       , AV_CODEC_ID_ASS},
> +    {"S_TEXT/WEBVTT"    , AV_CODEC_ID_WEBVTT},
>      {"S_ASS"            , AV_CODEC_ID_ASS},
>      {"S_SSA"            , AV_CODEC_ID_ASS},
>      {"S_VOBSUB"         , AV_CODEC_ID_DVD_SUBTITLE},
> @@ -77,6 +73,11 @@ const CodecTags ff_mkv_codec_tags[]={
>      {"S_HDMV/PGS"       , AV_CODEC_ID_HDMV_PGS_SUBTITLE},
>      {"S_HDMV/TEXTST"    , AV_CODEC_ID_HDMV_TEXT_SUBTITLE},
>  
> +    {"D_WEBVTT/SUBTITLES"   , AV_CODEC_ID_WEBVTT},
> +    {"D_WEBVTT/CAPTIONS"    , AV_CODEC_ID_WEBVTT},
> +    {"D_WEBVTT/DESCRIPTIONS", AV_CODEC_ID_WEBVTT},
> +    {"D_WEBVTT/METADATA"    , AV_CODEC_ID_WEBVTT},
> +
>      {"V_AV1"            , AV_CODEC_ID_AV1},
>      {"V_DIRAC"          , AV_CODEC_ID_DIRAC},
>      {"V_FFV1"           , AV_CODEC_ID_FFV1},
> diff --git a/libavformat/matroskadec.c b/libavformat/matroskadec.c
> index cff7f0cb54..1e4947e209 100644
> --- a/libavformat/matroskadec.c
> +++ b/libavformat/matroskadec.c
> @@ -3305,62 +3305,81 @@ static int matroska_parse_webvtt(MatroskaDemuxContext *matroska,
>                                   uint8_t *data, int data_len,
>                                   uint64_t timecode,
>                                   uint64_t duration,
> -                                 int64_t pos)
> +                                 int64_t pos,
> +                                 uint8_t *additional, uint64_t additional_id, int additional_size)
>  {
>      AVPacket pktl, *pkt = &pktl;
> -    uint8_t *id, *settings, *text, *buf;
> -    int id_len, settings_len, text_len;
> +    uint8_t *id, *settings, *comment, *text, *buf;
> +    int id_len = 0, settings_len = 0, comment_len = 0, text_len;
>      uint8_t *p, *q;
>      int err;
> +    int webm_style = !strncmp(track->codec_id, "D_WEBVTT/", 9);
>  
>      if (data_len <= 0)
>          return AVERROR_INVALIDDATA;
>  
> -    p = data;
> -    q = data + data_len;
> -
> -    id = p;
> -    id_len = -1;
> -    while (p < q) {
> -        if (*p == '\r' || *p == '\n') {
> -            id_len = p - id;
> -            if (*p == '\r')
> -                p++;
> -            break;
> +    p = webm_style ? data : additional;
> +    q = webm_style ? (data + data_len) : (additional + additional_size);
> +

Unfortunately all pointer arithmetic involving NULL is undefined
behaviour in C, even NULL + 0. So the above is ub when not using
webm-style and when there is no BlockAddition.

> +    if (p) {
> +        id = p;
> +        id_len = -1;

This is unnecessary. Initializing to zero (as is already done above)
works just fine.

> +        while (p < q) {
> +            if (*p == '\r' || *p == '\n') {
> +                id_len = p - id;
> +                if (*p == '\r')
> +                    p++;
> +                break;
> +            }
> +            p++;
>          }
> +
> +        if (p >= q || *p != '\n')
> +            return AVERROR_INVALIDDATA;
>          p++;
> -    }
>  
> -    if (p >= q || *p != '\n')
> -        return AVERROR_INVALIDDATA;
> -    p++;
> -
> -    settings = p;
> -    settings_len = -1;
> -    while (p < q) {
> -        if (*p == '\r' || *p == '\n') {
> -            settings_len = p - settings;
> -            if (*p == '\r')
> -                p++;
> -            break;
> +        settings = p;
> +        settings_len = -1;
> +        while (p < q) {
> +            if (*p == '\r' || *p == '\n') {
> +                settings_len = p - settings;
> +                if (*p == '\r')
> +                    p++;
> +                break;
> +            }
> +            p++;
>          }
> +
> +        if (p >= q || *p != '\n')
> +            return AVERROR_INVALIDDATA;

You're repeating yourself.

>          p++;
> +
> +        if (!webm_style && p < q) {
> +            if (q[-1] != '\r' && q[-1] != '\n')

The current Matroska codec mapping indeed requires the terminating
WebVTT line terminator to be retained, yet mkvmerge doesn't do so, so we
should not be so picky with this.

> +                return AVERROR_INVALIDDATA;
> +
> +            comment = p;
> +            comment_len = q - p;
> +        }
>      }
>  
> -    if (p >= q || *p != '\n')
> -        return AVERROR_INVALIDDATA;
> -    p++;
> -
> -    text = p;
> -    text_len = q - p;
> -    while (text_len > 0) {
> -        const int len = text_len - 1;
> -        const uint8_t c = p[len];
> -        if (c != '\r' && c != '\n')
> -            break;
> -        text_len = len;
> +    if (webm_style) {
> +        text = p;
> +        text_len = q - p;
> +
> +        while (text_len > 0) {
> +            const int len = text_len - 1;
> +            const uint8_t c = p[len];
> +            if (c != '\r' && c != '\n')
> +                break;
> +            text_len = len;
> +        }
> +    } else {
> +        text = data;
> +        text_len = data_len;
>      }
>  
> +
>      if (text_len <= 0)
>          return AVERROR_INVALIDDATA;
>  
> @@ -3393,6 +3412,17 @@ static int matroska_parse_webvtt(MatroskaDemuxContext *matroska,
>          memcpy(buf, settings, settings_len);
>      }
>  
> +    if (comment_len > 0) {
> +        buf = av_packet_new_side_data(pkt,
> +                                      AV_PKT_DATA_WEBVTT_COMMENT,
> +                                      comment_len);
> +        if (!buf) {
> +            av_packet_unref(pkt);
> +            return AVERROR(ENOMEM);
> +        }
> +        memcpy(buf, comment, comment_len);
> +    }
> +
>      // Do we need this for subtitles?
>      // pkt->flags = AV_PKT_FLAG_KEY;
>  
> @@ -3656,7 +3686,8 @@ static int matroska_parse_block(MatroskaDemuxContext *matroska, AVBufferRef *buf
>              res = matroska_parse_webvtt(matroska, track, st,
>                                          out_data, out_size,
>                                          timecode, lace_duration,
> -                                        pos);
> +                                        pos,
> +                                        additional, additional_id, additional_size);
>              if (!buf)
>                  av_free(out_data);
>              if (res)
>
Nicolas George June 20, 2020, 11:11 a.m. UTC | #2
Andreas Rheinhardt (12020-06-20):
> Unfortunately all pointer arithmetic involving NULL is undefined
> behaviour in C, even NULL + 0.

I am pretty sure the UB is not there, and that NULL+0 is valid. Do you
have the reference?

Anyway, even if it is valid, it is not good style, I agree on that.

Regards,
Andreas Rheinhardt June 20, 2020, 1:04 p.m. UTC | #3
Nicolas George:
> Andreas Rheinhardt (12020-06-20):
>> Unfortunately all pointer arithmetic involving NULL is undefined
>> behaviour in C, even NULL + 0.
> 
> I am pretty sure the UB is not there, and that NULL+0 is valid. Do you
> have the reference?
> 
C99, 6.5.6 (the part about additive operators) says that pointer
arithmetic is only defined in two cases: First, if it involves a pointer
to an array (an object that is not an element of an array is treated as
an array with exactly one element for the purposes of pointer
arithmetic) and an offset so small so that it contains an element offset
places away from the pointer. And secondly, if the pointer points to the
last element of an array and the offset is 1.
(For C90, this stuff is contained in 6.3.6.)

A nullpointer does not point to any object, hence it is undefined.

Notice that UBSan in Clang 10 started warning about this (see ticket
8592 or see
https://github.com/llvm/llvm-project/commit/536b0ee40ab97f2878dc124a321cf9108ee3d233).

> Anyway, even if it is valid, it is not good style, I agree on that.

While it would be very bad style to write NULL + 0 or something like
that when it is known at compile-time that this is the only case that
will happen, I don't consider an expression that might lead to NULL + 0
to be bad style. I actually think that this is one of the insanities of
the C standard (C++ btw explicitly allows nullptr + 0).

- Andreas
diff mbox series

Patch

diff --git a/libavformat/matroska.c b/libavformat/matroska.c
index 7c56aba403..962fa496f4 100644
--- a/libavformat/matroska.c
+++ b/libavformat/matroska.c
@@ -60,16 +60,12 @@  const CodecTags ff_mkv_codec_tags[]={
     {"A_VORBIS"         , AV_CODEC_ID_VORBIS},
     {"A_WAVPACK4"       , AV_CODEC_ID_WAVPACK},
 
-    {"D_WEBVTT/SUBTITLES"   , AV_CODEC_ID_WEBVTT},
-    {"D_WEBVTT/CAPTIONS"    , AV_CODEC_ID_WEBVTT},
-    {"D_WEBVTT/DESCRIPTIONS", AV_CODEC_ID_WEBVTT},
-    {"D_WEBVTT/METADATA"    , AV_CODEC_ID_WEBVTT},
-
     {"S_TEXT/UTF8"      , AV_CODEC_ID_SUBRIP},
     {"S_TEXT/UTF8"      , AV_CODEC_ID_TEXT},
     {"S_TEXT/ASCII"     , AV_CODEC_ID_TEXT},
     {"S_TEXT/ASS"       , AV_CODEC_ID_ASS},
     {"S_TEXT/SSA"       , AV_CODEC_ID_ASS},
+    {"S_TEXT/WEBVTT"    , AV_CODEC_ID_WEBVTT},
     {"S_ASS"            , AV_CODEC_ID_ASS},
     {"S_SSA"            , AV_CODEC_ID_ASS},
     {"S_VOBSUB"         , AV_CODEC_ID_DVD_SUBTITLE},
@@ -77,6 +73,11 @@  const CodecTags ff_mkv_codec_tags[]={
     {"S_HDMV/PGS"       , AV_CODEC_ID_HDMV_PGS_SUBTITLE},
     {"S_HDMV/TEXTST"    , AV_CODEC_ID_HDMV_TEXT_SUBTITLE},
 
+    {"D_WEBVTT/SUBTITLES"   , AV_CODEC_ID_WEBVTT},
+    {"D_WEBVTT/CAPTIONS"    , AV_CODEC_ID_WEBVTT},
+    {"D_WEBVTT/DESCRIPTIONS", AV_CODEC_ID_WEBVTT},
+    {"D_WEBVTT/METADATA"    , AV_CODEC_ID_WEBVTT},
+
     {"V_AV1"            , AV_CODEC_ID_AV1},
     {"V_DIRAC"          , AV_CODEC_ID_DIRAC},
     {"V_FFV1"           , AV_CODEC_ID_FFV1},
diff --git a/libavformat/matroskadec.c b/libavformat/matroskadec.c
index cff7f0cb54..1e4947e209 100644
--- a/libavformat/matroskadec.c
+++ b/libavformat/matroskadec.c
@@ -3305,62 +3305,81 @@  static int matroska_parse_webvtt(MatroskaDemuxContext *matroska,
                                  uint8_t *data, int data_len,
                                  uint64_t timecode,
                                  uint64_t duration,
-                                 int64_t pos)
+                                 int64_t pos,
+                                 uint8_t *additional, uint64_t additional_id, int additional_size)
 {
     AVPacket pktl, *pkt = &pktl;
-    uint8_t *id, *settings, *text, *buf;
-    int id_len, settings_len, text_len;
+    uint8_t *id, *settings, *comment, *text, *buf;
+    int id_len = 0, settings_len = 0, comment_len = 0, text_len;
     uint8_t *p, *q;
     int err;
+    int webm_style = !strncmp(track->codec_id, "D_WEBVTT/", 9);
 
     if (data_len <= 0)
         return AVERROR_INVALIDDATA;
 
-    p = data;
-    q = data + data_len;
-
-    id = p;
-    id_len = -1;
-    while (p < q) {
-        if (*p == '\r' || *p == '\n') {
-            id_len = p - id;
-            if (*p == '\r')
-                p++;
-            break;
+    p = webm_style ? data : additional;
+    q = webm_style ? (data + data_len) : (additional + additional_size);
+
+    if (p) {
+        id = p;
+        id_len = -1;
+        while (p < q) {
+            if (*p == '\r' || *p == '\n') {
+                id_len = p - id;
+                if (*p == '\r')
+                    p++;
+                break;
+            }
+            p++;
         }
+
+        if (p >= q || *p != '\n')
+            return AVERROR_INVALIDDATA;
         p++;
-    }
 
-    if (p >= q || *p != '\n')
-        return AVERROR_INVALIDDATA;
-    p++;
-
-    settings = p;
-    settings_len = -1;
-    while (p < q) {
-        if (*p == '\r' || *p == '\n') {
-            settings_len = p - settings;
-            if (*p == '\r')
-                p++;
-            break;
+        settings = p;
+        settings_len = -1;
+        while (p < q) {
+            if (*p == '\r' || *p == '\n') {
+                settings_len = p - settings;
+                if (*p == '\r')
+                    p++;
+                break;
+            }
+            p++;
         }
+
+        if (p >= q || *p != '\n')
+            return AVERROR_INVALIDDATA;
         p++;
+
+        if (!webm_style && p < q) {
+            if (q[-1] != '\r' && q[-1] != '\n')
+                return AVERROR_INVALIDDATA;
+
+            comment = p;
+            comment_len = q - p;
+        }
     }
 
-    if (p >= q || *p != '\n')
-        return AVERROR_INVALIDDATA;
-    p++;
-
-    text = p;
-    text_len = q - p;
-    while (text_len > 0) {
-        const int len = text_len - 1;
-        const uint8_t c = p[len];
-        if (c != '\r' && c != '\n')
-            break;
-        text_len = len;
+    if (webm_style) {
+        text = p;
+        text_len = q - p;
+
+        while (text_len > 0) {
+            const int len = text_len - 1;
+            const uint8_t c = p[len];
+            if (c != '\r' && c != '\n')
+                break;
+            text_len = len;
+        }
+    } else {
+        text = data;
+        text_len = data_len;
     }
 
+
     if (text_len <= 0)
         return AVERROR_INVALIDDATA;
 
@@ -3393,6 +3412,17 @@  static int matroska_parse_webvtt(MatroskaDemuxContext *matroska,
         memcpy(buf, settings, settings_len);
     }
 
+    if (comment_len > 0) {
+        buf = av_packet_new_side_data(pkt,
+                                      AV_PKT_DATA_WEBVTT_COMMENT,
+                                      comment_len);
+        if (!buf) {
+            av_packet_unref(pkt);
+            return AVERROR(ENOMEM);
+        }
+        memcpy(buf, comment, comment_len);
+    }
+
     // Do we need this for subtitles?
     // pkt->flags = AV_PKT_FLAG_KEY;
 
@@ -3656,7 +3686,8 @@  static int matroska_parse_block(MatroskaDemuxContext *matroska, AVBufferRef *buf
             res = matroska_parse_webvtt(matroska, track, st,
                                         out_data, out_size,
                                         timecode, lace_duration,
-                                        pos);
+                                        pos,
+                                        additional, additional_id, additional_size);
             if (!buf)
                 av_free(out_data);
             if (res)