diff mbox

[FFmpeg-devel,08/21] avformat/matroskadec: Improve error messages

Message ID 20190327111852.3784-9-andreas.rheinhardt@googlemail.com
State New
Headers show

Commit Message

Diego Felix de Souza via ffmpeg-devel March 27, 2019, 11:18 a.m. UTC
ebml_read_num had a number of flaws:

1. The check for read errors/EOF was totally wrong. E.g. an EBML number
beginning with the invalid 0x00 would be considered a read error,
although it is just invalid data.
2. The check for read errors/EOF was done just once, after reading the
first byte of the EBML number. But errors/EOF can happen inbetween, of
course, and this wasn't checked.
3. There was no way to distinguish when EOF should be an error (because
the data has to be there) for which an error message should be emitted
and when it is not necessarily an error (namely during parsing of EBML
IDs). Such a possibility has been added and used.

Some useless initializations were also fixed.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@googlemail.com>
---
 libavformat/matroskadec.c | 61 ++++++++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 27 deletions(-)

Comments

Steve Lhomme April 7, 2019, 6:53 a.m. UTC | #1
On 3/27/2019 12:18 PM, Andreas Rheinhardt via ffmpeg-devel wrote:
> ebml_read_num had a number of flaws:
>
> 1. The check for read errors/EOF was totally wrong. E.g. an EBML number
> beginning with the invalid 0x00 would be considered a read error,
> although it is just invalid data.
> 2. The check for read errors/EOF was done just once, after reading the
> first byte of the EBML number. But errors/EOF can happen inbetween, of
> course, and this wasn't checked.
> 3. There was no way to distinguish when EOF should be an error (because
> the data has to be there) for which an error message should be emitted
> and when it is not necessarily an error (namely during parsing of EBML
> IDs). Such a possibility has been added and used.

Maybe the title of the patch should rather mention that it's fixing the 
EOF handling when reading EBML ID/Length. The changed error messages is 
a less important consequence.


>
> Some useless initializations were also fixed.
>
> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@googlemail.com>
> ---
>   libavformat/matroskadec.c | 61 ++++++++++++++++++++++-----------------
>   1 file changed, 34 insertions(+), 27 deletions(-)
>
> diff --git a/libavformat/matroskadec.c b/libavformat/matroskadec.c
> index a6617a607b..aa2266384a 100644
> --- a/libavformat/matroskadec.c
> +++ b/libavformat/matroskadec.c
> @@ -820,29 +820,19 @@ static int ebml_level_end(MatroskaDemuxContext *matroska)
>    * Returns: number of bytes read, < 0 on error
>    */
>   static int ebml_read_num(MatroskaDemuxContext *matroska, AVIOContext *pb,
> -                         int max_size, uint64_t *number)
> +                         int max_size, uint64_t *number, int eof_forbidden)
>   {
> -    int read = 1, n = 1;
> -    uint64_t total = 0;
> +    int read, n = 1;
> +    uint64_t total;
> +    int64_t pos;
>   
> -    /* The first byte tells us the length in bytes - avio_r8() can normally
> -     * return 0, but since that's not a valid first ebmlID byte, we can
> -     * use it safely here to catch EOS. */
> -    if (!(total = avio_r8(pb))) {
> -        /* we might encounter EOS here */
> -        if (!avio_feof(pb)) {
> -            int64_t pos = avio_tell(pb);
> -            av_log(matroska->ctx, AV_LOG_ERROR,
> -                   "Read error at pos. %"PRIu64" (0x%"PRIx64")\n",
> -                   pos, pos);
> -            return pb->error ? pb->error : AVERROR(EIO);
> -        }
> -        return AVERROR_EOF;
> -    }
> +    /* The first byte tells us the length in bytes - except when it is zero. */
> +    total = avio_r8(pb);
> +    if (avio_feof(pb))
> +        goto err;
>   
>       /* get the length of the EBML number */
> -    read = 8 - ff_log2_tab[total];
> -    if (read > max_size) {
> +    if (!total || (read = 8 - ff_log2_tab[total]) > max_size) {
>           int64_t pos = avio_tell(pb) - 1;
>           av_log(matroska->ctx, AV_LOG_ERROR,
>                  "Invalid EBML number size tag 0x%02x at pos %"PRIu64" (0x%"PRIx64")\n",
> @@ -855,9 +845,27 @@ static int ebml_read_num(MatroskaDemuxContext *matroska, AVIOContext *pb,
>       while (n++ < read)
>           total = (total << 8) | avio_r8(pb);
>   
> +    if (avio_feof(pb)) {
> +        eof_forbidden = 1;
> +        goto err;
> +    }

You're forcing an error if the data ends after reading a number ? Ending 
a Matroska file with a number should be fine. It could also be an 
element with a size of 0. It doesn't contain any data but it's still 
valid (depending on the semantic of the element).

So this forced error seem wrong. Let the next read catch the EOF if it 
finds one.

> +
>       *number = total;
>   
>       return read;
> +
> +err:
> +    pos = avio_tell(pb);
> +    if (pb->error) {
> +        av_log(matroska->ctx, AV_LOG_ERROR,
> +               "Read error at pos. %"PRIu64" (0x%"PRIx64")\n",
> +               pos, pos);
> +        return pb->error;
> +    }
> +    if (eof_forbidden)
> +        av_log(matroska->ctx, AV_LOG_ERROR, "File ended prematurely "
> +               "at pos. %"PRIu64" (0x%"PRIx64")\n", pos, pos);

If eof_forbidden is false (EOF allowed) you return an EOF error anyway ?

> +    return AVERROR_EOF;
>   }
>   
>   /**
> @@ -868,7 +876,7 @@ static int ebml_read_num(MatroskaDemuxContext *matroska, AVIOContext *pb,
>   static int ebml_read_length(MatroskaDemuxContext *matroska, AVIOContext *pb,
>                               uint64_t *number)
>   {
> -    int res = ebml_read_num(matroska, pb, 8, number);
> +    int res = ebml_read_num(matroska, pb, 8, number, 1);
>       if (res > 0 && *number + 1 == 1ULL << (7 * res))
>           *number = EBML_UNKNOWN_LENGTH;
>       return res;
> @@ -1010,7 +1018,7 @@ static int matroska_ebmlnum_uint(MatroskaDemuxContext *matroska,
>   {
>       AVIOContext pb;
>       ffio_init_context(&pb, data, size, 0, NULL, NULL, NULL, NULL);
> -    return ebml_read_num(matroska, &pb, FFMIN(size, 8), num);
> +    return ebml_read_num(matroska, &pb, FFMIN(size, 8), num, 1);
>   }
>   
>   /*
> @@ -1057,7 +1065,7 @@ static int ebml_parse(MatroskaDemuxContext *matroska, EbmlSyntax *syntax,
>   {
>       if (!matroska->current_id) {
>           uint64_t id;
> -        int res = ebml_read_num(matroska, matroska->ctx->pb, 4, &id);
> +        int res = ebml_read_num(matroska, matroska->ctx->pb, 4, &id, 0);
>           if (res < 0) {
>               // in live mode, finish parsing if EOF is reached.
>               return (matroska->is_live && matroska->ctx->pb->eof_reached &&
> @@ -3335,10 +3343,9 @@ static int matroska_parse_block(MatroskaDemuxContext *matroska, AVBufferRef *buf
>       uint64_t num;
>       int trust_default_duration = 1;
>   
> -    if ((n = matroska_ebmlnum_uint(matroska, data, size, &num)) < 0) {
> -        av_log(matroska->ctx, AV_LOG_ERROR, "EBML block data error\n");
> +    if ((n = matroska_ebmlnum_uint(matroska, data, size, &num)) < 0)
>           return n;
> -    }
> +
>       data += n;
>       size -= n;
>   
> @@ -3699,7 +3706,7 @@ static int webm_clusters_start_with_keyframe(AVFormatContext *s)
>           AVPacket *pkt;
>           avio_seek(s->pb, cluster_pos, SEEK_SET);
>           // read cluster id and length
> -        read = ebml_read_num(matroska, matroska->ctx->pb, 4, &cluster_id);
> +        read = ebml_read_num(matroska, matroska->ctx->pb, 4, &cluster_id, 1);
>           if (read < 0 || cluster_id != 0xF43B675) // done with all clusters
>               break;
>           read = ebml_read_length(matroska, matroska->ctx->pb, &cluster_length);
> @@ -3916,7 +3923,7 @@ static int webm_dash_manifest_cues(AVFormatContext *s, int64_t init_range)
>           // Cues element ID + EBML length of the Cues element. cues_end is
>           // inclusive and the above sum is reduced by 1.
>           uint64_t cues_length = 0, cues_id = 0, bytes_read = 0;
> -        bytes_read += ebml_read_num(matroska, matroska->ctx->pb, 4, &cues_id);
> +        bytes_read += ebml_read_num(matroska, matroska->ctx->pb, 4, &cues_id, 1);

+= on a call that may return an error doesn't seem correct. The error 
should be checked before using the value.

>           bytes_read += ebml_read_length(matroska, matroska->ctx->pb, &cues_length);
>           cues_end = cues_start + cues_length + bytes_read - 1;
>       }
> -- 
> 2.19.2
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Diego Felix de Souza via ffmpeg-devel April 7, 2019, 9:24 a.m. UTC | #2
Steve Lhomme:
> On 3/27/2019 12:18 PM, Andreas Rheinhardt via ffmpeg-devel wrote:
>> ebml_read_num had a number of flaws:
>>
>> 1. The check for read errors/EOF was totally wrong. E.g. an EBML number
>> beginning with the invalid 0x00 would be considered a read error,
>> although it is just invalid data.
>> 2. The check for read errors/EOF was done just once, after reading the
>> first byte of the EBML number. But errors/EOF can happen inbetween, of
>> course, and this wasn't checked.
>> 3. There was no way to distinguish when EOF should be an error (because
>> the data has to be there) for which an error message should be emitted
>> and when it is not necessarily an error (namely during parsing of EBML
>> IDs). Such a possibility has been added and used.
> 
> Maybe the title of the patch should rather mention that it's fixing
> the EOF handling when reading EBML ID/Length. The changed error
> messages is a less important consequence.
> 
How about "avformat/matroskadec: Improve (in particular) EOF checks"?
> 
>>
>> Some useless initializations were also fixed.
>>
>> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@googlemail.com>
>> ---
>>   libavformat/matroskadec.c | 61
>> ++++++++++++++++++++++-----------------
>>   1 file changed, 34 insertions(+), 27 deletions(-)
>>
>> diff --git a/libavformat/matroskadec.c b/libavformat/matroskadec.c
>> index a6617a607b..aa2266384a 100644
>> --- a/libavformat/matroskadec.c
>> +++ b/libavformat/matroskadec.c
>> @@ -820,29 +820,19 @@ static int ebml_level_end(MatroskaDemuxContext
>> *matroska)
>>    * Returns: number of bytes read, < 0 on error
>>    */
>>   static int ebml_read_num(MatroskaDemuxContext *matroska,
>> AVIOContext *pb,
>> -                         int max_size, uint64_t *number)
>> +                         int max_size, uint64_t *number, int
>> eof_forbidden)
>>   {
>> -    int read = 1, n = 1;
>> -    uint64_t total = 0;
>> +    int read, n = 1;
>> +    uint64_t total;
>> +    int64_t pos;
>>   -    /* The first byte tells us the length in bytes - avio_r8()
>> can normally
>> -     * return 0, but since that's not a valid first ebmlID byte, we
>> can
>> -     * use it safely here to catch EOS. */
>> -    if (!(total = avio_r8(pb))) {
>> -        /* we might encounter EOS here */
>> -        if (!avio_feof(pb)) {
>> -            int64_t pos = avio_tell(pb);
>> -            av_log(matroska->ctx, AV_LOG_ERROR,
>> -                   "Read error at pos. %"PRIu64" (0x%"PRIx64")\n",
>> -                   pos, pos);
>> -            return pb->error ? pb->error : AVERROR(EIO);
>> -        }
>> -        return AVERROR_EOF;
>> -    }
>> +    /* The first byte tells us the length in bytes - except when it
>> is zero. */
>> +    total = avio_r8(pb);
>> +    if (avio_feof(pb))
>> +        goto err;
>>         /* get the length of the EBML number */
>> -    read = 8 - ff_log2_tab[total];
>> -    if (read > max_size) {
>> +    if (!total || (read = 8 - ff_log2_tab[total]) > max_size) {
>>           int64_t pos = avio_tell(pb) - 1;
>>           av_log(matroska->ctx, AV_LOG_ERROR,
>>                  "Invalid EBML number size tag 0x%02x at pos
>> %"PRIu64" (0x%"PRIx64")\n",
>> @@ -855,9 +845,27 @@ static int ebml_read_num(MatroskaDemuxContext
>> *matroska, AVIOContext *pb,
>>       while (n++ < read)
>>           total = (total << 8) | avio_r8(pb);
>>   +    if (avio_feof(pb)) {
>> +        eof_forbidden = 1;
>> +        goto err;
>> +    }
> 
> You're forcing an error if the data ends after reading a number ?
> Ending a Matroska file with a number should be fine. It could also be
> an element with a size of 0. It doesn't contain any data but it's
> still valid (depending on the semantic of the element).
> 
> So this forced error seem wrong. Let the next read catch the EOF if it
> finds one.
> 
avio_feof doesn't indicate an error if we reach EOF, only if we
attempt to read past EOF (or if another error occurred). If the EBML
number we intend to parse is completely read, then no error is
indicated here at all. If the input ends immediately after this EBML
number, then the next read will trigger EOF and will have to catch it.

If avio_feof is set at this point, it means that the first byte of the
EBML number says that the EBML number is total bytes long, but an
error or EOF occured during reading of these bytes. I think this
warrants an error. eof_forbidden is set at this point so that an
incomplete EBML number will always trigger an error.
>> +
>>       *number = total;
>>         return read;
>> +
>> +err:
>> +    pos = avio_tell(pb);
>> +    if (pb->error) {
>> +        av_log(matroska->ctx, AV_LOG_ERROR,
>> +               "Read error at pos. %"PRIu64" (0x%"PRIx64")\n",
>> +               pos, pos);
>> +        return pb->error;
>> +    }
>> +    if (eof_forbidden)
>> +        av_log(matroska->ctx, AV_LOG_ERROR, "File ended prematurely "
>> +               "at pos. %"PRIu64" (0x%"PRIx64")\n", pos, pos);
> 
> If eof_forbidden is false (EOF allowed) you return an EOF error anyway ?
> 
Yes, so that the caller knows that the file has ended and can act
accordingly. Currently only the parsing of EBML IDs in ebml_parse
allow EOF and if we hit EOF while parsing an EBML ID, ebml_parse does
a more thorough check for whether this is a real error or something
legal (of course, based upon whether we the current level is
unknown-sized or not).
>> +    return AVERROR_EOF;
>>   }
>>     /**
>> @@ -868,7 +876,7 @@ static int ebml_read_num(MatroskaDemuxContext
>> *matroska, AVIOContext *pb,
>>   static int ebml_read_length(MatroskaDemuxContext *matroska,
>> AVIOContext *pb,
>>                               uint64_t *number)
>>   {
>> -    int res = ebml_read_num(matroska, pb, 8, number);
>> +    int res = ebml_read_num(matroska, pb, 8, number, 1);
>>       if (res > 0 && *number + 1 == 1ULL << (7 * res))
>>           *number = EBML_UNKNOWN_LENGTH;
>>       return res;
>> @@ -1010,7 +1018,7 @@ static int
>> matroska_ebmlnum_uint(MatroskaDemuxContext *matroska,
>>   {
>>       AVIOContext pb;
>>       ffio_init_context(&pb, data, size, 0, NULL, NULL, NULL, NULL);
>> -    return ebml_read_num(matroska, &pb, FFMIN(size, 8), num);
>> +    return ebml_read_num(matroska, &pb, FFMIN(size, 8), num, 1);
>>   }
>>     /*
>> @@ -1057,7 +1065,7 @@ static int ebml_parse(MatroskaDemuxContext
>> *matroska, EbmlSyntax *syntax,
>>   {
>>       if (!matroska->current_id) {
>>           uint64_t id;
>> -        int res = ebml_read_num(matroska, matroska->ctx->pb, 4, &id);
>> +        int res = ebml_read_num(matroska, matroska->ctx->pb, 4,
>> &id, 0);
>>           if (res < 0) {
>>               // in live mode, finish parsing if EOF is reached.
>>               return (matroska->is_live &&
>> matroska->ctx->pb->eof_reached &&
>> @@ -3335,10 +3343,9 @@ static int
>> matroska_parse_block(MatroskaDemuxContext *matroska, AVBufferRef *buf
>>       uint64_t num;
>>       int trust_default_duration = 1;
>>   -    if ((n = matroska_ebmlnum_uint(matroska, data, size, &num)) <
>> 0) {
>> -        av_log(matroska->ctx, AV_LOG_ERROR, "EBML block data
>> error\n");
>> +    if ((n = matroska_ebmlnum_uint(matroska, data, size, &num)) < 0)
>>           return n;
>> -    }
>> +
>>       data += n;
>>       size -= n;
>>   @@ -3699,7 +3706,7 @@ static int
>> webm_clusters_start_with_keyframe(AVFormatContext *s)
>>           AVPacket *pkt;
>>           avio_seek(s->pb, cluster_pos, SEEK_SET);
>>           // read cluster id and length
>> -        read = ebml_read_num(matroska, matroska->ctx->pb, 4,
>> &cluster_id);
>> +        read = ebml_read_num(matroska, matroska->ctx->pb, 4,
>> &cluster_id, 1);
>>           if (read < 0 || cluster_id != 0xF43B675) // done with all
>> clusters
>>               break;
>>           read = ebml_read_length(matroska, matroska->ctx->pb,
>> &cluster_length);
>> @@ -3916,7 +3923,7 @@ static int
>> webm_dash_manifest_cues(AVFormatContext *s, int64_t init_range)
>>           // Cues element ID + EBML length of the Cues element.
>> cues_end is
>>           // inclusive and the above sum is reduced by 1.
>>           uint64_t cues_length = 0, cues_id = 0, bytes_read = 0;
>> -        bytes_read += ebml_read_num(matroska, matroska->ctx->pb, 4,
>> &cues_id);
>> +        bytes_read += ebml_read_num(matroska, matroska->ctx->pb, 4,
>> &cues_id, 1);
> 
> += on a call that may return an error doesn't seem correct. The error
> should be checked before using the value.
> 
Ok, I'll fix this (pre-existing) error.
>>           bytes_read += ebml_read_length(matroska,
>> matroska->ctx->pb, &cues_length);
>>           cues_end = cues_start + cues_length + bytes_read - 1;
>>       }
>> -- 
>> 2.19.2
Steve Lhomme April 8, 2019, 6:44 a.m. UTC | #3
> On April 7, 2019 at 11:24 AM Andreas Rheinhardt via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> wrote:
> 
> 
> Steve Lhomme:
> > On 3/27/2019 12:18 PM, Andreas Rheinhardt via ffmpeg-devel wrote:
> >> ebml_read_num had a number of flaws:
> >>
> >> 1. The check for read errors/EOF was totally wrong. E.g. an EBML number
> >> beginning with the invalid 0x00 would be considered a read error,
> >> although it is just invalid data.
> >> 2. The check for read errors/EOF was done just once, after reading the
> >> first byte of the EBML number. But errors/EOF can happen inbetween, of
> >> course, and this wasn't checked.
> >> 3. There was no way to distinguish when EOF should be an error (because
> >> the data has to be there) for which an error message should be emitted
> >> and when it is not necessarily an error (namely during parsing of EBML
> >> IDs). Such a possibility has been added and used.
> > 
> > Maybe the title of the patch should rather mention that it's fixing
> > the EOF handling when reading EBML ID/Length. The changed error
> > messages is a less important consequence.
> > 
> How about "avformat/matroskadec: Improve (in particular) EOF checks"?
> > 
> >>
> >> Some useless initializations were also fixed.
> >>
> >> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@googlemail.com>
> >> ---
> >>   libavformat/matroskadec.c | 61
> >> ++++++++++++++++++++++-----------------
> >>   1 file changed, 34 insertions(+), 27 deletions(-)
> >>
> >> diff --git a/libavformat/matroskadec.c b/libavformat/matroskadec.c
> >> index a6617a607b..aa2266384a 100644
> >> --- a/libavformat/matroskadec.c
> >> +++ b/libavformat/matroskadec.c
> >> @@ -820,29 +820,19 @@ static int ebml_level_end(MatroskaDemuxContext
> >> *matroska)
> >>    * Returns: number of bytes read, < 0 on error
> >>    */
> >>   static int ebml_read_num(MatroskaDemuxContext *matroska,
> >> AVIOContext *pb,
> >> -                         int max_size, uint64_t *number)
> >> +                         int max_size, uint64_t *number, int
> >> eof_forbidden)
> >>   {
> >> -    int read = 1, n = 1;
> >> -    uint64_t total = 0;
> >> +    int read, n = 1;
> >> +    uint64_t total;
> >> +    int64_t pos;
> >>   -    /* The first byte tells us the length in bytes - avio_r8()
> >> can normally
> >> -     * return 0, but since that's not a valid first ebmlID byte, we
> >> can
> >> -     * use it safely here to catch EOS. */
> >> -    if (!(total = avio_r8(pb))) {
> >> -        /* we might encounter EOS here */
> >> -        if (!avio_feof(pb)) {
> >> -            int64_t pos = avio_tell(pb);
> >> -            av_log(matroska->ctx, AV_LOG_ERROR,
> >> -                   "Read error at pos. %"PRIu64" (0x%"PRIx64")\n",
> >> -                   pos, pos);
> >> -            return pb->error ? pb->error : AVERROR(EIO);
> >> -        }
> >> -        return AVERROR_EOF;
> >> -    }
> >> +    /* The first byte tells us the length in bytes - except when it
> >> is zero. */
> >> +    total = avio_r8(pb);
> >> +    if (avio_feof(pb))
> >> +        goto err;
> >>         /* get the length of the EBML number */
> >> -    read = 8 - ff_log2_tab[total];
> >> -    if (read > max_size) {
> >> +    if (!total || (read = 8 - ff_log2_tab[total]) > max_size) {
> >>           int64_t pos = avio_tell(pb) - 1;
> >>           av_log(matroska->ctx, AV_LOG_ERROR,
> >>                  "Invalid EBML number size tag 0x%02x at pos
> >> %"PRIu64" (0x%"PRIx64")\n",
> >> @@ -855,9 +845,27 @@ static int ebml_read_num(MatroskaDemuxContext
> >> *matroska, AVIOContext *pb,
> >>       while (n++ < read)
> >>           total = (total << 8) | avio_r8(pb);
> >>   +    if (avio_feof(pb)) {
> >> +        eof_forbidden = 1;
> >> +        goto err;
> >> +    }
> > 
> > You're forcing an error if the data ends after reading a number ?
> > Ending a Matroska file with a number should be fine. It could also be
> > an element with a size of 0. It doesn't contain any data but it's
> > still valid (depending on the semantic of the element).
> > 
> > So this forced error seem wrong. Let the next read catch the EOF if it
> > finds one.
> > 
> avio_feof doesn't indicate an error if we reach EOF, only if we
> attempt to read past EOF (or if another error occurred). If the EBML
> number we intend to parse is completely read, then no error is
> indicated here at all. If the input ends immediately after this EBML
> number, then the next read will trigger EOF and will have to catch it.

OK, it makes sense.

> If avio_feof is set at this point, it means that the first byte of the
> EBML number says that the EBML number is total bytes long, but an
> error or EOF occured during reading of these bytes. I think this
> warrants an error. eof_forbidden is set at this point so that an
> incomplete EBML number will always trigger an error.

OK

> >> +
> >>       *number = total;
> >>         return read;
> >> +
> >> +err:
> >> +    pos = avio_tell(pb);
> >> +    if (pb->error) {
> >> +        av_log(matroska->ctx, AV_LOG_ERROR,
> >> +               "Read error at pos. %"PRIu64" (0x%"PRIx64")\n",
> >> +               pos, pos);
> >> +        return pb->error;
> >> +    }
> >> +    if (eof_forbidden)
> >> +        av_log(matroska->ctx, AV_LOG_ERROR, "File ended prematurely "
> >> +               "at pos. %"PRIu64" (0x%"PRIx64")\n", pos, pos);
> > 
> > If eof_forbidden is false (EOF allowed) you return an EOF error anyway ?
> > 
> Yes, so that the caller knows that the file has ended and can act
> accordingly. Currently only the parsing of EBML IDs in ebml_parse
> allow EOF and if we hit EOF while parsing an EBML ID, ebml_parse does
> a more thorough check for whether this is a real error or something
> legal (of course, based upon whether we the current level is
> unknown-sized or not).

Then I don't understand what eof_forbidden is for. It seems that EOF should be treated no matter what.

> >> +    return AVERROR_EOF;
> >>   }
> >>     /**
> >> @@ -868,7 +876,7 @@ static int ebml_read_num(MatroskaDemuxContext
> >> *matroska, AVIOContext *pb,
> >>   static int ebml_read_length(MatroskaDemuxContext *matroska,
> >> AVIOContext *pb,
> >>                               uint64_t *number)
> >>   {
> >> -    int res = ebml_read_num(matroska, pb, 8, number);
> >> +    int res = ebml_read_num(matroska, pb, 8, number, 1);
> >>       if (res > 0 && *number + 1 == 1ULL << (7 * res))
> >>           *number = EBML_UNKNOWN_LENGTH;
> >>       return res;
> >> @@ -1010,7 +1018,7 @@ static int
> >> matroska_ebmlnum_uint(MatroskaDemuxContext *matroska,
> >>   {
> >>       AVIOContext pb;
> >>       ffio_init_context(&pb, data, size, 0, NULL, NULL, NULL, NULL);
> >> -    return ebml_read_num(matroska, &pb, FFMIN(size, 8), num);
> >> +    return ebml_read_num(matroska, &pb, FFMIN(size, 8), num, 1);
> >>   }
> >>     /*
> >> @@ -1057,7 +1065,7 @@ static int ebml_parse(MatroskaDemuxContext
> >> *matroska, EbmlSyntax *syntax,
> >>   {
> >>       if (!matroska->current_id) {
> >>           uint64_t id;
> >> -        int res = ebml_read_num(matroska, matroska->ctx->pb, 4, &id);
> >> +        int res = ebml_read_num(matroska, matroska->ctx->pb, 4,
> >> &id, 0);
> >>           if (res < 0) {
> >>               // in live mode, finish parsing if EOF is reached.
> >>               return (matroska->is_live &&
> >> matroska->ctx->pb->eof_reached &&
> >> @@ -3335,10 +3343,9 @@ static int
> >> matroska_parse_block(MatroskaDemuxContext *matroska, AVBufferRef *buf
> >>       uint64_t num;
> >>       int trust_default_duration = 1;
> >>   -    if ((n = matroska_ebmlnum_uint(matroska, data, size, &num)) <
> >> 0) {
> >> -        av_log(matroska->ctx, AV_LOG_ERROR, "EBML block data
> >> error\n");
> >> +    if ((n = matroska_ebmlnum_uint(matroska, data, size, &num)) < 0)
> >>           return n;
> >> -    }
> >> +
> >>       data += n;
> >>       size -= n;
> >>   @@ -3699,7 +3706,7 @@ static int
> >> webm_clusters_start_with_keyframe(AVFormatContext *s)
> >>           AVPacket *pkt;
> >>           avio_seek(s->pb, cluster_pos, SEEK_SET);
> >>           // read cluster id and length
> >> -        read = ebml_read_num(matroska, matroska->ctx->pb, 4,
> >> &cluster_id);
> >> +        read = ebml_read_num(matroska, matroska->ctx->pb, 4,
> >> &cluster_id, 1);
> >>           if (read < 0 || cluster_id != 0xF43B675) // done with all
> >> clusters
> >>               break;
> >>           read = ebml_read_length(matroska, matroska->ctx->pb,
> >> &cluster_length);
> >> @@ -3916,7 +3923,7 @@ static int
> >> webm_dash_manifest_cues(AVFormatContext *s, int64_t init_range)
> >>           // Cues element ID + EBML length of the Cues element.
> >> cues_end is
> >>           // inclusive and the above sum is reduced by 1.
> >>           uint64_t cues_length = 0, cues_id = 0, bytes_read = 0;
> >> -        bytes_read += ebml_read_num(matroska, matroska->ctx->pb, 4,
> >> &cues_id);
> >> +        bytes_read += ebml_read_num(matroska, matroska->ctx->pb, 4,
> >> &cues_id, 1);
> > 
> > += on a call that may return an error doesn't seem correct. The error
> > should be checked before using the value.
> > 
> Ok, I'll fix this (pre-existing) error.
> >>           bytes_read += ebml_read_length(matroska,
> >> matroska->ctx->pb, &cues_length);
> >>           cues_end = cues_start + cues_length + bytes_read - 1;
> >>       }
> >> -- 
> >> 2.19.2
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Diego Felix de Souza via ffmpeg-devel April 8, 2019, 9:53 a.m. UTC | #4
Steve Lhomme:
>> On April 7, 2019 at 11:24 AM Andreas Rheinhardt via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> wrote:
>>
>>
>> Steve Lhomme:
>>> On 3/27/2019 12:18 PM, Andreas Rheinhardt via ffmpeg-devel wrote:
>>>> ebml_read_num had a number of flaws:
>>>>
>>>> 1. The check for read errors/EOF was totally wrong. E.g. an EBML number
>>>> beginning with the invalid 0x00 would be considered a read error,
>>>> although it is just invalid data.
>>>> 2. The check for read errors/EOF was done just once, after reading the
>>>> first byte of the EBML number. But errors/EOF can happen inbetween, of
>>>> course, and this wasn't checked.
>>>> 3. There was no way to distinguish when EOF should be an error (because
>>>> the data has to be there) for which an error message should be emitted
>>>> and when it is not necessarily an error (namely during parsing of EBML
>>>> IDs). Such a possibility has been added and used.
>>>
>>> Maybe the title of the patch should rather mention that it's fixing
>>> the EOF handling when reading EBML ID/Length. The changed error
>>> messages is a less important consequence.
>>>
>> How about "avformat/matroskadec: Improve (in particular) EOF checks"?
>>>
>>>>
>>>> Some useless initializations were also fixed.
>>>>
>>>> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@googlemail.com>
>>>> ---
>>>>   libavformat/matroskadec.c | 61
>>>> ++++++++++++++++++++++-----------------
>>>>   1 file changed, 34 insertions(+), 27 deletions(-)
>>>>
>>>> diff --git a/libavformat/matroskadec.c b/libavformat/matroskadec.c
>>>> index a6617a607b..aa2266384a 100644
>>>> --- a/libavformat/matroskadec.c
>>>> +++ b/libavformat/matroskadec.c
>>>> @@ -820,29 +820,19 @@ static int ebml_level_end(MatroskaDemuxContext
>>>> *matroska)
>>>>    * Returns: number of bytes read, < 0 on error
>>>>    */
>>>>   static int ebml_read_num(MatroskaDemuxContext *matroska,
>>>> AVIOContext *pb,
>>>> -                         int max_size, uint64_t *number)
>>>> +                         int max_size, uint64_t *number, int
>>>> eof_forbidden)
>>>>   {
>>>> -    int read = 1, n = 1;
>>>> -    uint64_t total = 0;
>>>> +    int read, n = 1;
>>>> +    uint64_t total;
>>>> +    int64_t pos;
>>>>   -    /* The first byte tells us the length in bytes - avio_r8()
>>>> can normally
>>>> -     * return 0, but since that's not a valid first ebmlID byte, we
>>>> can
>>>> -     * use it safely here to catch EOS. */
>>>> -    if (!(total = avio_r8(pb))) {
>>>> -        /* we might encounter EOS here */
>>>> -        if (!avio_feof(pb)) {
>>>> -            int64_t pos = avio_tell(pb);
>>>> -            av_log(matroska->ctx, AV_LOG_ERROR,
>>>> -                   "Read error at pos. %"PRIu64" (0x%"PRIx64")\n",
>>>> -                   pos, pos);
>>>> -            return pb->error ? pb->error : AVERROR(EIO);
>>>> -        }
>>>> -        return AVERROR_EOF;
>>>> -    }
>>>> +    /* The first byte tells us the length in bytes - except when it
>>>> is zero. */
>>>> +    total = avio_r8(pb);
>>>> +    if (avio_feof(pb))
>>>> +        goto err;
>>>>         /* get the length of the EBML number */
>>>> -    read = 8 - ff_log2_tab[total];
>>>> -    if (read > max_size) {
>>>> +    if (!total || (read = 8 - ff_log2_tab[total]) > max_size) {
>>>>           int64_t pos = avio_tell(pb) - 1;
>>>>           av_log(matroska->ctx, AV_LOG_ERROR,
>>>>                  "Invalid EBML number size tag 0x%02x at pos
>>>> %"PRIu64" (0x%"PRIx64")\n",
>>>> @@ -855,9 +845,27 @@ static int ebml_read_num(MatroskaDemuxContext
>>>> *matroska, AVIOContext *pb,
>>>>       while (n++ < read)
>>>>           total = (total << 8) | avio_r8(pb);
>>>>   +    if (avio_feof(pb)) {
>>>> +        eof_forbidden = 1;
>>>> +        goto err;
>>>> +    }
>>>
>>> You're forcing an error if the data ends after reading a number ?
>>> Ending a Matroska file with a number should be fine. It could also be
>>> an element with a size of 0. It doesn't contain any data but it's
>>> still valid (depending on the semantic of the element).
>>>
>>> So this forced error seem wrong. Let the next read catch the EOF if it
>>> finds one.
>>>
>> avio_feof doesn't indicate an error if we reach EOF, only if we
>> attempt to read past EOF (or if another error occurred). If the EBML
>> number we intend to parse is completely read, then no error is
>> indicated here at all. If the input ends immediately after this EBML
>> number, then the next read will trigger EOF and will have to catch it.
> 
> OK, it makes sense.
> 
>> If avio_feof is set at this point, it means that the first byte of the
>> EBML number says that the EBML number is total bytes long, but an
>> error or EOF occured during reading of these bytes. I think this
>> warrants an error. eof_forbidden is set at this point so that an
>> incomplete EBML number will always trigger an error.
> 
> OK
> 
>>>> +
>>>>       *number = total;
>>>>         return read;
>>>> +
>>>> +err:
>>>> +    pos = avio_tell(pb);
>>>> +    if (pb->error) {
>>>> +        av_log(matroska->ctx, AV_LOG_ERROR,
>>>> +               "Read error at pos. %"PRIu64" (0x%"PRIx64")\n",
>>>> +               pos, pos);
>>>> +        return pb->error;
>>>> +    }
>>>> +    if (eof_forbidden)
>>>> +        av_log(matroska->ctx, AV_LOG_ERROR, "File ended prematurely "
>>>> +               "at pos. %"PRIu64" (0x%"PRIx64")\n", pos, pos);
>>>
>>> If eof_forbidden is false (EOF allowed) you return an EOF error anyway ?
>>>
>> Yes, so that the caller knows that the file has ended and can act
>> accordingly. Currently only the parsing of EBML IDs in ebml_parse
>> allow EOF and if we hit EOF while parsing an EBML ID, ebml_parse does
>> a more thorough check for whether this is a real error or something
>> legal (of course, based upon whether we the current level is
>> unknown-sized or not).
> 
> Then I don't understand what eof_forbidden is for. It seems that EOF should be treated no matter what.
> 
But it should not always be treated as an error. There are cases where
it is always an error if EOF happens: In the middle of reading an EBML
number, directly after reading an EBML ID (the EBML size is supposed
to be there) and also during parsing of the sizes of laces. But there
is one instance where EOF might be allowed, namely when parsing an
EBML ID. Without eof_forbidden or similar, ebml_read_num would either
always when hitting EOF at the beginning or never. In the first case,
a compliant unknown-sized file would always get an unwarranted error.
In the second case, I would have to write custom error messages for
every place where EOF is forbidden.
In my proposal, I only have to write custom checks and error messages
for reading an EBML ID.

- Andreas
diff mbox

Patch

diff --git a/libavformat/matroskadec.c b/libavformat/matroskadec.c
index a6617a607b..aa2266384a 100644
--- a/libavformat/matroskadec.c
+++ b/libavformat/matroskadec.c
@@ -820,29 +820,19 @@  static int ebml_level_end(MatroskaDemuxContext *matroska)
  * Returns: number of bytes read, < 0 on error
  */
 static int ebml_read_num(MatroskaDemuxContext *matroska, AVIOContext *pb,
-                         int max_size, uint64_t *number)
+                         int max_size, uint64_t *number, int eof_forbidden)
 {
-    int read = 1, n = 1;
-    uint64_t total = 0;
+    int read, n = 1;
+    uint64_t total;
+    int64_t pos;
 
-    /* The first byte tells us the length in bytes - avio_r8() can normally
-     * return 0, but since that's not a valid first ebmlID byte, we can
-     * use it safely here to catch EOS. */
-    if (!(total = avio_r8(pb))) {
-        /* we might encounter EOS here */
-        if (!avio_feof(pb)) {
-            int64_t pos = avio_tell(pb);
-            av_log(matroska->ctx, AV_LOG_ERROR,
-                   "Read error at pos. %"PRIu64" (0x%"PRIx64")\n",
-                   pos, pos);
-            return pb->error ? pb->error : AVERROR(EIO);
-        }
-        return AVERROR_EOF;
-    }
+    /* The first byte tells us the length in bytes - except when it is zero. */
+    total = avio_r8(pb);
+    if (avio_feof(pb))
+        goto err;
 
     /* get the length of the EBML number */
-    read = 8 - ff_log2_tab[total];
-    if (read > max_size) {
+    if (!total || (read = 8 - ff_log2_tab[total]) > max_size) {
         int64_t pos = avio_tell(pb) - 1;
         av_log(matroska->ctx, AV_LOG_ERROR,
                "Invalid EBML number size tag 0x%02x at pos %"PRIu64" (0x%"PRIx64")\n",
@@ -855,9 +845,27 @@  static int ebml_read_num(MatroskaDemuxContext *matroska, AVIOContext *pb,
     while (n++ < read)
         total = (total << 8) | avio_r8(pb);
 
+    if (avio_feof(pb)) {
+        eof_forbidden = 1;
+        goto err;
+    }
+
     *number = total;
 
     return read;
+
+err:
+    pos = avio_tell(pb);
+    if (pb->error) {
+        av_log(matroska->ctx, AV_LOG_ERROR,
+               "Read error at pos. %"PRIu64" (0x%"PRIx64")\n",
+               pos, pos);
+        return pb->error;
+    }
+    if (eof_forbidden)
+        av_log(matroska->ctx, AV_LOG_ERROR, "File ended prematurely "
+               "at pos. %"PRIu64" (0x%"PRIx64")\n", pos, pos);
+    return AVERROR_EOF;
 }
 
 /**
@@ -868,7 +876,7 @@  static int ebml_read_num(MatroskaDemuxContext *matroska, AVIOContext *pb,
 static int ebml_read_length(MatroskaDemuxContext *matroska, AVIOContext *pb,
                             uint64_t *number)
 {
-    int res = ebml_read_num(matroska, pb, 8, number);
+    int res = ebml_read_num(matroska, pb, 8, number, 1);
     if (res > 0 && *number + 1 == 1ULL << (7 * res))
         *number = EBML_UNKNOWN_LENGTH;
     return res;
@@ -1010,7 +1018,7 @@  static int matroska_ebmlnum_uint(MatroskaDemuxContext *matroska,
 {
     AVIOContext pb;
     ffio_init_context(&pb, data, size, 0, NULL, NULL, NULL, NULL);
-    return ebml_read_num(matroska, &pb, FFMIN(size, 8), num);
+    return ebml_read_num(matroska, &pb, FFMIN(size, 8), num, 1);
 }
 
 /*
@@ -1057,7 +1065,7 @@  static int ebml_parse(MatroskaDemuxContext *matroska, EbmlSyntax *syntax,
 {
     if (!matroska->current_id) {
         uint64_t id;
-        int res = ebml_read_num(matroska, matroska->ctx->pb, 4, &id);
+        int res = ebml_read_num(matroska, matroska->ctx->pb, 4, &id, 0);
         if (res < 0) {
             // in live mode, finish parsing if EOF is reached.
             return (matroska->is_live && matroska->ctx->pb->eof_reached &&
@@ -3335,10 +3343,9 @@  static int matroska_parse_block(MatroskaDemuxContext *matroska, AVBufferRef *buf
     uint64_t num;
     int trust_default_duration = 1;
 
-    if ((n = matroska_ebmlnum_uint(matroska, data, size, &num)) < 0) {
-        av_log(matroska->ctx, AV_LOG_ERROR, "EBML block data error\n");
+    if ((n = matroska_ebmlnum_uint(matroska, data, size, &num)) < 0)
         return n;
-    }
+
     data += n;
     size -= n;
 
@@ -3699,7 +3706,7 @@  static int webm_clusters_start_with_keyframe(AVFormatContext *s)
         AVPacket *pkt;
         avio_seek(s->pb, cluster_pos, SEEK_SET);
         // read cluster id and length
-        read = ebml_read_num(matroska, matroska->ctx->pb, 4, &cluster_id);
+        read = ebml_read_num(matroska, matroska->ctx->pb, 4, &cluster_id, 1);
         if (read < 0 || cluster_id != 0xF43B675) // done with all clusters
             break;
         read = ebml_read_length(matroska, matroska->ctx->pb, &cluster_length);
@@ -3916,7 +3923,7 @@  static int webm_dash_manifest_cues(AVFormatContext *s, int64_t init_range)
         // Cues element ID + EBML length of the Cues element. cues_end is
         // inclusive and the above sum is reduced by 1.
         uint64_t cues_length = 0, cues_id = 0, bytes_read = 0;
-        bytes_read += ebml_read_num(matroska, matroska->ctx->pb, 4, &cues_id);
+        bytes_read += ebml_read_num(matroska, matroska->ctx->pb, 4, &cues_id, 1);
         bytes_read += ebml_read_length(matroska, matroska->ctx->pb, &cues_length);
         cues_end = cues_start + cues_length + bytes_read - 1;
     }