diff mbox

[FFmpeg-devel,03/23] avformat/matroskaenc: Use random TrackUID

Message ID 20191106024922.19228-3-andreas.rheinhardt@gmail.com
State Superseded
Headers show

Commit Message

Andreas Rheinhardt Nov. 6, 2019, 2:49 a.m. UTC
Up until now, the TrackUID of a Matroska track which is supposed to be
random was not random at all: It always coincided with the TrackNumber
which is usually the 1-based index of the corresponding stream in the
array of AVStreams. This has been changed: It is now set via an AVLFG
if AVFMT_FLAG_BITEXACT is not set. Otherwise it is set like it is set
now (the only change happens if an explicit track number has been
choosen via dash_track_number, because the system used in the normal
situation is now used, too). In particular, no FATE tests need to be
updated.

This also fixes a bug in case the dash_track_number option was used:
In this case the TrackUID was set to the track number, but the tags were
written with a TagTrackUID simply based upon the index, so that the tags
didn't apply to the track they ought to apply to.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@gmail.com>
---
mkv_get_uid() might be overkill, but I simply wanted to be sure that
there are no collisions.

 libavformat/matroskaenc.c | 65 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 57 insertions(+), 8 deletions(-)

Comments

Michael Niedermayer Nov. 24, 2019, 8:48 a.m. UTC | #1
On Wed, Nov 06, 2019 at 03:49:02AM +0100, Andreas Rheinhardt wrote:
> Up until now, the TrackUID of a Matroska track which is supposed to be
> random was not random at all: It always coincided with the TrackNumber
> which is usually the 1-based index of the corresponding stream in the
> array of AVStreams. This has been changed: It is now set via an AVLFG
> if AVFMT_FLAG_BITEXACT is not set. Otherwise it is set like it is set
> now (the only change happens if an explicit track number has been
> choosen via dash_track_number, because the system used in the normal
> situation is now used, too). In particular, no FATE tests need to be
> updated.
> 
> This also fixes a bug in case the dash_track_number option was used:
> In this case the TrackUID was set to the track number, but the tags were
> written with a TagTrackUID simply based upon the index, so that the tags
> didn't apply to the track they ought to apply to.
> 
> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@gmail.com>
> ---
> mkv_get_uid() might be overkill, but I simply wanted to be sure that
> there are no collisions.
> 
>  libavformat/matroskaenc.c | 65 ++++++++++++++++++++++++++++++++++-----
>  1 file changed, 57 insertions(+), 8 deletions(-)
> 
> diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c
> index de57e474be..b87d15b7ff 100644
> --- a/libavformat/matroskaenc.c
> +++ b/libavformat/matroskaenc.c
> @@ -94,6 +94,7 @@ typedef struct mkv_cues {
>  typedef struct mkv_track {
>      int             write_dts;
>      int             has_cue;
> +    uint32_t        uid;
>      int             sample_rate;
>      int64_t         sample_rate_offset;
>      int64_t         last_timestamp;
> @@ -1199,8 +1200,7 @@ static int mkv_write_track(AVFormatContext *s, MatroskaMuxContext *mkv,
>      track = start_ebml_master(pb, MATROSKA_ID_TRACKENTRY, 0);
>      put_ebml_uint (pb, MATROSKA_ID_TRACKNUMBER,
>                     mkv->is_dash ? mkv->dash_track_number : i + 1);
> -    put_ebml_uint (pb, MATROSKA_ID_TRACKUID,
> -                   mkv->is_dash ? mkv->dash_track_number : i + 1);
> +    put_ebml_uint (pb, MATROSKA_ID_TRACKUID, mkv->tracks[i].uid);
>      put_ebml_uint (pb, MATROSKA_ID_TRACKFLAGLACING , 0);    // no lacing (yet)
>  
>      if ((tag = av_dict_get(st->metadata, "title", NULL, 0)))
> @@ -1650,7 +1650,8 @@ static int mkv_write_tags(AVFormatContext *s)
>          if (!mkv_check_tag(st->metadata, MATROSKA_ID_TAGTARGETS_TRACKUID))
>              continue;
>  
> -        ret = mkv_write_tag(s, st->metadata, MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1);
> +        ret = mkv_write_tag(s, st->metadata, MATROSKA_ID_TAGTARGETS_TRACKUID,
> +                            mkv->tracks[i].uid);
>          if (ret < 0) return ret;
>      }
>  
> @@ -1658,13 +1659,15 @@ static int mkv_write_tags(AVFormatContext *s)
>          for (i = 0; i < s->nb_streams; i++) {
>              AVIOContext *pb;
>              AVStream *st = s->streams[i];
> +            mkv_track *track = &mkv->tracks[i];
>              ebml_master tag_target;
>              ebml_master tag;
>  
>              if (st->codecpar->codec_type == AVMEDIA_TYPE_ATTACHMENT)
>                  continue;
>  
> -            mkv_write_tag_targets(s, MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1, &tag_target);
> +            mkv_write_tag_targets(s, MATROSKA_ID_TAGTARGETS_TRACKUID,
> +                                  track->uid, &tag_target);
>              pb = mkv->tags_bc;
>  
>              tag = start_ebml_master(pb, MATROSKA_ID_SIMPLETAG, 0);
> @@ -1863,10 +1866,6 @@ static int mkv_write_header(AVFormatContext *s)
>              version = 4;
>      }
>  
> -    mkv->tracks = av_mallocz_array(s->nb_streams, sizeof(*mkv->tracks));
> -    if (!mkv->tracks) {
> -        return AVERROR(ENOMEM);
> -    }
>      ebml_header = start_ebml_master(pb, EBML_ID_HEADER, MAX_EBML_HEADER_SIZE);
>      put_ebml_uint  (pb, EBML_ID_EBMLVERSION       ,           1);
>      put_ebml_uint  (pb, EBML_ID_EBMLREADVERSION   ,           1);
> @@ -2667,8 +2666,42 @@ static int webm_query_codec(enum AVCodecID codec_id, int std_compliance)
>      return 0;
>  }
>  
> +static uint32_t mkv_get_uid(const mkv_track *tracks, int i, AVLFG *c)
> +{
> +    uint32_t uid;
> +
> +    for (int j = 0, k; j < 5; j++) {
> +        uid = av_lfg_get(c);
> +        if (!uid)
> +            continue;
> +        for (k = 0; k < i; k++) {
> +            if (tracks[k].uid == uid) {
> +                /* Was something wrong with our random seed? */
> +                av_lfg_init(c, av_get_random_seed());
> +                break;
> +            }
> +        }
> +        if (k == i)
> +            return uid;
> +    }
> +
> +    /* Test the numbers from 1 to i. */
> +    for (int j = 1, k; j < i + 1; j++) {
> +        for (k = 0; k < i; k++) {
> +            if (tracks[k].uid == j)
> +                break;
> +        }
> +        if (k == i)
> +            return j;
> +    }
> +    /* Return i + 1. It can't coincide with another uid. */
> +    return i + 1;
> +}
> +
>  static int mkv_init(struct AVFormatContext *s)
>  {
> +    MatroskaMuxContext *mkv = s->priv_data;
> +    AVLFG c;
>      int i;
>  
>      if (s->nb_streams > MAX_TRACKS) {
> @@ -2697,7 +2730,23 @@ static int mkv_init(struct AVFormatContext *s)
>          s->internal->avoid_negative_ts_use_pts = 1;
>      }
>  
> +    mkv->tracks = av_mallocz_array(s->nb_streams, sizeof(*mkv->tracks));
> +    if (!mkv->tracks) {
> +        return AVERROR(ENOMEM);
> +    }
> +

> +    if (!(s->flags & AVFMT_FLAG_BITEXACT))
> +        av_lfg_init(&c, av_get_random_seed());

would there be a disadvantage if the configuration of a stream / its content
is used a seed ?
That way it would be more deterministic


[...]
Hendrik Leppkes Nov. 24, 2019, 8:58 a.m. UTC | #2
On Sun, Nov 24, 2019 at 9:49 AM Michael Niedermayer
<michael@niedermayer.cc> wrote:
>
> On Wed, Nov 06, 2019 at 03:49:02AM +0100, Andreas Rheinhardt wrote:
> > Up until now, the TrackUID of a Matroska track which is supposed to be
> > random was not random at all: It always coincided with the TrackNumber
> > which is usually the 1-based index of the corresponding stream in the
> > array of AVStreams. This has been changed: It is now set via an AVLFG
> > if AVFMT_FLAG_BITEXACT is not set. Otherwise it is set like it is set
> > now (the only change happens if an explicit track number has been
> > choosen via dash_track_number, because the system used in the normal
> > situation is now used, too). In particular, no FATE tests need to be
> > updated.
> >
> > This also fixes a bug in case the dash_track_number option was used:
> > In this case the TrackUID was set to the track number, but the tags were
> > written with a TagTrackUID simply based upon the index, so that the tags
> > didn't apply to the track they ought to apply to.
> >
> > Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@gmail.com>
> > ---
> > mkv_get_uid() might be overkill, but I simply wanted to be sure that
> > there are no collisions.
> >
> >  libavformat/matroskaenc.c | 65 ++++++++++++++++++++++++++++++++++-----
> >  1 file changed, 57 insertions(+), 8 deletions(-)
> >
> > diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c
> > index de57e474be..b87d15b7ff 100644
> > --- a/libavformat/matroskaenc.c
> > +++ b/libavformat/matroskaenc.c
> > @@ -94,6 +94,7 @@ typedef struct mkv_cues {
> >  typedef struct mkv_track {
> >      int             write_dts;
> >      int             has_cue;
> > +    uint32_t        uid;
> >      int             sample_rate;
> >      int64_t         sample_rate_offset;
> >      int64_t         last_timestamp;
> > @@ -1199,8 +1200,7 @@ static int mkv_write_track(AVFormatContext *s, MatroskaMuxContext *mkv,
> >      track = start_ebml_master(pb, MATROSKA_ID_TRACKENTRY, 0);
> >      put_ebml_uint (pb, MATROSKA_ID_TRACKNUMBER,
> >                     mkv->is_dash ? mkv->dash_track_number : i + 1);
> > -    put_ebml_uint (pb, MATROSKA_ID_TRACKUID,
> > -                   mkv->is_dash ? mkv->dash_track_number : i + 1);
> > +    put_ebml_uint (pb, MATROSKA_ID_TRACKUID, mkv->tracks[i].uid);
> >      put_ebml_uint (pb, MATROSKA_ID_TRACKFLAGLACING , 0);    // no lacing (yet)
> >
> >      if ((tag = av_dict_get(st->metadata, "title", NULL, 0)))
> > @@ -1650,7 +1650,8 @@ static int mkv_write_tags(AVFormatContext *s)
> >          if (!mkv_check_tag(st->metadata, MATROSKA_ID_TAGTARGETS_TRACKUID))
> >              continue;
> >
> > -        ret = mkv_write_tag(s, st->metadata, MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1);
> > +        ret = mkv_write_tag(s, st->metadata, MATROSKA_ID_TAGTARGETS_TRACKUID,
> > +                            mkv->tracks[i].uid);
> >          if (ret < 0) return ret;
> >      }
> >
> > @@ -1658,13 +1659,15 @@ static int mkv_write_tags(AVFormatContext *s)
> >          for (i = 0; i < s->nb_streams; i++) {
> >              AVIOContext *pb;
> >              AVStream *st = s->streams[i];
> > +            mkv_track *track = &mkv->tracks[i];
> >              ebml_master tag_target;
> >              ebml_master tag;
> >
> >              if (st->codecpar->codec_type == AVMEDIA_TYPE_ATTACHMENT)
> >                  continue;
> >
> > -            mkv_write_tag_targets(s, MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1, &tag_target);
> > +            mkv_write_tag_targets(s, MATROSKA_ID_TAGTARGETS_TRACKUID,
> > +                                  track->uid, &tag_target);
> >              pb = mkv->tags_bc;
> >
> >              tag = start_ebml_master(pb, MATROSKA_ID_SIMPLETAG, 0);
> > @@ -1863,10 +1866,6 @@ static int mkv_write_header(AVFormatContext *s)
> >              version = 4;
> >      }
> >
> > -    mkv->tracks = av_mallocz_array(s->nb_streams, sizeof(*mkv->tracks));
> > -    if (!mkv->tracks) {
> > -        return AVERROR(ENOMEM);
> > -    }
> >      ebml_header = start_ebml_master(pb, EBML_ID_HEADER, MAX_EBML_HEADER_SIZE);
> >      put_ebml_uint  (pb, EBML_ID_EBMLVERSION       ,           1);
> >      put_ebml_uint  (pb, EBML_ID_EBMLREADVERSION   ,           1);
> > @@ -2667,8 +2666,42 @@ static int webm_query_codec(enum AVCodecID codec_id, int std_compliance)
> >      return 0;
> >  }
> >
> > +static uint32_t mkv_get_uid(const mkv_track *tracks, int i, AVLFG *c)
> > +{
> > +    uint32_t uid;
> > +
> > +    for (int j = 0, k; j < 5; j++) {
> > +        uid = av_lfg_get(c);
> > +        if (!uid)
> > +            continue;
> > +        for (k = 0; k < i; k++) {
> > +            if (tracks[k].uid == uid) {
> > +                /* Was something wrong with our random seed? */
> > +                av_lfg_init(c, av_get_random_seed());
> > +                break;
> > +            }
> > +        }
> > +        if (k == i)
> > +            return uid;
> > +    }
> > +
> > +    /* Test the numbers from 1 to i. */
> > +    for (int j = 1, k; j < i + 1; j++) {
> > +        for (k = 0; k < i; k++) {
> > +            if (tracks[k].uid == j)
> > +                break;
> > +        }
> > +        if (k == i)
> > +            return j;
> > +    }
> > +    /* Return i + 1. It can't coincide with another uid. */
> > +    return i + 1;
> > +}
> > +
> >  static int mkv_init(struct AVFormatContext *s)
> >  {
> > +    MatroskaMuxContext *mkv = s->priv_data;
> > +    AVLFG c;
> >      int i;
> >
> >      if (s->nb_streams > MAX_TRACKS) {
> > @@ -2697,7 +2730,23 @@ static int mkv_init(struct AVFormatContext *s)
> >          s->internal->avoid_negative_ts_use_pts = 1;
> >      }
> >
> > +    mkv->tracks = av_mallocz_array(s->nb_streams, sizeof(*mkv->tracks));
> > +    if (!mkv->tracks) {
> > +        return AVERROR(ENOMEM);
> > +    }
> > +
>
> > +    if (!(s->flags & AVFMT_FLAG_BITEXACT))
> > +        av_lfg_init(&c, av_get_random_seed());
>
> would there be a disadvantage if the configuration of a stream / its content
> is used a seed ?
> That way it would be more deterministic
>

Configuration is not enough data as many files can share that. How can
you take the content into it realistically?
MKV has various UUID identifiers, those should just be generated, imho.

- Hendrik
Andreas Rheinhardt Nov. 24, 2019, 9:08 a.m. UTC | #3
Michael Niedermayer:
> On Wed, Nov 06, 2019 at 03:49:02AM +0100, Andreas Rheinhardt wrote:
>> Up until now, the TrackUID of a Matroska track which is supposed to be
>> random was not random at all: It always coincided with the TrackNumber
>> which is usually the 1-based index of the corresponding stream in the
>> array of AVStreams. This has been changed: It is now set via an AVLFG
>> if AVFMT_FLAG_BITEXACT is not set. Otherwise it is set like it is set
>> now (the only change happens if an explicit track number has been
>> choosen via dash_track_number, because the system used in the normal
>> situation is now used, too). In particular, no FATE tests need to be
>> updated.
>>
>> This also fixes a bug in case the dash_track_number option was used:
>> In this case the TrackUID was set to the track number, but the tags were
>> written with a TagTrackUID simply based upon the index, so that the tags
>> didn't apply to the track they ought to apply to.
>>
>> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@gmail.com>
>> ---
>> mkv_get_uid() might be overkill, but I simply wanted to be sure that
>> there are no collisions.
>>
>>  libavformat/matroskaenc.c | 65 ++++++++++++++++++++++++++++++++++-----
>>  1 file changed, 57 insertions(+), 8 deletions(-)
>>
>> diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c
>> index de57e474be..b87d15b7ff 100644
>> --- a/libavformat/matroskaenc.c
>> +++ b/libavformat/matroskaenc.c
>> @@ -94,6 +94,7 @@ typedef struct mkv_cues {
>>  typedef struct mkv_track {
>>      int             write_dts;
>>      int             has_cue;
>> +    uint32_t        uid;
>>      int             sample_rate;
>>      int64_t         sample_rate_offset;
>>      int64_t         last_timestamp;
>> @@ -1199,8 +1200,7 @@ static int mkv_write_track(AVFormatContext *s, MatroskaMuxContext *mkv,
>>      track = start_ebml_master(pb, MATROSKA_ID_TRACKENTRY, 0);
>>      put_ebml_uint (pb, MATROSKA_ID_TRACKNUMBER,
>>                     mkv->is_dash ? mkv->dash_track_number : i + 1);
>> -    put_ebml_uint (pb, MATROSKA_ID_TRACKUID,
>> -                   mkv->is_dash ? mkv->dash_track_number : i + 1);
>> +    put_ebml_uint (pb, MATROSKA_ID_TRACKUID, mkv->tracks[i].uid);
>>      put_ebml_uint (pb, MATROSKA_ID_TRACKFLAGLACING , 0);    // no lacing (yet)
>>  
>>      if ((tag = av_dict_get(st->metadata, "title", NULL, 0)))
>> @@ -1650,7 +1650,8 @@ static int mkv_write_tags(AVFormatContext *s)
>>          if (!mkv_check_tag(st->metadata, MATROSKA_ID_TAGTARGETS_TRACKUID))
>>              continue;
>>  
>> -        ret = mkv_write_tag(s, st->metadata, MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1);
>> +        ret = mkv_write_tag(s, st->metadata, MATROSKA_ID_TAGTARGETS_TRACKUID,
>> +                            mkv->tracks[i].uid);
>>          if (ret < 0) return ret;
>>      }
>>  
>> @@ -1658,13 +1659,15 @@ static int mkv_write_tags(AVFormatContext *s)
>>          for (i = 0; i < s->nb_streams; i++) {
>>              AVIOContext *pb;
>>              AVStream *st = s->streams[i];
>> +            mkv_track *track = &mkv->tracks[i];
>>              ebml_master tag_target;
>>              ebml_master tag;
>>  
>>              if (st->codecpar->codec_type == AVMEDIA_TYPE_ATTACHMENT)
>>                  continue;
>>  
>> -            mkv_write_tag_targets(s, MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1, &tag_target);
>> +            mkv_write_tag_targets(s, MATROSKA_ID_TAGTARGETS_TRACKUID,
>> +                                  track->uid, &tag_target);
>>              pb = mkv->tags_bc;
>>  
>>              tag = start_ebml_master(pb, MATROSKA_ID_SIMPLETAG, 0);
>> @@ -1863,10 +1866,6 @@ static int mkv_write_header(AVFormatContext *s)
>>              version = 4;
>>      }
>>  
>> -    mkv->tracks = av_mallocz_array(s->nb_streams, sizeof(*mkv->tracks));
>> -    if (!mkv->tracks) {
>> -        return AVERROR(ENOMEM);
>> -    }
>>      ebml_header = start_ebml_master(pb, EBML_ID_HEADER, MAX_EBML_HEADER_SIZE);
>>      put_ebml_uint  (pb, EBML_ID_EBMLVERSION       ,           1);
>>      put_ebml_uint  (pb, EBML_ID_EBMLREADVERSION   ,           1);
>> @@ -2667,8 +2666,42 @@ static int webm_query_codec(enum AVCodecID codec_id, int std_compliance)
>>      return 0;
>>  }
>>  
>> +static uint32_t mkv_get_uid(const mkv_track *tracks, int i, AVLFG *c)
>> +{
>> +    uint32_t uid;
>> +
>> +    for (int j = 0, k; j < 5; j++) {
>> +        uid = av_lfg_get(c);
>> +        if (!uid)
>> +            continue;
>> +        for (k = 0; k < i; k++) {
>> +            if (tracks[k].uid == uid) {
>> +                /* Was something wrong with our random seed? */
>> +                av_lfg_init(c, av_get_random_seed());
>> +                break;
>> +            }
>> +        }
>> +        if (k == i)
>> +            return uid;
>> +    }
>> +
>> +    /* Test the numbers from 1 to i. */
>> +    for (int j = 1, k; j < i + 1; j++) {
>> +        for (k = 0; k < i; k++) {
>> +            if (tracks[k].uid == j)
>> +                break;
>> +        }
>> +        if (k == i)
>> +            return j;
>> +    }
>> +    /* Return i + 1. It can't coincide with another uid. */
>> +    return i + 1;
>> +}
>> +
>>  static int mkv_init(struct AVFormatContext *s)
>>  {
>> +    MatroskaMuxContext *mkv = s->priv_data;
>> +    AVLFG c;
>>      int i;
>>  
>>      if (s->nb_streams > MAX_TRACKS) {
>> @@ -2697,7 +2730,23 @@ static int mkv_init(struct AVFormatContext *s)
>>          s->internal->avoid_negative_ts_use_pts = 1;
>>      }
>>  
>> +    mkv->tracks = av_mallocz_array(s->nb_streams, sizeof(*mkv->tracks));
>> +    if (!mkv->tracks) {
>> +        return AVERROR(ENOMEM);
>> +    }
>> +
> 
>> +    if (!(s->flags & AVFMT_FLAG_BITEXACT))
>> +        av_lfg_init(&c, av_get_random_seed());
> 
> would there be a disadvantage if the configuration of a stream / its content
> is used a seed ?
> That way it would be more deterministic
> 
I see several disadvantages:
1. If one duplicates a stream (i.e. muxes it twice in the same file),
they would get the same uid (unless one takes additional measures).
2. If the packet content is used, one can determine the uid only after
having received a packet. This would mean that one has to postpone
writing the Tracks element until one has a packet and the same goes
for the tags given that the uid is used to convey what they apply to.
One could solve this problem in the seekable, non-livestreaming case
by reserving size to be overwritten later, but this is nontrivial and
I'd like to avoid that.
3. If the configuration record is used, then you won't get anything
remotely unique: Using the same encoder settings will likely produce
files with identical uids.

Furthermore, this approach means that I can reuse the code for both
attachments as well as tracks. (See the next patch in this series.)

- Andreas
Michael Niedermayer Nov. 24, 2019, 7:28 p.m. UTC | #4
On Sun, Nov 24, 2019 at 09:08:00AM +0000, Andreas Rheinhardt wrote:
> Michael Niedermayer:
> > On Wed, Nov 06, 2019 at 03:49:02AM +0100, Andreas Rheinhardt wrote:
> >> Up until now, the TrackUID of a Matroska track which is supposed to be
> >> random was not random at all: It always coincided with the TrackNumber
> >> which is usually the 1-based index of the corresponding stream in the
> >> array of AVStreams. This has been changed: It is now set via an AVLFG
> >> if AVFMT_FLAG_BITEXACT is not set. Otherwise it is set like it is set
> >> now (the only change happens if an explicit track number has been
> >> choosen via dash_track_number, because the system used in the normal
> >> situation is now used, too). In particular, no FATE tests need to be
> >> updated.
> >>
> >> This also fixes a bug in case the dash_track_number option was used:
> >> In this case the TrackUID was set to the track number, but the tags were
> >> written with a TagTrackUID simply based upon the index, so that the tags
> >> didn't apply to the track they ought to apply to.
> >>
> >> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@gmail.com>
> >> ---
> >> mkv_get_uid() might be overkill, but I simply wanted to be sure that
> >> there are no collisions.
> >>
> >>  libavformat/matroskaenc.c | 65 ++++++++++++++++++++++++++++++++++-----
> >>  1 file changed, 57 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c
> >> index de57e474be..b87d15b7ff 100644
> >> --- a/libavformat/matroskaenc.c
> >> +++ b/libavformat/matroskaenc.c
> >> @@ -94,6 +94,7 @@ typedef struct mkv_cues {
> >>  typedef struct mkv_track {
> >>      int             write_dts;
> >>      int             has_cue;
> >> +    uint32_t        uid;
> >>      int             sample_rate;
> >>      int64_t         sample_rate_offset;
> >>      int64_t         last_timestamp;
> >> @@ -1199,8 +1200,7 @@ static int mkv_write_track(AVFormatContext *s, MatroskaMuxContext *mkv,
> >>      track = start_ebml_master(pb, MATROSKA_ID_TRACKENTRY, 0);
> >>      put_ebml_uint (pb, MATROSKA_ID_TRACKNUMBER,
> >>                     mkv->is_dash ? mkv->dash_track_number : i + 1);
> >> -    put_ebml_uint (pb, MATROSKA_ID_TRACKUID,
> >> -                   mkv->is_dash ? mkv->dash_track_number : i + 1);
> >> +    put_ebml_uint (pb, MATROSKA_ID_TRACKUID, mkv->tracks[i].uid);
> >>      put_ebml_uint (pb, MATROSKA_ID_TRACKFLAGLACING , 0);    // no lacing (yet)
> >>  
> >>      if ((tag = av_dict_get(st->metadata, "title", NULL, 0)))
> >> @@ -1650,7 +1650,8 @@ static int mkv_write_tags(AVFormatContext *s)
> >>          if (!mkv_check_tag(st->metadata, MATROSKA_ID_TAGTARGETS_TRACKUID))
> >>              continue;
> >>  
> >> -        ret = mkv_write_tag(s, st->metadata, MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1);
> >> +        ret = mkv_write_tag(s, st->metadata, MATROSKA_ID_TAGTARGETS_TRACKUID,
> >> +                            mkv->tracks[i].uid);
> >>          if (ret < 0) return ret;
> >>      }
> >>  
> >> @@ -1658,13 +1659,15 @@ static int mkv_write_tags(AVFormatContext *s)
> >>          for (i = 0; i < s->nb_streams; i++) {
> >>              AVIOContext *pb;
> >>              AVStream *st = s->streams[i];
> >> +            mkv_track *track = &mkv->tracks[i];
> >>              ebml_master tag_target;
> >>              ebml_master tag;
> >>  
> >>              if (st->codecpar->codec_type == AVMEDIA_TYPE_ATTACHMENT)
> >>                  continue;
> >>  
> >> -            mkv_write_tag_targets(s, MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1, &tag_target);
> >> +            mkv_write_tag_targets(s, MATROSKA_ID_TAGTARGETS_TRACKUID,
> >> +                                  track->uid, &tag_target);
> >>              pb = mkv->tags_bc;
> >>  
> >>              tag = start_ebml_master(pb, MATROSKA_ID_SIMPLETAG, 0);
> >> @@ -1863,10 +1866,6 @@ static int mkv_write_header(AVFormatContext *s)
> >>              version = 4;
> >>      }
> >>  
> >> -    mkv->tracks = av_mallocz_array(s->nb_streams, sizeof(*mkv->tracks));
> >> -    if (!mkv->tracks) {
> >> -        return AVERROR(ENOMEM);
> >> -    }
> >>      ebml_header = start_ebml_master(pb, EBML_ID_HEADER, MAX_EBML_HEADER_SIZE);
> >>      put_ebml_uint  (pb, EBML_ID_EBMLVERSION       ,           1);
> >>      put_ebml_uint  (pb, EBML_ID_EBMLREADVERSION   ,           1);
> >> @@ -2667,8 +2666,42 @@ static int webm_query_codec(enum AVCodecID codec_id, int std_compliance)
> >>      return 0;
> >>  }
> >>  
> >> +static uint32_t mkv_get_uid(const mkv_track *tracks, int i, AVLFG *c)
> >> +{
> >> +    uint32_t uid;
> >> +
> >> +    for (int j = 0, k; j < 5; j++) {
> >> +        uid = av_lfg_get(c);
> >> +        if (!uid)
> >> +            continue;
> >> +        for (k = 0; k < i; k++) {
> >> +            if (tracks[k].uid == uid) {
> >> +                /* Was something wrong with our random seed? */
> >> +                av_lfg_init(c, av_get_random_seed());
> >> +                break;
> >> +            }
> >> +        }
> >> +        if (k == i)
> >> +            return uid;
> >> +    }
> >> +
> >> +    /* Test the numbers from 1 to i. */
> >> +    for (int j = 1, k; j < i + 1; j++) {
> >> +        for (k = 0; k < i; k++) {
> >> +            if (tracks[k].uid == j)
> >> +                break;
> >> +        }
> >> +        if (k == i)
> >> +            return j;
> >> +    }
> >> +    /* Return i + 1. It can't coincide with another uid. */
> >> +    return i + 1;
> >> +}
> >> +
> >>  static int mkv_init(struct AVFormatContext *s)
> >>  {
> >> +    MatroskaMuxContext *mkv = s->priv_data;
> >> +    AVLFG c;
> >>      int i;
> >>  
> >>      if (s->nb_streams > MAX_TRACKS) {
> >> @@ -2697,7 +2730,23 @@ static int mkv_init(struct AVFormatContext *s)
> >>          s->internal->avoid_negative_ts_use_pts = 1;
> >>      }
> >>  
> >> +    mkv->tracks = av_mallocz_array(s->nb_streams, sizeof(*mkv->tracks));
> >> +    if (!mkv->tracks) {
> >> +        return AVERROR(ENOMEM);
> >> +    }
> >> +
> > 
> >> +    if (!(s->flags & AVFMT_FLAG_BITEXACT))
> >> +        av_lfg_init(&c, av_get_random_seed());
> > 
> > would there be a disadvantage if the configuration of a stream / its content
> > is used a seed ?
> > That way it would be more deterministic
> > 
> I see several disadvantages:
> 1. If one duplicates a stream (i.e. muxes it twice in the same file),
> they would get the same uid (unless one takes additional measures).

If you took the configuration of all streams as a single seed that would not
happen


> 2. If the packet content is used, one can determine the uid only after
> having received a packet. This would mean that one has to postpone
> writing the Tracks element until one has a packet and the same goes
> for the tags given that the uid is used to convey what they apply to.
> One could solve this problem in the seekable, non-livestreaming case
> by reserving size to be overwritten later, but this is nontrivial and
> I'd like to avoid that.
> 3. If the configuration record is used, then you won't get anything
> remotely unique: Using the same encoder settings will likely produce
> files with identical uids.
> 
> Furthermore, this approach means that I can reuse the code for both
> attachments as well as tracks. (See the next patch in this series.)

fair enough, if its considered that avoiding some complexity is more
important than deterministic output

Either way if you need X bits of entropy in the muxer there should be 
X/32 calls to av_get_random_seed(). as it is there are 4 independant
places calling av_get_random_seed() after the patch each using 32bit 
to initialize an independant LFG.
seed collisions are not at all impossible at 32bits
also av_get_random_seed() can be expensive on some platforms so 
i think there should be a single place calling it as needed and not
throwing the obtained entropy away.

Also, why 32bit ? arent these "uint" UIDs 64bit ?


and, this
+    /* Test the numbers from 1 to i. */
+    for (int j = 1, k; j < i + 1; j++) {
+        for (k = 0; k < i; k++) {
+            if (tracks[k].uid == j)
+                break;
+        }
+        if (k == i)
+            return j;
+    }

just smells "wrong"

if you want a simple non repeating UID, just count 0,1,2,3 for each
and put that in the unused upper 32bit for example

or you can even ditch LFG and use something that findamentgally generates 
non repeating output.
UID[i] = seed + i
comes to mind
or if theres a reason why that is not good then you can also do something
like
UID[i] = (seed + i*(seed2|1)) & 0xFFFFFFFFFFFFFFFF;

but maybe theres a reason behind the more complex UID generation that
iam missing ?

Thanks



[...]
Andreas Rheinhardt Nov. 24, 2019, 10:05 p.m. UTC | #5
On Sun, Nov 24, 2019 at 8:28 PM Michael Niedermayer <michael@niedermayer.cc>
wrote:

> On Sun, Nov 24, 2019 at 09:08:00AM +0000, Andreas Rheinhardt wrote:
> > Michael Niedermayer:
> > > On Wed, Nov 06, 2019 at 03:49:02AM +0100, Andreas Rheinhardt wrote:
> > >> Up until now, the TrackUID of a Matroska track which is supposed to be
> > >> random was not random at all: It always coincided with the TrackNumber
> > >> which is usually the 1-based index of the corresponding stream in the
> > >> array of AVStreams. This has been changed: It is now set via an AVLFG
> > >> if AVFMT_FLAG_BITEXACT is not set. Otherwise it is set like it is set
> > >> now (the only change happens if an explicit track number has been
> > >> choosen via dash_track_number, because the system used in the normal
> > >> situation is now used, too). In particular, no FATE tests need to be
> > >> updated.
> > >>
> > >> This also fixes a bug in case the dash_track_number option was used:
> > >> In this case the TrackUID was set to the track number, but the tags
> were
> > >> written with a TagTrackUID simply based upon the index, so that the
> tags
> > >> didn't apply to the track they ought to apply to.
> > >>
> > >> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@gmail.com>
> > >> ---
> > >> mkv_get_uid() might be overkill, but I simply wanted to be sure that
> > >> there are no collisions.
> > >>
> > >>  libavformat/matroskaenc.c | 65
> ++++++++++++++++++++++++++++++++++-----
> > >>  1 file changed, 57 insertions(+), 8 deletions(-)
> > >>
> > >> diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c
> > >> index de57e474be..b87d15b7ff 100644
> > >> --- a/libavformat/matroskaenc.c
> > >> +++ b/libavformat/matroskaenc.c
> > >> @@ -94,6 +94,7 @@ typedef struct mkv_cues {
> > >>  typedef struct mkv_track {
> > >>      int             write_dts;
> > >>      int             has_cue;
> > >> +    uint32_t        uid;
> > >>      int             sample_rate;
> > >>      int64_t         sample_rate_offset;
> > >>      int64_t         last_timestamp;
> > >> @@ -1199,8 +1200,7 @@ static int mkv_write_track(AVFormatContext *s,
> MatroskaMuxContext *mkv,
> > >>      track = start_ebml_master(pb, MATROSKA_ID_TRACKENTRY, 0);
> > >>      put_ebml_uint (pb, MATROSKA_ID_TRACKNUMBER,
> > >>                     mkv->is_dash ? mkv->dash_track_number : i + 1);
> > >> -    put_ebml_uint (pb, MATROSKA_ID_TRACKUID,
> > >> -                   mkv->is_dash ? mkv->dash_track_number : i + 1);
> > >> +    put_ebml_uint (pb, MATROSKA_ID_TRACKUID, mkv->tracks[i].uid);
> > >>      put_ebml_uint (pb, MATROSKA_ID_TRACKFLAGLACING , 0);    // no
> lacing (yet)
> > >>
> > >>      if ((tag = av_dict_get(st->metadata, "title", NULL, 0)))
> > >> @@ -1650,7 +1650,8 @@ static int mkv_write_tags(AVFormatContext *s)
> > >>          if (!mkv_check_tag(st->metadata,
> MATROSKA_ID_TAGTARGETS_TRACKUID))
> > >>              continue;
> > >>
> > >> -        ret = mkv_write_tag(s, st->metadata,
> MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1);
> > >> +        ret = mkv_write_tag(s, st->metadata,
> MATROSKA_ID_TAGTARGETS_TRACKUID,
> > >> +                            mkv->tracks[i].uid);
> > >>          if (ret < 0) return ret;
> > >>      }
> > >>
> > >> @@ -1658,13 +1659,15 @@ static int mkv_write_tags(AVFormatContext *s)
> > >>          for (i = 0; i < s->nb_streams; i++) {
> > >>              AVIOContext *pb;
> > >>              AVStream *st = s->streams[i];
> > >> +            mkv_track *track = &mkv->tracks[i];
> > >>              ebml_master tag_target;
> > >>              ebml_master tag;
> > >>
> > >>              if (st->codecpar->codec_type == AVMEDIA_TYPE_ATTACHMENT)
> > >>                  continue;
> > >>
> > >> -            mkv_write_tag_targets(s,
> MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1, &tag_target);
> > >> +            mkv_write_tag_targets(s, MATROSKA_ID_TAGTARGETS_TRACKUID,
> > >> +                                  track->uid, &tag_target);
> > >>              pb = mkv->tags_bc;
> > >>
> > >>              tag = start_ebml_master(pb, MATROSKA_ID_SIMPLETAG, 0);
> > >> @@ -1863,10 +1866,6 @@ static int mkv_write_header(AVFormatContext *s)
> > >>              version = 4;
> > >>      }
> > >>
> > >> -    mkv->tracks = av_mallocz_array(s->nb_streams,
> sizeof(*mkv->tracks));
> > >> -    if (!mkv->tracks) {
> > >> -        return AVERROR(ENOMEM);
> > >> -    }
> > >>      ebml_header = start_ebml_master(pb, EBML_ID_HEADER,
> MAX_EBML_HEADER_SIZE);
> > >>      put_ebml_uint  (pb, EBML_ID_EBMLVERSION       ,           1);
> > >>      put_ebml_uint  (pb, EBML_ID_EBMLREADVERSION   ,           1);
> > >> @@ -2667,8 +2666,42 @@ static int webm_query_codec(enum AVCodecID
> codec_id, int std_compliance)
> > >>      return 0;
> > >>  }
> > >>
> > >> +static uint32_t mkv_get_uid(const mkv_track *tracks, int i, AVLFG *c)
> > >> +{
> > >> +    uint32_t uid;
> > >> +
> > >> +    for (int j = 0, k; j < 5; j++) {
> > >> +        uid = av_lfg_get(c);
> > >> +        if (!uid)
> > >> +            continue;
> > >> +        for (k = 0; k < i; k++) {
> > >> +            if (tracks[k].uid == uid) {
> > >> +                /* Was something wrong with our random seed? */
> > >> +                av_lfg_init(c, av_get_random_seed());
> > >> +                break;
> > >> +            }
> > >> +        }
> > >> +        if (k == i)
> > >> +            return uid;
> > >> +    }
> > >> +
> > >> +    /* Test the numbers from 1 to i. */
> > >> +    for (int j = 1, k; j < i + 1; j++) {
> > >> +        for (k = 0; k < i; k++) {
> > >> +            if (tracks[k].uid == j)
> > >> +                break;
> > >> +        }
> > >> +        if (k == i)
> > >> +            return j;
> > >> +    }
> > >> +    /* Return i + 1. It can't coincide with another uid. */
> > >> +    return i + 1;
> > >> +}
> > >> +
> > >>  static int mkv_init(struct AVFormatContext *s)
> > >>  {
> > >> +    MatroskaMuxContext *mkv = s->priv_data;
> > >> +    AVLFG c;
> > >>      int i;
> > >>
> > >>      if (s->nb_streams > MAX_TRACKS) {
> > >> @@ -2697,7 +2730,23 @@ static int mkv_init(struct AVFormatContext *s)
> > >>          s->internal->avoid_negative_ts_use_pts = 1;
> > >>      }
> > >>
> > >> +    mkv->tracks = av_mallocz_array(s->nb_streams,
> sizeof(*mkv->tracks));
> > >> +    if (!mkv->tracks) {
> > >> +        return AVERROR(ENOMEM);
> > >> +    }
> > >> +
> > >
> > >> +    if (!(s->flags & AVFMT_FLAG_BITEXACT))
> > >> +        av_lfg_init(&c, av_get_random_seed());
> > >
> > > would there be a disadvantage if the configuration of a stream / its
> content
> > > is used a seed ?
> > > That way it would be more deterministic
> > >
> > I see several disadvantages:
> > 1. If one duplicates a stream (i.e. muxes it twice in the same file),
> > they would get the same uid (unless one takes additional measures).
>
> If you took the configuration of all streams as a single seed that would
> not
> happen
>
>
> > 2. If the packet content is used, one can determine the uid only after
> > having received a packet. This would mean that one has to postpone
> > writing the Tracks element until one has a packet and the same goes
> > for the tags given that the uid is used to convey what they apply to.
> > One could solve this problem in the seekable, non-livestreaming case
> > by reserving size to be overwritten later, but this is nontrivial and
> > I'd like to avoid that.
> > 3. If the configuration record is used, then you won't get anything
> > remotely unique: Using the same encoder settings will likely produce
> > files with identical uids.
> >
> > Furthermore, this approach means that I can reuse the code for both
> > attachments as well as tracks. (See the next patch in this series.)
>
> fair enough, if its considered that avoiding some complexity is more
> important than deterministic output
>

Just to be sure: The output will be deterministic if the bitexact flag is
set.
I don't see a reason for deterministic output if it is not set.

>
> Either way if you need X bits of entropy in the muxer there should be
> X/32 calls to av_get_random_seed(). as it is there are 4 independant
> places calling av_get_random_seed() after the patch each using 32bit
> to initialize an independant LFG.
>

After the next patch, there will only be three; and usually only two of them
are used (the one in mkv_get_uid is usually not used).


> seed collisions are not at all impossible at 32bits
> also av_get_random_seed() can be expensive on some platforms so
> i think there should be a single place calling it as needed and not
> throwing the obtained entropy away.
>

I didn't deem this important performance-wise, as this is only done a few
times during init.
But if you want to, I can reuse the AVLFG from mkv_init for the segment UID.


>
> Also, why 32bit ? arent these "uint" UIDs 64bit ?
>
> Most UIDs (except the Segment UIDs) are 64bit. The reason for 32bit is
that this patchset was already pretty long, so I didn't include a patch to
make these 64bit. (There are some issues with the chapter UIDs: The
Matroska demuxer implicitly only uses the lower 32bits without making sure
that these numbers are distinct; and in an edge case the written chapter ID
can be zero despite this being against the spec: Namely if one of the
chapter IDs is INT_MIN and another one is INT_MAX.)


> and, this
> +    /* Test the numbers from 1 to i. */
> +    for (int j = 1, k; j < i + 1; j++) {
> +        for (k = 0; k < i; k++) {
> +            if (tracks[k].uid == j)
> +                break;
> +        }
> +        if (k == i)
> +            return j;
> +    }
>
> just smells "wrong"
>
> This code would only be executed if using AVLFG would repeatedly lead to
collisions;
it is just there to guarantee that the UIDs are different, but actually it
is not intended to be run.


> if you want a simple non repeating UID, just count 0,1,2,3 for each
> and put that in the unused upper 32bit for example
>
> or you can even ditch LFG and use something that findamentgally generates
> non repeating output.
> UID[i] = seed + i
> comes to mind
> or if theres a reason why that is not good then you can also do something
> like
> UID[i] = (seed + i*(seed2|1)) & 0xFFFFFFFFFFFFFFFF;
>
> but maybe theres a reason behind the more complex UID generation that
> iam missing ?
>
>  The reason I've chosen this is that this is what has already been used
for the UIDs of attachments, so I simply wanted to reuse this, while also
fixing its defects (namely that it does not ensure that the resulting UIDs
are actually distinct).

- Andreas
Michael Niedermayer Nov. 25, 2019, 1:57 p.m. UTC | #6
On Sun, Nov 24, 2019 at 11:05:03PM +0100, Andreas Rheinhardt wrote:
> On Sun, Nov 24, 2019 at 8:28 PM Michael Niedermayer <michael@niedermayer.cc>
> wrote:
> 
> > On Sun, Nov 24, 2019 at 09:08:00AM +0000, Andreas Rheinhardt wrote:
> > > Michael Niedermayer:
> > > > On Wed, Nov 06, 2019 at 03:49:02AM +0100, Andreas Rheinhardt wrote:
> > > >> Up until now, the TrackUID of a Matroska track which is supposed to be
> > > >> random was not random at all: It always coincided with the TrackNumber
> > > >> which is usually the 1-based index of the corresponding stream in the
> > > >> array of AVStreams. This has been changed: It is now set via an AVLFG
> > > >> if AVFMT_FLAG_BITEXACT is not set. Otherwise it is set like it is set
> > > >> now (the only change happens if an explicit track number has been
> > > >> choosen via dash_track_number, because the system used in the normal
> > > >> situation is now used, too). In particular, no FATE tests need to be
> > > >> updated.
> > > >>
> > > >> This also fixes a bug in case the dash_track_number option was used:
> > > >> In this case the TrackUID was set to the track number, but the tags
> > were
> > > >> written with a TagTrackUID simply based upon the index, so that the
> > tags
> > > >> didn't apply to the track they ought to apply to.
> > > >>
> > > >> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@gmail.com>
> > > >> ---
> > > >> mkv_get_uid() might be overkill, but I simply wanted to be sure that
> > > >> there are no collisions.
> > > >>
> > > >>  libavformat/matroskaenc.c | 65
> > ++++++++++++++++++++++++++++++++++-----
> > > >>  1 file changed, 57 insertions(+), 8 deletions(-)
> > > >>
> > > >> diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c
> > > >> index de57e474be..b87d15b7ff 100644
> > > >> --- a/libavformat/matroskaenc.c
> > > >> +++ b/libavformat/matroskaenc.c
> > > >> @@ -94,6 +94,7 @@ typedef struct mkv_cues {
> > > >>  typedef struct mkv_track {
> > > >>      int             write_dts;
> > > >>      int             has_cue;
> > > >> +    uint32_t        uid;
> > > >>      int             sample_rate;
> > > >>      int64_t         sample_rate_offset;
> > > >>      int64_t         last_timestamp;
> > > >> @@ -1199,8 +1200,7 @@ static int mkv_write_track(AVFormatContext *s,
> > MatroskaMuxContext *mkv,
> > > >>      track = start_ebml_master(pb, MATROSKA_ID_TRACKENTRY, 0);
> > > >>      put_ebml_uint (pb, MATROSKA_ID_TRACKNUMBER,
> > > >>                     mkv->is_dash ? mkv->dash_track_number : i + 1);
> > > >> -    put_ebml_uint (pb, MATROSKA_ID_TRACKUID,
> > > >> -                   mkv->is_dash ? mkv->dash_track_number : i + 1);
> > > >> +    put_ebml_uint (pb, MATROSKA_ID_TRACKUID, mkv->tracks[i].uid);
> > > >>      put_ebml_uint (pb, MATROSKA_ID_TRACKFLAGLACING , 0);    // no
> > lacing (yet)
> > > >>
> > > >>      if ((tag = av_dict_get(st->metadata, "title", NULL, 0)))
> > > >> @@ -1650,7 +1650,8 @@ static int mkv_write_tags(AVFormatContext *s)
> > > >>          if (!mkv_check_tag(st->metadata,
> > MATROSKA_ID_TAGTARGETS_TRACKUID))
> > > >>              continue;
> > > >>
> > > >> -        ret = mkv_write_tag(s, st->metadata,
> > MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1);
> > > >> +        ret = mkv_write_tag(s, st->metadata,
> > MATROSKA_ID_TAGTARGETS_TRACKUID,
> > > >> +                            mkv->tracks[i].uid);
> > > >>          if (ret < 0) return ret;
> > > >>      }
> > > >>
> > > >> @@ -1658,13 +1659,15 @@ static int mkv_write_tags(AVFormatContext *s)
> > > >>          for (i = 0; i < s->nb_streams; i++) {
> > > >>              AVIOContext *pb;
> > > >>              AVStream *st = s->streams[i];
> > > >> +            mkv_track *track = &mkv->tracks[i];
> > > >>              ebml_master tag_target;
> > > >>              ebml_master tag;
> > > >>
> > > >>              if (st->codecpar->codec_type == AVMEDIA_TYPE_ATTACHMENT)
> > > >>                  continue;
> > > >>
> > > >> -            mkv_write_tag_targets(s,
> > MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1, &tag_target);
> > > >> +            mkv_write_tag_targets(s, MATROSKA_ID_TAGTARGETS_TRACKUID,
> > > >> +                                  track->uid, &tag_target);
> > > >>              pb = mkv->tags_bc;
> > > >>
> > > >>              tag = start_ebml_master(pb, MATROSKA_ID_SIMPLETAG, 0);
> > > >> @@ -1863,10 +1866,6 @@ static int mkv_write_header(AVFormatContext *s)
> > > >>              version = 4;
> > > >>      }
> > > >>
> > > >> -    mkv->tracks = av_mallocz_array(s->nb_streams,
> > sizeof(*mkv->tracks));
> > > >> -    if (!mkv->tracks) {
> > > >> -        return AVERROR(ENOMEM);
> > > >> -    }
> > > >>      ebml_header = start_ebml_master(pb, EBML_ID_HEADER,
> > MAX_EBML_HEADER_SIZE);
> > > >>      put_ebml_uint  (pb, EBML_ID_EBMLVERSION       ,           1);
> > > >>      put_ebml_uint  (pb, EBML_ID_EBMLREADVERSION   ,           1);
> > > >> @@ -2667,8 +2666,42 @@ static int webm_query_codec(enum AVCodecID
> > codec_id, int std_compliance)
> > > >>      return 0;
> > > >>  }
> > > >>
> > > >> +static uint32_t mkv_get_uid(const mkv_track *tracks, int i, AVLFG *c)
> > > >> +{
> > > >> +    uint32_t uid;
> > > >> +
> > > >> +    for (int j = 0, k; j < 5; j++) {
> > > >> +        uid = av_lfg_get(c);
> > > >> +        if (!uid)
> > > >> +            continue;
> > > >> +        for (k = 0; k < i; k++) {
> > > >> +            if (tracks[k].uid == uid) {
> > > >> +                /* Was something wrong with our random seed? */
> > > >> +                av_lfg_init(c, av_get_random_seed());
> > > >> +                break;
> > > >> +            }
> > > >> +        }
> > > >> +        if (k == i)
> > > >> +            return uid;
> > > >> +    }
> > > >> +
> > > >> +    /* Test the numbers from 1 to i. */
> > > >> +    for (int j = 1, k; j < i + 1; j++) {
> > > >> +        for (k = 0; k < i; k++) {
> > > >> +            if (tracks[k].uid == j)
> > > >> +                break;
> > > >> +        }
> > > >> +        if (k == i)
> > > >> +            return j;
> > > >> +    }
> > > >> +    /* Return i + 1. It can't coincide with another uid. */
> > > >> +    return i + 1;
> > > >> +}
> > > >> +
> > > >>  static int mkv_init(struct AVFormatContext *s)
> > > >>  {
> > > >> +    MatroskaMuxContext *mkv = s->priv_data;
> > > >> +    AVLFG c;
> > > >>      int i;
> > > >>
> > > >>      if (s->nb_streams > MAX_TRACKS) {
> > > >> @@ -2697,7 +2730,23 @@ static int mkv_init(struct AVFormatContext *s)
> > > >>          s->internal->avoid_negative_ts_use_pts = 1;
> > > >>      }
> > > >>
> > > >> +    mkv->tracks = av_mallocz_array(s->nb_streams,
> > sizeof(*mkv->tracks));
> > > >> +    if (!mkv->tracks) {
> > > >> +        return AVERROR(ENOMEM);
> > > >> +    }
> > > >> +
> > > >
> > > >> +    if (!(s->flags & AVFMT_FLAG_BITEXACT))
> > > >> +        av_lfg_init(&c, av_get_random_seed());
> > > >
> > > > would there be a disadvantage if the configuration of a stream / its
> > content
> > > > is used a seed ?
> > > > That way it would be more deterministic
> > > >
> > > I see several disadvantages:
> > > 1. If one duplicates a stream (i.e. muxes it twice in the same file),
> > > they would get the same uid (unless one takes additional measures).
> >
> > If you took the configuration of all streams as a single seed that would
> > not
> > happen
> >
> >
> > > 2. If the packet content is used, one can determine the uid only after
> > > having received a packet. This would mean that one has to postpone
> > > writing the Tracks element until one has a packet and the same goes
> > > for the tags given that the uid is used to convey what they apply to.
> > > One could solve this problem in the seekable, non-livestreaming case
> > > by reserving size to be overwritten later, but this is nontrivial and
> > > I'd like to avoid that.
> > > 3. If the configuration record is used, then you won't get anything
> > > remotely unique: Using the same encoder settings will likely produce
> > > files with identical uids.
> > >
> > > Furthermore, this approach means that I can reuse the code for both
> > > attachments as well as tracks. (See the next patch in this series.)
> >
> > fair enough, if its considered that avoiding some complexity is more
> > important than deterministic output
> >
> 
> Just to be sure: The output will be deterministic if the bitexact flag is
> set.
> I don't see a reason for deterministic output if it is not set.

the bitexact flag is used for multiple things
1. produce the same output between platforms and minor code changes
2. produce the same output when running the exact same binary with the exact
   same input

This has a few problems
code pathes disabled because of 1, cannot be tested easily because
if bitexact is enabled the code to be tested is disabled
if bitexact is disabled random values are put in the file making it not
as easy to see if the code under test produces the same results each time

Another problem is, obscuring problems, because
if with a deterministic binary you get different output on 2 runs then you
know you have a bug, a race condition or maybe a uninitialized variable or
such.
But with random values stored all over this is no longer vissible unless
one uses bitexact which most people dont do by default
so this could cause a decrease in chance detections of some bugs


[...]
> > seed collisions are not at all impossible at 32bits
> > also av_get_random_seed() can be expensive on some platforms so
> > i think there should be a single place calling it as needed and not
> > throwing the obtained entropy away.
> >
> 
> I didn't deem this important performance-wise, as this is only done a few
> times during init.
> But if you want to, I can reuse the AVLFG from mkv_init for the segment UID.

get_generic_seed() is not really a fast function so when its used IIRC
on some platforms its not really fast.
And if in a usecase someone remuxes MANY small matroska files, i would
suspect this could be noticable


[...]
> 
> 
> > and, this
> > +    /* Test the numbers from 1 to i. */
> > +    for (int j = 1, k; j < i + 1; j++) {
> > +        for (k = 0; k < i; k++) {
> > +            if (tracks[k].uid == j)
> > +                break;
> > +        }
> > +        if (k == i)
> > +            return j;
> > +    }
> >
> > just smells "wrong"
> >
> > This code would only be executed if using AVLFG would repeatedly lead to
> collisions;

yes but this doesnt smell right
if your random number generator produces no random numbers i dont think
adding +1 is the solution.
I mean if that really happens, theres some bug either in LFG the seed generation
or in how it is used.


> it is just there to guarantee that the UIDs are different, but actually it
> is not intended to be run.
> 
> 
> > if you want a simple non repeating UID, just count 0,1,2,3 for each
> > and put that in the unused upper 32bit for example
> >
> > or you can even ditch LFG and use something that findamentgally generates
> > non repeating output.
> > UID[i] = seed + i
> > comes to mind
> > or if theres a reason why that is not good then you can also do something
> > like
> > UID[i] = (seed + i*(seed2|1)) & 0xFFFFFFFFFFFFFFFF;
> >
> > but maybe theres a reason behind the more complex UID generation that
> > iam missing ?
> >
> >  The reason I've chosen this is that this is what has already been used
> for the UIDs of attachments, so I simply wanted to reuse this, while also
> fixing its defects (namely that it does not ensure that the resulting UIDs
> are actually distinct).

sure it all should be using the same but i am not sure what is used is a good
choice as it requires these extra duplicate checks

thanks

[...]
Andreas Rheinhardt Nov. 27, 2019, 7:25 a.m. UTC | #7
On Mon, Nov 25, 2019 at 2:57 PM Michael Niedermayer <michael@niedermayer.cc>
wrote:

> On Sun, Nov 24, 2019 at 11:05:03PM +0100, Andreas Rheinhardt wrote:
> > On Sun, Nov 24, 2019 at 8:28 PM Michael Niedermayer
> <michael@niedermayer.cc>
> > wrote:
> >
> > > On Sun, Nov 24, 2019 at 09:08:00AM +0000, Andreas Rheinhardt wrote:
> > > > Michael Niedermayer:
> > > > > On Wed, Nov 06, 2019 at 03:49:02AM +0100, Andreas Rheinhardt wrote:
> > > > >> Up until now, the TrackUID of a Matroska track which is supposed
> to be
> > > > >> random was not random at all: It always coincided with the
> TrackNumber
> > > > >> which is usually the 1-based index of the corresponding stream in
> the
> > > > >> array of AVStreams. This has been changed: It is now set via an
> AVLFG
> > > > >> if AVFMT_FLAG_BITEXACT is not set. Otherwise it is set like it is
> set
> > > > >> now (the only change happens if an explicit track number has been
> > > > >> choosen via dash_track_number, because the system used in the
> normal
> > > > >> situation is now used, too). In particular, no FATE tests need to
> be
> > > > >> updated.
> > > > >>
> > > > >> This also fixes a bug in case the dash_track_number option was
> used:
> > > > >> In this case the TrackUID was set to the track number, but the
> tags
> > > were
> > > > >> written with a TagTrackUID simply based upon the index, so that
> the
> > > tags
> > > > >> didn't apply to the track they ought to apply to.
> > > > >>
> > > > >> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@gmail.com>
> > > > >> ---
> > > > >> mkv_get_uid() might be overkill, but I simply wanted to be sure
> that
> > > > >> there are no collisions.
> > > > >>
> > > > >>  libavformat/matroskaenc.c | 65
> > > ++++++++++++++++++++++++++++++++++-----
> > > > >>  1 file changed, 57 insertions(+), 8 deletions(-)
> > > > >>
> > > > >> diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c
> > > > >> index de57e474be..b87d15b7ff 100644
> > > > >> --- a/libavformat/matroskaenc.c
> > > > >> +++ b/libavformat/matroskaenc.c
> > > > >> @@ -94,6 +94,7 @@ typedef struct mkv_cues {
> > > > >>  typedef struct mkv_track {
> > > > >>      int             write_dts;
> > > > >>      int             has_cue;
> > > > >> +    uint32_t        uid;
> > > > >>      int             sample_rate;
> > > > >>      int64_t         sample_rate_offset;
> > > > >>      int64_t         last_timestamp;
> > > > >> @@ -1199,8 +1200,7 @@ static int mkv_write_track(AVFormatContext
> *s,
> > > MatroskaMuxContext *mkv,
> > > > >>      track = start_ebml_master(pb, MATROSKA_ID_TRACKENTRY, 0);
> > > > >>      put_ebml_uint (pb, MATROSKA_ID_TRACKNUMBER,
> > > > >>                     mkv->is_dash ? mkv->dash_track_number : i +
> 1);
> > > > >> -    put_ebml_uint (pb, MATROSKA_ID_TRACKUID,
> > > > >> -                   mkv->is_dash ? mkv->dash_track_number : i +
> 1);
> > > > >> +    put_ebml_uint (pb, MATROSKA_ID_TRACKUID, mkv->tracks[i].uid);
> > > > >>      put_ebml_uint (pb, MATROSKA_ID_TRACKFLAGLACING , 0);    // no
> > > lacing (yet)
> > > > >>
> > > > >>      if ((tag = av_dict_get(st->metadata, "title", NULL, 0)))
> > > > >> @@ -1650,7 +1650,8 @@ static int mkv_write_tags(AVFormatContext
> *s)
> > > > >>          if (!mkv_check_tag(st->metadata,
> > > MATROSKA_ID_TAGTARGETS_TRACKUID))
> > > > >>              continue;
> > > > >>
> > > > >> -        ret = mkv_write_tag(s, st->metadata,
> > > MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1);
> > > > >> +        ret = mkv_write_tag(s, st->metadata,
> > > MATROSKA_ID_TAGTARGETS_TRACKUID,
> > > > >> +                            mkv->tracks[i].uid);
> > > > >>          if (ret < 0) return ret;
> > > > >>      }
> > > > >>
> > > > >> @@ -1658,13 +1659,15 @@ static int mkv_write_tags(AVFormatContext
> *s)
> > > > >>          for (i = 0; i < s->nb_streams; i++) {
> > > > >>              AVIOContext *pb;
> > > > >>              AVStream *st = s->streams[i];
> > > > >> +            mkv_track *track = &mkv->tracks[i];
> > > > >>              ebml_master tag_target;
> > > > >>              ebml_master tag;
> > > > >>
> > > > >>              if (st->codecpar->codec_type ==
> AVMEDIA_TYPE_ATTACHMENT)
> > > > >>                  continue;
> > > > >>
> > > > >> -            mkv_write_tag_targets(s,
> > > MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1, &tag_target);
> > > > >> +            mkv_write_tag_targets(s,
> MATROSKA_ID_TAGTARGETS_TRACKUID,
> > > > >> +                                  track->uid, &tag_target);
> > > > >>              pb = mkv->tags_bc;
> > > > >>
> > > > >>              tag = start_ebml_master(pb, MATROSKA_ID_SIMPLETAG,
> 0);
> > > > >> @@ -1863,10 +1866,6 @@ static int
> mkv_write_header(AVFormatContext *s)
> > > > >>              version = 4;
> > > > >>      }
> > > > >>
> > > > >> -    mkv->tracks = av_mallocz_array(s->nb_streams,
> > > sizeof(*mkv->tracks));
> > > > >> -    if (!mkv->tracks) {
> > > > >> -        return AVERROR(ENOMEM);
> > > > >> -    }
> > > > >>      ebml_header = start_ebml_master(pb, EBML_ID_HEADER,
> > > MAX_EBML_HEADER_SIZE);
> > > > >>      put_ebml_uint  (pb, EBML_ID_EBMLVERSION       ,           1);
> > > > >>      put_ebml_uint  (pb, EBML_ID_EBMLREADVERSION   ,           1);
> > > > >> @@ -2667,8 +2666,42 @@ static int webm_query_codec(enum AVCodecID
> > > codec_id, int std_compliance)
> > > > >>      return 0;
> > > > >>  }
> > > > >>
> > > > >> +static uint32_t mkv_get_uid(const mkv_track *tracks, int i,
> AVLFG *c)
> > > > >> +{
> > > > >> +    uint32_t uid;
> > > > >> +
> > > > >> +    for (int j = 0, k; j < 5; j++) {
> > > > >> +        uid = av_lfg_get(c);
> > > > >> +        if (!uid)
> > > > >> +            continue;
> > > > >> +        for (k = 0; k < i; k++) {
> > > > >> +            if (tracks[k].uid == uid) {
> > > > >> +                /* Was something wrong with our random seed? */
> > > > >> +                av_lfg_init(c, av_get_random_seed());
> > > > >> +                break;
> > > > >> +            }
> > > > >> +        }
> > > > >> +        if (k == i)
> > > > >> +            return uid;
> > > > >> +    }
> > > > >> +
> > > > >> +    /* Test the numbers from 1 to i. */
> > > > >> +    for (int j = 1, k; j < i + 1; j++) {
> > > > >> +        for (k = 0; k < i; k++) {
> > > > >> +            if (tracks[k].uid == j)
> > > > >> +                break;
> > > > >> +        }
> > > > >> +        if (k == i)
> > > > >> +            return j;
> > > > >> +    }
> > > > >> +    /* Return i + 1. It can't coincide with another uid. */
> > > > >> +    return i + 1;
> > > > >> +}
> > > > >> +
> > > > >>  static int mkv_init(struct AVFormatContext *s)
> > > > >>  {
> > > > >> +    MatroskaMuxContext *mkv = s->priv_data;
> > > > >> +    AVLFG c;
> > > > >>      int i;
> > > > >>
> > > > >>      if (s->nb_streams > MAX_TRACKS) {
> > > > >> @@ -2697,7 +2730,23 @@ static int mkv_init(struct AVFormatContext
> *s)
> > > > >>          s->internal->avoid_negative_ts_use_pts = 1;
> > > > >>      }
> > > > >>
> > > > >> +    mkv->tracks = av_mallocz_array(s->nb_streams,
> > > sizeof(*mkv->tracks));
> > > > >> +    if (!mkv->tracks) {
> > > > >> +        return AVERROR(ENOMEM);
> > > > >> +    }
> > > > >> +
> > > > >
> > > > >> +    if (!(s->flags & AVFMT_FLAG_BITEXACT))
> > > > >> +        av_lfg_init(&c, av_get_random_seed());
> > > > >
> > > > > would there be a disadvantage if the configuration of a stream /
> its
> > > content
> > > > > is used a seed ?
> > > > > That way it would be more deterministic
> > > > >
> > > > I see several disadvantages:
> > > > 1. If one duplicates a stream (i.e. muxes it twice in the same file),
> > > > they would get the same uid (unless one takes additional measures).
> > >
> > > If you took the configuration of all streams as a single seed that
> would
> > > not
> > > happen
> > >
> > >
> > > > 2. If the packet content is used, one can determine the uid only
> after
> > > > having received a packet. This would mean that one has to postpone
> > > > writing the Tracks element until one has a packet and the same goes
> > > > for the tags given that the uid is used to convey what they apply to.
> > > > One could solve this problem in the seekable, non-livestreaming case
> > > > by reserving size to be overwritten later, but this is nontrivial and
> > > > I'd like to avoid that.
> > > > 3. If the configuration record is used, then you won't get anything
> > > > remotely unique: Using the same encoder settings will likely produce
> > > > files with identical uids.
> > > >
> > > > Furthermore, this approach means that I can reuse the code for both
> > > > attachments as well as tracks. (See the next patch in this series.)
> > >
> > > fair enough, if its considered that avoiding some complexity is more
> > > important than deterministic output
> > >
> >
> > Just to be sure: The output will be deterministic if the bitexact flag is
> > set.
> > I don't see a reason for deterministic output if it is not set.
>
> the bitexact flag is used for multiple things
> 1. produce the same output between platforms and minor code changes
> 2. produce the same output when running the exact same binary with the
> exact
>    same input
>
> This has a few problems
> code pathes disabled because of 1, cannot be tested easily because
> if bitexact is enabled the code to be tested is disabled
> if bitexact is disabled random values are put in the file making it not
> as easy to see if the code under test produces the same results each time
>
> Another problem is, obscuring problems, because
> if with a deterministic binary you get different output on 2 runs then you
> know you have a bug, a race condition or maybe a uninitialized variable or
> such.
> But with random values stored all over this is no longer vissible unless
> one uses bitexact which most people dont do by default
> so this could cause a decrease in chance detections of some bugs
>
> The Matroska muxer only behaves differently when in bitexact mode in two
ways: The UIDs are chosen deterministically (or in case of the SegmentUID:
not written at all) and certain strings that usually contain the
libavformat version no longer do so. All these things are in the file
header. There are no (and won't be) random values stored all over the
place. The code paths are very small and so I don't think that they will
obscure any bug.

I have also added a patch to store the UIDs that can be random on a fixed
size (namely eight, because the UIDs are now eight bytes long). This even
ensures that there is no offset/size difference between the output of two
files both produced with the same settings in non-bitexact mode (this can
currently happen with attachments if the upper byte of the random UID
vanishes). The differences between these files will be confined to the
header (the UIDs as well as the CRC-32 elements of the level 1-master
elements that contain the UIDs).


>
> [...]
> > > seed collisions are not at all impossible at 32bits
> > > also av_get_random_seed() can be expensive on some platforms so
> > > i think there should be a single place calling it as needed and not
> > > throwing the obtained entropy away.
> > >
> >
> > I didn't deem this important performance-wise, as this is only done a few
> > times during init.
> > But if you want to, I can reuse the AVLFG from mkv_init for the segment
> UID.
>
> get_generic_seed() is not really a fast function so when its used IIRC
> on some platforms its not really fast.
> And if in a usecase someone remuxes MANY small matroska files, i would
> suspect this could be noticable
>
> I changed the code so that av_get_random_seed() is only called once. (This
of course clashes with your "Either way if you need X bits of entropy in
the muxer there should be X/32 calls to av_get_random_seed().")


>
> [...]
> >
> >
> > > and, this
> > > +    /* Test the numbers from 1 to i. */
> > > +    for (int j = 1, k; j < i + 1; j++) {
> > > +        for (k = 0; k < i; k++) {
> > > +            if (tracks[k].uid == j)
> > > +                break;
> > > +        }
> > > +        if (k == i)
> > > +            return j;
> > > +    }
> > >
> > > just smells "wrong"
> > >
> > > This code would only be executed if using AVLFG would repeatedly lead
> to
> > collisions;
>
> yes but this doesnt smell right
> if your random number generator produces no random numbers i dont think
> adding +1 is the solution.
> I mean if that really happens, theres some bug either in LFG the seed
> generation
> or in how it is used.
>
> Yes, this code was only intended to be run if the random number generator
is completely buggy. But it seems that this can unfortunately not ruled out
completely (
https://www.phoronix.com/scan.php?page=news_item&px=AMD-RdRand-Disable-15h-16h
).

Anyway, I have now changed the last part a bit so that it does not simply
count.

>
> > it is just there to guarantee that the UIDs are different, but actually
> it
> > is not intended to be run.
> >
> >
> > > if you want a simple non repeating UID, just count 0,1,2,3 for each
> > > and put that in the unused upper 32bit for example
> > >
> > > or you can even ditch LFG and use something that findamentgally
> generates
> > > non repeating output.
> > > UID[i] = seed + i
> > > comes to mind
> > > or if theres a reason why that is not good then you can also do
> something
> > > like
> > > UID[i] = (seed + i*(seed2|1)) & 0xFFFFFFFFFFFFFFFF;
> > >
> > > but maybe theres a reason behind the more complex UID generation that
> > > iam missing ?
> > >
> > >  The reason I've chosen this is that this is what has already been used
> > for the UIDs of attachments, so I simply wanted to reuse this, while also
> > fixing its defects (namely that it does not ensure that the resulting
> UIDs
> > are actually distinct).
>
> sure it all should be using the same but i am not sure what is used is a
> good
> choice as it requires these extra duplicate checks
>
> I don't think that this code is either complicated or extensive.


I hope I have addressed all your concerns.

- Andreas
Michael Niedermayer Nov. 27, 2019, 3:50 p.m. UTC | #8
On Wed, Nov 27, 2019 at 08:25:13AM +0100, Andreas Rheinhardt wrote:
> On Mon, Nov 25, 2019 at 2:57 PM Michael Niedermayer <michael@niedermayer.cc>
> wrote:
> 
> > On Sun, Nov 24, 2019 at 11:05:03PM +0100, Andreas Rheinhardt wrote:
> > > On Sun, Nov 24, 2019 at 8:28 PM Michael Niedermayer
> > <michael@niedermayer.cc>
> > > wrote:
> > >
> > > > On Sun, Nov 24, 2019 at 09:08:00AM +0000, Andreas Rheinhardt wrote:
> > > > > Michael Niedermayer:
> > > > > > On Wed, Nov 06, 2019 at 03:49:02AM +0100, Andreas Rheinhardt wrote:
> > > > > >> Up until now, the TrackUID of a Matroska track which is supposed
> > to be
> > > > > >> random was not random at all: It always coincided with the
> > TrackNumber
> > > > > >> which is usually the 1-based index of the corresponding stream in
> > the
> > > > > >> array of AVStreams. This has been changed: It is now set via an
> > AVLFG
> > > > > >> if AVFMT_FLAG_BITEXACT is not set. Otherwise it is set like it is
> > set
> > > > > >> now (the only change happens if an explicit track number has been
> > > > > >> choosen via dash_track_number, because the system used in the
> > normal
> > > > > >> situation is now used, too). In particular, no FATE tests need to
> > be
> > > > > >> updated.
> > > > > >>
> > > > > >> This also fixes a bug in case the dash_track_number option was
> > used:
> > > > > >> In this case the TrackUID was set to the track number, but the
> > tags
> > > > were
> > > > > >> written with a TagTrackUID simply based upon the index, so that
> > the
> > > > tags
> > > > > >> didn't apply to the track they ought to apply to.
> > > > > >>
> > > > > >> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@gmail.com>
> > > > > >> ---
> > > > > >> mkv_get_uid() might be overkill, but I simply wanted to be sure
> > that
> > > > > >> there are no collisions.
> > > > > >>
> > > > > >>  libavformat/matroskaenc.c | 65
> > > > ++++++++++++++++++++++++++++++++++-----
> > > > > >>  1 file changed, 57 insertions(+), 8 deletions(-)
> > > > > >>
> > > > > >> diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c
> > > > > >> index de57e474be..b87d15b7ff 100644
> > > > > >> --- a/libavformat/matroskaenc.c
> > > > > >> +++ b/libavformat/matroskaenc.c
> > > > > >> @@ -94,6 +94,7 @@ typedef struct mkv_cues {
> > > > > >>  typedef struct mkv_track {
> > > > > >>      int             write_dts;
> > > > > >>      int             has_cue;
> > > > > >> +    uint32_t        uid;
> > > > > >>      int             sample_rate;
> > > > > >>      int64_t         sample_rate_offset;
> > > > > >>      int64_t         last_timestamp;
> > > > > >> @@ -1199,8 +1200,7 @@ static int mkv_write_track(AVFormatContext
> > *s,
> > > > MatroskaMuxContext *mkv,
> > > > > >>      track = start_ebml_master(pb, MATROSKA_ID_TRACKENTRY, 0);
> > > > > >>      put_ebml_uint (pb, MATROSKA_ID_TRACKNUMBER,
> > > > > >>                     mkv->is_dash ? mkv->dash_track_number : i +
> > 1);
> > > > > >> -    put_ebml_uint (pb, MATROSKA_ID_TRACKUID,
> > > > > >> -                   mkv->is_dash ? mkv->dash_track_number : i +
> > 1);
> > > > > >> +    put_ebml_uint (pb, MATROSKA_ID_TRACKUID, mkv->tracks[i].uid);
> > > > > >>      put_ebml_uint (pb, MATROSKA_ID_TRACKFLAGLACING , 0);    // no
> > > > lacing (yet)
> > > > > >>
> > > > > >>      if ((tag = av_dict_get(st->metadata, "title", NULL, 0)))
> > > > > >> @@ -1650,7 +1650,8 @@ static int mkv_write_tags(AVFormatContext
> > *s)
> > > > > >>          if (!mkv_check_tag(st->metadata,
> > > > MATROSKA_ID_TAGTARGETS_TRACKUID))
> > > > > >>              continue;
> > > > > >>
> > > > > >> -        ret = mkv_write_tag(s, st->metadata,
> > > > MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1);
> > > > > >> +        ret = mkv_write_tag(s, st->metadata,
> > > > MATROSKA_ID_TAGTARGETS_TRACKUID,
> > > > > >> +                            mkv->tracks[i].uid);
> > > > > >>          if (ret < 0) return ret;
> > > > > >>      }
> > > > > >>
> > > > > >> @@ -1658,13 +1659,15 @@ static int mkv_write_tags(AVFormatContext
> > *s)
> > > > > >>          for (i = 0; i < s->nb_streams; i++) {
> > > > > >>              AVIOContext *pb;
> > > > > >>              AVStream *st = s->streams[i];
> > > > > >> +            mkv_track *track = &mkv->tracks[i];
> > > > > >>              ebml_master tag_target;
> > > > > >>              ebml_master tag;
> > > > > >>
> > > > > >>              if (st->codecpar->codec_type ==
> > AVMEDIA_TYPE_ATTACHMENT)
> > > > > >>                  continue;
> > > > > >>
> > > > > >> -            mkv_write_tag_targets(s,
> > > > MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1, &tag_target);
> > > > > >> +            mkv_write_tag_targets(s,
> > MATROSKA_ID_TAGTARGETS_TRACKUID,
> > > > > >> +                                  track->uid, &tag_target);
> > > > > >>              pb = mkv->tags_bc;
> > > > > >>
> > > > > >>              tag = start_ebml_master(pb, MATROSKA_ID_SIMPLETAG,
> > 0);
> > > > > >> @@ -1863,10 +1866,6 @@ static int
> > mkv_write_header(AVFormatContext *s)
> > > > > >>              version = 4;
> > > > > >>      }
> > > > > >>
> > > > > >> -    mkv->tracks = av_mallocz_array(s->nb_streams,
> > > > sizeof(*mkv->tracks));
> > > > > >> -    if (!mkv->tracks) {
> > > > > >> -        return AVERROR(ENOMEM);
> > > > > >> -    }
> > > > > >>      ebml_header = start_ebml_master(pb, EBML_ID_HEADER,
> > > > MAX_EBML_HEADER_SIZE);
> > > > > >>      put_ebml_uint  (pb, EBML_ID_EBMLVERSION       ,           1);
> > > > > >>      put_ebml_uint  (pb, EBML_ID_EBMLREADVERSION   ,           1);
> > > > > >> @@ -2667,8 +2666,42 @@ static int webm_query_codec(enum AVCodecID
> > > > codec_id, int std_compliance)
> > > > > >>      return 0;
> > > > > >>  }
> > > > > >>
> > > > > >> +static uint32_t mkv_get_uid(const mkv_track *tracks, int i,
> > AVLFG *c)
> > > > > >> +{
> > > > > >> +    uint32_t uid;
> > > > > >> +
> > > > > >> +    for (int j = 0, k; j < 5; j++) {
> > > > > >> +        uid = av_lfg_get(c);
> > > > > >> +        if (!uid)
> > > > > >> +            continue;
> > > > > >> +        for (k = 0; k < i; k++) {
> > > > > >> +            if (tracks[k].uid == uid) {
> > > > > >> +                /* Was something wrong with our random seed? */
> > > > > >> +                av_lfg_init(c, av_get_random_seed());
> > > > > >> +                break;
> > > > > >> +            }
> > > > > >> +        }
> > > > > >> +        if (k == i)
> > > > > >> +            return uid;
> > > > > >> +    }
> > > > > >> +
> > > > > >> +    /* Test the numbers from 1 to i. */
> > > > > >> +    for (int j = 1, k; j < i + 1; j++) {
> > > > > >> +        for (k = 0; k < i; k++) {
> > > > > >> +            if (tracks[k].uid == j)
> > > > > >> +                break;
> > > > > >> +        }
> > > > > >> +        if (k == i)
> > > > > >> +            return j;
> > > > > >> +    }
> > > > > >> +    /* Return i + 1. It can't coincide with another uid. */
> > > > > >> +    return i + 1;
> > > > > >> +}
> > > > > >> +
> > > > > >>  static int mkv_init(struct AVFormatContext *s)
> > > > > >>  {
> > > > > >> +    MatroskaMuxContext *mkv = s->priv_data;
> > > > > >> +    AVLFG c;
> > > > > >>      int i;
> > > > > >>
> > > > > >>      if (s->nb_streams > MAX_TRACKS) {
> > > > > >> @@ -2697,7 +2730,23 @@ static int mkv_init(struct AVFormatContext
> > *s)
> > > > > >>          s->internal->avoid_negative_ts_use_pts = 1;
> > > > > >>      }
> > > > > >>
> > > > > >> +    mkv->tracks = av_mallocz_array(s->nb_streams,
> > > > sizeof(*mkv->tracks));
> > > > > >> +    if (!mkv->tracks) {
> > > > > >> +        return AVERROR(ENOMEM);
> > > > > >> +    }
> > > > > >> +
> > > > > >
> > > > > >> +    if (!(s->flags & AVFMT_FLAG_BITEXACT))
> > > > > >> +        av_lfg_init(&c, av_get_random_seed());
> > > > > >
> > > > > > would there be a disadvantage if the configuration of a stream /
> > its
> > > > content
> > > > > > is used a seed ?
> > > > > > That way it would be more deterministic
> > > > > >
> > > > > I see several disadvantages:
> > > > > 1. If one duplicates a stream (i.e. muxes it twice in the same file),
> > > > > they would get the same uid (unless one takes additional measures).
> > > >
> > > > If you took the configuration of all streams as a single seed that
> > would
> > > > not
> > > > happen
> > > >
> > > >
> > > > > 2. If the packet content is used, one can determine the uid only
> > after
> > > > > having received a packet. This would mean that one has to postpone
> > > > > writing the Tracks element until one has a packet and the same goes
> > > > > for the tags given that the uid is used to convey what they apply to.
> > > > > One could solve this problem in the seekable, non-livestreaming case
> > > > > by reserving size to be overwritten later, but this is nontrivial and
> > > > > I'd like to avoid that.
> > > > > 3. If the configuration record is used, then you won't get anything
> > > > > remotely unique: Using the same encoder settings will likely produce
> > > > > files with identical uids.
> > > > >
> > > > > Furthermore, this approach means that I can reuse the code for both
> > > > > attachments as well as tracks. (See the next patch in this series.)
> > > >
> > > > fair enough, if its considered that avoiding some complexity is more
> > > > important than deterministic output
> > > >
> > >
> > > Just to be sure: The output will be deterministic if the bitexact flag is
> > > set.
> > > I don't see a reason for deterministic output if it is not set.
> >
> > the bitexact flag is used for multiple things
> > 1. produce the same output between platforms and minor code changes
> > 2. produce the same output when running the exact same binary with the
> > exact
> >    same input
> >
> > This has a few problems
> > code pathes disabled because of 1, cannot be tested easily because
> > if bitexact is enabled the code to be tested is disabled
> > if bitexact is disabled random values are put in the file making it not
> > as easy to see if the code under test produces the same results each time
> >
> > Another problem is, obscuring problems, because
> > if with a deterministic binary you get different output on 2 runs then you
> > know you have a bug, a race condition or maybe a uninitialized variable or
> > such.
> > But with random values stored all over this is no longer vissible unless
> > one uses bitexact which most people dont do by default
> > so this could cause a decrease in chance detections of some bugs
> >
> > The Matroska muxer only behaves differently when in bitexact mode in two
> ways: The UIDs are chosen deterministically (or in case of the SegmentUID:
> not written at all) and certain strings that usually contain the
> libavformat version no longer do so. All these things are in the file
> header. There are no (and won't be) random values stored all over the
> place. The code paths are very small and so I don't think that they will
> obscure any bug.

If a user encodes the same input twice with the same options and
as its popular uses mkv as output and then notices that the files
are different then with deterministic components she can conclude
there was a bug. With randomly seeded UIDs that conclusion is not
possible anymore.
Retesting with bitexact in this case also doesnt help, the bug
might not occur during 2 bitexact retests

Iam not saying thats enough of a problem to change something but what
i mean is that its a real thing not have absolutely 0 disadvantage.


[...]
> >
> > [...]
> > >
> > >
> > > > and, this
> > > > +    /* Test the numbers from 1 to i. */
> > > > +    for (int j = 1, k; j < i + 1; j++) {
> > > > +        for (k = 0; k < i; k++) {
> > > > +            if (tracks[k].uid == j)
> > > > +                break;
> > > > +        }
> > > > +        if (k == i)
> > > > +            return j;
> > > > +    }
> > > >
> > > > just smells "wrong"
> > > >
> > > > This code would only be executed if using AVLFG would repeatedly lead
> > to
> > > collisions;
> >
> > yes but this doesnt smell right
> > if your random number generator produces no random numbers i dont think
> > adding +1 is the solution.
> > I mean if that really happens, theres some bug either in LFG the seed
> > generation
> > or in how it is used.
> >
> > Yes, this code was only intended to be run if the random number generator
> is completely buggy. But it seems that this can unfortunately not ruled out
> completely (

> https://www.phoronix.com/scan.php?page=news_item&px=AMD-RdRand-Disable-15h-16h
> ).

The fix to this does not belong into a caller of av_get_random_seed()
if thats fixed / worked around in CPU, Bios, OS, or in av_get_random_seed()
i dont know but definitly not the user of av_get_random_seed().
If av_get_random_seed() is bad thats bad and needs to be fixed

Also lets assume av_get_random_seed() is bad and returns -1 always
or another constant.
The code still must not fail like this. the seeded PRNG must still 
function properly and produce a few differentg values.
If its statistics are 100% duplicates thats a failure of the PRNG.

[...]

Thanks
Andreas Rheinhardt Nov. 27, 2019, 4:57 p.m. UTC | #9
On Wed, Nov 27, 2019 at 4:50 PM Michael Niedermayer <michael@niedermayer.cc>
wrote:

> On Wed, Nov 27, 2019 at 08:25:13AM +0100, Andreas Rheinhardt wrote:
> > On Mon, Nov 25, 2019 at 2:57 PM Michael Niedermayer
> <michael@niedermayer.cc>
> > wrote:
> >
> > > On Sun, Nov 24, 2019 at 11:05:03PM +0100, Andreas Rheinhardt wrote:
> > > > On Sun, Nov 24, 2019 at 8:28 PM Michael Niedermayer
> > > <michael@niedermayer.cc>
> > > > wrote:
> > > >
> > > > > On Sun, Nov 24, 2019 at 09:08:00AM +0000, Andreas Rheinhardt wrote:
> > > > > > Michael Niedermayer:
> > > > > > > On Wed, Nov 06, 2019 at 03:49:02AM +0100, Andreas Rheinhardt
> wrote:
> > > > > > >> Up until now, the TrackUID of a Matroska track which is
> supposed
> > > to be
> > > > > > >> random was not random at all: It always coincided with the
> > > TrackNumber
> > > > > > >> which is usually the 1-based index of the corresponding
> stream in
> > > the
> > > > > > >> array of AVStreams. This has been changed: It is now set via
> an
> > > AVLFG
> > > > > > >> if AVFMT_FLAG_BITEXACT is not set. Otherwise it is set like
> it is
> > > set
> > > > > > >> now (the only change happens if an explicit track number has
> been
> > > > > > >> choosen via dash_track_number, because the system used in the
> > > normal
> > > > > > >> situation is now used, too). In particular, no FATE tests
> need to
> > > be
> > > > > > >> updated.
> > > > > > >>
> > > > > > >> This also fixes a bug in case the dash_track_number option was
> > > used:
> > > > > > >> In this case the TrackUID was set to the track number, but the
> > > tags
> > > > > were
> > > > > > >> written with a TagTrackUID simply based upon the index, so
> that
> > > the
> > > > > tags
> > > > > > >> didn't apply to the track they ought to apply to.
> > > > > > >>
> > > > > > >> Signed-off-by: Andreas Rheinhardt <
> andreas.rheinhardt@gmail.com>
> > > > > > >> ---
> > > > > > >> mkv_get_uid() might be overkill, but I simply wanted to be
> sure
> > > that
> > > > > > >> there are no collisions.
> > > > > > >>
> > > > > > >>  libavformat/matroskaenc.c | 65
> > > > > ++++++++++++++++++++++++++++++++++-----
> > > > > > >>  1 file changed, 57 insertions(+), 8 deletions(-)
> > > > > > >>
> > > > > > >> diff --git a/libavformat/matroskaenc.c
> b/libavformat/matroskaenc.c
> > > > > > >> index de57e474be..b87d15b7ff 100644
> > > > > > >> --- a/libavformat/matroskaenc.c
> > > > > > >> +++ b/libavformat/matroskaenc.c
> > > > > > >> @@ -94,6 +94,7 @@ typedef struct mkv_cues {
> > > > > > >>  typedef struct mkv_track {
> > > > > > >>      int             write_dts;
> > > > > > >>      int             has_cue;
> > > > > > >> +    uint32_t        uid;
> > > > > > >>      int             sample_rate;
> > > > > > >>      int64_t         sample_rate_offset;
> > > > > > >>      int64_t         last_timestamp;
> > > > > > >> @@ -1199,8 +1200,7 @@ static int
> mkv_write_track(AVFormatContext
> > > *s,
> > > > > MatroskaMuxContext *mkv,
> > > > > > >>      track = start_ebml_master(pb, MATROSKA_ID_TRACKENTRY, 0);
> > > > > > >>      put_ebml_uint (pb, MATROSKA_ID_TRACKNUMBER,
> > > > > > >>                     mkv->is_dash ? mkv->dash_track_number : i
> +
> > > 1);
> > > > > > >> -    put_ebml_uint (pb, MATROSKA_ID_TRACKUID,
> > > > > > >> -                   mkv->is_dash ? mkv->dash_track_number : i
> +
> > > 1);
> > > > > > >> +    put_ebml_uint (pb, MATROSKA_ID_TRACKUID,
> mkv->tracks[i].uid);
> > > > > > >>      put_ebml_uint (pb, MATROSKA_ID_TRACKFLAGLACING , 0);
> // no
> > > > > lacing (yet)
> > > > > > >>
> > > > > > >>      if ((tag = av_dict_get(st->metadata, "title", NULL, 0)))
> > > > > > >> @@ -1650,7 +1650,8 @@ static int
> mkv_write_tags(AVFormatContext
> > > *s)
> > > > > > >>          if (!mkv_check_tag(st->metadata,
> > > > > MATROSKA_ID_TAGTARGETS_TRACKUID))
> > > > > > >>              continue;
> > > > > > >>
> > > > > > >> -        ret = mkv_write_tag(s, st->metadata,
> > > > > MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1);
> > > > > > >> +        ret = mkv_write_tag(s, st->metadata,
> > > > > MATROSKA_ID_TAGTARGETS_TRACKUID,
> > > > > > >> +                            mkv->tracks[i].uid);
> > > > > > >>          if (ret < 0) return ret;
> > > > > > >>      }
> > > > > > >>
> > > > > > >> @@ -1658,13 +1659,15 @@ static int
> mkv_write_tags(AVFormatContext
> > > *s)
> > > > > > >>          for (i = 0; i < s->nb_streams; i++) {
> > > > > > >>              AVIOContext *pb;
> > > > > > >>              AVStream *st = s->streams[i];
> > > > > > >> +            mkv_track *track = &mkv->tracks[i];
> > > > > > >>              ebml_master tag_target;
> > > > > > >>              ebml_master tag;
> > > > > > >>
> > > > > > >>              if (st->codecpar->codec_type ==
> > > AVMEDIA_TYPE_ATTACHMENT)
> > > > > > >>                  continue;
> > > > > > >>
> > > > > > >> -            mkv_write_tag_targets(s,
> > > > > MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1, &tag_target);
> > > > > > >> +            mkv_write_tag_targets(s,
> > > MATROSKA_ID_TAGTARGETS_TRACKUID,
> > > > > > >> +                                  track->uid, &tag_target);
> > > > > > >>              pb = mkv->tags_bc;
> > > > > > >>
> > > > > > >>              tag = start_ebml_master(pb,
> MATROSKA_ID_SIMPLETAG,
> > > 0);
> > > > > > >> @@ -1863,10 +1866,6 @@ static int
> > > mkv_write_header(AVFormatContext *s)
> > > > > > >>              version = 4;
> > > > > > >>      }
> > > > > > >>
> > > > > > >> -    mkv->tracks = av_mallocz_array(s->nb_streams,
> > > > > sizeof(*mkv->tracks));
> > > > > > >> -    if (!mkv->tracks) {
> > > > > > >> -        return AVERROR(ENOMEM);
> > > > > > >> -    }
> > > > > > >>      ebml_header = start_ebml_master(pb, EBML_ID_HEADER,
> > > > > MAX_EBML_HEADER_SIZE);
> > > > > > >>      put_ebml_uint  (pb, EBML_ID_EBMLVERSION       ,
>  1);
> > > > > > >>      put_ebml_uint  (pb, EBML_ID_EBMLREADVERSION   ,
>  1);
> > > > > > >> @@ -2667,8 +2666,42 @@ static int webm_query_codec(enum
> AVCodecID
> > > > > codec_id, int std_compliance)
> > > > > > >>      return 0;
> > > > > > >>  }
> > > > > > >>
> > > > > > >> +static uint32_t mkv_get_uid(const mkv_track *tracks, int i,
> > > AVLFG *c)
> > > > > > >> +{
> > > > > > >> +    uint32_t uid;
> > > > > > >> +
> > > > > > >> +    for (int j = 0, k; j < 5; j++) {
> > > > > > >> +        uid = av_lfg_get(c);
> > > > > > >> +        if (!uid)
> > > > > > >> +            continue;
> > > > > > >> +        for (k = 0; k < i; k++) {
> > > > > > >> +            if (tracks[k].uid == uid) {
> > > > > > >> +                /* Was something wrong with our random seed?
> */
> > > > > > >> +                av_lfg_init(c, av_get_random_seed());
> > > > > > >> +                break;
> > > > > > >> +            }
> > > > > > >> +        }
> > > > > > >> +        if (k == i)
> > > > > > >> +            return uid;
> > > > > > >> +    }
> > > > > > >> +
> > > > > > >> +    /* Test the numbers from 1 to i. */
> > > > > > >> +    for (int j = 1, k; j < i + 1; j++) {
> > > > > > >> +        for (k = 0; k < i; k++) {
> > > > > > >> +            if (tracks[k].uid == j)
> > > > > > >> +                break;
> > > > > > >> +        }
> > > > > > >> +        if (k == i)
> > > > > > >> +            return j;
> > > > > > >> +    }
> > > > > > >> +    /* Return i + 1. It can't coincide with another uid. */
> > > > > > >> +    return i + 1;
> > > > > > >> +}
> > > > > > >> +
> > > > > > >>  static int mkv_init(struct AVFormatContext *s)
> > > > > > >>  {
> > > > > > >> +    MatroskaMuxContext *mkv = s->priv_data;
> > > > > > >> +    AVLFG c;
> > > > > > >>      int i;
> > > > > > >>
> > > > > > >>      if (s->nb_streams > MAX_TRACKS) {
> > > > > > >> @@ -2697,7 +2730,23 @@ static int mkv_init(struct
> AVFormatContext
> > > *s)
> > > > > > >>          s->internal->avoid_negative_ts_use_pts = 1;
> > > > > > >>      }
> > > > > > >>
> > > > > > >> +    mkv->tracks = av_mallocz_array(s->nb_streams,
> > > > > sizeof(*mkv->tracks));
> > > > > > >> +    if (!mkv->tracks) {
> > > > > > >> +        return AVERROR(ENOMEM);
> > > > > > >> +    }
> > > > > > >> +
> > > > > > >
> > > > > > >> +    if (!(s->flags & AVFMT_FLAG_BITEXACT))
> > > > > > >> +        av_lfg_init(&c, av_get_random_seed());
> > > > > > >
> > > > > > > would there be a disadvantage if the configuration of a stream
> /
> > > its
> > > > > content
> > > > > > > is used a seed ?
> > > > > > > That way it would be more deterministic
> > > > > > >
> > > > > > I see several disadvantages:
> > > > > > 1. If one duplicates a stream (i.e. muxes it twice in the same
> file),
> > > > > > they would get the same uid (unless one takes additional
> measures).
> > > > >
> > > > > If you took the configuration of all streams as a single seed that
> > > would
> > > > > not
> > > > > happen
> > > > >
> > > > >
> > > > > > 2. If the packet content is used, one can determine the uid only
> > > after
> > > > > > having received a packet. This would mean that one has to
> postpone
> > > > > > writing the Tracks element until one has a packet and the same
> goes
> > > > > > for the tags given that the uid is used to convey what they
> apply to.
> > > > > > One could solve this problem in the seekable, non-livestreaming
> case
> > > > > > by reserving size to be overwritten later, but this is
> nontrivial and
> > > > > > I'd like to avoid that.
> > > > > > 3. If the configuration record is used, then you won't get
> anything
> > > > > > remotely unique: Using the same encoder settings will likely
> produce
> > > > > > files with identical uids.
> > > > > >
> > > > > > Furthermore, this approach means that I can reuse the code for
> both
> > > > > > attachments as well as tracks. (See the next patch in this
> series.)
> > > > >
> > > > > fair enough, if its considered that avoiding some complexity is
> more
> > > > > important than deterministic output
> > > > >
> > > >
> > > > Just to be sure: The output will be deterministic if the bitexact
> flag is
> > > > set.
> > > > I don't see a reason for deterministic output if it is not set.
> > >
> > > the bitexact flag is used for multiple things
> > > 1. produce the same output between platforms and minor code changes
> > > 2. produce the same output when running the exact same binary with the
> > > exact
> > >    same input
> > >
> > > This has a few problems
> > > code pathes disabled because of 1, cannot be tested easily because
> > > if bitexact is enabled the code to be tested is disabled
> > > if bitexact is disabled random values are put in the file making it not
> > > as easy to see if the code under test produces the same results each
> time
> > >
> > > Another problem is, obscuring problems, because
> > > if with a deterministic binary you get different output on 2 runs then
> you
> > > know you have a bug, a race condition or maybe a uninitialized
> variable or
> > > such.
> > > But with random values stored all over this is no longer vissible
> unless
> > > one uses bitexact which most people dont do by default
> > > so this could cause a decrease in chance detections of some bugs
> > >
> > > The Matroska muxer only behaves differently when in bitexact mode in
> two
> > ways: The UIDs are chosen deterministically (or in case of the
> SegmentUID:
> > not written at all) and certain strings that usually contain the
> > libavformat version no longer do so. All these things are in the file
> > header. There are no (and won't be) random values stored all over the
> > place. The code paths are very small and so I don't think that they will
> > obscure any bug.
>
> If a user encodes the same input twice with the same options and
> as its popular uses mkv as output and then notices that the files
> are different then with deterministic components she can conclude
> there was a bug. With randomly seeded UIDs that conclusion is not
> possible anymore.
> Retesting with bitexact in this case also doesnt help, the bug
> might not occur during 2 bitexact retests
>
> Iam not saying thats enough of a problem to change something but what
> i mean is that its a real thing not have absolutely 0 disadvantage.
>
> I consider this only a slight disadvantage: E.g. the streamhash or the
framehash muxers would provide easy ways to examine whether the actual
content of the files (besides the UIDs) are different.

>
> [...]
> > >
> > > [...]
> > > >
> > > >
> > > > > and, this
> > > > > +    /* Test the numbers from 1 to i. */
> > > > > +    for (int j = 1, k; j < i + 1; j++) {
> > > > > +        for (k = 0; k < i; k++) {
> > > > > +            if (tracks[k].uid == j)
> > > > > +                break;
> > > > > +        }
> > > > > +        if (k == i)
> > > > > +            return j;
> > > > > +    }
> > > > >
> > > > > just smells "wrong"
> > > > >
> > > > > This code would only be executed if using AVLFG would repeatedly
> lead
> > > to
> > > > collisions;
> > >
> > > yes but this doesnt smell right
> > > if your random number generator produces no random numbers i dont think
> > > adding +1 is the solution.
> > > I mean if that really happens, theres some bug either in LFG the seed
> > > generation
> > > or in how it is used.
> > >
> > > Yes, this code was only intended to be run if the random number
> generator
> > is completely buggy. But it seems that this can unfortunately not ruled
> out
> > completely (
>
> >
> https://www.phoronix.com/scan.php?page=news_item&px=AMD-RdRand-Disable-15h-16h
> > ).
>
> The fix to this does not belong into a caller of av_get_random_seed()
> if thats fixed / worked around in CPU, Bios, OS, or in av_get_random_seed()
> i dont know but definitly not the user of av_get_random_seed().
> If av_get_random_seed() is bad thats bad and needs to be fixed
>
> Also lets assume av_get_random_seed() is bad and returns -1 always
> or another constant.
> The code still must not fail like this. the seeded PRNG must still
> function properly and produce a few differentg values.
> If its statistics are 100% duplicates thats a failure of the PRNG.
>
>  First, are you saying that a caller can rely on the values being
different so that I can simply remove the last-resort code and assert that
everything is fine? (Or should I let the initial loop using AVLFG run until
it has found a good value in the hope that it will terminate eventually?)

Second, given that only one call to av_get_random_seed() is ever performed,
the question of whether it always returns -1 is irrelevant here: If some
seed exists that leads to these collisions, then it is possible for these
collisions to happen even when the random seed is truly random (in such a
case, it would be unlikely, of course). Is there such a seed? I don't know.
I only know that if the state of the AVLFG is completely zero, then
av_lfg_get() will always return zero. This is the only state with the
property that the state is not altered by any av_lfg_get(), but this does
not preclude the output to be e.g. periodic.

- Andreas
Carl Eugen Hoyos Nov. 27, 2019, 5:11 p.m. UTC | #10
Am Mi., 27. Nov. 2019 um 17:58 Uhr schrieb Andreas Rheinhardt
<andreas.rheinhardt@gmail.com>:

> I consider this only a slight disadvantage: E.g. the streamhash or the
> framehash muxers would provide easy ways to examine whether the actual
> content of the files (besides the UIDs) are different.

But that wouldn't show changes in the output of the muxer.

Carl Eugen
Michael Niedermayer Nov. 27, 2019, 10:27 p.m. UTC | #11
On Wed, Nov 27, 2019 at 05:57:56PM +0100, Andreas Rheinhardt wrote:
> On Wed, Nov 27, 2019 at 4:50 PM Michael Niedermayer <michael@niedermayer.cc>
> wrote:
> 
> > On Wed, Nov 27, 2019 at 08:25:13AM +0100, Andreas Rheinhardt wrote:
> > > On Mon, Nov 25, 2019 at 2:57 PM Michael Niedermayer
> > <michael@niedermayer.cc>
> > > wrote:
[...]
> 
> >
> > [...]
> > > >
> > > > [...]
> > > > >
> > > > >
> > > > > > and, this
> > > > > > +    /* Test the numbers from 1 to i. */
> > > > > > +    for (int j = 1, k; j < i + 1; j++) {
> > > > > > +        for (k = 0; k < i; k++) {
> > > > > > +            if (tracks[k].uid == j)
> > > > > > +                break;
> > > > > > +        }
> > > > > > +        if (k == i)
> > > > > > +            return j;
> > > > > > +    }
> > > > > >
> > > > > > just smells "wrong"
> > > > > >
> > > > > > This code would only be executed if using AVLFG would repeatedly
> > lead
> > > > to
> > > > > collisions;
> > > >
> > > > yes but this doesnt smell right
> > > > if your random number generator produces no random numbers i dont think
> > > > adding +1 is the solution.
> > > > I mean if that really happens, theres some bug either in LFG the seed
> > > > generation
> > > > or in how it is used.
> > > >
> > > > Yes, this code was only intended to be run if the random number
> > generator
> > > is completely buggy. But it seems that this can unfortunately not ruled
> > out
> > > completely (
> >
> > >
> > https://www.phoronix.com/scan.php?page=news_item&px=AMD-RdRand-Disable-15h-16h
> > > ).
> >
> > The fix to this does not belong into a caller of av_get_random_seed()
> > if thats fixed / worked around in CPU, Bios, OS, or in av_get_random_seed()
> > i dont know but definitly not the user of av_get_random_seed().
> > If av_get_random_seed() is bad thats bad and needs to be fixed
> >
> > Also lets assume av_get_random_seed() is bad and returns -1 always
> > or another constant.
> > The code still must not fail like this. the seeded PRNG must still
> > function properly and produce a few differentg values.
> > If its statistics are 100% duplicates thats a failure of the PRNG.
> >
> >  First, are you saying that a caller can rely on the values being
> different so that I can simply remove the last-resort code and assert that
> everything is fine? (Or should I let the initial loop using AVLFG run until
> it has found a good value in the hope that it will terminate eventually?)

I expect the LFG to produce billions of distinct values for every seed.
If you find a seed for which it does not do this i would consider this a
bug and iam insterrested in such a case.
Distinct values may of course be non consecutive as one would expect
from random numbers


> 
> Second, given that only one call to av_get_random_seed() is ever performed,
> the question of whether it always returns -1 is irrelevant here: If some
> seed exists that leads to these collisions, then it is possible for these
> collisions to happen even when the random seed is truly random (in such a
> case, it would be unlikely, of course). Is there such a seed? I don't know.
> I only know that if the state of the AVLFG is completely zero, then
> av_lfg_get() will always return zero. This is the only state with the
> property that the state is not altered by any av_lfg_get(), but this does
> not preclude the output to be e.g. periodic.

its not needed to be all zero. a pathological case already occurs if
all state values are even. I suspect that does not happen for any seed though.
its statistically unlikely.

Thanks

[...]
Andreas Rheinhardt Nov. 28, 2019, 9:19 p.m. UTC | #12
Carl Eugen Hoyos:
> Am Mi., 27. Nov. 2019 um 17:58 Uhr schrieb Andreas Rheinhardt
> <andreas.rheinhardt@gmail.com>:
> 
>> I consider this only a slight disadvantage: E.g. the streamhash or the
>> framehash muxers would provide easy ways to examine whether the actual
>> content of the files (besides the UIDs) are different.
> 
> But that wouldn't show changes in the output of the muxer.
> 
True. But given that this muxer has already written some random data
when not in bitexact mode (namely the SegmentUID as well as the
FileUID of attachments) for more than nine years I wonder how much
this has hindered finding bugs.

- Andreas
diff mbox

Patch

diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c
index de57e474be..b87d15b7ff 100644
--- a/libavformat/matroskaenc.c
+++ b/libavformat/matroskaenc.c
@@ -94,6 +94,7 @@  typedef struct mkv_cues {
 typedef struct mkv_track {
     int             write_dts;
     int             has_cue;
+    uint32_t        uid;
     int             sample_rate;
     int64_t         sample_rate_offset;
     int64_t         last_timestamp;
@@ -1199,8 +1200,7 @@  static int mkv_write_track(AVFormatContext *s, MatroskaMuxContext *mkv,
     track = start_ebml_master(pb, MATROSKA_ID_TRACKENTRY, 0);
     put_ebml_uint (pb, MATROSKA_ID_TRACKNUMBER,
                    mkv->is_dash ? mkv->dash_track_number : i + 1);
-    put_ebml_uint (pb, MATROSKA_ID_TRACKUID,
-                   mkv->is_dash ? mkv->dash_track_number : i + 1);
+    put_ebml_uint (pb, MATROSKA_ID_TRACKUID, mkv->tracks[i].uid);
     put_ebml_uint (pb, MATROSKA_ID_TRACKFLAGLACING , 0);    // no lacing (yet)
 
     if ((tag = av_dict_get(st->metadata, "title", NULL, 0)))
@@ -1650,7 +1650,8 @@  static int mkv_write_tags(AVFormatContext *s)
         if (!mkv_check_tag(st->metadata, MATROSKA_ID_TAGTARGETS_TRACKUID))
             continue;
 
-        ret = mkv_write_tag(s, st->metadata, MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1);
+        ret = mkv_write_tag(s, st->metadata, MATROSKA_ID_TAGTARGETS_TRACKUID,
+                            mkv->tracks[i].uid);
         if (ret < 0) return ret;
     }
 
@@ -1658,13 +1659,15 @@  static int mkv_write_tags(AVFormatContext *s)
         for (i = 0; i < s->nb_streams; i++) {
             AVIOContext *pb;
             AVStream *st = s->streams[i];
+            mkv_track *track = &mkv->tracks[i];
             ebml_master tag_target;
             ebml_master tag;
 
             if (st->codecpar->codec_type == AVMEDIA_TYPE_ATTACHMENT)
                 continue;
 
-            mkv_write_tag_targets(s, MATROSKA_ID_TAGTARGETS_TRACKUID, i + 1, &tag_target);
+            mkv_write_tag_targets(s, MATROSKA_ID_TAGTARGETS_TRACKUID,
+                                  track->uid, &tag_target);
             pb = mkv->tags_bc;
 
             tag = start_ebml_master(pb, MATROSKA_ID_SIMPLETAG, 0);
@@ -1863,10 +1866,6 @@  static int mkv_write_header(AVFormatContext *s)
             version = 4;
     }
 
-    mkv->tracks = av_mallocz_array(s->nb_streams, sizeof(*mkv->tracks));
-    if (!mkv->tracks) {
-        return AVERROR(ENOMEM);
-    }
     ebml_header = start_ebml_master(pb, EBML_ID_HEADER, MAX_EBML_HEADER_SIZE);
     put_ebml_uint  (pb, EBML_ID_EBMLVERSION       ,           1);
     put_ebml_uint  (pb, EBML_ID_EBMLREADVERSION   ,           1);
@@ -2667,8 +2666,42 @@  static int webm_query_codec(enum AVCodecID codec_id, int std_compliance)
     return 0;
 }
 
+static uint32_t mkv_get_uid(const mkv_track *tracks, int i, AVLFG *c)
+{
+    uint32_t uid;
+
+    for (int j = 0, k; j < 5; j++) {
+        uid = av_lfg_get(c);
+        if (!uid)
+            continue;
+        for (k = 0; k < i; k++) {
+            if (tracks[k].uid == uid) {
+                /* Was something wrong with our random seed? */
+                av_lfg_init(c, av_get_random_seed());
+                break;
+            }
+        }
+        if (k == i)
+            return uid;
+    }
+
+    /* Test the numbers from 1 to i. */
+    for (int j = 1, k; j < i + 1; j++) {
+        for (k = 0; k < i; k++) {
+            if (tracks[k].uid == j)
+                break;
+        }
+        if (k == i)
+            return j;
+    }
+    /* Return i + 1. It can't coincide with another uid. */
+    return i + 1;
+}
+
 static int mkv_init(struct AVFormatContext *s)
 {
+    MatroskaMuxContext *mkv = s->priv_data;
+    AVLFG c;
     int i;
 
     if (s->nb_streams > MAX_TRACKS) {
@@ -2697,7 +2730,23 @@  static int mkv_init(struct AVFormatContext *s)
         s->internal->avoid_negative_ts_use_pts = 1;
     }
 
+    mkv->tracks = av_mallocz_array(s->nb_streams, sizeof(*mkv->tracks));
+    if (!mkv->tracks) {
+        return AVERROR(ENOMEM);
+    }
+
+    if (!(s->flags & AVFMT_FLAG_BITEXACT))
+        av_lfg_init(&c, av_get_random_seed());
+
     for (i = 0; i < s->nb_streams; i++) {
+        mkv_track *track = &mkv->tracks[i];
+
+        if (s->flags & AVFMT_FLAG_BITEXACT) {
+            track->uid = i + 1;
+        } else {
+            track->uid = mkv_get_uid(mkv->tracks, i, &c);
+        }
+
         // ms precision is the de-facto standard timescale for mkv files
         avpriv_set_pts_info(s->streams[i], 64, 1, 1000);
     }