diff mbox series

[FFmpeg-devel,2/2] avformat/movenc: parse h264 packets to build Sync Sample and Recovery Point tables

Message ID 20210727130820.708-2-jamrial@gmail.com
State New
Headers show
Series [FFmpeg-devel,1/2] avformat/tests/movenc: fill the packets with some valid data | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

James Almer July 27, 2021, 1:08 p.m. UTC
Since we can't blindly trust the keyframe flag in packets and assume its
contents are a valid Sync Sample, do some basic bitstream parsing to build the
Sync Sample table in addition to a Random Access Recovery Point table.

Suggested-by: ffmpeg@fb.com
Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
 libavformat/movenc.h         |   1 +
 tests/ref/lavf-fate/h264.mp4 |   6 +-
 3 files changed, 123 insertions(+), 9 deletions(-)

Comments

Andreas Rheinhardt July 28, 2021, 10:43 p.m. UTC | #1
James Almer:
> Since we can't blindly trust the keyframe flag in packets and assume its
> contents are a valid Sync Sample, do some basic bitstream parsing to build the
> Sync Sample table in addition to a Random Access Recovery Point table.
> 
> Suggested-by: ffmpeg@fb.com
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
>  libavformat/movenc.h         |   1 +
>  tests/ref/lavf-fate/h264.mp4 |   6 +-
>  3 files changed, 123 insertions(+), 9 deletions(-)
> 
> diff --git a/libavformat/movenc.c b/libavformat/movenc.c
> index 57062f45c5..159e0261b7 100644
> --- a/libavformat/movenc.c
> +++ b/libavformat/movenc.c
> @@ -34,13 +34,17 @@
>  #include "avc.h"
>  #include "libavcodec/ac3_parser_internal.h"
>  #include "libavcodec/dnxhddata.h"
> +#include "libavcodec/h264.h"
> +#include "libavcodec/h2645_parse.h"
>  #include "libavcodec/flac.h"
>  #include "libavcodec/get_bits.h"
> +#include "libavcodec/golomb.h"
>  
>  #include "libavcodec/internal.h"
>  #include "libavcodec/put_bits.h"
>  #include "libavcodec/vc1_common.h"
>  #include "libavcodec/raw.h"
> +#include "libavcodec/sei.h"
>  #include "internal.h"
>  #include "libavutil/avstring.h"
>  #include "libavutil/channel_layout.h"
> @@ -2537,7 +2541,9 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>      if (!sgpd_entries)
>          return AVERROR(ENOMEM);
>  
> -    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC);
> +    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS ||
> +               track->par->codec_id == AV_CODEC_ID_AAC  ||
> +               track->par->codec_id == AV_CODEC_ID_H264);
>  
>      if (track->par->codec_id == AV_CODEC_ID_OPUS) {
>          for (i = 0; i < track->entry; i++) {
> @@ -2545,7 +2551,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>              int distance = 0;
>              for (j = i - 1; j >= 0; j--) {
>                  roll_samples_remaining -= get_cluster_duration(track, j);
> -                distance++;
> +                distance--;
>                  if (roll_samples_remaining <= 0)
>                      break;
>              }
> @@ -2555,7 +2561,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>              if (roll_samples_remaining > 0)
>                  distance = 0;
>              /* Verify distance is a maximum of 32 (2.5ms) packets. */
> -            if (distance > 32)
> +            if (distance < 32)
>                  return AVERROR_INVALIDDATA;
>              if (i && distance == sgpd_entries[entries].roll_distance) {
>                  sgpd_entries[entries].count++;
> @@ -2566,10 +2572,22 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>                  sgpd_entries[entries].group_description_index = distance ? ++group : 0;
>              }
>          }
> +    } else if (track->par->codec_id == AV_CODEC_ID_H264) {
> +        for (i = 0; i < track->entry; i++) {
> +            int distance = track->cluster[i].roll_distance;
> +            if (i && distance == sgpd_entries[entries].roll_distance) {
> +                sgpd_entries[entries].count++;
> +            } else {
> +                entries++;
> +                sgpd_entries[entries].count = 1;
> +                sgpd_entries[entries].roll_distance = distance;
> +                sgpd_entries[entries].group_description_index = distance ? ++group : 0;
> +            }
> +        }
>      } else {
>          entries++;
>          sgpd_entries[entries].count = track->sample_count;
> -        sgpd_entries[entries].roll_distance = 1;
> +        sgpd_entries[entries].roll_distance = -1;

You seem to be negate roll_distance in this patch; shouldn't this be a
separate patch?

>          sgpd_entries[entries].group_description_index = ++group;
>      }
>      entries++;
> @@ -2588,7 +2606,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>      avio_wb32(pb, group); /* entry_count */
>      for (i = 0; i < entries; i++) {
>          if (sgpd_entries[i].group_description_index) {
> -            avio_wb16(pb, -sgpd_entries[i].roll_distance); /* roll_distance */
> +            avio_wb16(pb, sgpd_entries[i].roll_distance); /* roll_distance */
>          }
>      }
>  
> @@ -2639,7 +2657,9 @@ static int mov_write_stbl_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContext
>      if (track->cenc.aes_ctr) {
>          ff_mov_cenc_write_stbl_atoms(&track->cenc, pb);
>      }
> -    if (track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC) {
> +    if (track->par->codec_id == AV_CODEC_ID_OPUS ||
> +        track->par->codec_id == AV_CODEC_ID_AAC  ||
> +        track->par->codec_id == AV_CODEC_ID_H264) {
>          mov_preroll_write_stbl_atoms(pb, track);
>      }
>      return update_size(pb, pos);
> @@ -5150,6 +5170,96 @@ static int mov_parse_mpeg2_frame(AVPacket *pkt, uint32_t *flags)
>      return 0;
>  }
>  
> +static int mov_parse_h264_frame(AVPacket *pkt, MOVTrack *trk)
> +{
> +    GetBitContext gb;
> +    const uint8_t *buf = pkt->data;
> +    const uint8_t *buf_end = pkt->data + pkt->size;
> +    uint32_t state = -1;
> +    unsigned roll_distance = 0;
> +    int nal_length_size = 0, nalsize;
> +    int idr = 0;
> +
> +    if (!pkt->size)
> +        return 0;
> +    if (trk->vos_data && trk->vos_data[0] == 1) {
> +        if (trk->vos_len < 5)
> +            return 0;
> +        nal_length_size = (trk->vos_data[4] & 0x03) + 1;
> +    }
> +
> +    while (buf_end - buf >= 4) {
> +        if (nal_length_size) {
> +            int i = 0;
> +            nalsize = get_nalsize(nal_length_size, buf, buf_end - buf, &i, NULL);
> +            if (nalsize < 0)
> +                break;
> +            state = buf[nal_length_size];
> +            buf += nal_length_size + 1;
> +        } else {
> +            buf = avpriv_find_start_code(buf, buf_end, &state);

This checks the whole packet for start codes. But given that we have to
convert annex B input to ISOBMFF format anyway, this separate codepath
seems avoidable by working with already reformatted data here.
(In any case: In case this is not an IDR access unit you really check
the whole packet, even when you have already encountered a VLC NALU
despite SEIs having to be in front of them.)

> +            if (buf >= buf_end)
> +                break;
> +        }
> +
> +        switch (state & 0x1f) {
> +        case H264_NAL_IDR_SLICE:
> +            idr = 1;
> +            goto end;
> +        case H264_NAL_SEI:
> +            init_get_bits8(&gb, buf, buf_end - buf);

Weird that you checked the init_get_bits8() below but not this one.

> +            do {
> +                GetBitContext gb_payload;
> +                int ret, type = 0, size = 0;
> +
> +                do {
> +                    if (get_bits_left(&gb) < 8)
> +                        goto end;
> +                    type += show_bits(&gb, 8);
> +                } while (get_bits(&gb, 8) == 255);
> +                do {
> +                    if (get_bits_left(&gb) < 8)
> +                        goto end;
> +                    size += show_bits(&gb, 8);
> +                } while (get_bits(&gb, 8) == 255);
> +
> +                if (size > get_bits_left(&gb) / 8)
> +                    goto end;
> +
> +                ret = init_get_bits8(&gb_payload, gb.buffer + get_bits_count(&gb) / 8, size);
> +                if (ret < 0)
> +                    goto end;
> +
> +                switch (type) {
> +                case SEI_TYPE_RECOVERY_POINT:
> +                    roll_distance = get_ue_golomb_long(&gb_payload);
> +                    break;
> +                default:
> +                    break;
> +                }
> +                skip_bits_long(&gb, 8 * size);
> +            } while (get_bits_left(&gb) > 0 && show_bits(&gb, 8) != 0x80);

1. You are using the outer bitreader gb only for byte-aligned
reads/skips; better read bytes directly.
2. You are not unescaping the buffer: The SEI size is the size of an
unescaped SEI (after 0x03 has been stripped away). Knowing the size in
advance comes in really handy for this; this is where only working with
mp4-formatted data brings benefits.

> +            break;
> +        default:
> +            break;
> +        }
> +        if (nal_length_size)
> +            buf += nalsize - 1;
> +    }
> +
> +end:
> +    if (roll_distance != (int16_t)roll_distance)

The H.264 spec restricts this field to values in the range
0..MaxFrameNum - 1, where the latter can be UINT16_MAX. So I don't
understand why you are using a signed int16_t here and as type in MOVIentry.

> +        roll_distance = 0;
> +    trk->cluster[trk->entry].roll_distance = roll_distance;
> +
> +    if (idr) {
> +        trk->cluster[trk->entry].flags |= MOV_SYNC_SAMPLE;
> +        trk->has_keyframes++;
> +    }
> +
> +    return 0;
> +}
> +
>  static void mov_parse_vc1_frame(AVPacket *pkt, MOVTrack *trk)
>  {
>      const uint8_t *start, *next, *end = pkt->data + pkt->size;
> @@ -5740,6 +5850,7 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
>      trk->cluster[trk->entry].entries          = samples_in_chunk;
>      trk->cluster[trk->entry].dts              = pkt->dts;
>      trk->cluster[trk->entry].pts              = pkt->pts;
> +    trk->cluster[trk->entry].roll_distance    = 0;
>      if (!trk->entry && trk->start_dts != AV_NOPTS_VALUE) {
>          if (!trk->frag_discont) {
>              /* First packet of a new fragment. We already wrote the duration
> @@ -5821,6 +5932,8 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
>  
>      if (par->codec_id == AV_CODEC_ID_VC1) {
>          mov_parse_vc1_frame(pkt, trk);
> +    } else if (par->codec_id == AV_CODEC_ID_H264) {
> +        mov_parse_h264_frame(pkt, trk);
>      } else if (par->codec_id == AV_CODEC_ID_TRUEHD) {
>          mov_parse_truehd_frame(pkt, trk);
>      } else if (pkt->flags & AV_PKT_FLAG_KEY) {
> diff --git a/libavformat/movenc.h b/libavformat/movenc.h
> index af1ea0bce6..73bf73ce8f 100644
> --- a/libavformat/movenc.h
> +++ b/libavformat/movenc.h
> @@ -56,6 +56,7 @@ typedef struct MOVIentry {
>  #define MOV_PARTIAL_SYNC_SAMPLE 0x0002
>  #define MOV_DISPOSABLE_SAMPLE   0x0004
>      uint32_t     flags;
> +    int16_t      roll_distance;
>      AVProducerReferenceTime prft;
>  } MOVIentry;
>  
> diff --git a/tests/ref/lavf-fate/h264.mp4 b/tests/ref/lavf-fate/h264.mp4
> index a9c3823c2c..c08ee4c7ae 100644
> --- a/tests/ref/lavf-fate/h264.mp4
> +++ b/tests/ref/lavf-fate/h264.mp4
> @@ -1,3 +1,3 @@
> -fe299ea5205b71a48281f917b1256a5d *tests/data/lavf-fate/lavf.h264.mp4
> -547928 tests/data/lavf-fate/lavf.h264.mp4
> -tests/data/lavf-fate/lavf.h264.mp4 CRC=0x9da2c999
> +2812f617314d23474fcb23898b8a56ab *tests/data/lavf-fate/lavf.h264.mp4
> +548084 tests/data/lavf-fate/lavf.h264.mp4
> +tests/data/lavf-fate/lavf.h264.mp4 CRC=0x396f0829
> 
I guess this filesize increase is due to the SyncSample table? What
exactly has been written before this patch, what gets written with it?

- Andreas
Derek Buitenhuis July 28, 2021, 10:51 p.m. UTC | #2
On 7/27/2021 2:08 PM, James Almer wrote:
> Since we can't blindly trust the keyframe flag in packets and assume its
> contents are a valid Sync Sample, do some basic bitstream parsing to build the
> Sync Sample table in addition to a Random Access Recovery Point table.
> 
> Suggested-by: ffmpeg@fb.com
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
>  libavformat/movenc.h         |   1 +
>  tests/ref/lavf-fate/h264.mp4 |   6 +-
>  3 files changed, 123 insertions(+), 9 deletions(-)

This problem (due to insufficient API) exists for a lot more codecs
than H.264 - are we going to fill movenc.c with parsers for HEVC,
AV1, MPEG-4 ASP, etc., or just make only on codec behave this way?

- Derek
James Almer July 28, 2021, 11:55 p.m. UTC | #3
On 7/28/2021 7:43 PM, Andreas Rheinhardt wrote:
> James Almer:
>> Since we can't blindly trust the keyframe flag in packets and assume its
>> contents are a valid Sync Sample, do some basic bitstream parsing to build the
>> Sync Sample table in addition to a Random Access Recovery Point table.
>>
>> Suggested-by: ffmpeg@fb.com
>> Signed-off-by: James Almer <jamrial@gmail.com>
>> ---
>>   libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
>>   libavformat/movenc.h         |   1 +
>>   tests/ref/lavf-fate/h264.mp4 |   6 +-
>>   3 files changed, 123 insertions(+), 9 deletions(-)
>>
>> diff --git a/libavformat/movenc.c b/libavformat/movenc.c
>> index 57062f45c5..159e0261b7 100644
>> --- a/libavformat/movenc.c
>> +++ b/libavformat/movenc.c
>> @@ -34,13 +34,17 @@
>>   #include "avc.h"
>>   #include "libavcodec/ac3_parser_internal.h"
>>   #include "libavcodec/dnxhddata.h"
>> +#include "libavcodec/h264.h"
>> +#include "libavcodec/h2645_parse.h"
>>   #include "libavcodec/flac.h"
>>   #include "libavcodec/get_bits.h"
>> +#include "libavcodec/golomb.h"
>>   
>>   #include "libavcodec/internal.h"
>>   #include "libavcodec/put_bits.h"
>>   #include "libavcodec/vc1_common.h"
>>   #include "libavcodec/raw.h"
>> +#include "libavcodec/sei.h"
>>   #include "internal.h"
>>   #include "libavutil/avstring.h"
>>   #include "libavutil/channel_layout.h"
>> @@ -2537,7 +2541,9 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>       if (!sgpd_entries)
>>           return AVERROR(ENOMEM);
>>   
>> -    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC);
>> +    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS ||
>> +               track->par->codec_id == AV_CODEC_ID_AAC  ||
>> +               track->par->codec_id == AV_CODEC_ID_H264);
>>   
>>       if (track->par->codec_id == AV_CODEC_ID_OPUS) {
>>           for (i = 0; i < track->entry; i++) {
>> @@ -2545,7 +2551,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>               int distance = 0;
>>               for (j = i - 1; j >= 0; j--) {
>>                   roll_samples_remaining -= get_cluster_duration(track, j);
>> -                distance++;
>> +                distance--;
>>                   if (roll_samples_remaining <= 0)
>>                       break;
>>               }
>> @@ -2555,7 +2561,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>               if (roll_samples_remaining > 0)
>>                   distance = 0;
>>               /* Verify distance is a maximum of 32 (2.5ms) packets. */
>> -            if (distance > 32)
>> +            if (distance < 32)
>>                   return AVERROR_INVALIDDATA;
>>               if (i && distance == sgpd_entries[entries].roll_distance) {
>>                   sgpd_entries[entries].count++;
>> @@ -2566,10 +2572,22 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>                   sgpd_entries[entries].group_description_index = distance ? ++group : 0;
>>               }
>>           }
>> +    } else if (track->par->codec_id == AV_CODEC_ID_H264) {
>> +        for (i = 0; i < track->entry; i++) {
>> +            int distance = track->cluster[i].roll_distance;
>> +            if (i && distance == sgpd_entries[entries].roll_distance) {
>> +                sgpd_entries[entries].count++;
>> +            } else {
>> +                entries++;
>> +                sgpd_entries[entries].count = 1;
>> +                sgpd_entries[entries].roll_distance = distance;
>> +                sgpd_entries[entries].group_description_index = distance ? ++group : 0;
>> +            }
>> +        }
>>       } else {
>>           entries++;
>>           sgpd_entries[entries].count = track->sample_count;
>> -        sgpd_entries[entries].roll_distance = 1;
>> +        sgpd_entries[entries].roll_distance = -1;
> 
> You seem to be negate roll_distance in this patch; shouldn't this be a
> separate patch?

I'll split it.

> 
>>           sgpd_entries[entries].group_description_index = ++group;
>>       }
>>       entries++;
>> @@ -2588,7 +2606,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>       avio_wb32(pb, group); /* entry_count */
>>       for (i = 0; i < entries; i++) {
>>           if (sgpd_entries[i].group_description_index) {
>> -            avio_wb16(pb, -sgpd_entries[i].roll_distance); /* roll_distance */
>> +            avio_wb16(pb, sgpd_entries[i].roll_distance); /* roll_distance */
>>           }
>>       }
>>   
>> @@ -2639,7 +2657,9 @@ static int mov_write_stbl_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContext
>>       if (track->cenc.aes_ctr) {
>>           ff_mov_cenc_write_stbl_atoms(&track->cenc, pb);
>>       }
>> -    if (track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC) {
>> +    if (track->par->codec_id == AV_CODEC_ID_OPUS ||
>> +        track->par->codec_id == AV_CODEC_ID_AAC  ||
>> +        track->par->codec_id == AV_CODEC_ID_H264) {
>>           mov_preroll_write_stbl_atoms(pb, track);
>>       }
>>       return update_size(pb, pos);
>> @@ -5150,6 +5170,96 @@ static int mov_parse_mpeg2_frame(AVPacket *pkt, uint32_t *flags)
>>       return 0;
>>   }
>>   
>> +static int mov_parse_h264_frame(AVPacket *pkt, MOVTrack *trk)
>> +{
>> +    GetBitContext gb;
>> +    const uint8_t *buf = pkt->data;
>> +    const uint8_t *buf_end = pkt->data + pkt->size;
>> +    uint32_t state = -1;
>> +    unsigned roll_distance = 0;
>> +    int nal_length_size = 0, nalsize;
>> +    int idr = 0;
>> +
>> +    if (!pkt->size)
>> +        return 0;
>> +    if (trk->vos_data && trk->vos_data[0] == 1) {
>> +        if (trk->vos_len < 5)
>> +            return 0;
>> +        nal_length_size = (trk->vos_data[4] & 0x03) + 1;
>> +    }
>> +
>> +    while (buf_end - buf >= 4) {
>> +        if (nal_length_size) {
>> +            int i = 0;
>> +            nalsize = get_nalsize(nal_length_size, buf, buf_end - buf, &i, NULL);
>> +            if (nalsize < 0)
>> +                break;
>> +            state = buf[nal_length_size];
>> +            buf += nal_length_size + 1;
>> +        } else {
>> +            buf = avpriv_find_start_code(buf, buf_end, &state);
> 
> This checks the whole packet for start codes. But given that we have to
> convert annex B input to ISOBMFF format anyway, this separate codepath
> seems avoidable by working with already reformatted data here.
> (In any case: In case this is not an IDR access unit you really check
> the whole packet, even when you have already encountered a VLC NALU
> despite SEIs having to be in front of them.)

I'll look into it.

> 
>> +            if (buf >= buf_end)
>> +                break;
>> +        }
>> +
>> +        switch (state & 0x1f) {
>> +        case H264_NAL_IDR_SLICE:
>> +            idr = 1;
>> +            goto end;
>> +        case H264_NAL_SEI:
>> +            init_get_bits8(&gb, buf, buf_end - buf);
> 
> Weird that you checked the init_get_bits8() below but not this one.

The one below was copy-pasted. Will also check this one.

> 
>> +            do {
>> +                GetBitContext gb_payload;
>> +                int ret, type = 0, size = 0;
>> +
>> +                do {
>> +                    if (get_bits_left(&gb) < 8)
>> +                        goto end;
>> +                    type += show_bits(&gb, 8);
>> +                } while (get_bits(&gb, 8) == 255);
>> +                do {
>> +                    if (get_bits_left(&gb) < 8)
>> +                        goto end;
>> +                    size += show_bits(&gb, 8);
>> +                } while (get_bits(&gb, 8) == 255);
>> +
>> +                if (size > get_bits_left(&gb) / 8)
>> +                    goto end;
>> +
>> +                ret = init_get_bits8(&gb_payload, gb.buffer + get_bits_count(&gb) / 8, size);
>> +                if (ret < 0)
>> +                    goto end;
>> +
>> +                switch (type) {
>> +                case SEI_TYPE_RECOVERY_POINT:
>> +                    roll_distance = get_ue_golomb_long(&gb_payload);
>> +                    break;
>> +                default:
>> +                    break;
>> +                }
>> +                skip_bits_long(&gb, 8 * size);
>> +            } while (get_bits_left(&gb) > 0 && show_bits(&gb, 8) != 0x80);
> 
> 1. You are using the outer bitreader gb only for byte-aligned
> reads/skips; better read bytes directly.

Using GetBitContext is cleaner and clearer IMO, but I could use 
GetByteContext instead i guess. I'd rather not work directly on bytes.

> 2. You are not unescaping the buffer: The SEI size is the size of an
> unescaped SEI (after 0x03 has been stripped away). Knowing the size in
> advance comes in really handy for this; this is where only working with
> mp4-formatted data brings benefits.

A SEI NALu may have more than one SEI message, and the Recovery Point 
may not be the first. If i can't trust the size read in the loop above 
because it does not take into account the emulation prevention bytes, 
how can i feasibly skip each SEI message within a give NALU?

> 
>> +            break;
>> +        default:
>> +            break;
>> +        }
>> +        if (nal_length_size)
>> +            buf += nalsize - 1;
>> +    }
>> +
>> +end:
>> +    if (roll_distance != (int16_t)roll_distance)
> 
> The H.264 spec restricts this field to values in the range
> 0..MaxFrameNum - 1, where the latter can be UINT16_MAX. So I don't
> understand why you are using a signed int16_t here and as type in MOVIentry.

Because ISOBMFF's roll_distance is an int16 field.

> 
>> +        roll_distance = 0;
>> +    trk->cluster[trk->entry].roll_distance = roll_distance;
>> +
>> +    if (idr) {
>> +        trk->cluster[trk->entry].flags |= MOV_SYNC_SAMPLE;
>> +        trk->has_keyframes++;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   static void mov_parse_vc1_frame(AVPacket *pkt, MOVTrack *trk)
>>   {
>>       const uint8_t *start, *next, *end = pkt->data + pkt->size;
>> @@ -5740,6 +5850,7 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
>>       trk->cluster[trk->entry].entries          = samples_in_chunk;
>>       trk->cluster[trk->entry].dts              = pkt->dts;
>>       trk->cluster[trk->entry].pts              = pkt->pts;
>> +    trk->cluster[trk->entry].roll_distance    = 0;
>>       if (!trk->entry && trk->start_dts != AV_NOPTS_VALUE) {
>>           if (!trk->frag_discont) {
>>               /* First packet of a new fragment. We already wrote the duration
>> @@ -5821,6 +5932,8 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
>>   
>>       if (par->codec_id == AV_CODEC_ID_VC1) {
>>           mov_parse_vc1_frame(pkt, trk);
>> +    } else if (par->codec_id == AV_CODEC_ID_H264) {
>> +        mov_parse_h264_frame(pkt, trk);
>>       } else if (par->codec_id == AV_CODEC_ID_TRUEHD) {
>>           mov_parse_truehd_frame(pkt, trk);
>>       } else if (pkt->flags & AV_PKT_FLAG_KEY) {
>> diff --git a/libavformat/movenc.h b/libavformat/movenc.h
>> index af1ea0bce6..73bf73ce8f 100644
>> --- a/libavformat/movenc.h
>> +++ b/libavformat/movenc.h
>> @@ -56,6 +56,7 @@ typedef struct MOVIentry {
>>   #define MOV_PARTIAL_SYNC_SAMPLE 0x0002
>>   #define MOV_DISPOSABLE_SAMPLE   0x0004
>>       uint32_t     flags;
>> +    int16_t      roll_distance;
>>       AVProducerReferenceTime prft;
>>   } MOVIentry;
>>   
>> diff --git a/tests/ref/lavf-fate/h264.mp4 b/tests/ref/lavf-fate/h264.mp4
>> index a9c3823c2c..c08ee4c7ae 100644
>> --- a/tests/ref/lavf-fate/h264.mp4
>> +++ b/tests/ref/lavf-fate/h264.mp4
>> @@ -1,3 +1,3 @@
>> -fe299ea5205b71a48281f917b1256a5d *tests/data/lavf-fate/lavf.h264.mp4
>> -547928 tests/data/lavf-fate/lavf.h264.mp4
>> -tests/data/lavf-fate/lavf.h264.mp4 CRC=0x9da2c999
>> +2812f617314d23474fcb23898b8a56ab *tests/data/lavf-fate/lavf.h264.mp4
>> +548084 tests/data/lavf-fate/lavf.h264.mp4
>> +tests/data/lavf-fate/lavf.h264.mp4 CRC=0x396f0829
>>
> I guess this filesize increase is due to the SyncSample table? What
> exactly has been written before this patch, what gets written with it?

The size increase is due to the addition of Recovery Point sample 
groups. The Sync Sample table is smaller now that we're not listing the 
recovery point packets in it, but ultimately the file is bigger because 
of the presence of a sgpd and sbgp full boxes.

> 
> - Andreas
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
James Almer July 29, 2021, 12:39 a.m. UTC | #4
On 7/28/2021 7:51 PM, Derek Buitenhuis wrote:
> On 7/27/2021 2:08 PM, James Almer wrote:
>> Since we can't blindly trust the keyframe flag in packets and assume its
>> contents are a valid Sync Sample, do some basic bitstream parsing to build the
>> Sync Sample table in addition to a Random Access Recovery Point table.
>>
>> Suggested-by: ffmpeg@fb.com
>> Signed-off-by: James Almer <jamrial@gmail.com>
>> ---
>>   libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
>>   libavformat/movenc.h         |   1 +
>>   tests/ref/lavf-fate/h264.mp4 |   6 +-
>>   3 files changed, 123 insertions(+), 9 deletions(-)
> 
> This problem (due to insufficient API) exists for a lot more codecs
> than H.264 - are we going to fill movenc.c with parsers for HEVC,
> AV1, MPEG-4 ASP, etc., or just make only on codec behave this way?

It's not a problem for AV1 (we correctly tag all packets that should be 
sync samples).

I'm not going to bother with HEVC right now, but this all can be 
simplified once we introduce a new parser API that exposes the important 
per-codec bitstream values, CBS-style (and for example based on it), 
that can be used by muxers, instead of relying purely on values present 
in packets as exported by either a demuxer (from container info or 
derived from auto inserted AVCodecParserContext in utils.c) or just made 
up by the library user.

> 
> - Derek
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Mapul Bhola July 29, 2021, 1:31 a.m. UTC | #5
I always meant to ask what is meaning of your profile picture?

July 28, 2021 6:51 PM, "Derek Buitenhuis" <derek.buitenhuis@gmail.com> wrote:

> On 7/27/2021 2:08 PM, James Almer wrote:
> 
>> Since we can't blindly trust the keyframe flag in packets and assume its
>> contents are a valid Sync Sample, do some basic bitstream parsing to build the
>> Sync Sample table in addition to a Random Access Recovery Point table.
>> 
>> Suggested-by: ffmpeg@fb.com
>> Signed-off-by: James Almer <jamrial@gmail.com>
>> ---
>> libavformat/movenc.c | 125 +++++++++++++++++++++++++++++++++--
>> libavformat/movenc.h | 1 +
>> tests/ref/lavf-fate/h264.mp4 | 6 +-
>> 3 files changed, 123 insertions(+), 9 deletions(-)
> 
> This problem (due to insufficient API) exists for a lot more codecs
> than H.264 - are we going to fill movenc.c with parsers for HEVC,
> AV1, MPEG-4 ASP, etc., or just make only on codec behave this way?
> 
> - Derek
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Andreas Rheinhardt July 29, 2021, 10:23 a.m. UTC | #6
James Almer:
> On 7/28/2021 7:43 PM, Andreas Rheinhardt wrote:
>> James Almer:
>>> Since we can't blindly trust the keyframe flag in packets and assume its
>>> contents are a valid Sync Sample, do some basic bitstream parsing to
>>> build the
>>> Sync Sample table in addition to a Random Access Recovery Point table.
>>>
>>> Suggested-by: ffmpeg@fb.com
>>> Signed-off-by: James Almer <jamrial@gmail.com>
>>> ---
>>>   libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
>>>   libavformat/movenc.h         |   1 +
>>>   tests/ref/lavf-fate/h264.mp4 |   6 +-
>>>   3 files changed, 123 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/libavformat/movenc.c b/libavformat/movenc.c
>>> index 57062f45c5..159e0261b7 100644
>>> --- a/libavformat/movenc.c
>>> +++ b/libavformat/movenc.c
>>> @@ -34,13 +34,17 @@
>>>   #include "avc.h"
>>>   #include "libavcodec/ac3_parser_internal.h"
>>>   #include "libavcodec/dnxhddata.h"
>>> +#include "libavcodec/h264.h"
>>> +#include "libavcodec/h2645_parse.h"
>>>   #include "libavcodec/flac.h"
>>>   #include "libavcodec/get_bits.h"
>>> +#include "libavcodec/golomb.h"
>>>     #include "libavcodec/internal.h"
>>>   #include "libavcodec/put_bits.h"
>>>   #include "libavcodec/vc1_common.h"
>>>   #include "libavcodec/raw.h"
>>> +#include "libavcodec/sei.h"
>>>   #include "internal.h"
>>>   #include "libavutil/avstring.h"
>>>   #include "libavutil/channel_layout.h"
>>> @@ -2537,7 +2541,9 @@ static int
>>> mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>>       if (!sgpd_entries)
>>>           return AVERROR(ENOMEM);
>>>   -    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS ||
>>> track->par->codec_id == AV_CODEC_ID_AAC);
>>> +    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS ||
>>> +               track->par->codec_id == AV_CODEC_ID_AAC  ||
>>> +               track->par->codec_id == AV_CODEC_ID_H264);
>>>         if (track->par->codec_id == AV_CODEC_ID_OPUS) {
>>>           for (i = 0; i < track->entry; i++) {
>>> @@ -2545,7 +2551,7 @@ static int
>>> mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>>               int distance = 0;
>>>               for (j = i - 1; j >= 0; j--) {
>>>                   roll_samples_remaining -=
>>> get_cluster_duration(track, j);
>>> -                distance++;
>>> +                distance--;
>>>                   if (roll_samples_remaining <= 0)
>>>                       break;
>>>               }
>>> @@ -2555,7 +2561,7 @@ static int
>>> mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>>               if (roll_samples_remaining > 0)
>>>                   distance = 0;
>>>               /* Verify distance is a maximum of 32 (2.5ms) packets. */
>>> -            if (distance > 32)
>>> +            if (distance < 32)
>>>                   return AVERROR_INVALIDDATA;
>>>               if (i && distance ==
>>> sgpd_entries[entries].roll_distance) {
>>>                   sgpd_entries[entries].count++;
>>> @@ -2566,10 +2572,22 @@ static int
>>> mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>>                   sgpd_entries[entries].group_description_index =
>>> distance ? ++group : 0;
>>>               }
>>>           }
>>> +    } else if (track->par->codec_id == AV_CODEC_ID_H264) {
>>> +        for (i = 0; i < track->entry; i++) {
>>> +            int distance = track->cluster[i].roll_distance;
>>> +            if (i && distance == sgpd_entries[entries].roll_distance) {
>>> +                sgpd_entries[entries].count++;
>>> +            } else {
>>> +                entries++;
>>> +                sgpd_entries[entries].count = 1;
>>> +                sgpd_entries[entries].roll_distance = distance;
>>> +                sgpd_entries[entries].group_description_index =
>>> distance ? ++group : 0;
>>> +            }
>>> +        }
>>>       } else {
>>>           entries++;
>>>           sgpd_entries[entries].count = track->sample_count;
>>> -        sgpd_entries[entries].roll_distance = 1;
>>> +        sgpd_entries[entries].roll_distance = -1;
>>
>> You seem to be negate roll_distance in this patch; shouldn't this be a
>> separate patch?
> 
> I'll split it.
> 
>>
>>>           sgpd_entries[entries].group_description_index = ++group;
>>>       }
>>>       entries++;
>>> @@ -2588,7 +2606,7 @@ static int
>>> mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>>>       avio_wb32(pb, group); /* entry_count */
>>>       for (i = 0; i < entries; i++) {
>>>           if (sgpd_entries[i].group_description_index) {
>>> -            avio_wb16(pb, -sgpd_entries[i].roll_distance); /*
>>> roll_distance */
>>> +            avio_wb16(pb, sgpd_entries[i].roll_distance); /*
>>> roll_distance */
>>>           }
>>>       }
>>>   @@ -2639,7 +2657,9 @@ static int mov_write_stbl_tag(AVFormatContext
>>> *s, AVIOContext *pb, MOVMuxContext
>>>       if (track->cenc.aes_ctr) {
>>>           ff_mov_cenc_write_stbl_atoms(&track->cenc, pb);
>>>       }
>>> -    if (track->par->codec_id == AV_CODEC_ID_OPUS ||
>>> track->par->codec_id == AV_CODEC_ID_AAC) {
>>> +    if (track->par->codec_id == AV_CODEC_ID_OPUS ||
>>> +        track->par->codec_id == AV_CODEC_ID_AAC  ||
>>> +        track->par->codec_id == AV_CODEC_ID_H264) {
>>>           mov_preroll_write_stbl_atoms(pb, track);
>>>       }
>>>       return update_size(pb, pos);
>>> @@ -5150,6 +5170,96 @@ static int mov_parse_mpeg2_frame(AVPacket
>>> *pkt, uint32_t *flags)
>>>       return 0;
>>>   }
>>>   +static int mov_parse_h264_frame(AVPacket *pkt, MOVTrack *trk)
>>> +{
>>> +    GetBitContext gb;
>>> +    const uint8_t *buf = pkt->data;
>>> +    const uint8_t *buf_end = pkt->data + pkt->size;
>>> +    uint32_t state = -1;
>>> +    unsigned roll_distance = 0;
>>> +    int nal_length_size = 0, nalsize;
>>> +    int idr = 0;
>>> +
>>> +    if (!pkt->size)
>>> +        return 0;
>>> +    if (trk->vos_data && trk->vos_data[0] == 1) {
>>> +        if (trk->vos_len < 5)
>>> +            return 0;
>>> +        nal_length_size = (trk->vos_data[4] & 0x03) + 1;
>>> +    }
>>> +
>>> +    while (buf_end - buf >= 4) {
>>> +        if (nal_length_size) {
>>> +            int i = 0;
>>> +            nalsize = get_nalsize(nal_length_size, buf, buf_end -
>>> buf, &i, NULL);
>>> +            if (nalsize < 0)
>>> +                break;
>>> +            state = buf[nal_length_size];
>>> +            buf += nal_length_size + 1;
>>> +        } else {
>>> +            buf = avpriv_find_start_code(buf, buf_end, &state);
>>
>> This checks the whole packet for start codes. But given that we have to
>> convert annex B input to ISOBMFF format anyway, this separate codepath
>> seems avoidable by working with already reformatted data here.
>> (In any case: In case this is not an IDR access unit you really check
>> the whole packet, even when you have already encountered a VLC NALU
>> despite SEIs having to be in front of them.)
> 
> I'll look into it.
> 
>>
>>> +            if (buf >= buf_end)
>>> +                break;
>>> +        }
>>> +
>>> +        switch (state & 0x1f) {
>>> +        case H264_NAL_IDR_SLICE:
>>> +            idr = 1;
>>> +            goto end;
>>> +        case H264_NAL_SEI:
>>> +            init_get_bits8(&gb, buf, buf_end - buf);
>>
>> Weird that you checked the init_get_bits8() below but not this one.
> 
> The one below was copy-pasted. Will also check this one.
> 
>>
>>> +            do {
>>> +                GetBitContext gb_payload;
>>> +                int ret, type = 0, size = 0;
>>> +
>>> +                do {
>>> +                    if (get_bits_left(&gb) < 8)
>>> +                        goto end;
>>> +                    type += show_bits(&gb, 8);
>>> +                } while (get_bits(&gb, 8) == 255);
>>> +                do {
>>> +                    if (get_bits_left(&gb) < 8)
>>> +                        goto end;
>>> +                    size += show_bits(&gb, 8);
>>> +                } while (get_bits(&gb, 8) == 255);
>>> +
>>> +                if (size > get_bits_left(&gb) / 8)
>>> +                    goto end;
>>> +
>>> +                ret = init_get_bits8(&gb_payload, gb.buffer +
>>> get_bits_count(&gb) / 8, size);
>>> +                if (ret < 0)
>>> +                    goto end;
>>> +
>>> +                switch (type) {
>>> +                case SEI_TYPE_RECOVERY_POINT:
>>> +                    roll_distance = get_ue_golomb_long(&gb_payload);
>>> +                    break;
>>> +                default:
>>> +                    break;
>>> +                }
>>> +                skip_bits_long(&gb, 8 * size);
>>> +            } while (get_bits_left(&gb) > 0 && show_bits(&gb, 8) !=
>>> 0x80);
>>
>> 1. You are using the outer bitreader gb only for byte-aligned
>> reads/skips; better read bytes directly.
> 
> Using GetBitContext is cleaner and clearer IMO, but I could use
> GetByteContext instead i guess. I'd rather not work directly on bytes.
> 

I think we disagree on that: I always hate it when I have to use a
bitreader in case the position is known to be byte-aligned.

>> 2. You are not unescaping the buffer: The SEI size is the size of an
>> unescaped SEI (after 0x03 has been stripped away). Knowing the size in
>> advance comes in really handy for this; this is where only working with
>> mp4-formatted data brings benefits.
> 
> A SEI NALu may have more than one SEI message, and the Recovery Point
> may not be the first. If i can't trust the size read in the loop above
> because it does not take into account the emulation prevention bytes,
> how can i feasibly skip each SEI message within a give NALU?
> 

By unescaping the SEI NALU first. (Alternatively, one could write a
bitreader (necessarily cached) that unescapes internally, but that is
more effort; and it would be slower with bigger NALUs, so the fuzzer
will probably report some new timeouts.)

- Andreas
Michael Niedermayer July 29, 2021, 5:58 p.m. UTC | #7
On Tue, Jul 27, 2021 at 10:08:20AM -0300, James Almer wrote:
> Since we can't blindly trust the keyframe flag in packets and assume its
> contents are a valid Sync Sample, do some basic bitstream parsing to build the
> Sync Sample table in addition to a Random Access Recovery Point table.
> 
> Suggested-by: ffmpeg@fb.com
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
>  libavformat/movenc.h         |   1 +
>  tests/ref/lavf-fate/h264.mp4 |   6 +-
>  3 files changed, 123 insertions(+), 9 deletions(-)

Will this allow a user to mark a subset of keyframes as random
access points ?
A user may want seeking to preferably happen to a scene start and not a
frame before

Will this allow a user to mark a damaged keyframe as such ?
A frame might in fact have been a keyframe and maybe the only in the file
but maybe was damaged so a parser might fail to detect it.

Thanks

[...]
James Almer July 29, 2021, 6:09 p.m. UTC | #8
On 7/29/2021 2:58 PM, Michael Niedermayer wrote:
> On Tue, Jul 27, 2021 at 10:08:20AM -0300, James Almer wrote:
>> Since we can't blindly trust the keyframe flag in packets and assume its
>> contents are a valid Sync Sample, do some basic bitstream parsing to build the
>> Sync Sample table in addition to a Random Access Recovery Point table.
>>
>> Suggested-by: ffmpeg@fb.com
>> Signed-off-by: James Almer <jamrial@gmail.com>
>> ---
>>   libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
>>   libavformat/movenc.h         |   1 +
>>   tests/ref/lavf-fate/h264.mp4 |   6 +-
>>   3 files changed, 123 insertions(+), 9 deletions(-)
> 
> Will this allow a user to mark a subset of keyframes as random
> access points ?
> A user may want seeking to preferably happen to a scene start and not a
> frame before
> 
> Will this allow a user to mark a damaged keyframe as such ?
> A frame might in fact have been a keyframe and maybe the only in the file
> but maybe was damaged so a parser might fail to detect it.

This code will ensure all packets with an IDR picture will be listed in 
the Sync Sample table, and all packets with a Recovery Point SEI message 
will be listed in the Random Access Recovery Point table.
Whatever is signaled in packets (like the keyframe flag) is ignored to 
prevent creating non-spec compliant output.

> 
> Thanks
> 
> [...]
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Derek Buitenhuis July 29, 2021, 10:04 p.m. UTC | #9
On 7/29/2021 1:39 AM, James Almer wrote:
> It's not a problem for AV1 (we correctly tag all packets that should be 
> sync samples).

I meant for the rest of the boxes ;)

> I'm not going to bother with HEVC right now, but this all can be 
> simplified once we introduce a new parser API that exposes the important 
> per-codec bitstream values, CBS-style (and for example based on it), 
> that can be used by muxers, instead of relying purely on values present 
> in packets as exported by either a demuxer (from container info or 
> derived from auto inserted AVCodecParserContext in utils.c) or just made 
> up by the library user.

That certainly sounds less insane than what currently exists in our APIs.

- Derek
Derek Buitenhuis July 29, 2021, 10:05 p.m. UTC | #10
On 7/29/2021 2:31 AM, ffmpegandmahanstreamer@e.email wrote:
> I always meant to ask what is meaning of your profile picture?

What?

I don't know which picture you're referring to, but it doesn't matter.

This is neither the appropriate place nor thread.

- Derek
Michael Niedermayer July 30, 2021, 11:33 a.m. UTC | #11
On Thu, Jul 29, 2021 at 03:09:03PM -0300, James Almer wrote:
> On 7/29/2021 2:58 PM, Michael Niedermayer wrote:
> > On Tue, Jul 27, 2021 at 10:08:20AM -0300, James Almer wrote:
> > > Since we can't blindly trust the keyframe flag in packets and assume its
> > > contents are a valid Sync Sample, do some basic bitstream parsing to build the
> > > Sync Sample table in addition to a Random Access Recovery Point table.
> > > 
> > > Suggested-by: ffmpeg@fb.com
> > > Signed-off-by: James Almer <jamrial@gmail.com>
> > > ---
> > >   libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
> > >   libavformat/movenc.h         |   1 +
> > >   tests/ref/lavf-fate/h264.mp4 |   6 +-
> > >   3 files changed, 123 insertions(+), 9 deletions(-)
> > 

A.
> > Will this allow a user to mark a subset of keyframes as random
> > access points ?
> > A user may want seeking to preferably happen to a scene start and not a
> > frame before
> > 

B.
> > Will this allow a user to mark a damaged keyframe as such ?
> > A frame might in fact have been a keyframe and maybe the only in the file
> > but maybe was damaged so a parser might fail to detect it.
> 
> This code will ensure all packets with an IDR picture will be listed in the
> Sync Sample table, and all packets with a Recovery Point SEI message will be
> listed in the Random Access Recovery Point table.
> Whatever is signaled in packets (like the keyframe flag) is ignored to
> prevent creating non-spec compliant output.

The file in case B will be non compliant, it is damaged data, it cannot
be muxed in a compliant way. If we support muxing damaged data then 
parsing that data as a way to unconditionally override the user is
problematic.
If you try to interpret damaged data the result is undefined, the spec
does not tell you how to interpret it and 2 parsers can disagree
if a damaged frame is a keyframe. 

Lets just as 2 examples consider multiple slices some being parsed as IDR
some as non IDR, so what is that ? or what if some packet reodering or
duplication causes parts of a IDR and non IDR frame to be mixed.
where is the IDR frame in that case ? 2 parsers can disagree
still a decoder can with some luck start decoding from such a
point i would guess. But does the user want this marked as a keyframe ?
maybe yes, maybe no. I think taking the users only way to set the keyframe
flag away is a step backward

Thanks

[...]
James Almer July 30, 2021, 1:31 p.m. UTC | #12
On 7/30/2021 8:33 AM, Michael Niedermayer wrote:
> On Thu, Jul 29, 2021 at 03:09:03PM -0300, James Almer wrote:
>> On 7/29/2021 2:58 PM, Michael Niedermayer wrote:
>>> On Tue, Jul 27, 2021 at 10:08:20AM -0300, James Almer wrote:
>>>> Since we can't blindly trust the keyframe flag in packets and assume its
>>>> contents are a valid Sync Sample, do some basic bitstream parsing to build the
>>>> Sync Sample table in addition to a Random Access Recovery Point table.
>>>>
>>>> Suggested-by: ffmpeg@fb.com
>>>> Signed-off-by: James Almer <jamrial@gmail.com>
>>>> ---
>>>>    libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
>>>>    libavformat/movenc.h         |   1 +
>>>>    tests/ref/lavf-fate/h264.mp4 |   6 +-
>>>>    3 files changed, 123 insertions(+), 9 deletions(-)
>>>
> 
> A.
>>> Will this allow a user to mark a subset of keyframes as random
>>> access points ?
>>> A user may want seeking to preferably happen to a scene start and not a
>>> frame before
>>>
> 
> B.
>>> Will this allow a user to mark a damaged keyframe as such ?
>>> A frame might in fact have been a keyframe and maybe the only in the file
>>> but maybe was damaged so a parser might fail to detect it.
>>
>> This code will ensure all packets with an IDR picture will be listed in the
>> Sync Sample table, and all packets with a Recovery Point SEI message will be
>> listed in the Random Access Recovery Point table.
>> Whatever is signaled in packets (like the keyframe flag) is ignored to
>> prevent creating non-spec compliant output.
> 
> The file in case B will be non compliant, it is damaged data, it cannot
> be muxed in a compliant way. If we support muxing damaged data then
> parsing that data as a way to unconditionally override the user is
> problematic.
> If you try to interpret damaged data the result is undefined, the spec
> does not tell you how to interpret it and 2 parsers can disagree
> if a damaged frame is a keyframe.

If you don't mark a packet as a Sync Sample, even if it contains damaged 
data, the file is still compliant. The point here is to avoid filling 
the Sync Sample table with bogus entries.

> 
> Lets just as 2 examples consider multiple slices some being parsed as IDR
> some as non IDR, so what is that ? or what if some packet reodering or
> duplication causes parts of a IDR and non IDR frame to be mixed.
> where is the IDR frame in that case ? 2 parsers can disagree

The AVCodecParserContext will tag a packet with an IDR slice as 
keyframe, and then disregard anything else it may find. I made the code 
i wrote here do the same.

> still a decoder can with some luck start decoding from such a
> point i would guess. But does the user want this marked as a keyframe ?
> maybe yes, maybe no. I think taking the users only way to set the keyframe
> flag away is a step backward

Look at MPEG2 parsing and others in this container. It's the same 
scenario. Did any of these concerns show up back then?
What I'm doing here for h264 is the same thing we did for those, to 
ensure we don't create non spec compliant files because of damaged or 
mistagged input, or a careless user.

Letting the user mark whatever they want as a Sync Sample on a container 
where it's explicit what should and should not be such is not good 
practice. If you want that, you can use Matroska and other formats where 
keyframe tagging is apparently not strict.

Now, i want to point out that I wrote this parsing code here because my 
previous attempt at stopping the AVCodecParserContext from being too 
liberal marking things as keyframes was rejected *precisely* because it 
would let the user create non compliant output in mp4. And now you don't 
want this one either because you want to let the user create non 
complaint output.
I'm running in circles here and it's getting tiresome.

> 
> Thanks
> 
> [...]
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Michael Niedermayer July 31, 2021, 9:41 a.m. UTC | #13
On Fri, Jul 30, 2021 at 10:31:53AM -0300, James Almer wrote:
> On 7/30/2021 8:33 AM, Michael Niedermayer wrote:
> > On Thu, Jul 29, 2021 at 03:09:03PM -0300, James Almer wrote:
> > > On 7/29/2021 2:58 PM, Michael Niedermayer wrote:
> > > > On Tue, Jul 27, 2021 at 10:08:20AM -0300, James Almer wrote:
> > > > > Since we can't blindly trust the keyframe flag in packets and assume its
> > > > > contents are a valid Sync Sample, do some basic bitstream parsing to build the
> > > > > Sync Sample table in addition to a Random Access Recovery Point table.
> > > > > 
> > > > > Suggested-by: ffmpeg@fb.com
> > > > > Signed-off-by: James Almer <jamrial@gmail.com>
> > > > > ---
> > > > >    libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
> > > > >    libavformat/movenc.h         |   1 +
> > > > >    tests/ref/lavf-fate/h264.mp4 |   6 +-
> > > > >    3 files changed, 123 insertions(+), 9 deletions(-)
> > > > 
> > 
> > A.
> > > > Will this allow a user to mark a subset of keyframes as random
> > > > access points ?
> > > > A user may want seeking to preferably happen to a scene start and not a
> > > > frame before
> > > > 
> > 
> > B.
> > > > Will this allow a user to mark a damaged keyframe as such ?
> > > > A frame might in fact have been a keyframe and maybe the only in the file
> > > > but maybe was damaged so a parser might fail to detect it.
> > > 
> > > This code will ensure all packets with an IDR picture will be listed in the
> > > Sync Sample table, and all packets with a Recovery Point SEI message will be
> > > listed in the Random Access Recovery Point table.
> > > Whatever is signaled in packets (like the keyframe flag) is ignored to
> > > prevent creating non-spec compliant output.
> > 
> > The file in case B will be non compliant, it is damaged data, it cannot
> > be muxed in a compliant way. If we support muxing damaged data then
> > parsing that data as a way to unconditionally override the user is
> > problematic.
> > If you try to interpret damaged data the result is undefined, the spec
> > does not tell you how to interpret it and 2 parsers can disagree
> > if a damaged frame is a keyframe.
> 
> If you don't mark a packet as a Sync Sample, even if it contains damaged
> data, the file is still compliant. The point here is to avoid filling the
> Sync Sample table with bogus entries.

IMHO, damaged data is not always a compliant video stream, otherwise in the most extreem
example /dev/random would have to be a compliant h264 stream.


> 
> > 
> > Lets just as 2 examples consider multiple slices some being parsed as IDR
> > some as non IDR, so what is that ? or what if some packet reodering or
> > duplication causes parts of a IDR and non IDR frame to be mixed.
> > where is the IDR frame in that case ? 2 parsers can disagree
> 
> The AVCodecParserContext will tag a packet with an IDR slice as keyframe,
> and then disregard anything else it may find. I made the code i wrote here
> do the same.

If thats the case, we could take the example of a frame lets say it has
100 slices, and in one of these 1 byte is changed so it becomes a IDR slice
That is not really a keyframe or sync symple yet it will
be marked as one IIUC.
Now to continue with this example, which way will it work better
X. With this marked not a keyframe or
Y. With this marked as keyframe as the parser detects ?

Its not containing a single valid IDR slice just 1 of the non IDR slices
have been damaged and are misparsed. That will not work well as a sync sample
yet as a normal sample the 1% of damage might not be noticed at all as its copied
from the last frame



> 
> > still a decoder can with some luck start decoding from such a
> > point i would guess. But does the user want this marked as a keyframe ?
> > maybe yes, maybe no. I think taking the users only way to set the keyframe
> > flag away is a step backward
> 
> Look at MPEG2 parsing and others in this container. It's the same scenario.
> Did any of these concerns show up back then?
> What I'm doing here for h264 is the same thing we did for those, to ensure
> we don't create non spec compliant files because of damaged or mistagged
> input, or a careless user.
> 
> Letting the user mark whatever they want as a Sync Sample on a container
> where it's explicit what should and should not be such is not good practice.
> If you want that, you can use Matroska and other formats where keyframe
> tagging is apparently not strict.
> 
> Now, i want to point out that I wrote this parsing code here because my
> previous attempt at stopping the AVCodecParserContext from being too liberal
> marking things as keyframes was rejected *precisely* because it would let
> the user create non compliant output in mp4. And now you don't want this one
> either because you want to let the user create non complaint output.
> I'm running in circles here and it's getting tiresome.

I see the problem and i agree with you that it is a problem that needs
a solution. Its very bad if the users sets bad flags and it results in
non-compliant files. What my concern is, is that theres no way for the
user to override this when its the parsing code thats wrong and previously
the user could override this.
It may be very rare that the parser is wrong and a user would bother to
override it, i do not know, then again some people are quite crazy with
the level of perfection they try to achieve in their videos

Theres also the 2nd situation where all the information the parser
extracts is already known to the lib user and going over the whole
bitstream is wasting cpu cycles. This can be a situation where for
example all input files where just a moment ago created by FFmpeg
and we assume these would then be as compliant as redoing the parsing
would make them

Thanks

[...]
Hendrik Leppkes July 31, 2021, 12:40 p.m. UTC | #14
On Sat, Jul 31, 2021 at 11:41 AM Michael Niedermayer
<michael@niedermayer.cc> wrote:
> Theres also the 2nd situation where all the information the parser
> extracts is already known to the lib user and going over the whole
> bitstream is wasting cpu cycles. This can be a situation where for
> example all input files where just a moment ago created by FFmpeg
> and we assume these would then be as compliant as redoing the parsing
> would make them
>

This is however not necessarily the case, because we only have one
keyframe field and there are quite varying definitions of what a
container would consider a keyframe for their own purposes.
This is the entire reason this patch was created in the first place,
because different containers have different keyframe semantics, and a
single field in AVPacket cannot cleanly represent it.

A container therefore determining for its own definition of keyframe
how to mark them is the only reliable way currently.

- Hendrik
James Almer July 31, 2021, 4:10 p.m. UTC | #15
On 7/31/2021 6:41 AM, Michael Niedermayer wrote:
> On Fri, Jul 30, 2021 at 10:31:53AM -0300, James Almer wrote:
>> On 7/30/2021 8:33 AM, Michael Niedermayer wrote:
>>> On Thu, Jul 29, 2021 at 03:09:03PM -0300, James Almer wrote:
>>>> On 7/29/2021 2:58 PM, Michael Niedermayer wrote:
>>>>> On Tue, Jul 27, 2021 at 10:08:20AM -0300, James Almer wrote:
>>>>>> Since we can't blindly trust the keyframe flag in packets and assume its
>>>>>> contents are a valid Sync Sample, do some basic bitstream parsing to build the
>>>>>> Sync Sample table in addition to a Random Access Recovery Point table.
>>>>>>
>>>>>> Suggested-by: ffmpeg@fb.com
>>>>>> Signed-off-by: James Almer <jamrial@gmail.com>
>>>>>> ---
>>>>>>     libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
>>>>>>     libavformat/movenc.h         |   1 +
>>>>>>     tests/ref/lavf-fate/h264.mp4 |   6 +-
>>>>>>     3 files changed, 123 insertions(+), 9 deletions(-)
>>>>>
>>>
>>> A.
>>>>> Will this allow a user to mark a subset of keyframes as random
>>>>> access points ?
>>>>> A user may want seeking to preferably happen to a scene start and not a
>>>>> frame before
>>>>>
>>>
>>> B.
>>>>> Will this allow a user to mark a damaged keyframe as such ?
>>>>> A frame might in fact have been a keyframe and maybe the only in the file
>>>>> but maybe was damaged so a parser might fail to detect it.
>>>>
>>>> This code will ensure all packets with an IDR picture will be listed in the
>>>> Sync Sample table, and all packets with a Recovery Point SEI message will be
>>>> listed in the Random Access Recovery Point table.
>>>> Whatever is signaled in packets (like the keyframe flag) is ignored to
>>>> prevent creating non-spec compliant output.
>>>
>>> The file in case B will be non compliant, it is damaged data, it cannot
>>> be muxed in a compliant way. If we support muxing damaged data then
>>> parsing that data as a way to unconditionally override the user is
>>> problematic.
>>> If you try to interpret damaged data the result is undefined, the spec
>>> does not tell you how to interpret it and 2 parsers can disagree
>>> if a damaged frame is a keyframe.
>>
>> If you don't mark a packet as a Sync Sample, even if it contains damaged
>> data, the file is still compliant. The point here is to avoid filling the
>> Sync Sample table with bogus entries.
> 
> IMHO, damaged data is not always a compliant video stream, otherwise in the most extreem
> example /dev/random would have to be a compliant h264 stream.
> 
> 
>>
>>>
>>> Lets just as 2 examples consider multiple slices some being parsed as IDR
>>> some as non IDR, so what is that ? or what if some packet reodering or
>>> duplication causes parts of a IDR and non IDR frame to be mixed.
>>> where is the IDR frame in that case ? 2 parsers can disagree
>>
>> The AVCodecParserContext will tag a packet with an IDR slice as keyframe,
>> and then disregard anything else it may find. I made the code i wrote here
>> do the same.
> 
> If thats the case, we could take the example of a frame lets say it has
> 100 slices, and in one of these 1 byte is changed so it becomes a IDR slice
> That is not really a keyframe or sync symple yet it will
> be marked as one IIUC.

Unless it's the first slice NALU out of those 100 in the AU, it will not 
be marked as a keyframe by the AVCodecParserContext, nor as a Sync 
Sample by this code.

> Now to continue with this example, which way will it work better
> X. With this marked not a keyframe or
> Y. With this marked as keyframe as the parser detects ?

It should not be marked as a Sync Sample.

> 
> Its not containing a single valid IDR slice just 1 of the non IDR slices
> have been damaged and are misparsed. That will not work well as a sync sample
> yet as a normal sample the 1% of damage might not be noticed at all as its copied
> from the last frame

How about i look at every slice NALU in the AU and ensure there's only 
IDR slices before marking it as a Sync Sample, instead of only looking 
at the very first slice and decide based on it?
Same thing could be done in the AVCodecParserContext, for that matter.

> 
> 
> 
>>
>>> still a decoder can with some luck start decoding from such a
>>> point i would guess. But does the user want this marked as a keyframe ?
>>> maybe yes, maybe no. I think taking the users only way to set the keyframe
>>> flag away is a step backward
>>
>> Look at MPEG2 parsing and others in this container. It's the same scenario.
>> Did any of these concerns show up back then?
>> What I'm doing here for h264 is the same thing we did for those, to ensure
>> we don't create non spec compliant files because of damaged or mistagged
>> input, or a careless user.
>>
>> Letting the user mark whatever they want as a Sync Sample on a container
>> where it's explicit what should and should not be such is not good practice.
>> If you want that, you can use Matroska and other formats where keyframe
>> tagging is apparently not strict.
>>
>> Now, i want to point out that I wrote this parsing code here because my
>> previous attempt at stopping the AVCodecParserContext from being too liberal
>> marking things as keyframes was rejected *precisely* because it would let
>> the user create non compliant output in mp4. And now you don't want this one
>> either because you want to let the user create non complaint output.
>> I'm running in circles here and it's getting tiresome.
> 
> I see the problem and i agree with you that it is a problem that needs
> a solution. Its very bad if the users sets bad flags and it results in
> non-compliant files. What my concern is, is that theres no way for the
> user to override this when its the parsing code thats wrong and previously
> the user could override this.
> It may be very rare that the parser is wrong and a user would bother to
> override it, i do not know, then again some people are quite crazy with
> the level of perfection they try to achieve in their videos

The only reason the user may have wanted to override the packet's 
keyframe flag was precisely because the AVCodecParserContext was tagging 
way too many packets as such. This would no longer be necessary after 
this change, and you can trust the muxer to do the right thing.

I can however accept a compromise here, and take into account the 
packet's keyframe flag if and only if invalid or unexpected data is 
found in the bitstream. If the parsing succeeds, then whatever the 
packet metadata may signal should be outright ignored.

> 
> Theres also the 2nd situation where all the information the parser
> extracts is already known to the lib user and going over the whole
> bitstream is wasting cpu cycles. This can be a situation where for
> example all input files where just a moment ago created by FFmpeg
> and we assume these would then be as compliant as redoing the parsing
> would make them

Other information, like Recovery Point SEI messages, can't be properly 
signaled by encoders within packet metadata. Muxer can and should, for 
now, look at the bitstream for this purpose.

> 
> Thanks
> 
> [...]
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Michael Niedermayer July 31, 2021, 4:49 p.m. UTC | #16
On Sat, Jul 31, 2021 at 02:40:11PM +0200, Hendrik Leppkes wrote:
> On Sat, Jul 31, 2021 at 11:41 AM Michael Niedermayer
> <michael@niedermayer.cc> wrote:
> > Theres also the 2nd situation where all the information the parser
> > extracts is already known to the lib user and going over the whole
> > bitstream is wasting cpu cycles. This can be a situation where for
> > example all input files where just a moment ago created by FFmpeg
> > and we assume these would then be as compliant as redoing the parsing
> > would make them
> >
> 
> This is however not necessarily the case, because we only have one
> keyframe field and there are quite varying definitions of what a
> container would consider a keyframe for their own purposes.
> This is the entire reason this patch was created in the first place,
> because different containers have different keyframe semantics, and a
> single field in AVPacket cannot cleanly represent it.
> 

agree on all this and thats why i was pushing a bit toward improving the
API. 


> A container therefore determining for its own definition of keyframe
> how to mark them is the only reliable way currently.

Does it not feel ugly to you that some data can be set by the user
through the API and is used, some can be set and is ignored and re-parsed
for some container-codec combinations and some cannot be set at all ?
What we have is quite inconsistent. I am trying to push a bit towards
improving this. Iam not insisting on it.
One thing iam suggesting is to extend the API to at least be able to
represent all information. 

If all information can be represented then that allows moving the "parser" 
out of the muxer and also naturally allows passing information on from
user or demuxer or encoder and also allows judging if we are missing
something or already have all data. 
I may be missing something but to me this just seems cleaner
than ignoring the user settings completely, reparsing and then whats
the next step ? we replace the parser system but do we leave this inside
the muxer or move it back out ? If we move it back out its a odd moving
around of the code, if its left in the muxer under a new API then its 
IMHO a bit unwieldy, sometimes redundant and hard to interact with.
Iam clearly not understanding something here of the grand plan  ...

thx

[...]
Michael Niedermayer July 31, 2021, 5:46 p.m. UTC | #17
On Sat, Jul 31, 2021 at 01:10:27PM -0300, James Almer wrote:
> On 7/31/2021 6:41 AM, Michael Niedermayer wrote:
> > On Fri, Jul 30, 2021 at 10:31:53AM -0300, James Almer wrote:
> > > On 7/30/2021 8:33 AM, Michael Niedermayer wrote:
> > > > On Thu, Jul 29, 2021 at 03:09:03PM -0300, James Almer wrote:
> > > > > On 7/29/2021 2:58 PM, Michael Niedermayer wrote:
> > > > > > On Tue, Jul 27, 2021 at 10:08:20AM -0300, James Almer wrote:
> > > > > > > Since we can't blindly trust the keyframe flag in packets and assume its
> > > > > > > contents are a valid Sync Sample, do some basic bitstream parsing to build the
> > > > > > > Sync Sample table in addition to a Random Access Recovery Point table.
> > > > > > > 
> > > > > > > Suggested-by: ffmpeg@fb.com
> > > > > > > Signed-off-by: James Almer <jamrial@gmail.com>
> > > > > > > ---
> > > > > > >     libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
> > > > > > >     libavformat/movenc.h         |   1 +
> > > > > > >     tests/ref/lavf-fate/h264.mp4 |   6 +-
> > > > > > >     3 files changed, 123 insertions(+), 9 deletions(-)
> > > > > > 
> > > > 
> > > > A.
> > > > > > Will this allow a user to mark a subset of keyframes as random
> > > > > > access points ?
> > > > > > A user may want seeking to preferably happen to a scene start and not a
> > > > > > frame before
> > > > > > 
> > > > 
> > > > B.
> > > > > > Will this allow a user to mark a damaged keyframe as such ?
> > > > > > A frame might in fact have been a keyframe and maybe the only in the file
> > > > > > but maybe was damaged so a parser might fail to detect it.
> > > > > 
> > > > > This code will ensure all packets with an IDR picture will be listed in the
> > > > > Sync Sample table, and all packets with a Recovery Point SEI message will be
> > > > > listed in the Random Access Recovery Point table.
> > > > > Whatever is signaled in packets (like the keyframe flag) is ignored to
> > > > > prevent creating non-spec compliant output.
> > > > 
> > > > The file in case B will be non compliant, it is damaged data, it cannot
> > > > be muxed in a compliant way. If we support muxing damaged data then
> > > > parsing that data as a way to unconditionally override the user is
> > > > problematic.
> > > > If you try to interpret damaged data the result is undefined, the spec
> > > > does not tell you how to interpret it and 2 parsers can disagree
> > > > if a damaged frame is a keyframe.
> > > 
> > > If you don't mark a packet as a Sync Sample, even if it contains damaged
> > > data, the file is still compliant. The point here is to avoid filling the
> > > Sync Sample table with bogus entries.
> > 
> > IMHO, damaged data is not always a compliant video stream, otherwise in the most extreem
> > example /dev/random would have to be a compliant h264 stream.
> > 
> > 
> > > 
> > > > 
> > > > Lets just as 2 examples consider multiple slices some being parsed as IDR
> > > > some as non IDR, so what is that ? or what if some packet reodering or
> > > > duplication causes parts of a IDR and non IDR frame to be mixed.
> > > > where is the IDR frame in that case ? 2 parsers can disagree
> > > 
> > > The AVCodecParserContext will tag a packet with an IDR slice as keyframe,
> > > and then disregard anything else it may find. I made the code i wrote here
> > > do the same.
> > 
> > If thats the case, we could take the example of a frame lets say it has
> > 100 slices, and in one of these 1 byte is changed so it becomes a IDR slice
> > That is not really a keyframe or sync symple yet it will
> > be marked as one IIUC.
> 
> Unless it's the first slice NALU out of those 100 in the AU, it will not be
> marked as a keyframe by the AVCodecParserContext, nor as a Sync Sample by
> this code.
>
> > Now to continue with this example, which way will it work better
> > X. With this marked not a keyframe or
> > Y. With this marked as keyframe as the parser detects ?
> 
> It should not be marked as a Sync Sample.
> 
> > 

> > Its not containing a single valid IDR slice just 1 of the non IDR slices
> > have been damaged and are misparsed. That will not work well as a sync sample
> > yet as a normal sample the 1% of damage might not be noticed at all as its copied
> > from the last frame
> 
> How about i look at every slice NALU in the AU and ensure there's only IDR
> slices before marking it as a Sync Sample, instead of only looking at the
> very first slice and decide based on it?
> Same thing could be done in the AVCodecParserContext, for that matter.

Then you get a IDR frame wrong if it has a single damaged slice where its
parsed as non IDR.

What you could do and i do not suggest this actually, just hypothetically
is take the majority >50% being IDR as probably IDR. That will probably 
require extra care when there are 2 fields or frames in one packet

The problem is if you unconditionally overreide the user you really need
to be perfect and this is not easy. More so as being done by default it 
too should be low overhead not slowing things down


> 
> > 
> > 
> > 
> > > 
> > > > still a decoder can with some luck start decoding from such a
> > > > point i would guess. But does the user want this marked as a keyframe ?
> > > > maybe yes, maybe no. I think taking the users only way to set the keyframe
> > > > flag away is a step backward
> > > 
> > > Look at MPEG2 parsing and others in this container. It's the same scenario.
> > > Did any of these concerns show up back then?
> > > What I'm doing here for h264 is the same thing we did for those, to ensure
> > > we don't create non spec compliant files because of damaged or mistagged
> > > input, or a careless user.
> > > 
> > > Letting the user mark whatever they want as a Sync Sample on a container
> > > where it's explicit what should and should not be such is not good practice.
> > > If you want that, you can use Matroska and other formats where keyframe
> > > tagging is apparently not strict.
> > > 
> > > Now, i want to point out that I wrote this parsing code here because my
> > > previous attempt at stopping the AVCodecParserContext from being too liberal
> > > marking things as keyframes was rejected *precisely* because it would let
> > > the user create non compliant output in mp4. And now you don't want this one
> > > either because you want to let the user create non complaint output.
> > > I'm running in circles here and it's getting tiresome.
> > 
> > I see the problem and i agree with you that it is a problem that needs
> > a solution. Its very bad if the users sets bad flags and it results in
> > non-compliant files. What my concern is, is that theres no way for the
> > user to override this when its the parsing code thats wrong and previously
> > the user could override this.
> > It may be very rare that the parser is wrong and a user would bother to
> > override it, i do not know, then again some people are quite crazy with
> > the level of perfection they try to achieve in their videos
> 
> The only reason the user may have wanted to override the packet's keyframe
> flag was precisely because the AVCodecParserContext was tagging way too many
> packets as such. This would no longer be necessary after this change, and
> you can trust the muxer to do the right thing.
> 

> I can however accept a compromise here, and take into account the packet's
> keyframe flag if and only if invalid or unexpected data is found in the
> bitstream. If the parsing succeeds, then whatever the packet metadata may
> signal should be outright ignored.

how do you detect invalid or unexpected data ?


> 
> > 
> > Theres also the 2nd situation where all the information the parser
> > extracts is already known to the lib user and going over the whole
> > bitstream is wasting cpu cycles. This can be a situation where for
> > example all input files where just a moment ago created by FFmpeg
> > and we assume these would then be as compliant as redoing the parsing
> > would make them
> 
> Other information, like Recovery Point SEI messages, can't be properly
> signaled by encoders within packet metadata. Muxer can and should, for now,
> look at the bitstream for this purpose.

yes

thx

[...]
James Almer July 31, 2021, 5:51 p.m. UTC | #18
On 7/31/2021 2:46 PM, Michael Niedermayer wrote:
> On Sat, Jul 31, 2021 at 01:10:27PM -0300, James Almer wrote:
>> On 7/31/2021 6:41 AM, Michael Niedermayer wrote:
>>> On Fri, Jul 30, 2021 at 10:31:53AM -0300, James Almer wrote:
>>>> On 7/30/2021 8:33 AM, Michael Niedermayer wrote:
>>>>> On Thu, Jul 29, 2021 at 03:09:03PM -0300, James Almer wrote:
>>>>>> On 7/29/2021 2:58 PM, Michael Niedermayer wrote:
>>>>>>> On Tue, Jul 27, 2021 at 10:08:20AM -0300, James Almer wrote:
>>>>>>>> Since we can't blindly trust the keyframe flag in packets and assume its
>>>>>>>> contents are a valid Sync Sample, do some basic bitstream parsing to build the
>>>>>>>> Sync Sample table in addition to a Random Access Recovery Point table.
>>>>>>>>
>>>>>>>> Suggested-by: ffmpeg@fb.com
>>>>>>>> Signed-off-by: James Almer <jamrial@gmail.com>
>>>>>>>> ---
>>>>>>>>      libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
>>>>>>>>      libavformat/movenc.h         |   1 +
>>>>>>>>      tests/ref/lavf-fate/h264.mp4 |   6 +-
>>>>>>>>      3 files changed, 123 insertions(+), 9 deletions(-)
>>>>>>>
>>>>>
>>>>> A.
>>>>>>> Will this allow a user to mark a subset of keyframes as random
>>>>>>> access points ?
>>>>>>> A user may want seeking to preferably happen to a scene start and not a
>>>>>>> frame before
>>>>>>>
>>>>>
>>>>> B.
>>>>>>> Will this allow a user to mark a damaged keyframe as such ?
>>>>>>> A frame might in fact have been a keyframe and maybe the only in the file
>>>>>>> but maybe was damaged so a parser might fail to detect it.
>>>>>>
>>>>>> This code will ensure all packets with an IDR picture will be listed in the
>>>>>> Sync Sample table, and all packets with a Recovery Point SEI message will be
>>>>>> listed in the Random Access Recovery Point table.
>>>>>> Whatever is signaled in packets (like the keyframe flag) is ignored to
>>>>>> prevent creating non-spec compliant output.
>>>>>
>>>>> The file in case B will be non compliant, it is damaged data, it cannot
>>>>> be muxed in a compliant way. If we support muxing damaged data then
>>>>> parsing that data as a way to unconditionally override the user is
>>>>> problematic.
>>>>> If you try to interpret damaged data the result is undefined, the spec
>>>>> does not tell you how to interpret it and 2 parsers can disagree
>>>>> if a damaged frame is a keyframe.
>>>>
>>>> If you don't mark a packet as a Sync Sample, even if it contains damaged
>>>> data, the file is still compliant. The point here is to avoid filling the
>>>> Sync Sample table with bogus entries.
>>>
>>> IMHO, damaged data is not always a compliant video stream, otherwise in the most extreem
>>> example /dev/random would have to be a compliant h264 stream.
>>>
>>>
>>>>
>>>>>
>>>>> Lets just as 2 examples consider multiple slices some being parsed as IDR
>>>>> some as non IDR, so what is that ? or what if some packet reodering or
>>>>> duplication causes parts of a IDR and non IDR frame to be mixed.
>>>>> where is the IDR frame in that case ? 2 parsers can disagree
>>>>
>>>> The AVCodecParserContext will tag a packet with an IDR slice as keyframe,
>>>> and then disregard anything else it may find. I made the code i wrote here
>>>> do the same.
>>>
>>> If thats the case, we could take the example of a frame lets say it has
>>> 100 slices, and in one of these 1 byte is changed so it becomes a IDR slice
>>> That is not really a keyframe or sync symple yet it will
>>> be marked as one IIUC.
>>
>> Unless it's the first slice NALU out of those 100 in the AU, it will not be
>> marked as a keyframe by the AVCodecParserContext, nor as a Sync Sample by
>> this code.
>>
>>> Now to continue with this example, which way will it work better
>>> X. With this marked not a keyframe or
>>> Y. With this marked as keyframe as the parser detects ?
>>
>> It should not be marked as a Sync Sample.
>>
>>>
> 
>>> Its not containing a single valid IDR slice just 1 of the non IDR slices
>>> have been damaged and are misparsed. That will not work well as a sync sample
>>> yet as a normal sample the 1% of damage might not be noticed at all as its copied
>>> from the last frame
>>
>> How about i look at every slice NALU in the AU and ensure there's only IDR
>> slices before marking it as a Sync Sample, instead of only looking at the
>> very first slice and decide based on it?
>> Same thing could be done in the AVCodecParserContext, for that matter.
> 
> Then you get a IDR frame wrong if it has a single damaged slice where its
> parsed as non IDR.

Same situation: It does not get added to the Sync Sample table.

> 
> What you could do and i do not suggest this actually, just hypothetically
> is take the majority >50% being IDR as probably IDR. That will probably
> require extra care when there are 2 fields or frames in one packet
> 
> The problem is if you unconditionally overreide the user you really need
> to be perfect and this is not easy. More so as being done by default it
> too should be low overhead not slowing things down
> 
> 
>>
>>>
>>>
>>>
>>>>
>>>>> still a decoder can with some luck start decoding from such a
>>>>> point i would guess. But does the user want this marked as a keyframe ?
>>>>> maybe yes, maybe no. I think taking the users only way to set the keyframe
>>>>> flag away is a step backward
>>>>
>>>> Look at MPEG2 parsing and others in this container. It's the same scenario.
>>>> Did any of these concerns show up back then?
>>>> What I'm doing here for h264 is the same thing we did for those, to ensure
>>>> we don't create non spec compliant files because of damaged or mistagged
>>>> input, or a careless user.
>>>>
>>>> Letting the user mark whatever they want as a Sync Sample on a container
>>>> where it's explicit what should and should not be such is not good practice.
>>>> If you want that, you can use Matroska and other formats where keyframe
>>>> tagging is apparently not strict.
>>>>
>>>> Now, i want to point out that I wrote this parsing code here because my
>>>> previous attempt at stopping the AVCodecParserContext from being too liberal
>>>> marking things as keyframes was rejected *precisely* because it would let
>>>> the user create non compliant output in mp4. And now you don't want this one
>>>> either because you want to let the user create non complaint output.
>>>> I'm running in circles here and it's getting tiresome.
>>>
>>> I see the problem and i agree with you that it is a problem that needs
>>> a solution. Its very bad if the users sets bad flags and it results in
>>> non-compliant files. What my concern is, is that theres no way for the
>>> user to override this when its the parsing code thats wrong and previously
>>> the user could override this.
>>> It may be very rare that the parser is wrong and a user would bother to
>>> override it, i do not know, then again some people are quite crazy with
>>> the level of perfection they try to achieve in their videos
>>
>> The only reason the user may have wanted to override the packet's keyframe
>> flag was precisely because the AVCodecParserContext was tagging way too many
>> packets as such. This would no longer be necessary after this change, and
>> you can trust the muxer to do the right thing.
>>
> 
>> I can however accept a compromise here, and take into account the packet's
>> keyframe flag if and only if invalid or unexpected data is found in the
>> bitstream. If the parsing succeeds, then whatever the packet metadata may
>> signal should be outright ignored.
> 
> how do you detect invalid or unexpected data ?

Any parsing error for which we already abort, and if we find a mix of 
IDR and non IDR slices within a given AU. Instead of deciding to not add 
them to the Sync Sample table like i mentioned above, we look at the 
packet's keyframe flag and use it as last resort.

> 
> 
>>
>>>
>>> Theres also the 2nd situation where all the information the parser
>>> extracts is already known to the lib user and going over the whole
>>> bitstream is wasting cpu cycles. This can be a situation where for
>>> example all input files where just a moment ago created by FFmpeg
>>> and we assume these would then be as compliant as redoing the parsing
>>> would make them
>>
>> Other information, like Recovery Point SEI messages, can't be properly
>> signaled by encoders within packet metadata. Muxer can and should, for now,
>> look at the bitstream for this purpose.
> 
> yes
> 
> thx
> 
> [...]
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Michael Niedermayer July 31, 2021, 8:47 p.m. UTC | #19
On Sat, Jul 31, 2021 at 02:51:16PM -0300, James Almer wrote:
> On 7/31/2021 2:46 PM, Michael Niedermayer wrote:
> > On Sat, Jul 31, 2021 at 01:10:27PM -0300, James Almer wrote:
> > > On 7/31/2021 6:41 AM, Michael Niedermayer wrote:
> > > > On Fri, Jul 30, 2021 at 10:31:53AM -0300, James Almer wrote:
> > > > > On 7/30/2021 8:33 AM, Michael Niedermayer wrote:
> > > > > > On Thu, Jul 29, 2021 at 03:09:03PM -0300, James Almer wrote:
> > > > > > > On 7/29/2021 2:58 PM, Michael Niedermayer wrote:
> > > > > > > > On Tue, Jul 27, 2021 at 10:08:20AM -0300, James Almer wrote:
> > > > > > > > > Since we can't blindly trust the keyframe flag in packets and assume its
> > > > > > > > > contents are a valid Sync Sample, do some basic bitstream parsing to build the
> > > > > > > > > Sync Sample table in addition to a Random Access Recovery Point table.
> > > > > > > > > 
> > > > > > > > > Suggested-by: ffmpeg@fb.com
> > > > > > > > > Signed-off-by: James Almer <jamrial@gmail.com>
> > > > > > > > > ---
> > > > > > > > >      libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
> > > > > > > > >      libavformat/movenc.h         |   1 +
> > > > > > > > >      tests/ref/lavf-fate/h264.mp4 |   6 +-
> > > > > > > > >      3 files changed, 123 insertions(+), 9 deletions(-)
> > > > > > > > 
> > > > > > 
> > > > > > A.
> > > > > > > > Will this allow a user to mark a subset of keyframes as random
> > > > > > > > access points ?
> > > > > > > > A user may want seeking to preferably happen to a scene start and not a
> > > > > > > > frame before
> > > > > > > > 
> > > > > > 
> > > > > > B.
> > > > > > > > Will this allow a user to mark a damaged keyframe as such ?
> > > > > > > > A frame might in fact have been a keyframe and maybe the only in the file
> > > > > > > > but maybe was damaged so a parser might fail to detect it.
> > > > > > > 
> > > > > > > This code will ensure all packets with an IDR picture will be listed in the
> > > > > > > Sync Sample table, and all packets with a Recovery Point SEI message will be
> > > > > > > listed in the Random Access Recovery Point table.
> > > > > > > Whatever is signaled in packets (like the keyframe flag) is ignored to
> > > > > > > prevent creating non-spec compliant output.
> > > > > > 
> > > > > > The file in case B will be non compliant, it is damaged data, it cannot
> > > > > > be muxed in a compliant way. If we support muxing damaged data then
> > > > > > parsing that data as a way to unconditionally override the user is
> > > > > > problematic.
> > > > > > If you try to interpret damaged data the result is undefined, the spec
> > > > > > does not tell you how to interpret it and 2 parsers can disagree
> > > > > > if a damaged frame is a keyframe.
> > > > > 
> > > > > If you don't mark a packet as a Sync Sample, even if it contains damaged
> > > > > data, the file is still compliant. The point here is to avoid filling the
> > > > > Sync Sample table with bogus entries.
> > > > 
> > > > IMHO, damaged data is not always a compliant video stream, otherwise in the most extreem
> > > > example /dev/random would have to be a compliant h264 stream.
> > > > 
> > > > 
> > > > > 
> > > > > > 
> > > > > > Lets just as 2 examples consider multiple slices some being parsed as IDR
> > > > > > some as non IDR, so what is that ? or what if some packet reodering or
> > > > > > duplication causes parts of a IDR and non IDR frame to be mixed.
> > > > > > where is the IDR frame in that case ? 2 parsers can disagree
> > > > > 
> > > > > The AVCodecParserContext will tag a packet with an IDR slice as keyframe,
> > > > > and then disregard anything else it may find. I made the code i wrote here
> > > > > do the same.
> > > > 
> > > > If thats the case, we could take the example of a frame lets say it has
> > > > 100 slices, and in one of these 1 byte is changed so it becomes a IDR slice
> > > > That is not really a keyframe or sync symple yet it will
> > > > be marked as one IIUC.
> > > 
> > > Unless it's the first slice NALU out of those 100 in the AU, it will not be
> > > marked as a keyframe by the AVCodecParserContext, nor as a Sync Sample by
> > > this code.
> > > 
> > > > Now to continue with this example, which way will it work better
> > > > X. With this marked not a keyframe or
> > > > Y. With this marked as keyframe as the parser detects ?
> > > 
> > > It should not be marked as a Sync Sample.
> > > 
> > > > 
> > 
> > > > Its not containing a single valid IDR slice just 1 of the non IDR slices
> > > > have been damaged and are misparsed. That will not work well as a sync sample
> > > > yet as a normal sample the 1% of damage might not be noticed at all as its copied
> > > > from the last frame
> > > 
> > > How about i look at every slice NALU in the AU and ensure there's only IDR
> > > slices before marking it as a Sync Sample, instead of only looking at the
> > > very first slice and decide based on it?
> > > Same thing could be done in the AVCodecParserContext, for that matter.
> > 
> > Then you get a IDR frame wrong if it has a single damaged slice where its
> > parsed as non IDR.
> 
> Same situation: It does not get added to the Sync Sample table.

and then you cannot seek to the begin of the file anymore



> 
> > 
> > What you could do and i do not suggest this actually, just hypothetically
> > is take the majority >50% being IDR as probably IDR. That will probably
> > require extra care when there are 2 fields or frames in one packet
> > 
> > The problem is if you unconditionally overreide the user you really need
> > to be perfect and this is not easy. More so as being done by default it
> > too should be low overhead not slowing things down
> > 
> > 
> > > 
> > > > 
> > > > 
> > > > 
> > > > > 
> > > > > > still a decoder can with some luck start decoding from such a
> > > > > > point i would guess. But does the user want this marked as a keyframe ?
> > > > > > maybe yes, maybe no. I think taking the users only way to set the keyframe
> > > > > > flag away is a step backward
> > > > > 
> > > > > Look at MPEG2 parsing and others in this container. It's the same scenario.
> > > > > Did any of these concerns show up back then?
> > > > > What I'm doing here for h264 is the same thing we did for those, to ensure
> > > > > we don't create non spec compliant files because of damaged or mistagged
> > > > > input, or a careless user.
> > > > > 
> > > > > Letting the user mark whatever they want as a Sync Sample on a container
> > > > > where it's explicit what should and should not be such is not good practice.
> > > > > If you want that, you can use Matroska and other formats where keyframe
> > > > > tagging is apparently not strict.
> > > > > 
> > > > > Now, i want to point out that I wrote this parsing code here because my
> > > > > previous attempt at stopping the AVCodecParserContext from being too liberal
> > > > > marking things as keyframes was rejected *precisely* because it would let
> > > > > the user create non compliant output in mp4. And now you don't want this one
> > > > > either because you want to let the user create non complaint output.
> > > > > I'm running in circles here and it's getting tiresome.
> > > > 
> > > > I see the problem and i agree with you that it is a problem that needs
> > > > a solution. Its very bad if the users sets bad flags and it results in
> > > > non-compliant files. What my concern is, is that theres no way for the
> > > > user to override this when its the parsing code thats wrong and previously
> > > > the user could override this.
> > > > It may be very rare that the parser is wrong and a user would bother to
> > > > override it, i do not know, then again some people are quite crazy with
> > > > the level of perfection they try to achieve in their videos
> > > 
> > > The only reason the user may have wanted to override the packet's keyframe
> > > flag was precisely because the AVCodecParserContext was tagging way too many
> > > packets as such. This would no longer be necessary after this change, and
> > > you can trust the muxer to do the right thing.
> > > 
> > 
> > > I can however accept a compromise here, and take into account the packet's
> > > keyframe flag if and only if invalid or unexpected data is found in the
> > > bitstream. If the parsing succeeds, then whatever the packet metadata may
> > > signal should be outright ignored.
> > 
> > how do you detect invalid or unexpected data ?
> 
> Any parsing error for which we already abort, and if we find a mix of IDR
> and non IDR slices within a given AU. Instead of deciding to not add them to
> the Sync Sample table like i mentioned above, we look at the packet's
> keyframe flag and use it as last resort.

Is there any other reason to parse the whole AU ?
if not then this would require O(n) instead of O(1) parsing and be slower
If you do this, i would suggest that 
if the keyframe flag is set to stop parsing on the first IDR slice, similarly
if the keyframe flag is not set to stop parsing on the first non IDR slice
else all slices are the same type and you would override the user

With this you only need to parse O(1) when the flag matches a undamaged AU
and often also for a damaged AU.
the O(n) case would be reserved for the oddball cases

That said, i still think theres value in having the parsing code be that 
a parser, bsf or other API not cramed into the muxer but run prior in a 
more modular way in the future

Thanks

[...]
James Almer July 31, 2021, 9:12 p.m. UTC | #20
On 7/31/2021 5:47 PM, Michael Niedermayer wrote:
> On Sat, Jul 31, 2021 at 02:51:16PM -0300, James Almer wrote:
>> On 7/31/2021 2:46 PM, Michael Niedermayer wrote:
>>> On Sat, Jul 31, 2021 at 01:10:27PM -0300, James Almer wrote:
>>>> On 7/31/2021 6:41 AM, Michael Niedermayer wrote:
>>>>> On Fri, Jul 30, 2021 at 10:31:53AM -0300, James Almer wrote:
>>>>>> On 7/30/2021 8:33 AM, Michael Niedermayer wrote:
>>>>>>> On Thu, Jul 29, 2021 at 03:09:03PM -0300, James Almer wrote:
>>>>>>>> On 7/29/2021 2:58 PM, Michael Niedermayer wrote:
>>>>>>>>> On Tue, Jul 27, 2021 at 10:08:20AM -0300, James Almer wrote:
>>>>>>>>>> Since we can't blindly trust the keyframe flag in packets and assume its
>>>>>>>>>> contents are a valid Sync Sample, do some basic bitstream parsing to build the
>>>>>>>>>> Sync Sample table in addition to a Random Access Recovery Point table.
>>>>>>>>>>
>>>>>>>>>> Suggested-by: ffmpeg@fb.com
>>>>>>>>>> Signed-off-by: James Almer <jamrial@gmail.com>
>>>>>>>>>> ---
>>>>>>>>>>       libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
>>>>>>>>>>       libavformat/movenc.h         |   1 +
>>>>>>>>>>       tests/ref/lavf-fate/h264.mp4 |   6 +-
>>>>>>>>>>       3 files changed, 123 insertions(+), 9 deletions(-)
>>>>>>>>>
>>>>>>>
>>>>>>> A.
>>>>>>>>> Will this allow a user to mark a subset of keyframes as random
>>>>>>>>> access points ?
>>>>>>>>> A user may want seeking to preferably happen to a scene start and not a
>>>>>>>>> frame before
>>>>>>>>>
>>>>>>>
>>>>>>> B.
>>>>>>>>> Will this allow a user to mark a damaged keyframe as such ?
>>>>>>>>> A frame might in fact have been a keyframe and maybe the only in the file
>>>>>>>>> but maybe was damaged so a parser might fail to detect it.
>>>>>>>>
>>>>>>>> This code will ensure all packets with an IDR picture will be listed in the
>>>>>>>> Sync Sample table, and all packets with a Recovery Point SEI message will be
>>>>>>>> listed in the Random Access Recovery Point table.
>>>>>>>> Whatever is signaled in packets (like the keyframe flag) is ignored to
>>>>>>>> prevent creating non-spec compliant output.
>>>>>>>
>>>>>>> The file in case B will be non compliant, it is damaged data, it cannot
>>>>>>> be muxed in a compliant way. If we support muxing damaged data then
>>>>>>> parsing that data as a way to unconditionally override the user is
>>>>>>> problematic.
>>>>>>> If you try to interpret damaged data the result is undefined, the spec
>>>>>>> does not tell you how to interpret it and 2 parsers can disagree
>>>>>>> if a damaged frame is a keyframe.
>>>>>>
>>>>>> If you don't mark a packet as a Sync Sample, even if it contains damaged
>>>>>> data, the file is still compliant. The point here is to avoid filling the
>>>>>> Sync Sample table with bogus entries.
>>>>>
>>>>> IMHO, damaged data is not always a compliant video stream, otherwise in the most extreem
>>>>> example /dev/random would have to be a compliant h264 stream.
>>>>>
>>>>>
>>>>>>
>>>>>>>
>>>>>>> Lets just as 2 examples consider multiple slices some being parsed as IDR
>>>>>>> some as non IDR, so what is that ? or what if some packet reodering or
>>>>>>> duplication causes parts of a IDR and non IDR frame to be mixed.
>>>>>>> where is the IDR frame in that case ? 2 parsers can disagree
>>>>>>
>>>>>> The AVCodecParserContext will tag a packet with an IDR slice as keyframe,
>>>>>> and then disregard anything else it may find. I made the code i wrote here
>>>>>> do the same.
>>>>>
>>>>> If thats the case, we could take the example of a frame lets say it has
>>>>> 100 slices, and in one of these 1 byte is changed so it becomes a IDR slice
>>>>> That is not really a keyframe or sync symple yet it will
>>>>> be marked as one IIUC.
>>>>
>>>> Unless it's the first slice NALU out of those 100 in the AU, it will not be
>>>> marked as a keyframe by the AVCodecParserContext, nor as a Sync Sample by
>>>> this code.
>>>>
>>>>> Now to continue with this example, which way will it work better
>>>>> X. With this marked not a keyframe or
>>>>> Y. With this marked as keyframe as the parser detects ?
>>>>
>>>> It should not be marked as a Sync Sample.
>>>>
>>>>>
>>>
>>>>> Its not containing a single valid IDR slice just 1 of the non IDR slices
>>>>> have been damaged and are misparsed. That will not work well as a sync sample
>>>>> yet as a normal sample the 1% of damage might not be noticed at all as its copied
>>>>> from the last frame
>>>>
>>>> How about i look at every slice NALU in the AU and ensure there's only IDR
>>>> slices before marking it as a Sync Sample, instead of only looking at the
>>>> very first slice and decide based on it?
>>>> Same thing could be done in the AVCodecParserContext, for that matter.
>>>
>>> Then you get a IDR frame wrong if it has a single damaged slice where its
>>> parsed as non IDR.
>>
>> Same situation: It does not get added to the Sync Sample table.
> 
> and then you cannot seek to the begin of the file anymore

We're trying to create spec complaint files as much as possible, not 
invalid files that some software can then read better. If the first 
sample is not tagged as Sync because it didn't meet the requirements, 
then it's how it's meant to be. Remember that files with no Sync Samples 
at all are still valid.

But i already agreed to let the user override the choice made by this 
code as a last resort, so no point arguing about it.

> 
> 
> 
>>
>>>
>>> What you could do and i do not suggest this actually, just hypothetically
>>> is take the majority >50% being IDR as probably IDR. That will probably
>>> require extra care when there are 2 fields or frames in one packet
>>>
>>> The problem is if you unconditionally overreide the user you really need
>>> to be perfect and this is not easy. More so as being done by default it
>>> too should be low overhead not slowing things down
>>>
>>>
>>>>
>>>>>
>>>>>
>>>>>
>>>>>>
>>>>>>> still a decoder can with some luck start decoding from such a
>>>>>>> point i would guess. But does the user want this marked as a keyframe ?
>>>>>>> maybe yes, maybe no. I think taking the users only way to set the keyframe
>>>>>>> flag away is a step backward
>>>>>>
>>>>>> Look at MPEG2 parsing and others in this container. It's the same scenario.
>>>>>> Did any of these concerns show up back then?
>>>>>> What I'm doing here for h264 is the same thing we did for those, to ensure
>>>>>> we don't create non spec compliant files because of damaged or mistagged
>>>>>> input, or a careless user.
>>>>>>
>>>>>> Letting the user mark whatever they want as a Sync Sample on a container
>>>>>> where it's explicit what should and should not be such is not good practice.
>>>>>> If you want that, you can use Matroska and other formats where keyframe
>>>>>> tagging is apparently not strict.
>>>>>>
>>>>>> Now, i want to point out that I wrote this parsing code here because my
>>>>>> previous attempt at stopping the AVCodecParserContext from being too liberal
>>>>>> marking things as keyframes was rejected *precisely* because it would let
>>>>>> the user create non compliant output in mp4. And now you don't want this one
>>>>>> either because you want to let the user create non complaint output.
>>>>>> I'm running in circles here and it's getting tiresome.
>>>>>
>>>>> I see the problem and i agree with you that it is a problem that needs
>>>>> a solution. Its very bad if the users sets bad flags and it results in
>>>>> non-compliant files. What my concern is, is that theres no way for the
>>>>> user to override this when its the parsing code thats wrong and previously
>>>>> the user could override this.
>>>>> It may be very rare that the parser is wrong and a user would bother to
>>>>> override it, i do not know, then again some people are quite crazy with
>>>>> the level of perfection they try to achieve in their videos
>>>>
>>>> The only reason the user may have wanted to override the packet's keyframe
>>>> flag was precisely because the AVCodecParserContext was tagging way too many
>>>> packets as such. This would no longer be necessary after this change, and
>>>> you can trust the muxer to do the right thing.
>>>>
>>>
>>>> I can however accept a compromise here, and take into account the packet's
>>>> keyframe flag if and only if invalid or unexpected data is found in the
>>>> bitstream. If the parsing succeeds, then whatever the packet metadata may
>>>> signal should be outright ignored.
>>>
>>> how do you detect invalid or unexpected data ?
>>
>> Any parsing error for which we already abort, and if we find a mix of IDR
>> and non IDR slices within a given AU. Instead of deciding to not add them to
>> the Sync Sample table like i mentioned above, we look at the packet's
>> keyframe flag and use it as last resort.
> 
> Is there any other reason to parse the whole AU ?
> if not then this would require O(n) instead of O(1) parsing and be slower
> If you do this, i would suggest that
> if the keyframe flag is set to stop parsing on the first IDR slice, similarly
> if the keyframe flag is not set to stop parsing on the first non IDR slice
> else all slices are the same type and you would override the user

Alright, I'll look into this.

> 
> With this you only need to parse O(1) when the flag matches a undamaged AU
> and often also for a damaged AU.
> the O(n) case would be reserved for the oddball cases
> 
> That said, i still think theres value in having the parsing code be that
> a parser, bsf or other API not cramed into the muxer but run prior in a
> more modular way in the future

Yes, i agree and mentioned as much.

> 
> Thanks
> 
> [...]
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
diff mbox series

Patch

diff --git a/libavformat/movenc.c b/libavformat/movenc.c
index 57062f45c5..159e0261b7 100644
--- a/libavformat/movenc.c
+++ b/libavformat/movenc.c
@@ -34,13 +34,17 @@ 
 #include "avc.h"
 #include "libavcodec/ac3_parser_internal.h"
 #include "libavcodec/dnxhddata.h"
+#include "libavcodec/h264.h"
+#include "libavcodec/h2645_parse.h"
 #include "libavcodec/flac.h"
 #include "libavcodec/get_bits.h"
+#include "libavcodec/golomb.h"
 
 #include "libavcodec/internal.h"
 #include "libavcodec/put_bits.h"
 #include "libavcodec/vc1_common.h"
 #include "libavcodec/raw.h"
+#include "libavcodec/sei.h"
 #include "internal.h"
 #include "libavutil/avstring.h"
 #include "libavutil/channel_layout.h"
@@ -2537,7 +2541,9 @@  static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
     if (!sgpd_entries)
         return AVERROR(ENOMEM);
 
-    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC);
+    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS ||
+               track->par->codec_id == AV_CODEC_ID_AAC  ||
+               track->par->codec_id == AV_CODEC_ID_H264);
 
     if (track->par->codec_id == AV_CODEC_ID_OPUS) {
         for (i = 0; i < track->entry; i++) {
@@ -2545,7 +2551,7 @@  static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
             int distance = 0;
             for (j = i - 1; j >= 0; j--) {
                 roll_samples_remaining -= get_cluster_duration(track, j);
-                distance++;
+                distance--;
                 if (roll_samples_remaining <= 0)
                     break;
             }
@@ -2555,7 +2561,7 @@  static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
             if (roll_samples_remaining > 0)
                 distance = 0;
             /* Verify distance is a maximum of 32 (2.5ms) packets. */
-            if (distance > 32)
+            if (distance < 32)
                 return AVERROR_INVALIDDATA;
             if (i && distance == sgpd_entries[entries].roll_distance) {
                 sgpd_entries[entries].count++;
@@ -2566,10 +2572,22 @@  static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
                 sgpd_entries[entries].group_description_index = distance ? ++group : 0;
             }
         }
+    } else if (track->par->codec_id == AV_CODEC_ID_H264) {
+        for (i = 0; i < track->entry; i++) {
+            int distance = track->cluster[i].roll_distance;
+            if (i && distance == sgpd_entries[entries].roll_distance) {
+                sgpd_entries[entries].count++;
+            } else {
+                entries++;
+                sgpd_entries[entries].count = 1;
+                sgpd_entries[entries].roll_distance = distance;
+                sgpd_entries[entries].group_description_index = distance ? ++group : 0;
+            }
+        }
     } else {
         entries++;
         sgpd_entries[entries].count = track->sample_count;
-        sgpd_entries[entries].roll_distance = 1;
+        sgpd_entries[entries].roll_distance = -1;
         sgpd_entries[entries].group_description_index = ++group;
     }
     entries++;
@@ -2588,7 +2606,7 @@  static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
     avio_wb32(pb, group); /* entry_count */
     for (i = 0; i < entries; i++) {
         if (sgpd_entries[i].group_description_index) {
-            avio_wb16(pb, -sgpd_entries[i].roll_distance); /* roll_distance */
+            avio_wb16(pb, sgpd_entries[i].roll_distance); /* roll_distance */
         }
     }
 
@@ -2639,7 +2657,9 @@  static int mov_write_stbl_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContext
     if (track->cenc.aes_ctr) {
         ff_mov_cenc_write_stbl_atoms(&track->cenc, pb);
     }
-    if (track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC) {
+    if (track->par->codec_id == AV_CODEC_ID_OPUS ||
+        track->par->codec_id == AV_CODEC_ID_AAC  ||
+        track->par->codec_id == AV_CODEC_ID_H264) {
         mov_preroll_write_stbl_atoms(pb, track);
     }
     return update_size(pb, pos);
@@ -5150,6 +5170,96 @@  static int mov_parse_mpeg2_frame(AVPacket *pkt, uint32_t *flags)
     return 0;
 }
 
+static int mov_parse_h264_frame(AVPacket *pkt, MOVTrack *trk)
+{
+    GetBitContext gb;
+    const uint8_t *buf = pkt->data;
+    const uint8_t *buf_end = pkt->data + pkt->size;
+    uint32_t state = -1;
+    unsigned roll_distance = 0;
+    int nal_length_size = 0, nalsize;
+    int idr = 0;
+
+    if (!pkt->size)
+        return 0;
+    if (trk->vos_data && trk->vos_data[0] == 1) {
+        if (trk->vos_len < 5)
+            return 0;
+        nal_length_size = (trk->vos_data[4] & 0x03) + 1;
+    }
+
+    while (buf_end - buf >= 4) {
+        if (nal_length_size) {
+            int i = 0;
+            nalsize = get_nalsize(nal_length_size, buf, buf_end - buf, &i, NULL);
+            if (nalsize < 0)
+                break;
+            state = buf[nal_length_size];
+            buf += nal_length_size + 1;
+        } else {
+            buf = avpriv_find_start_code(buf, buf_end, &state);
+            if (buf >= buf_end)
+                break;
+        }
+
+        switch (state & 0x1f) {
+        case H264_NAL_IDR_SLICE:
+            idr = 1;
+            goto end;
+        case H264_NAL_SEI:
+            init_get_bits8(&gb, buf, buf_end - buf);
+            do {
+                GetBitContext gb_payload;
+                int ret, type = 0, size = 0;
+
+                do {
+                    if (get_bits_left(&gb) < 8)
+                        goto end;
+                    type += show_bits(&gb, 8);
+                } while (get_bits(&gb, 8) == 255);
+                do {
+                    if (get_bits_left(&gb) < 8)
+                        goto end;
+                    size += show_bits(&gb, 8);
+                } while (get_bits(&gb, 8) == 255);
+
+                if (size > get_bits_left(&gb) / 8)
+                    goto end;
+
+                ret = init_get_bits8(&gb_payload, gb.buffer + get_bits_count(&gb) / 8, size);
+                if (ret < 0)
+                    goto end;
+
+                switch (type) {
+                case SEI_TYPE_RECOVERY_POINT:
+                    roll_distance = get_ue_golomb_long(&gb_payload);
+                    break;
+                default:
+                    break;
+                }
+                skip_bits_long(&gb, 8 * size);
+            } while (get_bits_left(&gb) > 0 && show_bits(&gb, 8) != 0x80);
+            break;
+        default:
+            break;
+        }
+        if (nal_length_size)
+            buf += nalsize - 1;
+    }
+
+end:
+    if (roll_distance != (int16_t)roll_distance)
+        roll_distance = 0;
+    trk->cluster[trk->entry].roll_distance = roll_distance;
+
+    if (idr) {
+        trk->cluster[trk->entry].flags |= MOV_SYNC_SAMPLE;
+        trk->has_keyframes++;
+    }
+
+    return 0;
+}
+
 static void mov_parse_vc1_frame(AVPacket *pkt, MOVTrack *trk)
 {
     const uint8_t *start, *next, *end = pkt->data + pkt->size;
@@ -5740,6 +5850,7 @@  int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
     trk->cluster[trk->entry].entries          = samples_in_chunk;
     trk->cluster[trk->entry].dts              = pkt->dts;
     trk->cluster[trk->entry].pts              = pkt->pts;
+    trk->cluster[trk->entry].roll_distance    = 0;
     if (!trk->entry && trk->start_dts != AV_NOPTS_VALUE) {
         if (!trk->frag_discont) {
             /* First packet of a new fragment. We already wrote the duration
@@ -5821,6 +5932,8 @@  int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
 
     if (par->codec_id == AV_CODEC_ID_VC1) {
         mov_parse_vc1_frame(pkt, trk);
+    } else if (par->codec_id == AV_CODEC_ID_H264) {
+        mov_parse_h264_frame(pkt, trk);
     } else if (par->codec_id == AV_CODEC_ID_TRUEHD) {
         mov_parse_truehd_frame(pkt, trk);
     } else if (pkt->flags & AV_PKT_FLAG_KEY) {
diff --git a/libavformat/movenc.h b/libavformat/movenc.h
index af1ea0bce6..73bf73ce8f 100644
--- a/libavformat/movenc.h
+++ b/libavformat/movenc.h
@@ -56,6 +56,7 @@  typedef struct MOVIentry {
 #define MOV_PARTIAL_SYNC_SAMPLE 0x0002
 #define MOV_DISPOSABLE_SAMPLE   0x0004
     uint32_t     flags;
+    int16_t      roll_distance;
     AVProducerReferenceTime prft;
 } MOVIentry;
 
diff --git a/tests/ref/lavf-fate/h264.mp4 b/tests/ref/lavf-fate/h264.mp4
index a9c3823c2c..c08ee4c7ae 100644
--- a/tests/ref/lavf-fate/h264.mp4
+++ b/tests/ref/lavf-fate/h264.mp4
@@ -1,3 +1,3 @@ 
-fe299ea5205b71a48281f917b1256a5d *tests/data/lavf-fate/lavf.h264.mp4
-547928 tests/data/lavf-fate/lavf.h264.mp4
-tests/data/lavf-fate/lavf.h264.mp4 CRC=0x9da2c999
+2812f617314d23474fcb23898b8a56ab *tests/data/lavf-fate/lavf.h264.mp4
+548084 tests/data/lavf-fate/lavf.h264.mp4
+tests/data/lavf-fate/lavf.h264.mp4 CRC=0x396f0829