diff mbox series

[FFmpeg-devel] avformat/dv: fix timestamps of audio packets in case of dropped corrupt audio frames

Message ID 20201031165624.31062-1-cus@passwd.hu
State Accepted
Commit 76fbb0052df471075858c1cb82b04c8be7adba8d
Headers show
Series [FFmpeg-devel] avformat/dv: fix timestamps of audio packets in case of dropped corrupt audio frames | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Marton Balint Oct. 31, 2020, 4:56 p.m. UTC
Fixes out of sync timestamps in ticket #8762.

Signed-off-by: Marton Balint <cus@passwd.hu>
---
 libavformat/dv.c       | 16 ++--------------
 tests/ref/seek/lavf-dv | 18 +++++++++---------
 2 files changed, 11 insertions(+), 23 deletions(-)

Comments

Dave Rice Oct. 31, 2020, 6:42 p.m. UTC | #1
Hi Marton,

> On Oct 31, 2020, at 12:56 PM, Marton Balint <cus@passwd.hu> wrote:
> 
> Fixes out of sync timestamps in ticket #8762.

Although Michael’s recent patch does address the issue documented in 8762, I haven’t found this patch to fix the issue. I tried with -c:a copy and with -c:a pcm_s16le with some sample files that exhibit this issue but each output was out of sync. I put an output at https://gist.github.com/dericed/659bd843bd38b6f24a60198b5e345795. That output notes that 3597 packages of video are read and 3586 packets of audio. In the resulting file, at the end of the timeline the audio is 9 frames out of sync and my output video stream is 00:02:00.020 and output audio stream is 00:01:59.653.

Beyond copying or encoding the audio, are there other options I should use to test this?

> Signed-off-by: Marton Balint <cus@passwd.hu>
> ---
> libavformat/dv.c       | 16 ++--------------
> tests/ref/seek/lavf-dv | 18 +++++++++---------
> 2 files changed, 11 insertions(+), 23 deletions(-)
> 
> diff --git a/libavformat/dv.c b/libavformat/dv.c
> index 3e0d12c0e3..26a78139f5 100644
> --- a/libavformat/dv.c
> +++ b/libavformat/dv.c
> @@ -49,7 +49,6 @@ struct DVDemuxContext {
>     uint8_t           audio_buf[4][8192];
>     int               ach;
>     int               frames;
> -    uint64_t          abytes;
> };
> 
> static inline uint16_t dv_audio_12to16(uint16_t sample)
> @@ -258,7 +257,7 @@ static int dv_extract_audio_info(DVDemuxContext *c, const uint8_t *frame)
>             c->ast[i] = avformat_new_stream(c->fctx, NULL);
>             if (!c->ast[i])
>                 break;
> -            avpriv_set_pts_info(c->ast[i], 64, 1, 30000);
> +            avpriv_set_pts_info(c->ast[i], 64, c->sys->time_base.num, c->sys->time_base.den);
>             c->ast[i]->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
>             c->ast[i]->codecpar->codec_id   = AV_CODEC_ID_PCM_S16LE;
> 
> @@ -387,8 +386,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>     for (i = 0; i < c->ach; i++) {
>         c->audio_pkt[i].pos  = pos;
>         c->audio_pkt[i].size = size;
> -        c->audio_pkt[i].pts  = c->abytes * 30000 * 8 /
> -                               c->ast[i]->codecpar->bit_rate;
> +        c->audio_pkt[i].pts  = (c->sys->height == 720) ? (c->frames & ~1) : c->frames;
>         ppcm[i] = c->audio_buf[i];
>     }
>     if (c->ach)
> @@ -401,10 +399,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>             c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
>         } else {
>             c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
> -            c->abytes           += size;
>         }
> -    } else {
> -        c->abytes += size;
>     }
> 
>     /* Now it's time to return video packet */
> @@ -444,13 +439,6 @@ static int64_t dv_frame_offset(AVFormatContext *s, DVDemuxContext *c,
> void ff_dv_offset_reset(DVDemuxContext *c, int64_t frame_offset)
> {
>     c->frames = frame_offset;
> -    if (c->ach) {
> -        if (c->sys) {
> -        c->abytes = av_rescale_q(c->frames, c->sys->time_base,
> -                                 (AVRational) { 8, c->ast[0]->codecpar->bit_rate });
> -        } else
> -            av_log(c->fctx, AV_LOG_ERROR, "cannot adjust audio bytes\n");
> -    }
>     c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
>     c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
> }
> diff --git a/tests/ref/seek/lavf-dv b/tests/ref/seek/lavf-dv
> index 0000ff5abe..f63e4460be 100644
> --- a/tests/ref/seek/lavf-dv
> +++ b/tests/ref/seek/lavf-dv
> @@ -7,9 +7,9 @@ ret: 0         st: 0 flags:0  ts: 0.800000
> ret: 0         st: 0 flags:1 dts: 0.800000 pts: 0.800000 pos:2880000 size:144000
> ret: 0         st: 0 flags:1  ts:-0.320000
> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
> -ret: 0         st: 1 flags:0  ts: 2.576667
> +ret: 0         st: 1 flags:0  ts: 2.560000
> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
> -ret: 0         st: 1 flags:1  ts: 1.470833
> +ret: 0         st: 1 flags:1  ts: 1.480000
> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
> ret: 0         st:-1 flags:0  ts: 0.365002
> ret: 0         st: 0 flags:1 dts: 0.360000 pts: 0.360000 pos:1296000 size:144000
> @@ -19,9 +19,9 @@ ret: 0         st: 0 flags:0  ts: 2.160000
> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
> ret: 0         st: 0 flags:1  ts: 1.040000
> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
> -ret: 0         st: 1 flags:0  ts:-0.058333
> +ret: 0         st: 1 flags:0  ts:-0.040000
> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
> -ret: 0         st: 1 flags:1  ts: 2.835833
> +ret: 0         st: 1 flags:1  ts: 2.840000
> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
> ret: 0         st:-1 flags:0  ts: 1.730004
> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
> @@ -31,10 +31,10 @@ ret: 0         st: 0 flags:0  ts:-0.480000
> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
> ret: 0         st: 0 flags:1  ts: 2.400000
> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
> -ret: 0         st: 1 flags:0  ts: 1.306667
> -ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
> -ret: 0         st: 1 flags:1  ts: 0.200833
> +ret: 0         st: 1 flags:0  ts: 1.320000
> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
> +ret: 0         st: 1 flags:1  ts: 0.200000
> +ret: 0         st: 0 flags:1 dts: 0.200000 pts: 0.200000 pos: 720000 size:144000
> ret: 0         st:-1 flags:0  ts:-0.904994
> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
> ret: 0         st:-1 flags:1  ts: 1.989173
> @@ -43,9 +43,9 @@ ret: 0         st: 0 flags:0  ts: 0.880000
> ret: 0         st: 0 flags:1 dts: 0.880000 pts: 0.880000 pos:3168000 size:144000
> ret: 0         st: 0 flags:1  ts:-0.240000
> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
> -ret: 0         st: 1 flags:0  ts: 2.671667
> +ret: 0         st: 1 flags:0  ts: 2.680000
> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
> -ret: 0         st: 1 flags:1  ts: 1.565833
> +ret: 0         st: 1 flags:1  ts: 1.560000
> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
> ret: 0         st:-1 flags:0  ts: 0.460008
> ret: 0         st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos:1728000 size:144000
> — 
> 2.26.2

Best Regards,
Dave Rice
Carl Eugen Hoyos Oct. 31, 2020, 7:20 p.m. UTC | #2
Am Sa., 31. Okt. 2020 um 19:43 Uhr schrieb Dave Rice <dave@dericed.com>:
>
> Hi Marton,
>
> > On Oct 31, 2020, at 12:56 PM, Marton Balint <cus@passwd.hu> wrote:
> >
> > Fixes out of sync timestamps in ticket #8762.
>
> Although Michael’s recent patch does address the issue documented in 8762, I haven’t found this patch to fix the issue. I tried with -c:a copy and with -c:a pcm_s16le with some sample files that exhibit this issue but each output was out of sync. I put an output at https://gist.github.com/dericed/659bd843bd38b6f24a60198b5e345795. That output notes that 3597 packages of video are read and 3586 packets of audio. In the resulting file, at the end of the timeline the audio is 9 frames out of sync and my output video stream is 00:02:00.020 and output audio stream is 00:01:59.653.
>
> Beyond copying or encoding the audio, are there other options I should use to test this?

Maybe I misunderstand the issue but if audio is missing and you don't
re-encode (and
force the missing audio to be inserted), you will always run into
issues because of how
audio is generally interpreted in multimedia files.

Carl Eugen
Marton Balint Oct. 31, 2020, 7:47 p.m. UTC | #3
On Sat, 31 Oct 2020, Dave Rice wrote:

> Hi Marton,
>
>> On Oct 31, 2020, at 12:56 PM, Marton Balint <cus@passwd.hu> wrote:
>> 
>> Fixes out of sync timestamps in ticket #8762.
>
> Although Michael’s recent patch does address the issue documented in 
> 8762, I haven’t found this patch to fix the issue. I tried with -c:a 
> copy and with -c:a pcm_s16le with some sample files that exhibit this 
> issue but each output was out of sync. I put an output at 
> https://gist.github.com/dericed/659bd843bd38b6f24a60198b5e345795. That 
> output notes that 3597 packages of video are read and 3586 packets of 
> audio. In the resulting file, at the end of the timeline the audio is 9 
> frames out of sync and my output video stream is 00:02:00.020 and output 
> audio stream is 00:01:59.653.
>
> Beyond copying or encoding the audio, are there other options I should 
> use to test this?

Well, it depends on what you want. After this patch you should get a file 
which has audio packets synced to video, but the audio stream is sparse, 
not every video packet has a corresponding audio packet. (It looks like 
our MOV muxer does not support muxing of sparse audio therefore does not 
produce proper timestamps. But MKV does, please try that.)

You can also make ffmpeg generate the missing audio based on packet 
timestamps. Swresample has an async=1 option, so something like this 
should get you synced audio with continous audio packets:
ffmpeg -y -i 1670520000_12.dv -c:v copy \
-af aresample=async=1:min_hard_comp=0.01 -c:a pcm_s16le 1670520000_12.mov

Regards,
Marton


>
>> Signed-off-by: Marton Balint <cus@passwd.hu>
>> ---
>> libavformat/dv.c       | 16 ++--------------
>> tests/ref/seek/lavf-dv | 18 +++++++++---------
>> 2 files changed, 11 insertions(+), 23 deletions(-)
>> 
>> diff --git a/libavformat/dv.c b/libavformat/dv.c
>> index 3e0d12c0e3..26a78139f5 100644
>> --- a/libavformat/dv.c
>> +++ b/libavformat/dv.c
>> @@ -49,7 +49,6 @@ struct DVDemuxContext {
>>     uint8_t           audio_buf[4][8192];
>>     int               ach;
>>     int               frames;
>> -    uint64_t          abytes;
>> };
>> 
>> static inline uint16_t dv_audio_12to16(uint16_t sample)
>> @@ -258,7 +257,7 @@ static int dv_extract_audio_info(DVDemuxContext *c, const uint8_t *frame)
>>             c->ast[i] = avformat_new_stream(c->fctx, NULL);
>>             if (!c->ast[i])
>>                 break;
>> -            avpriv_set_pts_info(c->ast[i], 64, 1, 30000);
>> +            avpriv_set_pts_info(c->ast[i], 64, c->sys->time_base.num, c->sys->time_base.den);
>>             c->ast[i]->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
>>             c->ast[i]->codecpar->codec_id   = AV_CODEC_ID_PCM_S16LE;
>> 
>> @@ -387,8 +386,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>>     for (i = 0; i < c->ach; i++) {
>>         c->audio_pkt[i].pos  = pos;
>>         c->audio_pkt[i].size = size;
>> -        c->audio_pkt[i].pts  = c->abytes * 30000 * 8 /
>> -                               c->ast[i]->codecpar->bit_rate;
>> +        c->audio_pkt[i].pts  = (c->sys->height == 720) ? (c->frames & ~1) : c->frames;
>>         ppcm[i] = c->audio_buf[i];
>>     }
>>     if (c->ach)
>> @@ -401,10 +399,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>>             c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
>>         } else {
>>             c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
>> -            c->abytes           += size;
>>         }
>> -    } else {
>> -        c->abytes += size;
>>     }
>>
>>     /* Now it's time to return video packet */
>> @@ -444,13 +439,6 @@ static int64_t dv_frame_offset(AVFormatContext *s, DVDemuxContext *c,
>> void ff_dv_offset_reset(DVDemuxContext *c, int64_t frame_offset)
>> {
>>     c->frames = frame_offset;
>> -    if (c->ach) {
>> -        if (c->sys) {
>> -        c->abytes = av_rescale_q(c->frames, c->sys->time_base,
>> -                                 (AVRational) { 8, c->ast[0]->codecpar->bit_rate });
>> -        } else
>> -            av_log(c->fctx, AV_LOG_ERROR, "cannot adjust audio bytes\n");
>> -    }
>>     c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
>>     c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
>> }
>> diff --git a/tests/ref/seek/lavf-dv b/tests/ref/seek/lavf-dv
>> index 0000ff5abe..f63e4460be 100644
>> --- a/tests/ref/seek/lavf-dv
>> +++ b/tests/ref/seek/lavf-dv
>> @@ -7,9 +7,9 @@ ret: 0         st: 0 flags:0  ts: 0.800000
>> ret: 0         st: 0 flags:1 dts: 0.800000 pts: 0.800000 pos:2880000 size:144000
>> ret: 0         st: 0 flags:1  ts:-0.320000
>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>> -ret: 0         st: 1 flags:0  ts: 2.576667
>> +ret: 0         st: 1 flags:0  ts: 2.560000
>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>> -ret: 0         st: 1 flags:1  ts: 1.470833
>> +ret: 0         st: 1 flags:1  ts: 1.480000
>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>> ret: 0         st:-1 flags:0  ts: 0.365002
>> ret: 0         st: 0 flags:1 dts: 0.360000 pts: 0.360000 pos:1296000 size:144000
>> @@ -19,9 +19,9 @@ ret: 0         st: 0 flags:0  ts: 2.160000
>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>> ret: 0         st: 0 flags:1  ts: 1.040000
>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>> -ret: 0         st: 1 flags:0  ts:-0.058333
>> +ret: 0         st: 1 flags:0  ts:-0.040000
>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>> -ret: 0         st: 1 flags:1  ts: 2.835833
>> +ret: 0         st: 1 flags:1  ts: 2.840000
>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>> ret: 0         st:-1 flags:0  ts: 1.730004
>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>> @@ -31,10 +31,10 @@ ret: 0         st: 0 flags:0  ts:-0.480000
>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>> ret: 0         st: 0 flags:1  ts: 2.400000
>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>> -ret: 0         st: 1 flags:0  ts: 1.306667
>> -ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>> -ret: 0         st: 1 flags:1  ts: 0.200833
>> +ret: 0         st: 1 flags:0  ts: 1.320000
>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>> +ret: 0         st: 1 flags:1  ts: 0.200000
>> +ret: 0         st: 0 flags:1 dts: 0.200000 pts: 0.200000 pos: 720000 size:144000
>> ret: 0         st:-1 flags:0  ts:-0.904994
>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>> ret: 0         st:-1 flags:1  ts: 1.989173
>> @@ -43,9 +43,9 @@ ret: 0         st: 0 flags:0  ts: 0.880000
>> ret: 0         st: 0 flags:1 dts: 0.880000 pts: 0.880000 pos:3168000 size:144000
>> ret: 0         st: 0 flags:1  ts:-0.240000
>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>> -ret: 0         st: 1 flags:0  ts: 2.671667
>> +ret: 0         st: 1 flags:0  ts: 2.680000
>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>> -ret: 0         st: 1 flags:1  ts: 1.565833
>> +ret: 0         st: 1 flags:1  ts: 1.560000
>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>> ret: 0         st:-1 flags:0  ts: 0.460008
>> ret: 0         st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos:1728000 size:144000
>> — 
>> 2.26.2
>
> Best Regards,
> Dave Rice
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Dave Rice Oct. 31, 2020, 8:32 p.m. UTC | #4
> On Oct 31, 2020, at 3:47 PM, Marton Balint <cus@passwd.hu> wrote:
> On Sat, 31 Oct 2020, Dave Rice wrote:
> 
>> Hi Marton,
>> 
>>> On Oct 31, 2020, at 12:56 PM, Marton Balint <cus@passwd.hu> wrote:
>>> Fixes out of sync timestamps in ticket #8762.
>> 
>> Although Michael’s recent patch does address the issue documented in 8762, I haven’t found this patch to fix the issue. I tried with -c:a copy and with -c:a pcm_s16le with some sample files that exhibit this issue but each output was out of sync. I put an output at https://gist.github.com/dericed/659bd843bd38b6f24a60198b5e345795. That output notes that 3597 packages of video are read and 3586 packets of audio. In the resulting file, at the end of the timeline the audio is 9 frames out of sync and my output video stream is 00:02:00.020 and output audio stream is 00:01:59.653.
>> 
>> Beyond copying or encoding the audio, are there other options I should use to test this?
> 
> Well, it depends on what you want. After this patch you should get a file which has audio packets synced to video, but the audio stream is sparse, not every video packet has a corresponding audio packet. (It looks like our MOV muxer does not support muxing of sparse audio therefore does not produce proper timestamps. But MKV does, please try that.)
> 
> You can also make ffmpeg generate the missing audio based on packet timestamps. Swresample has an async=1 option, so something like this should get you synced audio with continous audio packets:
> ffmpeg -y -i 1670520000_12.dv -c:v copy \
> -af aresample=async=1:min_hard_comp=0.01 -c:a pcm_s16le 1670520000_12.mov

Thank you for this. With the patch and async, the result is synced and the resulting audio was the same as Michael’s patch.

Could you explain why you used min_hard_comp here? IIUC min_hard_comp is a set a threshold between the strategies of trim/fill or stretch/squeeze to align the audio to time; however, the async documentation says "Setting this to 1 will enable filling and trimming, larger values represent the maximum amount in samples that the data may be stretched or squeezed” so I thought that async=1 would not permit stretch/squeeze anyway.

> Regards,
> Marton
> 
> 
>> 
>>> Signed-off-by: Marton Balint <cus@passwd.hu>
>>> ---
>>> libavformat/dv.c       | 16 ++--------------
>>> tests/ref/seek/lavf-dv | 18 +++++++++---------
>>> 2 files changed, 11 insertions(+), 23 deletions(-)
>>> diff --git a/libavformat/dv.c b/libavformat/dv.c
>>> index 3e0d12c0e3..26a78139f5 100644
>>> --- a/libavformat/dv.c
>>> +++ b/libavformat/dv.c
>>> @@ -49,7 +49,6 @@ struct DVDemuxContext {
>>>    uint8_t           audio_buf[4][8192];
>>>    int               ach;
>>>    int               frames;
>>> -    uint64_t          abytes;
>>> };
>>> static inline uint16_t dv_audio_12to16(uint16_t sample)
>>> @@ -258,7 +257,7 @@ static int dv_extract_audio_info(DVDemuxContext *c, const uint8_t *frame)
>>>            c->ast[i] = avformat_new_stream(c->fctx, NULL);
>>>            if (!c->ast[i])
>>>                break;
>>> -            avpriv_set_pts_info(c->ast[i], 64, 1, 30000);
>>> +            avpriv_set_pts_info(c->ast[i], 64, c->sys->time_base.num, c->sys->time_base.den);
>>>            c->ast[i]->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
>>>            c->ast[i]->codecpar->codec_id   = AV_CODEC_ID_PCM_S16LE;
>>> @@ -387,8 +386,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>>>    for (i = 0; i < c->ach; i++) {
>>>        c->audio_pkt[i].pos  = pos;
>>>        c->audio_pkt[i].size = size;
>>> -        c->audio_pkt[i].pts  = c->abytes * 30000 * 8 /
>>> -                               c->ast[i]->codecpar->bit_rate;
>>> +        c->audio_pkt[i].pts  = (c->sys->height == 720) ? (c->frames & ~1) : c->frames;
>>>        ppcm[i] = c->audio_buf[i];
>>>    }
>>>    if (c->ach)
>>> @@ -401,10 +399,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>>>            c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
>>>        } else {
>>>            c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
>>> -            c->abytes           += size;
>>>        }
>>> -    } else {
>>> -        c->abytes += size;
>>>    }
>>> 
>>>    /* Now it's time to return video packet */
>>> @@ -444,13 +439,6 @@ static int64_t dv_frame_offset(AVFormatContext *s, DVDemuxContext *c,
>>> void ff_dv_offset_reset(DVDemuxContext *c, int64_t frame_offset)
>>> {
>>>    c->frames = frame_offset;
>>> -    if (c->ach) {
>>> -        if (c->sys) {
>>> -        c->abytes = av_rescale_q(c->frames, c->sys->time_base,
>>> -                                 (AVRational) { 8, c->ast[0]->codecpar->bit_rate });
>>> -        } else
>>> -            av_log(c->fctx, AV_LOG_ERROR, "cannot adjust audio bytes\n");
>>> -    }
>>>    c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
>>>    c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
>>> }
>>> diff --git a/tests/ref/seek/lavf-dv b/tests/ref/seek/lavf-dv
>>> index 0000ff5abe..f63e4460be 100644
>>> --- a/tests/ref/seek/lavf-dv
>>> +++ b/tests/ref/seek/lavf-dv
>>> @@ -7,9 +7,9 @@ ret: 0         st: 0 flags:0  ts: 0.800000
>>> ret: 0         st: 0 flags:1 dts: 0.800000 pts: 0.800000 pos:2880000 size:144000
>>> ret: 0         st: 0 flags:1  ts:-0.320000
>>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>>> -ret: 0         st: 1 flags:0  ts: 2.576667
>>> +ret: 0         st: 1 flags:0  ts: 2.560000
>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>> -ret: 0         st: 1 flags:1  ts: 1.470833
>>> +ret: 0         st: 1 flags:1  ts: 1.480000
>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>> ret: 0         st:-1 flags:0  ts: 0.365002
>>> ret: 0         st: 0 flags:1 dts: 0.360000 pts: 0.360000 pos:1296000 size:144000
>>> @@ -19,9 +19,9 @@ ret: 0         st: 0 flags:0  ts: 2.160000
>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>> ret: 0         st: 0 flags:1  ts: 1.040000
>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>> -ret: 0         st: 1 flags:0  ts:-0.058333
>>> +ret: 0         st: 1 flags:0  ts:-0.040000
>>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>>> -ret: 0         st: 1 flags:1  ts: 2.835833
>>> +ret: 0         st: 1 flags:1  ts: 2.840000
>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>> ret: 0         st:-1 flags:0  ts: 1.730004
>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>> @@ -31,10 +31,10 @@ ret: 0         st: 0 flags:0  ts:-0.480000
>>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>>> ret: 0         st: 0 flags:1  ts: 2.400000
>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>> -ret: 0         st: 1 flags:0  ts: 1.306667
>>> -ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>> -ret: 0         st: 1 flags:1  ts: 0.200833
>>> +ret: 0         st: 1 flags:0  ts: 1.320000
>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>> +ret: 0         st: 1 flags:1  ts: 0.200000
>>> +ret: 0         st: 0 flags:1 dts: 0.200000 pts: 0.200000 pos: 720000 size:144000
>>> ret: 0         st:-1 flags:0  ts:-0.904994
>>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>>> ret: 0         st:-1 flags:1  ts: 1.989173
>>> @@ -43,9 +43,9 @@ ret: 0         st: 0 flags:0  ts: 0.880000
>>> ret: 0         st: 0 flags:1 dts: 0.880000 pts: 0.880000 pos:3168000 size:144000
>>> ret: 0         st: 0 flags:1  ts:-0.240000
>>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>>> -ret: 0         st: 1 flags:0  ts: 2.671667
>>> +ret: 0         st: 1 flags:0  ts: 2.680000
>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>> -ret: 0         st: 1 flags:1  ts: 1.565833
>>> +ret: 0         st: 1 flags:1  ts: 1.560000
>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>> ret: 0         st:-1 flags:0  ts: 0.460008
>>> ret: 0         st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos:1728000 size:144000
>>> — 2.26.2
>> 
>> Best Regards,
>> Dave Rice
>> 
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>
>> 
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe".
Marton Balint Oct. 31, 2020, 9:15 p.m. UTC | #5
On Sat, 31 Oct 2020, Dave Rice wrote:

>
>> On Oct 31, 2020, at 3:47 PM, Marton Balint <cus@passwd.hu> wrote:
>> On Sat, 31 Oct 2020, Dave Rice wrote:
>> 
>>> Hi Marton,
>>> 
>>>> On Oct 31, 2020, at 12:56 PM, Marton Balint <cus@passwd.hu> wrote:
>>>> Fixes out of sync timestamps in ticket #8762.
>>> 
>>> Although Michael’s recent patch does address the issue documented in 8762, I haven’t found this patch to fix the issue. I tried with -c:a copy and with -c:a pcm_s16le with some sample files that exhibit this issue but each output was out of sync. I put an output at https://gist.github.com/dericed/659bd843bd38b6f24a60198b5e345795. That output notes that 3597 packages of video are read and 3586 packets of audio. In the resulting file, at the end of the timeline the audio is 9 frames out of sync and my output video stream is 00:02:00.020 and output audio stream is 00:01:59.653.
>>> 
>>> Beyond copying or encoding the audio, are there other options I should use to test this?
>> 
>> Well, it depends on what you want. After this patch you should get a file which has audio packets synced to video, but the audio stream is sparse, not every video packet has a corresponding audio packet. (It looks like our MOV muxer does not support muxing of sparse audio therefore does not produce proper timestamps. But MKV does, please try that.)
>> 
>> You can also make ffmpeg generate the missing audio based on packet timestamps. Swresample has an async=1 option, so something like this should get you synced audio with continous audio packets:
>> ffmpeg -y -i 1670520000_12.dv -c:v copy \
>> -af aresample=async=1:min_hard_comp=0.01 -c:a pcm_s16le 1670520000_12.mov
>
> Thank you for this. With the patch and async, the result is synced and the resulting audio was the same as Michael’s patch.
>
> Could you explain why you used min_hard_comp here? IIUC min_hard_comp is 
> a set a threshold between the strategies of trim/fill or stretch/squeeze 
> to align the audio to time; however, the async documentation says 
> "Setting this to 1 will enable filling and trimming, larger values 
> represent the maximum amount in samples that the data may be stretched 
> or squeezed” so I thought that async=1 would not permit stretch/squeeze 
> anyway.

It is documented poorly, but if you check the source code you will see 
that async=1 implicitly sets min_comp to 0.001 enabling trimming/dropping. 
min_hard_comp decides the threshold when silence injection actually 
happens, and the default for that is 0.1, which is more than a frame, 
therefore not acceptable if we want to maintain <1 frame accuracy. Or at 
least that is how I think it should work.

Regards,
Marton

>>> 
>>>> Signed-off-by: Marton Balint <cus@passwd.hu>
>>>> ---
>>>> libavformat/dv.c       | 16 ++--------------
>>>> tests/ref/seek/lavf-dv | 18 +++++++++---------
>>>> 2 files changed, 11 insertions(+), 23 deletions(-)
>>>> diff --git a/libavformat/dv.c b/libavformat/dv.c
>>>> index 3e0d12c0e3..26a78139f5 100644
>>>> --- a/libavformat/dv.c
>>>> +++ b/libavformat/dv.c
>>>> @@ -49,7 +49,6 @@ struct DVDemuxContext {
>>>>    uint8_t           audio_buf[4][8192];
>>>>    int               ach;
>>>>    int               frames;
>>>> -    uint64_t          abytes;
>>>> };
>>>> static inline uint16_t dv_audio_12to16(uint16_t sample)
>>>> @@ -258,7 +257,7 @@ static int dv_extract_audio_info(DVDemuxContext *c, const uint8_t *frame)
>>>>            c->ast[i] = avformat_new_stream(c->fctx, NULL);
>>>>            if (!c->ast[i])
>>>>                break;
>>>> -            avpriv_set_pts_info(c->ast[i], 64, 1, 30000);
>>>> +            avpriv_set_pts_info(c->ast[i], 64, c->sys->time_base.num, c->sys->time_base.den);
>>>>            c->ast[i]->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
>>>>            c->ast[i]->codecpar->codec_id   = AV_CODEC_ID_PCM_S16LE;
>>>> @@ -387,8 +386,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>>>>    for (i = 0; i < c->ach; i++) {
>>>>        c->audio_pkt[i].pos  = pos;
>>>>        c->audio_pkt[i].size = size;
>>>> -        c->audio_pkt[i].pts  = c->abytes * 30000 * 8 /
>>>> -                               c->ast[i]->codecpar->bit_rate;
>>>> +        c->audio_pkt[i].pts  = (c->sys->height == 720) ? (c->frames & ~1) : c->frames;
>>>>        ppcm[i] = c->audio_buf[i];
>>>>    }
>>>>    if (c->ach)
>>>> @@ -401,10 +399,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>>>>            c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
>>>>        } else {
>>>>            c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
>>>> -            c->abytes           += size;
>>>>        }
>>>> -    } else {
>>>> -        c->abytes += size;
>>>>    }
>>>>
>>>>    /* Now it's time to return video packet */
>>>> @@ -444,13 +439,6 @@ static int64_t dv_frame_offset(AVFormatContext *s, DVDemuxContext *c,
>>>> void ff_dv_offset_reset(DVDemuxContext *c, int64_t frame_offset)
>>>> {
>>>>    c->frames = frame_offset;
>>>> -    if (c->ach) {
>>>> -        if (c->sys) {
>>>> -        c->abytes = av_rescale_q(c->frames, c->sys->time_base,
>>>> -                                 (AVRational) { 8, c->ast[0]->codecpar->bit_rate });
>>>> -        } else
>>>> -            av_log(c->fctx, AV_LOG_ERROR, "cannot adjust audio bytes\n");
>>>> -    }
>>>>    c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
>>>>    c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
>>>> }
>>>> diff --git a/tests/ref/seek/lavf-dv b/tests/ref/seek/lavf-dv
>>>> index 0000ff5abe..f63e4460be 100644
>>>> --- a/tests/ref/seek/lavf-dv
>>>> +++ b/tests/ref/seek/lavf-dv
>>>> @@ -7,9 +7,9 @@ ret: 0         st: 0 flags:0  ts: 0.800000
>>>> ret: 0         st: 0 flags:1 dts: 0.800000 pts: 0.800000 pos:2880000 size:144000
>>>> ret: 0         st: 0 flags:1  ts:-0.320000
>>>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>>>> -ret: 0         st: 1 flags:0  ts: 2.576667
>>>> +ret: 0         st: 1 flags:0  ts: 2.560000
>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>> -ret: 0         st: 1 flags:1  ts: 1.470833
>>>> +ret: 0         st: 1 flags:1  ts: 1.480000
>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>> ret: 0         st:-1 flags:0  ts: 0.365002
>>>> ret: 0         st: 0 flags:1 dts: 0.360000 pts: 0.360000 pos:1296000 size:144000
>>>> @@ -19,9 +19,9 @@ ret: 0         st: 0 flags:0  ts: 2.160000
>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>> ret: 0         st: 0 flags:1  ts: 1.040000
>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>> -ret: 0         st: 1 flags:0  ts:-0.058333
>>>> +ret: 0         st: 1 flags:0  ts:-0.040000
>>>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>>>> -ret: 0         st: 1 flags:1  ts: 2.835833
>>>> +ret: 0         st: 1 flags:1  ts: 2.840000
>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>> ret: 0         st:-1 flags:0  ts: 1.730004
>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>> @@ -31,10 +31,10 @@ ret: 0         st: 0 flags:0  ts:-0.480000
>>>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>>>> ret: 0         st: 0 flags:1  ts: 2.400000
>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>> -ret: 0         st: 1 flags:0  ts: 1.306667
>>>> -ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>> -ret: 0         st: 1 flags:1  ts: 0.200833
>>>> +ret: 0         st: 1 flags:0  ts: 1.320000
>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>> +ret: 0         st: 1 flags:1  ts: 0.200000
>>>> +ret: 0         st: 0 flags:1 dts: 0.200000 pts: 0.200000 pos: 720000 size:144000
>>>> ret: 0         st:-1 flags:0  ts:-0.904994
>>>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>>>> ret: 0         st:-1 flags:1  ts: 1.989173
>>>> @@ -43,9 +43,9 @@ ret: 0         st: 0 flags:0  ts: 0.880000
>>>> ret: 0         st: 0 flags:1 dts: 0.880000 pts: 0.880000 pos:3168000 size:144000
>>>> ret: 0         st: 0 flags:1  ts:-0.240000
>>>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>>>> -ret: 0         st: 1 flags:0  ts: 2.671667
>>>> +ret: 0         st: 1 flags:0  ts: 2.680000
>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>> -ret: 0         st: 1 flags:1  ts: 1.565833
>>>> +ret: 0         st: 1 flags:1  ts: 1.560000
>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>> ret: 0         st:-1 flags:0  ts: 0.460008
>>>> ret: 0         st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos:1728000 size:144000
>>>> — 2.26.2
>>> 
>>> Best Regards,
>>> Dave Rice
>>> 
>>> _______________________________________________
>>> ffmpeg-devel mailing list
>>> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
>>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>
>>> 
>>> To unsubscribe, visit link above, or email
>>> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe".
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>
>> 
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe".
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Dave Rice Oct. 31, 2020, 10:42 p.m. UTC | #6
> On Oct 31, 2020, at 5:15 PM, Marton Balint <cus@passwd.hu> wrote:
> On Sat, 31 Oct 2020, Dave Rice wrote:
>>> On Oct 31, 2020, at 3:47 PM, Marton Balint <cus@passwd.hu> wrote:
>>> On Sat, 31 Oct 2020, Dave Rice wrote:
>>>> Hi Marton,
>>>>> On Oct 31, 2020, at 12:56 PM, Marton Balint <cus@passwd.hu> wrote:
>>>>> Fixes out of sync timestamps in ticket #8762.
>>>> Although Michael’s recent patch does address the issue documented in 8762, I haven’t found this patch to fix the issue. I tried with -c:a copy and with -c:a pcm_s16le with some sample files that exhibit this issue but each output was out of sync. I put an output at https://gist.github.com/dericed/659bd843bd38b6f24a60198b5e345795. That output notes that 3597 packages of video are read and 3586 packets of audio. In the resulting file, at the end of the timeline the audio is 9 frames out of sync and my output video stream is 00:02:00.020 and output audio stream is 00:01:59.653.
>>>> Beyond copying or encoding the audio, are there other options I should use to test this?
>>> Well, it depends on what you want. After this patch you should get a file which has audio packets synced to video, but the audio stream is sparse, not every video packet has a corresponding audio packet. (It looks like our MOV muxer does not support muxing of sparse audio therefore does not produce proper timestamps. But MKV does, please try that.)
>>> You can also make ffmpeg generate the missing audio based on packet timestamps. Swresample has an async=1 option, so something like this should get you synced audio with continous audio packets:
>>> ffmpeg -y -i 1670520000_12.dv -c:v copy \
>>> -af aresample=async=1:min_hard_comp=0.01 -c:a pcm_s16le 1670520000_12.mov
>> 
>> Thank you for this. With the patch and async, the result is synced and the resulting audio was the same as Michael’s patch.
>> 
>> Could you explain why you used min_hard_comp here? IIUC min_hard_comp is a set a threshold between the strategies of trim/fill or stretch/squeeze to align the audio to time; however, the async documentation says "Setting this to 1 will enable filling and trimming, larger values represent the maximum amount in samples that the data may be stretched or squeezed” so I thought that async=1 would not permit stretch/squeeze anyway.
> 
> It is documented poorly, but if you check the source code you will see that async=1 implicitly sets min_comp to 0.001 enabling trimming/dropping. min_hard_comp decides the threshold when silence injection actually happens, and the default for that is 0.1, which is more than a frame, therefore not acceptable if we want to maintain <1 frame accuracy. Or at least that is how I think it should work.

Thanks for the explanation.
I’ve tested this patch with larger sample sets and really appreciate that the audio timestamps are now properly ordered. For instance with https://archive.org/download/dvr-007/DVR_007.dv, before the patch the timestamps for audio around 20 - 30 seconds get severely jumbled which makes it difficult to use astats metadata. While the documentation could be better, as I’ve seen many forums entries of ffmpeg users struggling to keep dv from tape transfers in sync, I don’t think that’s blocking. I’d really like to see this merged.

Although this patch resolves the issue, I think Michael’s patch is interesting too as it would allow access to the audio dif blocks of dv frames with no audio source pack metadata. I’d have to test more to find an instance where the 'dvaudio_concealment pass’ option of Michael’s patch provides a different result than the added silence from async.

Thanks so much.
Dave

> Regards,
> Marton
> 
>>>>> Signed-off-by: Marton Balint <cus@passwd.hu>
>>>>> ---
>>>>> libavformat/dv.c       | 16 ++--------------
>>>>> tests/ref/seek/lavf-dv | 18 +++++++++---------
>>>>> 2 files changed, 11 insertions(+), 23 deletions(-)
>>>>> diff --git a/libavformat/dv.c b/libavformat/dv.c
>>>>> index 3e0d12c0e3..26a78139f5 100644
>>>>> --- a/libavformat/dv.c
>>>>> +++ b/libavformat/dv.c
>>>>> @@ -49,7 +49,6 @@ struct DVDemuxContext {
>>>>>   uint8_t           audio_buf[4][8192];
>>>>>   int               ach;
>>>>>   int               frames;
>>>>> -    uint64_t          abytes;
>>>>> };
>>>>> static inline uint16_t dv_audio_12to16(uint16_t sample)
>>>>> @@ -258,7 +257,7 @@ static int dv_extract_audio_info(DVDemuxContext *c, const uint8_t *frame)
>>>>>           c->ast[i] = avformat_new_stream(c->fctx, NULL);
>>>>>           if (!c->ast[i])
>>>>>               break;
>>>>> -            avpriv_set_pts_info(c->ast[i], 64, 1, 30000);
>>>>> +            avpriv_set_pts_info(c->ast[i], 64, c->sys->time_base.num, c->sys->time_base.den);
>>>>>           c->ast[i]->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
>>>>>           c->ast[i]->codecpar->codec_id   = AV_CODEC_ID_PCM_S16LE;
>>>>> @@ -387,8 +386,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>>>>>   for (i = 0; i < c->ach; i++) {
>>>>>       c->audio_pkt[i].pos  = pos;
>>>>>       c->audio_pkt[i].size = size;
>>>>> -        c->audio_pkt[i].pts  = c->abytes * 30000 * 8 /
>>>>> -                               c->ast[i]->codecpar->bit_rate;
>>>>> +        c->audio_pkt[i].pts  = (c->sys->height == 720) ? (c->frames & ~1) : c->frames;
>>>>>       ppcm[i] = c->audio_buf[i];
>>>>>   }
>>>>>   if (c->ach)
>>>>> @@ -401,10 +399,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>>>>>           c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
>>>>>       } else {
>>>>>           c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
>>>>> -            c->abytes           += size;
>>>>>       }
>>>>> -    } else {
>>>>> -        c->abytes += size;
>>>>>   }
>>>>> 
>>>>>   /* Now it's time to return video packet */
>>>>> @@ -444,13 +439,6 @@ static int64_t dv_frame_offset(AVFormatContext *s, DVDemuxContext *c,
>>>>> void ff_dv_offset_reset(DVDemuxContext *c, int64_t frame_offset)
>>>>> {
>>>>>   c->frames = frame_offset;
>>>>> -    if (c->ach) {
>>>>> -        if (c->sys) {
>>>>> -        c->abytes = av_rescale_q(c->frames, c->sys->time_base,
>>>>> -                                 (AVRational) { 8, c->ast[0]->codecpar->bit_rate });
>>>>> -        } else
>>>>> -            av_log(c->fctx, AV_LOG_ERROR, "cannot adjust audio bytes\n");
>>>>> -    }
>>>>>   c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
>>>>>   c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
>>>>> }
>>>>> diff --git a/tests/ref/seek/lavf-dv b/tests/ref/seek/lavf-dv
>>>>> index 0000ff5abe..f63e4460be 100644
>>>>> --- a/tests/ref/seek/lavf-dv
>>>>> +++ b/tests/ref/seek/lavf-dv
>>>>> @@ -7,9 +7,9 @@ ret: 0         st: 0 flags:0  ts: 0.800000
>>>>> ret: 0         st: 0 flags:1 dts: 0.800000 pts: 0.800000 pos:2880000 size:144000
>>>>> ret: 0         st: 0 flags:1  ts:-0.320000
>>>>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>>>>> -ret: 0         st: 1 flags:0  ts: 2.576667
>>>>> +ret: 0         st: 1 flags:0  ts: 2.560000
>>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>>> -ret: 0         st: 1 flags:1  ts: 1.470833
>>>>> +ret: 0         st: 1 flags:1  ts: 1.480000
>>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>>> ret: 0         st:-1 flags:0  ts: 0.365002
>>>>> ret: 0         st: 0 flags:1 dts: 0.360000 pts: 0.360000 pos:1296000 size:144000
>>>>> @@ -19,9 +19,9 @@ ret: 0         st: 0 flags:0  ts: 2.160000
>>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>>> ret: 0         st: 0 flags:1  ts: 1.040000
>>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>>> -ret: 0         st: 1 flags:0  ts:-0.058333
>>>>> +ret: 0         st: 1 flags:0  ts:-0.040000
>>>>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>>>>> -ret: 0         st: 1 flags:1  ts: 2.835833
>>>>> +ret: 0         st: 1 flags:1  ts: 2.840000
>>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>>> ret: 0         st:-1 flags:0  ts: 1.730004
>>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>>> @@ -31,10 +31,10 @@ ret: 0         st: 0 flags:0 ts:-0.480000
>>>>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>>>>> ret: 0         st: 0 flags:1  ts: 2.400000
>>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>>> -ret: 0         st: 1 flags:0  ts: 1.306667
>>>>> -ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>>> -ret: 0         st: 1 flags:1  ts: 0.200833
>>>>> +ret: 0         st: 1 flags:0  ts: 1.320000
>>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>>> +ret: 0         st: 1 flags:1  ts: 0.200000
>>>>> +ret: 0         st: 0 flags:1 dts: 0.200000 pts: 0.200000 pos: 720000 size:144000
>>>>> ret: 0         st:-1 flags:0  ts:-0.904994
>>>>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>>>>> ret: 0         st:-1 flags:1  ts: 1.989173
>>>>> @@ -43,9 +43,9 @@ ret: 0         st: 0 flags:0  ts: 0.880000
>>>>> ret: 0         st: 0 flags:1 dts: 0.880000 pts: 0.880000 pos:3168000 size:144000
>>>>> ret: 0         st: 0 flags:1  ts:-0.240000
>>>>> ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
>>>>> -ret: 0         st: 1 flags:0  ts: 2.671667
>>>>> +ret: 0         st: 1 flags:0  ts: 2.680000
>>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>>> -ret: 0         st: 1 flags:1  ts: 1.565833
>>>>> +ret: 0         st: 1 flags:1  ts: 1.560000
>>>>> ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
>>>>> ret: 0         st:-1 flags:0  ts: 0.460008
>>>>> ret: 0         st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos:1728000 size:144000
>>>>> — 2.26.2
>>>> Best Regards,
>>>> Dave Rice
>>>> _______________________________________________
>>>> ffmpeg-devel mailing list
>>>> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org> <mailto:ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>>
>>>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel><https://ffmpeg.org/mailman/listinfo/ffmpeg-devel <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>>
>>>> To unsubscribe, visit link above, or email
>>>> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> <mailto:ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org>> with subject "unsubscribe".
>>> _______________________________________________
>>> ffmpeg-devel mailing list
>>> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org> <mailto:ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>>
>>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel><https://ffmpeg.org/mailman/listinfo/ffmpeg-devel <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>>
>>> To unsubscribe, visit link above, or email
>>> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> <mailto:ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org>> with subject "unsubscribe".
>> 
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>> 
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Michael Niedermayer Nov. 1, 2020, 5:40 p.m. UTC | #7
On Sat, Oct 31, 2020 at 05:56:24PM +0100, Marton Balint wrote:
> Fixes out of sync timestamps in ticket #8762.
> 
> Signed-off-by: Marton Balint <cus@passwd.hu>
> ---
>  libavformat/dv.c       | 16 ++--------------
>  tests/ref/seek/lavf-dv | 18 +++++++++---------
>  2 files changed, 11 insertions(+), 23 deletions(-)
> 
> diff --git a/libavformat/dv.c b/libavformat/dv.c
> index 3e0d12c0e3..26a78139f5 100644
> --- a/libavformat/dv.c
> +++ b/libavformat/dv.c
> @@ -49,7 +49,6 @@ struct DVDemuxContext {
>      uint8_t           audio_buf[4][8192];
>      int               ach;
>      int               frames;
> -    uint64_t          abytes;
>  };
>  
>  static inline uint16_t dv_audio_12to16(uint16_t sample)
> @@ -258,7 +257,7 @@ static int dv_extract_audio_info(DVDemuxContext *c, const uint8_t *frame)
>              c->ast[i] = avformat_new_stream(c->fctx, NULL);
>              if (!c->ast[i])
>                  break;
> -            avpriv_set_pts_info(c->ast[i], 64, 1, 30000);
> +            avpriv_set_pts_info(c->ast[i], 64, c->sys->time_base.num, c->sys->time_base.den);
>              c->ast[i]->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
>              c->ast[i]->codecpar->codec_id   = AV_CODEC_ID_PCM_S16LE;
>  
> @@ -387,8 +386,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>      for (i = 0; i < c->ach; i++) {
>          c->audio_pkt[i].pos  = pos;
>          c->audio_pkt[i].size = size;
> -        c->audio_pkt[i].pts  = c->abytes * 30000 * 8 /
> -                               c->ast[i]->codecpar->bit_rate;
> +        c->audio_pkt[i].pts  = (c->sys->height == 720) ? (c->frames & ~1) : c->frames;
>          ppcm[i] = c->audio_buf[i];
>      }
>      if (c->ach)
> @@ -401,10 +399,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>              c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
>          } else {
>              c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
> -            c->abytes           += size;
>          }
> -    } else {
> -        c->abytes += size;
>      }
>  
>      /* Now it's time to return video packet */

Please correct me if iam wrong but
in cases where no audio is missing or damaged, this would also ignore how much
audio is in each packet. So you could have lets say a timestamp difference
of excatly 1 second between 2 packets while their is actually not exactly
1 second worth of audio samples between them. 


[...]
Marton Balint Nov. 1, 2020, 8:58 p.m. UTC | #8
On Sun, 1 Nov 2020, Michael Niedermayer wrote:

> On Sat, Oct 31, 2020 at 05:56:24PM +0100, Marton Balint wrote:
>> Fixes out of sync timestamps in ticket #8762.
>>
>> Signed-off-by: Marton Balint <cus@passwd.hu>
>> ---
>>  libavformat/dv.c       | 16 ++--------------
>>  tests/ref/seek/lavf-dv | 18 +++++++++---------
>>  2 files changed, 11 insertions(+), 23 deletions(-)
>>
>> diff --git a/libavformat/dv.c b/libavformat/dv.c
>> index 3e0d12c0e3..26a78139f5 100644
>> --- a/libavformat/dv.c
>> +++ b/libavformat/dv.c
>> @@ -49,7 +49,6 @@ struct DVDemuxContext {
>>      uint8_t           audio_buf[4][8192];
>>      int               ach;
>>      int               frames;
>> -    uint64_t          abytes;
>>  };
>>
>>  static inline uint16_t dv_audio_12to16(uint16_t sample)
>> @@ -258,7 +257,7 @@ static int dv_extract_audio_info(DVDemuxContext *c, const uint8_t *frame)
>>              c->ast[i] = avformat_new_stream(c->fctx, NULL);
>>              if (!c->ast[i])
>>                  break;
>> -            avpriv_set_pts_info(c->ast[i], 64, 1, 30000);
>> +            avpriv_set_pts_info(c->ast[i], 64, c->sys->time_base.num, c->sys->time_base.den);
>>              c->ast[i]->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
>>              c->ast[i]->codecpar->codec_id   = AV_CODEC_ID_PCM_S16LE;
>>
>> @@ -387,8 +386,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>>      for (i = 0; i < c->ach; i++) {
>>          c->audio_pkt[i].pos  = pos;
>>          c->audio_pkt[i].size = size;
>> -        c->audio_pkt[i].pts  = c->abytes * 30000 * 8 /
>> -                               c->ast[i]->codecpar->bit_rate;
>> +        c->audio_pkt[i].pts  = (c->sys->height == 720) ? (c->frames & ~1) : c->frames;
>>          ppcm[i] = c->audio_buf[i];
>>      }
>>      if (c->ach)
>> @@ -401,10 +399,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>>              c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
>>          } else {
>>              c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
>> -            c->abytes           += size;
>>          }
>> -    } else {
>> -        c->abytes += size;
>>      }
>>
>>      /* Now it's time to return video packet */
>
> Please correct me if iam wrong but
> in cases where no audio is missing or damaged, this would also ignore how much
> audio is in each packet. So you could have lets say a timestamp difference
> of excatly 1 second between 2 packets while their is actually not exactly
> 1 second worth of audio samples between them.

This is true, by using the frame counter (and the video time base) for 
audio, we lose some audio packet timestamp precision inherently. However I 
don't consider this a problem, audio timestamps do not have to be sample 
accurate, for most formats they are not. Also it is not practical to keep 
track of how many samples are there in the packets, for example when you 
do seeking, obviously you can't read all the audio data before the seek 
point to get a precise sample accurate timestamp.

What matters is that based on what I understand about the DV format (but 
maybe Dave can confirm or deny this) the divergence between the audio 
timestamp and the video timestamp in a DV frame must be less than 1/3 
frame duration even for unlocked mode:

http://www.adamwilt.com/DV-FAQ-tech.html#LockedAudio

I believe this patch also fixes the timestamps when the audio clock is 
unlocked and have a completely indpendent clock source. (E.g. runs on 
fixed 48009 Hz, see the "Unlocked audio: real life" section in the linked 
page)

Regards,
Marton
Dave Rice Nov. 1, 2020, 9:52 p.m. UTC | #9
> On Nov 1, 2020, at 3:58 PM, Marton Balint <cus@passwd.hu> wrote:
> 
> 
> 
> On Sun, 1 Nov 2020, Michael Niedermayer wrote:
> 
>> On Sat, Oct 31, 2020 at 05:56:24PM +0100, Marton Balint wrote:
>>> Fixes out of sync timestamps in ticket #8762.
>>> 
>>> Signed-off-by: Marton Balint <cus@passwd.hu>
>>> ---
>>> libavformat/dv.c       | 16 ++--------------
>>> tests/ref/seek/lavf-dv | 18 +++++++++---------
>>> 2 files changed, 11 insertions(+), 23 deletions(-)
>>> 
>>> diff --git a/libavformat/dv.c b/libavformat/dv.c
>>> index 3e0d12c0e3..26a78139f5 100644
>>> --- a/libavformat/dv.c
>>> +++ b/libavformat/dv.c
>>> @@ -49,7 +49,6 @@ struct DVDemuxContext {
>>>     uint8_t           audio_buf[4][8192];
>>>     int               ach;
>>>     int               frames;
>>> -    uint64_t          abytes;
>>> };
>>> 
>>> static inline uint16_t dv_audio_12to16(uint16_t sample)
>>> @@ -258,7 +257,7 @@ static int dv_extract_audio_info(DVDemuxContext *c, const uint8_t *frame)
>>>             c->ast[i] = avformat_new_stream(c->fctx, NULL);
>>>             if (!c->ast[i])
>>>                 break;
>>> -            avpriv_set_pts_info(c->ast[i], 64, 1, 30000);
>>> +            avpriv_set_pts_info(c->ast[i], 64, c->sys->time_base.num, c->sys->time_base.den);
>>>             c->ast[i]->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
>>>             c->ast[i]->codecpar->codec_id   = AV_CODEC_ID_PCM_S16LE;
>>> 
>>> @@ -387,8 +386,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>>>     for (i = 0; i < c->ach; i++) {
>>>         c->audio_pkt[i].pos  = pos;
>>>         c->audio_pkt[i].size = size;
>>> -        c->audio_pkt[i].pts  = c->abytes * 30000 * 8 /
>>> -                               c->ast[i]->codecpar->bit_rate;
>>> +        c->audio_pkt[i].pts  = (c->sys->height == 720) ? (c->frames & ~1) : c->frames;
>>>         ppcm[i] = c->audio_buf[i];
>>>     }
>>>     if (c->ach)
>>> @@ -401,10 +399,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>>>             c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
>>>         } else {
>>>             c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
>>> -            c->abytes           += size;
>>>         }
>>> -    } else {
>>> -        c->abytes += size;
>>>     }
>>> 
>>>     /* Now it's time to return video packet */
>> 
>> Please correct me if iam wrong but
>> in cases where no audio is missing or damaged, this would also ignore how much
>> audio is in each packet. So you could have lets say a timestamp difference
>> of excatly 1 second between 2 packets while their is actually not exactly
>> 1 second worth of audio samples between them.
> 
> This is true, by using the frame counter (and the video time base) for audio, we lose some audio packet timestamp precision inherently. However I don't consider this a problem, audio timestamps do not have to be sample accurate, for most formats they are not. Also it is not practical to keep track of how many samples are there in the packets, for example when you do seeking, obviously you can't read all the audio data before the seek point to get a precise sample accurate timestamp.

Good point.

> What matters is that based on what I understand about the DV format (but maybe Dave can confirm or deny this) the divergence between the audio timestamp and the video timestamp in a DV frame must be less than 1/3 frame duration even for unlocked mode:
> 
> http://www.adamwilt.com/DV-FAQ-tech.html#LockedAudio

The divergence could be a little larger than 1/3 frame in unlocked mode. IEC61384-2 defines the allowable range of minimum to maximum samples per frame and the maximum allowable divergence of accumulated samples per frame.

Mode       | Min-Max   | Allowance of accumulated difference
NTSC 48000 | 1580-1620 | 20
NTSC 44100 | 1452-1489 | 19
NTSC 32000 | 1053-1080 | 14
PAL  48000 | 1896-1944 | 24
PAL  44100 | 1742-1786 | 22
PAL  32000 | 1264-1296 | 16

The divergence between the audio timestamp and video timestamp is conditional on the mode, so that would be

Mode       | Max divergence as percentage of frame duration
NTSC 48000 | 0.3742511235
NTSC 44100 | 0.3869807536
NTSC 32000 | 0.3929636797
PAL  48000 | 0.3125
PAL  44100 | 0.3117913832
PAL  32000 | 0.3125

0.3929636797 is the max divergence, at least according to the spec’s limit of the allowable accumulated difference.

> I believe this patch also fixes the timestamps when the audio clock is unlocked and have a completely indpendent clock source. (E.g. runs on fixed 48009 Hz, see the "Unlocked audio: real life" section in the linked page)

I suppose it’s possible for a DV stream to use unlocked audio and ignore the defined limits of allowable accumulated difference ("real life" or worse) and fill max samples for each frame which could give sample rates of 48,600 in PAL at 48.

Kind Regards,
Dave Rice
Michael Niedermayer Nov. 2, 2020, 2:47 p.m. UTC | #10
On Sun, Nov 01, 2020 at 09:58:43PM +0100, Marton Balint wrote:
> 
> 
> On Sun, 1 Nov 2020, Michael Niedermayer wrote:
> 
> > On Sat, Oct 31, 2020 at 05:56:24PM +0100, Marton Balint wrote:
> > > Fixes out of sync timestamps in ticket #8762.
> > > 
> > > Signed-off-by: Marton Balint <cus@passwd.hu>
> > > ---
> > >  libavformat/dv.c       | 16 ++--------------
> > >  tests/ref/seek/lavf-dv | 18 +++++++++---------
> > >  2 files changed, 11 insertions(+), 23 deletions(-)
> > > 
> > > diff --git a/libavformat/dv.c b/libavformat/dv.c
> > > index 3e0d12c0e3..26a78139f5 100644
> > > --- a/libavformat/dv.c
> > > +++ b/libavformat/dv.c
> > > @@ -49,7 +49,6 @@ struct DVDemuxContext {
> > >      uint8_t           audio_buf[4][8192];
> > >      int               ach;
> > >      int               frames;
> > > -    uint64_t          abytes;
> > >  };
> > > 
> > >  static inline uint16_t dv_audio_12to16(uint16_t sample)
> > > @@ -258,7 +257,7 @@ static int dv_extract_audio_info(DVDemuxContext *c, const uint8_t *frame)
> > >              c->ast[i] = avformat_new_stream(c->fctx, NULL);
> > >              if (!c->ast[i])
> > >                  break;
> > > -            avpriv_set_pts_info(c->ast[i], 64, 1, 30000);
> > > +            avpriv_set_pts_info(c->ast[i], 64, c->sys->time_base.num, c->sys->time_base.den);
> > >              c->ast[i]->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
> > >              c->ast[i]->codecpar->codec_id   = AV_CODEC_ID_PCM_S16LE;
> > > 
> > > @@ -387,8 +386,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
> > >      for (i = 0; i < c->ach; i++) {
> > >          c->audio_pkt[i].pos  = pos;
> > >          c->audio_pkt[i].size = size;
> > > -        c->audio_pkt[i].pts  = c->abytes * 30000 * 8 /
> > > -                               c->ast[i]->codecpar->bit_rate;
> > > +        c->audio_pkt[i].pts  = (c->sys->height == 720) ? (c->frames & ~1) : c->frames;
> > >          ppcm[i] = c->audio_buf[i];
> > >      }
> > >      if (c->ach)
> > > @@ -401,10 +399,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
> > >              c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
> > >          } else {
> > >              c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
> > > -            c->abytes           += size;
> > >          }
> > > -    } else {
> > > -        c->abytes += size;
> > >      }
> > > 
> > >      /* Now it's time to return video packet */
> > 
> > Please correct me if iam wrong but
> > in cases where no audio is missing or damaged, this would also ignore how much
> > audio is in each packet. So you could have lets say a timestamp difference
> > of excatly 1 second between 2 packets while their is actually not exactly
> > 1 second worth of audio samples between them.
> 
> This is true, by using the frame counter (and the video time base) for
> audio, we lose some audio packet timestamp precision inherently. However I
> don't consider this a problem, audio timestamps do not have to be sample
> accurate, for most formats they are not. 


> Also it is not practical to keep
> track of how many samples are there in the packets, for example when you do
> seeking, obviously you can't read all the audio data before the seek point
> to get a precise sample accurate timestamp.

Its true that with seeking there is not enough information for sample precisse
timestamps. But from packet to packet as long as no seek happened there is.
My concern was more about something like significant frame to frame
differences in audio sample numbers. 
Because if some hw or sw generates this we would produce packets of
identical duration which differ substantially in number of samples and
that would not be handled well in any scenario that accepted the timestamps
and durations as exact.

Maybe this never occurs and in that case your patch should be a good idea
but if it does happen then some code would be needed to deal with that.
It is detectable when sample counts do not match what is expected.

That said, i would consider a fix for #8762 to produce correct audio in
all cases including wav/pcm/mov/... output and not just when the output
can store "corrupted"/"sparse" audio.
Also to me returning the data from the input file which would represent audio
if it was not corrupt seems to be somehow the "correct" thing to do.
Maybe this never contains any useful data then it doesnt matter in
reality but still it feels a bit odd to fix just the timestamps.
But maybe that was not what you intended anyway?

thx
[...]
Marton Balint Nov. 2, 2020, 8:42 p.m. UTC | #11
On Mon, 2 Nov 2020, Michael Niedermayer wrote:

>>> Please correct me if iam wrong but
>>> in cases where no audio is missing or damaged, this would also ignore how much
>>> audio is in each packet. So you could have lets say a timestamp difference
>>> of excatly 1 second between 2 packets while their is actually not exactly
>>> 1 second worth of audio samples between them.
>>
>> This is true, by using the frame counter (and the video time base) for
>> audio, we lose some audio packet timestamp precision inherently. However I
>> don't consider this a problem, audio timestamps do not have to be sample
>> accurate, for most formats they are not.
>
>
>> Also it is not practical to keep
>> track of how many samples are there in the packets, for example when you do
>> seeking, obviously you can't read all the audio data before the seek point
>> to get a precise sample accurate timestamp.
>
> Its true that with seeking there is not enough information for sample precisse
> timestamps. But from packet to packet as long as no seek happened there is.

And that timestamp can turn out to be wrong. If the audio clock is running 
at little more than 48 kHz, there will be A-V desync because after some 
time audio and video timestamps for packets coming from the same DV frame 
will diverge significantly.

> My concern was more about something like significant frame to frame
> differences in audio sample numbers.
> Because if some hw or sw generates this we would produce packets of
> identical duration which differ substantially in number of samples and
> that would not be handled well in any scenario that accepted the timestamps
> and durations as exact.

In general, you can't assume that timestamps or packet durations are 
exact. Consider you have a format which stores timestamps and durations in 
miliseconds. Rounding errors will occur. Also, for consumer equipment 
audio and video is rarely locked together, and audio sample rates are 
rarely very precise.

> Maybe this never occurs and in that case your patch should be a good idea
> but if it does happen then some code would be needed to deal with that.
> It is detectable when sample counts do not match what is expected.

Yeah, and we have tools to fix that, like -af aresample=async=1.

> That said, i would consider a fix for #8762 to produce correct audio in
> all cases including wav/pcm/mov/... output and not just when the output
> can store "corrupted"/"sparse" audio.

I think ffmpeg.c should be smarter about it, and be aware if unlocked or 
sparse audio (or audio not starting at the same time as video) is 
supported by certain muxers or not. And if it is not suppoted, then maybe 
-af async=1 or similar should be used automagically.

> Also to me returning the data from the input file which would represent audio
> if it was not corrupt seems to be somehow the "correct" thing to do.
> Maybe this never contains any useful data then it doesnt matter in
> reality but still it feels a bit odd to fix just the timestamps.

I am not strictly against applying your patch, I can accept that for the 
users it might be useful to get the data at the demuxer level and not play 
with async=1, yes, sparse audio requires extra care. I might even be OK 
with changing the default to pass corrupt packets. But this does not 
change the fact that the audio timestamps are currently wrong, because 
they ignore that audio and video from the same DV frame are synced 
together with at most 1/3 frame duration error.

Regards,
Marton
Michael Niedermayer Nov. 4, 2020, 9:09 a.m. UTC | #12
On Mon, Nov 02, 2020 at 09:42:21PM +0100, Marton Balint wrote:
> 
> 
> On Mon, 2 Nov 2020, Michael Niedermayer wrote:
> 
> > > > Please correct me if iam wrong but
> > > > in cases where no audio is missing or damaged, this would also ignore how much
> > > > audio is in each packet. So you could have lets say a timestamp difference
> > > > of excatly 1 second between 2 packets while their is actually not exactly
> > > > 1 second worth of audio samples between them.
> > > 
> > > This is true, by using the frame counter (and the video time base) for
> > > audio, we lose some audio packet timestamp precision inherently. However I
> > > don't consider this a problem, audio timestamps do not have to be sample
> > > accurate, for most formats they are not.
> > 
> > 
> > > Also it is not practical to keep
> > > track of how many samples are there in the packets, for example when you do
> > > seeking, obviously you can't read all the audio data before the seek point
> > > to get a precise sample accurate timestamp.
> > 
> > Its true that with seeking there is not enough information for sample precisse
> > timestamps. But from packet to packet as long as no seek happened there is.
> 
> And that timestamp can turn out to be wrong. If the audio clock is running
> at little more than 48 kHz, there will be A-V desync because after some time
> audio and video timestamps for packets coming from the same DV frame will
> diverge significantly.
> 

> > My concern was more about something like significant frame to frame
> > differences in audio sample numbers.
> > Because if some hw or sw generates this we would produce packets of
> > identical duration which differ substantially in number of samples and
> > that would not be handled well in any scenario that accepted the timestamps
> > and durations as exact.
> 
> In general, you can't assume that timestamps or packet durations are exact.
> Consider you have a format which stores timestamps and durations in
> miliseconds. Rounding errors will occur. 

sure, maybe the distinction of millisecond/rounded timebases and exact
timebases needs a flag somewhere.


> Also, for consumer equipment audio
> and video is rarely locked together, and audio sample rates are rarely very
> precise.

sure, this case is maybe a bit more exceptional than this though
we have "millisecond" based formats, rounded timestamps
we have "exact" cases, maybe the timebase being 1 packet/frame per tick
we have "high precission" where the timebase is so precisse it doesnt matter

This here though is a bit an oddball, the size if 1 PCM frame is 1 sample
The timebase is not a millisecond based one, its not 1 frame either nor is
it exact nor high precission.
Its 1 video frame, and whatever amount of audio there is in the container

which IIUC can differ from 1 video frame even rounded.
maybe this just doesnt occur and each frame has a count of samples always
rounded to the closes integer count for the video frame.

But if for example some hardware was using internally a 16 sample buffer
and only put multiplies of 16 samples in frames this would introduce a
considerable amount of jitter in the timestamps in relation to the actual
duration. And using async to fix this without introducing more problems
might require some care


 
> > Maybe this never occurs and in that case your patch should be a good idea
> > but if it does happen then some code would be needed to deal with that.
> > It is detectable when sample counts do not match what is expected.
> 
> Yeah, and we have tools to fix that, like -af aresample=async=1.



> 
> > That said, i would consider a fix for #8762 to produce correct audio in
> > all cases including wav/pcm/mov/... output and not just when the output
> > can store "corrupted"/"sparse" audio.
> 
> I think ffmpeg.c should be smarter about it, and be aware if unlocked or
> sparse audio (or audio not starting at the same time as video) is supported
> by certain muxers or not. And if it is not suppoted, then maybe -af async=1
> or similar should be used automagically.

yes

thx

[....]
Marton Balint Nov. 4, 2020, 9:44 p.m. UTC | #13
On Wed, 4 Nov 2020, Michael Niedermayer wrote:

> we have "millisecond" based formats, rounded timestamps
> we have "exact" cases, maybe the timebase being 1 packet/frame per tick
> we have "high precission" where the timebase is so precisse it doesnt matter
>
> This here though is a bit an oddball, the size if 1 PCM frame is 1 sample
> The timebase is not a millisecond based one, its not 1 frame either nor is
> it exact nor high precission.
> Its 1 video frame, and whatever amount of audio there is in the container
>
> which IIUC can differ from 1 video frame even rounded.
> maybe this just doesnt occur and each frame has a count of samples always
> rounded to the closes integer count for the video frame.

The difference between the audio timestamp and the video timestamp for 
packets from the same DV frame is at most 0.3929636797*frame_duration as 
the specification says, as Dave quoted, so I don't see how the error can 
be bigger than this.

It looks to me you are mixing timestamps coming from a demuxer, and 
timestamps you calculate by counting the number of demuxed/decoded audio 
samples or video frames. Synchronization is done using the former.

>
> But if for example some hardware was using internally a 16 sample buffer
> and only put multiplies of 16 samples in frames this would introduce a
> considerable amount of jitter in the timestamps in relation to the actual
> duration. And using async to fix this without introducing more problems
> might require some care.

I still don't see why timestamp or duration jitter is a problem as long as 
the error is below frame_duration/2. You can safely use async with 
min_hard_comp set to frame_duration/2.

In general, don't you find it problematic that the dv demuxer can return 
different timestamps if you read packets sequentially and if you seek to 
the end of a file? It looks like a huge bug which is not fixable if you 
insist on sample counting...

Regards,
Marton
Michael Niedermayer Nov. 6, 2020, 9:03 p.m. UTC | #14
On Wed, Nov 04, 2020 at 10:44:56PM +0100, Marton Balint wrote:
> 
> 
> On Wed, 4 Nov 2020, Michael Niedermayer wrote:
> 
> > we have "millisecond" based formats, rounded timestamps
> > we have "exact" cases, maybe the timebase being 1 packet/frame per tick
> > we have "high precission" where the timebase is so precisse it doesnt matter
> > 
> > This here though is a bit an oddball, the size if 1 PCM frame is 1 sample
> > The timebase is not a millisecond based one, its not 1 frame either nor is
> > it exact nor high precission.
> > Its 1 video frame, and whatever amount of audio there is in the container
> > 
> > which IIUC can differ from 1 video frame even rounded.
> > maybe this just doesnt occur and each frame has a count of samples always
> > rounded to the closes integer count for the video frame.
> 
> The difference between the audio timestamp and the video timestamp for
> packets from the same DV frame is at most 0.3929636797*frame_duration as the
> specification says, as Dave quoted, so I don't see how the error can be
> bigger than this.
> 
> It looks to me you are mixing timestamps coming from a demuxer, and
> timestamps you calculate by counting the number of demuxed/decoded audio
> samples or video frames. Synchronization is done using the former.
> 

> > 
> > But if for example some hardware was using internally a 16 sample buffer
> > and only put multiplies of 16 samples in frames this would introduce a
> > considerable amount of jitter in the timestamps in relation to the actual
> > duration. And using async to fix this without introducing more problems
> > might require some care.
> 
> I still don't see why timestamp or duration jitter is a problem 

> as long as
> the error is below frame_duration/2. You can safely use async with
> min_hard_comp set to frame_duration/2.

Thats exactly what i meant. an async like filter which behaves differently
or async with a different value there can mess this up.
IMHO such mess up is ok when the input is corrupted or invalid. OTOH
here it is valid and correct data.


> 
> In general, don't you find it problematic that the dv demuxer can return
> different timestamps if you read packets sequentially and if you seek to the
> end of a file? It looks like a huge bug

yes, this is not great
but even with your patch you still have this effect
when seeking to some point in time a player has to output video and
audio to the user at an exact time and that will differ even with async
from linear playbacks presentation


> which is not fixable if you insist
> on sample counting...

I think you misunderstood me, or maybe i didnt state my opinion well,
iam not saying that i consider what dv in git does good. Rather that there
is a problem beyond what these patches fix. 
Some concept of timestamp accuracy independant of the distance of representable
values would be usefull.
if you take teh 1/25 or whatever they are based on dv timestamps and convert that
to teh mpeg 90khz based ones thats not making it that accurate.
OTOH if you take 1/25 based audio where each packet is 1/25sec worth of samples
that very well might be sample accurate or even beyond.
knowing this accuracy is usefull for configuring a async like filter or also in
knowing how to deal with inconsistencies, is that timestamp jtter ? or the sample
rate jittering / some droped samples ? 
Its important to know as in one instance its the timestamps that need adjustment 
while in the other the samples need adjustment
ATM its down to the user to figure out on a file by file base how to deal or
ignore this. Instead it should be possible for an automated system to 
compensate such issues ...

thx

[...]
Dave Rice Nov. 10, 2020, 3 a.m. UTC | #15
> On Nov 6, 2020, at 4:03 PM, Michael Niedermayer <michael@niedermayer.cc> wrote:
> 
> On Wed, Nov 04, 2020 at 10:44:56PM +0100, Marton Balint wrote:
>> 
>> On Wed, 4 Nov 2020, Michael Niedermayer wrote:
>> 
>>> we have "millisecond" based formats, rounded timestamps
>>> we have "exact" cases, maybe the timebase being 1 packet/frame per tick
>>> we have "high precission" where the timebase is so precisse it doesnt matter
>>> 
>>> This here though is a bit an oddball, the size if 1 PCM frame is 1 sample
>>> The timebase is not a millisecond based one, its not 1 frame either nor is
>>> it exact nor high precission.
>>> Its 1 video frame, and whatever amount of audio there is in the container
>>> 
>>> which IIUC can differ from 1 video frame even rounded.
>>> maybe this just doesnt occur and each frame has a count of samples always
>>> rounded to the closes integer count for the video frame.
>> 
>> The difference between the audio timestamp and the video timestamp for
>> packets from the same DV frame is at most 0.3929636797*frame_duration as the
>> specification says, as Dave quoted, so I don't see how the error can be
>> bigger than this.
>> 
>> It looks to me you are mixing timestamps coming from a demuxer, and
>> timestamps you calculate by counting the number of demuxed/decoded audio
>> samples or video frames. Synchronization is done using the former.
>> 
> 
>>> 
>>> But if for example some hardware was using internally a 16 sample buffer
>>> and only put multiplies of 16 samples in frames this would introduce a
>>> considerable amount of jitter in the timestamps in relation to the actual
>>> duration. And using async to fix this without introducing more problems
>>> might require some care.
>> 
>> I still don't see why timestamp or duration jitter is a problem 
> 
>> as long as
>> the error is below frame_duration/2. You can safely use async with
>> min_hard_comp set to frame_duration/2.
> 
> Thats exactly what i meant. an async like filter which behaves differently
> or async with a different value there can mess this up.
> IMHO such mess up is ok when the input is corrupted or invalid. OTOH
> here it is valid and correct data.
> 
>> In general, don't you find it problematic that the dv demuxer can return
>> different timestamps if you read packets sequentially and if you seek to the
>> end of a file? It looks like a huge bug
> 
> yes, this is not great
> but even with your patch you still have this effect
> when seeking to some point in time a player has to output video and
> audio to the user at an exact time and that will differ even with async
> from linear playbacks presentation

When trying to workaround the loss of audio sync, I use -skip_initial_bytes on the dv input to jump to the frame after a missing audio pack to read from that point to keep audio and video in sync from that offset in the bytestream (at least until the next missing audio source pack).

>> which is not fixable if you insist
>> on sample counting...
> 
> I think you misunderstood me, or maybe i didnt state my opinion well,
> iam not saying that i consider what dv in git does good. Rather that there
> is a problem beyond what these patches fix. 
> Some concept of timestamp accuracy independant of the distance of representable
> values would be usefull.
> if you take teh 1/25 or whatever they are based on dv timestamps and convert that
> to teh mpeg 90khz based ones thats not making it that accurate.
> OTOH if you take 1/25 based audio where each packet is 1/25sec worth of samples
> that very well might be sample accurate or even beyond.
> knowing this accuracy is usefull for configuring a async like filter or also in
> knowing how to deal with inconsistencies, is that timestamp jtter ? or the sample
> rate jittering / some droped samples ? 
> Its important to know as in one instance its the timestamps that need adjustment 
> while in the other the samples need adjustment
> ATM its down to the user to figure out on a file by file base how to deal or
> ignore this. Instead it should be possible for an automated system to 
> compensate such issues ...

As mentioned elsewhere, some automation (or at least a logged hint) would be helpful to add or suggest aresample=async=1 to fill the gaps when using containers that don’t support sparse audio. With Marton’s patch, the user has the opportunity to use that filter to keep the audio in sync.

[…]

Dave Rice
Marton Balint Nov. 15, 2020, 12:14 a.m. UTC | #16
On Fri, 6 Nov 2020, Michael Niedermayer wrote:

> On Wed, Nov 04, 2020 at 10:44:56PM +0100, Marton Balint wrote:
>>
>>
>> On Wed, 4 Nov 2020, Michael Niedermayer wrote:
>>
>>> we have "millisecond" based formats, rounded timestamps
>>> we have "exact" cases, maybe the timebase being 1 packet/frame per tick
>>> we have "high precission" where the timebase is so precisse it doesnt matter
>>>
>>> This here though is a bit an oddball, the size if 1 PCM frame is 1 sample
>>> The timebase is not a millisecond based one, its not 1 frame either nor is
>>> it exact nor high precission.
>>> Its 1 video frame, and whatever amount of audio there is in the container
>>>
>>> which IIUC can differ from 1 video frame even rounded.
>>> maybe this just doesnt occur and each frame has a count of samples always
>>> rounded to the closes integer count for the video frame.
>>
>> The difference between the audio timestamp and the video timestamp for
>> packets from the same DV frame is at most 0.3929636797*frame_duration as the
>> specification says, as Dave quoted, so I don't see how the error can be
>> bigger than this.
>>
>> It looks to me you are mixing timestamps coming from a demuxer, and
>> timestamps you calculate by counting the number of demuxed/decoded audio
>> samples or video frames. Synchronization is done using the former.
>>
>
>>>
>>> But if for example some hardware was using internally a 16 sample buffer
>>> and only put multiplies of 16 samples in frames this would introduce a
>>> considerable amount of jitter in the timestamps in relation to the actual
>>> duration. And using async to fix this without introducing more problems
>>> might require some care.
>>
>> I still don't see why timestamp or duration jitter is a problem
>
>> as long as
>> the error is below frame_duration/2. You can safely use async with
>> min_hard_comp set to frame_duration/2.
>
> Thats exactly what i meant. an async like filter which behaves differently
> or async with a different value there can mess this up.
> IMHO such mess up is ok when the input is corrupted or invalid. OTOH
> here it is valid and correct data.
>
>
>>
>> In general, don't you find it problematic that the dv demuxer can return
>> different timestamps if you read packets sequentially and if you seek to the
>> end of a file? It looks like a huge bug
>
> yes, this is not great
> but even with your patch you still have this effect
> when seeking to some point in time a player has to output video and
> audio to the user at an exact time and that will differ even with async
> from linear playbacks presentation
>
>
>> which is not fixable if you insist
>> on sample counting...
>
> I think you misunderstood me, or maybe i didnt state my opinion well,
> iam not saying that i consider what dv in git does good. Rather that there
> is a problem beyond what these patches fix.
> Some concept of timestamp accuracy independant of the distance of representable
> values would be usefull.
> if you take teh 1/25 or whatever they are based on dv timestamps and convert that
> to teh mpeg 90khz based ones thats not making it that accurate.
> OTOH if you take 1/25 based audio where each packet is 1/25sec worth of samples
> that very well might be sample accurate or even beyond.
> knowing this accuracy is usefull for configuring a async like filter or also in
> knowing how to deal with inconsistencies, is that timestamp jtter ? or the sample
> rate jittering / some droped samples ?
> Its important to know as in one instance its the timestamps that need adjustment
> while in the other the samples need adjustment
> ATM its down to the user to figure out on a file by file base how to deal or
> ignore this. Instead it should be possible for an automated system to
> compensate such issues ...

OK, but the automated solution is far from trivial, e.g. it should start 
with a analysis of the file to check if the sample rate is accurate or 
not... And if it is not, is the difference constant througout the file? 
Then there are several methods to fix it and the user might have a 
preference. E.g consider audio clock "master" and duplicate/drop video 
frames. Or keep all video frames, but stretch audio (with or without pitch 
correction - and which filter you want for pitch correction? atempo? 
rubberband?). So making it automated is not trivial at all.

Anyhow, is it OK to apply this patch then?

Thanks,
Marton
Dave Rice Dec. 3, 2020, 10:31 p.m. UTC | #17
> On Nov 14, 2020, at 7:14 PM, Marton Balint <cus@passwd.hu> wrote:
> 
> On Fri, 6 Nov 2020, Michael Niedermayer wrote:
> 
>> On Wed, Nov 04, 2020 at 10:44:56PM +0100, Marton Balint wrote:
>>> 
>>> 
>>> On Wed, 4 Nov 2020, Michael Niedermayer wrote:
>>> 
>>>> we have "millisecond" based formats, rounded timestamps
>>>> we have "exact" cases, maybe the timebase being 1 packet/frame per tick
>>>> we have "high precission" where the timebase is so precisse it doesnt matter
>>>> 
>>>> This here though is a bit an oddball, the size if 1 PCM frame is 1 sample
>>>> The timebase is not a millisecond based one, its not 1 frame either nor is
>>>> it exact nor high precission.
>>>> Its 1 video frame, and whatever amount of audio there is in the container
>>>> 
>>>> which IIUC can differ from 1 video frame even rounded.
>>>> maybe this just doesnt occur and each frame has a count of samples always
>>>> rounded to the closes integer count for the video frame.
>>> 
>>> The difference between the audio timestamp and the video timestamp for
>>> packets from the same DV frame is at most 0.3929636797*frame_duration as the
>>> specification says, as Dave quoted, so I don't see how the error can be
>>> bigger than this.
>>> 
>>> It looks to me you are mixing timestamps coming from a demuxer, and
>>> timestamps you calculate by counting the number of demuxed/decoded audio
>>> samples or video frames. Synchronization is done using the former.
>>> 
>> 
>>>> 
>>>> But if for example some hardware was using internally a 16 sample buffer
>>>> and only put multiplies of 16 samples in frames this would introduce a
>>>> considerable amount of jitter in the timestamps in relation to the actual
>>>> duration. And using async to fix this without introducing more problems
>>>> might require some care.
>>> 
>>> I still don't see why timestamp or duration jitter is a problem
>> 
>>> as long as
>>> the error is below frame_duration/2. You can safely use async with
>>> min_hard_comp set to frame_duration/2.
>> 
>> Thats exactly what i meant. an async like filter which behaves differently
>> or async with a different value there can mess this up.
>> IMHO such mess up is ok when the input is corrupted or invalid. OTOH
>> here it is valid and correct data.
>> 
>> 
>>> 
>>> In general, don't you find it problematic that the dv demuxer can return
>>> different timestamps if you read packets sequentially and if you seek to the
>>> end of a file? It looks like a huge bug
>> 
>> yes, this is not great
>> but even with your patch you still have this effect
>> when seeking to some point in time a player has to output video and
>> audio to the user at an exact time and that will differ even with async
>> from linear playbacks presentation
>> 
>> 
>>> which is not fixable if you insist
>>> on sample counting...
>> 
>> I think you misunderstood me, or maybe i didnt state my opinion well,
>> iam not saying that i consider what dv in git does good. Rather that there
>> is a problem beyond what these patches fix.
>> Some concept of timestamp accuracy independant of the distance of representable
>> values would be usefull.
>> if you take teh 1/25 or whatever they are based on dv timestamps and convert that
>> to teh mpeg 90khz based ones thats not making it that accurate.
>> OTOH if you take 1/25 based audio where each packet is 1/25sec worth of samples
>> that very well might be sample accurate or even beyond.
>> knowing this accuracy is usefull for configuring a async like filter or also in
>> knowing how to deal with inconsistencies, is that timestamp jtter ? or the sample
>> rate jittering / some droped samples ?
>> Its important to know as in one instance its the timestamps that need adjustment
>> while in the other the samples need adjustment
>> ATM its down to the user to figure out on a file by file base how to deal or
>> ignore this. Instead it should be possible for an automated system to
>> compensate such issues ...
> 
> OK, but the automated solution is far from trivial, e.g. it should start with a analysis of the file to check if the sample rate is accurate or not... And if it is not, is the difference constant througout the file? Then there are several methods to fix it and the user might have a preference. E.g consider audio clock "master" and duplicate/drop video frames. Or keep all video frames, but stretch audio (with or without pitch correction - and which filter you want for pitch correction? atempo? rubberband?). So making it automated is not trivial at all.
> 
> Anyhow, is it OK to apply this patch then?

Nudging on this, as this patch has been very helpful to avoid tedious workarounds. I’d love to see it merged. If any further test results would be useful, let me know.
Dave
Michael Niedermayer Dec. 5, 2020, 10:53 p.m. UTC | #18
On Sun, Nov 15, 2020 at 01:14:55AM +0100, Marton Balint wrote:
> 
> 
> On Fri, 6 Nov 2020, Michael Niedermayer wrote:
> 
> > On Wed, Nov 04, 2020 at 10:44:56PM +0100, Marton Balint wrote:
> > > 
> > > 
> > > On Wed, 4 Nov 2020, Michael Niedermayer wrote:
> > > 
> > > > we have "millisecond" based formats, rounded timestamps
> > > > we have "exact" cases, maybe the timebase being 1 packet/frame per tick
> > > > we have "high precission" where the timebase is so precisse it doesnt matter
> > > > 
> > > > This here though is a bit an oddball, the size if 1 PCM frame is 1 sample
> > > > The timebase is not a millisecond based one, its not 1 frame either nor is
> > > > it exact nor high precission.
> > > > Its 1 video frame, and whatever amount of audio there is in the container
> > > > 
> > > > which IIUC can differ from 1 video frame even rounded.
> > > > maybe this just doesnt occur and each frame has a count of samples always
> > > > rounded to the closes integer count for the video frame.
> > > 
> > > The difference between the audio timestamp and the video timestamp for
> > > packets from the same DV frame is at most 0.3929636797*frame_duration as the
> > > specification says, as Dave quoted, so I don't see how the error can be
> > > bigger than this.
> > > 
> > > It looks to me you are mixing timestamps coming from a demuxer, and
> > > timestamps you calculate by counting the number of demuxed/decoded audio
> > > samples or video frames. Synchronization is done using the former.
> > > 
> > 
> > > > 
> > > > But if for example some hardware was using internally a 16 sample buffer
> > > > and only put multiplies of 16 samples in frames this would introduce a
> > > > considerable amount of jitter in the timestamps in relation to the actual
> > > > duration. And using async to fix this without introducing more problems
> > > > might require some care.
> > > 
> > > I still don't see why timestamp or duration jitter is a problem
> > 
> > > as long as
> > > the error is below frame_duration/2. You can safely use async with
> > > min_hard_comp set to frame_duration/2.
> > 
> > Thats exactly what i meant. an async like filter which behaves differently
> > or async with a different value there can mess this up.
> > IMHO such mess up is ok when the input is corrupted or invalid. OTOH
> > here it is valid and correct data.
> > 
> > 
> > > 
> > > In general, don't you find it problematic that the dv demuxer can return
> > > different timestamps if you read packets sequentially and if you seek to the
> > > end of a file? It looks like a huge bug
> > 
> > yes, this is not great
> > but even with your patch you still have this effect
> > when seeking to some point in time a player has to output video and
> > audio to the user at an exact time and that will differ even with async
> > from linear playbacks presentation
> > 
> > 
> > > which is not fixable if you insist
> > > on sample counting...
> > 
> > I think you misunderstood me, or maybe i didnt state my opinion well,
> > iam not saying that i consider what dv in git does good. Rather that there
> > is a problem beyond what these patches fix.
> > Some concept of timestamp accuracy independant of the distance of representable
> > values would be usefull.
> > if you take teh 1/25 or whatever they are based on dv timestamps and convert that
> > to teh mpeg 90khz based ones thats not making it that accurate.
> > OTOH if you take 1/25 based audio where each packet is 1/25sec worth of samples
> > that very well might be sample accurate or even beyond.
> > knowing this accuracy is usefull for configuring a async like filter or also in
> > knowing how to deal with inconsistencies, is that timestamp jtter ? or the sample
> > rate jittering / some droped samples ?
> > Its important to know as in one instance its the timestamps that need adjustment
> > while in the other the samples need adjustment
> > ATM its down to the user to figure out on a file by file base how to deal or
> > ignore this. Instead it should be possible for an automated system to
> > compensate such issues ...
> 
> OK, but the automated solution is far from trivial, e.g. it should start
> with a analysis of the file to check if the sample rate is accurate or
> not... And if it is not, is the difference constant througout the file? Then
> there are several methods to fix it and the user might have a preference.
> E.g consider audio clock "master" and duplicate/drop video frames. Or keep
> all video frames, but stretch audio (with or without pitch correction - and
> which filter you want for pitch correction? atempo? rubberband?). So making
> it automated is not trivial at all.
> 

> Anyhow, is it OK to apply this patch then?

yes

thx

[...]
Marton Balint Dec. 6, 2020, 6:04 p.m. UTC | #19
On Sat, 5 Dec 2020, Michael Niedermayer wrote:

> On Sun, Nov 15, 2020 at 01:14:55AM +0100, Marton Balint wrote:
>>
>>
>> On Fri, 6 Nov 2020, Michael Niedermayer wrote:
>>
>>> On Wed, Nov 04, 2020 at 10:44:56PM +0100, Marton Balint wrote:
>>>>
>>>>
>>>> On Wed, 4 Nov 2020, Michael Niedermayer wrote:
>>>>
>>>>> we have "millisecond" based formats, rounded timestamps
>>>>> we have "exact" cases, maybe the timebase being 1 packet/frame per tick
>>>>> we have "high precission" where the timebase is so precisse it doesnt matter
>>>>>
>>>>> This here though is a bit an oddball, the size if 1 PCM frame is 1 sample
>>>>> The timebase is not a millisecond based one, its not 1 frame either nor is
>>>>> it exact nor high precission.
>>>>> Its 1 video frame, and whatever amount of audio there is in the container
>>>>>
>>>>> which IIUC can differ from 1 video frame even rounded.
>>>>> maybe this just doesnt occur and each frame has a count of samples always
>>>>> rounded to the closes integer count for the video frame.
>>>>
>>>> The difference between the audio timestamp and the video timestamp for
>>>> packets from the same DV frame is at most 0.3929636797*frame_duration as the
>>>> specification says, as Dave quoted, so I don't see how the error can be
>>>> bigger than this.
>>>>
>>>> It looks to me you are mixing timestamps coming from a demuxer, and
>>>> timestamps you calculate by counting the number of demuxed/decoded audio
>>>> samples or video frames. Synchronization is done using the former.
>>>>
>>>
>>>>>
>>>>> But if for example some hardware was using internally a 16 sample buffer
>>>>> and only put multiplies of 16 samples in frames this would introduce a
>>>>> considerable amount of jitter in the timestamps in relation to the actual
>>>>> duration. And using async to fix this without introducing more problems
>>>>> might require some care.
>>>>
>>>> I still don't see why timestamp or duration jitter is a problem
>>>
>>>> as long as
>>>> the error is below frame_duration/2. You can safely use async with
>>>> min_hard_comp set to frame_duration/2.
>>>
>>> Thats exactly what i meant. an async like filter which behaves differently
>>> or async with a different value there can mess this up.
>>> IMHO such mess up is ok when the input is corrupted or invalid. OTOH
>>> here it is valid and correct data.
>>>
>>>
>>>>
>>>> In general, don't you find it problematic that the dv demuxer can return
>>>> different timestamps if you read packets sequentially and if you seek to the
>>>> end of a file? It looks like a huge bug
>>>
>>> yes, this is not great
>>> but even with your patch you still have this effect
>>> when seeking to some point in time a player has to output video and
>>> audio to the user at an exact time and that will differ even with async
>>> from linear playbacks presentation
>>>
>>>
>>>> which is not fixable if you insist
>>>> on sample counting...
>>>
>>> I think you misunderstood me, or maybe i didnt state my opinion well,
>>> iam not saying that i consider what dv in git does good. Rather that there
>>> is a problem beyond what these patches fix.
>>> Some concept of timestamp accuracy independant of the distance of representable
>>> values would be usefull.
>>> if you take teh 1/25 or whatever they are based on dv timestamps and convert that
>>> to teh mpeg 90khz based ones thats not making it that accurate.
>>> OTOH if you take 1/25 based audio where each packet is 1/25sec worth of samples
>>> that very well might be sample accurate or even beyond.
>>> knowing this accuracy is usefull for configuring a async like filter or also in
>>> knowing how to deal with inconsistencies, is that timestamp jtter ? or the sample
>>> rate jittering / some droped samples ?
>>> Its important to know as in one instance its the timestamps that need adjustment
>>> while in the other the samples need adjustment
>>> ATM its down to the user to figure out on a file by file base how to deal or
>>> ignore this. Instead it should be possible for an automated system to
>>> compensate such issues ...
>>
>> OK, but the automated solution is far from trivial, e.g. it should start
>> with a analysis of the file to check if the sample rate is accurate or
>> not... And if it is not, is the difference constant througout the file? Then
>> there are several methods to fix it and the user might have a preference.
>> E.g consider audio clock "master" and duplicate/drop video frames. Or keep
>> all video frames, but stretch audio (with or without pitch correction - and
>> which filter you want for pitch correction? atempo? rubberband?). So making
>> it automated is not trivial at all.
>>
>
>> Anyhow, is it OK to apply this patch then?
>
> yes

Thanks, applied.

Regards,
Marton
Dave Rice Feb. 20, 2021, 5:20 p.m. UTC | #20
Hi,

> On Oct 31, 2020, at 5:15 PM, Marton Balint <cus@passwd.hu <mailto:cus@passwd.hu>> wrote:
> On Sat, 31 Oct 2020, Dave Rice wrote:
>>> On Oct 31, 2020, at 3:47 PM, Marton Balint <cus@passwd.hu <mailto:cus@passwd.hu>> wrote:
>>> On Sat, 31 Oct 2020, Dave Rice wrote:
>>>> Hi Marton,
>>>>> On Oct 31, 2020, at 12:56 PM, Marton Balint <cus@passwd.hu <mailto:cus@passwd.hu>> wrote:
>>>>> Fixes out of sync timestamps in ticket #8762.
>>>> Although Michael’s recent patch does address the issue documented in 8762, I haven’t found this patch to fix the issue. I tried with -c:a copy and with -c:a pcm_s16le with some sample files that exhibit this issue but each output was out of sync. I put an output at https://gist.github.com/dericed/659bd843bd38b6f24a60198b5e345795 <https://gist.github.com/dericed/659bd843bd38b6f24a60198b5e345795>. That output notes that 3597 packages of video are read and 3586 packets of audio. In the resulting file, at the end of the timeline the audio is 9 frames out of sync and my output video stream is 00:02:00.020 and output audio stream is 00:01:59.653.
>>>> Beyond copying or encoding the audio, are there other options I should use to test this?
>>> Well, it depends on what you want. After this patch you should get a file which has audio packets synced to video, but the audio stream is sparse, not every video packet has a corresponding audio packet. (It looks like our MOV muxer does not support muxing of sparse audio therefore does not produce proper timestamps. But MKV does, please try that.)
>>> You can also make ffmpeg generate the missing audio based on packet timestamps. Swresample has an async=1 option, so something like this should get you synced audio with continous audio packets:
>>> ffmpeg -y -i 1670520000_12.dv -c:v copy \
>>> -af aresample=async=1:min_hard_comp=0.01 -c:a pcm_s16le 1670520000_12.mov
>> 
>> Thank you for this. With the patch and async, the result is synced and the resulting audio was the same as Michael’s patch.
>> 
>> Could you explain why you used min_hard_comp here? IIUC min_hard_comp is a set a threshold between the strategies of trim/fill or stretch/squeeze to align the audio to time; however, the async documentation says "Setting this to 1 will enable filling and trimming, larger values represent the maximum amount in samples that the data may be stretched or squeezed” so I thought that async=1 would not permit stretch/squeeze anyway.
> 
> It is documented poorly, but if you check the source code you will see that async=1 implicitly sets min_comp to 0.001 enabling trimming/dropping. min_hard_comp decides the threshold when silence injection actually happens, and the default for that is 0.1, which is more than a frame, therefore not acceptable if we want to maintain <1 frame accuracy. Or at least that is how I think it should work.

I’ve found that aresample=async=1:min_hard_comp=0.01, as discussed here, works well to add audio samples to maintain timestamp accuracy when muxing into a format like mov. However, this approach doesn’t work if the sparseness of the audio stream is at the end of the stream. Is there a way to use min_hard_comp to consider differences between a timestamp and audio data when one of the ends of that range is the end of the file?
Best Regards,
Dave Rice
Marton Balint Feb. 23, 2021, 7:42 p.m. UTC | #21
On Sat, 20 Feb 2021, Dave Rice wrote:

> Hi,
>
>> On Oct 31, 2020, at 5:15 PM, Marton Balint <cus@passwd.hu <mailto:cus@passwd.hu>> wrote:
>> On Sat, 31 Oct 2020, Dave Rice wrote:
>>>> On Oct 31, 2020, at 3:47 PM, Marton Balint <cus@passwd.hu <mailto:cus@passwd.hu>> wrote:
>>>> On Sat, 31 Oct 2020, Dave Rice wrote:
>>>>> Hi Marton,
>>>>>> On Oct 31, 2020, at 12:56 PM, Marton Balint <cus@passwd.hu <mailto:cus@passwd.hu>> wrote:
>>>>>> Fixes out of sync timestamps in ticket #8762.
>>>>> Although Michael’s recent patch does address the issue documented in 8762, I haven’t found this patch to fix the issue. I tried with -c:a copy and with -c:a pcm_s16le with some sample files that exhibit this issue but each output was out of sync. I put an output at https://gist.github.com/dericed/659bd843bd38b6f24a60198b5e345795 <https://gist.github.com/dericed/659bd843bd38b6f24a60198b5e345795>. That output notes that 3597 packages of video are read and 3586 packets of audio. In the resulting file, at the end of the timeline the audio is 9 frames out of sync and my output video stream is 00:02:00.020 and output audio stream is 00:01:59.653.
>>>>> Beyond copying or encoding the audio, are there other options I should use to test this?
>>>> Well, it depends on what you want. After this patch you should get a file which has audio packets synced to video, but the audio stream is sparse, not every video packet has a corresponding audio packet. (It looks like our MOV muxer does not support muxing of sparse audio therefore does not produce proper timestamps. But MKV does, please try that.)
>>>> You can also make ffmpeg generate the missing audio based on packet timestamps. Swresample has an async=1 option, so something like this should get you synced audio with continous audio packets:
>>>> ffmpeg -y -i 1670520000_12.dv -c:v copy \
>>>> -af aresample=async=1:min_hard_comp=0.01 -c:a pcm_s16le 1670520000_12.mov
>>> 
>>> Thank you for this. With the patch and async, the result is synced and the resulting audio was the same as Michael’s patch.
>>> 
>>> Could you explain why you used min_hard_comp here? IIUC min_hard_comp is a set a threshold between the strategies of trim/fill or stretch/squeeze to align the audio to time; however, the async documentation says "Setting this to 1 will enable filling and trimming, larger values represent the maximum amount in samples that the data may be stretched or squeezed” so I thought that async=1 would not permit stretch/squeeze anyway.
>> 
>> It is documented poorly, but if you check the source code you will see that async=1 implicitly sets min_comp to 0.001 enabling trimming/dropping. min_hard_comp decides the threshold when silence injection actually happens, and the default for that is 0.1, which is more than a frame, therefore not acceptable if we want to maintain <1 frame accuracy. Or at least that is how I think it should work.
>

> I’ve found that aresample=async=1:min_hard_comp=0.01, as discussed here, 
> works well to add audio samples to maintain timestamp accuracy when 
> muxing into a format like mov. However, this approach doesn’t work if 
> the sparseness of the audio stream is at the end of the stream. Is there 
> a way to use min_hard_comp to consider differences between a timestamp 
> and audio data when one of the ends of that range is the end of the 
> file?

I am not aware of a smart method to generate missing audio in the end 
until the end of video.

As a possible workaround you may query the video length using
ffprobe or mediainfo, and then use a second filter, apad to pad audio:

-af aresample=async=1:min_hard_comp=0.01,apad=whole_dur=<video_length>

Tnis might do what you want, but requires an additional step to query the 
video length...

Regards,
Marton
Dave Rice Feb. 23, 2021, 8:07 p.m. UTC | #22
> On Feb 23, 2021, at 2:42 PM, Marton Balint <cus@passwd.hu> wrote:
> 
> 
> 
> On Sat, 20 Feb 2021, Dave Rice wrote:
> 
>> Hi,
>> 
>>> On Oct 31, 2020, at 5:15 PM, Marton Balint <cus@passwd.hu <mailto:cus@passwd.hu>> wrote:
>>> On Sat, 31 Oct 2020, Dave Rice wrote:
>>>>> On Oct 31, 2020, at 3:47 PM, Marton Balint <cus@passwd.hu <mailto:cus@passwd.hu>> wrote:
>>>>> On Sat, 31 Oct 2020, Dave Rice wrote:
>>>>>> Hi Marton,
>>>>>>> On Oct 31, 2020, at 12:56 PM, Marton Balint <cus@passwd.hu <mailto:cus@passwd.hu>> wrote:
>>>>>>> Fixes out of sync timestamps in ticket #8762.
>>>>>> Although Michael’s recent patch does address the issue documented in 8762, I haven’t found this patch to fix the issue. I tried with -c:a copy and with -c:a pcm_s16le with some sample files that exhibit this issue but each output was out of sync. I put an output at https://gist.github.com/dericed/659bd843bd38b6f24a60198b5e345795 <https://gist.github.com/dericed/659bd843bd38b6f24a60198b5e345795>. That output notes that 3597 packages of video are read and 3586 packets of audio. In the resulting file, at the end of the timeline the audio is 9 frames out of sync and my output video stream is 00:02:00.020 and output audio stream is 00:01:59.653.
>>>>>> Beyond copying or encoding the audio, are there other options I should use to test this?
>>>>> Well, it depends on what you want. After this patch you should get a file which has audio packets synced to video, but the audio stream is sparse, not every video packet has a corresponding audio packet. (It looks like our MOV muxer does not support muxing of sparse audio therefore does not produce proper timestamps. But MKV does, please try that.)
>>>>> You can also make ffmpeg generate the missing audio based on packet timestamps. Swresample has an async=1 option, so something like this should get you synced audio with continous audio packets:
>>>>> ffmpeg -y -i 1670520000_12.dv -c:v copy \
>>>>> -af aresample=async=1:min_hard_comp=0.01 -c:a pcm_s16le 1670520000_12.mov
>>>> Thank you for this. With the patch and async, the result is synced and the resulting audio was the same as Michael’s patch.
>>>> Could you explain why you used min_hard_comp here? IIUC min_hard_comp is a set a threshold between the strategies of trim/fill or stretch/squeeze to align the audio to time; however, the async documentation says "Setting this to 1 will enable filling and trimming, larger values represent the maximum amount in samples that the data may be stretched or squeezed” so I thought that async=1 would not permit stretch/squeeze anyway.
>>> It is documented poorly, but if you check the source code you will see that async=1 implicitly sets min_comp to 0.001 enabling trimming/dropping. min_hard_comp decides the threshold when silence injection actually happens, and the default for that is 0.1, which is more than a frame, therefore not acceptable if we want to maintain <1 frame accuracy. Or at least that is how I think it should work.
>> 
> 
>> I’ve found that aresample=async=1:min_hard_comp=0.01, as discussed here, works well to add audio samples to maintain timestamp accuracy when muxing into a format like mov. However, this approach doesn’t work if the sparseness of the audio stream is at the end of the stream. Is there a way to use min_hard_comp to consider differences between a timestamp and audio data when one of the ends of that range is the end of the file?
> 
> I am not aware of a smart method to generate missing audio in the end until the end of video.
> 
> As a possible workaround you may query the video length using
> ffprobe or mediainfo, and then use a second filter, apad to pad audio:
> 
> -af aresample=async=1:min_hard_comp=0.01,apad=whole_dur=<video_length>
> 
> Tnis might do what you want, but requires an additional step to query the video length…


[…]
Perfect, thanks for sharing this idea.
Dave
Dave Rice Aug. 22, 2021, 6:38 p.m. UTC | #23
Hi Marton,

> On Feb 23, 2021, at 3:07 PM, Dave Rice <dave@dericed.com> wrote:
> 
>> On Feb 23, 2021, at 2:42 PM, Marton Balint <cus@passwd.hu> wrote:
>> 
>> On Sat, 20 Feb 2021, Dave Rice wrote:
>> 
>>> Hi,
>>> 
>>>> On Oct 31, 2020, at 5:15 PM, Marton Balint <cus@passwd.hu <mailto:cus@passwd.hu>> wrote:
>>>> On Sat, 31 Oct 2020, Dave Rice wrote:
>>>>>> On Oct 31, 2020, at 3:47 PM, Marton Balint <cus@passwd.hu <mailto:cus@passwd.hu>> wrote:
>>>>>> On Sat, 31 Oct 2020, Dave Rice wrote:
>>>>>>> Hi Marton,
>>>>>>>> On Oct 31, 2020, at 12:56 PM, Marton Balint <cus@passwd.hu <mailto:cus@passwd.hu>> wrote:
>>>>>>>> Fixes out of sync timestamps in ticket #8762.
>>>>>>> Although Michael’s recent patch does address the issue documented in 8762, I haven’t found this patch to fix the issue. I tried with -c:a copy and with -c:a pcm_s16le with some sample files that exhibit this issue but each output was out of sync. I put an output at https://gist.github.com/dericed/659bd843bd38b6f24a60198b5e345795 <https://gist.github.com/dericed/659bd843bd38b6f24a60198b5e345795>. That output notes that 3597 packages of video are read and 3586 packets of audio. In the resulting file, at the end of the timeline the audio is 9 frames out of sync and my output video stream is 00:02:00.020 and output audio stream is 00:01:59.653.
>>>>>>> Beyond copying or encoding the audio, are there other options I should use to test this?
>>>>>> Well, it depends on what you want. After this patch you should get a file which has audio packets synced to video, but the audio stream is sparse, not every video packet has a corresponding audio packet. (It looks like our MOV muxer does not support muxing of sparse audio therefore does not produce proper timestamps. But MKV does, please try that.)
>>>>>> You can also make ffmpeg generate the missing audio based on packet timestamps. Swresample has an async=1 option, so something like this should get you synced audio with continous audio packets:
>>>>>> ffmpeg -y -i 1670520000_12.dv -c:v copy \
>>>>>> -af aresample=async=1:min_hard_comp=0.01 -c:a pcm_s16le 1670520000_12.mov
>>>>> Thank you for this. With the patch and async, the result is synced and the resulting audio was the same as Michael’s patch.
>>>>> Could you explain why you used min_hard_comp here? IIUC min_hard_comp is a set a threshold between the strategies of trim/fill or stretch/squeeze to align the audio to time; however, the async documentation says "Setting this to 1 will enable filling and trimming, larger values represent the maximum amount in samples that the data may be stretched or squeezed” so I thought that async=1 would not permit stretch/squeeze anyway.
>>>> It is documented poorly, but if you check the source code you will see that async=1 implicitly sets min_comp to 0.001 enabling trimming/dropping. min_hard_comp decides the threshold when silence injection actually happens, and the default for that is 0.1, which is more than a frame, therefore not acceptable if we want to maintain <1 frame accuracy. Or at least that is how I think it should work.
>>> 
>> 
>>> I’ve found that aresample=async=1:min_hard_comp=0.01, as discussed here, works well to add audio samples to maintain timestamp accuracy when muxing into a format like mov. However, this approach doesn’t work if the sparseness of the audio stream is at the end of the stream. Is there a way to use min_hard_comp to consider differences between a timestamp and audio data when one of the ends of that range is the end of the file?
>> 
>> I am not aware of a smart method to generate missing audio in the end until the end of video.
>> 
>> As a possible workaround you may query the video length using
>> ffprobe or mediainfo, and then use a second filter, apad to pad audio:
>> 
>> -af aresample=async=1:min_hard_comp=0.01,apad=whole_dur=<video_length>
>> 
>> Tnis might do what you want, but requires an additional step to query the video length…
> 
> 
> […]
> Perfect, thanks for sharing this idea.

I was hoping I could ask your advise on a related issue. There’s a sample to show it at https://archive.org/download/test_a_202108/test_a.dv <https://archive.org/download/test_a_202108/test_a.dv>. This sample has three frames, frame 1 and 3 have 2 tracks of 32k stereo audio. Frame 2 has nulled audio in the audio dif blocks and there are no audio metadata packs, so ffmpeg (rightly) does not find any audio in this frame.

Using your advice about the aresample filter above, I was using a command like this to mux the video and audio into a new container as lossless as feasible with:
ffmpeg -y -i test_a.dv -filter_complex "[0:a:0]aresample=async=1:min_hard_comp=0.01[aud1]" -c:v:0 copy -map 0:v:0 -c:a pcm_s16le -map "[aud1]” test_audio.mov

However here, the resulting audio track of test_audio.mov is 84 ms and the video track is 101 ms. The audio and video for frame 3 in test_audio.mov is out of sync as the audio for frame 3 starts in the middle of frame 2. It looks like frame 1 and 3 output 1001/30000 sec of audio but aresample only adds about half of that to fill the audio sparseness of the lack of audio in frame 2.

Kind Regards,

Dave Rice
diff mbox series

Patch

diff --git a/libavformat/dv.c b/libavformat/dv.c
index 3e0d12c0e3..26a78139f5 100644
--- a/libavformat/dv.c
+++ b/libavformat/dv.c
@@ -49,7 +49,6 @@  struct DVDemuxContext {
     uint8_t           audio_buf[4][8192];
     int               ach;
     int               frames;
-    uint64_t          abytes;
 };
 
 static inline uint16_t dv_audio_12to16(uint16_t sample)
@@ -258,7 +257,7 @@  static int dv_extract_audio_info(DVDemuxContext *c, const uint8_t *frame)
             c->ast[i] = avformat_new_stream(c->fctx, NULL);
             if (!c->ast[i])
                 break;
-            avpriv_set_pts_info(c->ast[i], 64, 1, 30000);
+            avpriv_set_pts_info(c->ast[i], 64, c->sys->time_base.num, c->sys->time_base.den);
             c->ast[i]->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
             c->ast[i]->codecpar->codec_id   = AV_CODEC_ID_PCM_S16LE;
 
@@ -387,8 +386,7 @@  int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
     for (i = 0; i < c->ach; i++) {
         c->audio_pkt[i].pos  = pos;
         c->audio_pkt[i].size = size;
-        c->audio_pkt[i].pts  = c->abytes * 30000 * 8 /
-                               c->ast[i]->codecpar->bit_rate;
+        c->audio_pkt[i].pts  = (c->sys->height == 720) ? (c->frames & ~1) : c->frames;
         ppcm[i] = c->audio_buf[i];
     }
     if (c->ach)
@@ -401,10 +399,7 @@  int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
             c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
         } else {
             c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
-            c->abytes           += size;
         }
-    } else {
-        c->abytes += size;
     }
 
     /* Now it's time to return video packet */
@@ -444,13 +439,6 @@  static int64_t dv_frame_offset(AVFormatContext *s, DVDemuxContext *c,
 void ff_dv_offset_reset(DVDemuxContext *c, int64_t frame_offset)
 {
     c->frames = frame_offset;
-    if (c->ach) {
-        if (c->sys) {
-        c->abytes = av_rescale_q(c->frames, c->sys->time_base,
-                                 (AVRational) { 8, c->ast[0]->codecpar->bit_rate });
-        } else
-            av_log(c->fctx, AV_LOG_ERROR, "cannot adjust audio bytes\n");
-    }
     c->audio_pkt[0].size = c->audio_pkt[1].size = 0;
     c->audio_pkt[2].size = c->audio_pkt[3].size = 0;
 }
diff --git a/tests/ref/seek/lavf-dv b/tests/ref/seek/lavf-dv
index 0000ff5abe..f63e4460be 100644
--- a/tests/ref/seek/lavf-dv
+++ b/tests/ref/seek/lavf-dv
@@ -7,9 +7,9 @@  ret: 0         st: 0 flags:0  ts: 0.800000
 ret: 0         st: 0 flags:1 dts: 0.800000 pts: 0.800000 pos:2880000 size:144000
 ret: 0         st: 0 flags:1  ts:-0.320000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
-ret: 0         st: 1 flags:0  ts: 2.576667
+ret: 0         st: 1 flags:0  ts: 2.560000
 ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
-ret: 0         st: 1 flags:1  ts: 1.470833
+ret: 0         st: 1 flags:1  ts: 1.480000
 ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
 ret: 0         st:-1 flags:0  ts: 0.365002
 ret: 0         st: 0 flags:1 dts: 0.360000 pts: 0.360000 pos:1296000 size:144000
@@ -19,9 +19,9 @@  ret: 0         st: 0 flags:0  ts: 2.160000
 ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
 ret: 0         st: 0 flags:1  ts: 1.040000
 ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
-ret: 0         st: 1 flags:0  ts:-0.058333
+ret: 0         st: 1 flags:0  ts:-0.040000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
-ret: 0         st: 1 flags:1  ts: 2.835833
+ret: 0         st: 1 flags:1  ts: 2.840000
 ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
 ret: 0         st:-1 flags:0  ts: 1.730004
 ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
@@ -31,10 +31,10 @@  ret: 0         st: 0 flags:0  ts:-0.480000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
 ret: 0         st: 0 flags:1  ts: 2.400000
 ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
-ret: 0         st: 1 flags:0  ts: 1.306667
-ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
-ret: 0         st: 1 flags:1  ts: 0.200833
+ret: 0         st: 1 flags:0  ts: 1.320000
 ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
+ret: 0         st: 1 flags:1  ts: 0.200000
+ret: 0         st: 0 flags:1 dts: 0.200000 pts: 0.200000 pos: 720000 size:144000
 ret: 0         st:-1 flags:0  ts:-0.904994
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
 ret: 0         st:-1 flags:1  ts: 1.989173
@@ -43,9 +43,9 @@  ret: 0         st: 0 flags:0  ts: 0.880000
 ret: 0         st: 0 flags:1 dts: 0.880000 pts: 0.880000 pos:3168000 size:144000
 ret: 0         st: 0 flags:1  ts:-0.240000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:144000
-ret: 0         st: 1 flags:0  ts: 2.671667
+ret: 0         st: 1 flags:0  ts: 2.680000
 ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
-ret: 0         st: 1 flags:1  ts: 1.565833
+ret: 0         st: 1 flags:1  ts: 1.560000
 ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos:3456000 size:144000
 ret: 0         st:-1 flags:0  ts: 0.460008
 ret: 0         st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos:1728000 size:144000