diff mbox series

[FFmpeg-devel,1/2] avformat/dv: allow returning damaged audio

Message ID 20200731222019.2897-1-michael@niedermayer.cc
State New
Headers show
Series [FFmpeg-devel,1/2] avformat/dv: allow returning damaged audio | expand

Checks

Context Check Description
andriy/default pending
andriy/make success Make finished
andriy/make_fate success Make fate finished

Commit Message

Michael Niedermayer July 31, 2020, 10:20 p.m. UTC
Fixes: Ticket8762
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
---
 libavformat/dv.c | 49 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 7 deletions(-)

Comments

Marton Balint Aug. 1, 2020, 5:28 p.m. UTC | #1
On Sat, 1 Aug 2020, Michael Niedermayer wrote:

> Fixes: Ticket8762
> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
> ---
> libavformat/dv.c | 49 +++++++++++++++++++++++++++++++++++++++++-------
> 1 file changed, 42 insertions(+), 7 deletions(-)

If "dv remux loses sync", then the timestamps should be fixed, not 
additional packets should be generated based on previously read packet 
data (which is a fragile approach to begin with, e.g. what if the first 
frame is the corrupt one?).

Regards,
Marton

>
> diff --git a/libavformat/dv.c b/libavformat/dv.c
> index e99422d4b5..7ebe1815b3 100644
> --- a/libavformat/dv.c
> +++ b/libavformat/dv.c
> @@ -34,12 +34,18 @@
> #include "libavcodec/dv_profile.h"
> #include "libavcodec/dv.h"
> #include "libavutil/channel_layout.h"
> +#include "libavutil/opt.h"
> #include "libavutil/intreadwrite.h"
> #include "libavutil/mathematics.h"
> #include "libavutil/timecode.h"
> #include "dv.h"
> #include "libavutil/avassert.h"
> 
> +enum AudioConceal {
> +    AUDIO_CONCEAL_PASS = 1,
> +    AUDIO_CONCEAL_DROP = 0,
> +};
> +
> struct DVDemuxContext {
>     const AVDVProfile*  sys;    /* Current DV profile. E.g.: 525/60, 625/50 */
>     AVFormatContext*  fctx;
> @@ -50,6 +56,8 @@ struct DVDemuxContext {
>     int               ach;
>     int               frames;
>     uint64_t          abytes;
> +    uint8_t           as_pack[5];
> +    int               dvaudio_concealment;
> };
> 
> static inline uint16_t dv_audio_12to16(uint16_t sample)
> @@ -72,7 +80,7 @@ static inline uint16_t dv_audio_12to16(uint16_t sample)
>     return result;
> }
> 
> -static const uint8_t *dv_extract_pack(const uint8_t *frame, enum dv_pack_type t)
> +static const uint8_t *dv_extract_pack(DVDemuxContext *d, const uint8_t *frame, enum dv_pack_type t)
> {
>     int offs;
>     int c;
> @@ -101,6 +109,13 @@ static const uint8_t *dv_extract_pack(const uint8_t *frame, enum dv_pack_type t)
>             break;
>     }
> 
> +    if (t == dv_audio_source || t == dv_audio_control) {
> +        if (frame[offs] == t) {
> +            memcpy(d->as_pack, &frame[offs], sizeof(d->as_pack));
> +        } else if (d->as_pack[0] && d->dvaudio_concealment == AUDIO_CONCEAL_PASS)
> +            return d->as_pack;
> +    }
> +
>     return frame[offs] == t ? &frame[offs] : NULL;
> }
> 
> @@ -116,7 +131,7 @@ static const int dv_audio_frequency[3] = {
>  * 3. Audio is always returned as 16-bit linear samples: 12-bit nonlinear samples
>  *    are converted into 16-bit linear ones.
>  */
> -static int dv_extract_audio(const uint8_t *frame, uint8_t **ppcm,
> +static int dv_extract_audio(DVDemuxContext *c, const uint8_t *frame, uint8_t **ppcm,
>                             const AVDVProfile *sys)
> {
>     int size, chan, i, j, d, of, smpls, freq, quant, half_ch;
> @@ -124,7 +139,7 @@ static int dv_extract_audio(const uint8_t *frame, uint8_t **ppcm,
>     const uint8_t *as_pack;
>     uint8_t *pcm, ipcm;
> 
> -    as_pack = dv_extract_pack(frame, dv_audio_source);
> +    as_pack = dv_extract_pack(c, frame, dv_audio_source);
>     if (!as_pack)    /* No audio ? */
>         return 0;
> 
> @@ -224,7 +239,7 @@ static int dv_extract_audio_info(DVDemuxContext *c, const uint8_t *frame)
>     const uint8_t *as_pack;
>     int freq, stype, smpls, quant, i, ach;
> 
> -    as_pack = dv_extract_pack(frame, dv_audio_source);
> +    as_pack = dv_extract_pack(c, frame, dv_audio_source);
>     if (!as_pack || !c->sys) {    /* No audio ? */
>         c->ach = 0;
>         return 0;
> @@ -292,7 +307,7 @@ static int dv_extract_video_info(DVDemuxContext *c, const uint8_t *frame)
>     c->vst->avg_frame_rate = av_inv_q(c->vst->time_base);
>
>     /* finding out SAR is a little bit messy */
> -    vsc_pack = dv_extract_pack(frame, dv_video_control);
> +    vsc_pack = dv_extract_pack(c, frame, dv_video_control);
>     apt      = frame[4] & 0x07;
>     is16_9   = (vsc_pack && ((vsc_pack[2] & 0x07) == 0x02 ||
>                              (!apt && (vsc_pack[2] & 0x07) == 0x07)));
> @@ -312,7 +327,7 @@ static int dv_extract_timecode(DVDemuxContext* c, const uint8_t* frame, char *tc
>     // is only relevant for NTSC systems.
>     int prevent_df = c->sys->ltc_divisor == 25 || c->sys->ltc_divisor == 50;
> 
> -    tc_pack = dv_extract_pack(frame, dv_timecode);
> +    tc_pack = dv_extract_pack(c, frame, dv_timecode);
>     if (!tc_pack)
>         return 0;
>     av_timecode_make_smpte_tc_string(tc, AV_RB32(tc_pack + 1), prevent_df);
> @@ -384,7 +399,7 @@ int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
>         ppcm[i] = c->audio_buf[i];
>     }
>     if (c->ach)
> -        dv_extract_audio(buf, ppcm, c->sys);
> +        dv_extract_audio(c, buf, ppcm, c->sys);
>
>     /* We work with 720p frames split in half, thus even frames have
>      * channels 0,1 and odd 2,3. */
> @@ -452,8 +467,10 @@ void ff_dv_offset_reset(DVDemuxContext *c, int64_t frame_offset)
>  ************************************************************/
> 
> typedef struct RawDVContext {
> +    const AVClass  *class;
>     DVDemuxContext *dv_demux;
>     uint8_t         buf[DV_MAX_FRAME_SIZE];
> +    int             dvaudio_concealment;
> } RawDVContext;
> 
> static int dv_read_timecode(AVFormatContext *s) {
> @@ -500,6 +517,7 @@ static int dv_read_header(AVFormatContext *s)
>     c->dv_demux = avpriv_dv_init_demux(s);
>     if (!c->dv_demux)
>         return AVERROR(ENOMEM);
> +    c->dv_demux->dvaudio_concealment = c->dvaudio_concealment;
>
>     state = avio_rb32(s->pb);
>     while ((state & 0xffffff7f) != 0x1f07003f) {
> @@ -639,6 +657,22 @@ static int dv_probe(const AVProbeData *p)
>     return 0;
> }
> 
> +#define OFFSET(x) offsetof(RawDVContext, x)
> +#define DEC AV_OPT_FLAG_DECODING_PARAM
> +static const AVOption dv_options[] = {
> +    { "dvaudio_concealment", "", OFFSET(dvaudio_concealment), AV_OPT_TYPE_INT  , {.i64 = AUDIO_CONCEAL_DROP}, 0, INT_MAX, DEC, "dvaudio_concealment"},
> +    { "drop",                "", 0                          , AV_OPT_TYPE_CONST, {.i64 = AUDIO_CONCEAL_DROP}, 0, INT_MAX, DEC, "dvaudio_concealment"},
> +    { "pass",                "", 0                          , AV_OPT_TYPE_CONST, {.i64 = AUDIO_CONCEAL_PASS}, 0, INT_MAX, DEC, "dvaudio_concealment"},
> +    { NULL },
> +};
> +
> +static const AVClass dv_demuxer_class = {
> +    .class_name = "DV demuxer",
> +    .item_name  = av_default_item_name,
> +    .option     = dv_options,
> +    .version    = LIBAVUTIL_VERSION_INT,
> +};
> +
> AVInputFormat ff_dv_demuxer = {
>     .name           = "dv",
>     .long_name      = NULL_IF_CONFIG_SMALL("DV (Digital Video)"),
> @@ -649,4 +683,5 @@ AVInputFormat ff_dv_demuxer = {
>     .read_close     = dv_read_close,
>     .read_seek      = dv_read_seek,
>     .extensions     = "dv,dif",
> +    .priv_class     = &dv_demuxer_class,
> };
> -- 
> 2.17.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Michael Niedermayer Aug. 1, 2020, 6:24 p.m. UTC | #2
On Sat, Aug 01, 2020 at 07:28:53PM +0200, Marton Balint wrote:
> 
> 
> On Sat, 1 Aug 2020, Michael Niedermayer wrote:
> 
> > Fixes: Ticket8762
> > Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
> > ---
> > libavformat/dv.c | 49 +++++++++++++++++++++++++++++++++++++++++-------
> > 1 file changed, 42 insertions(+), 7 deletions(-)
> 
> If "dv remux loses sync", then the timestamps should be fixed, not
> additional packets should be generated based on previously read packet data
> (which is a fragile approach to begin with, e.g. what if the first frame is
> the corrupt one?).

Ticket8762 is about stream copy, so if no packets are returned for audio
but are for video and just timestamps are updated this would at least on
its own probably not work that well.

about the case of a damaged first frame. Do you have a testcase ?

Thanks

[...]
Marton Balint Aug. 1, 2020, 9:26 p.m. UTC | #3
On Sat, 1 Aug 2020, Michael Niedermayer wrote:

> On Sat, Aug 01, 2020 at 07:28:53PM +0200, Marton Balint wrote:
>>
>>
>> On Sat, 1 Aug 2020, Michael Niedermayer wrote:
>>
>>> Fixes: Ticket8762
>>> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
>>> ---
>>> libavformat/dv.c | 49 +++++++++++++++++++++++++++++++++++++++++-------
>>> 1 file changed, 42 insertions(+), 7 deletions(-)
>>
>> If "dv remux loses sync", then the timestamps should be fixed, not
>> additional packets should be generated based on previously read packet data
>> (which is a fragile approach to begin with, e.g. what if the first frame is
>> the corrupt one?).
>
> Ticket8762 is about stream copy, so if no packets are returned for audio
> but are for video and just timestamps are updated this would at least on
> its own probably not work that well.

If the timestamps are good, a good player should be able to play it 
correctly, even if audio stream is sparse.

None of the demuxers generate packets because the timestamps are not 
continous, I just don't think it would be consistent if DV suddenly 
started to do this. E.g. what if the user wants to drop video with no 
audio?

>
> about the case of a damaged first frame. Do you have a testcase ?

No, but it can happen, can't it? If the stream starts with no audio for 1 
second you will have 1 second A-V desync, as far as I see, that is why I 
believe fixing the timestamps is the proper fix.

Regards,
Marton
Dave Rice Aug. 2, 2020, 7:50 p.m. UTC | #4
> On Aug 1, 2020, at 5:26 PM, Marton Balint <cus@passwd.hu> wrote:
> 
> 
> 
> On Sat, 1 Aug 2020, Michael Niedermayer wrote:
> 
>> On Sat, Aug 01, 2020 at 07:28:53PM +0200, Marton Balint wrote:
>>> 
>>> 
>>> On Sat, 1 Aug 2020, Michael Niedermayer wrote:
>>> 
>>>> Fixes: Ticket8762
>>>> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
>>>> ---
>>>> libavformat/dv.c | 49 +++++++++++++++++++++++++++++++++++++++++-------
>>>> 1 file changed, 42 insertions(+), 7 deletions(-)
>>> 
>>> If "dv remux loses sync", then the timestamps should be fixed, not
>>> additional packets should be generated based on previously read packet data
>>> (which is a fragile approach to begin with, e.g. what if the first frame is
>>> the corrupt one?).
>> 
>> Ticket8762 is about stream copy, so if no packets are returned for audio
>> but are for video and just timestamps are updated this would at least on
>> its own probably not work that well.
> 
> If the timestamps are good, a good player should be able to play it correctly, even if audio stream is sparse.
> 
> None of the demuxers generate packets because the timestamps are not continous, I just don't think it would be consistent if DV suddenly started to do this. E.g. what if the user wants to drop video with no audio?

In practice, when dv frames with video and no audio are interleaved within a dv stream that otherwise has both, it is because the playback videotape player of the dv tape is in pause mode or the tape is damaged. These frames most common are filled with only video dif blocks that note concealment (so the image is a copy of a prior image) and the audio source pack metadata is missing, but the paylock of the audio dif blocks are filled with error code so they would decode as silence.

I did a test of 114 dv tape transfers
	61 no difference in demuxing between pass and drop (using this patch)
	31 the difference is only in the final frames so the impact of the demuxer option would be hard for the user to see, no loss of sync, just video with no audio at the end
	22 the difference occurs mid-stream, with the drop option the demuxer outputs video and audio at different rate when hitting frames with no audio source pack, so the output of the demuxer loses sync

>> about the case of a damaged first frame. Do you have a testcase ?
> 
> No, but it can happen, can't it? If the stream starts with no audio for 1 second you will have 1 second A-V desync, as far as I see, that is why I believe fixing the timestamps is the proper fix.

Yes this happens (though it is more rare and didn’t occur in the test noted above). In that case, ffmpeg shows no audio at all and I’d have to read the stream at later frames using -skip_initial_bytes.

> Regards,
> Marton
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe".
Marton Balint Aug. 3, 2020, 8:38 p.m. UTC | #5
On Sun, 2 Aug 2020, Dave Rice wrote:

>
>
>> On Aug 1, 2020, at 5:26 PM, Marton Balint <cus@passwd.hu> wrote:
>> 
>> 
>> 
>> On Sat, 1 Aug 2020, Michael Niedermayer wrote:
>> 
>>> On Sat, Aug 01, 2020 at 07:28:53PM +0200, Marton Balint wrote:
>>>> 
>>>> 
>>>> On Sat, 1 Aug 2020, Michael Niedermayer wrote:
>>>> 
>>>>> Fixes: Ticket8762
>>>>> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
>>>>> ---
>>>>> libavformat/dv.c | 49 +++++++++++++++++++++++++++++++++++++++++-------
>>>>> 1 file changed, 42 insertions(+), 7 deletions(-)
>>>> 
>>>> If "dv remux loses sync", then the timestamps should be fixed, not
>>>> additional packets should be generated based on previously read packet data
>>>> (which is a fragile approach to begin with, e.g. what if the first frame is
>>>> the corrupt one?).
>>> 
>>> Ticket8762 is about stream copy, so if no packets are returned for audio
>>> but are for video and just timestamps are updated this would at least on
>>> its own probably not work that well.
>> 
>> If the timestamps are good, a good player should be able to play it 
>> correctly, even if audio stream is sparse.
>> 
>> None of the demuxers generate packets because the timestamps are not 
>> continous, I just don't think it would be consistent if DV suddenly 
>> started to do this. E.g. what if the user wants to drop video with no 
>> audio?
>
> In practice, when dv frames with video and no audio are interleaved 
> within a dv stream that otherwise has both, it is because the playback 
> videotape player of the dv tape is in pause mode or the tape is damaged. 
> These frames most common are filled with only video dif blocks that note 
> concealment (so the image is a copy of a prior image) and the audio 
> source pack metadata is missing, but the paylock of the audio dif blocks 
> are filled with error code so they would decode as silence.

But if the audio source pack metadata is missing, then how can you 
determine the audio settings? Or the number of samples the errornous 
frame contains (e.g. 1600 v.s 1602)?

At least can you assume that audio settings are the same throughout a 
recording? Or DV can have mid-stream audio setting changes?

>
> I did a test of 114 dv tape transfers
> 	61 no difference in demuxing between pass and drop (using this patch)
> 	31 the difference is only in the final frames so the impact of the demuxer option would be hard for the user to see, no loss of sync, just video with no audio at the end
> 	22 the difference occurs mid-stream, with the drop option the demuxer outputs video and audio at different rate when hitting frames with no audio source pack, so the output of the demuxer loses sync
>
>>> about the case of a damaged first frame. Do you have a testcase ?
>> 
>> No, but it can happen, can't it? If the stream starts with no audio for 1 second you will have 1 second A-V desync, as far as I see, that is why I believe fixing the timestamps is the proper fix.
>
> Yes this happens (though it is more rare and didn’t occur in the test 
> noted above). In that case, ffmpeg shows no audio at all and I’d have to 
> read the stream at later frames using -skip_initial_bytes.

I guess this should be fixed as well, although if there is no way to 
determine the audio settings, this might prove to be difficult. Maybe the 
probing should keep reading the file until an audio source pack is found 
and cache that instead of relying on the last pack? At least that would 
give you more consistent results, irrespective of the order of 
reads/seeks, etc. But this can only work if audio settings are the same 
throughout a file, can we assume that?

Also maybe setting the CORRUPT packet flag should be done in this case?

Regards,
Marton



>
>> Regards,
>> Marton
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>
>> 
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe".
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Michael Niedermayer Aug. 3, 2020, 9:16 p.m. UTC | #6
On Mon, Aug 03, 2020 at 10:38:21PM +0200, Marton Balint wrote:
> 
> 
> On Sun, 2 Aug 2020, Dave Rice wrote:
> 
> > 
> > 
> > > On Aug 1, 2020, at 5:26 PM, Marton Balint <cus@passwd.hu> wrote:
> > > 
> > > 
> > > 
> > > On Sat, 1 Aug 2020, Michael Niedermayer wrote:
> > > 
> > > > On Sat, Aug 01, 2020 at 07:28:53PM +0200, Marton Balint wrote:
> > > > > 
> > > > > 
> > > > > On Sat, 1 Aug 2020, Michael Niedermayer wrote:
> > > > > 
> > > > > > Fixes: Ticket8762
> > > > > > Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
> > > > > > ---
> > > > > > libavformat/dv.c | 49 +++++++++++++++++++++++++++++++++++++++++-------
> > > > > > 1 file changed, 42 insertions(+), 7 deletions(-)
> > > > > 
> > > > > If "dv remux loses sync", then the timestamps should be fixed, not
> > > > > additional packets should be generated based on previously read packet data
> > > > > (which is a fragile approach to begin with, e.g. what if the first frame is
> > > > > the corrupt one?).
> > > > 
> > > > Ticket8762 is about stream copy, so if no packets are returned for audio
> > > > but are for video and just timestamps are updated this would at least on
> > > > its own probably not work that well.
> > > 
> > > If the timestamps are good, a good player should be able to play it
> > > correctly, even if audio stream is sparse.
> > > 
> > > None of the demuxers generate packets because the timestamps are not
> > > continous, I just don't think it would be consistent if DV suddenly
> > > started to do this. E.g. what if the user wants to drop video with
> > > no audio?
> > 
> > In practice, when dv frames with video and no audio are interleaved
> > within a dv stream that otherwise has both, it is because the playback
> > videotape player of the dv tape is in pause mode or the tape is damaged.
> > These frames most common are filled with only video dif blocks that note
> > concealment (so the image is a copy of a prior image) and the audio
> > source pack metadata is missing, but the paylock of the audio dif blocks
> > are filled with error code so they would decode as silence.
> 
> But if the audio source pack metadata is missing, then how can you determine
> the audio settings? 

> Or the number of samples the errornous frame contains
> (e.g. 1600 v.s 1602)?

some testcase would be useful here where this is done clearly wrong currently


[...]

> Also maybe setting the CORRUPT packet flag should be done in this case?

yes was thinking that too, that should be in the next revision

[....]
Dave Rice Sept. 6, 2020, 3:32 p.m. UTC | #7
> On Aug 3, 2020, at 5:16 PM, Michael Niedermayer <michael@niedermayer.cc> wrote:
> 
> On Mon, Aug 03, 2020 at 10:38:21PM +0200, Marton Balint wrote:
>> 
>> 
>> On Sun, 2 Aug 2020, Dave Rice wrote:
>> 
>>> 
>>> 
>>>> On Aug 1, 2020, at 5:26 PM, Marton Balint <cus@passwd.hu> wrote:
>>>> 
>>>> 
>>>> 
>>>> On Sat, 1 Aug 2020, Michael Niedermayer wrote:
>>>> 
>>>>> On Sat, Aug 01, 2020 at 07:28:53PM +0200, Marton Balint wrote:
>>>>>> 
>>>>>> 
>>>>>> On Sat, 1 Aug 2020, Michael Niedermayer wrote:
>>>>>> 
>>>>>>> Fixes: Ticket8762
>>>>>>> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
>>>>>>> ---
>>>>>>> libavformat/dv.c | 49 +++++++++++++++++++++++++++++++++++++++++-------
>>>>>>> 1 file changed, 42 insertions(+), 7 deletions(-)
>>>>>> 
>>>>>> If "dv remux loses sync", then the timestamps should be fixed, not
>>>>>> additional packets should be generated based on previously read packet data
>>>>>> (which is a fragile approach to begin with, e.g. what if the first frame is
>>>>>> the corrupt one?).
>>>>> 
>>>>> Ticket8762 is about stream copy, so if no packets are returned for audio
>>>>> but are for video and just timestamps are updated this would at least on
>>>>> its own probably not work that well.
>>>> 
>>>> If the timestamps are good, a good player should be able to play it
>>>> correctly, even if audio stream is sparse.
>>>> 
>>>> None of the demuxers generate packets because the timestamps are not
>>>> continous, I just don't think it would be consistent if DV suddenly
>>>> started to do this. E.g. what if the user wants to drop video with
>>>> no audio?
>>> 
>>> In practice, when dv frames with video and no audio are interleaved
>>> within a dv stream that otherwise has both, it is because the playback
>>> videotape player of the dv tape is in pause mode or the tape is damaged.
>>> These frames most common are filled with only video dif blocks that note
>>> concealment (so the image is a copy of a prior image) and the audio
>>> source pack metadata is missing, but the paylock of the audio dif blocks
>>> are filled with error code so they would decode as silence.
>> 
>> But if the audio source pack metadata is missing, then how can you determine
>> the audio settings?

I tested with QuickTime Player 7 and when frames are read with the audio source pack metadata missing, the first audio source pack is used. So these frames provide silent output as an earlier audio source pack is used. The disadvantage here is that a mid stream change such as 32kHz to 48kHz causes QuickTime Player 7 to mangle the audio by applying the wrong characteristics.

>> Or the number of samples the errornous frame contains
>> (e.g. 1600 v.s 1602)?
> 
> some testcase would be useful here where this is done clearly wrong currently

I put two additional samples at https://archive.org/download/001.dv.audiogap/001.dv.audiogap.dv <https://archive.org/download/001.dv.audiogap/001.dv.audiogap.dv> and https://archive.org/download/001.dv.audiogap/DVC00036_001.dv.audiogap.dv <https://archive.org/download/001.dv.audiogap/DVC00036_001.dv.audiogap.dv>. Each contains a series of frames in the middle that have all video blocks as concealed and all audio blocks are simply error code with no audio source pack.

For each example, both "ffmpeg -i file -c copy out” and “ffmpeg -i file out” has a loss of sync in the result and an audio track shorter than the video.

But true, a frame with no audio source pack does not communicate if it should be 1600 or 1602 samples.

In the SMPTE specification for DV at http://web.archive.org/web/20060927044735/http://www.smpte.org/smpte_store/standards/pdf/s314m.pdf <http://web.archive.org/web/20060927044735/http://www.smpte.org/smpte_store/standards/pdf/s314m.pdf>, it says on page 18 that for NTSC systems, the five-frame pattern should be: 1600, 1602, 1602, 1602, 1602. So if a frame has no audio source pack, the pattern of prior frames could be used or simply use this pattern upon finding a sequence of such frames starting at 1600. Or possibly the relationship between the starting time of the audio data and the starting time for the video data could be used to guess if 1600 or 1602 maintains the alignment more closely.

>> Also maybe setting the CORRUPT packet flag should be done in this case?
> 
> yes was thinking that too, that should be in the next revision

In the reference specification, table 26 shows how the STA value is interpreted to note if the frame contains concealed video DIF blocks or not. This doesn’t necessarily mean that the frame is corrupt, but that it is the product of data concealment caused by a misreading of the DV videotape.

[…]
Dave
diff mbox series

Patch

diff --git a/libavformat/dv.c b/libavformat/dv.c
index e99422d4b5..7ebe1815b3 100644
--- a/libavformat/dv.c
+++ b/libavformat/dv.c
@@ -34,12 +34,18 @@ 
 #include "libavcodec/dv_profile.h"
 #include "libavcodec/dv.h"
 #include "libavutil/channel_layout.h"
+#include "libavutil/opt.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mathematics.h"
 #include "libavutil/timecode.h"
 #include "dv.h"
 #include "libavutil/avassert.h"
 
+enum AudioConceal {
+    AUDIO_CONCEAL_PASS = 1,
+    AUDIO_CONCEAL_DROP = 0,
+};
+
 struct DVDemuxContext {
     const AVDVProfile*  sys;    /* Current DV profile. E.g.: 525/60, 625/50 */
     AVFormatContext*  fctx;
@@ -50,6 +56,8 @@  struct DVDemuxContext {
     int               ach;
     int               frames;
     uint64_t          abytes;
+    uint8_t           as_pack[5];
+    int               dvaudio_concealment;
 };
 
 static inline uint16_t dv_audio_12to16(uint16_t sample)
@@ -72,7 +80,7 @@  static inline uint16_t dv_audio_12to16(uint16_t sample)
     return result;
 }
 
-static const uint8_t *dv_extract_pack(const uint8_t *frame, enum dv_pack_type t)
+static const uint8_t *dv_extract_pack(DVDemuxContext *d, const uint8_t *frame, enum dv_pack_type t)
 {
     int offs;
     int c;
@@ -101,6 +109,13 @@  static const uint8_t *dv_extract_pack(const uint8_t *frame, enum dv_pack_type t)
             break;
     }
 
+    if (t == dv_audio_source || t == dv_audio_control) {
+        if (frame[offs] == t) {
+            memcpy(d->as_pack, &frame[offs], sizeof(d->as_pack));
+        } else if (d->as_pack[0] && d->dvaudio_concealment == AUDIO_CONCEAL_PASS)
+            return d->as_pack;
+    }
+
     return frame[offs] == t ? &frame[offs] : NULL;
 }
 
@@ -116,7 +131,7 @@  static const int dv_audio_frequency[3] = {
  * 3. Audio is always returned as 16-bit linear samples: 12-bit nonlinear samples
  *    are converted into 16-bit linear ones.
  */
-static int dv_extract_audio(const uint8_t *frame, uint8_t **ppcm,
+static int dv_extract_audio(DVDemuxContext *c, const uint8_t *frame, uint8_t **ppcm,
                             const AVDVProfile *sys)
 {
     int size, chan, i, j, d, of, smpls, freq, quant, half_ch;
@@ -124,7 +139,7 @@  static int dv_extract_audio(const uint8_t *frame, uint8_t **ppcm,
     const uint8_t *as_pack;
     uint8_t *pcm, ipcm;
 
-    as_pack = dv_extract_pack(frame, dv_audio_source);
+    as_pack = dv_extract_pack(c, frame, dv_audio_source);
     if (!as_pack)    /* No audio ? */
         return 0;
 
@@ -224,7 +239,7 @@  static int dv_extract_audio_info(DVDemuxContext *c, const uint8_t *frame)
     const uint8_t *as_pack;
     int freq, stype, smpls, quant, i, ach;
 
-    as_pack = dv_extract_pack(frame, dv_audio_source);
+    as_pack = dv_extract_pack(c, frame, dv_audio_source);
     if (!as_pack || !c->sys) {    /* No audio ? */
         c->ach = 0;
         return 0;
@@ -292,7 +307,7 @@  static int dv_extract_video_info(DVDemuxContext *c, const uint8_t *frame)
     c->vst->avg_frame_rate = av_inv_q(c->vst->time_base);
 
     /* finding out SAR is a little bit messy */
-    vsc_pack = dv_extract_pack(frame, dv_video_control);
+    vsc_pack = dv_extract_pack(c, frame, dv_video_control);
     apt      = frame[4] & 0x07;
     is16_9   = (vsc_pack && ((vsc_pack[2] & 0x07) == 0x02 ||
                              (!apt && (vsc_pack[2] & 0x07) == 0x07)));
@@ -312,7 +327,7 @@  static int dv_extract_timecode(DVDemuxContext* c, const uint8_t* frame, char *tc
     // is only relevant for NTSC systems.
     int prevent_df = c->sys->ltc_divisor == 25 || c->sys->ltc_divisor == 50;
 
-    tc_pack = dv_extract_pack(frame, dv_timecode);
+    tc_pack = dv_extract_pack(c, frame, dv_timecode);
     if (!tc_pack)
         return 0;
     av_timecode_make_smpte_tc_string(tc, AV_RB32(tc_pack + 1), prevent_df);
@@ -384,7 +399,7 @@  int avpriv_dv_produce_packet(DVDemuxContext *c, AVPacket *pkt,
         ppcm[i] = c->audio_buf[i];
     }
     if (c->ach)
-        dv_extract_audio(buf, ppcm, c->sys);
+        dv_extract_audio(c, buf, ppcm, c->sys);
 
     /* We work with 720p frames split in half, thus even frames have
      * channels 0,1 and odd 2,3. */
@@ -452,8 +467,10 @@  void ff_dv_offset_reset(DVDemuxContext *c, int64_t frame_offset)
  ************************************************************/
 
 typedef struct RawDVContext {
+    const AVClass  *class;
     DVDemuxContext *dv_demux;
     uint8_t         buf[DV_MAX_FRAME_SIZE];
+    int             dvaudio_concealment;
 } RawDVContext;
 
 static int dv_read_timecode(AVFormatContext *s) {
@@ -500,6 +517,7 @@  static int dv_read_header(AVFormatContext *s)
     c->dv_demux = avpriv_dv_init_demux(s);
     if (!c->dv_demux)
         return AVERROR(ENOMEM);
+    c->dv_demux->dvaudio_concealment = c->dvaudio_concealment;
 
     state = avio_rb32(s->pb);
     while ((state & 0xffffff7f) != 0x1f07003f) {
@@ -639,6 +657,22 @@  static int dv_probe(const AVProbeData *p)
     return 0;
 }
 
+#define OFFSET(x) offsetof(RawDVContext, x)
+#define DEC AV_OPT_FLAG_DECODING_PARAM
+static const AVOption dv_options[] = {
+    { "dvaudio_concealment", "", OFFSET(dvaudio_concealment), AV_OPT_TYPE_INT  , {.i64 = AUDIO_CONCEAL_DROP}, 0, INT_MAX, DEC, "dvaudio_concealment"},
+    { "drop",                "", 0                          , AV_OPT_TYPE_CONST, {.i64 = AUDIO_CONCEAL_DROP}, 0, INT_MAX, DEC, "dvaudio_concealment"},
+    { "pass",                "", 0                          , AV_OPT_TYPE_CONST, {.i64 = AUDIO_CONCEAL_PASS}, 0, INT_MAX, DEC, "dvaudio_concealment"},
+    { NULL },
+};
+
+static const AVClass dv_demuxer_class = {
+    .class_name = "DV demuxer",
+    .item_name  = av_default_item_name,
+    .option     = dv_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVInputFormat ff_dv_demuxer = {
     .name           = "dv",
     .long_name      = NULL_IF_CONFIG_SMALL("DV (Digital Video)"),
@@ -649,4 +683,5 @@  AVInputFormat ff_dv_demuxer = {
     .read_close     = dv_read_close,
     .read_seek      = dv_read_seek,
     .extensions     = "dv,dif",
+    .priv_class     = &dv_demuxer_class,
 };