diff mbox

[FFmpeg-devel,v6,1/2] lavf/isom: support for demuxing and remuxing of MPEG-H 3D Audio in MP4

Message ID 20191122083611.7504-1-Yuki.Tsuchiya@sony.com
State New
Headers show

Commit Message

Yuki.Tsuchiya Nov. 22, 2019, 8:36 a.m. UTC
Implemented according to the specification at https://www.iso.org/standard/69561.html
The 'mhm1' sample entry is registered with MP4RA, which is defined as MHAS encapsulated single stream MPEG-H 3D Audio.
'MHAS' stands for MPEG-H audio stream, which contains encoded audio data and the corresponding metadata for decoding.
This patch enables extracting the MHAS bitstream from MP4 and remuxing into MP4.

Signed-off-by: Yuki Tsuchiya <Yuki.Tsuchiya@sony.com>
---
 libavcodec/avcodec.h    | 1 +
 libavcodec/codec_desc.c | 7 +++++++
 libavcodec/version.h    | 2 +-
 libavformat/isom.c      | 1 +
 libavformat/movenc.c    | 2 ++
 libavformat/utils.c     | 3 ++-
 6 files changed, 14 insertions(+), 2 deletions(-)

Comments

Yuki.Tsuchiya Nov. 27, 2019, 9:33 a.m. UTC | #1
Hi,

Could you merge this patch if there is no objection?

> -----Original Message-----

> From: Tsuchiya, Yuki (SHES) <Yuki.Tsuchiya@sony.com>

> 

> Implemented according to the specification at

> https://www.iso.org/standard/69561.html

> The 'mhm1' sample entry is registered with MP4RA, which is defined as MHAS

> encapsulated single stream MPEG-H 3D Audio.

> 'MHAS' stands for MPEG-H audio stream, which contains encoded audio data

> and corresponds metadata for decoding.

> This patch enables extracting the MHAS bitstream from MP4 and remuxing

> into MP4.

>
Yuki.Tsuchiya Dec. 10, 2019, 9:32 a.m. UTC | #2
Hi,

Are there any comments?

On 2019/11/22 17:44, "Tsuchiya, Yuki (SHES)" <Yuki.Tsuchiya@sony.com> wrote:
> 

>     Implemented according to the specification at https://www.iso.org/standard/69561.html

>     The 'mhm1' sample entry is registered with MP4RA, which is defined as MHAS encapsulated single stream MPEG-H 3D Audio.

>     'MHAS' stands for MPEG-H audio stream, which contains encoded audio data and corresponds metadata for decoding.

>     This patch enables extracting the MHAS bitstream from MP4 and remuxing into MP4.
James Almer Dec. 10, 2019, 1:24 p.m. UTC | #3
On 11/22/2019 5:36 AM, Yuki Tsuchiya wrote:
> Implemented according to the specification at https://www.iso.org/standard/69561.html
> The 'mhm1' sample entry is registered with MP4RA, which is defined as MHAS encapsulated single stream MPEG-H 3D Audio.
> 'MHAS' stands for MPEG-H audio stream, which contains encoded audio data and corresponds metadata for decoding.
> This patch enables extracting the MHAS bitstream from MP4 and remuxing into MP4.
> 
> Signed-off-by: Yuki Tsuchiya <Yuki.Tsuchiya@sony.com>
> ---
>  libavcodec/avcodec.h    | 1 +
>  libavcodec/codec_desc.c | 7 +++++++
>  libavcodec/version.h    | 2 +-
>  libavformat/isom.c      | 1 +
>  libavformat/movenc.c    | 2 ++
>  libavformat/utils.c     | 3 ++-
>  6 files changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
> index 813a43b72e..85c9401b8f 100644
> --- a/libavcodec/avcodec.h
> +++ b/libavcodec/avcodec.h
> @@ -654,6 +654,7 @@ enum AVCodecID {
>      AV_CODEC_ID_ATRAC9,
>      AV_CODEC_ID_HCOM,
>      AV_CODEC_ID_ACELP_KELVIN,
> +    AV_CODEC_ID_MPEGH_3D_AUDIO,
>  
>      /* subtitle codecs */
>      AV_CODEC_ID_FIRST_SUBTITLE = 0x17000,          ///< A dummy ID pointing at the start of subtitle codecs.
> diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
> index 5961af3c85..ba8ec32e4e 100644
> --- a/libavcodec/codec_desc.c
> +++ b/libavcodec/codec_desc.c
> @@ -2999,6 +2999,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
>          .long_name = NULL_IF_CONFIG_SMALL("Sipro ACELP.KELVIN"),
>          .props     = AV_CODEC_PROP_LOSSY,
>      },
> +    {
> +        .id        = AV_CODEC_ID_MPEGH_3D_AUDIO,
> +        .type      = AVMEDIA_TYPE_AUDIO,
> +        .name      = "mpegh_3d_audio",
> +        .long_name = NULL_IF_CONFIG_SMALL("MPEG-H 3D Audio"),
> +        .props     = AV_CODEC_PROP_LOSSY,
> +    },
>  
>      /* subtitle codecs */
>      {
> diff --git a/libavcodec/version.h b/libavcodec/version.h
> index 58ea00a520..5c8e376ea1 100644
> --- a/libavcodec/version.h
> +++ b/libavcodec/version.h
> @@ -28,7 +28,7 @@
>  #include "libavutil/version.h"
>  
>  #define LIBAVCODEC_VERSION_MAJOR  58
> -#define LIBAVCODEC_VERSION_MINOR  62
> +#define LIBAVCODEC_VERSION_MINOR  63
>  #define LIBAVCODEC_VERSION_MICRO 100
>  
>  #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
> diff --git a/libavformat/isom.c b/libavformat/isom.c
> index edd0d81063..824e811177 100644
> --- a/libavformat/isom.c
> +++ b/libavformat/isom.c
> @@ -371,6 +371,7 @@ const AVCodecTag ff_codec_movaudio_tags[] = {
>      { AV_CODEC_ID_FLAC,            MKTAG('f', 'L', 'a', 'C') }, /* nonstandard */
>      { AV_CODEC_ID_TRUEHD,          MKTAG('m', 'l', 'p', 'a') }, /* mp4ra.org */
>      { AV_CODEC_ID_OPUS,            MKTAG('O', 'p', 'u', 's') }, /* mp4ra.org */
> +    { AV_CODEC_ID_MPEGH_3D_AUDIO,  MKTAG('m', 'h', 'm', '1') }, /* MPEG-H 3D Audio bitstream */
>      { AV_CODEC_ID_NONE, 0 },
>  };
>  
> diff --git a/libavformat/movenc.c b/libavformat/movenc.c
> index 715bec1c2f..985341bac2 100644
> --- a/libavformat/movenc.c
> +++ b/libavformat/movenc.c
> @@ -2493,6 +2493,7 @@ static int mov_write_stbl_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContext
>      mov_write_stts_tag(pb, track);
>      if ((track->par->codec_type == AVMEDIA_TYPE_VIDEO ||
>           track->par->codec_id == AV_CODEC_ID_TRUEHD ||
> +         track->par->codec_id == AV_CODEC_ID_MPEGH_3D_AUDIO ||
>           track->par->codec_tag == MKTAG('r','t','p',' ')) &&
>          track->has_keyframes && track->has_keyframes < track->entry)
>          mov_write_stss_tag(pb, track, MOV_SYNC_SAMPLE);
> @@ -6877,6 +6878,7 @@ const AVCodecTag codec_mp4_tags[] = {
>      { AV_CODEC_ID_DVD_SUBTITLE, MKTAG('m', 'p', '4', 's') },
>      { AV_CODEC_ID_MOV_TEXT    , MKTAG('t', 'x', '3', 'g') },
>      { AV_CODEC_ID_BIN_DATA    , MKTAG('g', 'p', 'm', 'd') },
> +    { AV_CODEC_ID_MPEGH_3D_AUDIO, MKTAG('m', 'h', 'm', '1') },
>      { AV_CODEC_ID_NONE        ,    0 },
>  };
>  
> diff --git a/libavformat/utils.c b/libavformat/utils.c
> index 8196442dd1..70f9e23d8c 100644
> --- a/libavformat/utils.c
> +++ b/libavformat/utils.c
> @@ -1021,7 +1021,8 @@ static int is_intra_only(enum AVCodecID id)
>      const AVCodecDescriptor *d = avcodec_descriptor_get(id);
>      if (!d)
>          return 0;
> -    if (d->type == AVMEDIA_TYPE_VIDEO && !(d->props & AV_CODEC_PROP_INTRA_ONLY))
> +    if ((d->type == AVMEDIA_TYPE_VIDEO && !(d->props & AV_CODEC_PROP_INTRA_ONLY)) ||
> +        id == AV_CODEC_ID_MPEGH_3D_AUDIO)

Instead of changing generic code like this to prevent all MPEG-H 3D
Audio packets from being tagged as key, you should add a custom function,
much like the VC1 and the TrueHD ones, called at the end of
ff_mov_write_packet() to set the MOV_SYNC_SAMPLE flag and increase the
track->has_keyframes counter where appropriate.

You or someone else mentioned there's a sync frame every 25 frames or
so. It should be possible to easily parse the bitstream to detect and
properly mark them.

>          return 0;
>      return 1;
>  }
>
Yuki.Tsuchiya Dec. 11, 2019, 10:15 a.m. UTC | #4
Hello James,

On 2019/12/10 22:30, "James Almer" <jamrial@gmail.com> wrote:

    >> diff --git a/libavformat/utils.c b/libavformat/utils.c

    >> index 8196442dd1..70f9e23d8c 100644

    >> --- a/libavformat/utils.c

    >> +++ b/libavformat/utils.c

    >> @@ -1021,7 +1021,8 @@ static int is_intra_only(enum AVCodecID id)

    >>      const AVCodecDescriptor *d = avcodec_descriptor_get(id);

    >>      if (!d)

    >>          return 0;

    >> -    if (d->type == AVMEDIA_TYPE_VIDEO && !(d->props & AV_CODEC_PROP_INTRA_ONLY))

    >> +    if ((d->type == AVMEDIA_TYPE_VIDEO && !(d->props & AV_CODEC_PROP_INTRA_ONLY)) ||

    >> +        id == AV_CODEC_ID_MPEGH_3D_AUDIO)

    
    >Instead of changing generic code like this to prevent all MPEG-H 3D

    >Audio packets from being tagged as key, you should add a custom function

    >much like the VC1 and the TrueHD ones called at the end of

    >ff_mov_write_packet() to set the MOV_SYNC_SAMPLE flag and increase the

    >track->has_keyframes counter where corresponds.


If the modification in is_intra_only() is removed, all demuxed frames will be tagged as key frames.
This means that every frame returned by av_read_frame() will be marked as a key frame, so it is difficult to start decoding from an actual key frame.
I think this modification, or some other method of preserving the key frame information from the stss box, is required.
Do you have any ideas?
    
    _______________________________________________
    ffmpeg-devel mailing list
    ffmpeg-devel@ffmpeg.org
    https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
    
    To unsubscribe, visit link above, or email
    ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
James Almer Dec. 11, 2019, 6:15 p.m. UTC | #5
On 12/11/2019 7:15 AM, Tsuchiya, Yuki (SHES) wrote:
> Hello James,
> 
> On 2019/12/10 22:30, "James Almer" <jamrial@gmail.com> wrote:
> 
>     >> diff --git a/libavformat/utils.c b/libavformat/utils.c
>     >> index 8196442dd1..70f9e23d8c 100644
>     >> --- a/libavformat/utils.c
>     >> +++ b/libavformat/utils.c
>     >> @@ -1021,7 +1021,8 @@ static int is_intra_only(enum AVCodecID id)
>     >>      const AVCodecDescriptor *d = avcodec_descriptor_get(id);
>     >>      if (!d)
>     >>          return 0;
>     >> -    if (d->type == AVMEDIA_TYPE_VIDEO && !(d->props & AV_CODEC_PROP_INTRA_ONLY))
>     >> +    if ((d->type == AVMEDIA_TYPE_VIDEO && !(d->props & AV_CODEC_PROP_INTRA_ONLY)) ||
>     >> +        id == AV_CODEC_ID_MPEGH_3D_AUDIO)
>     
>     >Instead of changing generic code like this to prevent all MPEG-H 3D
>     >Audio packets from being tagged as key, you should add a custom function
>     >much like the VC1 and the TrueHD ones called at the end of
>     >ff_mov_write_packet() to set the MOV_SYNC_SAMPLE flag and increase the
>     >track->has_keyframes counter where corresponds.
> 
> If the modification in the is_intra_only() is removed, all demuxed frames will be tagged as key frame.
> This means that all single frames got from av_read_frame() will be key frame so it is difficult to start decoding from actual key frame.
> I think this modification or something like other method for keeping key frame information from stss box is required.
> Do you have any idea?

I see now, this is during demuxing. I was thinking about muxing.
I can reproduce what you mean with TrueHD which has sync frames every
128 or so frames. Despite the mp4 and matroska files having the sync
frames correctly marked, lavf tags every frame as key.

A general approach to fixing this issue would be to change
is_intra_only() to not just look at video codecs but also audio codecs,
and to update all the audio entries in libavcodec/codec_desc.c that are
not currently set as AV_CODEC_PROP_INTRA_ONLY and require it.
Yuki.Tsuchiya Dec. 12, 2019, 5:30 a.m. UTC | #6
Hi James,

> I see now, this is during demuxing. I was thinking about muxing.

> I can reproduce what you mean with TrueHD which has sync frames every

> 128 or so frames. Despite the mp4 and matroska files having the sync

> frames correctly marked, lavf tags every frame as key.

>

> A general approach at fixing this issue would be to change

> is_intra_only() to not just look at video codecs but also audio codecs,

> and updating all the audio entries in libavcodec/codec_desc.c that are

> not currently set as AV_CODEC_PROP_INTRA_ONLY and require it. 


Thank you for your suggestion.
I will send a new patch soon, so please review it again.
diff mbox

Patch

diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 813a43b72e..85c9401b8f 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -654,6 +654,7 @@  enum AVCodecID {
     AV_CODEC_ID_ATRAC9,
     AV_CODEC_ID_HCOM,
     AV_CODEC_ID_ACELP_KELVIN,
+    AV_CODEC_ID_MPEGH_3D_AUDIO,
 
     /* subtitle codecs */
     AV_CODEC_ID_FIRST_SUBTITLE = 0x17000,          ///< A dummy ID pointing at the start of subtitle codecs.
diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
index 5961af3c85..ba8ec32e4e 100644
--- a/libavcodec/codec_desc.c
+++ b/libavcodec/codec_desc.c
@@ -2999,6 +2999,13 @@  static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("Sipro ACELP.KELVIN"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
+    {
+        .id        = AV_CODEC_ID_MPEGH_3D_AUDIO,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "mpegh_3d_audio",
+        .long_name = NULL_IF_CONFIG_SMALL("MPEG-H 3D Audio"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
 
     /* subtitle codecs */
     {
diff --git a/libavcodec/version.h b/libavcodec/version.h
index 58ea00a520..5c8e376ea1 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -28,7 +28,7 @@ 
 #include "libavutil/version.h"
 
 #define LIBAVCODEC_VERSION_MAJOR  58
-#define LIBAVCODEC_VERSION_MINOR  62
+#define LIBAVCODEC_VERSION_MINOR  63
 #define LIBAVCODEC_VERSION_MICRO 100
 
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
diff --git a/libavformat/isom.c b/libavformat/isom.c
index edd0d81063..824e811177 100644
--- a/libavformat/isom.c
+++ b/libavformat/isom.c
@@ -371,6 +371,7 @@  const AVCodecTag ff_codec_movaudio_tags[] = {
     { AV_CODEC_ID_FLAC,            MKTAG('f', 'L', 'a', 'C') }, /* nonstandard */
     { AV_CODEC_ID_TRUEHD,          MKTAG('m', 'l', 'p', 'a') }, /* mp4ra.org */
     { AV_CODEC_ID_OPUS,            MKTAG('O', 'p', 'u', 's') }, /* mp4ra.org */
+    { AV_CODEC_ID_MPEGH_3D_AUDIO,  MKTAG('m', 'h', 'm', '1') }, /* MPEG-H 3D Audio bitstream */
     { AV_CODEC_ID_NONE, 0 },
 };
 
diff --git a/libavformat/movenc.c b/libavformat/movenc.c
index 715bec1c2f..985341bac2 100644
--- a/libavformat/movenc.c
+++ b/libavformat/movenc.c
@@ -2493,6 +2493,7 @@  static int mov_write_stbl_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContext
     mov_write_stts_tag(pb, track);
     if ((track->par->codec_type == AVMEDIA_TYPE_VIDEO ||
          track->par->codec_id == AV_CODEC_ID_TRUEHD ||
+         track->par->codec_id == AV_CODEC_ID_MPEGH_3D_AUDIO ||
          track->par->codec_tag == MKTAG('r','t','p',' ')) &&
         track->has_keyframes && track->has_keyframes < track->entry)
         mov_write_stss_tag(pb, track, MOV_SYNC_SAMPLE);
@@ -6877,6 +6878,7 @@  const AVCodecTag codec_mp4_tags[] = {
     { AV_CODEC_ID_DVD_SUBTITLE, MKTAG('m', 'p', '4', 's') },
     { AV_CODEC_ID_MOV_TEXT    , MKTAG('t', 'x', '3', 'g') },
     { AV_CODEC_ID_BIN_DATA    , MKTAG('g', 'p', 'm', 'd') },
+    { AV_CODEC_ID_MPEGH_3D_AUDIO, MKTAG('m', 'h', 'm', '1') },
     { AV_CODEC_ID_NONE        ,    0 },
 };
 
diff --git a/libavformat/utils.c b/libavformat/utils.c
index 8196442dd1..70f9e23d8c 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -1021,7 +1021,8 @@  static int is_intra_only(enum AVCodecID id)
     const AVCodecDescriptor *d = avcodec_descriptor_get(id);
     if (!d)
         return 0;
-    if (d->type == AVMEDIA_TYPE_VIDEO && !(d->props & AV_CODEC_PROP_INTRA_ONLY))
+    if ((d->type == AVMEDIA_TYPE_VIDEO && !(d->props & AV_CODEC_PROP_INTRA_ONLY)) ||
+        id == AV_CODEC_ID_MPEGH_3D_AUDIO)
         return 0;
     return 1;
 }