diff mbox

[FFmpeg-devel,v5,1/2] lavf/isom: support for demuxing MPEG-H 3D Audio in MP4

Message ID 20191121072521.7459-1-Yuki.Tsuchiya@sony.com
State New
Headers show

Commit Message

Yuki.Tsuchiya Nov. 21, 2019, 7:25 a.m. UTC
Implemented according to the specification at https://www.iso.org/standard/69561.html
The 'mhm1' sample entry is registered with MP4RA, which is defined as MHAS encapsulated single stream MPEG-H 3D Audio.
'MHAS' stands for MPEG-H audio stream, which contains encoded audio data and the corresponding metadata for decoding.
This patch enables extracting the MHAS bitstream from MP4.

Signed-off-by: Yuki Tsuchiya <Yuki.Tsuchiya@sony.com>
---
 libavcodec/avcodec.h    | 1 +
 libavcodec/codec_desc.c | 7 +++++++
 libavcodec/version.h    | 2 +-
 libavformat/isom.c      | 1 +
 libavformat/movenc.c    | 6 ++++--
 libavformat/utils.c     | 3 ++-
 6 files changed, 16 insertions(+), 4 deletions(-)

Comments

Yuki.Tsuchiya Nov. 21, 2019, 7:42 a.m. UTC | #1
> On 2019/11/21 16:41, "Tsuchiya, Yuki (SHES)" <Yuki.Tsuchiya@sony.com> wrote:

> 

>    Implemented according to the specification at https://www.iso.org/standard/69561.html

>    The 'mhm1' sample entry is registered with MP4RA, which is defined as MHAS encapsulated single stream MPEG-H 3D Audio.

>    'MHAS' stands for MPEG-H audio stream, which contains encoded audio data and corresponds metadata for decoding.

>    This patch enables extracting the MHAS bitstream from MP4.

    
Hi,

This patch is just a rebase of the previous patch against master due to a conflict.
I could not get a response from Jan, but could you merge this patch if there are no other comments?


Carl Eugen Hoyos Nov. 21, 2019, 10:06 a.m. UTC | #2
Am Do., 21. Nov. 2019 um 08:40 Uhr schrieb Yuki Tsuchiya
<Yuki.Tsuchiya@sony.com>:
>
> Implemented according to the specification at https://www.iso.org/standard/69561.html
> The 'mhm1' sample entry is registered with MP4RA, which is defined as MHAS encapsulated single stream MPEG-H 3D Audio.
> 'MHAS' stands for MPEG-H audio stream, which contains encoded audio data and corresponds metadata for decoding.
> This patch enables extracting the MHAS bitstream from MP4.

I will push this if nobody objects.

Carl Eugen
James Almer Nov. 21, 2019, 12:19 p.m. UTC | #3
On 11/21/2019 4:25 AM, Yuki Tsuchiya wrote:
> Implemented according to the specification at https://www.iso.org/standard/69561.html
> The 'mhm1' sample entry is registered with MP4RA, which is defined as MHAS encapsulated single stream MPEG-H 3D Audio.
> 'MHAS' stands for MPEG-H audio stream, which contains encoded audio data and corresponds metadata for decoding.
> This patch enables extracting the MHAS bitstream from MP4.

This is changing the muxer, but the commit subject and description talk
about demuxing.

> 
> Signed-off-by: Yuki Tsuchiya <Yuki.Tsuchiya@sony.com>
> ---
>  libavcodec/avcodec.h    | 1 +
>  libavcodec/codec_desc.c | 7 +++++++
>  libavcodec/version.h    | 2 +-
>  libavformat/isom.c      | 1 +
>  libavformat/movenc.c    | 6 ++++--
>  libavformat/utils.c     | 3 ++-
>  6 files changed, 16 insertions(+), 4 deletions(-)
> 
> diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
> index 813a43b72e..85c9401b8f 100644
> --- a/libavcodec/avcodec.h
> +++ b/libavcodec/avcodec.h
> @@ -654,6 +654,7 @@ enum AVCodecID {
>      AV_CODEC_ID_ATRAC9,
>      AV_CODEC_ID_HCOM,
>      AV_CODEC_ID_ACELP_KELVIN,
> +    AV_CODEC_ID_MPEGH_3D_AUDIO,
>  
>      /* subtitle codecs */
>      AV_CODEC_ID_FIRST_SUBTITLE = 0x17000,          ///< A dummy ID pointing at the start of subtitle codecs.
> diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
> index 5961af3c85..ba8ec32e4e 100644
> --- a/libavcodec/codec_desc.c
> +++ b/libavcodec/codec_desc.c
> @@ -2999,6 +2999,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
>          .long_name = NULL_IF_CONFIG_SMALL("Sipro ACELP.KELVIN"),
>          .props     = AV_CODEC_PROP_LOSSY,
>      },
> +    {
> +        .id        = AV_CODEC_ID_MPEGH_3D_AUDIO,
> +        .type      = AVMEDIA_TYPE_AUDIO,
> +        .name      = "mpegh_3d_audio",
> +        .long_name = NULL_IF_CONFIG_SMALL("MPEG-H 3D Audio"),
> +        .props     = AV_CODEC_PROP_LOSSY,
> +    },
>  
>      /* subtitle codecs */
>      {
> diff --git a/libavcodec/version.h b/libavcodec/version.h
> index 58ea00a520..5c8e376ea1 100644
> --- a/libavcodec/version.h
> +++ b/libavcodec/version.h
> @@ -28,7 +28,7 @@
>  #include "libavutil/version.h"
>  
>  #define LIBAVCODEC_VERSION_MAJOR  58
> -#define LIBAVCODEC_VERSION_MINOR  62
> +#define LIBAVCODEC_VERSION_MINOR  63
>  #define LIBAVCODEC_VERSION_MICRO 100
>  
>  #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
> diff --git a/libavformat/isom.c b/libavformat/isom.c
> index edd0d81063..824e811177 100644
> --- a/libavformat/isom.c
> +++ b/libavformat/isom.c
> @@ -371,6 +371,7 @@ const AVCodecTag ff_codec_movaudio_tags[] = {
>      { AV_CODEC_ID_FLAC,            MKTAG('f', 'L', 'a', 'C') }, /* nonstandard */
>      { AV_CODEC_ID_TRUEHD,          MKTAG('m', 'l', 'p', 'a') }, /* mp4ra.org */
>      { AV_CODEC_ID_OPUS,            MKTAG('O', 'p', 'u', 's') }, /* mp4ra.org */
> +    { AV_CODEC_ID_MPEGH_3D_AUDIO,  MKTAG('m', 'h', 'm', '1') }, /* MPEG-H 3D Audio bitstream */
>      { AV_CODEC_ID_NONE, 0 },
>  };
>  
> diff --git a/libavformat/movenc.c b/libavformat/movenc.c
> index 715bec1c2f..ff234d9f2b 100644
> --- a/libavformat/movenc.c
> +++ b/libavformat/movenc.c
> @@ -2411,7 +2411,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
>      if (!sgpd_entries)
>          return AVERROR(ENOMEM);
>  
> -    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC);
> +    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC || track->par->codec_id == AV_CODEC_ID_MPEGH_3D_AUDIO);

You're calling this function for MPEGH 3D Audio, but not writing an
implementation for it. The sgpd_entries array will be empty.

>  
>      if (track->par->codec_id == AV_CODEC_ID_OPUS) {
>          for (i = 0; i < track->entry; i++) {
> @@ -2493,6 +2493,7 @@ static int mov_write_stbl_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContext
>      mov_write_stts_tag(pb, track);
>      if ((track->par->codec_type == AVMEDIA_TYPE_VIDEO ||
>           track->par->codec_id == AV_CODEC_ID_TRUEHD ||
> +         track->par->codec_id == AV_CODEC_ID_MPEGH_3D_AUDIO ||
>           track->par->codec_tag == MKTAG('r','t','p',' ')) &&
>          track->has_keyframes && track->has_keyframes < track->entry)
>          mov_write_stss_tag(pb, track, MOV_SYNC_SAMPLE);

Is every sample meant to be a Sync Sample?

> @@ -2512,7 +2513,7 @@ static int mov_write_stbl_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContext
>      if (track->cenc.aes_ctr) {
>          ff_mov_cenc_write_stbl_atoms(&track->cenc, pb);
>      }
> -    if (track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC) {
> +    if (track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC || track->par->codec_id == AV_CODEC_ID_MPEGH_3D_AUDIO) {
>          mov_preroll_write_stbl_atoms(pb, track);
>      }
>      return update_size(pb, pos);
> @@ -6877,6 +6878,7 @@ const AVCodecTag codec_mp4_tags[] = {
>      { AV_CODEC_ID_DVD_SUBTITLE, MKTAG('m', 'p', '4', 's') },
>      { AV_CODEC_ID_MOV_TEXT    , MKTAG('t', 'x', '3', 'g') },
>      { AV_CODEC_ID_BIN_DATA    , MKTAG('g', 'p', 'm', 'd') },
> +    { AV_CODEC_ID_MPEGH_3D_AUDIO, MKTAG('m', 'h', 'm', '1') },
>      { AV_CODEC_ID_NONE        ,    0 },
>  };
>  
> diff --git a/libavformat/utils.c b/libavformat/utils.c
> index 8196442dd1..70f9e23d8c 100644
> --- a/libavformat/utils.c
> +++ b/libavformat/utils.c
> @@ -1021,7 +1021,8 @@ static int is_intra_only(enum AVCodecID id)
>      const AVCodecDescriptor *d = avcodec_descriptor_get(id);
>      if (!d)
>          return 0;
> -    if (d->type == AVMEDIA_TYPE_VIDEO && !(d->props & AV_CODEC_PROP_INTRA_ONLY))
> +    if ((d->type == AVMEDIA_TYPE_VIDEO && !(d->props & AV_CODEC_PROP_INTRA_ONLY)) ||
> +        id == AV_CODEC_ID_MPEGH_3D_AUDIO)
>          return 0;
>      return 1;
>  }
>
Yuki.Tsuchiya Nov. 22, 2019, 8:40 a.m. UTC | #4
> This is changing the muxer, but the commit subject and description talks about

> demuxing.

Yes. This code contains both the remuxer into MP4 and the demuxer. I will change the commit description.
 
> You're calling this function for MPEGH 3D Audio, but not writing an

> implementation for it. The sgpd_entries array will be empty.

Thank you for pointing it out. 
In the AAC case, the sgpd delay is set to -1 statically because most popular AAC encoders have the same encoder delay (1024 samples), but in principle this value can be different for each encoder.
At the moment, there is no standard value for MPEG-H 3D Audio, so the value should not be set statically.
I'll remove this code from the patch. Most media players can remove the encoder delay by using the edit list even if there is no sgpd box.

> Is every sample meant to be a Sync Sample?

No. The stss box is generated based on the sync frame flag information in MOVTrack. The sample MPEG-H 3D Audio content has a sync frame every 25 frames.
diff mbox

Patch

diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 813a43b72e..85c9401b8f 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -654,6 +654,7 @@  enum AVCodecID {
     AV_CODEC_ID_ATRAC9,
     AV_CODEC_ID_HCOM,
     AV_CODEC_ID_ACELP_KELVIN,
+    AV_CODEC_ID_MPEGH_3D_AUDIO,
 
     /* subtitle codecs */
     AV_CODEC_ID_FIRST_SUBTITLE = 0x17000,          ///< A dummy ID pointing at the start of subtitle codecs.
diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
index 5961af3c85..ba8ec32e4e 100644
--- a/libavcodec/codec_desc.c
+++ b/libavcodec/codec_desc.c
@@ -2999,6 +2999,13 @@  static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("Sipro ACELP.KELVIN"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
+    {
+        .id        = AV_CODEC_ID_MPEGH_3D_AUDIO,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "mpegh_3d_audio",
+        .long_name = NULL_IF_CONFIG_SMALL("MPEG-H 3D Audio"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
 
     /* subtitle codecs */
     {
diff --git a/libavcodec/version.h b/libavcodec/version.h
index 58ea00a520..5c8e376ea1 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -28,7 +28,7 @@ 
 #include "libavutil/version.h"
 
 #define LIBAVCODEC_VERSION_MAJOR  58
-#define LIBAVCODEC_VERSION_MINOR  62
+#define LIBAVCODEC_VERSION_MINOR  63
 #define LIBAVCODEC_VERSION_MICRO 100
 
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
diff --git a/libavformat/isom.c b/libavformat/isom.c
index edd0d81063..824e811177 100644
--- a/libavformat/isom.c
+++ b/libavformat/isom.c
@@ -371,6 +371,7 @@  const AVCodecTag ff_codec_movaudio_tags[] = {
     { AV_CODEC_ID_FLAC,            MKTAG('f', 'L', 'a', 'C') }, /* nonstandard */
     { AV_CODEC_ID_TRUEHD,          MKTAG('m', 'l', 'p', 'a') }, /* mp4ra.org */
     { AV_CODEC_ID_OPUS,            MKTAG('O', 'p', 'u', 's') }, /* mp4ra.org */
+    { AV_CODEC_ID_MPEGH_3D_AUDIO,  MKTAG('m', 'h', 'm', '1') }, /* MPEG-H 3D Audio bitstream */
     { AV_CODEC_ID_NONE, 0 },
 };
 
diff --git a/libavformat/movenc.c b/libavformat/movenc.c
index 715bec1c2f..ff234d9f2b 100644
--- a/libavformat/movenc.c
+++ b/libavformat/movenc.c
@@ -2411,7 +2411,7 @@  static int mov_preroll_write_stbl_atoms(AVIOContext *pb, MOVTrack *track)
     if (!sgpd_entries)
         return AVERROR(ENOMEM);
 
-    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC);
+    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC || track->par->codec_id == AV_CODEC_ID_MPEGH_3D_AUDIO);
 
     if (track->par->codec_id == AV_CODEC_ID_OPUS) {
         for (i = 0; i < track->entry; i++) {
@@ -2493,6 +2493,7 @@  static int mov_write_stbl_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContext
     mov_write_stts_tag(pb, track);
     if ((track->par->codec_type == AVMEDIA_TYPE_VIDEO ||
          track->par->codec_id == AV_CODEC_ID_TRUEHD ||
+         track->par->codec_id == AV_CODEC_ID_MPEGH_3D_AUDIO ||
          track->par->codec_tag == MKTAG('r','t','p',' ')) &&
         track->has_keyframes && track->has_keyframes < track->entry)
         mov_write_stss_tag(pb, track, MOV_SYNC_SAMPLE);
@@ -2512,7 +2513,7 @@  static int mov_write_stbl_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContext
     if (track->cenc.aes_ctr) {
         ff_mov_cenc_write_stbl_atoms(&track->cenc, pb);
     }
-    if (track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC) {
+    if (track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC || track->par->codec_id == AV_CODEC_ID_MPEGH_3D_AUDIO) {
         mov_preroll_write_stbl_atoms(pb, track);
     }
     return update_size(pb, pos);
@@ -6877,6 +6878,7 @@  const AVCodecTag codec_mp4_tags[] = {
     { AV_CODEC_ID_DVD_SUBTITLE, MKTAG('m', 'p', '4', 's') },
     { AV_CODEC_ID_MOV_TEXT    , MKTAG('t', 'x', '3', 'g') },
     { AV_CODEC_ID_BIN_DATA    , MKTAG('g', 'p', 'm', 'd') },
+    { AV_CODEC_ID_MPEGH_3D_AUDIO, MKTAG('m', 'h', 'm', '1') },
     { AV_CODEC_ID_NONE        ,    0 },
 };
 
diff --git a/libavformat/utils.c b/libavformat/utils.c
index 8196442dd1..70f9e23d8c 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -1021,7 +1021,8 @@  static int is_intra_only(enum AVCodecID id)
     const AVCodecDescriptor *d = avcodec_descriptor_get(id);
     if (!d)
         return 0;
-    if (d->type == AVMEDIA_TYPE_VIDEO && !(d->props & AV_CODEC_PROP_INTRA_ONLY))
+    if ((d->type == AVMEDIA_TYPE_VIDEO && !(d->props & AV_CODEC_PROP_INTRA_ONLY)) ||
+        id == AV_CODEC_ID_MPEGH_3D_AUDIO)
         return 0;
     return 1;
 }