[FFmpeg-devel,v3,3/3,WIP,RFC] avformat: Immersive Audio Model and Formats demuxer

Message ID	20231030152354.2818-3-jamrial@gmail.com
State	New
Headers	show Delivered-To: ffmpegpatchwork2@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; From: James Almer <jamrial@gmail.com> To: ffmpeg-devel@ffmpeg.org Date: Mon, 30 Oct 2023 12:23:54 -0300 Message-ID: <20231030152354.2818-3-jamrial@gmail.com> In-Reply-To: <20231030152354.2818-1-jamrial@gmail.com> References: <20231030152354.2818-1-jamrial@gmail.com> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH v3 3/3][WIP][RFC] avformat: Immersive Audio Model and Formats demuxer Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	[FFmpeg-devel,v3,1/3,PoC] avformat: introduce AVStreamGroup \| expand [FFmpeg-devel,v3,1/3,PoC] avformat: introduce AVStreamGroup [FFmpeg-devel,v3,2/3] avutil/mem: add av_dynarray2_add_nofree [FFmpeg-devel,v3,3/3,WIP,RFC] avformat: Immersive Audio Model and Formats demuxer

Context	Check	Description
yinshiyou/make_loongarch64	success	Make finished
yinshiyou/make_fate_loongarch64	success	Make fate finished
andriy/make_x86	success	Make finished
andriy/make_fate_x86	success	Make fate finished

diff --git a/libavcodec/avpacket.c b/libavcodec/avpacket.c index e29725c2d2..0f8c9b77ae 100644 --- a/libavcodec/avpacket.c +++ b/libavcodec/avpacket.c @@ -301,6 +301,9 @@ const char *av_packet_side_data_name(enum AVPacketSideDataType type) case AV_PKT_DATA_DOVI_CONF: return "DOVI configuration record"; case AV_PKT_DATA_S12M_TIMECODE: return "SMPTE ST 12-1:2014 timecode"; case AV_PKT_DATA_DYNAMIC_HDR10_PLUS: return "HDR10+ Dynamic Metadata (SMPTE 2094-40)"; + case AV_PKT_DATA_IAMF_MIX_GAIN_PARAM: return "IAMF Mix Gain Parameter Data"; + case AV_PKT_DATA_IAMF_DEMIXING_INFO_PARAM: return "IAMF Demixing Info Parameter Data"; + case AV_PKT_DATA_IAMF_RECON_GAIN_INFO_PARAM: return "IAMF Recon Gain Info Parameter Data"; } return NULL; } diff --git a/libavcodec/packet.h b/libavcodec/packet.h index b19409b719..2c57d262c6 100644 --- a/libavcodec/packet.h +++ b/libavcodec/packet.h @@ -299,6 +299,30 @@ enum AVPacketSideDataType { */ AV_PKT_DATA_DYNAMIC_HDR10_PLUS, + /** + * IAMF Mix Gain Parameter Data associated with the audio frame. This metadata + * is in the form of the AVIAMFParamDefinition struct and contains information + * defined in sections 3.6.1 and 3.8.1 of the Immersive Audio Model and + * Formats standard. + */ + AV_PKT_DATA_IAMF_MIX_GAIN_PARAM, + + /** + * IAMF Demixing Info Parameter Data associated with the audio frame. This + * metadata is in the form of the AVIAMFParamDefinition struct and contains + * information defined in sections 3.6.1 and 3.8.2 of the Immersive Audio Model + * and Formats standard. + */ + AV_PKT_DATA_IAMF_DEMIXING_INFO_PARAM, + + /** + * IAMF Recon Gain Info Parameter Data associated with the audio frame. This + * metadata is in the form of the AVIAMFParamDefinition struct and contains + * information defined in sections 3.6.1 and 3.8.3 of the Immersive Audio Model + * and Formats standard. + */ + AV_PKT_DATA_IAMF_RECON_GAIN_INFO_PARAM, + /** * The number of side data types. * This is not part of the public API/ABI in the sense that it may diff --git a/libavformat/Makefile b/libavformat/Makefile index 329055ccfd..364bc417a3 100644 --- a/libavformat/Makefile +++ b/libavformat/Makefile @@ -3,6 +3,7 @@ DESC = FFmpeg container format library HEADERS = avformat.h \ avio.h \ + iamf.h \ version.h \ version_major.h \ @@ -258,6 +259,7 @@ OBJS-$(CONFIG_EVC_MUXER) += rawenc.o OBJS-$(CONFIG_HLS_DEMUXER) += hls.o hls_sample_encryption.o OBJS-$(CONFIG_HLS_MUXER) += hlsenc.o hlsplaylist.o avc.o OBJS-$(CONFIG_HNM_DEMUXER) += hnm.o +OBJS-$(CONFIG_IAMF_DEMUXER) += iamfdec.o iamf.o OBJS-$(CONFIG_ICO_DEMUXER) += icodec.o OBJS-$(CONFIG_ICO_MUXER) += icoenc.o OBJS-$(CONFIG_IDCIN_DEMUXER) += idcin.o diff --git a/libavformat/allformats.c b/libavformat/allformats.c index d4b505a5a3..63ca44bacd 100644 --- a/libavformat/allformats.c +++ b/libavformat/allformats.c @@ -212,6 +212,7 @@ extern const FFOutputFormat ff_hevc_muxer; extern const AVInputFormat ff_hls_demuxer; extern const FFOutputFormat ff_hls_muxer; extern const AVInputFormat ff_hnm_demuxer; +extern const AVInputFormat ff_iamf_demuxer; extern const AVInputFormat ff_ico_demuxer; extern const FFOutputFormat ff_ico_muxer; extern const AVInputFormat ff_idcin_demuxer; diff --git a/libavformat/avformat.c b/libavformat/avformat.c index 05d9837b97..32d40a5fac 100644 --- a/libavformat/avformat.c +++ b/libavformat/avformat.c @@ -37,6 +37,7 @@ #include "avformat.h" #include "avio.h" #include "demux.h" +#include "iamf.h" #include "mux.h" #include "internal.h" @@ -91,7 +92,14 @@ void ff_free_stream_group(AVStreamGroup **pstg) av_dict_free(&stg->metadata); av_freep(&stg->priv_data); switch (stg->type) { - // Structs in the union are freed here + case AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT: { + avformat_iamf_audio_element_free(&stg->params.iamf_audio_element); + break; + } + case AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION: { + avformat_iamf_mix_presentation_free(&stg->params.iamf_mix_presentation); + break; + } default: break; } diff --git a/libavformat/avformat.h b/libavformat/avformat.h index 9b2ee7ff14..41b244725e 100644 --- a/libavformat/avformat.h +++ b/libavformat/avformat.h @@ -1020,8 +1020,13 @@ typedef struct AVStream { enum AVStreamGroupParamsType { AV_STREAM_GROUP_PARAMS_NONE, + AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT, + AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION, }; +struct AVIAMFAudioElement; +struct AVIAMFMixPresentation; + typedef struct AVStreamGroup { /** * A class for @ref avoptions. Set on group creation. @@ -1055,7 +1060,8 @@ typedef struct AVStreamGroup { * Group-specific type parameters */ union { - uintptr_t dummy; // Placeholder + struct AVIAMFAudioElement *iamf_audio_element; + struct AVIAMFMixPresentation *iamf_mix_presentation; } params; /** diff --git a/libavformat/dump.c b/libavformat/dump.c index 7713415cae..c56121f682 100644 --- a/libavformat/dump.c +++ b/libavformat/dump.c @@ -38,6 +38,7 @@ #include "libavcodec/avcodec.h" #include "avformat.h" +#include "iamf.h" #include "internal.h" #define HEXDUMP_PRINT(...) \ @@ -134,28 +135,33 @@ static void print_fps(double d, const char *postfix) av_log(NULL, AV_LOG_INFO, "%1.0fk %s", d / 1000, postfix); } -static void dump_metadata(void *ctx, const AVDictionary *m, const char *indent) +static void dump_dictionary(void *ctx, const AVDictionary *m, + const char *name, const char *indent) { - if (m && !(av_dict_count(m) == 1 && av_dict_get(m, "language", NULL, 0))) { - const AVDictionaryEntry *tag = NULL; - - av_log(ctx, AV_LOG_INFO, "%sMetadata:\n", indent); - while ((tag = av_dict_iterate(m, tag))) - if (strcmp("language", tag->key)) { - const char *p = tag->value; - av_log(ctx, AV_LOG_INFO, - "%s %-16s: ", indent, tag->key); - while (*p) { - size_t len = strcspn(p, "\x8\xa\xb\xc\xd"); - av_log(ctx, AV_LOG_INFO, "%.*s", (int)(FFMIN(255, len)), p); - p += len; - if (*p == 0xd) av_log(ctx, AV_LOG_INFO, " "); - if (*p == 0xa) av_log(ctx, AV_LOG_INFO, "\n%s %-16s: ", indent, ""); - if (*p) p++; - } - av_log(ctx, AV_LOG_INFO, "\n"); + const AVDictionaryEntry *tag = NULL; + + av_log(ctx, AV_LOG_INFO, "%s%s:\n", indent, name); + while ((tag = av_dict_iterate(m, tag))) + if (strcmp("language", tag->key)) { + const char *p = tag->value; + av_log(ctx, AV_LOG_INFO, + "%s %-16s: ", indent, tag->key); + while (*p) { + size_t len = strcspn(p, "\x8\xa\xb\xc\xd"); + av_log(ctx, AV_LOG_INFO, "%.*s", (int)(FFMIN(255, len)), p); + p += len; + if (*p == 0xd) av_log(ctx, AV_LOG_INFO, " "); + if (*p == 0xa) av_log(ctx, AV_LOG_INFO, "\n%s %-16s: ", indent, ""); + if (*p) p++; } - } + av_log(ctx, AV_LOG_INFO, "\n"); + } +} + +static void dump_metadata(void *ctx, const AVDictionary *m, const char *indent) +{ + if (m && !(av_dict_count(m) == 1 && av_dict_get(m, "language", NULL, 0))) + dump_dictionary(ctx, m, "Metadata", indent); } /* param change side data*/ @@ -638,12 +644,55 @@ static void dump_stream_group(const AVFormatContext *ic, uint8_t *printed, char buf[512]; int ret; - av_log(NULL, AV_LOG_INFO, " Stream group #%d:%d:", index, i); + av_log(NULL, AV_LOG_INFO, " Stream group #%d:%d[0x%"PRIx64"]:", index, i, stg->id); switch (stg->type) { - default: + case AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT: { + AVIAMFAudioElement *iamf = stg->params.iamf_audio_element; + int substream_count = 0; + av_log(NULL, AV_LOG_INFO, " IAMF Audio Element\n"); + for (int j = 0; j < iamf->num_layers; j++) { + AVIAMFLayer *layer = iamf->layers[j]; + substream_count += layer->substream_count; + av_log(NULL, AV_LOG_INFO, " Layer %d:", j); + ret = av_channel_layout_describe(&layer->ch_layout, buf, sizeof(buf)); + if (ret >= 0) + av_log(NULL, AV_LOG_INFO, " %s", buf); + av_log(NULL, AV_LOG_INFO, "\n"); + for (int k = 0; k < substream_count && k < stg->nb_streams; k++) { + dump_stream_format(ic, stg->streams[k]->index, i, index, is_output); + printed[stg->streams[k]->index] = 1; + } + } + break; + } + case AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION: { + AVIAMFMixPresentation *mix_presentation = stg->params.iamf_mix_presentation; + av_log(NULL, AV_LOG_INFO, " IAMF Mix Presentation\n"); + dump_dictionary(NULL, mix_presentation->annotations, "Annotations", " "); + for (int j = 0; j < mix_presentation->num_sub_mixes; j++) { + AVIAMFSubmixPresentation *sub_mix = mix_presentation->sub_mixes[j]; + av_log(NULL, AV_LOG_INFO, " Submix %d:\n", j); + for (int k = 0; k < sub_mix->num_submix_elements; k++) { + AVIAMFSubmixElement *submix_element = sub_mix->submix_elements[k]; + av_log(NULL, AV_LOG_INFO, " IAMF Audio Element #%d:%d[0x%"PRIx64"]\n", index, submix_element->audio_element->index, submix_element->audio_element->id); + dump_dictionary(NULL, submix_element->annotations, "Annotations", " "); + } + for (int k = 0; k < sub_mix->num_submix_layouts; k++) { + AVIAMFSubmixLayout *submix_layout = sub_mix->submix_layouts[k]; + av_log(NULL, AV_LOG_INFO, " Layout #%d:", k); + if (submix_layout->layout_type == 2) { + ret = av_channel_layout_describe(&submix_layout->sound_system, buf, sizeof(buf)); + if (ret >= 0) + av_log(NULL, AV_LOG_INFO, " %s", buf); + } else if (submix_layout->layout_type == 3) + av_log(NULL, AV_LOG_INFO, " Binaural"); + av_log(NULL, AV_LOG_INFO, "\n"); + } + } break; } + } } void av_dump_format(AVFormatContext *ic, int index, diff --git a/libavformat/iamf.c b/libavformat/iamf.c new file mode 100644 index 0000000000..ff8203ed5e --- /dev/null +++ b/libavformat/iamf.c @@ -0,0 +1,550 @@ +/* + * Immersive Audio Model and Formats helper functions and defines + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <limits.h> +#include <stddef.h> +#include <stdint.h> + +#include "libavutil/avassert.h" +#include "libavutil/error.h" +#include "libavutil/mem.h" + +#include "iamf.h" +#include "iamf_internal.h" + +const AVChannelLayout ff_iamf_scalable_ch_layouts[10] = { + AV_CHANNEL_LAYOUT_MONO, + AV_CHANNEL_LAYOUT_STEREO, + // "Loudspeaker configuration for Sound System B" + AV_CHANNEL_LAYOUT_5POINT1_BACK, + // "Loudspeaker configuration for Sound System C" + AV_CHANNEL_LAYOUT_5POINT1POINT2_BACK, + // "Loudspeaker configuration for Sound System D" + AV_CHANNEL_LAYOUT_5POINT1POINT4_BACK, + // "Loudspeaker configuration for Sound System I" + AV_CHANNEL_LAYOUT_7POINT1, + // "Loudspeaker configuration for Sound System I" + Ltf + Rtf + AV_CHANNEL_LAYOUT_7POINT1POINT2, + // "Loudspeaker configuration for Sound System J" + AV_CHANNEL_LAYOUT_7POINT1POINT4_BACK, + // Front subset of "Loudspeaker configuration for Sound System J" + AV_CHANNEL_LAYOUT_3POINT1POINT2, + // Binaural + AV_CHANNEL_LAYOUT_STEREO, +}; + +const struct IAMFSoundSystemMap ff_iamf_sound_system_map[13] = { + { SOUND_SYSTEM_A_0_2_0, AV_CHANNEL_LAYOUT_STEREO }, + { SOUND_SYSTEM_B_0_5_0, AV_CHANNEL_LAYOUT_5POINT1_BACK }, + { SOUND_SYSTEM_C_2_5_0, AV_CHANNEL_LAYOUT_5POINT1POINT2_BACK }, + { SOUND_SYSTEM_D_4_5_0, AV_CHANNEL_LAYOUT_5POINT1POINT4_BACK }, + { SOUND_SYSTEM_E_4_5_1, + { + .nb_channels = 11, + .order = AV_CHANNEL_ORDER_NATIVE, + .u.mask = AV_CH_LAYOUT_5POINT1POINT4_BACK | AV_CH_BOTTOM_FRONT_CENTER, + }, + }, + { SOUND_SYSTEM_F_3_7_0, + { + .nb_channels = 12, + .order = AV_CHANNEL_ORDER_NATIVE, + .u.mask = AV_CH_LAYOUT_7POINT1POINT2 | AV_CH_TOP_BACK_CENTER | AV_CH_LOW_FREQUENCY_2, + }, + }, + { SOUND_SYSTEM_G_4_9_0, + { + .nb_channels = 14, + .order = AV_CHANNEL_ORDER_NATIVE, + .u.mask = AV_CH_LAYOUT_7POINT1POINT4_BACK | AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER, + }, + }, + { SOUND_SYSTEM_H_9_10_3, AV_CHANNEL_LAYOUT_22POINT2 }, + { SOUND_SYSTEM_I_0_7_0, AV_CHANNEL_LAYOUT_7POINT1 }, + { SOUND_SYSTEM_J_4_7_0, AV_CHANNEL_LAYOUT_7POINT1POINT4_BACK }, + { SOUND_SYSTEM_10_2_7_0, AV_CHANNEL_LAYOUT_7POINT1POINT2 }, + { SOUND_SYSTEM_11_2_3_0, AV_CHANNEL_LAYOUT_3POINT1POINT2 }, + { SOUND_SYSTEM_12_0_1_0, AV_CHANNEL_LAYOUT_MONO }, +}; + +#define DEFAULT 0 +#define FLAGS AV_OPT_FLAG_ENCODING_PARAM + +// +// Audio Element +// +#define OFFSET(x) offsetof(AVIAMFAudioElement, x) +static const AVOption audio_element_options[] = { + { "audio_element_type", "set audio_element_type", OFFSET(audio_element_type), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, 1, FLAGS }, + { "default_w", "set default_w", OFFSET(default_w), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, 10, FLAGS }, + { NULL }, +}; + +const AVClass ff_iamf_ae_class = { + .class_name = "AVIAMFAudioElement", + .item_name = av_default_item_name, + .version = LIBAVUTIL_VERSION_INT, + .option = audio_element_options, +}; + +AVIAMFAudioElement *avformat_iamf_audio_element_alloc(void) +{ + AVIAMFAudioElement *audio_element = av_mallocz(sizeof(*audio_element)); + + if (audio_element) { + audio_element->av_class = &ff_iamf_ae_class; + av_opt_set_defaults(audio_element); + } + + return audio_element; +} + +#undef OFFSET +#define OFFSET(x) offsetof(AVIAMFLayer, x) +static const AVOption layer_options[] = { + { "ch_layout", "set ch_layout", OFFSET(ch_layout), AV_OPT_TYPE_CHLAYOUT, {.str = NULL }, 0, 0, FLAGS }, + { "substream_count", "set substream_count", OFFSET(substream_count), AV_OPT_TYPE_INT64, {.i64 = 1 }, 1, 255, FLAGS }, + { "recon_gain_is_present", "set recon_gain_is_present", OFFSET(recon_gain_is_present), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, FLAGS }, + { "output_gain_flags", "set output_gain_flags", OFFSET(output_gain_flags), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, 0x3F, FLAGS }, + { "output_gain", "set output_gain", OFFSET(output_gain), AV_OPT_TYPE_RATIONAL, { .dbl = 0 }, -128.0, 128.0, FLAGS }, + { "ambisonics_mode", "set ambisonics_mode", OFFSET(ambisonics_mode), AV_OPT_TYPE_INT64, { .i64 = 0 }, 0, 1, FLAGS }, + { NULL }, +}; + +static const AVClass layer_class = { + .class_name = "AVIAMFLayer", + .item_name = av_default_item_name, + .version = LIBAVUTIL_VERSION_INT, + .option = layer_options, +}; + +int avformat_iamf_audio_element_add_layer(AVIAMFAudioElement *audio_element, AVDictionary **options) +{ + AVIAMFLayer **layers, *layer; + + if (audio_element->num_layers == UINT_MAX) + return AVERROR(EINVAL); + + layers = av_realloc_array(audio_element->layers, audio_element->num_layers + 1, + sizeof(*audio_element->layers)); + if (!layers) + return AVERROR(ENOMEM); + + audio_element->layers = layers; + + layer = audio_element->layers[audio_element->num_layers] = av_mallocz(sizeof(AVIAMFLayer)); + if (!layer) + return AVERROR(ENOMEM); + + layer->av_class = &layer_class; + av_opt_set_defaults(layer); + if (options) { + int ret = av_opt_set_dict(layer, options); + if (ret < 0) { + av_freep(&audio_element->layers[audio_element->num_layers]); + return ret; + } + } + audio_element->num_layers++; + + return 0; +} + +void avformat_iamf_audio_element_free(AVIAMFAudioElement **paudio_element) +{ + AVIAMFAudioElement *audio_element = *paudio_element; + + if (!audio_element) + return; + + for (int i; i < audio_element->num_layers; i++) { + AVIAMFLayer *layer = audio_element->layers[i]; + av_opt_free(layer); + av_free(layer->demixing_matrix); + av_free(layer); + } + av_free(audio_element->layers); + + av_free(audio_element->demixing_info); + av_free(audio_element->recon_gain_info); + av_freep(paudio_element); +} + +// +// Mix Presentation +// +#undef OFFSET +#define OFFSET(x) offsetof(AVIAMFMixPresentation, x) +static const AVOption mix_presentation_options[] = { + { "annotations", "set annotations", OFFSET(annotations), AV_OPT_TYPE_DICT, {.str = NULL }, 0, 0, FLAGS }, + { NULL }, +}; + +const AVClass ff_iamf_mp_class = { + .class_name = "AVIAMFMixPresentation", + .item_name = av_default_item_name, + .version = LIBAVUTIL_VERSION_INT, + .option = mix_presentation_options, +}; + +AVIAMFMixPresentation *avformat_iamf_mix_presentation_alloc(void) +{ + AVIAMFMixPresentation *mix_presentation = av_mallocz(sizeof(*mix_presentation)); + + if (mix_presentation) { + mix_presentation->av_class = &ff_iamf_mp_class; + av_opt_set_defaults(mix_presentation); + } + + return mix_presentation; +} + +#undef OFFSET +#define OFFSET(x) offsetof(AVIAMFSubmixElement, x) +static const AVOption submix_element_options[] = { + { "headphones_rendering_mode", "Headphones rendering mode", OFFSET(headphones_rendering_mode), AV_OPT_TYPE_INT64, { .i64 = 0 }, 0, 1, FLAGS }, + { "default_mix_gain", "Default mix gain", OFFSET(default_mix_gain), AV_OPT_TYPE_RATIONAL, { .dbl = 0 }, -128.0, 128.0, FLAGS }, + { "annotations", "Annotations", OFFSET(annotations), AV_OPT_TYPE_DICT, { .str = NULL }, 0, 0, FLAGS }, + { NULL }, +}; + +static const AVClass submix_element_class = { + .class_name = "AVIAMFSubmixElement", + .item_name = av_default_item_name, + .version = LIBAVUTIL_VERSION_INT, + .option = submix_element_options, +}; + +#undef OFFSET +#define OFFSET(x) offsetof(AVIAMFSubmixLayout, x) +static const AVOption submix_layout_options[] = { + { "layout_type", "Layout type", OFFSET(layout_type), AV_OPT_TYPE_INT64, { .i64 = 2 }, 2, 3, FLAGS }, + { "sound_system", "Sound System", OFFSET(sound_system), AV_OPT_TYPE_CHLAYOUT, { .str = NULL }, 0, 0, FLAGS }, + { "integrated_loudness", "Integrated loudness", OFFSET(integrated_loudness), AV_OPT_TYPE_RATIONAL, { .dbl = 0 }, -128.0, 128.0, FLAGS }, + { "digital_peak", "Digital peak", OFFSET(digital_peak), AV_OPT_TYPE_RATIONAL, { .dbl = 0 }, -128.0, 128.0, FLAGS }, + { "true_peak", "True peak", OFFSET(true_peak), AV_OPT_TYPE_RATIONAL, { .dbl = 0 }, -128.0, 128.0, FLAGS }, + { NULL }, +}; + +static const AVClass submix_layout_class = { + .class_name = "AVIAMFSubmixLayout", + .item_name = av_default_item_name, + .version = LIBAVUTIL_VERSION_INT, + .option = submix_layout_options, +}; + +#undef OFFSET +#define OFFSET(x) offsetof(AVIAMFSubmixPresentation, x) +static const AVOption submix_presentation_options[] = { + { "default_mix_gain", "Default mix gain", OFFSET(default_mix_gain), AV_OPT_TYPE_RATIONAL, { .dbl = 0 }, -128.0, 128.0, FLAGS }, + { NULL }, +}; + +static const AVClass submix_presentation_class = { + .class_name = "AVIAMFSubmixPresentation", + .item_name = av_default_item_name, + .version = LIBAVUTIL_VERSION_INT, + .option = submix_presentation_options, +}; + +int avformat_iamf_mix_presentation_add_submix(AVIAMFMixPresentation *mix_presentation, + unsigned int num_submix_elements, + AVDictionary **element_options, + unsigned int num_submix_layouts, + AVDictionary **layout_options) +{ + AVIAMFSubmixPresentation **sub_mixes, *sub_mix; + int ret = AVERROR(ENOMEM); + + if (mix_presentation->num_sub_mixes == UINT_MAX) + return AVERROR(EINVAL); + + sub_mixes = av_realloc_array(mix_presentation->sub_mixes, mix_presentation->num_sub_mixes + 1, + sizeof(*mix_presentation->sub_mixes)); + if (!sub_mixes) + return AVERROR(ENOMEM); + + mix_presentation->sub_mixes = sub_mixes; + + sub_mix = av_mallocz(sizeof(*sub_mix)); + if (!sub_mix) + return AVERROR(ENOMEM); + + sub_mix->av_class = &submix_presentation_class; + av_opt_set_defaults(sub_mix); + + sub_mix->submix_elements = av_calloc(num_submix_elements, sizeof(*sub_mix->submix_elements)); + if (!sub_mix->submix_elements) + goto fail; + + sub_mix->submix_layouts = av_calloc(num_submix_layouts, sizeof(*sub_mix->submix_layouts)); + if (!sub_mix->submix_layouts) + goto fail; + + for (int i = 0; i < num_submix_elements; i++) { + AVIAMFSubmixElement *submix_element = av_mallocz(sizeof(*submix_element)); + if (!submix_element) { + ret = AVERROR(ENOMEM); + goto fail; + } + submix_element->av_class = &submix_element_class; + av_opt_set_defaults(submix_element); + if (element_options && element_options[i]) { + ret = av_opt_set_dict(submix_element, &element_options[i]); + if (ret < 0) + goto fail; + } + sub_mix->submix_elements[sub_mix->num_submix_elements++] = submix_element; + } + + for (int i = 0; i < num_submix_layouts; i++) { + AVIAMFSubmixLayout *submix_layout = av_mallocz(sizeof(*submix_layout)); + if (!submix_layout) { + ret = AVERROR(ENOMEM); + goto fail; + } + + submix_layout->av_class = &submix_layout_class; + av_opt_set_defaults(submix_layout); + if (layout_options && layout_options[i]) { + ret = av_opt_set_dict(submix_layout, &layout_options[i]); + if (ret < 0) + goto fail; + } + + sub_mix->submix_layouts[sub_mix->num_submix_layouts++] = submix_layout; + } + + mix_presentation->sub_mixes[mix_presentation->num_sub_mixes++] = sub_mix; + + return 0; +fail: + for (int i = 0; i < sub_mix->num_submix_elements; i++) { + av_free(sub_mix->submix_elements[i]->element_mix_config); + av_free(sub_mix->submix_elements[i]); + } + for (int i = 0; i < sub_mix->num_submix_layouts; i++) { + av_opt_free(sub_mix->submix_layouts[i]); + av_free(sub_mix->submix_layouts[i]); + } + av_free(sub_mix->submix_elements); + av_free(sub_mix->submix_layouts); + av_free(sub_mix); + + return ret; +} + +void avformat_iamf_mix_presentation_free(AVIAMFMixPresentation **pmix_presentation) +{ + AVIAMFMixPresentation *mix_presentation = *pmix_presentation; + + if (!mix_presentation) + return; + + for (int i; i < mix_presentation->num_sub_mixes; i++) { + AVIAMFSubmixPresentation *sub_mix = mix_presentation->sub_mixes[i]; + for (int j; j < sub_mix->num_submix_elements; j++) { + AVIAMFSubmixElement *submix_element = sub_mix->submix_elements[j]; + av_opt_free(submix_element); + av_free(submix_element->element_mix_config); + av_free(submix_element); + } + av_free(sub_mix->submix_elements); + for (int j; j < sub_mix->num_submix_layouts; j++) { + AVIAMFSubmixLayout *submix_layout = sub_mix->submix_layouts[j]; + av_opt_free(submix_layout); + av_free(submix_layout); + } + av_free(sub_mix->submix_layouts); + av_free(sub_mix->output_mix_config); + av_free(sub_mix); + } + av_opt_free(mix_presentation); + av_free(mix_presentation->sub_mixes); + + av_freep(pmix_presentation); +} + +// +// Param Definition +// +#undef OFFSET +#define OFFSET(x) offsetof(AVIAMFParamDefinition, x) +static const AVOption param_definition_options[] = { + { "parameter_id", "set parameter_id", OFFSET(parameter_id), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, UINT_MAX, FLAGS }, + { "parameter_rate", "set parameter_rate", OFFSET(parameter_rate), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, UINT_MAX, FLAGS }, + { "param_definition_mode", "set param_definition_mode", OFFSET(param_definition_mode), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, 1, FLAGS }, + { "duration", "set duration", OFFSET(duration), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, UINT_MAX, FLAGS }, + { "constant_subblock_duration", "set constant_subblock_duration", OFFSET(constant_subblock_duration), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, UINT_MAX, FLAGS }, + { NULL }, +}; + +static const AVClass param_definition_class = { + .class_name = "AVIAMFParamDefinition", + .item_name = av_default_item_name, + .version = LIBAVUTIL_VERSION_INT, + .option = param_definition_options, +}; + +#undef OFFSET +#define OFFSET(x) offsetof(AVIAMFMixGainParameterData, x) +static const AVOption mix_gain_options[] = { + { "subblock_duration", "set subblock_duration", OFFSET(subblock_duration), AV_OPT_TYPE_INT64, {.i64 = 1 }, 1, UINT_MAX, FLAGS }, + { "animation_type", "set animation_type", OFFSET(animation_type), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, 2, FLAGS }, + { "start_point_value", "set start_point_value", OFFSET(animation_type), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, -128.0, 128.0, FLAGS }, + { "end_point_value", "set end_point_value", OFFSET(animation_type), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, -128.0, 128.0, FLAGS }, + { "control_point_value", "set control_point_value", OFFSET(animation_type), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, -128.0, 128.0, FLAGS }, + { "control_point_relative_time", "set control_point_relative_time", OFFSET(animation_type), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, UINT8_MAX, FLAGS }, + { NULL }, +}; + +static const AVClass mix_gain_class = { + .class_name = "AVIAMFSubmixElement", + .item_name = av_default_item_name, + .version = LIBAVUTIL_VERSION_INT, + .option = mix_gain_options, +}; + +#undef OFFSET +#define OFFSET(x) offsetof(AVIAMFDemixingInfoParameterData, x) +static const AVOption demixing_info_options[] = { + { "subblock_duration", "set subblock_duration", OFFSET(subblock_duration), AV_OPT_TYPE_INT64, {.i64 = 1 }, 1, UINT_MAX, FLAGS }, + { "dmixp_mode", "set dmixp_mode", OFFSET(dmixp_mode), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, 6, FLAGS }, + { NULL }, +}; + +static const AVClass demixing_info_class = { + .class_name = "AVIAMFDemixingInfoParameterData", + .item_name = av_default_item_name, + .version = LIBAVUTIL_VERSION_INT, + .option = demixing_info_options, +}; + +#undef OFFSET +#define OFFSET(x) offsetof(AVIAMFReconGainParameterData, x) +static const AVOption recon_gain_options[] = { + { "subblock_duration", "set subblock_duration", OFFSET(subblock_duration), AV_OPT_TYPE_INT64, {.i64 = 1 }, 1, UINT_MAX, FLAGS }, + { NULL }, +}; + +static const AVClass recon_gain_class = { + .class_name = "AVIAMFReconGainParameterData", + .item_name = av_default_item_name, + .version = LIBAVUTIL_VERSION_INT, + .option = recon_gain_options, +}; + +#undef OFFSET +#undef FLAGS +#undef DEFAULT + +AVIAMFParamDefinition *avformat_iamf_param_definition_alloc(enum AVIAMFParamDefinitionType type, AVDictionary **options, + unsigned int num_subblocks, AVDictionary **subblock_options, + size_t *out_size) +{ + + struct MixGainStruct { + AVIAMFParamDefinition p; + AVIAMFMixGainParameterData m; + }; + struct DemixStruct { + AVIAMFParamDefinition p; + AVIAMFDemixingInfoParameterData d; + }; + struct ReconGainStruct { + AVIAMFParamDefinition p; + AVIAMFReconGainParameterData r; + }; + size_t subblocks_offset, subblock_size; + size_t size; + AVIAMFParamDefinition *par; + + switch (type) { + case AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN: + subblocks_offset = offsetof(struct MixGainStruct, m); + subblock_size = sizeof(AVIAMFMixGainParameterData); + break; + case AV_IAMF_PARAMETER_DEFINITION_DEMIXING: + subblocks_offset = offsetof(struct DemixStruct, d); + subblock_size = sizeof(AVIAMFDemixingInfoParameterData); + break; + case AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN: + subblocks_offset = offsetof(struct ReconGainStruct, r); + subblock_size = sizeof(AVIAMFReconGainParameterData); + break; + default: + return NULL; + } + + size = subblocks_offset; + if (num_subblocks > (SIZE_MAX - size) / subblock_size) + return NULL; + size += subblock_size * num_subblocks; + + par = av_mallocz(size); + if (!par) + return NULL; + + par->av_class = &param_definition_class; + av_opt_set_defaults(par); + if (options) { + int ret = av_opt_set_dict(par, options); + if (ret < 0) { + av_free(par); + return NULL; + } + } + par->param_definition_type = type; + par->num_subblocks = num_subblocks; + par->subblock_size = subblock_size; + par->subblocks_offset = subblocks_offset; + + for (int i = 0; i < num_subblocks; i++) { + void *subblock = avformat_iamf_param_definition_get_subblock(par, i); + + switch (type) { + case AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN: + ((AVIAMFMixGainParameterData *)subblock)->av_class = &mix_gain_class; + break; + case AV_IAMF_PARAMETER_DEFINITION_DEMIXING: + ((AVIAMFDemixingInfoParameterData *)subblock)->av_class = &demixing_info_class; + break; + case AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN: + ((AVIAMFReconGainParameterData *)subblock)->av_class = &recon_gain_class; + break; + default: + av_assert0(0); + } + + av_opt_set_defaults(subblock); + if (subblock_options && subblock_options[i]) { + int ret = av_opt_set_dict(subblock, &subblock_options[i]); + if (ret < 0) { + av_free(par); + return NULL; + } + } + } + + if (out_size) + *out_size = size; + + return par; +} diff --git a/libavformat/iamf.h b/libavformat/iamf.h new file mode 100644 index 0000000000..783f8edc47 --- /dev/null +++ b/libavformat/iamf.h @@ -0,0 +1,364 @@ +/* + * Immersive Audio Model and Formats helper functions and defines + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFORMAT_IAMF_H +#define AVFORMAT_IAMF_H + +/** + * @file + * Immersive Audio Model and Formats API header + */ + +#include <stdint.h> +#include <stddef.h> + +#include "libavutil/attributes.h" +#include "libavutil/avassert.h" +#include "libavutil/channel_layout.h" +#include "libavutil/dict.h" +#include "libavutil/rational.h" + +struct AVStreamGroup; + +enum AVIAMFAudioElementType { + AV_IAMF_AUDIO_ELEMENT_TYPE_CHANNEL, + AV_IAMF_AUDIO_ELEMENT_TYPE_SCENE, +}; + +/** + * @defgroup lavf_iamf_params Parameter Definition + * @{ + * Parameters as defined in section 3.6.1 and 3.8 + * @} + * @defgroup lavf_iamf_audio Audio Element + * @{ + * Audio Elements as defined in section 3.6 + * @} + * @defgroup lavf_iamf_mix Mix Presentation + * @{ + * Mix Presentations as defined in section 3.7 + * @} + * + * @} + * @addtogroup lavf_iamf_params + * @{ + */ +enum AVIAMFParamDefinitionType { + AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN, + AV_IAMF_PARAMETER_DEFINITION_DEMIXING, + AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN, +}; + +/** + * Parameters as defined in section 3.6.1 + */ +typedef struct AVIAMFParamDefinition { + const AVClass *av_class; + + size_t subblocks_offset; + size_t subblock_size; + + enum AVIAMFParamDefinitionType param_definition_type; + unsigned int num_subblocks; + + // AVOption enabled fields + unsigned int parameter_id; + unsigned int parameter_rate; + unsigned int param_definition_mode; + unsigned int duration; + unsigned int constant_subblock_duration; +} AVIAMFParamDefinition; + +AVIAMFParamDefinition *avformat_iamf_param_definition_alloc(enum AVIAMFParamDefinitionType param_definition_type, + AVDictionary **options, + unsigned int num_subblocks, AVDictionary **subblock_options, + size_t *size); + +/** + * Get the subblock at the specified {@code idx}. Must be between 0 and num_subblocks - 1. + * + * The @ref AVIAMFParamDefinition.param_definition_type "param definition type" defines + * the struct type of the returned pointer. + */ +static av_always_inline void* +avformat_iamf_param_definition_get_subblock(AVIAMFParamDefinition *par, unsigned int idx) +{ + av_assert0(idx < par->num_subblocks); + return (void *)((uint8_t *)par + par->subblocks_offset + idx * par->subblock_size); +} + +enum AVIAMFAnimationType { + AV_IAMF_ANIMATION_TYPE_STEP, + AV_IAMF_ANIMATION_TYPE_LINEAR, + AV_IAMF_ANIMATION_TYPE_BEZIER, +}; + +/** + * Mix Gain Parameter Data as defined in section 3.8.1 + * + * Subblocks in AVIAMFParamDefinition use this struct when the value or + * @ref AVIAMFParamDefinition.param_definition_type param_definition_type is + * AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN. + */ +typedef struct AVIAMFMixGainParameterData { + const AVClass *av_class; + + // AVOption enabled fields + unsigned int subblock_duration; + enum AVIAMFAnimationType animation_type; + AVRational start_point_value; + AVRational end_point_value; + AVRational control_point_value; + unsigned int control_point_relative_time; +} AVIAMFMixGainParameterData; + +/** + * Demixing Info Parameter Data as defined in section 3.8.2 + * + * Subblocks in AVIAMFParamDefinition use this struct when the value or + * @ref AVIAMFParamDefinition.param_definition_type param_definition_type is + * AV_IAMF_PARAMETER_DEFINITION_DEMIXING. + */ +typedef struct AVIAMFDemixingInfoParameterData { + const AVClass *av_class; + + // AVOption enabled fields + unsigned int subblock_duration; + unsigned int dmixp_mode; +} AVIAMFDemixingInfoParameterData; + +/** + * Recon Gain Info Parameter Data as defined in section 3.8.3 + * + * Subblocks in AVIAMFParamDefinition use this struct when the value or + * @ref AVIAMFParamDefinition.param_definition_type param_definition_type is + * AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN. + */ +typedef struct AVIAMFReconGainParameterData { + const AVClass *av_class; + + uint8_t recon_gain[6][12]; + + // AVOption enabled fields + unsigned int subblock_duration; +} AVIAMFReconGainParameterData; + +/** + * @} + * @addtogroup lavf_iamf_audio + * @{ + */ + +/** + * A layer defining a Channel Layout in the Audio Element. + * + * When audio_element_type is AV_IAMF_AUDIO_ELEMENT_TYPE_CHANNEL, this + * corresponds to an Scalable Channel Layout layer as defined in section 3.6.2. + * For AV_IAMF_AUDIO_ELEMENT_TYPE_SCENE, it is an Ambisonics channel + * layout as defined in section 3.6.3 + */ +typedef struct AVIAMFLayer { + const AVClass *av_class; + + // AVOption enabled fields + AVChannelLayout ch_layout; + unsigned int substream_count; + + unsigned int recon_gain_is_present; + /** + * Output gain flags as defined in section 3.6.2 + * + * This field is defined only if audio_element_type is + * AV_IAMF_AUDIO_ELEMENT_TYPE_CHANNEL, must be 0 otherwise. + */ + unsigned int output_gain_flags; + /** + * Output gain as defined in section 3.6.2 + * + * Must be 0 if @ref output_gain_flags is 0. + */ + AVRational output_gain; + /** + * Ambisonics mode as defined in section 3.6.3 + * + * This field is defined only if audio_element_type is + * AV_IAMF_AUDIO_ELEMENT_TYPE_SCENE, must be 0 otherwise. + * + * If 0, channel_mapping is defined implicitly (Ambisonic Order) + * or explicitly (Custom Order with ambi channels) in @ref ch_layout. + * If 1, @ref demixing_matrix must be set. + */ + unsigned int ambisonics_mode; + + // End of AVOption enabled fields + /** + * Demixing matrix as defined in section 3.6.3 + * + * Set only if @ref ambisonics_mode == 1, must be NULL otherwise. + */ + AVRational *demixing_matrix; +} AVIAMFLayer; + +typedef struct AVIAMFAudioElement { + const AVClass *av_class; + + AVIAMFLayer **layers; + /** + * Number of layers, or channel groups, in the Audio Element. + * For audio_element_type AV_IAMF_AUDIO_ELEMENT_TYPE_SCENE, there + * may be exactly 1. + * + * Set by avformat_iamf_audio_element_add_layer(), must not be + * modified by any other code. + */ + unsigned int num_layers; + + unsigned int codec_config_id; + + AVIAMFParamDefinition *demixing_info; + AVIAMFParamDefinition *recon_gain_info; + + // AVOption enabled fields + /** + * Audio element type as defined in section 3.6 + */ + enum AVIAMFAudioElementType audio_element_type; + + /** + * Default weight value as defined in section 3.6 + */ + unsigned int default_w; +} AVIAMFAudioElement; + +AVIAMFAudioElement *avformat_iamf_audio_element_alloc(void); + +int avformat_iamf_audio_element_add_layer(AVIAMFAudioElement *audio_element, AVDictionary **options); + +void avformat_iamf_audio_element_free(AVIAMFAudioElement **audio_element); + +/** + * @} + * @addtogroup lavf_iamf_mix + * @{ + */ + +enum AVIAMFHeadphonesMode { + AV_IAMF_HEADPHONES_MODE_STEREO, + AV_IAMF_HEADPHONES_MODE_BINAURAL, +}; + +typedef struct AVIAMFSubmixElement { + const AVClass *av_class; + + const struct AVStreamGroup *audio_element; + + AVIAMFParamDefinition *element_mix_config; + + // AVOption enabled fields + /** + * A dictionary of @ref AVIAMFMixPresentation.count_label "count_label" + * amount of strings. + * Must be empty if @ref AVIAMFMixPresentation.count_label "count_label" + * is 0. + * + * decoding: set by libavformat + * encoding: set by the user + */ + AVDictionary *annotations; + + enum AVIAMFHeadphonesMode headphones_rendering_mode; + AVRational default_mix_gain; +} AVIAMFSubmixElement; + +typedef struct AVIAMFSubmixLayout { + const AVClass *av_class; + + // AVOption enabled fields + unsigned int layout_type; + AVChannelLayout sound_system; + AVRational integrated_loudness; + AVRational digital_peak; + AVRational true_peak; +} AVIAMFSubmixLayout; + +typedef struct AVIAMFSubmixPresentation { + const AVClass *av_class; + + AVIAMFSubmixElement **submix_elements; + /** + * Set by avformat_iamf_mix_presentation_add_submix(), must not be + * modified by any other code. + */ + unsigned int num_submix_elements; + + AVIAMFSubmixLayout **submix_layouts; + /** + * Set by avformat_iamf_mix_presentation_add_submix(), must not be + * modified by any other code. + */ + unsigned int num_submix_layouts; + + AVIAMFParamDefinition *output_mix_config; + AVRational default_mix_gain; +} AVIAMFSubmixPresentation; + +typedef struct AVIAMFMixPresentation { + const AVClass *av_class; + + AVIAMFSubmixPresentation **sub_mixes; + /** + * Number of submixes in the presentation. + * + * Set by avformat_iamf_mix_presentation_add_submix(), must not be + * modified by any other code. + */ + unsigned int num_sub_mixes; + + /** + * Amount of metadata strings used to identify the Mix Presentation. + * + * decoding: set by libavformat + * encoding: set by the user + */ + unsigned int count_label; + /** + * A dictionary of @ref count_label amount strings. + * Must be empty if count_label is 0. + * + * decoding: set by libavformat + * encoding: set by the user + */ + AVDictionary *annotations; +} AVIAMFMixPresentation; + +AVIAMFMixPresentation *avformat_iamf_mix_presentation_alloc(void); + +int avformat_iamf_mix_presentation_add_submix(AVIAMFMixPresentation *mix_presentation, + unsigned int num_submix_elements, + AVDictionary **element_options, + unsigned int num_submix_layouts, + AVDictionary **layout_options); + +void avformat_iamf_mix_presentation_free(AVIAMFMixPresentation **mix_presentation); +/** + * @} + */ + +#endif /* AVFORMAT_IAMF_H */ diff --git a/libavformat/iamf_internal.h b/libavformat/iamf_internal.h new file mode 100644 index 0000000000..527ac42fa4 --- /dev/null +++ b/libavformat/iamf_internal.h @@ -0,0 +1,89 @@ +/* + * Immersive Audio Model and Formats helper functions and defines + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFORMAT_IAMF_INTERNAL_H +#define AVFORMAT_IAMF_INTERNAL_H + +#include <stdint.h> + +#include "libavutil/channel_layout.h" +#include "libavutil/log.h" +#include "libavutil/opt.h" + +#define MAX_IAMF_OBU_HEADER_SIZE (1 + 8 * 3) + +// OBU types (section 3.2). +enum IAMF_OBU_Type { + IAMF_OBU_IA_CODEC_CONFIG = 0, + IAMF_OBU_IA_AUDIO_ELEMENT = 1, + IAMF_OBU_IA_MIX_PRESENTATION = 2, + IAMF_OBU_IA_PARAMETER_BLOCK = 3, + IAMF_OBU_IA_TEMPORAL_DELIMITER = 4, + IAMF_OBU_IA_AUDIO_FRAME = 5, + IAMF_OBU_IA_AUDIO_FRAME_ID0 = 6, + IAMF_OBU_IA_AUDIO_FRAME_ID1 = 7, + IAMF_OBU_IA_AUDIO_FRAME_ID2 = 8, + IAMF_OBU_IA_AUDIO_FRAME_ID3 = 9, + IAMF_OBU_IA_AUDIO_FRAME_ID4 = 10, + IAMF_OBU_IA_AUDIO_FRAME_ID5 = 11, + IAMF_OBU_IA_AUDIO_FRAME_ID6 = 12, + IAMF_OBU_IA_AUDIO_FRAME_ID7 = 13, + IAMF_OBU_IA_AUDIO_FRAME_ID8 = 14, + IAMF_OBU_IA_AUDIO_FRAME_ID9 = 15, + IAMF_OBU_IA_AUDIO_FRAME_ID10 = 16, + IAMF_OBU_IA_AUDIO_FRAME_ID11 = 17, + IAMF_OBU_IA_AUDIO_FRAME_ID12 = 18, + IAMF_OBU_IA_AUDIO_FRAME_ID13 = 19, + IAMF_OBU_IA_AUDIO_FRAME_ID14 = 20, + IAMF_OBU_IA_AUDIO_FRAME_ID15 = 21, + IAMF_OBU_IA_AUDIO_FRAME_ID16 = 22, + IAMF_OBU_IA_AUDIO_FRAME_ID17 = 23, + // 24~30 reserved. + IAMF_OBU_IA_SEQUENCE_HEADER = 31, +}; + +enum IAMF_Sound_System { + SOUND_SYSTEM_A_0_2_0 = 0, // "Loudspeaker configuration for Sound System A" + SOUND_SYSTEM_B_0_5_0 = 1, // "Loudspeaker configuration for Sound System B" + SOUND_SYSTEM_C_2_5_0 = 2, // "Loudspeaker configuration for Sound System C" + SOUND_SYSTEM_D_4_5_0 = 3, // "Loudspeaker configuration for Sound System D" + SOUND_SYSTEM_E_4_5_1 = 4, // "Loudspeaker configuration for Sound System E" + SOUND_SYSTEM_F_3_7_0 = 5, // "Loudspeaker configuration for Sound System F" + SOUND_SYSTEM_G_4_9_0 = 6, // "Loudspeaker configuration for Sound System G" + SOUND_SYSTEM_H_9_10_3 = 7, // "Loudspeaker configuration for Sound System H" + SOUND_SYSTEM_I_0_7_0 = 8, // "Loudspeaker configuration for Sound System I" + SOUND_SYSTEM_J_4_7_0 = 9, // "Loudspeaker configuration for Sound System J" + SOUND_SYSTEM_10_2_7_0 = 10, // "Loudspeaker configuration for Sound System I" + Ltf + Rtf + SOUND_SYSTEM_11_2_3_0 = 11, // Front subset of "Loudspeaker configuration for Sound System J" + SOUND_SYSTEM_12_0_1_0 = 12, // Mono +}; + +struct IAMFSoundSystemMap { + enum IAMF_Sound_System id; + AVChannelLayout layout; +}; + +extern const AVChannelLayout ff_iamf_scalable_ch_layouts[10]; +extern const struct IAMFSoundSystemMap ff_iamf_sound_system_map[13]; + +extern const AVClass ff_iamf_ae_class; +extern const AVClass ff_iamf_mp_class; + +#endif /* AVFORMAT_IAMF_INTERNAL_H */ diff --git a/libavformat/iamfdec.c b/libavformat/iamfdec.c new file mode 100644 index 0000000000..f478f08d07 --- /dev/null +++ b/libavformat/iamfdec.c @@ -0,0 +1,1686 @@ +/* + * Immersive Audio Model and Formats demuxer + * Copyright (c) 2023 James Almer <jamrial@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config_components.h" + +#include "libavutil/avassert.h" +#include "libavutil/common.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/opt.h" +#include "libavcodec/get_bits.h" +#include "libavcodec/flac.h" +#include "libavcodec/mpeg4audio.h" +#include "libavcodec/put_bits.h" +#include "avformat.h" +#include "avio_internal.h" +#include "demux.h" +#include "iamf.h" +#include "iamf_internal.h" +#include "internal.h" +#include "isom.h" + +typedef struct IAMFCodecConfig { + unsigned codec_config_id; + enum AVCodecID codec_id; + unsigned nb_samples; + int seek_preroll; + uint8_t *extradata; + int extradata_size; + int sample_rate; +} IAMFCodecConfig; + +typedef struct IAMFAudioElement { + AVStreamGroup *stream_group; + + AVStream **audio_substreams; + int num_substreams; +} IAMFAudioElement; + +typedef struct IAMFMixPresentation { + AVStreamGroup *stream_group; + unsigned int count_label; + char **language_label; +} IAMFMixPresentation; + +typedef struct IAMFParamDefinition { + const AVIAMFAudioElement *audio_element; + AVIAMFParamDefinition *param; +} IAMFParamDefinition; + +typedef struct IAMFDemuxContext { + IAMFCodecConfig *codec_configs; + int nb_codec_configs; + IAMFAudioElement *audio_elements; + int nb_audio_elements; + IAMFMixPresentation *mix_presentations; + int nb_mix_presentations; + IAMFParamDefinition *param_definitions; + int nb_param_definitions; + + // Packet side data + AVIAMFParamDefinition *mix; + size_t mix_size; + AVIAMFParamDefinition *demix; + size_t demix_size; + AVIAMFParamDefinition *recon; + size_t recon_size; +} IAMFDemuxContext; + +static inline unsigned get_leb128(GetBitContext *gb) { + int more, i = 0; + unsigned len = 0; + + do { + unsigned bits; + int byte = get_bits(gb, 8); + more = byte & 0x80; + bits = byte & 0x7f; + if (i <= 3 || (i == 4 && bits < (1 << 4))) + len |= bits << (i * 7); + else if (bits) + return AVERROR_INVALIDDATA; + if (++i == 8 && more) + return AVERROR_INVALIDDATA; + } while (more); + + return len; +} + +static int parse_obu_header(const uint8_t *buf, int buf_size, + unsigned *obu_size, int *start_pos, enum IAMF_OBU_Type *type, + unsigned *skip_samples, unsigned *discard_padding) +{ + GetBitContext gb; + int ret, extension_flag, trimming, start; + unsigned size; + + ret = init_get_bits8(&gb, buf, FFMIN(buf_size, MAX_IAMF_OBU_HEADER_SIZE)); + if (ret < 0) + return ret; + + *type = get_bits(&gb, 5); + /*redundant =*/ get_bits1(&gb); + trimming = get_bits1(&gb); + extension_flag = get_bits1(&gb); + + *obu_size = get_leb128(&gb); + if (*obu_size > INT_MAX) + return AVERROR_INVALIDDATA; + + start = get_bits_count(&gb) / 8; + + if (skip_samples) + *skip_samples = trimming ? get_leb128(&gb) : 0; // num_samples_to_trim_at_end + if (discard_padding) + *discard_padding = trimming ? get_leb128(&gb) : 0; // num_samples_to_trim_at_start + + if (extension_flag) { + unsigned extension_bytes = get_leb128(&gb); + if (extension_bytes > INT_MAX / 8) + return AVERROR_INVALIDDATA; + skip_bits_long(&gb, extension_bytes * 8); + } + + if (get_bits_left(&gb) < 0) + return AVERROR_INVALIDDATA; + + size = *obu_size + start; + if (size > INT_MAX) + return AVERROR_INVALIDDATA; + + *obu_size -= get_bits_count(&gb) / 8 - start; + *start_pos = size - *obu_size; + + return size; +} + +//return < 0 if we need more data +static int get_score(const uint8_t *buf, int buf_size, enum IAMF_OBU_Type type, int *seq) +{ + if (type == IAMF_OBU_IA_SEQUENCE_HEADER) { + if (buf_size < 4 || AV_RB32(buf) != MKBETAG('i','a','m','f')) + return 0; + *seq = 1; + return -1; + } + if (type >= IAMF_OBU_IA_CODEC_CONFIG && type <= IAMF_OBU_IA_TEMPORAL_DELIMITER) + return *seq ? -1 : 0; + if (type >= IAMF_OBU_IA_AUDIO_FRAME && type <= IAMF_OBU_IA_AUDIO_FRAME_ID17) + return *seq ? AVPROBE_SCORE_EXTENSION + 1 : 0; + return 0; +} + +static int iamf_probe(const AVProbeData *p) +{ + unsigned obu_size; + enum IAMF_OBU_Type type; + int seq = 0, cnt = 0, start_pos; + int ret; + + while (1) { + int size = parse_obu_header(p->buf + cnt, p->buf_size - cnt, + &obu_size, &start_pos, &type, + NULL, NULL); + if (size < 0) + return 0; + + ret = get_score(p->buf + cnt + start_pos, + p->buf_size - cnt - start_pos, + type, &seq); + if (ret >= 0) + return ret; + + cnt += FFMIN(size, p->buf_size - cnt); + } + return 0; +} + +static inline int leb(AVIOContext *pb, unsigned *len) { + int more, i = 0; + *len = 0; + + do { + unsigned bits; + int byte = avio_r8(pb); + if (pb->error) + return pb->error; + if (pb->eof_reached) + return AVERROR_INVALIDDATA; + more = byte & 0x80; + bits = byte & 0x7f; + if (i <= 3 || (i == 4 && bits < (1 << 4))) + *len |= bits << (i * 7); + else if (bits) + return AVERROR_INVALIDDATA; + if (++i == 8 && more) + return AVERROR_INVALIDDATA; + } while (more); + + return i; +} + +static int opus_decoder_config(AVFormatContext *s, AVIOContext *pb, int len, + IAMFCodecConfig *codec_config) +{ + int left = len - avio_tell(pb); + + if (left < 11) + return AVERROR_INVALIDDATA; + + codec_config->extradata = av_malloc(left + 8); + if (!codec_config->extradata) + return AVERROR(ENOMEM); + + AV_WB32(codec_config->extradata, MKBETAG('O','p','u','s')); + AV_WB32(codec_config->extradata + 4, MKBETAG('H','e','a','d')); + codec_config->extradata_size = avio_read(pb, codec_config->extradata + 8, left); + if (codec_config->extradata_size < left) + return AVERROR_INVALIDDATA; + + codec_config->extradata_size += 8; + codec_config->sample_rate = 48000; + + return 0; +} + +static int aac_decoder_config(AVFormatContext *s, AVIOContext *pb, int len, + IAMFCodecConfig *codec_config) +{ + MPEG4AudioConfig cfg = { 0 }; + int object_type_id, codec_id, stream_type; + int ret, tag, left; + + tag = avio_r8(pb); + if (tag != MP4DecConfigDescrTag) + return AVERROR_INVALIDDATA; + + object_type_id = avio_r8(pb); + if (object_type_id != 0x40) + return AVERROR_INVALIDDATA; + + stream_type = avio_r8(pb); + if (((stream_type >> 2) != 5) || ((stream_type >> 1) & 1)) + return AVERROR_INVALIDDATA; + + avio_skip(pb, 3); // buffer size db + avio_skip(pb, 4); // rc_max_rate + avio_skip(pb, 4); // avg bitrate + + codec_id = ff_codec_get_id(ff_mp4_obj_type, object_type_id); + if (codec_id && codec_id != codec_config->codec_id) + return AVERROR_INVALIDDATA; + + tag = avio_r8(pb); + if (tag != MP4DecSpecificDescrTag) + return AVERROR_INVALIDDATA; + + left = len - avio_tell(pb); + if (left <= 0) + return AVERROR_INVALIDDATA; + + codec_config->extradata = av_malloc(left); + if (!codec_config->extradata) + return AVERROR(ENOMEM); + + codec_config->extradata_size = avio_read(pb, codec_config->extradata, left); + if (codec_config->extradata_size < left) + return AVERROR_INVALIDDATA; + + ret = avpriv_mpeg4audio_get_config2(&cfg, codec_config->extradata, + codec_config->extradata_size, 1, s); + if (ret < 0) + return ret; + + codec_config->sample_rate = cfg.sample_rate; + + return 0; +} + +static int flac_decoder_config(AVFormatContext *s, AVIOContext *pb, int len, + IAMFCodecConfig *codec_config) +{ + int left; + + avio_skip(pb, 4); // METADATA_BLOCK_HEADER + + left = len - avio_tell(pb); + if (left < FLAC_STREAMINFO_SIZE) + return AVERROR_INVALIDDATA; + + codec_config->extradata = av_malloc(left); + if (!codec_config->extradata) + return AVERROR(ENOMEM); + + codec_config->extradata_size = avio_read(pb, codec_config->extradata, left); + if (codec_config->extradata_size < left) + return AVERROR_INVALIDDATA; + + codec_config->sample_rate = AV_RB24(codec_config->extradata + 10) >> 4; + + return 0; +} + +static int ipcm_decoder_config(AVFormatContext *s, AVIOContext *pb, int len, + IAMFCodecConfig *codec_config) +{ + static const enum AVSampleFormat sample_fmt[2][3] = { + { AV_CODEC_ID_PCM_S16BE, AV_CODEC_ID_PCM_S24BE, AV_CODEC_ID_PCM_S32BE }, + { AV_CODEC_ID_PCM_S16LE, AV_CODEC_ID_PCM_S24LE, AV_CODEC_ID_PCM_S32LE }, + }; + int sample_format = avio_r8(pb); // 0 = BE, 1 = LE + int sample_size = (avio_r8(pb) / 8 - 2); // 16, 24, 32 + if (sample_format > 1 || sample_size > 2) + return AVERROR_INVALIDDATA; + + codec_config->codec_id = sample_fmt[sample_format][sample_size]; + codec_config->sample_rate = avio_rb32(pb); + + if (len - avio_tell(pb)) + return AVERROR_INVALIDDATA; + + return 0; +} + +static int codec_config_obu(AVFormatContext *s, int len) +{ + IAMFDemuxContext *const c = s->priv_data; + IAMFCodecConfig *codec_config = NULL; + FFIOContext b; + AVIOContext *pb; + uint8_t *buf; + enum AVCodecID avcodec_id; + unsigned codec_config_id, nb_samples, codec_id; + int16_t seek_preroll; + int ret; + + buf = av_malloc(len); + if (!buf) + return AVERROR(ENOMEM); + + ret = avio_read(s->pb, buf, len); + if (ret != len) { + if (ret >= 0) + ret = AVERROR_INVALIDDATA; + goto fail; + } + + ffio_init_context(&b, buf, len, 0, NULL, NULL, NULL, NULL); + pb = &b.pub; + + ret = leb(pb, &codec_config_id); + if (ret < 0) + goto fail; + + codec_id = avio_rb32(pb); + ret = leb(pb, &nb_samples); + if (ret < 0) + goto fail; + + seek_preroll = avio_rb16(pb); + + switch(codec_id) { + case MKBETAG('O','p','u','s'): + avcodec_id = AV_CODEC_ID_OPUS; + break; + case MKBETAG('m','p','4','a'): + avcodec_id = AV_CODEC_ID_AAC; + break; + case MKBETAG('f','L','a','C'): + avcodec_id = AV_CODEC_ID_FLAC; + break; + default: + avcodec_id = AV_CODEC_ID_NONE; + break; + } + + for (int i = 0; i < c->nb_codec_configs; i++) + if (c->codec_configs[i].codec_config_id == codec_config_id) { + ret = AVERROR_INVALIDDATA; + goto fail; + } + + codec_config = av_dynarray2_add_nofree((void **)&c->codec_configs, &c->nb_codec_configs, + sizeof(*c->codec_configs), NULL); + if (!codec_config) { + ret = AVERROR(ENOMEM); + goto fail; + } + + memset(codec_config, 0, sizeof(*codec_config)); + + codec_config->codec_config_id = codec_config_id; + codec_config->codec_id = avcodec_id; + codec_config->nb_samples = nb_samples; + codec_config->seek_preroll = seek_preroll; + + switch(codec_id) { + case MKBETAG('O','p','u','s'): + ret = opus_decoder_config(s, pb, len, codec_config); + break; + case MKBETAG('m','p','4','a'): + ret = aac_decoder_config(s, pb, len, codec_config); + break; + case MKBETAG('f','L','a','C'): + ret = flac_decoder_config(s, pb, len, codec_config); + break; + case MKBETAG('i','p','c','m'): + ret = ipcm_decoder_config(s, pb, len, codec_config); + break; + default: + break; + } + if (ret < 0) + goto fail; + + len -= avio_tell(pb); + if (len) { + int level = (s->error_recognition & AV_EF_EXPLODE) ? AV_LOG_ERROR : AV_LOG_WARNING; + av_log(s, level, "Underread in codec_config_obu. %d bytes left at the end\n", len); + } + + ret = 0; +fail: + av_free(buf); + return ret; +} + +static int update_extradata(AVFormatContext *s, AVStream *st) +{ + GetBitContext gb; + PutBitContext pb; + int ret; + + switch(st->codecpar->codec_id) { + case AV_CODEC_ID_OPUS: + AV_WB8(st->codecpar->extradata + 9, st->codecpar->ch_layout.nb_channels); + break; + case AV_CODEC_ID_AAC: { + uint8_t buf[5]; + + init_put_bits(&pb, buf, sizeof(buf)); + ret = init_get_bits8(&gb, st->codecpar->extradata, st->codecpar->extradata_size); + if (ret < 0) + return ret; + + ret = get_bits(&gb, 5); + put_bits(&pb, 5, ret); + if (ret == AOT_ESCAPE) // violates section 3.11.2, but better check for it + put_bits(&pb, 6, get_bits(&gb, 6)); + ret = get_bits(&gb, 4); + put_bits(&pb, 4, ret); + if (ret == 0x0f) + put_bits(&pb, 24, get_bits(&gb, 24)); + + skip_bits(&gb, 4); + put_bits(&pb, 4, st->codecpar->ch_layout.nb_channels); // set channel config + ret = put_bits_left(&pb); + put_bits(&pb, ret, get_bits(&gb, ret)); + flush_put_bits(&pb); + + memcpy(st->codecpar->extradata, buf, sizeof(buf)); + break; + } + case AV_CODEC_ID_FLAC: { + uint8_t buf[13]; + + init_put_bits(&pb, buf, sizeof(buf)); + ret = init_get_bits8(&gb, st->codecpar->extradata, st->codecpar->extradata_size); + if (ret < 0) + return ret; + + put_bits32(&pb, get_bits_long(&gb, 32)); // min/max blocksize + put_bits64(&pb, 48, get_bits64(&gb, 48)); // min/max framesize + put_bits(&pb, 20, get_bits(&gb, 20)); // samplerate + skip_bits(&gb, 3); + put_bits(&pb, 3, st->codecpar->ch_layout.nb_channels - 1); + ret = put_bits_left(&pb); + put_bits(&pb, ret, get_bits(&gb, ret)); + flush_put_bits(&pb); + + memcpy(st->codecpar->extradata, buf, sizeof(buf)); + break; + } + } + + return 0; +} + +static int scalable_channel_layout_config(AVFormatContext *s, AVIOContext *pb, + IAMFAudioElement *audio_element, + const IAMFCodecConfig *codec_config) +{ + AVStreamGroup *stg = audio_element->stream_group; + int num_layers, k = 0; + + num_layers = avio_r8(pb) >> 5; // get_bits(&gb, 3); + // skip_bits(&gb, 5); //reserved + + if (num_layers > 6) + return AVERROR_INVALIDDATA; + + for (int i = 0; i < num_layers; i++) { + AVIAMFLayer *layer; + int loudspeaker_layout, output_gain_is_present_flag; + int coupled_substream_count; + int ret, byte = avio_r8(pb); + + ret = avformat_iamf_audio_element_add_layer(stg->params.iamf_audio_element, NULL); + if (ret < 0) + return ret; + + loudspeaker_layout = byte >> 4; // get_bits(&gb, 4); + output_gain_is_present_flag = (byte >> 3) & 1; //get_bits1(&gb); + layer = stg->params.iamf_audio_element->layers[i]; + layer->recon_gain_is_present = (byte >> 2) & 1; + layer->substream_count = avio_r8(pb); + coupled_substream_count = avio_r8(pb); + + if (output_gain_is_present_flag) { + layer->output_gain_flags = avio_r8(pb) >> 2; // get_bits(&gb, 6); + layer->output_gain = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + } + + if (loudspeaker_layout < 10) + av_channel_layout_copy(&layer->ch_layout, &ff_iamf_scalable_ch_layouts[loudspeaker_layout]); + else + layer->ch_layout = (AVChannelLayout){ .order = AV_CHANNEL_ORDER_UNSPEC, + .nb_channels = layer->substream_count + + coupled_substream_count }; + + for (int j = 0; j < layer->substream_count; j++) { + AVStream *st = audio_element->audio_substreams[k++]; + + ret = avformat_stream_group_add_stream(stg, st); + if (ret < 0) + return ret; + + st->codecpar->ch_layout = coupled_substream_count-- > 0 ? (AVChannelLayout)AV_CHANNEL_LAYOUT_STEREO : + (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO; + + ret = update_extradata(s, st); + if (ret < 0) + return ret; + + avpriv_set_pts_info(st, 64, 1, st->codecpar->sample_rate); + } + + } + + return 0; +} + +static int ambisonics_config(AVFormatContext *s, AVIOContext *pb, + IAMFAudioElement *audio_element, + const IAMFCodecConfig *codec_config) +{ + AVStreamGroup *stg = audio_element->stream_group; + AVIAMFLayer *layer; + unsigned ambisonics_mode; + int output_channel_count, substream_count, order; + int ret; + + ret = leb(pb, &ambisonics_mode); + if (ret < 0) + return ret; + + if (ambisonics_mode > 1) + return 0; + + output_channel_count = avio_r8(pb); // C + substream_count = avio_r8(pb); // N + if (audio_element->num_substreams != substream_count) + return AVERROR_INVALIDDATA; + + order = floor(sqrt(output_channel_count - 1)); + /* incomplete order - some harmonics are missing */ + if ((order + 1) * (order + 1) != output_channel_count) + return AVERROR_INVALIDDATA; + + ret = avformat_iamf_audio_element_add_layer(stg->params.iamf_audio_element, NULL); + if (ret < 0) + return ret; + + layer = stg->params.iamf_audio_element->layers[0]; + layer->ambisonics_mode = ambisonics_mode; + layer->substream_count = substream_count; + if (ambisonics_mode == 0) { + for (int i = 0; i < substream_count; i++) { + AVStream *st = audio_element->audio_substreams[i]; + + st->codecpar->ch_layout = (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO; + + ret = avformat_stream_group_add_stream(stg, st); + if (ret < 0) + return ret; + + ret = update_extradata(s, st); + if (ret < 0) + return ret; + + avpriv_set_pts_info(st, 64, 1, st->codecpar->sample_rate); + } + + layer->ch_layout.order = AV_CHANNEL_ORDER_CUSTOM; + layer->ch_layout.nb_channels = output_channel_count; + layer->ch_layout.u.map = av_calloc(output_channel_count, sizeof(*layer->ch_layout.u.map)); + if (!layer->ch_layout.u.map) + return AVERROR(ENOMEM); + + for (int i = 0; i < output_channel_count; i++) + layer->ch_layout.u.map[i].id = avio_r8(pb) + AV_CHAN_AMBISONIC_BASE; + + } else { + int coupled_substream_count = avio_r8(pb); // M + int nb_demixing_matrix = substream_count + coupled_substream_count; + int demixing_matrix_size = nb_demixing_matrix * output_channel_count; + + layer->ch_layout = (AVChannelLayout){ .order = AV_CHANNEL_ORDER_AMBISONIC, .nb_channels = output_channel_count }; + layer->demixing_matrix = av_malloc_array(demixing_matrix_size, sizeof(*layer->demixing_matrix)); + if (!layer->demixing_matrix) + return AVERROR(ENOMEM); + + for (int i = 0; i < demixing_matrix_size; i++) + layer->demixing_matrix[i] = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + + for (int i = 0; i < substream_count; i++) { + AVStream *st = audio_element->audio_substreams[i]; + + st->codecpar->ch_layout = coupled_substream_count-- > 0 ? (AVChannelLayout)AV_CHANNEL_LAYOUT_STEREO : + (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO; + + ret = avformat_stream_group_add_stream(stg, st); + if (ret < 0) + return ret; + + ret = update_extradata(s, st); + if (ret < 0) + return ret; + + avpriv_set_pts_info(st, 64, 1, st->codecpar->sample_rate); + } + } + + return 0; +} + +static int param_parse(AVFormatContext *s, AVIOContext *pb, + unsigned int param_definition_type, + const AVIAMFAudioElement *audio_element, + AVIAMFParamDefinition **out_param_definition) +{ + IAMFDemuxContext *const c = s->priv_data; + IAMFParamDefinition *param_definition; + const IAMFParamDefinition *old_param = NULL; + unsigned int parameter_id, parameter_rate, param_definition_mode; + unsigned int duration, constant_subblock_duration, num_subblocks = 0; + int nb_param_definitions = c->nb_param_definitions, ret; + + ret = leb(pb, &parameter_id); + if (ret < 0) + return ret; + + for (int i = 0; i < c->nb_param_definitions; i++) + if (c->param_definitions[i].param->parameter_id == parameter_id) { + old_param = param_definition = &c->param_definitions[i]; + break; + } + + if (!old_param) { + param_definition = av_dynarray2_add_nofree((void **)&c->param_definitions, &nb_param_definitions, + sizeof(*c->param_definitions), NULL); + if (!param_definition) + return AVERROR(ENOMEM); + + memset(param_definition, 0, sizeof(*param_definition)); + } + + ret = leb(pb, &parameter_rate); + if (ret < 0) + return ret; + + param_definition_mode = avio_r8(pb) >> 7; + + if (old_param && (param_definition_mode != old_param->param->param_definition_mode || + param_definition_type != old_param->param->param_definition_type)) { + av_log(s, AV_LOG_ERROR, "Inconsistent param_definition_mode or param_definition_type values " + "for parameter_id %d\n", parameter_id); + return AVERROR_INVALIDDATA; + } + + if (param_definition_mode == 0) { + ret = leb(pb, &duration); + if (ret < 0) + return ret; + + ret = leb(pb, &constant_subblock_duration); + if (ret < 0) + return ret; + + if (constant_subblock_duration == 0) { + ret = leb(pb, &num_subblocks); + if (ret < 0) + return ret; + } else + num_subblocks = duration / constant_subblock_duration; + } + + if (old_param) { + if (num_subblocks != old_param->param->num_subblocks) { + av_log(s, AV_LOG_ERROR, "Inconsistent num_subblocks values for parameter_id %d\n", parameter_id); + return AVERROR_INVALIDDATA; + } + } else { + param_definition->param = avformat_iamf_param_definition_alloc(param_definition_type, NULL, num_subblocks, NULL, NULL); + if (!param_definition->param) + return AVERROR(ENOMEM); + param_definition->audio_element = audio_element; + } + + for (int i = 0; i < num_subblocks; i++) { + void *subblock = avformat_iamf_param_definition_get_subblock(param_definition->param, i); + unsigned int subblock_duration = constant_subblock_duration; + + if (constant_subblock_duration == 0) { + ret = leb(pb, &subblock_duration); + if (ret < 0) { + if (!old_param) + av_freep(&param_definition->param); + return ret; + } + } + + switch (param_definition_type) { + case AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN: { + AVIAMFMixGainParameterData *mix = subblock; + mix->subblock_duration = subblock_duration; + break; + } + case AV_IAMF_PARAMETER_DEFINITION_DEMIXING: { + AVIAMFDemixingInfoParameterData *demix = subblock; + demix->subblock_duration = subblock_duration; + // DemixingInfoParameterData + demix->dmixp_mode = avio_r8(pb) >> 5; + break; + } + case AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN: { + AVIAMFReconGainParameterData *recon = subblock; + recon->subblock_duration = subblock_duration; + break; + } + default: + if (!old_param) + av_freep(&param_definition->param); + return AVERROR_INVALIDDATA; + } + } + + param_definition->param->parameter_id = parameter_id; + param_definition->param->parameter_rate = parameter_rate; + param_definition->param->param_definition_mode = param_definition_mode; + param_definition->param->duration = duration; + param_definition->param->constant_subblock_duration = constant_subblock_duration; + param_definition->param->num_subblocks = num_subblocks; + + av_assert0(out_param_definition); + *out_param_definition = param_definition->param; + + if (!old_param) + c->nb_param_definitions = nb_param_definitions; + + return 0; +} + +static int audio_element_obu(AVFormatContext *s, int len) +{ + IAMFDemuxContext *const c = s->priv_data; + const IAMFCodecConfig *codec_config = NULL; + AVIAMFAudioElement *avaudio_element; + IAMFAudioElement *audio_element; + FFIOContext b; + AVIOContext *pb; + uint8_t *buf; + unsigned audio_element_id, codec_config_id, num_substreams, num_parameters; + int audio_element_type, ret; + + buf = av_malloc(len); + if (!buf) + return AVERROR(ENOMEM); + + ret = avio_read(s->pb, buf, len); + if (ret != len) { + if (ret >= 0) + ret = AVERROR_INVALIDDATA; + goto fail; + } + + ffio_init_context(&b, buf, len, 0, NULL, NULL, NULL, NULL); + pb = &b.pub; + + ret = leb(pb, &audio_element_id); + if (ret < 0) + goto fail; + + for (int i = 0; i < c->nb_audio_elements; i++) + if (c->audio_elements[i].stream_group->id == audio_element_id) { + av_log(s, AV_LOG_ERROR, "Duplicate audio_element_id %d\n", audio_element_id); + ret = AVERROR_INVALIDDATA; + goto fail; + } + + audio_element_type = avio_r8(pb) >> 5; + + ret = leb(pb, &codec_config_id); + if (ret < 0) + goto fail; + + for (int i = 0; i < c->nb_codec_configs; i++) { + if (c->codec_configs[i].codec_config_id == codec_config_id) { + codec_config = &c->codec_configs[i]; + break; + } + } + + if (!codec_config) { + av_log(s, AV_LOG_ERROR, "Non existant codec config id %d referenced in an audio element\n", codec_config_id); + ret = AVERROR_INVALIDDATA; + goto fail; + } + + if (codec_config->codec_id == AV_CODEC_ID_NONE) { + av_log(s, AV_LOG_DEBUG, "Unknown codec id referenced in an audio element. Ignoring\n"); + ret = 0; + goto fail; + } + + ret = leb(pb, &num_substreams); + if (ret < 0) + goto fail; + + audio_element = av_dynarray2_add_nofree((void **)&c->audio_elements, &c->nb_audio_elements, + sizeof(*c->audio_elements), NULL); + if (!audio_element) { + ret = AVERROR(ENOMEM); + goto fail; + } + + memset(audio_element, 0, sizeof(*audio_element)); + + audio_element->audio_substreams = av_calloc(num_substreams, sizeof(*audio_element->audio_substreams)); + if (!audio_element->audio_substreams) { + ret = AVERROR(ENOMEM); + goto fail; + } + + audio_element->stream_group = avformat_stream_group_create(s, AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT); + if (!audio_element->stream_group) + return AVERROR(ENOMEM); + audio_element->stream_group->id = audio_element_id; + avaudio_element = audio_element->stream_group->params.iamf_audio_element; + avaudio_element->codec_config_id = codec_config_id; + avaudio_element->audio_element_type = audio_element_type; + + audio_element->num_substreams = num_substreams; + + for (int i = 0; i < num_substreams; i++) { + AVStream *st = audio_element->audio_substreams[i] = avformat_new_stream(s, NULL); + unsigned audio_substream_id; + + if (!st) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = leb(pb, &audio_substream_id); + if (ret < 0) + goto fail; + + st->id = audio_substream_id; + st->codecpar->codec_type = AVMEDIA_TYPE_AUDIO; + st->codecpar->codec_id = codec_config->codec_id; + st->codecpar->frame_size = codec_config->nb_samples; + st->codecpar->sample_rate = codec_config->sample_rate; + st->codecpar->seek_preroll = codec_config->seek_preroll; + ffstream(st)->need_parsing = AVSTREAM_PARSE_HEADERS; + + switch(st->codecpar->codec_id) { + case AV_CODEC_ID_AAC: + case AV_CODEC_ID_FLAC: + case AV_CODEC_ID_OPUS: + st->codecpar->extradata = av_malloc(codec_config->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE); + if (!st->codecpar->extradata) { + ret = AVERROR(ENOMEM); + goto fail; + } + memcpy(st->codecpar->extradata, codec_config->extradata, codec_config->extradata_size); + memset(st->codecpar->extradata + codec_config->extradata_size, 0, AV_INPUT_BUFFER_PADDING_SIZE); + st->codecpar->extradata_size = codec_config->extradata_size; + break; + } + } + + ret = leb(pb, &num_parameters); + if (ret < 0) + goto fail; + + if (num_parameters && audio_element_type != 0) { + av_log(s, AV_LOG_ERROR, "Audio Element parameter count %u is invalid for Scene representations\n", num_parameters); + ret = AVERROR_INVALIDDATA; + goto fail; + } + + for (int i = 0; i < num_parameters; i++) { + unsigned param_definition_type; + + ret = leb(pb, &param_definition_type); + if (ret < 0) + goto fail; + + if (param_definition_type == 0) { + ret = AVERROR_INVALIDDATA; + goto fail; + } else if (param_definition_type == 1) { + ret = param_parse(s, pb, param_definition_type, avaudio_element, &avaudio_element->demixing_info); + if (ret < 0) + goto fail; + + avaudio_element->default_w = avio_r8(pb) >> 4; + } else if (param_definition_type == 2) { + ret = param_parse(s, pb, param_definition_type, avaudio_element, &avaudio_element->recon_gain_info); + if (ret < 0) + goto fail; + } else { + unsigned param_definition_size; + ret = leb(pb, &param_definition_size); + if (ret < 0) + goto fail; + + avio_skip(pb, param_definition_size); + } + } + + if (audio_element_type == AV_IAMF_AUDIO_ELEMENT_TYPE_CHANNEL) { + ret = scalable_channel_layout_config(s, pb, audio_element, codec_config); + if (ret < 0) + goto fail; + } else if (audio_element_type == AV_IAMF_AUDIO_ELEMENT_TYPE_SCENE) { + ret = ambisonics_config(s, pb, audio_element, codec_config); + if (ret < 0) + goto fail; + } else { + unsigned audio_element_config_size; + ret = leb(pb, &audio_element_config_size); + if (ret < 0) + goto fail; + } + + len -= avio_tell(pb); + if (len) { + int level = (s->error_recognition & AV_EF_EXPLODE) ? AV_LOG_ERROR : AV_LOG_WARNING; + av_log(s, level, "Underread in audio_element_obu. %d bytes left at the end\n", len); + } + + ret = 0; +fail: + av_free(buf); + + return ret; +} + +static int label_string(AVFormatContext *s, AVIOContext *pb, char **label) +{ + uint8_t buf[128]; + + avio_get_str(pb, sizeof(buf), buf, sizeof(buf)); + + if (pb->error) + return pb->error; + if (pb->eof_reached) + return AVERROR_INVALIDDATA; + *label = av_strdup(buf); + if (!*label) + return AVERROR(ENOMEM); + + return 0; +} + +static int mix_presentation_obu(AVFormatContext *s, int len) +{ + IAMFDemuxContext *const c = s->priv_data; + AVIAMFMixPresentation *mix_presentation; + IAMFMixPresentation *mixi; + FFIOContext b; + AVIOContext *pb; + uint8_t *buf; + unsigned mix_presentation_id; + int ret; + + buf = av_malloc(len); + if (!buf) + return AVERROR(ENOMEM); + + ret = avio_read(s->pb, buf, len); + if (ret != len) { + if (ret >= 0) + ret = AVERROR_INVALIDDATA; + goto fail; + } + + ffio_init_context(&b, buf, len, 0, NULL, NULL, NULL, NULL); + pb = &b.pub; + + ret = leb(pb, &mix_presentation_id); + if (ret < 0) + goto fail; + + for (int i = 0; i < c->nb_mix_presentations; i++) + if (c->mix_presentations[i].stream_group->id == mix_presentation_id) { + av_log(s, AV_LOG_ERROR, "Duplicate mix_presentation_id %d\n", mix_presentation_id); + ret = AVERROR_INVALIDDATA; + goto fail; + } + + mixi = av_dynarray2_add_nofree((void **)&c->mix_presentations, &c->nb_mix_presentations, + sizeof(*c->mix_presentations), NULL); + if (!mixi) { + ret = AVERROR(ENOMEM); + goto fail; + } + + memset(mixi, 0, sizeof(*mixi)); + mixi->stream_group = avformat_stream_group_create(s, AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION); + if (!mixi->stream_group) { + ret = AVERROR(ENOMEM); + goto fail; + } + + mixi->stream_group->id = mix_presentation_id; + + mix_presentation = mixi->stream_group->params.iamf_mix_presentation; + + ret = leb(pb, &mixi->count_label); + if (ret < 0) + goto fail; + + mixi->language_label = av_calloc(mixi->count_label, sizeof(*mixi->language_label)); + if (!mixi->language_label) { + ret = AVERROR(ENOMEM); + goto fail; + } + + for (int i = 0; i < mixi->count_label; i++) { + ret = label_string(s, pb, &mixi->language_label[i]); + if (ret < 0) + goto fail; + } + + for (int i = 0; i < mixi->count_label; i++) { + char *annotation = NULL; + ret = label_string(s, pb, &annotation); + if (ret < 0) + goto fail; + ret = av_dict_set(&mix_presentation->annotations, mixi->language_label[i], annotation, + AV_DICT_DONT_STRDUP_VAL | AV_DICT_DONT_OVERWRITE); + if (ret < 0) + goto fail; + } + + ret = leb(pb, &mix_presentation->num_sub_mixes); + if (ret < 0) + goto fail; + + mix_presentation->sub_mixes = av_calloc(mix_presentation->num_sub_mixes, sizeof(*mix_presentation->sub_mixes)); + if (!mix_presentation->sub_mixes) { + ret = AVERROR(ENOMEM); + goto fail; + } + + for (int i = 0; i < mix_presentation->num_sub_mixes; i++) { + AVIAMFSubmixPresentation *sub_mix; + + sub_mix = mix_presentation->sub_mixes[i] = av_mallocz(sizeof(*sub_mix)); + if (!sub_mix) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = leb(pb, &sub_mix->num_submix_elements); + if (ret < 0) + goto fail; + + sub_mix->submix_elements = av_calloc(sub_mix->num_submix_elements, sizeof(*sub_mix->submix_elements)); + if (!sub_mix->submix_elements) { + ret = AVERROR(ENOMEM); + goto fail; + } + + for (int j = 0; j < sub_mix->num_submix_elements; j++) { + AVIAMFSubmixElement *submix_element; + IAMFAudioElement *audio_element = NULL; + unsigned int audio_element_id, rendering_config_extension_size; + + submix_element = sub_mix->submix_elements[j] = av_mallocz(sizeof(*submix_element)); + if (!submix_element) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = leb(pb, &audio_element_id); + if (ret < 0) + goto fail; + + for (int k = 0; k < c->nb_audio_elements; k++) + if (c->audio_elements[k].stream_group->id == audio_element_id) { + audio_element = &c->audio_elements[k]; + submix_element->audio_element = audio_element->stream_group; + } + + if (!audio_element) { + av_log(s, AV_LOG_ERROR, "Invalid Audio Element with id %u referenced by Mix Parameters %u\n", audio_element_id, mix_presentation_id); + ret = AVERROR_INVALIDDATA; + goto fail; + } + + for (int k = 0; k < audio_element->num_substreams; k++) { + ret = avformat_stream_group_add_stream(mixi->stream_group, audio_element->audio_substreams[k]); + if (ret < 0 && ret != AVERROR(EEXIST)) + goto fail; + } + + for (int k = 0; k < mixi->count_label; k++) { + char *annotation = NULL; + ret = label_string(s, pb, &annotation); + if (ret < 0) + goto fail; + ret = av_dict_set(&submix_element->annotations, mixi->language_label[i], annotation, + AV_DICT_DONT_STRDUP_VAL | AV_DICT_DONT_OVERWRITE); + if (ret < 0) + goto fail; + } + + submix_element->headphones_rendering_mode = avio_r8(pb) >> 6; + + ret = leb(pb, &rendering_config_extension_size); + if (ret < 0) + goto fail; + avio_skip(pb, rendering_config_extension_size); + + ret = param_parse(s, pb, AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN, NULL, &submix_element->element_mix_config); + if (ret < 0) + goto fail; + submix_element->default_mix_gain = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + } + ret = param_parse(s, pb, AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN, NULL, &sub_mix->output_mix_config); + if (ret < 0) + goto fail; + sub_mix->default_mix_gain = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + + ret = leb(pb, &sub_mix->num_submix_layouts); + if (ret < 0) + goto fail; + + sub_mix->submix_layouts = av_calloc(sub_mix->num_submix_layouts, sizeof(*sub_mix->submix_layouts)); + if (!sub_mix->submix_layouts) { + ret = AVERROR(ENOMEM); + goto fail; + } + + for (int j = 0; j < sub_mix->num_submix_layouts; j++) { + AVIAMFSubmixLayout *submix_layout; + int info_type; + int byte = avio_r8(pb); + + submix_layout = sub_mix->submix_layouts[j] = av_mallocz(sizeof(*submix_layout)); + if (!submix_layout) { + ret = AVERROR(ENOMEM); + goto fail; + } + + submix_layout->layout_type = byte >> 6; + if (submix_layout->layout_type < 2 && submix_layout->layout_type > 3) { + av_log(s, AV_LOG_ERROR, "Invalid Layout type %u in a submix from Mix Presentation %u\n", submix_layout->layout_type, mix_presentation_id); + ret = AVERROR_INVALIDDATA; + goto fail; + } + if (submix_layout->layout_type == 2) { + int sound_system; + sound_system = (byte >> 2) & 0xF; + av_channel_layout_copy(&submix_layout->sound_system, &ff_iamf_sound_system_map[sound_system].layout); + } + + info_type = avio_r8(pb); + submix_layout->integrated_loudness = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + submix_layout->digital_peak = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + + if (info_type & 1) + submix_layout->true_peak = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + + if (info_type & 2) { + unsigned int num_anchored_loudness = avio_r8(pb); + + for (int k = 0; k < num_anchored_loudness; k++) { + unsigned int anchor_element = avio_r8(pb); + AVRational anchored_loudness = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + } + } + + if (info_type & 0xFC) { + unsigned int info_type_size; + ret = leb(pb, &info_type_size); + if (ret < 0) + goto fail; + + avio_skip(pb, info_type_size); + } + } + } + + len -= avio_tell(pb); + if (len) { + int level = (s->error_recognition & AV_EF_EXPLODE) ? AV_LOG_ERROR : AV_LOG_WARNING; + av_log(s, level, "Underread in mix_presentation_obu. %d bytes left at the end\n", len); + } + + ret = 0; +fail: + av_free(buf); + + return ret; +} + +static int iamf_read_header(AVFormatContext *s) +{ + IAMFDemuxContext *const c = s->priv_data; + uint8_t header[MAX_IAMF_OBU_HEADER_SIZE + AV_INPUT_BUFFER_PADDING_SIZE]; + int ret; + + while (1) { + unsigned obu_size; + enum IAMF_OBU_Type type; + int start_pos, len, size; + + if ((ret = ffio_ensure_seekback(s->pb, MAX_IAMF_OBU_HEADER_SIZE)) < 0) + return ret; + size = avio_read(s->pb, header, MAX_IAMF_OBU_HEADER_SIZE); + if (size < 0) + return size; + + len = parse_obu_header(header, size, &obu_size, &start_pos, &type, NULL, NULL); + if (len < 0) { + av_log(s, AV_LOG_ERROR, "Failed to read obu\n"); + return len; + } + + if (type >= IAMF_OBU_IA_PARAMETER_BLOCK && type < IAMF_OBU_IA_SEQUENCE_HEADER) { + avio_seek(s->pb, -size, SEEK_CUR); + break; + } + + avio_seek(s->pb, -(size - start_pos), SEEK_CUR); + switch (type) { + case IAMF_OBU_IA_CODEC_CONFIG: + ret = codec_config_obu(s, obu_size); + break; + case IAMF_OBU_IA_AUDIO_ELEMENT: + ret = audio_element_obu(s, obu_size); + break; + case IAMF_OBU_IA_MIX_PRESENTATION: + ret = mix_presentation_obu(s, obu_size); + break; + case IAMF_OBU_IA_TEMPORAL_DELIMITER: + av_freep(&c->mix); + c->mix_size = 0; + av_freep(&c->demix); + c->demix_size = 0; + av_freep(&c->recon); + c->recon_size = 0; + break; + default: { + int64_t offset = avio_skip(s->pb, obu_size); + if (offset < 0) + ret = offset; + break; + } + } + if (ret < 0) + return ret; + } + + return 0; +} + +static AVStream *find_stream_by_id(AVFormatContext *s, int id) +{ + for (int i = 0; i < s->nb_streams; i++) + if (s->streams[i]->id == id) + return s->streams[i]; + + av_log(s, AV_LOG_ERROR, "Invalid stream id %d\n", id); + return NULL; +} + +static int audio_frame_obu(AVFormatContext *s, AVPacket *pkt, int len, + enum IAMF_OBU_Type type, + unsigned skip_samples, unsigned discard_padding, + int id_in_bitstream) +{ + const IAMFDemuxContext *const c = s->priv_data; + AVStream *st; + int ret, audio_substream_id; + + if (id_in_bitstream) { + unsigned explicit_audio_substream_id; + ret = leb(s->pb, &explicit_audio_substream_id); + if (ret < 0) + return ret; + len -= ret; + audio_substream_id = explicit_audio_substream_id; + } else + audio_substream_id = type - IAMF_OBU_IA_AUDIO_FRAME_ID0; + + st = find_stream_by_id(s, audio_substream_id); + if (!st) + return AVERROR_INVALIDDATA; + + ret = av_get_packet(s->pb, pkt, len); + if (ret < 0) + return ret; + if (ret != len) + return AVERROR_INVALIDDATA; + + if (skip_samples || discard_padding) { + uint8_t *side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_SKIP_SAMPLES, 10); + if (!side_data) + return AVERROR(ENOMEM); + AV_WL32(side_data, skip_samples); + AV_WL32(side_data + 4, discard_padding); + } + if (c->mix) { + uint8_t *side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_IAMF_MIX_GAIN_PARAM, c->mix_size); + if (!side_data) + return AVERROR(ENOMEM); + memcpy(side_data, c->mix, c->mix_size); + } + if (c->demix) { + uint8_t *side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_IAMF_DEMIXING_INFO_PARAM, c->demix_size); + if (!side_data) + return AVERROR(ENOMEM); + memcpy(side_data, c->demix, c->demix_size); + } + if (c->recon) { + uint8_t *side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_IAMF_RECON_GAIN_INFO_PARAM, c->recon_size); + if (!side_data) + return AVERROR(ENOMEM); + memcpy(side_data, c->recon, c->recon_size); + } + + pkt->stream_index = st->index; + return 0; +} + +static const IAMFParamDefinition *get_param_definition(AVFormatContext *s, unsigned int parameter_id) +{ + const IAMFDemuxContext *const c = s->priv_data; + const IAMFParamDefinition *param_definition = NULL; + + for (int i = 0; i < c->nb_param_definitions; i++) + if (c->param_definitions[i].param->parameter_id == parameter_id) { + param_definition = &c->param_definitions[i]; + break; + } + + return param_definition; +} + +static int parameter_block_obu(AVFormatContext *s, int len) +{ + IAMFDemuxContext *const c = s->priv_data; + const IAMFParamDefinition *param_definition; + const AVIAMFParamDefinition *param; + AVIAMFParamDefinition *out_param = NULL; + FFIOContext b; + AVIOContext *pb; + uint8_t *buf; + unsigned int duration, constant_subblock_duration; + unsigned int num_subblocks; + unsigned int parameter_id; + size_t out_param_size; + int ret; + + buf = av_malloc(len); + if (!buf) + return AVERROR(ENOMEM); + + ret = avio_read(s->pb, buf, len); + if (ret != len) { + if (ret >= 0) + ret = AVERROR_INVALIDDATA; + goto fail; + } + + ffio_init_context(&b, buf, len, 0, NULL, NULL, NULL, NULL); + pb = &b.pub; + + ret = leb(pb, &parameter_id); + if (ret < 0) + goto fail; + + param_definition = get_param_definition(s, parameter_id); + if (!param_definition) { + ret = 0; + goto fail; + } + + param = param_definition->param; + if (param->param_definition_mode) { + ret = leb(pb, &duration); + if (ret < 0) + goto fail; + + ret = leb(pb, &constant_subblock_duration); + if (ret < 0) + goto fail; + + if (constant_subblock_duration == 0) { + ret = leb(pb, &num_subblocks); + if (ret < 0) + goto fail; + } else + num_subblocks = duration / constant_subblock_duration; + } else { + duration = param->duration; + constant_subblock_duration = param->constant_subblock_duration; + num_subblocks = param->num_subblocks; + if (!num_subblocks) + num_subblocks = duration / constant_subblock_duration; + } + + out_param = avformat_iamf_param_definition_alloc(param->param_definition_type, NULL, num_subblocks, NULL, &out_param_size); + if (!out_param) { + ret = AVERROR(ENOMEM); + goto fail; + } + + out_param->parameter_id = param->parameter_id; + out_param->param_definition_type = param->param_definition_type; + out_param->parameter_rate = param->parameter_rate; + out_param->param_definition_mode = param->param_definition_mode; + out_param->duration = duration; + out_param->constant_subblock_duration = constant_subblock_duration; + out_param->num_subblocks = num_subblocks; + + for (int i = 0; i < num_subblocks; i++) { + void *subblock = avformat_iamf_param_definition_get_subblock(out_param, i); + unsigned int subblock_duration; + + if (param->param_definition_mode && !constant_subblock_duration) { + ret = leb(pb, &subblock_duration); + if (ret < 0) + goto fail; + } else { + switch (param->param_definition_type) { + case AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN: + subblock_duration = ((AVIAMFMixGainParameterData *)subblock)->subblock_duration; + break; + case AV_IAMF_PARAMETER_DEFINITION_DEMIXING: + subblock_duration = ((AVIAMFDemixingInfoParameterData *)subblock)->subblock_duration; + break; + case AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN: + subblock_duration = ((AVIAMFReconGainParameterData *)subblock)->subblock_duration; + break; + default: + return AVERROR_INVALIDDATA; + } + } + + switch (param->param_definition_type) { + case AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN: { + AVIAMFMixGainParameterData *mix = subblock; + + ret = leb(pb, &mix->animation_type); + if (ret < 0) + goto fail; + + if (mix->animation_type > AV_IAMF_ANIMATION_TYPE_BEZIER) { + ret = 0; + av_free(out_param); + goto fail; + } + + mix->start_point_value = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + if (mix->animation_type >= AV_IAMF_ANIMATION_TYPE_LINEAR) { + mix->end_point_value = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + } + if (mix->animation_type == AV_IAMF_ANIMATION_TYPE_BEZIER) { + mix->control_point_value = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + mix->control_point_relative_time = avio_r8(pb); + } + mix->subblock_duration = subblock_duration; + break; + } + case AV_IAMF_PARAMETER_DEFINITION_DEMIXING: { + AVIAMFDemixingInfoParameterData *demix = subblock; + + demix->dmixp_mode = avio_r8(pb) >> 5; + demix->subblock_duration = subblock_duration; + break; + } + case AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN: { + AVIAMFReconGainParameterData *recon = subblock; + const AVIAMFAudioElement *audio_element = param_definition->audio_element; + + av_assert0(audio_element); + for (int i = 0; i < audio_element->num_layers; i++) { + const AVIAMFLayer *layer = audio_element->layers[i]; + if (layer->recon_gain_is_present) { + unsigned int recon_gain_flags, bitcount; + ret = leb(pb, &recon_gain_flags); + if (ret < 0) + goto fail; + + bitcount = 7 + 5 * !!(recon_gain_flags & 0x80); + recon_gain_flags = (recon_gain_flags & 0x7F) | ((recon_gain_flags & 0xFF00) >> 1); + for (int j = 0; j < bitcount; j++) { + if (recon_gain_flags & (1 << j)) + recon->recon_gain[i][j] = avio_r8(pb); + } + } + } + recon->subblock_duration = subblock_duration; + break; + } + default: { + unsigned parameter_data_size; + ret = leb(pb, &parameter_data_size); + if (ret < 0) + goto fail; + + avio_skip(pb, parameter_data_size); + break; + } + } + } + + len -= avio_tell(pb); + if (len) { + int level = (s->error_recognition & AV_EF_EXPLODE) ? AV_LOG_ERROR : AV_LOG_WARNING; + av_log(s, level, "Underread in parameter_block_obu. %d bytes left at the end\n", len); + } + + switch (param->param_definition_type) { + case AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN: + av_free(c->mix); + c->mix = out_param; + c->mix_size = out_param_size; + break; + case AV_IAMF_PARAMETER_DEFINITION_DEMIXING: + av_free(c->demix); + c->demix = out_param; + c->demix_size = out_param_size; + break; + case AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN: // TODO + ret = 0; + av_free(out_param); + goto fail; + default: + return AVERROR_INVALIDDATA; + } + + ret = 0; +fail: + if (ret < 0) + av_free(out_param); + av_free(buf); + + return ret; +} + +static int iamf_read_packet(AVFormatContext *s, AVPacket *pkt) +{ + uint8_t header[MAX_IAMF_OBU_HEADER_SIZE + AV_INPUT_BUFFER_PADDING_SIZE]; + unsigned obu_size; + int ret; + + while (1) { + enum IAMF_OBU_Type type; + unsigned skip_samples, discard_padding; + int len, size, start_pos; + + if ((ret = ffio_ensure_seekback(s->pb, MAX_IAMF_OBU_HEADER_SIZE)) < 0) + return ret; + size = avio_read(s->pb, header, MAX_IAMF_OBU_HEADER_SIZE); + if (size < 0) + return size; + + len = parse_obu_header(header, size, &obu_size, &start_pos, &type, + &skip_samples, &discard_padding); + if (len < 0) { + av_log(s, AV_LOG_ERROR, "Failed to read obu\n"); + return len; + } + avio_seek(s->pb, -(size - start_pos), SEEK_CUR); + + if (type == IAMF_OBU_IA_AUDIO_FRAME) + return audio_frame_obu(s, pkt, obu_size, type, + skip_samples, discard_padding, 1); + else if (type >= IAMF_OBU_IA_AUDIO_FRAME_ID0 && type <= IAMF_OBU_IA_AUDIO_FRAME_ID17) + return audio_frame_obu(s, pkt, obu_size, type, + skip_samples, discard_padding, 0); + else if (type == IAMF_OBU_IA_PARAMETER_BLOCK) { + ret = parameter_block_obu(s, obu_size); + if (ret < 0) + return ret; + } else { + int64_t offset = avio_skip(s->pb, obu_size); + if (offset < 0) + ret = offset; + break; + } + } + + return ret; +} + +static int iamf_read_close(AVFormatContext *s) +{ + IAMFDemuxContext *const c = s->priv_data; + + for (int i = 0; i < c->nb_codec_configs; i++) + av_free(c->codec_configs[i].extradata); + av_freep(&c->codec_configs); + c->nb_codec_configs = 0; + + for (int i = 0; i < c->nb_audio_elements; i++) + av_free(c->audio_elements[i].audio_substreams); + av_freep(&c->audio_elements); + c->nb_audio_elements = 0; + + for (int i = 0; i < c->nb_mix_presentations; i++) { + for (int j = 0; j < c->mix_presentations[i].count_label; j++) + av_free(c->mix_presentations[i].language_label[j]); + av_free(c->mix_presentations[i].language_label); + } + av_freep(&c->mix_presentations); + c->nb_mix_presentations = 0; + + av_freep(&c->param_definitions); + c->nb_param_definitions = 0; + + av_freep(&c->mix); + c->mix_size = 0; + av_freep(&c->demix); + c->demix_size = 0; + av_freep(&c->recon); + c->recon_size = 0; + return 0; +} + +const AVInputFormat ff_iamf_demuxer = { + .name = "iamf", + .long_name = NULL_IF_CONFIG_SMALL("Raw Immersive Audio Model and Formats"), + .priv_data_size = sizeof(IAMFDemuxContext), + .flags_internal = FF_FMT_INIT_CLEANUP, + .read_probe = iamf_probe, + .read_header = iamf_read_header, + .read_packet = iamf_read_packet, + .read_close = iamf_read_close, + .extensions = "iamf", + .flags = AVFMT_GENERIC_INDEX | AVFMT_NO_BYTE_SEEK | AVFMT_NOTIMESTAMPS | AVFMT_SHOW_IDS, +}; diff --git a/libavformat/options.c b/libavformat/options.c index b3998bae43..aa16704e88 100644 --- a/libavformat/options.c +++ b/libavformat/options.c @@ -20,6 +20,8 @@ #include "avformat.h" #include "avio_internal.h" #include "demux.h" +#include "iamf.h" +#include "iamf_internal.h" #include "internal.h" #include "libavcodec/avcodec.h" @@ -326,10 +328,49 @@ fail: return NULL; } +static void *stream_group_child_next(void *obj, void *prev) +{ + AVStreamGroup *stg = obj; + if (!prev) { + switch(stg->type) { + case AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT: + return stg->params.iamf_audio_element; + case AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION: + return stg->params.iamf_mix_presentation; + default: + break; + } + } + return NULL; +} + +static const AVClass *stream_group_child_iterate(void **opaque) +{ + uintptr_t i = (uintptr_t)*opaque; + const AVClass *ret = NULL; + + switch(i) { + case AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT: + ret = &ff_iamf_ae_class; + break; + case AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION: + ret = &ff_iamf_mp_class; + break; + default: + break; + } + + if (ret) + *opaque = (void*)(i + 1); + return ret; +} + static const AVClass stream_group_class = { .class_name = "AVStreamGroup", .item_name = av_default_item_name, .version = LIBAVUTIL_VERSION_INT, + .child_next = stream_group_child_next, + .child_class_iterate = stream_group_child_iterate, }; const AVClass *av_stream_group_get_class(void) @@ -358,7 +399,16 @@ AVStreamGroup *avformat_stream_group_create(AVFormatContext *s, stg->av_class = &stream_group_class; stg->type = type; switch (type) { - // Structs in the union are allocated here + case AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT: + stg->params.iamf_audio_element = avformat_iamf_audio_element_alloc(); + if (!stg->params.iamf_audio_element) + goto fail; + break; + case AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION: + stg->params.iamf_mix_presentation = avformat_iamf_mix_presentation_alloc(); + if (!stg->params.iamf_mix_presentation) + goto fail; + break; default: break; }

[FFmpeg-devel,v3,3/3,WIP,RFC] avformat: Immersive Audio Model and Formats demuxer

Checks

Commit Message

Patch