[FFmpeg-devel,3/2,WIP,RFC] avformat: Immersive Audio Model and Formats demuxer

Message ID	20231023172527.4460-2-jamrial@gmail.com
State	New
Headers	show Delivered-To: ffmpegpatchwork2@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; From: James Almer <jamrial@gmail.com> To: ffmpeg-devel@ffmpeg.org Date: Mon, 23 Oct 2023 14:25:27 -0300 Message-ID: <20231023172527.4460-2-jamrial@gmail.com> In-Reply-To: <20231023172527.4460-1-jamrial@gmail.com> References: <20231023140851.2087-1-jamrial@gmail.com> <20231023172527.4460-1-jamrial@gmail.com> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 3/2] [WIP][RFC]avformat: Immersive Audio Model and Formats demuxer Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	[FFmpeg-devel,v2] avformat: introduce AVStreamGroup \| expand [FFmpeg-devel,v2] avformat: introduce AVStreamGroup [FFmpeg-devel,2/2] avutil/mem: add av_dynarray2_add_nofree [FFmpeg-devel,3/2,WIP,RFC] avformat: Immersive Audio Model and Formats demuxer

Context	Check	Description
yinshiyou/make_loongarch64	success	Make finished
yinshiyou/make_fate_loongarch64	success	Make fate finished
andriy/make_x86	success	Make finished
andriy/make_fate_x86	success	Make fate finished

diff --git a/libavcodec/avpacket.c b/libavcodec/avpacket.c index e29725c2d2..0f8c9b77ae 100644 --- a/libavcodec/avpacket.c +++ b/libavcodec/avpacket.c @@ -301,6 +301,9 @@ const char *av_packet_side_data_name(enum AVPacketSideDataType type) case AV_PKT_DATA_DOVI_CONF: return "DOVI configuration record"; case AV_PKT_DATA_S12M_TIMECODE: return "SMPTE ST 12-1:2014 timecode"; case AV_PKT_DATA_DYNAMIC_HDR10_PLUS: return "HDR10+ Dynamic Metadata (SMPTE 2094-40)"; + case AV_PKT_DATA_IAMF_MIX_GAIN_PARAM: return "IAMF Mix Gain Parameter Data"; + case AV_PKT_DATA_IAMF_DEMIXING_INFO_PARAM: return "IAMF Demixing Info Parameter Data"; + case AV_PKT_DATA_IAMF_RECON_GAIN_INFO_PARAM: return "IAMF Recon Gain Info Parameter Data"; } return NULL; } diff --git a/libavcodec/packet.h b/libavcodec/packet.h index b19409b719..2c57d262c6 100644 --- a/libavcodec/packet.h +++ b/libavcodec/packet.h @@ -299,6 +299,30 @@ enum AVPacketSideDataType { */ AV_PKT_DATA_DYNAMIC_HDR10_PLUS, + /** + * IAMF Mix Gain Parameter Data associated with the audio frame. This metadata + * is in the form of the AVIAMFParamDefinition struct and contains information + * defined in sections 3.6.1 and 3.8.1 of the Immersive Audio Model and + * Formats standard. + */ + AV_PKT_DATA_IAMF_MIX_GAIN_PARAM, + + /** + * IAMF Demixing Info Parameter Data associated with the audio frame. This + * metadata is in the form of the AVIAMFParamDefinition struct and contains + * information defined in sections 3.6.1 and 3.8.2 of the Immersive Audio Model + * and Formats standard. + */ + AV_PKT_DATA_IAMF_DEMIXING_INFO_PARAM, + + /** + * IAMF Recon Gain Info Parameter Data associated with the audio frame. This + * metadata is in the form of the AVIAMFParamDefinition struct and contains + * information defined in sections 3.6.1 and 3.8.3 of the Immersive Audio Model + * and Formats standard. + */ + AV_PKT_DATA_IAMF_RECON_GAIN_INFO_PARAM, + /** * The number of side data types. 
* This is not part of the public API/ABI in the sense that it may diff --git a/libavformat/Makefile b/libavformat/Makefile index 329055ccfd..364bc417a3 100644 --- a/libavformat/Makefile +++ b/libavformat/Makefile @@ -3,6 +3,7 @@ DESC = FFmpeg container format library HEADERS = avformat.h \ avio.h \ + iamf.h \ version.h \ version_major.h \ @@ -258,6 +259,7 @@ OBJS-$(CONFIG_EVC_MUXER) += rawenc.o OBJS-$(CONFIG_HLS_DEMUXER) += hls.o hls_sample_encryption.o OBJS-$(CONFIG_HLS_MUXER) += hlsenc.o hlsplaylist.o avc.o OBJS-$(CONFIG_HNM_DEMUXER) += hnm.o +OBJS-$(CONFIG_IAMF_DEMUXER) += iamfdec.o iamf.o OBJS-$(CONFIG_ICO_DEMUXER) += icodec.o OBJS-$(CONFIG_ICO_MUXER) += icoenc.o OBJS-$(CONFIG_IDCIN_DEMUXER) += idcin.o diff --git a/libavformat/allformats.c b/libavformat/allformats.c index d4b505a5a3..63ca44bacd 100644 --- a/libavformat/allformats.c +++ b/libavformat/allformats.c @@ -212,6 +212,7 @@ extern const FFOutputFormat ff_hevc_muxer; extern const AVInputFormat ff_hls_demuxer; extern const FFOutputFormat ff_hls_muxer; extern const AVInputFormat ff_hnm_demuxer; +extern const AVInputFormat ff_iamf_demuxer; extern const AVInputFormat ff_ico_demuxer; extern const FFOutputFormat ff_ico_muxer; extern const AVInputFormat ff_idcin_demuxer; diff --git a/libavformat/avformat.c b/libavformat/avformat.c index 99cda56c2f..caba3308c2 100644 --- a/libavformat/avformat.c +++ b/libavformat/avformat.c @@ -37,6 +37,7 @@ #include "avformat.h" #include "avio.h" #include "demux.h" +#include "iamf.h" #include "mux.h" #include "internal.h" @@ -90,7 +91,14 @@ void ff_free_stream_group(AVStreamGroup **pstg) av_freep(&stg->streams); av_freep(&stg->priv_data); switch (stg->type) { - // Structs in the union are freed here + case AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT: { + avformat_iamf_audio_element_free(&stg->params.iamf_audio_element); + break; + } + case AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION: { + avformat_iamf_mix_presentation_free(&stg->params.iamf_mix_presentation); + break; + } 
default: break; } diff --git a/libavformat/avformat.h b/libavformat/avformat.h index f045084c8d..455e07333e 100644 --- a/libavformat/avformat.h +++ b/libavformat/avformat.h @@ -1020,8 +1020,13 @@ typedef struct AVStream { enum AVStreamGroupParamsType { AV_STREAM_GROUP_PARAMS_NONE, + AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT, + AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION, }; +struct AVIAMFAudioElement; +struct AVIAMFMixPresentation; + typedef struct AVStreamGroup { /** * A class for @ref avoptions. Set on group creation. @@ -1055,7 +1060,8 @@ typedef struct AVStreamGroup { * Group-specific type parameters */ union { - uintptr_t dummy; // Placeholder + struct AVIAMFAudioElement *iamf_audio_element; + struct AVIAMFMixPresentation *iamf_mix_presentation; } params; /** diff --git a/libavformat/dump.c b/libavformat/dump.c index c0868a1bb3..f48afdf679 100644 --- a/libavformat/dump.c +++ b/libavformat/dump.c @@ -38,6 +38,7 @@ #include "libavcodec/avcodec.h" #include "avformat.h" +#include "iamf.h" #include "internal.h" #define HEXDUMP_PRINT(...) \ @@ -509,7 +510,7 @@ static void dump_sidedata(void *ctx, const AVStream *st, const char *indent) /* "user interface" functions */ static void dump_stream_format(const AVFormatContext *ic, int i, - int index, int is_output) + int group_index, int index, int is_output) { char buf[256]; int flags = (is_output ? ic->oformat->flags : ic->iformat->flags); @@ -517,6 +518,8 @@ static void dump_stream_format(const AVFormatContext *ic, int i, const FFStream *const sti = cffstream(st); const AVDictionaryEntry *lang = av_dict_get(st->metadata, "language", NULL, 0); const char *separator = ic->dump_separator; + const char *group_indent = group_index >= 0 ? " " : ""; + const char *extra_indent = group_index >= 0 ? 
" " : " "; AVCodecContext *avctx; int ret; @@ -543,7 +546,10 @@ static void dump_stream_format(const AVFormatContext *ic, int i, avcodec_string(buf, sizeof(buf), avctx, is_output); avcodec_free_context(&avctx); - av_log(NULL, AV_LOG_INFO, " Stream #%d:%d", index, i); + av_log(NULL, AV_LOG_INFO, "%s Stream #%d", group_indent, index); + if (group_index >= 0) + av_log(NULL, AV_LOG_INFO, ":%d", group_index); + av_log(NULL, AV_LOG_INFO, ":%d", i); /* the pid is an important information, so we display it */ /* XXX: add a generic system */ @@ -621,9 +627,61 @@ static void dump_stream_format(const AVFormatContext *ic, int i, av_log(NULL, AV_LOG_INFO, " (non-diegetic)"); av_log(NULL, AV_LOG_INFO, "\n"); - dump_metadata(NULL, st->metadata, " "); + dump_metadata(NULL, st->metadata, extra_indent); - dump_sidedata(NULL, st, " "); + dump_sidedata(NULL, st, extra_indent); +} + +static void dump_stream_group(const AVFormatContext *ic, uint8_t *printed, + int i, int index, int is_output) +{ + const AVStreamGroup *stg = ic->stream_groups[i]; + char buf[512]; + int ret; + + av_log(NULL, AV_LOG_INFO, " Stream group #%d:%d:", index, i); + + switch (stg->type) { + case AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT: { + AVIAMFAudioElement *iamf = stg->params.iamf_audio_element; + int substream_count = 0; + av_log(NULL, AV_LOG_INFO, " IAMF Audio Element\n"); + for (int j = 0; j < iamf->num_layers; j++) { + AVIAMFLayer *layer = iamf->layers[j]; + substream_count += layer->substream_count; + av_log(NULL, AV_LOG_INFO, " Layer %d:", j); + ret = av_channel_layout_describe(&layer->ch_layout, buf, sizeof(buf)); + if (ret >= 0) + av_log(NULL, AV_LOG_INFO, " %s", buf); + av_log(NULL, AV_LOG_INFO, "\n"); + for (int k = 0; k < substream_count && k < stg->nb_streams; k++) { + dump_stream_format(ic, stg->streams[k]->index, i, index, is_output); + printed[stg->streams[k]->index] = 1; + } + } + break; + } + case AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION: { + AVIAMFMixPresentation *mix_presentation = 
stg->params.iamf_mix_presentation; + av_log(NULL, AV_LOG_INFO, " IAMF Mix Presentation\n"); + for (int j = 0; j < mix_presentation->num_sub_mixes; j++) { + AVIAMFSubmixPresentation *sub_mix = mix_presentation->sub_mixes[j]; + av_log(NULL, AV_LOG_INFO, " Submix %d:\n", j); + for (int k = 0; k < sub_mix->num_submix_layouts; k++) { + AVIAMFSubmixLayout *submix_layout = sub_mix->submix_layouts[k]; + av_log(NULL, AV_LOG_INFO, " Layout %d", k); + if (submix_layout->layout_type == 2) { + ret = av_channel_layout_describe(&submix_layout->sound_system, buf, sizeof(buf)); + if (ret >= 0) + av_log(NULL, AV_LOG_INFO, " %s", buf); + } else if (submix_layout->layout_type == 3) + av_log(NULL, AV_LOG_INFO, " Binaural"); + av_log(NULL, AV_LOG_INFO, "\n"); + } + } + break; + } + } } void av_dump_format(AVFormatContext *ic, int index, @@ -699,7 +757,7 @@ void av_dump_format(AVFormatContext *ic, int index, dump_metadata(NULL, program->metadata, " "); for (k = 0; k < program->nb_stream_indexes; k++) { dump_stream_format(ic, program->stream_index[k], - index, is_output); + -1, index, is_output); printed[program->stream_index[k]] = 1; } total += program->nb_stream_indexes; @@ -708,9 +766,12 @@ void av_dump_format(AVFormatContext *ic, int index, av_log(NULL, AV_LOG_INFO, " No Program\n"); } + for (i = 0; i < ic->nb_stream_groups; i++) + dump_stream_group(ic, printed, i, index, is_output); + for (i = 0; i < ic->nb_streams; i++) if (!printed[i]) - dump_stream_format(ic, i, index, is_output); + dump_stream_format(ic, i, -1, index, is_output); av_free(printed); } diff --git a/libavformat/iamf.c b/libavformat/iamf.c new file mode 100644 index 0000000000..2d6f84a073 --- /dev/null +++ b/libavformat/iamf.c @@ -0,0 +1,336 @@ +/* + * Immersive Audio Model and Formats helper functions and defines + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <limits.h> +#include <stddef.h> +#include <stdint.h> + +#include "libavutil/error.h" +#include "libavutil/mem.h" + +#include "iamf.h" +#include "iamf_internal.h" + +const AVChannelLayout ff_iamf_scalable_ch_layouts[10] = { + AV_CHANNEL_LAYOUT_MONO, + AV_CHANNEL_LAYOUT_STEREO, + // "Loudspeaker configuration for Sound System B" + AV_CHANNEL_LAYOUT_5POINT1_BACK, + // "Loudspeaker configuration for Sound System C" + AV_CHANNEL_LAYOUT_7POINT1_TOP_BACK, + // "Loudspeaker configuration for Sound System D" + { + .nb_channels = 10, + .order = AV_CHANNEL_ORDER_NATIVE, + .u.mask = AV_CH_LAYOUT_7POINT1_TOP_BACK | AV_CH_TOP_BACK_LEFT | AV_CH_TOP_BACK_RIGHT, + }, + // "Loudspeaker configuration for Sound System I" + AV_CHANNEL_LAYOUT_7POINT1, + // "Loudspeaker configuration for Sound System I" + Ltf + Rtf + { + .nb_channels = 10, + .order = AV_CHANNEL_ORDER_NATIVE, + .u.mask = AV_CH_LAYOUT_7POINT1 | AV_CH_TOP_FRONT_LEFT | AV_CH_TOP_FRONT_RIGHT, + }, + // "Loudspeaker configuration for Sound System J" + { + .nb_channels = 12, + .order = AV_CHANNEL_ORDER_NATIVE, + .u.mask = AV_CH_LAYOUT_7POINT1 | AV_CH_TOP_FRONT_LEFT | AV_CH_TOP_FRONT_RIGHT | + AV_CH_TOP_BACK_LEFT | AV_CH_TOP_BACK_RIGHT, + }, + // Front subset of "Loudspeaker configuration 
for Sound System J" + { + .nb_channels = 6, + .order = AV_CHANNEL_ORDER_NATIVE, + .u.mask = AV_CH_LAYOUT_3POINT1 | AV_CH_TOP_FRONT_LEFT | AV_CH_TOP_FRONT_RIGHT, + }, + // Binaural + AV_CHANNEL_LAYOUT_STEREO, +}; + +const struct IAMFSoundSystemMap ff_iamf_sound_system_map[13] = { + { SOUND_SYSTEM_A_0_2_0, AV_CHANNEL_LAYOUT_STEREO }, + { SOUND_SYSTEM_B_0_5_0, AV_CHANNEL_LAYOUT_5POINT1_BACK }, + { SOUND_SYSTEM_C_2_5_0, AV_CHANNEL_LAYOUT_7POINT1_TOP_BACK }, + { SOUND_SYSTEM_D_4_5_0, + { + .nb_channels = 10, + .order = AV_CHANNEL_ORDER_NATIVE, + .u.mask = AV_CH_LAYOUT_7POINT1_TOP_BACK | AV_CH_TOP_BACK_LEFT | AV_CH_TOP_BACK_RIGHT, + }, + }, + { SOUND_SYSTEM_E_4_5_1, + { + .nb_channels = 11, + .order = AV_CHANNEL_ORDER_NATIVE, + .u.mask = AV_CH_LAYOUT_7POINT1_TOP_BACK | AV_CH_TOP_BACK_LEFT | AV_CH_TOP_BACK_RIGHT | AV_CH_BOTTOM_FRONT_CENTER, + }, + }, + { SOUND_SYSTEM_F_3_7_0, + { + .nb_channels = 12, + .order = AV_CHANNEL_ORDER_NATIVE, + .u.mask = AV_CH_LAYOUT_7POINT1 | AV_CH_TOP_FRONT_LEFT | AV_CH_TOP_FRONT_RIGHT | AV_CH_TOP_BACK_CENTER | AV_CH_LOW_FREQUENCY_2, + }, + }, + { SOUND_SYSTEM_G_4_9_0, + { + .nb_channels = 14, + .order = AV_CHANNEL_ORDER_NATIVE, + .u.mask = AV_CH_LAYOUT_7POINT1 | AV_CH_TOP_FRONT_LEFT | AV_CH_TOP_FRONT_RIGHT | AV_CH_TOP_BACK_LEFT | AV_CH_TOP_BACK_RIGHT | + AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER, + }, + }, + { SOUND_SYSTEM_H_9_10_3, AV_CHANNEL_LAYOUT_22POINT2 }, + { SOUND_SYSTEM_I_0_7_0, AV_CHANNEL_LAYOUT_7POINT1 }, + { SOUND_SYSTEM_J_4_7_0, + { + .nb_channels = 12, + .order = AV_CHANNEL_ORDER_NATIVE, + .u.mask = AV_CH_LAYOUT_7POINT1 | AV_CH_TOP_FRONT_LEFT | AV_CH_TOP_FRONT_RIGHT | + AV_CH_TOP_BACK_LEFT | AV_CH_TOP_BACK_RIGHT, + }, + }, + { SOUND_SYSTEM_10_2_7_0, + { + .nb_channels = 10, + .order = AV_CHANNEL_ORDER_NATIVE, + .u.mask = AV_CH_LAYOUT_7POINT1 | AV_CH_TOP_FRONT_LEFT | AV_CH_TOP_FRONT_RIGHT, + }, + }, + { SOUND_SYSTEM_11_2_3_0, + { + .nb_channels = 6, + .order = AV_CHANNEL_ORDER_NATIVE, + .u.mask = 
AV_CH_LAYOUT_3POINT1 | AV_CH_TOP_FRONT_LEFT | AV_CH_TOP_FRONT_RIGHT, + }, + }, + { SOUND_SYSTEM_12_0_1_0, AV_CHANNEL_LAYOUT_MONO }, +}; + +AVIAMFAudioElement *avformat_iamf_audio_element_alloc(void) +{ + return av_mallocz(sizeof(AVIAMFAudioElement)); +} + +int avformat_iamf_audio_element_add_layer(AVIAMFAudioElement *audio_element) +{ + AVIAMFLayer **layers; + + if (audio_element->num_layers == UINT_MAX) + return AVERROR(EINVAL); + + layers = av_realloc_array(audio_element->layers, audio_element->num_layers + 1, + sizeof(*audio_element->layers)); + if (!layers) + return AVERROR(ENOMEM); + + audio_element->layers = layers; + + audio_element->layers[audio_element->num_layers] = av_mallocz(sizeof(AVIAMFLayer)); + if (!audio_element->layers[audio_element->num_layers]) + return AVERROR(ENOMEM); + + av_channel_layout_uninit(&audio_element->layers[audio_element->num_layers++]->ch_layout); + + return 0; +} + +void avformat_iamf_audio_element_free(AVIAMFAudioElement **paudio_element) +{ + AVIAMFAudioElement *audio_element = *paudio_element; + + if (!audio_element) + return; + /* Free every layer (its channel layout and demixing matrix first), then the array of layer pointers. */ + for (int i = 0; i < audio_element->num_layers; i++) { + AVIAMFLayer *layer = audio_element->layers[i]; + av_channel_layout_uninit(&layer->ch_layout); + av_free(layer->demixing_matrix); + av_free(layer); + } + av_free(audio_element->layers); + /* Free the parameter definitions, then the element itself; av_freep() also resets *paudio_element to NULL. */ + av_free(audio_element->demixing_info); + av_free(audio_element->recon_gain_info); + av_freep(paudio_element); +} + +AVIAMFMixPresentation *avformat_iamf_mix_presentation_alloc(void) +{ + return av_mallocz(sizeof(AVIAMFMixPresentation)); +} + +int avformat_iamf_mix_presentation_add_submix(AVIAMFMixPresentation *mix_presentation, + unsigned int num_submix_elements, + unsigned int num_submix_layouts) +{ + AVIAMFSubmixPresentation **sub_mixes, *sub_mix; + + if (mix_presentation->num_sub_mixes == UINT_MAX) + return AVERROR(EINVAL); + + sub_mixes = av_realloc_array(mix_presentation->sub_mixes, mix_presentation->num_sub_mixes + 1, + sizeof(*mix_presentation->sub_mixes)); + if 
(!sub_mixes) + return AVERROR(ENOMEM); + + mix_presentation->sub_mixes = sub_mixes; + + sub_mix = av_mallocz(sizeof(*sub_mix)); + if (!sub_mix) + return AVERROR(ENOMEM); + + sub_mix->submix_elements = av_calloc(num_submix_elements, sizeof(*sub_mix->submix_elements)); + if (!sub_mix->submix_elements) + goto fail; + + sub_mix->submix_layouts = av_calloc(num_submix_layouts, sizeof(*sub_mix->submix_layouts)); + if (!sub_mix->submix_layouts) + goto fail; + + for (int i = 0; i < num_submix_elements; i++) { + AVIAMFSubmixElement *submix_element = av_mallocz(sizeof(*submix_element)); + if (!submix_element) + goto fail; + + sub_mix->submix_elements[sub_mix->num_submix_elements++] = submix_element; + } + + for (int i = 0; i < num_submix_layouts; i++) { + AVIAMFSubmixLayout *submix_layout = av_mallocz(sizeof(*submix_layout)); + if (!submix_layout) + goto fail; + + av_channel_layout_uninit(&submix_layout->sound_system); + sub_mix->submix_layouts[sub_mix->num_submix_layouts++] = submix_layout; + } + + mix_presentation->sub_mixes[mix_presentation->num_sub_mixes++] = sub_mix; + + return 0; +fail: + for (int i = 0; i < sub_mix->num_submix_elements; i++) { + av_free(sub_mix->submix_elements[i]->element_mix_config); + av_free(sub_mix->submix_elements[i]); + } + for (int i = 0; i < sub_mix->num_submix_layouts; i++) + av_free(sub_mix->submix_layouts[i]); + av_free(sub_mix->submix_elements); + av_free(sub_mix->submix_layouts); + av_free(sub_mix); + + return AVERROR(ENOMEM); +} + +void avformat_iamf_mix_presentation_free(AVIAMFMixPresentation **pmix_presentation) +{ + AVIAMFMixPresentation *mix_presentation = *pmix_presentation; + + if (!mix_presentation) + return; + /* Per sub-mix: free each element (annotations, mix config), each layout, then the sub-mix's own arrays and config. */ + for (int i = 0; i < mix_presentation->num_sub_mixes; i++) { + AVIAMFSubmixPresentation *sub_mix = mix_presentation->sub_mixes[i]; + for (int j = 0; j < sub_mix->num_submix_elements; j++) { + AVIAMFSubmixElement *submix_element = sub_mix->submix_elements[j]; + for (int k = 0; k < mix_presentation->count_label; k++) + 
av_free(submix_element->mix_presentation_element_annotations[k]); + av_free(submix_element->mix_presentation_element_annotations); + av_free(submix_element->element_mix_config); + av_free(submix_element); + } + av_free(sub_mix->submix_elements); + for (int j = 0; j < sub_mix->num_submix_layouts; j++) { + AVIAMFSubmixLayout *submix_layout = sub_mix->submix_layouts[j]; + av_channel_layout_uninit(&submix_layout->sound_system); + av_free(submix_layout); + } + av_free(sub_mix->submix_layouts); + av_free(sub_mix->output_mix_config); + av_free(sub_mix); + } + for (int i = 0; i < mix_presentation->count_label; i++) { + av_free(mix_presentation->language_label[i]); + av_free(mix_presentation->mix_presentation_annotations[i]); + } + av_free(mix_presentation->sub_mixes); + av_free(mix_presentation->language_label); + av_free(mix_presentation->mix_presentation_annotations); + /* av_freep() also resets *pmix_presentation to NULL. */ + av_freep(pmix_presentation); +} + +AVIAMFParamDefinition *avformat_iamf_param_definition_alloc(enum AVIAMFParamDefinitionType type, + unsigned int num_subblocks, size_t *out_size) +{ + + struct MixGainStruct { + AVIAMFParamDefinition p; + AVIAMFMixGainParameterData m; + }; + struct DemixStruct { + AVIAMFParamDefinition p; + AVIAMFDemixingInfoParameterData d; + }; + struct ReconGainStruct { + AVIAMFParamDefinition p; + AVIAMFReconGainParameterData r; + }; + size_t subblocks_offset, subblock_size; + size_t size; + AVIAMFParamDefinition *par; + + switch (type) { + case AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN: + subblocks_offset = offsetof(struct MixGainStruct, m); + subblock_size = sizeof(AVIAMFMixGainParameterData); + break; + case AV_IAMF_PARAMETER_DEFINITION_DEMIXING: + subblocks_offset = offsetof(struct DemixStruct, d); + subblock_size = sizeof(AVIAMFDemixingInfoParameterData); + break; + case AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN: + subblocks_offset = offsetof(struct ReconGainStruct, r); + subblock_size = sizeof(AVIAMFReconGainParameterData); + break; + default: + return NULL; + } + + size = subblocks_offset; 
+ if (num_subblocks > (SIZE_MAX - size) / subblock_size) + return NULL; + size += subblock_size * num_subblocks; + + par = av_mallocz(size); + if (!par) + return NULL; + + par->param_definition_type = type; + par->num_subblocks = num_subblocks; + par->subblock_size = subblock_size; + par->subblocks_offset = subblocks_offset; + if (out_size) + *out_size = size; + + return par; +} diff --git a/libavformat/iamf.h b/libavformat/iamf.h new file mode 100644 index 0000000000..2743fd0c07 --- /dev/null +++ b/libavformat/iamf.h @@ -0,0 +1,228 @@ +/* + * Immersive Audio Model and Formats helper functions and defines + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFORMAT_IAMF_H +#define AVFORMAT_IAMF_H + +#include <stdint.h> +#include <stddef.h> + +#include "libavutil/attributes.h" +#include "libavutil/avassert.h" +#include "libavutil/channel_layout.h" + +struct AVStreamGroup; + +enum AVIAMFAudioElementType { + AV_IAMF_AUDIO_ELEMENT_TYPE_CHANNEL, + AV_IAMF_AUDIO_ELEMENT_TYPE_SCENE, +}; + +enum AVIAMFParamDefinitionType { + AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN, + AV_IAMF_PARAMETER_DEFINITION_DEMIXING, + AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN, +}; + +/** + * Parameters as defined in 3.6.1 + */ +typedef struct AVIAMFParamDefinition { + size_t subblocks_offset; + size_t subblock_size; + unsigned int parameter_id; + enum AVIAMFParamDefinitionType param_definition_type; + unsigned int parameter_rate; + unsigned int param_definition_mode; + unsigned int duration; + unsigned int constant_subblock_duration; + unsigned int num_subblocks; +} AVIAMFParamDefinition; + +AVIAMFParamDefinition *avformat_iamf_param_definition_alloc(enum AVIAMFParamDefinitionType param_definition_type, + unsigned int num_subblocks, size_t *size); + +/** + * Get the subblock at the specified @p idx. Must be between 0 and num_subblocks - 1 (asserted). + * The @ref AVIAMFParamDefinition.param_definition_type "param definition type" defines + * the struct type of the returned pointer. 
+ */ +static av_always_inline void* +avformat_iamf_param_definition_get_subblock(AVIAMFParamDefinition *par, unsigned int idx) +{ + av_assert0(idx < par->num_subblocks); + return (void *)((uint8_t *)par + par->subblocks_offset + idx * par->subblock_size); +} + +enum AVIAMFAnimationType { + AV_IAMF_ANIMATION_TYPE_STEP, + AV_IAMF_ANIMATION_TYPE_LINEAR, + AV_IAMF_ANIMATION_TYPE_BEZIER, +}; + +/** + * Mix Gain Parameter Data as defined in 3.8.1 + * + * Subblocks in AVIAMFParamDefinition use this struct when the value of + * @ref AVIAMFParamDefinition.param_definition_type param_definition_type is + * AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN. + */ +typedef struct AVIAMFMixGainParameterData { + unsigned int subblock_duration; + enum AVIAMFAnimationType animation_type; + int start_point_value; + int end_point_value; + int control_point_value; + unsigned int control_point_relative_time; +} AVIAMFMixGainParameterData; + +/** + * Demixing Info Parameter Data as defined in 3.8.2 + * + * Subblocks in AVIAMFParamDefinition use this struct when the value of + * @ref AVIAMFParamDefinition.param_definition_type param_definition_type is + * AV_IAMF_PARAMETER_DEFINITION_DEMIXING. + */ +typedef struct AVIAMFDemixingInfoParameterData { + unsigned int subblock_duration; + unsigned int dmixp_mode; +} AVIAMFDemixingInfoParameterData; + +/** + * Recon Gain Info Parameter Data as defined in 3.8.3 + * + * Subblocks in AVIAMFParamDefinition use this struct when the value of + * @ref AVIAMFParamDefinition.param_definition_type param_definition_type is + * AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN. 
+ */ +typedef struct AVIAMFReconGainParameterData { + unsigned int subblock_duration; +} AVIAMFReconGainParameterData; + +typedef struct AVIAMFLayer { + AVChannelLayout ch_layout; + unsigned int substream_count; + + unsigned int recon_gain_is_present; + /** + * Output gain flags as defined in 3.6.2 + * + * This field is defined only if audio_element_type is + * AV_IAMF_AUDIO_ELEMENT_TYPE_CHANNEL, must be 0 otherwise. + */ + unsigned int output_gain_flags; + /** + * Output gain as defined in 3.6.2 + * + * Must be 0 if @ref output_gain_flags is 0. + */ + int output_gain; + /** + * Ambisonics mode as defined in 3.6.3 + * + * This field is defined only if audio_element_type is + * AV_IAMF_AUDIO_ELEMENT_TYPE_SCENE, must be 0 otherwise. + * + * If 0, channel_mapping is defined implicitly (Ambisonic Order) + * or explicitly (Custom Order with ambi channels) in @ref ch_layout. + * If 1, @ref demixing_matrix must be set. + */ + unsigned int ambisonics_mode; + /** + * Demixing matrix as defined in 3.6.3 + * + * Set only if @ref ambisonics_mode == 1, must be NULL otherwise. 
+ */ + int16_t *demixing_matrix; +} AVIAMFLayer; + +typedef struct AVIAMFAudioElement { + AVIAMFLayer **layers; + unsigned int num_layers; + + unsigned int codec_config_id; + + AVIAMFParamDefinition *demixing_info; + AVIAMFParamDefinition *recon_gain_info; + + /** + * Audio element type as defined in 3.6 + */ + enum AVIAMFAudioElementType audio_element_type; + + /** + * Default weight value as defined in 3.6 + */ + unsigned int default_w; +} AVIAMFAudioElement; + +AVIAMFAudioElement *avformat_iamf_audio_element_alloc(void); + +int avformat_iamf_audio_element_add_layer(AVIAMFAudioElement *audio_element); + +void avformat_iamf_audio_element_free(AVIAMFAudioElement **audio_element); + +typedef struct AVIAMFSubmixElement { + const struct AVStreamGroup *audio_element; + + char **mix_presentation_element_annotations; + + unsigned int headphones_rendering_mode; + AVIAMFParamDefinition *element_mix_config; + int default_mix_gain; +} AVIAMFSubmixElement; + +typedef struct AVIAMFSubmixLayout { + unsigned int layout_type; + AVChannelLayout sound_system; + + int integrated_loudness; + int digital_peak; +} AVIAMFSubmixLayout; + +typedef struct AVIAMFSubmixPresentation { + AVIAMFSubmixElement **submix_elements; + unsigned int num_submix_elements; + + AVIAMFSubmixLayout **submix_layouts; + unsigned int num_submix_layouts; + + AVIAMFParamDefinition *output_mix_config; + int default_mix_gain; +} AVIAMFSubmixPresentation; + +typedef struct AVIAMFMixPresentation { + AVIAMFSubmixPresentation **sub_mixes; + unsigned int num_sub_mixes; + + unsigned int count_label; + char **language_label; + char **mix_presentation_annotations; +} AVIAMFMixPresentation; + +AVIAMFMixPresentation *avformat_iamf_mix_presentation_alloc(void); + +int avformat_iamf_mix_presentation_add_submix(AVIAMFMixPresentation *mix_presentation, + unsigned int num_submix_elements, + unsigned int num_submix_layouts); + +void avformat_iamf_mix_presentation_free(AVIAMFMixPresentation **mix_presentation); + +#endif /* 
AVFORMAT_IAMF_H */ diff --git a/libavformat/iamf_internal.h b/libavformat/iamf_internal.h new file mode 100644 index 0000000000..e4cfbcae33 --- /dev/null +++ b/libavformat/iamf_internal.h @@ -0,0 +1,86 @@ +/* + * Immersive Audio Model and Formats helper functions and defines + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFORMAT_IAMF_INTERNAL_H +#define AVFORMAT_IAMF_INTERNAL_H + +#include <stdint.h> + +#include "libavutil/channel_layout.h" + +#define MAX_IAMF_OBU_HEADER_SIZE (1 + 8 * 3) + +// OBU types (section 3.2). +enum IAMF_OBU_Type { + // Values 0 to 23 are all assigned; 0 is not reserved. 
+ IAMF_OBU_IA_CODEC_CONFIG = 0, + IAMF_OBU_IA_AUDIO_ELEMENT = 1, + IAMF_OBU_IA_MIX_PRESENTATION = 2, + IAMF_OBU_IA_PARAMETER_BLOCK = 3, + IAMF_OBU_IA_TEMPORAL_DELIMITER = 4, + IAMF_OBU_IA_AUDIO_FRAME = 5, + IAMF_OBU_IA_AUDIO_FRAME_ID0 = 6, + IAMF_OBU_IA_AUDIO_FRAME_ID1 = 7, + IAMF_OBU_IA_AUDIO_FRAME_ID2 = 8, + IAMF_OBU_IA_AUDIO_FRAME_ID3 = 9, + IAMF_OBU_IA_AUDIO_FRAME_ID4 = 10, + IAMF_OBU_IA_AUDIO_FRAME_ID5 = 11, + IAMF_OBU_IA_AUDIO_FRAME_ID6 = 12, + IAMF_OBU_IA_AUDIO_FRAME_ID7 = 13, + IAMF_OBU_IA_AUDIO_FRAME_ID8 = 14, + IAMF_OBU_IA_AUDIO_FRAME_ID9 = 15, + IAMF_OBU_IA_AUDIO_FRAME_ID10 = 16, + IAMF_OBU_IA_AUDIO_FRAME_ID11 = 17, + IAMF_OBU_IA_AUDIO_FRAME_ID12 = 18, + IAMF_OBU_IA_AUDIO_FRAME_ID13 = 19, + IAMF_OBU_IA_AUDIO_FRAME_ID14 = 20, + IAMF_OBU_IA_AUDIO_FRAME_ID15 = 21, + IAMF_OBU_IA_AUDIO_FRAME_ID16 = 22, + IAMF_OBU_IA_AUDIO_FRAME_ID17 = 23, + // 24~30 reserved. + IAMF_OBU_IA_SEQUENCE_HEADER = 31, +}; + +enum IAMF_Sound_System { + SOUND_SYSTEM_A_0_2_0 = 0, // "Loudspeaker configuration for Sound System A" + SOUND_SYSTEM_B_0_5_0 = 1, // "Loudspeaker configuration for Sound System B" + SOUND_SYSTEM_C_2_5_0 = 2, // "Loudspeaker configuration for Sound System C" + SOUND_SYSTEM_D_4_5_0 = 3, // "Loudspeaker configuration for Sound System D" + SOUND_SYSTEM_E_4_5_1 = 4, // "Loudspeaker configuration for Sound System E" + SOUND_SYSTEM_F_3_7_0 = 5, // "Loudspeaker configuration for Sound System F" + SOUND_SYSTEM_G_4_9_0 = 6, // "Loudspeaker configuration for Sound System G" + SOUND_SYSTEM_H_9_10_3 = 7, // "Loudspeaker configuration for Sound System H" + SOUND_SYSTEM_I_0_7_0 = 8, // "Loudspeaker configuration for Sound System I" + SOUND_SYSTEM_J_4_7_0 = 9, // "Loudspeaker configuration for Sound System J" + SOUND_SYSTEM_10_2_7_0 = 10, // "Loudspeaker configuration for Sound System I" + Ltf + Rtf + SOUND_SYSTEM_11_2_3_0 = 11, // Front subset of "Loudspeaker configuration for Sound System J" + SOUND_SYSTEM_12_0_1_0 = 12, // Mono +}; + +extern const AVChannelLayout 
ff_iamf_scalable_ch_layouts[10]; + +struct IAMFSoundSystemMap { + enum IAMF_Sound_System id; + AVChannelLayout layout; +}; + +extern const struct IAMFSoundSystemMap ff_iamf_sound_system_map[13]; + +#endif /* AVFORMAT_IAMF_INTERNAL_H */ diff --git a/libavformat/iamfdec.c b/libavformat/iamfdec.c new file mode 100644 index 0000000000..799addbc8c --- /dev/null +++ b/libavformat/iamfdec.c @@ -0,0 +1,1646 @@ +/* + * Immersive Audio Model and Formats demuxer + * Copyright (c) 2023 James Almer <jamrial@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config_components.h" + +#include "libavutil/avassert.h" +#include "libavutil/common.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/opt.h" +#include "libavcodec/get_bits.h" +#include "libavcodec/flac.h" +#include "libavcodec/mpeg4audio.h" +#include "libavcodec/put_bits.h" +#include "avformat.h" +#include "avio_internal.h" +#include "demux.h" +#include "iamf.h" +#include "iamf_internal.h" +#include "internal.h" +#include "isom.h" + +typedef struct IAMFCodecConfig { + unsigned codec_config_id; + enum AVCodecID codec_id; + unsigned nb_samples; + int seek_preroll; + uint8_t *extradata; + int extradata_size; + int sample_rate; +} IAMFCodecConfig; + +typedef struct IAMFAudioElement { + AVStreamGroup *stream_group; + + AVStream **audio_substreams; + int num_substreams; +} IAMFAudioElement; + +typedef struct IAMFMixPresentation { + AVStreamGroup *stream_group; +} IAMFMixPresentation; + +typedef struct IAMFParamDefinition { + const AVIAMFAudioElement *audio_element; + AVIAMFParamDefinition *param; +} IAMFParamDefinition; + +typedef struct IAMFDemuxContext { + IAMFCodecConfig *codec_configs; + int nb_codec_configs; + IAMFAudioElement *audio_elements; + int nb_audio_elements; + IAMFMixPresentation *mix_presentations; + int nb_mix_presentations; + IAMFParamDefinition *param_definitions; + int nb_param_definitions; + + // Packet side data + AVIAMFParamDefinition *mix; + size_t mix_size; + AVIAMFParamDefinition *demix; + size_t demix_size; + AVIAMFParamDefinition *recon; + size_t recon_size; +} IAMFDemuxContext; + +static inline unsigned get_leb128(GetBitContext *gb) { + int more, i = 0; + unsigned len = 0; + + do { + unsigned bits; + int byte = get_bits(gb, 8); + more = byte & 0x80; + bits = byte & 0x7f; + if (i <= 3 || (i == 4 && 
bits < (1 << 4))) + len |= bits << (i * 7); + else if (bits) + return AVERROR_INVALIDDATA; + if (++i == 8 && more) + return AVERROR_INVALIDDATA; + } while (more); + + return len; +} + +static int parse_obu_header(const uint8_t *buf, int buf_size, + unsigned *obu_size, int *start_pos, enum IAMF_OBU_Type *type) +{ + GetBitContext gb; + int ret, extension_flag, trimming, start; + unsigned size; + + ret = init_get_bits8(&gb, buf, FFMIN(buf_size, MAX_IAMF_OBU_HEADER_SIZE)); + if (ret < 0) + return ret; + + *type = get_bits(&gb, 5); + av_log(NULL, AV_LOG_DEBUG, "OBU type %d\n", *type); + /*redundant =*/ get_bits1(&gb); + trimming = get_bits1(&gb); + extension_flag = get_bits1(&gb); + + *obu_size = get_leb128(&gb); + if (*obu_size > INT_MAX) + return AVERROR_INVALIDDATA; + + start = get_bits_count(&gb) / 8; + + if (trimming) { + get_leb128(&gb); // num_samples_to_trim_at_end + get_leb128(&gb); // num_samples_to_trim_at_start + } + + if (extension_flag) { + unsigned extension_bytes = get_leb128(&gb); + if (extension_bytes > INT_MAX / 8) + return AVERROR_INVALIDDATA; + skip_bits_long(&gb, extension_bytes * 8); + } + + if (get_bits_left(&gb) < 0) + return AVERROR_INVALIDDATA; + + size = *obu_size + start; + if (size > INT_MAX) + return AVERROR_INVALIDDATA; + + *obu_size -= get_bits_count(&gb) / 8 - start; + *start_pos = size - *obu_size; + + av_log(NULL, AV_LOG_DEBUG, "OBU size %u\n", *obu_size); + return size; +} + +//return < 0 if we need more data +static int get_score(const uint8_t *buf, int buf_size, enum IAMF_OBU_Type type, int *seq) +{ + if (type == IAMF_OBU_IA_SEQUENCE_HEADER) { + if (buf_size < 4 || AV_RB32(buf) != MKBETAG('i','a','m','f')) + return 0; + *seq = 1; + return -1; + } + if (type >= IAMF_OBU_IA_CODEC_CONFIG && type <= IAMF_OBU_IA_TEMPORAL_DELIMITER) + return *seq ? -1 : 0; + if (type >= IAMF_OBU_IA_AUDIO_FRAME && type <= IAMF_OBU_IA_AUDIO_FRAME_ID17) + return *seq ? 
AVPROBE_SCORE_EXTENSION + 1 : 0; + return 0; +} + +static int iamf_probe(const AVProbeData *p) +{ + unsigned obu_size; + enum IAMF_OBU_Type type; + int seq = 0, cnt = 0, start_pos; + int ret; + + while (1) { + int size = parse_obu_header(p->buf + cnt, p->buf_size - cnt, + &obu_size, &start_pos, &type); + if (size < 0) + return 0; + + ret = get_score(p->buf + cnt + start_pos, + p->buf_size - cnt - start_pos, + type, &seq); + if (ret >= 0) + return ret; + + cnt += FFMIN(size, p->buf_size - cnt); + } + return 0; +} + +static inline int leb(AVIOContext *pb, unsigned *len) { + int more, i = 0; + *len = 0; + + do { + unsigned bits; + int byte = avio_r8(pb); + if (pb->error) + return pb->error; + if (pb->eof_reached) + return AVERROR_INVALIDDATA; + more = byte & 0x80; + bits = byte & 0x7f; + if (i <= 3 || (i == 4 && bits < (1 << 4))) + *len |= bits << (i * 7); + else if (bits) + return AVERROR_INVALIDDATA; + if (++i == 8 && more) + return AVERROR_INVALIDDATA; + } while (more); + + return i; +} + +static int opus_decoder_config(AVFormatContext *s, AVIOContext *pb, int len, + IAMFCodecConfig *codec_config) +{ + int left = len - avio_tell(pb); + + if (left < 11) + return AVERROR_INVALIDDATA; + + codec_config->extradata = av_malloc(left + 8); + if (!codec_config->extradata) + return AVERROR(ENOMEM); + + AV_WB32(codec_config->extradata, MKBETAG('O','p','u','s')); + AV_WB32(codec_config->extradata + 4, MKBETAG('H','e','a','d')); + codec_config->extradata_size = avio_read(pb, codec_config->extradata + 8, left); + if (codec_config->extradata_size < left) + return AVERROR_INVALIDDATA; + + codec_config->extradata_size += 8; + codec_config->sample_rate = 48000; + + return 0; +} + +static int aac_decoder_config(AVFormatContext *s, AVIOContext *pb, int len, + IAMFCodecConfig *codec_config) +{ + MPEG4AudioConfig cfg = { 0 }; + int object_type_id, codec_id, stream_type; + int ret, tag, left; + + tag = avio_r8(pb); + if (tag != MP4DecConfigDescrTag) + return AVERROR_INVALIDDATA; + + 
object_type_id = avio_r8(pb); + if (object_type_id != 0x40) + return AVERROR_INVALIDDATA; + + stream_type = avio_r8(pb); + if (((stream_type >> 2) != 5) || ((stream_type >> 1) & 1)) + return AVERROR_INVALIDDATA; + + avio_skip(pb, 3); // buffer size db + avio_skip(pb, 4); // rc_max_rate + avio_skip(pb, 4); // avg bitrate + + codec_id = ff_codec_get_id(ff_mp4_obj_type, object_type_id); + if (codec_id && codec_id != codec_config->codec_id) + return AVERROR_INVALIDDATA; + + tag = avio_r8(pb); + if (tag != MP4DecSpecificDescrTag) + return AVERROR_INVALIDDATA; + + left = len - avio_tell(pb); + if (left <= 0) + return AVERROR_INVALIDDATA; + + codec_config->extradata = av_malloc(left); + if (!codec_config->extradata) + return AVERROR(ENOMEM); + + codec_config->extradata_size = avio_read(pb, codec_config->extradata, left); + if (codec_config->extradata_size < left) + return AVERROR_INVALIDDATA; + + ret = avpriv_mpeg4audio_get_config2(&cfg, codec_config->extradata, + codec_config->extradata_size, 1, s); + if (ret < 0) + return ret; + + codec_config->sample_rate = cfg.sample_rate; + + return 0; +} + +static int flac_decoder_config(AVFormatContext *s, AVIOContext *pb, int len, + IAMFCodecConfig *codec_config) +{ + int left; + + avio_skip(pb, 4); // METADATA_BLOCK_HEADER + + left = len - avio_tell(pb); + if (left < FLAC_STREAMINFO_SIZE) + return AVERROR_INVALIDDATA; + + codec_config->extradata = av_malloc(left); + if (!codec_config->extradata) + return AVERROR(ENOMEM); + + codec_config->extradata_size = avio_read(pb, codec_config->extradata, left); + if (codec_config->extradata_size < left) + return AVERROR_INVALIDDATA; + + codec_config->sample_rate = AV_RB24(codec_config->extradata + 10) >> 4; + + return 0; +} + +static int ipcm_decoder_config(AVFormatContext *s, AVIOContext *pb, int len, + IAMFCodecConfig *codec_config) +{ + static const enum AVSampleFormat sample_fmt[2][3] = { + { AV_CODEC_ID_PCM_S16BE, AV_CODEC_ID_PCM_S24BE, AV_CODEC_ID_PCM_S32BE }, + { 
AV_CODEC_ID_PCM_S16LE, AV_CODEC_ID_PCM_S24LE, AV_CODEC_ID_PCM_S32LE }, + }; + int sample_format = avio_r8(pb); // 0 = BE, 1 = LE + int sample_size = (avio_r8(pb) / 8 - 2); // 16, 24, 32 + if (sample_format > 1 || sample_size > 2) + return AVERROR_INVALIDDATA; + + codec_config->codec_id = sample_fmt[sample_format][sample_size]; + codec_config->sample_rate = avio_rb32(pb); + + if (len - avio_tell(pb)) + return AVERROR_INVALIDDATA; + + return 0; +} + +static int codec_config_obu(AVFormatContext *s, int len) +{ + IAMFDemuxContext *const c = s->priv_data; + IAMFCodecConfig *codec_config = NULL; + FFIOContext b; + AVIOContext *pb; + uint8_t *buf; + enum AVCodecID avcodec_id; + unsigned codec_config_id, nb_samples, codec_id; + int16_t seek_preroll; + int ret; + + buf = av_malloc(len); + if (!buf) + return AVERROR(ENOMEM); + + ret = avio_read(s->pb, buf, len); + if (ret != len) { + if (ret >= 0) + ret = AVERROR_INVALIDDATA; + goto fail; + } + + ffio_init_context(&b, buf, len, 0, NULL, NULL, NULL, NULL); + pb = &b.pub; + + ret = leb(pb, &codec_config_id); + if (ret < 0) + goto fail; + + codec_id = avio_rb32(pb); + ret = leb(pb, &nb_samples); + if (ret < 0) + goto fail; + + seek_preroll = avio_rb16(pb); + + switch(codec_id) { + case MKBETAG('O','p','u','s'): + avcodec_id = AV_CODEC_ID_OPUS; + break; + case MKBETAG('m','p','4','a'): + avcodec_id = AV_CODEC_ID_AAC; + break; + case MKBETAG('f','L','a','C'): + avcodec_id = AV_CODEC_ID_FLAC; + break; + default: + avcodec_id = AV_CODEC_ID_NONE; + break; + } + + for (int i = 0; i < c->nb_codec_configs; i++) + if (c->codec_configs[i].codec_config_id == codec_config_id) { + ret = AVERROR_INVALIDDATA; + goto fail; + } + + codec_config = av_dynarray2_add_nofree((void **)&c->codec_configs, &c->nb_codec_configs, + sizeof(*c->codec_configs), NULL); + if (!codec_config) { + ret = AVERROR(ENOMEM); + goto fail; + } + + memset(codec_config, 0, sizeof(*codec_config)); + + codec_config->codec_config_id = codec_config_id; + 
codec_config->codec_id = avcodec_id; + codec_config->nb_samples = nb_samples; + codec_config->seek_preroll = seek_preroll; + + switch(codec_id) { + case MKBETAG('O','p','u','s'): + ret = opus_decoder_config(s, pb, len, codec_config); + break; + case MKBETAG('m','p','4','a'): + ret = aac_decoder_config(s, pb, len, codec_config); + break; + case MKBETAG('f','L','a','C'): + ret = flac_decoder_config(s, pb, len, codec_config); + break; + case MKBETAG('i','p','c','m'): + ret = ipcm_decoder_config(s, pb, len, codec_config); + break; + default: + break; + } + if (ret < 0) + goto fail; + + av_log(s, AV_LOG_DEBUG, "%"PRId64" bytes left at the end of codec_config_obu\n", len - avio_tell(pb)); + + ret = 0; +fail: + av_free(buf); + return ret; +} + +static int update_extradata(AVFormatContext *s, AVStream *st) +{ + GetBitContext gb; + PutBitContext pb; + int ret; + + switch(st->codecpar->codec_id) { + case AV_CODEC_ID_OPUS: + AV_WB8(st->codecpar->extradata + 9, st->codecpar->ch_layout.nb_channels); + break; + case AV_CODEC_ID_AAC: { + uint8_t buf[5]; + + init_put_bits(&pb, buf, sizeof(buf)); + ret = init_get_bits8(&gb, st->codecpar->extradata, st->codecpar->extradata_size); + if (ret < 0) + return ret; + + ret = get_bits(&gb, 5); + put_bits(&pb, 5, ret); + if (ret == AOT_ESCAPE) // violates section 3.11.2, but better check for it + put_bits(&pb, 6, get_bits(&gb, 6)); + ret = get_bits(&gb, 4); + put_bits(&pb, 4, ret); + if (ret == 0x0f) + put_bits(&pb, 24, get_bits(&gb, 24)); + + skip_bits(&gb, 4); + put_bits(&pb, 4, st->codecpar->ch_layout.nb_channels); // set channel config + ret = put_bits_left(&pb); + put_bits(&pb, ret, get_bits(&gb, ret)); + flush_put_bits(&pb); + + memcpy(st->codecpar->extradata, buf, sizeof(buf)); + break; + } + case AV_CODEC_ID_FLAC: { + uint8_t buf[13]; + + init_put_bits(&pb, buf, sizeof(buf)); + ret = init_get_bits8(&gb, st->codecpar->extradata, st->codecpar->extradata_size); + if (ret < 0) + return ret; + + put_bits32(&pb, get_bits_long(&gb, 32)); // 
min/max blocksize + put_bits64(&pb, 48, get_bits64(&gb, 48)); // min/max framesize + put_bits(&pb, 20, get_bits(&gb, 20)); // samplerate + skip_bits(&gb, 3); + put_bits(&pb, 3, st->codecpar->ch_layout.nb_channels - 1); + ret = put_bits_left(&pb); + put_bits(&pb, ret, get_bits(&gb, ret)); + flush_put_bits(&pb); + + memcpy(st->codecpar->extradata, buf, sizeof(buf)); + break; + } + } + + return 0; +} + +static int scalable_channel_layout_config(AVFormatContext *s, AVIOContext *pb, + IAMFAudioElement *audio_element, + const IAMFCodecConfig *codec_config) +{ + AVStreamGroup *stg = audio_element->stream_group; + int num_layers, k = 0; + + num_layers = avio_r8(pb) >> 5; // get_bits(&gb, 3); + // skip_bits(&gb, 5); //reserved + + if (num_layers > 6) + return AVERROR_INVALIDDATA; + + for (int i = 0; i < num_layers; i++) { + AVIAMFLayer *layer; + int loudspeaker_layout, output_gain_is_present_flag; + int coupled_substream_count; + int ret, byte = avio_r8(pb); + + ret = avformat_iamf_audio_element_add_layer(stg->params.iamf_audio_element); + if (ret < 0) + return ret; + + loudspeaker_layout = byte >> 4; // get_bits(&gb, 4); + output_gain_is_present_flag = (byte >> 3) & 1; //get_bits1(&gb); + layer = stg->params.iamf_audio_element->layers[i]; + layer->recon_gain_is_present = (byte >> 2) & 1; + layer->substream_count = avio_r8(pb); + coupled_substream_count = avio_r8(pb); + + if (output_gain_is_present_flag) { + layer->output_gain_flags = avio_r8(pb) >> 2; // get_bits(&gb, 6); + layer->output_gain = sign_extend(avio_rb16(pb), 16); + } + + if (loudspeaker_layout < 10) + av_channel_layout_copy(&layer->ch_layout, &ff_iamf_scalable_ch_layouts[loudspeaker_layout]); + else + layer->ch_layout = (AVChannelLayout){ .order = AV_CHANNEL_ORDER_UNSPEC, + .nb_channels = layer->substream_count + + coupled_substream_count }; + + for (int j = 0; j < layer->substream_count; j++) { + AVStream *st = audio_element->audio_substreams[k++]; + + ret = avformat_stream_group_add_stream(stg, st); + if 
(ret < 0) + return ret; + + st->codecpar->ch_layout = coupled_substream_count-- > 0 ? (AVChannelLayout)AV_CHANNEL_LAYOUT_STEREO : + (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO; + + ret = update_extradata(s, st); + if (ret < 0) + return ret; + + avpriv_set_pts_info(st, 64, 1, st->codecpar->sample_rate); + } + + } + + return 0; +} + +static int ambisonics_config(AVFormatContext *s, AVIOContext *pb, + IAMFAudioElement *audio_element, + const IAMFCodecConfig *codec_config) +{ + AVStreamGroup *stg = audio_element->stream_group; + AVIAMFLayer *layer; + unsigned ambisonics_mode; + int output_channel_count, substream_count, order; + int ret; + + ret = leb(pb, &ambisonics_mode); + if (ret < 0) + return ret; + + if (ambisonics_mode > 1) + return 0; + + output_channel_count = avio_r8(pb); // C + substream_count = avio_r8(pb); // N + if (audio_element->num_substreams != substream_count) + return AVERROR_INVALIDDATA; + + order = floor(sqrt(output_channel_count - 1)); + /* incomplete order - some harmonics are missing */ + if ((order + 1) * (order + 1) != output_channel_count) + return AVERROR_INVALIDDATA; + + ret = avformat_iamf_audio_element_add_layer(stg->params.iamf_audio_element); + if (ret < 0) + return ret; + + layer = stg->params.iamf_audio_element->layers[0]; + layer->ambisonics_mode = ambisonics_mode; + layer->substream_count = substream_count; + if (ambisonics_mode == 0) { + for (int i = 0; i < substream_count; i++) { + AVStream *st = audio_element->audio_substreams[i]; + + st->codecpar->ch_layout = (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO; + + ret = avformat_stream_group_add_stream(stg, st); + if (ret < 0) + return ret; + + ret = update_extradata(s, st); + if (ret < 0) + return ret; + + avpriv_set_pts_info(st, 64, 1, st->codecpar->sample_rate); + } + + layer->ch_layout.order = AV_CHANNEL_ORDER_CUSTOM; + layer->ch_layout.nb_channels = output_channel_count; + layer->ch_layout.u.map = av_calloc(output_channel_count, sizeof(*layer->ch_layout.u.map)); + if 
(!layer->ch_layout.u.map) + return AVERROR(ENOMEM); + + for (int i = 0; i < output_channel_count; i++) + layer->ch_layout.u.map[i].id = avio_r8(pb) + AV_CHAN_AMBISONIC_BASE; + } else { + int coupled_substream_count = avio_r8(pb); // M + int nb_demixing_matrix = substream_count + coupled_substream_count; + int demixing_matrix_size = nb_demixing_matrix * output_channel_count; + + layer->ch_layout = (AVChannelLayout){ .order = AV_CHANNEL_ORDER_AMBISONIC, .nb_channels = output_channel_count }; + layer->demixing_matrix = av_malloc_array(demixing_matrix_size, sizeof(*layer->demixing_matrix)); + if (!layer->demixing_matrix) + return AVERROR(ENOMEM); + + for (int i = 0; i < demixing_matrix_size; i++) + layer->demixing_matrix[i] = sign_extend(avio_rb16(pb), 16); + + for (int i = 0; i < substream_count; i++) { + AVStream *st = audio_element->audio_substreams[i]; + + st->codecpar->ch_layout = coupled_substream_count-- > 0 ? (AVChannelLayout)AV_CHANNEL_LAYOUT_STEREO : + (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO; + + ret = avformat_stream_group_add_stream(stg, st); + if (ret < 0) + return ret; + + ret = update_extradata(s, st); + if (ret < 0) + return ret; + + avpriv_set_pts_info(st, 64, 1, st->codecpar->sample_rate); + } + } + + return 0; +} + +static int param_parse(AVFormatContext *s, AVIOContext *pb, + unsigned int param_definition_type, + const AVIAMFAudioElement *audio_element, + AVIAMFParamDefinition **out_param_definition) +{ + IAMFDemuxContext *const c = s->priv_data; + IAMFParamDefinition *param_definition; + const IAMFParamDefinition *old_param = NULL; + unsigned int parameter_id, parameter_rate, param_definition_mode; + unsigned int duration, constant_subblock_duration, num_subblocks = 0; + int nb_param_definitions = c->nb_param_definitions, ret; + + ret = leb(pb, &parameter_id); + if (ret < 0) + return ret; + + for (int i = 0; i < c->nb_param_definitions; i++) + if (c->param_definitions[i].param->parameter_id == parameter_id) { + old_param = param_definition = 
&c->param_definitions[i]; + break; + } + + if (!old_param) { + param_definition = av_dynarray2_add_nofree((void **)&c->param_definitions, &nb_param_definitions, + sizeof(*c->param_definitions), NULL); + if (!param_definition) + return AVERROR(ENOMEM); + + memset(param_definition, 0, sizeof(*param_definition)); + } + + ret = leb(pb, &parameter_rate); + if (ret < 0) + return ret; + + param_definition_mode = avio_r8(pb) >> 7; + + if (old_param && (param_definition_mode != old_param->param->param_definition_mode || + param_definition_type != old_param->param->param_definition_type)) { + av_log(s, AV_LOG_ERROR, "Inconsistent param_definition_mode or param_definition_type values " + "for parameter_id %d\n", parameter_id); + return AVERROR_INVALIDDATA; + } + + if (param_definition_mode == 0) { + ret = leb(pb, &duration); + if (ret < 0) + return ret; + + ret = leb(pb, &constant_subblock_duration); + if (ret < 0) + return ret; + + if (constant_subblock_duration == 0) { + ret = leb(pb, &num_subblocks); + if (ret < 0) + return ret; + } else + num_subblocks = duration / constant_subblock_duration; + } + + if (old_param) { + if (num_subblocks != old_param->param->num_subblocks) { + av_log(s, AV_LOG_ERROR, "Inconsistent num_subblocks values for parameter_id %d\n", parameter_id); + return AVERROR_INVALIDDATA; + } + } else { + param_definition->param = avformat_iamf_param_definition_alloc(param_definition_type, num_subblocks, NULL); + if (!param_definition->param) + return AVERROR(ENOMEM); + param_definition->audio_element = audio_element; + } + + for (int i = 0; i < num_subblocks; i++) { + void *subblock = avformat_iamf_param_definition_get_subblock(param_definition->param, i); + unsigned int subblock_duration = constant_subblock_duration; + + if (constant_subblock_duration == 0) { + ret = leb(pb, &subblock_duration); + if (ret < 0) { + if (!old_param) + av_freep(&param_definition->param); + return ret; + } + } + + switch (param_definition_type) { + case 
AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN: { + AVIAMFMixGainParameterData *mix = subblock; + mix->subblock_duration = subblock_duration; + break; + } + case AV_IAMF_PARAMETER_DEFINITION_DEMIXING: { + AVIAMFDemixingInfoParameterData *demix = subblock; + demix->subblock_duration = subblock_duration; + // DemixingInfoParameterData + demix->dmixp_mode = avio_r8(pb) >> 5; + break; + } + case AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN: { + AVIAMFReconGainParameterData *recon = subblock; + recon->subblock_duration = subblock_duration; + break; + } + default: + if (!old_param) + av_freep(&param_definition->param); + return AVERROR_INVALIDDATA; + } + } + + param_definition->param->parameter_id = parameter_id; + param_definition->param->parameter_rate = parameter_rate; + param_definition->param->param_definition_mode = param_definition_mode; + param_definition->param->duration = duration; + param_definition->param->constant_subblock_duration = constant_subblock_duration; + param_definition->param->num_subblocks = num_subblocks; + + av_assert0(out_param_definition); + *out_param_definition = param_definition->param; + + if (!old_param) + c->nb_param_definitions = nb_param_definitions; + + return 0; +} + +static int audio_element_obu(AVFormatContext *s, int len) +{ + IAMFDemuxContext *const c = s->priv_data; + const IAMFCodecConfig *codec_config = NULL; + AVIAMFAudioElement *avaudio_element; + IAMFAudioElement *audio_element; + FFIOContext b; + AVIOContext *pb; + uint8_t *buf; + unsigned audio_element_id, codec_config_id, num_substreams, num_parameters; + int audio_element_type, ret; + + buf = av_malloc(len); + if (!buf) + return AVERROR(ENOMEM); + + ret = avio_read(s->pb, buf, len); + if (ret != len) { + if (ret >= 0) + ret = AVERROR_INVALIDDATA; + goto fail; + } + + ffio_init_context(&b, buf, len, 0, NULL, NULL, NULL, NULL); + pb = &b.pub; + + ret = leb(pb, &audio_element_id); + if (ret < 0) + goto fail; + + for (int i = 0; i < c->nb_audio_elements; i++) + if 
(c->audio_elements[i].stream_group->id == audio_element_id) { + av_log(s, AV_LOG_ERROR, "Duplicate audio_element_id %d\n", audio_element_id); + ret = AVERROR_INVALIDDATA; + goto fail; + } + + audio_element_type = avio_r8(pb) >> 5; + + ret = leb(pb, &codec_config_id); + if (ret < 0) + goto fail; + + for (int i = 0; i < c->nb_codec_configs; i++) { + if (c->codec_configs[i].codec_config_id == codec_config_id) { + codec_config = &c->codec_configs[i]; + break; + } + } + + if (!codec_config) { + av_log(s, AV_LOG_ERROR, "Non existant codec config id %d referenced in an audio element\n", codec_config_id); + ret = AVERROR_INVALIDDATA; + goto fail; + } + + if (codec_config->codec_id == AV_CODEC_ID_NONE) { + av_log(s, AV_LOG_DEBUG, "Unknown codec id referenced in an audio element. Ignoring\n"); + ret = 0; + goto fail; + } + + ret = leb(pb, &num_substreams); + if (ret < 0) + goto fail; + + audio_element = av_dynarray2_add_nofree((void **)&c->audio_elements, &c->nb_audio_elements, + sizeof(*c->audio_elements), NULL); + if (!audio_element) { + ret = AVERROR(ENOMEM); + goto fail; + } + + memset(audio_element, 0, sizeof(*audio_element)); + + audio_element->audio_substreams = av_calloc(num_substreams, sizeof(*audio_element->audio_substreams)); + if (!audio_element->audio_substreams) { + ret = AVERROR(ENOMEM); + goto fail; + } + + audio_element->stream_group = avformat_stream_group_create(s, AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT); + if (!audio_element->stream_group) + return AVERROR(ENOMEM); + audio_element->stream_group->id = audio_element_id; + avaudio_element = audio_element->stream_group->params.iamf_audio_element; + avaudio_element->codec_config_id = codec_config_id; + avaudio_element->audio_element_type = audio_element_type; + + audio_element->num_substreams = num_substreams; + + for (int i = 0; i < num_substreams; i++) { + AVStream *st = audio_element->audio_substreams[i] = avformat_new_stream(s, NULL); + unsigned audio_substream_id; + + if (!st) { + ret = 
AVERROR(ENOMEM); + goto fail; + } + + ret = leb(pb, &audio_substream_id); + if (ret < 0) + goto fail; + + st->id = audio_substream_id; + st->codecpar->codec_type = AVMEDIA_TYPE_AUDIO; + st->codecpar->codec_id = codec_config->codec_id; + st->codecpar->frame_size = codec_config->nb_samples; + st->codecpar->sample_rate = codec_config->sample_rate; + st->codecpar->seek_preroll = codec_config->seek_preroll; + ffstream(st)->need_parsing = AVSTREAM_PARSE_HEADERS; + + switch(st->codecpar->codec_id) { + case AV_CODEC_ID_AAC: + case AV_CODEC_ID_FLAC: + case AV_CODEC_ID_OPUS: + st->codecpar->extradata = av_malloc(codec_config->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE); + if (!st->codecpar->extradata) { + ret = AVERROR(ENOMEM); + goto fail; + } + memcpy(st->codecpar->extradata, codec_config->extradata, codec_config->extradata_size); + memset(st->codecpar->extradata + codec_config->extradata_size, 0, AV_INPUT_BUFFER_PADDING_SIZE); + st->codecpar->extradata_size = codec_config->extradata_size; + break; + } + } + + ret = leb(pb, &num_parameters); + if (ret < 0) + goto fail; + + for (int i = 0; i < num_parameters; i++) { + unsigned param_definition_type; + + ret = leb(pb, &param_definition_type); + if (ret < 0) + goto fail; + + if (param_definition_type == 0) { + ret = AVERROR_INVALIDDATA; + goto fail; + } else if (param_definition_type == 1) { + ret = param_parse(s, pb, param_definition_type, avaudio_element, &avaudio_element->demixing_info); + if (ret < 0) + goto fail; + + avaudio_element->default_w = avio_r8(pb) >> 4; + } else if (param_definition_type == 2) { + ret = param_parse(s, pb, param_definition_type, avaudio_element, &avaudio_element->recon_gain_info); + if (ret < 0) + goto fail; + } else { + unsigned param_definition_size; + ret = leb(pb, &param_definition_size); + if (ret < 0) + goto fail; + + avio_skip(pb, param_definition_size); + } + } + + if (audio_element_type == AV_IAMF_AUDIO_ELEMENT_TYPE_CHANNEL) { + ret = scalable_channel_layout_config(s, pb, 
audio_element, codec_config); + if (ret < 0) + goto fail; + } else if (audio_element_type == AV_IAMF_AUDIO_ELEMENT_TYPE_SCENE) { + ret = ambisonics_config(s, pb, audio_element, codec_config); + if (ret < 0) + goto fail; + } else { + unsigned audio_element_config_size; + ret = leb(pb, &audio_element_config_size); + if (ret < 0) + goto fail; + } + + av_log(s, AV_LOG_DEBUG, "%"PRId64" bytes left at the end of audio_element_obu\n", len - avio_tell(pb)); + + ret = 0; +fail: + av_free(buf); + + return ret; +} + +static int label_string(AVFormatContext *s, AVIOContext *pb, char **label) +{ + uint8_t buf[128]; + + avio_get_str(pb, sizeof(buf), buf, sizeof(buf)); + + if (pb->error) + return pb->error; + if (pb->eof_reached) + return AVERROR_INVALIDDATA; + *label = av_strdup(buf); + if (!*label) + return AVERROR(ENOMEM); + + return 0; +} + +static int mix_presentation_obu(AVFormatContext *s, int len) +{ + IAMFDemuxContext *const c = s->priv_data; + AVIAMFMixPresentation *mix_presentation; + IAMFMixPresentation *mixi; + FFIOContext b; + AVIOContext *pb; + uint8_t *buf; + unsigned mix_presentation_id; + int ret; + + buf = av_malloc(len); + if (!buf) + return AVERROR(ENOMEM); + + ret = avio_read(s->pb, buf, len); + if (ret != len) { + if (ret >= 0) + ret = AVERROR_INVALIDDATA; + goto fail; + } + + ffio_init_context(&b, buf, len, 0, NULL, NULL, NULL, NULL); + pb = &b.pub; + + ret = leb(pb, &mix_presentation_id); + if (ret < 0) + goto fail; + + for (int i = 0; i < c->nb_mix_presentations; i++) + if (c->mix_presentations[i].stream_group->id == mix_presentation_id) { + av_log(s, AV_LOG_ERROR, "Duplicate mix_presentation_id %d\n", mix_presentation_id); + ret = AVERROR_INVALIDDATA; + goto fail; + } + + mixi = av_dynarray2_add_nofree((void **)&c->mix_presentations, &c->nb_mix_presentations, + sizeof(*c->mix_presentations), NULL); + if (!mixi) { + ret = AVERROR(ENOMEM); + goto fail; + } + + memset(mixi, 0, sizeof(*mixi)); + mixi->stream_group = avformat_stream_group_create(s, 
AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION); + if (!mixi->stream_group) { + ret = AVERROR(ENOMEM); + goto fail; + } + + mixi->stream_group->id = mix_presentation_id; + mix_presentation = mixi->stream_group->params.iamf_mix_presentation; + + ret = leb(pb, &mix_presentation->count_label); + if (ret < 0) + goto fail; + + mix_presentation->language_label = av_calloc(mix_presentation->count_label, sizeof(*mix_presentation->language_label)); + if (!mix_presentation->language_label) { + ret = AVERROR(ENOMEM); + goto fail; + } + + for (int i = 0; i < mix_presentation->count_label; i++) { + ret = label_string(s, pb, &mix_presentation->language_label[i]); + if (ret < 0) + goto fail; + } + + mix_presentation->mix_presentation_annotations = av_calloc(mix_presentation->count_label, sizeof(*mix_presentation->mix_presentation_annotations)); + if (!mix_presentation->mix_presentation_annotations) { + ret = AVERROR(ENOMEM); + goto fail; + } + + for (int i = 0; i < mix_presentation->count_label; i++) { + ret = label_string(s, pb, &mix_presentation->mix_presentation_annotations[i]); + if (ret < 0) + goto fail; + } + + ret = leb(pb, &mix_presentation->num_sub_mixes); + if (ret < 0) + goto fail; + + mix_presentation->sub_mixes = av_calloc(mix_presentation->num_sub_mixes, sizeof(*mix_presentation->sub_mixes)); + if (!mix_presentation->sub_mixes) { + ret = AVERROR(ENOMEM); + goto fail; + } + + for (int i = 0; i < mix_presentation->num_sub_mixes; i++) { + AVIAMFSubmixPresentation *sub_mix; + + sub_mix = mix_presentation->sub_mixes[i] = av_mallocz(sizeof(*sub_mix)); + if (!sub_mix) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = leb(pb, &sub_mix->num_submix_elements); + if (ret < 0) + goto fail; + + sub_mix->submix_elements = av_calloc(sub_mix->num_submix_elements, sizeof(*sub_mix->submix_elements)); + if (!sub_mix->submix_elements) { + ret = AVERROR(ENOMEM); + goto fail; + } + + for (int j = 0; j < sub_mix->num_submix_elements; j++) { + AVIAMFSubmixElement *submix_element; + 
IAMFAudioElement *audio_element = NULL;
+            unsigned int audio_element_id, rendering_config_extension_size;
+
+            submix_element = sub_mix->submix_elements[j] = av_mallocz(sizeof(*submix_element));
+            if (!submix_element) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+
+            ret = leb(pb, &audio_element_id);
+            if (ret < 0)
+                goto fail;
+
+            // Resolve the referenced Audio Element among those already parsed.
+            for (int k = 0; k < c->nb_audio_elements; k++)
+                if (c->audio_elements[k].stream_group->id == audio_element_id) {
+                    audio_element = &c->audio_elements[k];
+                    submix_element->audio_element = audio_element->stream_group;
+                }
+
+            if (!audio_element) {
+                av_log(s, AV_LOG_ERROR, "Invalid Audio Element with id %u referenced by Mix Parameters %u\n",
+                       audio_element_id, mix_presentation_id);
+                ret = AVERROR_INVALIDDATA;
+                goto fail;
+            }
+
+            // A substream may already belong to the group; EEXIST is not an error here.
+            for (int k = 0; k < audio_element->num_substreams; k++) {
+                ret = avformat_stream_group_add_stream(mixi->stream_group, audio_element->audio_substreams[k]);
+                if (ret < 0 && ret != AVERROR(EEXIST))
+                    goto fail;
+            }
+
+            submix_element->mix_presentation_element_annotations =
+                av_calloc(mix_presentation->count_label,
+                          sizeof(*submix_element->mix_presentation_element_annotations));
+            if (!submix_element->mix_presentation_element_annotations) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+
+            for (int k = 0; k < mix_presentation->count_label; k++) {
+                ret = label_string(s, pb, &submix_element->mix_presentation_element_annotations[k]);
+                if (ret < 0)
+                    goto fail;
+            }
+
+            // Only the two most significant bits of this byte are kept.
+            submix_element->headphones_rendering_mode = avio_r8(pb) >> 6;
+
+            ret = leb(pb, &rendering_config_extension_size);
+            if (ret < 0)
+                goto fail;
+            avio_skip(pb, rendering_config_extension_size);
+
+            ret = param_parse(s, pb, AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN, NULL,
+                              &submix_element->element_mix_config);
+            if (ret < 0)
+                goto fail;
+            submix_element->default_mix_gain = sign_extend(avio_rb16(pb), 16);
+        }
+        ret = param_parse(s, pb, AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN, NULL, &sub_mix->output_mix_config);
+        if (ret < 0)
+            goto fail;
+        sub_mix->default_mix_gain = sign_extend(avio_rb16(pb), 16);
+
+        ret = leb(pb, &sub_mix->num_submix_layouts);
+        if (ret < 0)
+            goto fail;
+
+        sub_mix->submix_layouts = av_calloc(sub_mix->num_submix_layouts, sizeof(*sub_mix->submix_layouts));
+        if (!sub_mix->submix_layouts) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+        for (int j = 0; j < sub_mix->num_submix_layouts; j++) {
+            AVIAMFSubmixLayout *submix_layout;
+            int info_type;
+            int byte = avio_r8(pb);
+
+            submix_layout = sub_mix->submix_layouts[j] = av_mallocz(sizeof(*submix_layout));
+            if (!submix_layout) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+
+            // layout_type is the top two bits of the byte; only 2 and 3 are accepted.
+            submix_layout->layout_type = byte >> 6;
+            if (submix_layout->layout_type < 2 || submix_layout->layout_type > 3) {
+                av_log(s, AV_LOG_ERROR, "Invalid Layout type %u in a submix from Mix Presentation %u\n",
+                       submix_layout->layout_type, mix_presentation_id);
+                ret = AVERROR_INVALIDDATA;
+                goto fail;
+            }
+            if (submix_layout->layout_type == 2) {
+                int sound_system;
+                sound_system = (byte >> 2) & 0xF;
+                // NOTE(review): sound_system is a 4-bit value (0..15); confirm that
+                // ff_iamf_sound_system_map has an entry for every value or add a bounds check.
+                ret = av_channel_layout_copy(&submix_layout->sound_system,
+                                             &ff_iamf_sound_system_map[sound_system].layout);
+                if (ret < 0)
+                    goto fail;
+            }
+
+            info_type = avio_r8(pb);
+            submix_layout->integrated_loudness = sign_extend(avio_rb16(pb), 16);
+            submix_layout->digital_peak = sign_extend(avio_rb16(pb), 16);
+
+            // The optional fields below are read only to advance the stream; their values are dropped.
+            if (info_type & 1)
+                sign_extend(avio_rb16(pb), 16); // true_peak
+
+            if (info_type & 2) {
+                unsigned int num_anchored_loudness = avio_r8(pb);
+
+                for (int k = 0; k < num_anchored_loudness; k++) {
+                    avio_r8(pb); // anchor_element
+                    sign_extend(avio_rb16(pb), 16); // anchored_loudness
+                }
+            }
+
+            // Any other info_type bit announces a sized extension, which is skipped.
+            if (info_type & 0xFC) {
+                unsigned int info_type_size;
+                ret = leb(pb, &info_type_size);
+                if (ret < 0)
+                    goto fail;
+
+                avio_skip(pb, info_type_size);
+            }
+        }
+    }
+
+    av_log(s, AV_LOG_DEBUG, "%"PRId64" bytes left at the end of mixing_parameters_obu\n", len - avio_tell(pb));
+
+    ret = 0;
+fail:
+    av_free(buf);
+
+    return ret;
+}
+
+/**
+ * Read the descriptor OBUs at the start of the stream.
+ *
+ * Loops over OBU headers and dispatches Codec Config, Audio Element and Mix
+ * Presentation OBUs to their parsers. A Temporal Delimiter drops any pending
+ * parameter data in the demuxer context; other descriptor OBUs are skipped.
+ * The loop ends, with the stream rewound to the start of the OBU header, at
+ * the first OBU whose type lies in
+ * [IAMF_OBU_IA_PARAMETER_BLOCK, IAMF_OBU_IA_SEQUENCE_HEADER).
+ *
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int iamf_read_header(AVFormatContext *s)
+{
+    IAMFDemuxContext *const c = s->priv_data;
+    uint8_t header[MAX_IAMF_OBU_HEADER_SIZE + AV_INPUT_BUFFER_PADDING_SIZE];
+    int ret;
+
+    av_log(s, AV_LOG_DEBUG, "HEADER\n");
+    while (1) {
+        unsigned obu_size;
+        enum IAMF_OBU_Type type;
+        int start_pos, len, size;
+
+        // The header is read speculatively, so make sure we can seek back over it.
+        if ((ret = ffio_ensure_seekback(s->pb, MAX_IAMF_OBU_HEADER_SIZE)) < 0)
+            return ret;
+        size = avio_read(s->pb, header, MAX_IAMF_OBU_HEADER_SIZE);
+        if (size < 0)
+            return size;
+
+        len = parse_obu_header(header, size, &obu_size, &start_pos, &type);
+        if (len < 0) {
+            av_log(s, AV_LOG_ERROR, "Failed to read obu\n");
+            return len;
+        }
+
+        // First non-descriptor OBU: undo the whole read and leave it to read_packet.
+        if (type >= IAMF_OBU_IA_PARAMETER_BLOCK && type < IAMF_OBU_IA_SEQUENCE_HEADER) {
+            avio_seek(s->pb, -size, SEEK_CUR);
+            break;
+        }
+
+        // Rewind to start_pos (presumably the first payload byte — TODO confirm
+        // against parse_obu_header).
+        avio_seek(s->pb, -(size - start_pos), SEEK_CUR);
+        switch (type) {
+        case IAMF_OBU_IA_CODEC_CONFIG:
+            ret = codec_config_obu(s, obu_size);
+            break;
+        case IAMF_OBU_IA_AUDIO_ELEMENT:
+            ret = audio_element_obu(s, obu_size);
+            break;
+        case IAMF_OBU_IA_MIX_PRESENTATION:
+            ret = mix_presentation_obu(s, obu_size);
+            break;
+        case IAMF_OBU_IA_TEMPORAL_DELIMITER:
+            av_freep(&c->mix);
+            c->mix_size = 0;
+            av_freep(&c->demix);
+            c->demix_size = 0;
+            av_freep(&c->recon);
+            c->recon_size = 0;
+            break;
+        default: {
+            int64_t offset = avio_skip(s->pb, obu_size);
+            if (offset < 0)
+                ret = offset;
+            break;
+        }
+        }
+        if (ret < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+/**
+ * Return the stream of s whose id equals id, or NULL (after logging an error)
+ * when there is none.
+ */
+static AVStream *find_stream_by_id(AVFormatContext *s, int id)
+{
+    for (int i = 0; i < s->nb_streams; i++)
+        if (s->streams[i]->id == id)
+            return s->streams[i];
+
+    av_log(s, AV_LOG_ERROR, "Invalid stream id %d\n", id);
+    return NULL;
+}
+
+/**
+ * Read one Audio Frame OBU payload into pkt.
+ *
+ * @param len             payload size in bytes
+ * @param type            OBU type; used to derive the substream id when
+ *                        id_in_bitstream is 0
+ * @param id_in_bitstream non-zero if the substream id is coded (leb) at the
+ *                        start of the payload
+ *
+ * Copies of the pending mix gain, demixing and recon gain parameter data held
+ * in the demuxer context are attached to the packet as side data.
+ *
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int audio_frame_obu(AVFormatContext *s, AVPacket *pkt, int len,
+                           enum IAMF_OBU_Type type, int id_in_bitstream)
+{
+    const IAMFDemuxContext *const c = s->priv_data;
+    AVStream *st;
+    int ret, audio_substream_id;
+
+    if (id_in_bitstream) {
+        unsigned explicit_audio_substream_id;
+        ret = leb(s->pb, &explicit_audio_substream_id);
+        if (ret < 0)
+            return ret;
+        // assumes leb() returns the number of bytes it consumed — TODO confirm
+        len -= ret;
+        audio_substream_id = explicit_audio_substream_id;
+    } else
+        audio_substream_id = type - IAMF_OBU_IA_AUDIO_FRAME_ID0;
+
+    st = find_stream_by_id(s, audio_substream_id);
+    if (!st)
+        return AVERROR_INVALIDDATA;
+
+    ret = av_get_packet(s->pb, pkt, len);
+    if (ret < 0)
+        return ret;
+    if (ret != len)
+        return AVERROR_INVALIDDATA;
+
+    if (c->mix) {
+        uint8_t *side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_IAMF_MIX_GAIN_PARAM, c->mix_size);
+        if (!side_data)
+            return AVERROR(ENOMEM);
+        memcpy(side_data, c->mix, c->mix_size);
+    }
+    if (c->demix) {
+        uint8_t *side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_IAMF_DEMIXING_INFO_PARAM, c->demix_size);
+        if (!side_data)
+            return AVERROR(ENOMEM);
+        memcpy(side_data, c->demix, c->demix_size);
+    }
+    if (c->recon) {
+        uint8_t *side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_IAMF_RECON_GAIN_INFO_PARAM, c->recon_size);
+        if (!side_data)
+            return AVERROR(ENOMEM);
+        memcpy(side_data, c->recon, c->recon_size);
+    }
+
+    pkt->stream_index = st->index;
+    return 0;
+}
+
+/**
+ * Look up the stored parameter definition with the given parameter_id.
+ *
+ * @return the first matching definition, or NULL if none matches
+ */
+static const IAMFParamDefinition *get_param_definition(AVFormatContext *s, unsigned int parameter_id)
+{
+    const IAMFDemuxContext *const c = s->priv_data;
+    const IAMFParamDefinition *param_definition = NULL;
+
+    for (int i = 0; i < c->nb_param_definitions; i++)
+        if (c->param_definitions[i].param->parameter_id == parameter_id) {
+            param_definition = &c->param_definitions[i];
+            break;
+        }
+
+    return param_definition;
+}
+
+/**
+ * Parse one Parameter Block OBU of len bytes from s->pb.
+ *
+ * The payload is read into a temporary buffer and parsed from there. Blocks
+ * that reference an unknown parameter_id, or a mix gain animation type above
+ * AV_IAMF_ANIMATION_TYPE_BEZIER, are ignored (0 is returned). A parsed mix
+ * gain or demixing block replaces c->mix / c->demix; recon gain blocks are
+ * parsed and then discarded (TODO in the original code).
+ *
+ * @return 0 on success or when the block is ignored, a negative AVERROR code
+ *         on failure
+ */
+static int parameter_block_obu(AVFormatContext *s, int len)
+{
+    IAMFDemuxContext *const c = s->priv_data;
+    const IAMFParamDefinition *param_definition;
+    const AVIAMFParamDefinition *param;
+    AVIAMFParamDefinition *out_param = NULL;
+    FFIOContext b;
+    AVIOContext *pb;
+    uint8_t *buf;
+    unsigned int duration, constant_subblock_duration;
+    unsigned int num_subblocks;
+    unsigned int parameter_id;
+    size_t out_param_size;
+    int ret;
+
+    buf = av_malloc(len);
+    if (!buf)
+        return AVERROR(ENOMEM);
+
+    ret = avio_read(s->pb, buf, len);
+    if (ret != len) {
+        if (ret >= 0)
+            ret = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+
+    ffio_init_context(&b, buf, len, 0, NULL, NULL, NULL, NULL);
+    pb = &b.pub;
+
+    ret = leb(pb, &parameter_id);
+    if (ret < 0)
+        goto fail;
+
+    param_definition = get_param_definition(s, parameter_id);
+    if (!param_definition) {
+        av_log(s, AV_LOG_VERBOSE, "Non existant parameter_id %u referenced in a parameter block. Ignoring\n",
+               parameter_id);
+        ret = 0;
+        goto fail;
+    }
+
+    param = param_definition->param;
+    if (param->param_definition_mode) {
+        // Mode 1: the timing fields are coded in the block itself.
+        ret = leb(pb, &duration);
+        if (ret < 0)
+            goto fail;
+
+        ret = leb(pb, &constant_subblock_duration);
+        if (ret < 0)
+            goto fail;
+
+        if (constant_subblock_duration == 0) {
+            ret = leb(pb, &num_subblocks);
+            if (ret < 0)
+                goto fail;
+        } else
+            num_subblocks = duration / constant_subblock_duration;
+    } else {
+        // Mode 0: the timing fields come from the stored definition.
+        duration = param->duration;
+        constant_subblock_duration = param->constant_subblock_duration;
+        num_subblocks = param->num_subblocks;
+        if (!num_subblocks) {
+            // Without a subblock count the constant duration must be usable as a divisor.
+            if (!constant_subblock_duration) {
+                ret = AVERROR_INVALIDDATA;
+                goto fail;
+            }
+            num_subblocks = duration / constant_subblock_duration;
+        }
+    }
+
+    out_param = avformat_iamf_param_definition_alloc(param->param_definition_type, num_subblocks, &out_param_size);
+    if (!out_param) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    out_param->parameter_id = param->parameter_id;
+    out_param->param_definition_type = param->param_definition_type;
+    out_param->parameter_rate = param->parameter_rate;
+    out_param->param_definition_mode = param->param_definition_mode;
+    out_param->duration = duration;
+    out_param->constant_subblock_duration = constant_subblock_duration;
+    out_param->num_subblocks = num_subblocks;
+
+    for (int i = 0; i < num_subblocks; i++) {
+        void *subblock = avformat_iamf_param_definition_get_subblock(out_param, i);
+        unsigned int subblock_duration;
+
+        if (param->param_definition_mode && !constant_subblock_duration) {
+            ret = leb(pb, &subblock_duration);
+            if (ret < 0)
+                goto fail;
+
+        } else {
+            // NOTE(review): the duration is read back from the freshly allocated
+            // out_param subblock rather than from the stored definition (param) —
+            // confirm this is intended.
+            switch (param->param_definition_type) {
+            case AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN:
+                subblock_duration = ((AVIAMFMixGainParameterData *)subblock)->subblock_duration;
+                break;
+            case AV_IAMF_PARAMETER_DEFINITION_DEMIXING:
+                subblock_duration = ((AVIAMFDemixingInfoParameterData *)subblock)->subblock_duration;
+                break;
+            case AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN:
+                subblock_duration = ((AVIAMFReconGainParameterData *)subblock)->subblock_duration;
+                break;
+            default:
+                // Go through the cleanup path so buf and out_param are released.
+                ret = AVERROR_INVALIDDATA;
+                goto fail;
+            }
+        }
+
+        switch (param->param_definition_type) {
+        case AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN: {
+            AVIAMFMixGainParameterData *mix = subblock;
+
+            ret = leb(pb, &mix->animation_type);
+            if (ret < 0)
+                goto fail;
+
+            // Unknown animation type: drop the whole block without failing.
+            if (mix->animation_type > AV_IAMF_ANIMATION_TYPE_BEZIER) {
+                ret = 0;
+                av_freep(&out_param);
+                goto fail;
+            }
+
+            mix->start_point_value = sign_extend(avio_rb16(pb), 16);
+            if (mix->animation_type >= AV_IAMF_ANIMATION_TYPE_LINEAR)
+                mix->end_point_value = sign_extend(avio_rb16(pb), 16);
+            if (mix->animation_type == AV_IAMF_ANIMATION_TYPE_BEZIER) {
+                mix->control_point_value = sign_extend(avio_rb16(pb), 16);
+                mix->control_point_relative_time = avio_r8(pb);
+            }
+            mix->subblock_duration = subblock_duration;
+            break;
+        }
+        case AV_IAMF_PARAMETER_DEFINITION_DEMIXING: {
+            AVIAMFDemixingInfoParameterData *demix = subblock;
+
+            // dmixp_mode is the top three bits of the byte.
+            demix->dmixp_mode = avio_r8(pb) >> 5;
+            demix->subblock_duration = subblock_duration;
+            break;
+        }
+        case AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN: {
+            const AVIAMFAudioElement *audio_element = param_definition->audio_element;
+
+            av_assert0(audio_element);
+            for (int l = 0; l < audio_element->num_layers; l++) {
+                const AVIAMFLayer *layer = audio_element->layers[l];
+                if (layer->recon_gain_is_present) {
+                    unsigned int recon_gain_flags, bitcount;
+                    ret = leb(pb, &recon_gain_flags);
+                    if (ret < 0)
+                        goto fail;
+
+                    // Bit 7 set means 5 extra flag bits follow in the next byte;
+                    // repack both bytes into one contiguous bit field.
+                    bitcount = 7 + 5 * !!(recon_gain_flags & 0x80);
+                    recon_gain_flags = (recon_gain_flags & 0x7F) | ((recon_gain_flags & 0xFF00) >> 1);
+                    for (int j = 0; j < bitcount; j++) {
+                        if (recon_gain_flags & (1 << j)) {
+                            avio_r8(pb); // recon_gain
+                        }
+                    }
+                }
+            }
+            break;
+        }
+        default: {
+            unsigned parameter_data_size;
+            ret = leb(pb, &parameter_data_size);
+            if (ret < 0)
+                goto fail;
+
+            avio_skip(pb, parameter_data_size);
+            break;
+        }
+        }
+    }
+
+    av_log(s, AV_LOG_DEBUG, "%"PRId64" bytes left at the end of parameter_block_obu\n", len - avio_tell(pb));
+
+    // Hand the parsed block over to the context; it replaces the previous one.
+    switch (param->param_definition_type) {
+    case AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN:
+        av_free(c->mix);
+        c->mix = out_param;
+        c->mix_size = out_param_size;
+        break;
+    case AV_IAMF_PARAMETER_DEFINITION_DEMIXING:
+        av_free(c->demix);
+        c->demix = out_param;
+        c->demix_size = out_param_size;
+        break;
+    case AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN: // TODO
+        ret = 0;
+        av_freep(&out_param);
+        goto fail;
+    default:
+        // Go through the cleanup path so buf and out_param are released.
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+
+    ret = 0;
+fail:
+    if (ret < 0)
+        av_free(out_param);
+    av_free(buf);
+
+    return ret;
+}
+
+/**
+ * Read OBUs until an Audio Frame OBU has been turned into a packet.
+ *
+ * Parameter Block OBUs met on the way are parsed into the demuxer context;
+ * any other OBU is skipped.
+ *
+ * @return the result of audio_frame_obu() for the first audio frame found,
+ *         or a negative AVERROR code on failure
+ */
+static int iamf_read_packet(AVFormatContext *s, AVPacket *pkt)
+{
+    uint8_t header[MAX_IAMF_OBU_HEADER_SIZE + AV_INPUT_BUFFER_PADDING_SIZE];
+    unsigned obu_size;
+    int ret;
+
+    av_log(s, AV_LOG_DEBUG, "PACKET\n");
+    while (1) {
+        enum IAMF_OBU_Type type;
+        int len, size, start_pos;
+
+        // The header is read speculatively, so make sure we can seek back over it.
+        if ((ret = ffio_ensure_seekback(s->pb, MAX_IAMF_OBU_HEADER_SIZE)) < 0)
+            return ret;
+        size = avio_read(s->pb, header, MAX_IAMF_OBU_HEADER_SIZE);
+        if (size < 0)
+            return size;
+
+        len = parse_obu_header(header, size, &obu_size, &start_pos, &type);
+        if (len < 0) {
+            av_log(s, AV_LOG_ERROR, "Failed to read obu\n");
+            return len;
+        }
+        avio_seek(s->pb, -(size - start_pos), SEEK_CUR);
+
+        if (type == IAMF_OBU_IA_AUDIO_FRAME)
+            return audio_frame_obu(s, pkt, obu_size, type, 1);
+        else if (type >= IAMF_OBU_IA_AUDIO_FRAME_ID0 && type <= IAMF_OBU_IA_AUDIO_FRAME_ID17)
+            return audio_frame_obu(s, pkt, obu_size, type, 0);
+        else if (type == IAMF_OBU_IA_PARAMETER_BLOCK) {
+            ret = parameter_block_obu(s, obu_size);
+            if (ret < 0)
+                return ret;
+        } else {
+            // Skip OBUs we do not handle and keep looking for an audio frame;
+            // leave the loop only when the skip itself fails.
+            int64_t offset = avio_skip(s->pb, obu_size);
+            if (offset < 0) {
+                ret = offset;
+                break;
+            }
+        }
+    }
+
+    return ret;
+}
+
+/**
+ * Release the arrays and pending parameter data held in the demuxer context
+ * and reset the matching counters and sizes.
+ */
+static int iamf_read_close(AVFormatContext *s)
+{
+    IAMFDemuxContext *const c = s->priv_data;
+
+    for (int i = 0; i < c->nb_codec_configs; i++)
+        av_free(c->codec_configs[i].extradata);
+    av_freep(&c->codec_configs);
+    c->nb_codec_configs = 0;
+
+    for (int i = 0; i < c->nb_audio_elements; i++)
+        av_free(c->audio_elements[i].audio_substreams);
+    av_freep(&c->audio_elements);
+    c->nb_audio_elements = 0;
+
+    av_freep(&c->mix_presentations);
+    c->nb_mix_presentations = 0;
+
+    av_freep(&c->param_definitions);
+    c->nb_param_definitions = 0;
+
+    av_freep(&c->mix);
+    c->mix_size = 0;
+    av_freep(&c->demix);
+    c->demix_size = 0;
+    av_freep(&c->recon);
+    c->recon_size = 0;
+    return 0;
+}
+
+const AVInputFormat ff_iamf_demuxer = {
+    .name           = "iamf",
+    .long_name      = NULL_IF_CONFIG_SMALL("Raw Immersive Audio Model and Formats"),
+    .priv_data_size = sizeof(IAMFDemuxContext),
+    .flags_internal = FF_FMT_INIT_CLEANUP,
+    .read_probe     = iamf_probe,
+    .read_header    = iamf_read_header,
+    .read_packet    = iamf_read_packet,
+    .read_close     = iamf_read_close,
+    .extensions     = "iamf",
+    .flags          = AVFMT_GENERIC_INDEX | AVFMT_NO_BYTE_SEEK | AVFMT_NOTIMESTAMPS | AVFMT_SHOW_IDS,
+};
diff --git a/libavformat/options.c b/libavformat/options.c
index 09eb13e97a..115f95d48a 100644
--- a/libavformat/options.c
+++ b/libavformat/options.c
@@ -20,6 +20,7 @@
 #include "avformat.h"
 #include "avio_internal.h"
 #include "demux.h"
+#include "iamf.h"
 #include "internal.h"
 
 #include "libavcodec/avcodec.h"
@@ -345,7 +346,16 @@ AVStreamGroup *avformat_stream_group_create(AVFormatContext *s,
 
     stg->type = type;
     switch (type) {
-    // Structs in the union are allocated here
+    case AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT:
+        stg->params.iamf_audio_element = avformat_iamf_audio_element_alloc();
+        if (!stg->params.iamf_audio_element)
+            goto fail;
+        break;
+    case AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION:
+        stg->params.iamf_mix_presentation = avformat_iamf_mix_presentation_alloc();
+        if (!stg->params.iamf_mix_presentation)
+            goto fail;
+        break;
     default:
         break;
     }
@@ -356,6 +366,9 @@ AVStreamGroup *avformat_stream_group_create(AVFormatContext *s,
     s->stream_groups[s->nb_stream_groups++] = stg;
 
     return stg;
+fail:
+    ff_free_stream_group(&stg);
+    return NULL;
 }
 
 static int stream_group_add_stream(AVStreamGroup *stg, const AVStream *st)

[FFmpeg-devel,3/2,WIP,RFC] avformat: Immersive Audio Model and Formats demuxer

Checks

Commit Message

Patch