[FFmpeg-devel,8/9] avformat: Immersive Audio Model and Formats demuxer

Message ID	20231126012858.40388-9-jamrial@gmail.com
State	New
Headers	show Delivered-To: ffmpegpatchwork2@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; From: James Almer <jamrial@gmail.com> To: ffmpeg-devel@ffmpeg.org Date: Sat, 25 Nov 2023 22:28:57 -0300 Message-ID: <20231126012858.40388-9-jamrial@gmail.com> In-Reply-To: <20231126012858.40388-1-jamrial@gmail.com> References: <20231126012858.40388-1-jamrial@gmail.com> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 8/9] avformat: Immersive Audio Model and Formats demuxer Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	avformat: introduce AVStreamGroup \| expand [FFmpeg-devel,v5,0/9] avformat: introduce AVStreamGroup [FFmpeg-devel,1/9] avutil/mem: add av_dynarray2_add_nofree [FFmpeg-devel,2/9] avcodec/get_bits: add get_leb() [FFmpeg-devel,3/9] avformat/aviobuf: add ffio_read_leb() and ffio_write_leb() [FFmpeg-devel,4/9] avutil: introduce an Immersive Audio Model and Formats API [FFmpeg-devel,5/9] avformat: introduce AVStreamGroup [FFmpeg-devel,6/9] ffmpeg: add support for muxing AVStreamGroups [FFmpeg-devel,7/9] avcodec/packet: add IAMF Parameters side data types [FFmpeg-devel,8/9] avformat: Immersive Audio Model and Formats demuxer [FFmpeg-devel,9/9] avformat: Immersive Audio Model and Formats muxer [FFmpeg-devel,10/13] avcodec: add an Immersive Audio Model and Formats frame split bsf [FFmpeg-devel,11/13] avformat/demux: support inserting bitstream filters in demuxing scenarios [FFmpeg-devel,12/13] avformat/mov: make MOVStreamContext refcounted [FFmpeg-devel,13/13] avformat/mov: add support for Immersive Audio Model and Formats in ISOBMFF

Context	Check	Description
andriy/make_x86	success	Make finished
andriy/make_fate_x86	success	Make fate finished

diff --git a/libavformat/Makefile b/libavformat/Makefile index 329055ccfd..752833f5a8 100644 --- a/libavformat/Makefile +++ b/libavformat/Makefile @@ -258,6 +258,7 @@ OBJS-$(CONFIG_EVC_MUXER) += rawenc.o OBJS-$(CONFIG_HLS_DEMUXER) += hls.o hls_sample_encryption.o OBJS-$(CONFIG_HLS_MUXER) += hlsenc.o hlsplaylist.o avc.o OBJS-$(CONFIG_HNM_DEMUXER) += hnm.o +OBJS-$(CONFIG_IAMF_DEMUXER) += iamfdec.o iamf.o OBJS-$(CONFIG_ICO_DEMUXER) += icodec.o OBJS-$(CONFIG_ICO_MUXER) += icoenc.o OBJS-$(CONFIG_IDCIN_DEMUXER) += idcin.o diff --git a/libavformat/allformats.c b/libavformat/allformats.c index d4b505a5a3..63ca44bacd 100644 --- a/libavformat/allformats.c +++ b/libavformat/allformats.c @@ -212,6 +212,7 @@ extern const FFOutputFormat ff_hevc_muxer; extern const AVInputFormat ff_hls_demuxer; extern const FFOutputFormat ff_hls_muxer; extern const AVInputFormat ff_hnm_demuxer; +extern const AVInputFormat ff_iamf_demuxer; extern const AVInputFormat ff_ico_demuxer; extern const FFOutputFormat ff_ico_muxer; extern const AVInputFormat ff_idcin_demuxer; diff --git a/libavformat/iamf.c b/libavformat/iamf.c new file mode 100644 index 0000000000..be63db663c --- /dev/null +++ b/libavformat/iamf.c @@ -0,0 +1,1149 @@ +/* + * Immersive Audio Model and Formats parsing + * Copyright (c) 2023 James Almer <jamrial@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/avassert.h" +#include "libavutil/common.h" +#include "libavutil/iamf.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/log.h" +#include "libavcodec/get_bits.h" +#include "libavcodec/flac.h" +#include "libavcodec/mpeg4audio.h" +#include "libavcodec/put_bits.h" +#include "avio_internal.h" +#include "iamf.h" +#include "isom.h" + +const AVChannelLayout ff_iamf_scalable_ch_layouts[10] = { + AV_CHANNEL_LAYOUT_MONO, + AV_CHANNEL_LAYOUT_STEREO, + // "Loudspeaker configuration for Sound System B" + AV_CHANNEL_LAYOUT_5POINT1_BACK, + // "Loudspeaker configuration for Sound System C" + AV_CHANNEL_LAYOUT_5POINT1POINT2_BACK, + // "Loudspeaker configuration for Sound System D" + AV_CHANNEL_LAYOUT_5POINT1POINT4_BACK, + // "Loudspeaker configuration for Sound System I" + AV_CHANNEL_LAYOUT_7POINT1, + // "Loudspeaker configuration for Sound System I" + Ltf + Rtf + AV_CHANNEL_LAYOUT_7POINT1POINT2, + // "Loudspeaker configuration for Sound System J" + AV_CHANNEL_LAYOUT_7POINT1POINT4_BACK, + // Front subset of "Loudspeaker configuration for Sound System J" + AV_CHANNEL_LAYOUT_3POINT1POINT2, + // Binaural + AV_CHANNEL_LAYOUT_STEREO, +}; + +const struct IAMFSoundSystemMap ff_iamf_sound_system_map[13] = { + { SOUND_SYSTEM_A_0_2_0, AV_CHANNEL_LAYOUT_STEREO }, + { SOUND_SYSTEM_B_0_5_0, AV_CHANNEL_LAYOUT_5POINT1_BACK }, + { SOUND_SYSTEM_C_2_5_0, AV_CHANNEL_LAYOUT_5POINT1POINT2_BACK }, + { SOUND_SYSTEM_D_4_5_0, AV_CHANNEL_LAYOUT_5POINT1POINT4_BACK }, + { SOUND_SYSTEM_E_4_5_1, + { + .nb_channels = 11, + .order = AV_CHANNEL_ORDER_NATIVE, + .u.mask = AV_CH_LAYOUT_5POINT1POINT4_BACK | AV_CH_BOTTOM_FRONT_CENTER, + }, + }, + { SOUND_SYSTEM_F_3_7_0, AV_CHANNEL_LAYOUT_7POINT2POINT3 }, + { SOUND_SYSTEM_G_4_9_0, AV_CHANNEL_LAYOUT_9POINT1POINT4_BACK }, + { SOUND_SYSTEM_H_9_10_3, AV_CHANNEL_LAYOUT_22POINT2 }, + { SOUND_SYSTEM_I_0_7_0, AV_CHANNEL_LAYOUT_7POINT1 }, + { SOUND_SYSTEM_J_4_7_0, AV_CHANNEL_LAYOUT_7POINT1POINT4_BACK }, + { SOUND_SYSTEM_10_2_7_0, AV_CHANNEL_LAYOUT_7POINT1POINT2 }, + { SOUND_SYSTEM_11_2_3_0, AV_CHANNEL_LAYOUT_3POINT1POINT2 }, + { SOUND_SYSTEM_12_0_1_0, AV_CHANNEL_LAYOUT_MONO }, +}; + +static int opus_decoder_config(IAMFCodecConfig *codec_config, + AVIOContext *pb, int len) +{ + int left = len - avio_tell(pb); + + if (left < 11) + return AVERROR_INVALIDDATA; + + codec_config->extradata = av_malloc(left + 8); + if (!codec_config->extradata) + return AVERROR(ENOMEM); + + AV_WB32(codec_config->extradata, MKBETAG('O','p','u','s')); + AV_WB32(codec_config->extradata + 4, MKBETAG('H','e','a','d')); + codec_config->extradata_size = avio_read(pb, codec_config->extradata + 8, left); + if (codec_config->extradata_size < left) + return AVERROR_INVALIDDATA; + + codec_config->extradata_size += 8; + codec_config->sample_rate = 48000; + + return 0; +} + +static int aac_decoder_config(IAMFCodecConfig *codec_config, + AVIOContext *pb, int len, void *logctx) +{ + MPEG4AudioConfig cfg = { 0 }; + int object_type_id, codec_id, stream_type; + int ret, tag, left; + + tag = avio_r8(pb); + if (tag != MP4DecConfigDescrTag) + return AVERROR_INVALIDDATA; + + object_type_id = avio_r8(pb); + if (object_type_id != 0x40) + return AVERROR_INVALIDDATA; + + stream_type = avio_r8(pb); + if (((stream_type >> 2) != 5) || ((stream_type >> 1) & 1)) + return AVERROR_INVALIDDATA; + + avio_skip(pb, 3); // buffer size db + avio_skip(pb, 4); // rc_max_rate + avio_skip(pb, 4); // avg bitrate + + codec_id = ff_codec_get_id(ff_mp4_obj_type, object_type_id); + if (codec_id && codec_id != codec_config->codec_id) + return AVERROR_INVALIDDATA; + + tag = avio_r8(pb); + if (tag != MP4DecSpecificDescrTag) + return AVERROR_INVALIDDATA; + + left = len - avio_tell(pb); + if (left <= 0) + return AVERROR_INVALIDDATA; + + codec_config->extradata = av_malloc(left); + if (!codec_config->extradata) + return AVERROR(ENOMEM); + + codec_config->extradata_size = avio_read(pb, codec_config->extradata, left); + if (codec_config->extradata_size < left) + return AVERROR_INVALIDDATA; + + ret = avpriv_mpeg4audio_get_config2(&cfg, codec_config->extradata, + codec_config->extradata_size, 1, logctx); + if (ret < 0) + return ret; + + codec_config->sample_rate = cfg.sample_rate; + + return 0; +} + +static int flac_decoder_config(IAMFCodecConfig *codec_config, + AVIOContext *pb, int len) +{ + int left; + + avio_skip(pb, 4); // METADATA_BLOCK_HEADER + + left = len - avio_tell(pb); + if (left < FLAC_STREAMINFO_SIZE) + return AVERROR_INVALIDDATA; + + codec_config->extradata = av_malloc(left); + if (!codec_config->extradata) + return AVERROR(ENOMEM); + + codec_config->extradata_size = avio_read(pb, codec_config->extradata, left); + if (codec_config->extradata_size < left) + return AVERROR_INVALIDDATA; + + codec_config->sample_rate = AV_RB24(codec_config->extradata + 10) >> 4; + + return 0; +} + +static int ipcm_decoder_config(IAMFCodecConfig *codec_config, + AVIOContext *pb, int len) +{ + static const enum AVSampleFormat sample_fmt[2][3] = { + { AV_CODEC_ID_PCM_S16BE, AV_CODEC_ID_PCM_S24BE, AV_CODEC_ID_PCM_S32BE }, + { AV_CODEC_ID_PCM_S16LE, AV_CODEC_ID_PCM_S24LE, AV_CODEC_ID_PCM_S32LE }, + }; + int sample_format = avio_r8(pb); // 0 = BE, 1 = LE + int sample_size = (avio_r8(pb) / 8 - 2); // 16, 24, 32 + if (sample_format > 1 || sample_size > 2) + return AVERROR_INVALIDDATA; + + codec_config->codec_id = sample_fmt[sample_format][sample_size]; + codec_config->sample_rate = avio_rb32(pb); + + if (len - avio_tell(pb)) + return AVERROR_INVALIDDATA; + + return 0; +} + +static int codec_config_obu(void *s, IAMFContext *c, AVIOContext *pb, int len) +{ + IAMFCodecConfig *codec_config = NULL; + FFIOContext b; + AVIOContext *pbc; + uint8_t *buf; + enum AVCodecID avcodec_id; + unsigned codec_config_id, nb_samples, codec_id; + int16_t seek_preroll; + int ret; + + buf = av_malloc(len); + if (!buf) + return AVERROR(ENOMEM); + + ret = avio_read(pb, buf, len); + if (ret != len) { + if (ret >= 0) + ret = AVERROR_INVALIDDATA; + goto fail; + } + + ffio_init_context(&b, buf, len, 0, NULL, NULL, NULL, NULL); + pbc = &b.pub; + + codec_config_id = ffio_read_leb(pbc); + codec_id = avio_rb32(pbc); + nb_samples = ffio_read_leb(pbc); + seek_preroll = avio_rb16(pbc); + + switch(codec_id) { + case MKBETAG('O','p','u','s'): + avcodec_id = AV_CODEC_ID_OPUS; + break; + case MKBETAG('m','p','4','a'): + avcodec_id = AV_CODEC_ID_AAC; + break; + case MKBETAG('f','L','a','C'): + avcodec_id = AV_CODEC_ID_FLAC; + break; + default: + avcodec_id = AV_CODEC_ID_NONE; + break; + } + + for (int i = 0; i < c->nb_codec_configs; i++) + if (c->codec_configs[i].codec_config_id == codec_config_id) { + ret = AVERROR_INVALIDDATA; + goto fail; + } + + codec_config = av_dynarray2_add_nofree((void **)&c->codec_configs, &c->nb_codec_configs, + sizeof(*c->codec_configs), NULL); + if (!codec_config) { + ret = AVERROR(ENOMEM); + goto fail; + } + + memset(codec_config, 0, sizeof(*codec_config)); + + codec_config->codec_config_id = codec_config_id; + codec_config->codec_id = avcodec_id; + codec_config->nb_samples = nb_samples; + codec_config->seek_preroll = seek_preroll; + + switch(codec_id) { + case MKBETAG('O','p','u','s'): + ret = opus_decoder_config(codec_config, pbc, len); + break; + case MKBETAG('m','p','4','a'): + ret = aac_decoder_config(codec_config, pbc, len, s); + break; + case MKBETAG('f','L','a','C'): + ret = flac_decoder_config(codec_config, pbc, len); + break; + case MKBETAG('i','p','c','m'): + ret = ipcm_decoder_config(codec_config, pbc, len); + break; + default: + break; + } + if (ret < 0) + goto fail; + + len -= avio_tell(pbc); + if (len) + av_log(s, AV_LOG_WARNING, "Underread in codec_config_obu. %d bytes left at the end\n", len); + + ret = 0; +fail: + av_free(buf); + return ret; +} + +static int update_extradata(AVCodecParameters *codecpar) +{ + GetBitContext gb; + PutBitContext pb; + int ret; + + switch(codecpar->codec_id) { + case AV_CODEC_ID_OPUS: + AV_WB8(codecpar->extradata + 9, codecpar->ch_layout.nb_channels); + break; + case AV_CODEC_ID_AAC: { + uint8_t buf[5]; + + init_put_bits(&pb, buf, sizeof(buf)); + ret = init_get_bits8(&gb, codecpar->extradata, codecpar->extradata_size); + if (ret < 0) + return ret; + + ret = get_bits(&gb, 5); + put_bits(&pb, 5, ret); + if (ret == AOT_ESCAPE) // violates section 3.11.2, but better check for it + put_bits(&pb, 6, get_bits(&gb, 6)); + ret = get_bits(&gb, 4); + put_bits(&pb, 4, ret); + if (ret == 0x0f) + put_bits(&pb, 24, get_bits(&gb, 24)); + + skip_bits(&gb, 4); + put_bits(&pb, 4, codecpar->ch_layout.nb_channels); // set channel config + ret = put_bits_left(&pb); + put_bits(&pb, ret, get_bits(&gb, ret)); + flush_put_bits(&pb); + + memcpy(codecpar->extradata, buf, sizeof(buf)); + break; + } + case AV_CODEC_ID_FLAC: { + uint8_t buf[13]; + + init_put_bits(&pb, buf, sizeof(buf)); + ret = init_get_bits8(&gb, codecpar->extradata, codecpar->extradata_size); + if (ret < 0) + return ret; + + put_bits32(&pb, get_bits_long(&gb, 32)); // min/max blocksize + put_bits64(&pb, 48, get_bits64(&gb, 48)); // min/max framesize + put_bits(&pb, 20, get_bits(&gb, 20)); // samplerate + skip_bits(&gb, 3); + put_bits(&pb, 3, codecpar->ch_layout.nb_channels - 1); + ret = put_bits_left(&pb); + put_bits(&pb, ret, get_bits(&gb, ret)); + flush_put_bits(&pb); + + memcpy(codecpar->extradata, buf, sizeof(buf)); + break; + } + } + + return 0; +} + +static int scalable_channel_layout_config(void *s, AVIOContext *pb, + IAMFAudioElement *audio_element, + const IAMFCodecConfig *codec_config) +{ + int num_layers, k = 0; + + num_layers = avio_r8(pb) >> 5; // get_bits(&gb, 3); + // skip_bits(&gb, 5); //reserved + + if (num_layers > 6) + return AVERROR_INVALIDDATA; + + for (int i = 0; i < num_layers; i++) { + AVIAMFLayer *layer; + int loudspeaker_layout, output_gain_is_present_flag; + int substream_count, coupled_substream_count; + int ret, byte = avio_r8(pb); + + ret = av_iamf_audio_element_add_layer(audio_element->element, NULL); + if (ret < 0) + return ret; + + loudspeaker_layout = byte >> 4; // get_bits(&gb, 4); + output_gain_is_present_flag = (byte >> 3) & 1; //get_bits1(&gb); + layer = audio_element->element->layers[i]; + layer->recon_gain_is_present = (byte >> 2) & 1; + substream_count = avio_r8(pb); + coupled_substream_count = avio_r8(pb); + + if (output_gain_is_present_flag) { + layer->output_gain_flags = avio_r8(pb) >> 2; // get_bits(&gb, 6); + layer->output_gain = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + } + + if (loudspeaker_layout < 10) + av_channel_layout_copy(&layer->ch_layout, &ff_iamf_scalable_ch_layouts[loudspeaker_layout]); + else + layer->ch_layout = (AVChannelLayout){ .order = AV_CHANNEL_ORDER_UNSPEC, + .nb_channels = substream_count + + coupled_substream_count }; + + for (int j = 0; j < substream_count; j++) { + IAMFSubStream *substream = &audio_element->substreams[k++]; + + substream->codecpar->ch_layout = coupled_substream_count-- > 0 ? (AVChannelLayout)AV_CHANNEL_LAYOUT_STEREO : + (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO; + + ret = update_extradata(substream->codecpar); + if (ret < 0) + return ret; + } + + } + + return 0; +} + +static int ambisonics_config(void *s, AVIOContext *pb, + IAMFAudioElement *audio_element, + const IAMFCodecConfig *codec_config) +{ + AVIAMFLayer *layer; + unsigned ambisonics_mode; + int output_channel_count, substream_count, order; + int ret; + + ambisonics_mode = ffio_read_leb(pb); + if (ambisonics_mode > 1) + return 0; + + output_channel_count = avio_r8(pb); // C + substream_count = avio_r8(pb); // N + if (audio_element->nb_substreams != substream_count) + return AVERROR_INVALIDDATA; + + order = floor(sqrt(output_channel_count - 1)); + /* incomplete order - some harmonics are missing */ + if ((order + 1) * (order + 1) != output_channel_count) + return AVERROR_INVALIDDATA; + + ret = av_iamf_audio_element_add_layer(audio_element->element, NULL); + if (ret < 0) + return ret; + + layer = audio_element->element->layers[0]; + layer->ambisonics_mode = ambisonics_mode; + if (ambisonics_mode == 0) { + for (int i = 0; i < substream_count; i++) { + IAMFSubStream *substream = &audio_element->substreams[i]; + + substream->codecpar->ch_layout = (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO; + + ret = update_extradata(substream->codecpar); + if (ret < 0) + return ret; + } + + layer->ch_layout.order = AV_CHANNEL_ORDER_CUSTOM; + layer->ch_layout.nb_channels = output_channel_count; + layer->ch_layout.u.map = av_calloc(output_channel_count, sizeof(*layer->ch_layout.u.map)); + if (!layer->ch_layout.u.map) + return AVERROR(ENOMEM); + + for (int i = 0; i < output_channel_count; i++) + layer->ch_layout.u.map[i].id = avio_r8(pb) + AV_CHAN_AMBISONIC_BASE; + } else { + int coupled_substream_count = avio_r8(pb); // M + int nb_demixing_matrix = substream_count + coupled_substream_count; + int demixing_matrix_size = nb_demixing_matrix * output_channel_count; + + layer->ch_layout = (AVChannelLayout){ .order = AV_CHANNEL_ORDER_AMBISONIC, .nb_channels = output_channel_count }; + layer->demixing_matrix = av_malloc_array(demixing_matrix_size, sizeof(*layer->demixing_matrix)); + if (!layer->demixing_matrix) + return AVERROR(ENOMEM); + + for (int i = 0; i < demixing_matrix_size; i++) + layer->demixing_matrix[i] = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + + for (int i = 0; i < substream_count; i++) { + IAMFSubStream *substream = &audio_element->substreams[i]; + + substream->codecpar->ch_layout = coupled_substream_count-- > 0 ? (AVChannelLayout)AV_CHANNEL_LAYOUT_STEREO : + (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO; + + + ret = update_extradata(substream->codecpar); + if (ret < 0) + return ret; + } + } + + return 0; +} + +static int param_parse(void *s, IAMFContext *c, AVIOContext *pb, + unsigned int param_definition_type, + const AVIAMFAudioElement *audio_element, + AVIAMFParamDefinition **out_param_definition) +{ + IAMFParamDefinition *param_definition = NULL; + AVIAMFParamDefinition *param; + unsigned int parameter_id, parameter_rate, param_definition_mode; + unsigned int duration = 0, constant_subblock_duration = 0, num_subblocks = 0; + size_t param_size; + + parameter_id = ffio_read_leb(pb); + + for (int i = 0; i < c->nb_param_definitions; i++) + if (c->param_definitions[i].param->parameter_id == parameter_id) { + param_definition = &c->param_definitions[i]; + break; + } + + parameter_rate = ffio_read_leb(pb); + param_definition_mode = avio_r8(pb) >> 7; + + if (param_definition_mode == 0) { + duration = ffio_read_leb(pb); + constant_subblock_duration = ffio_read_leb(pb); + if (constant_subblock_duration == 0) + num_subblocks = ffio_read_leb(pb); + else + num_subblocks = duration / constant_subblock_duration; + } + + param = av_iamf_param_definition_alloc(param_definition_type, NULL, num_subblocks, NULL, &param_size); + if (!param) + return AVERROR(ENOMEM); + + for (int i = 0; i < num_subblocks; i++) { + void *subblock = av_iamf_param_definition_get_subblock(param, i); + unsigned int subblock_duration = constant_subblock_duration; + + if (constant_subblock_duration == 0) + subblock_duration = ffio_read_leb(pb); + + switch (param_definition_type) { + case AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN: { + AVIAMFMixGainParameterData *mix = subblock; + mix->subblock_duration = subblock_duration; + break; + } + case AV_IAMF_PARAMETER_DEFINITION_DEMIXING: { + AVIAMFDemixingInfoParameterData *demix = subblock; + demix->subblock_duration = subblock_duration; + // DemixingInfoParameterData + demix->dmixp_mode = avio_r8(pb) >> 5; + break; + } + case AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN: { + AVIAMFReconGainParameterData *recon = subblock; + recon->subblock_duration = subblock_duration; + break; + } + default: + av_free(param); + return AVERROR_INVALIDDATA; + } + } + + param->parameter_id = parameter_id; + param->parameter_rate = parameter_rate; + param->param_definition_mode = param_definition_mode; + param->duration = duration; + param->constant_subblock_duration = constant_subblock_duration; + param->num_subblocks = num_subblocks; + + if (param_definition) { + if (param_definition->param_size != param_size || memcmp(param_definition->param, param, param_size)) { + av_log(s, AV_LOG_ERROR, "Incosistent parameters for parameter_id %u\n", parameter_id); + av_free(param); + return AVERROR_INVALIDDATA; + } + } else { + param_definition = av_dynarray2_add_nofree((void **)&c->param_definitions, &c->nb_param_definitions, + sizeof(*c->param_definitions), NULL); + if (!param_definition) { + av_free(param); + return AVERROR(ENOMEM); + } + param_definition->param = param; + param_definition->param_size = param_size; + param_definition->audio_element = audio_element; + } + + av_assert0(out_param_definition); + *out_param_definition = param; + + return 0; +} + +static int audio_element_obu(void *s, IAMFContext *c, AVIOContext *pb, int len) +{ + const IAMFCodecConfig *codec_config = NULL; + AVIAMFAudioElement *element; + IAMFAudioElement *audio_element; + FFIOContext b; + AVIOContext *pbc; + uint8_t *buf; + unsigned audio_element_id, codec_config_id, num_substreams, num_parameters; + int audio_element_type, ret; + + buf = av_malloc(len); + if (!buf) + return AVERROR(ENOMEM); + + ret = avio_read(pb, buf, len); + if (ret != len) { + if (ret >= 0) + ret = AVERROR_INVALIDDATA; + goto fail; + } + + ffio_init_context(&b, buf, len, 0, NULL, NULL, NULL, NULL); + pbc = &b.pub; + + audio_element_id = ffio_read_leb(pbc); + + for (int i = 0; i < c->nb_audio_elements; i++) + if (c->audio_elements[i].audio_element_id == audio_element_id) { + av_log(s, AV_LOG_ERROR, "Duplicate audio_element_id %d\n", audio_element_id); + ret = AVERROR_INVALIDDATA; + goto fail; + } + + audio_element_type = avio_r8(pbc) >> 5; + codec_config_id = ffio_read_leb(pbc); + + for (int i = 0; i < c->nb_codec_configs; i++) { + if (c->codec_configs[i].codec_config_id == codec_config_id) { + codec_config = &c->codec_configs[i]; + break; + } + } + + if (!codec_config) { + av_log(s, AV_LOG_ERROR, "Non existant codec config id %d referenced in an audio element\n", codec_config_id); + ret = AVERROR_INVALIDDATA; + goto fail; + } + + if (codec_config->codec_id == AV_CODEC_ID_NONE) { + av_log(s, AV_LOG_DEBUG, "Unknown codec id referenced in an audio element. Ignoring\n"); + ret = 0; + goto fail; + } + + num_substreams = ffio_read_leb(pbc); + + audio_element = av_dynarray2_add_nofree((void **)&c->audio_elements, &c->nb_audio_elements, + sizeof(*c->audio_elements), NULL); + if (!audio_element) { + ret = AVERROR(ENOMEM); + goto fail; + } + + memset(audio_element, 0, sizeof(*audio_element)); + + audio_element->codec_config = codec_config; + audio_element->audio_element_id = audio_element_id; + element = audio_element->element = av_iamf_audio_element_alloc(); + if (!element) { + ret = AVERROR(ENOMEM); + goto fail; + } + + element->codec_config_id = codec_config_id; + element->audio_element_type = audio_element_type; + + for (int i = 0; i < num_substreams; i++) { + IAMFSubStream *substream; + + substream = av_dynarray2_add_nofree((void **)&audio_element->substreams, &audio_element->nb_substreams, + sizeof(*audio_element->substreams), NULL); + if (!substream) { + ret = AVERROR(ENOMEM); + goto fail; + } + + substream->codecpar = avcodec_parameters_alloc(); + if (!substream->codecpar) { + ret = AVERROR(ENOMEM); + goto fail; + } + + substream->audio_substream_id = ffio_read_leb(pbc); + + substream->codecpar->codec_type = AVMEDIA_TYPE_AUDIO; + substream->codecpar->codec_id = codec_config->codec_id; + substream->codecpar->frame_size = codec_config->nb_samples; + substream->codecpar->sample_rate = codec_config->sample_rate; + substream->codecpar->seek_preroll = codec_config->seek_preroll; + + switch(substream->codecpar->codec_id) { + case AV_CODEC_ID_AAC: + case AV_CODEC_ID_FLAC: + case AV_CODEC_ID_OPUS: + substream->codecpar->extradata = av_malloc(codec_config->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE); + if (!substream->codecpar->extradata) { + ret = AVERROR(ENOMEM); + goto fail; + } + memcpy(substream->codecpar->extradata, codec_config->extradata, codec_config->extradata_size); + memset(substream->codecpar->extradata + codec_config->extradata_size, 0, AV_INPUT_BUFFER_PADDING_SIZE); + substream->codecpar->extradata_size = codec_config->extradata_size; + break; + } + } + + num_parameters = ffio_read_leb(pbc); + if (num_parameters && audio_element_type != 0) { + av_log(s, AV_LOG_ERROR, "Audio Element parameter count %u is invalid" + " for Scene representations\n", num_parameters); + ret = AVERROR_INVALIDDATA; + goto fail; + } + + for (int i = 0; i < num_parameters; i++) { + unsigned param_definition_type; + + param_definition_type = ffio_read_leb(pbc); + if (param_definition_type == AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN) { + ret = AVERROR_INVALIDDATA; + goto fail; + } else if (param_definition_type == AV_IAMF_PARAMETER_DEFINITION_DEMIXING) { + ret = param_parse(s, c, pbc, param_definition_type, element, &element->demixing_info); + if (ret < 0) + goto fail; + + element->default_w = avio_r8(pbc) >> 4; + } else if (param_definition_type == AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN) { + ret = param_parse(s, c, pbc, param_definition_type, element, &element->recon_gain_info); + if (ret < 0) + goto fail; + } else { + unsigned param_definition_size = ffio_read_leb(pbc); + avio_skip(pbc, param_definition_size); + } + } + + if (audio_element_type == AV_IAMF_AUDIO_ELEMENT_TYPE_CHANNEL) { + ret = scalable_channel_layout_config(s, pbc, audio_element, codec_config); + if (ret < 0) + goto fail; + } else if (audio_element_type == AV_IAMF_AUDIO_ELEMENT_TYPE_SCENE) { + ret = ambisonics_config(s, pbc, audio_element, codec_config); + if (ret < 0) + goto fail; + } else { + unsigned audio_element_config_size = ffio_read_leb(pbc); + avio_skip(pbc, audio_element_config_size); + } + + len -= avio_tell(pbc); + if (len) + av_log(s, AV_LOG_WARNING, "Underread in audio_element_obu. %d bytes left at the end\n", len); + + ret = 0; +fail: + av_free(buf); + + return ret; +} + +static int label_string(AVIOContext *pb, char **label) +{ + uint8_t buf[128]; + + avio_get_str(pb, sizeof(buf), buf, sizeof(buf)); + + if (pb->error) + return pb->error; + if (pb->eof_reached) + return AVERROR_INVALIDDATA; + *label = av_strdup(buf); + if (!*label) + return AVERROR(ENOMEM); + + return 0; +} + +static int mix_presentation_obu(void *s, IAMFContext *c, AVIOContext *pb, int len) +{ + AVIAMFMixPresentation *mix; + IAMFMixPresentation *mix_presentation; + FFIOContext b; + AVIOContext *pbc; + uint8_t *buf; + unsigned mix_presentation_id; + int ret; + + buf = av_malloc(len); + if (!buf) + return AVERROR(ENOMEM); + + ret = avio_read(pb, buf, len); + if (ret != len) { + if (ret >= 0) + ret = AVERROR_INVALIDDATA; + goto fail; + } + + ffio_init_context(&b, buf, len, 0, NULL, NULL, NULL, NULL); + pbc = &b.pub; + + mix_presentation_id = ffio_read_leb(pbc); + + for (int i = 0; i < c->nb_mix_presentations; i++) + if (c->mix_presentations[i].mix_presentation_id == mix_presentation_id) { + av_log(s, AV_LOG_ERROR, "Duplicate mix_presentation_id %d\n", mix_presentation_id); + ret = AVERROR_INVALIDDATA; + goto fail; + } + + mix_presentation = av_dynarray2_add_nofree((void **)&c->mix_presentations, &c->nb_mix_presentations, + sizeof(*c->mix_presentations), NULL); + if (!mix_presentation) { + ret = AVERROR(ENOMEM); + goto fail; + } + + memset(mix_presentation, 0, sizeof(*mix_presentation)); + + mix_presentation->mix_presentation_id = mix_presentation_id; + mix = mix_presentation->mix = av_iamf_mix_presentation_alloc(); + if (!mix) { + ret = AVERROR(ENOMEM); + goto fail; + } + + mix_presentation->count_label = ffio_read_leb(pbc); + + mix_presentation->language_label = av_calloc(mix_presentation->count_label, + sizeof(*mix_presentation->language_label)); + if (!mix_presentation->language_label) { + ret = AVERROR(ENOMEM); + goto fail; + } + + for (int i = 0; i < mix_presentation->count_label; i++) { + ret = label_string(pbc, &mix_presentation->language_label[i]); + if (ret < 0) + goto fail; + } + + for (int i = 0; i < mix_presentation->count_label; i++) { + char *annotation = NULL; + ret = label_string(pbc, &annotation); + if (ret < 0) + goto fail; + ret = av_dict_set(&mix->annotations, mix_presentation->language_label[i], annotation, + AV_DICT_DONT_STRDUP_VAL | AV_DICT_DONT_OVERWRITE); + if (ret < 0) + goto fail; + } + + mix->num_submixes = ffio_read_leb(pbc); + mix->submixes = av_calloc(mix->num_submixes, sizeof(*mix->submixes)); + if (!mix->submixes) { + ret = AVERROR(ENOMEM); + goto fail; + } + + for (int i = 0; i < mix->num_submixes; i++) { + AVIAMFSubmix *sub_mix; + + sub_mix = mix->submixes[i] = av_mallocz(sizeof(*sub_mix)); + if (!sub_mix) { + ret = AVERROR(ENOMEM); + goto fail; + } + + sub_mix->num_elements = ffio_read_leb(pbc); + sub_mix->elements = av_calloc(sub_mix->num_elements, sizeof(*sub_mix->elements)); + if (!sub_mix->elements) { + ret = AVERROR(ENOMEM); + goto fail; + } + + for (int j = 0; j < sub_mix->num_elements; j++) { + AVIAMFSubmixElement *submix_element; + IAMFAudioElement *audio_element = NULL; + unsigned int rendering_config_extension_size; + + submix_element = sub_mix->elements[j] = av_mallocz(sizeof(*submix_element)); + if (!submix_element) { + ret = AVERROR(ENOMEM); + goto fail; + } + + submix_element->audio_element_id = ffio_read_leb(pbc); + + for (int k = 0; k < c->nb_audio_elements; k++) + if (c->audio_elements[k].audio_element_id == submix_element->audio_element_id) { + audio_element = &c->audio_elements[k]; + break; + } + + if (!audio_element) { + av_log(s, AV_LOG_ERROR, "Invalid Audio Element with id %u referenced by Mix Parameters %u\n", + submix_element->audio_element_id, mix_presentation_id); + ret = AVERROR_INVALIDDATA; + goto fail; + } + + for (int k = 0; k < mix_presentation->count_label; k++) { + char *annotation = NULL; + ret = label_string(pbc, &annotation); + if (ret < 0) + goto fail; + ret = av_dict_set(&submix_element->annotations, mix_presentation->language_label[k], annotation, + AV_DICT_DONT_STRDUP_VAL | AV_DICT_DONT_OVERWRITE); + if (ret < 0) + goto fail; + } + + submix_element->headphones_rendering_mode = avio_r8(pbc) >> 6; + + rendering_config_extension_size = ffio_read_leb(pbc); + avio_skip(pbc, rendering_config_extension_size); + + ret = param_parse(s, c, pbc, AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN, + audio_element->element, + &submix_element->element_mix_config); + if (ret < 0) + goto fail; + submix_element->default_mix_gain = av_make_q(sign_extend(avio_rb16(pbc), 16), 1 << 8); + } + + ret = param_parse(s, c, pbc, AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN, NULL, &sub_mix->output_mix_config); + if (ret < 0) + goto fail; + sub_mix->default_mix_gain = av_make_q(sign_extend(avio_rb16(pbc), 16), 1 << 8); + + sub_mix->num_layouts = ffio_read_leb(pbc); + sub_mix->layouts = av_calloc(sub_mix->num_layouts, sizeof(*sub_mix->layouts)); + if (!sub_mix->layouts) { + ret = AVERROR(ENOMEM); + goto fail; + } + + for (int j = 0; j < sub_mix->num_layouts; j++) { + AVIAMFSubmixLayout *submix_layout; + int info_type; + int byte = avio_r8(pbc); + + submix_layout = sub_mix->layouts[j] = av_mallocz(sizeof(*submix_layout)); + if (!submix_layout) { + ret = AVERROR(ENOMEM); + goto fail; + } + + submix_layout->layout_type = byte >> 6; + if (submix_layout->layout_type < AV_IAMF_SUBMIX_LAYOUT_TYPE_LOUDSPEAKERS && + submix_layout->layout_type > AV_IAMF_SUBMIX_LAYOUT_TYPE_BINAURAL) { + av_log(s, AV_LOG_ERROR, "Invalid Layout type %u in a submix from Mix Presentation %u\n", + submix_layout->layout_type, mix_presentation_id); + ret = AVERROR_INVALIDDATA; + goto fail; + } + if (submix_layout->layout_type == 2) { + int sound_system; + sound_system = (byte >> 2) & 0xF; + av_channel_layout_copy(&submix_layout->sound_system, &ff_iamf_sound_system_map[sound_system].layout); + } + + info_type = avio_r8(pbc); + submix_layout->integrated_loudness = av_make_q(sign_extend(avio_rb16(pbc), 16), 1 << 8); + submix_layout->digital_peak = av_make_q(sign_extend(avio_rb16(pbc), 16), 1 << 8); + + if (info_type & 1) + submix_layout->true_peak = av_make_q(sign_extend(avio_rb16(pbc), 16), 1 << 8); + if (info_type & 2) { + unsigned int num_anchored_loudness = avio_r8(pbc); + + for (int k = 0; k < num_anchored_loudness; k++) { + unsigned int anchor_element = avio_r8(pbc); + AVRational anchored_loudness = av_make_q(sign_extend(avio_rb16(pbc), 16), 1 << 8); + if (anchor_element == IAMF_ANCHOR_ELEMENT_DIALOGUE) + submix_layout->dialogue_anchored_loudness = anchored_loudness; + else if (anchor_element <= IAMF_ANCHOR_ELEMENT_ALBUM) + submix_layout->album_anchored_loudness = anchored_loudness; + else + av_log(s, AV_LOG_DEBUG, "Unknown anchor_element. Ignoring\n"); + } + } + + if (info_type & 0xFC) { + unsigned int info_type_size = ffio_read_leb(pbc); + avio_skip(pbc, info_type_size); + } + } + } + + len -= avio_tell(pbc); + if (len) + av_log(s, AV_LOG_WARNING, "Underread in mix_presentation_obu. %d bytes left at the end\n", len); + + ret = 0; +fail: + av_free(buf); + + return ret; +} + +int ff_iamf_parse_obu_header(const uint8_t *buf, int buf_size, + unsigned *obu_size, int *start_pos, enum IAMF_OBU_Type *type, + unsigned *skip_samples, unsigned *discard_padding) +{ + GetBitContext gb; + int ret, extension_flag, trimming, start; + unsigned skip = 0, discard = 0; + unsigned size; + + ret = init_get_bits8(&gb, buf, FFMIN(buf_size, MAX_IAMF_OBU_HEADER_SIZE)); + if (ret < 0) + return ret; + + *type = get_bits(&gb, 5); + /*redundant =*/ get_bits1(&gb); + trimming = get_bits1(&gb); + extension_flag = get_bits1(&gb); + + *obu_size = get_leb(&gb); + if (*obu_size > INT_MAX) + return AVERROR_INVALIDDATA; + + start = get_bits_count(&gb) / 8; + + if (trimming) { + discard = get_leb(&gb); // num_samples_to_trim_at_end + skip = get_leb(&gb); // num_samples_to_trim_at_start + } + + if (skip_samples) + *skip_samples = skip; + if (discard_padding) + *discard_padding = discard; + + if (extension_flag) { + unsigned int extension_bytes; + extension_bytes = get_leb(&gb); + if (extension_bytes > INT_MAX / 8) + return AVERROR_INVALIDDATA; + skip_bits_long(&gb, extension_bytes * 8); + } + + if (get_bits_left(&gb) < 0) + return AVERROR_INVALIDDATA; + + size = *obu_size + start; + if (size > INT_MAX) + return AVERROR_INVALIDDATA; + + *obu_size -= get_bits_count(&gb) / 8 - start; + *start_pos = size - *obu_size; + + return size; +} + +int ff_iamfdec_read_descriptors(IAMFContext *c, AVIOContext *pb, + int max_size, void *log_ctx) +{ + uint8_t header[MAX_IAMF_OBU_HEADER_SIZE + AV_INPUT_BUFFER_PADDING_SIZE]; + int ret; + + while (1) { + unsigned obu_size; + enum IAMF_OBU_Type type; + int start_pos, len, size; + + if ((ret = ffio_ensure_seekback(pb, FFMIN(MAX_IAMF_OBU_HEADER_SIZE, max_size))) < 0) + return ret; + size = avio_read(pb, header, FFMIN(MAX_IAMF_OBU_HEADER_SIZE, max_size)); + if (size < 0) + return size; + + len = ff_iamf_parse_obu_header(header, size, &obu_size, &start_pos, &type, NULL, NULL); + if (len < 0 || obu_size > max_size) { + av_log(log_ctx, AV_LOG_ERROR, "Failed to read obu\n"); + avio_seek(pb, -size, SEEK_CUR); + return len; + } + + if (type >= IAMF_OBU_IA_PARAMETER_BLOCK && type < IAMF_OBU_IA_SEQUENCE_HEADER) { + avio_seek(pb, -size, SEEK_CUR); + break; + } + + avio_seek(pb, -(size - start_pos), SEEK_CUR); + switch (type) { + case IAMF_OBU_IA_CODEC_CONFIG: + ret = codec_config_obu(log_ctx, c, pb, obu_size); + break; + case IAMF_OBU_IA_AUDIO_ELEMENT: + ret = audio_element_obu(log_ctx, c, pb, obu_size); + break; + case IAMF_OBU_IA_MIX_PRESENTATION: + ret = mix_presentation_obu(log_ctx, c, pb, obu_size); + break; + case IAMF_OBU_IA_TEMPORAL_DELIMITER: + break; + default: { + int64_t offset = avio_skip(pb, obu_size); + if (offset < 0) + ret = offset; + break; + } + } + if (ret < 0) + return ret; + max_size -= obu_size + start_pos; + if (max_size < 0) + return AVERROR_INVALIDDATA; + if (!max_size) + break; + } + + return 0; +} + +void ff_iamf_uninit_context(IAMFContext *c) +{ + if (!c) + return; + + for (int i = 0; i < c->nb_codec_configs; i++) + av_free(c->codec_configs[i].extradata); + av_freep(&c->codec_configs); + c->nb_codec_configs = 0; + + for (int i = 0; i < c->nb_audio_elements; i++) { + IAMFAudioElement *audio_element = &c->audio_elements[i]; + for (int j = 0; j < audio_element->nb_substreams; j++) + avcodec_parameters_free(&audio_element->substreams[i].codecpar); + av_free(audio_element->substreams); + av_free(audio_element->layers); + av_iamf_audio_element_free(&audio_element->element); + } + av_freep(&c->audio_elements); + c->nb_audio_elements = 0; + + for (int i = 0; i < c->nb_mix_presentations; i++) { + IAMFMixPresentation *mix_presentation = &c->mix_presentations[i]; + for (int j = 0; j < mix_presentation->count_label; j++) + av_free(mix_presentation->language_label[j]); + av_free(mix_presentation->language_label); + av_iamf_mix_presentation_free(&mix_presentation->mix); + } + av_freep(&c->mix_presentations); + c->nb_mix_presentations = 0; + + av_freep(&c->param_definitions); + c->nb_param_definitions = 0; +} diff --git a/libavformat/iamf.h b/libavformat/iamf.h new file mode 100644 index 0000000000..7e3239d500 --- /dev/null +++ b/libavformat/iamf.h @@ -0,0 +1,167 @@ +/* + * Immersive Audio Model and Formats parsing + * Copyright (c) 2023 James Almer <jamrial@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFORMAT_IAMF_H +#define AVFORMAT_IAMF_H + +#include <stdint.h> + +#include "libavutil/iamf.h" +#include "libavcodec/codec_id.h" +#include "libavcodec/codec_par.h" +#include "avformat.h" +#include "avio.h" + +#define MAX_IAMF_OBU_HEADER_SIZE (1 + 8 * 3) + +// OBU types (section 3.2). +enum IAMF_OBU_Type { + IAMF_OBU_IA_CODEC_CONFIG = 0, + IAMF_OBU_IA_AUDIO_ELEMENT = 1, + IAMF_OBU_IA_MIX_PRESENTATION = 2, + IAMF_OBU_IA_PARAMETER_BLOCK = 3, + IAMF_OBU_IA_TEMPORAL_DELIMITER = 4, + IAMF_OBU_IA_AUDIO_FRAME = 5, + IAMF_OBU_IA_AUDIO_FRAME_ID0 = 6, + IAMF_OBU_IA_AUDIO_FRAME_ID1 = 7, + IAMF_OBU_IA_AUDIO_FRAME_ID2 = 8, + IAMF_OBU_IA_AUDIO_FRAME_ID3 = 9, + IAMF_OBU_IA_AUDIO_FRAME_ID4 = 10, + IAMF_OBU_IA_AUDIO_FRAME_ID5 = 11, + IAMF_OBU_IA_AUDIO_FRAME_ID6 = 12, + IAMF_OBU_IA_AUDIO_FRAME_ID7 = 13, + IAMF_OBU_IA_AUDIO_FRAME_ID8 = 14, + IAMF_OBU_IA_AUDIO_FRAME_ID9 = 15, + IAMF_OBU_IA_AUDIO_FRAME_ID10 = 16, + IAMF_OBU_IA_AUDIO_FRAME_ID11 = 17, + IAMF_OBU_IA_AUDIO_FRAME_ID12 = 18, + IAMF_OBU_IA_AUDIO_FRAME_ID13 = 19, + IAMF_OBU_IA_AUDIO_FRAME_ID14 = 20, + IAMF_OBU_IA_AUDIO_FRAME_ID15 = 21, + IAMF_OBU_IA_AUDIO_FRAME_ID16 = 22, + IAMF_OBU_IA_AUDIO_FRAME_ID17 = 23, + // 24~30 reserved. + IAMF_OBU_IA_SEQUENCE_HEADER = 31, +}; + +typedef struct IAMFCodecConfig { + unsigned codec_config_id; + enum AVCodecID codec_id; + uint32_t codec_tag; + unsigned nb_samples; + int seek_preroll; + uint8_t *extradata; + int extradata_size; + int sample_rate; +} IAMFCodecConfig; + +typedef struct IAMFLayer { + unsigned int substream_count; + unsigned int coupled_substream_count; +} IAMFLayer; + +typedef struct IAMFSubStream { + unsigned int audio_substream_id; + + // demux + AVCodecParameters *codecpar; +} IAMFSubStream; + +typedef struct IAMFAudioElement { + AVIAMFAudioElement *element; + unsigned int audio_element_id; + + IAMFSubStream *substreams; + unsigned int nb_substreams; + + const IAMFCodecConfig *codec_config; + + // mux + IAMFLayer *layers; + unsigned int nb_layers; +} IAMFAudioElement; + +typedef struct IAMFMixPresentation { + AVIAMFMixPresentation *mix; + unsigned int mix_presentation_id; + + // demux + unsigned int count_label; + char **language_label; +} IAMFMixPresentation; + +typedef struct IAMFParamDefinition { + const AVIAMFAudioElement *audio_element; + AVIAMFParamDefinition *param; + size_t param_size; +} IAMFParamDefinition; + +typedef struct IAMFContext { + IAMFCodecConfig *codec_configs; + int nb_codec_configs; + IAMFAudioElement *audio_elements; + int nb_audio_elements; + IAMFMixPresentation *mix_presentations; + int nb_mix_presentations; + IAMFParamDefinition *param_definitions; + int nb_param_definitions; +} IAMFContext; + +enum IAMF_Anchor_Element { + IAMF_ANCHOR_ELEMENT_UNKNWONW, + IAMF_ANCHOR_ELEMENT_DIALOGUE, + IAMF_ANCHOR_ELEMENT_ALBUM, +}; + +enum IAMF_Sound_System { + SOUND_SYSTEM_A_0_2_0 = 0, // "Loudspeaker configuration for Sound System A" + SOUND_SYSTEM_B_0_5_0 = 1, // "Loudspeaker configuration for Sound System B" + SOUND_SYSTEM_C_2_5_0 = 2, // "Loudspeaker configuration for Sound System C" + SOUND_SYSTEM_D_4_5_0 = 3, // "Loudspeaker configuration for Sound System D" + SOUND_SYSTEM_E_4_5_1 = 4, // "Loudspeaker configuration for Sound System E" + SOUND_SYSTEM_F_3_7_0 = 5, // "Loudspeaker configuration for Sound System F" + SOUND_SYSTEM_G_4_9_0 = 6, // "Loudspeaker configuration for Sound System G" + SOUND_SYSTEM_H_9_10_3 = 7, // "Loudspeaker configuration for Sound System H" + SOUND_SYSTEM_I_0_7_0 = 8, // "Loudspeaker configuration for Sound System I" + SOUND_SYSTEM_J_4_7_0 = 9, // "Loudspeaker configuration for Sound System J" + SOUND_SYSTEM_10_2_7_0 = 10, // "Loudspeaker configuration for Sound System I" + Ltf + Rtf + SOUND_SYSTEM_11_2_3_0 = 11, // Front subset of "Loudspeaker configuration for Sound System J" + SOUND_SYSTEM_12_0_1_0 = 12, // Mono +}; + +struct IAMFSoundSystemMap { + enum IAMF_Sound_System id; + AVChannelLayout layout; +}; + +extern const AVChannelLayout ff_iamf_scalable_ch_layouts[10]; +extern const struct IAMFSoundSystemMap ff_iamf_sound_system_map[13]; + +int ff_iamf_parse_obu_header(const uint8_t *buf, int buf_size, + unsigned *obu_size, int *start_pos, enum IAMF_OBU_Type *type, + unsigned *skip_samples, unsigned *discard_padding); + +int ff_iamfdec_read_descriptors(IAMFContext *c,AVIOContext *pb, + int size, void *log_ctx); + +void ff_iamf_uninit_context(IAMFContext *c); + +#endif /* AVFORMAT_IAMF_H */ diff --git a/libavformat/iamfdec.c b/libavformat/iamfdec.c new file mode 100644 index 0000000000..2011cba566 --- /dev/null +++ b/libavformat/iamfdec.c @@ -0,0 +1,495 @@ +/* + * Immersive Audio Model and Formats demuxer + * Copyright (c) 2023 James Almer <jamrial@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config_components.h" + +#include "libavutil/avassert.h" +#include "libavutil/iamf.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/log.h" +#include "libavcodec/mathops.h" +#include "avformat.h" +#include "avio_internal.h" +#include "demux.h" +#include "iamf.h" +#include "internal.h" + +typedef struct IAMFDemuxContext { + IAMFContext iamf; + + // Packet side data + AVIAMFParamDefinition *mix; + size_t mix_size; + AVIAMFParamDefinition *demix; + size_t demix_size; + AVIAMFParamDefinition *recon; + size_t recon_size; +} IAMFDemuxContext; + +static AVStream *find_stream_by_id(AVFormatContext *s, int id) +{ + for (int i = 0; i < s->nb_streams; i++) + if (s->streams[i]->id == id) + return s->streams[i]; + + av_log(s, AV_LOG_ERROR, "Invalid stream id %d\n", id); + return NULL; +} + +static int audio_frame_obu(AVFormatContext *s, AVPacket *pkt, int len, + enum IAMF_OBU_Type type, + unsigned skip_samples, unsigned discard_padding, + int id_in_bitstream) +{ + const IAMFDemuxContext *const c = s->priv_data; + AVStream *st; + int ret, audio_substream_id; + + if (id_in_bitstream) { + unsigned explicit_audio_substream_id; + int64_t pos = avio_tell(s->pb); + explicit_audio_substream_id = ffio_read_leb(s->pb); + len -= avio_tell(s->pb) - pos; + audio_substream_id = explicit_audio_substream_id; + } else + audio_substream_id = type - IAMF_OBU_IA_AUDIO_FRAME_ID0; + + st = find_stream_by_id(s, audio_substream_id); + if (!st) + return AVERROR_INVALIDDATA; + + ret = av_get_packet(s->pb, pkt, len); + if (ret < 0) + return ret; + if (ret != len) + return AVERROR_INVALIDDATA; + + if (skip_samples || discard_padding) { + uint8_t *side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_SKIP_SAMPLES, 10); + if (!side_data) + return AVERROR(ENOMEM); + AV_WL32(side_data, skip_samples); + AV_WL32(side_data + 4, discard_padding); + } + if (c->mix) { + uint8_t *side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_IAMF_MIX_GAIN_PARAM, c->mix_size); + if (!side_data) + return AVERROR(ENOMEM); + memcpy(side_data, c->mix, c->mix_size); + } + if (c->demix) { + uint8_t *side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_IAMF_DEMIXING_INFO_PARAM, c->demix_size); + if (!side_data) + return AVERROR(ENOMEM); + memcpy(side_data, c->demix, c->demix_size); + } + if (c->recon) { + uint8_t *side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_IAMF_RECON_GAIN_INFO_PARAM, c->recon_size); + if (!side_data) + return AVERROR(ENOMEM); + memcpy(side_data, c->recon, c->recon_size); + } + + pkt->stream_index = st->index; + return 0; +} + +static const IAMFParamDefinition *get_param_definition(AVFormatContext *s, unsigned int parameter_id) +{ + const IAMFDemuxContext *const c = s->priv_data; + const IAMFContext *const iamf = &c->iamf; + const IAMFParamDefinition *param_definition = NULL; + + for (int i = 0; i < iamf->nb_param_definitions; i++) + if (iamf->param_definitions[i].param->parameter_id == parameter_id) { + param_definition = &iamf->param_definitions[i]; + break; + } + + return param_definition; +} + +static int parameter_block_obu(AVFormatContext *s, int len) +{ + IAMFDemuxContext *const c = s->priv_data; + const IAMFParamDefinition *param_definition; + const AVIAMFParamDefinition *param; + AVIAMFParamDefinition *out_param = NULL; + FFIOContext b; + AVIOContext *pb; + uint8_t *buf; + unsigned int duration, constant_subblock_duration; + unsigned int num_subblocks; + unsigned int parameter_id; + size_t out_param_size; + int ret; + + buf = av_malloc(len); + if (!buf) + return AVERROR(ENOMEM); + + ret = avio_read(s->pb, buf, len); + if (ret != len) { + if (ret >= 0) + ret = AVERROR_INVALIDDATA; + goto fail; + } + + ffio_init_context(&b, buf, len, 0, NULL, NULL, NULL, NULL); + pb = &b.pub; + + parameter_id = ffio_read_leb(pb); + param_definition = get_param_definition(s, parameter_id); + if (!param_definition) { + av_log(s, AV_LOG_VERBOSE, "Non existant parameter_id %d referenced in a parameter block. Ignoring\n", + parameter_id); + ret = 0; + goto fail; + } + + param = param_definition->param; + if (param->param_definition_mode) { + duration = ffio_read_leb(pb); + constant_subblock_duration = ffio_read_leb(pb); + if (constant_subblock_duration == 0) + num_subblocks = ffio_read_leb(pb); + else + num_subblocks = duration / constant_subblock_duration; + } else { + duration = param->duration; + constant_subblock_duration = param->constant_subblock_duration; + num_subblocks = param->num_subblocks; + if (!num_subblocks) + num_subblocks = duration / constant_subblock_duration; + } + + out_param = av_iamf_param_definition_alloc(param->param_definition_type, NULL, num_subblocks, + NULL, &out_param_size); + if (!out_param) { + ret = AVERROR(ENOMEM); + goto fail; + } + + out_param->parameter_id = param->parameter_id; + out_param->param_definition_type = param->param_definition_type; + out_param->parameter_rate = param->parameter_rate; + out_param->param_definition_mode = param->param_definition_mode; + out_param->duration = duration; + out_param->constant_subblock_duration = constant_subblock_duration; + out_param->num_subblocks = num_subblocks; + + for (int i = 0; i < num_subblocks; i++) { + void *subblock = av_iamf_param_definition_get_subblock(out_param, i); + unsigned int subblock_duration = constant_subblock_duration; + + if (param->param_definition_mode && !constant_subblock_duration) + subblock_duration = ffio_read_leb(pb); + + switch (param->param_definition_type) { + case AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN: { + AVIAMFMixGainParameterData *mix = subblock; + + mix->animation_type = ffio_read_leb(pb); + if (mix->animation_type > AV_IAMF_ANIMATION_TYPE_BEZIER) { + ret = 0; + av_free(out_param); + goto fail; + } + + mix->start_point_value = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + if (mix->animation_type >= AV_IAMF_ANIMATION_TYPE_LINEAR) + mix->end_point_value = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + if (mix->animation_type == AV_IAMF_ANIMATION_TYPE_BEZIER) { + mix->control_point_value = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8); + mix->control_point_relative_time = avio_r8(pb); + } + mix->subblock_duration = subblock_duration; + break; + } + case AV_IAMF_PARAMETER_DEFINITION_DEMIXING: { + AVIAMFDemixingInfoParameterData *demix = subblock; + + demix->dmixp_mode = avio_r8(pb) >> 5; + demix->subblock_duration = subblock_duration; + break; + } + case AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN: { + AVIAMFReconGainParameterData *recon = subblock; + const AVIAMFAudioElement *audio_element = param_definition->audio_element; + + av_assert0(audio_element); + for (int i = 0; i < audio_element->num_layers; i++) { + const AVIAMFLayer *layer = audio_element->layers[i]; + if (layer->recon_gain_is_present) { + unsigned int recon_gain_flags = ffio_read_leb(pb); + unsigned int bitcount = 7 + 5 * !!(recon_gain_flags & 0x80); + recon_gain_flags = (recon_gain_flags & 0x7F) | ((recon_gain_flags & 0xFF00) >> 1); + for (int j = 0; j < bitcount; j++) { + if (recon_gain_flags & (1 << j)) + recon->recon_gain[i][j] = avio_r8(pb); + } + } + } + recon->subblock_duration = subblock_duration; + break; + } + default: + av_assert0(0); + } + } + + len -= avio_tell(pb); + if (len) { + int level = (s->error_recognition & AV_EF_EXPLODE) ? AV_LOG_ERROR : AV_LOG_WARNING; + av_log(s, level, "Underread in parameter_block_obu. %d bytes left at the end\n", len); + } + + switch (param->param_definition_type) { + case AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN: + av_free(c->mix); + c->mix = out_param; + c->mix_size = out_param_size; + break; + case AV_IAMF_PARAMETER_DEFINITION_DEMIXING: + av_free(c->demix); + c->demix = out_param; + c->demix_size = out_param_size; + break; + case AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN: + av_free(c->recon); + c->recon = out_param; + c->recon_size = out_param_size; + break; + default: + av_assert0(0); + } + + ret = 0; +fail: + if (ret < 0) + av_free(out_param); + av_free(buf); + + return ret; +} + +static int iamf_read_packet(AVFormatContext *s, AVPacket *pkt) +{ + IAMFDemuxContext *const c = s->priv_data; + uint8_t header[MAX_IAMF_OBU_HEADER_SIZE + AV_INPUT_BUFFER_PADDING_SIZE]; + unsigned obu_size; + int ret; + + while (1) { + enum IAMF_OBU_Type type; + unsigned skip_samples, discard_padding; + int len, size, start_pos; + + if ((ret = ffio_ensure_seekback(s->pb, MAX_IAMF_OBU_HEADER_SIZE)) < 0) + return ret; + size = avio_read(s->pb, header, MAX_IAMF_OBU_HEADER_SIZE); + if (size < 0) + return size; + + len = ff_iamf_parse_obu_header(header, size, &obu_size, &start_pos, &type, + &skip_samples, &discard_padding); + if (len < 0) { + av_log(s, AV_LOG_ERROR, "Failed to read obu\n"); + return len; + } + avio_seek(s->pb, -(size - start_pos), SEEK_CUR); + + if (type >= IAMF_OBU_IA_AUDIO_FRAME && type <= IAMF_OBU_IA_AUDIO_FRAME_ID17) + return audio_frame_obu(s, pkt, obu_size, type, + skip_samples, discard_padding, + type == IAMF_OBU_IA_AUDIO_FRAME); + else if (type == IAMF_OBU_IA_PARAMETER_BLOCK) { + ret = parameter_block_obu(s, obu_size); + if (ret < 0) + return ret; + } else if (type == IAMF_OBU_IA_TEMPORAL_DELIMITER) { + av_freep(&c->mix); + c->mix_size = 0; + av_freep(&c->demix); + c->demix_size = 0; + av_freep(&c->recon); + c->recon_size = 0; + } else { + int64_t offset = avio_skip(s->pb, obu_size); + if (offset < 0) { + ret = offset; + break; + } + } + } + + return ret; +} + +//return < 0 if we need more data +static int get_score(const uint8_t *buf, int buf_size, enum IAMF_OBU_Type type, int *seq) +{ + if (type == IAMF_OBU_IA_SEQUENCE_HEADER) { + if (buf_size < 4 || AV_RB32(buf) != MKBETAG('i','a','m','f')) + return 0; + *seq = 1; + return -1; + } + if (type >= IAMF_OBU_IA_CODEC_CONFIG && type <= IAMF_OBU_IA_TEMPORAL_DELIMITER) + return *seq ? -1 : 0; + if (type >= IAMF_OBU_IA_AUDIO_FRAME && type <= IAMF_OBU_IA_AUDIO_FRAME_ID17) + return *seq ? AVPROBE_SCORE_EXTENSION + 1 : 0; + return 0; +} + +static int iamf_probe(const AVProbeData *p) +{ + unsigned obu_size; + enum IAMF_OBU_Type type; + int seq = 0, cnt = 0, start_pos; + int ret; + + while (1) { + int size = ff_iamf_parse_obu_header(p->buf + cnt, p->buf_size - cnt, + &obu_size, &start_pos, &type, + NULL, NULL); + if (size < 0) + return 0; + + ret = get_score(p->buf + cnt + start_pos, + p->buf_size - cnt - start_pos, + type, &seq); + if (ret >= 0) + return ret; + + cnt += FFMIN(size, p->buf_size - cnt); + } + return 0; +} + +static int iamf_read_header(AVFormatContext *s) +{ + IAMFDemuxContext *const c = s->priv_data; + IAMFContext *const iamf = &c->iamf; + int ret; + + ret = ff_iamfdec_read_descriptors(iamf, s->pb, INT_MAX, s); + if (ret < 0) + return ret; + + for (int i = 0; i < iamf->nb_audio_elements; i++) { + IAMFAudioElement *audio_element = &iamf->audio_elements[i]; + AVStreamGroup *stg = avformat_stream_group_create(s, AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT, NULL); + + if (!stg) + return AVERROR(ENOMEM); + + stg->id = audio_element->audio_element_id; + stg->params.iamf_audio_element = audio_element->element; + audio_element->element = NULL; + + for (int j = 0; j < audio_element->nb_substreams; j++) { + IAMFSubStream *substream = &audio_element->substreams[j]; + AVStream *st = avformat_new_stream(s, NULL); + + if (!st) + return AVERROR(ENOMEM); + + ret = avformat_stream_group_add_stream(stg, st); + if (ret < 0) + return ret; + + ret = avcodec_parameters_copy(st->codecpar, substream->codecpar); + if (ret < 0) + return ret; + + st->id = substream->audio_substream_id; + avpriv_set_pts_info(st, 64, 1, st->codecpar->sample_rate); + } + } + + for (int i = 0; i < iamf->nb_mix_presentations; i++) { + IAMFMixPresentation *mix_presentation = &iamf->mix_presentations[i]; + AVStreamGroup *stg = avformat_stream_group_create(s, AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION, NULL); + const AVIAMFMixPresentation *mix = mix_presentation->mix; + + if (!stg) + return AVERROR(ENOMEM); + + stg->id = mix_presentation->mix_presentation_id; + stg->params.iamf_mix_presentation = mix_presentation->mix; + mix_presentation->mix = NULL; + + for (int j = 0; j < mix->num_submixes; j++) { + AVIAMFSubmix *sub_mix = mix->submixes[j]; + + for (int k = 0; k < sub_mix->num_elements; k++) { + AVIAMFSubmixElement *submix_element = sub_mix->elements[k]; + AVStreamGroup *audio_element = NULL; + + for (int l = 0; l < s->nb_stream_groups; l++) + if (s->stream_groups[l]->type == AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT && + s->stream_groups[l]->id == submix_element->audio_element_id) { + audio_element = s->stream_groups[l]; + break; + } + av_assert0(audio_element); + + for (int l = 0; l < audio_element->nb_streams; l++) { + ret = avformat_stream_group_add_stream(stg, audio_element->streams[l]); + if (ret < 0 && ret != AVERROR(EEXIST)) + return ret; + } + } + } + } + + return 0; +} + +static int iamf_read_close(AVFormatContext *s) +{ + IAMFDemuxContext *const c = s->priv_data; + + ff_iamf_uninit_context(&c->iamf); + + av_freep(&c->mix); + c->mix_size = 0; + av_freep(&c->demix); + c->demix_size = 0; + av_freep(&c->recon); + c->recon_size = 0; + + return 0; +} + +const AVInputFormat ff_iamf_demuxer = { + .name = "iamf", + .long_name = NULL_IF_CONFIG_SMALL("Raw Immersive Audio Model and Formats"), + .priv_data_size = sizeof(IAMFDemuxContext), + .flags_internal = FF_FMT_INIT_CLEANUP, + .read_probe = iamf_probe, + .read_header = iamf_read_header, + .read_packet = iamf_read_packet, + .read_close = iamf_read_close, + .extensions = "iamf", + .flags = AVFMT_GENERIC_INDEX | AVFMT_NO_BYTE_SEEK | AVFMT_NOTIMESTAMPS | AVFMT_SHOW_IDS, +};

[FFmpeg-devel,8/9] avformat: Immersive Audio Model and Formats demuxer

Checks

Commit Message

Patch