diff mbox series

[FFmpeg-devel,6/6,v2] avformat/movenc: add support for Immersive Audio Model and Formats in ISOBMFF

Message ID 20240131172654.15869-6-jamrial@gmail.com
State New
Series [FFmpeg-devel,1/6,v3] avcodec: add an Immersive Audio Model and Formats frame split bsf

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

James Almer Jan. 31, 2024, 5:26 p.m. UTC
Signed-off-by: James Almer <jamrial@gmail.com>
---
 configure            |   2 +-
 libavformat/movenc.c | 323 ++++++++++++++++++++++++++++++++++---------
 libavformat/movenc.h |   7 +
 3 files changed, 269 insertions(+), 63 deletions(-)

Comments

Andreas Rheinhardt Feb. 3, 2024, 2:50 p.m. UTC | #1
James Almer:
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  configure            |   2 +-
>  libavformat/movenc.c | 323 ++++++++++++++++++++++++++++++++++---------
>  libavformat/movenc.h |   7 +
>  3 files changed, 269 insertions(+), 63 deletions(-)
> 
> diff --git a/configure b/configure
> index 42ba5ec502..6cdd101487 100755
> --- a/configure
> +++ b/configure
> @@ -3547,7 +3547,7 @@ mlp_demuxer_select="mlp_parser"
>  mmf_muxer_select="riffenc"
>  mov_demuxer_select="iso_media riffdec iamf_frame_split_bsf"
>  mov_demuxer_suggest="zlib"
> -mov_muxer_select="iso_media riffenc rtpenc_chain vp9_superframe_bsf aac_adtstoasc_bsf ac3_parser"
> +mov_muxer_select="iso_media riffenc rtpenc_chain vp9_superframe_bsf aac_adtstoasc_bsf iamf_frame_merge_bsf ac3_parser"
>  mp3_demuxer_select="mpegaudio_parser"
>  mp3_muxer_select="mpegaudioheader"
>  mp4_muxer_select="mov_muxer"
> diff --git a/libavformat/movenc.c b/libavformat/movenc.c
> index b724bd5ebc..dfa8b6b04e 100644
> --- a/libavformat/movenc.c
> +++ b/libavformat/movenc.c
> @@ -32,6 +32,7 @@
>  #include "dovi_isom.h"
>  #include "riff.h"
>  #include "avio.h"
> +#include "iamf_writer.h"
>  #include "isom.h"
>  #include "av1.h"
>  #include "avc.h"
> @@ -47,6 +48,7 @@
>  #include "libavcodec/raw.h"
>  #include "internal.h"
>  #include "libavutil/avstring.h"
> +#include "libavutil/bprint.h"
>  #include "libavutil/channel_layout.h"
>  #include "libavutil/csp.h"
>  #include "libavutil/intfloat.h"
> @@ -316,6 +318,32 @@ static int mov_write_sdtp_tag(AVIOContext *pb, MOVTrack *track)
>      return update_size(pb, pos);
>  }
>  
> +static int mov_write_iacb_tag(AVFormatContext *s, AVIOContext *pb, MOVTrack *track)
> +{
> +    AVIOContext *dyn_bc;
> +    int64_t pos = avio_tell(pb);
> +    uint8_t *dyn_buf = NULL;
> +    int dyn_size;
> +    int ret = avio_open_dyn_buf(&dyn_bc);
> +    if (ret < 0)
> +        return ret;
> +
> +    avio_wb32(pb, 0);
> +    ffio_wfourcc(pb, "iacb");
> +    avio_w8(pb, 1); // configurationVersion
> +
> +    ret = ff_iamf_write_descriptors(track->iamf, dyn_bc, s);
> +    if (ret < 0)
> +        return ret;
> +
> +    dyn_size = avio_close_dyn_buf(dyn_bc, &dyn_buf);
> +    ffio_write_leb(pb, dyn_size);
> +    avio_write(pb, dyn_buf, dyn_size);
> +    av_free(dyn_buf);
> +
> +    return update_size(pb, pos);
> +}
> +
>  static int mov_write_amr_tag(AVIOContext *pb, MOVTrack *track)
>  {
>      avio_wb32(pb, 0x11); /* size */
> @@ -1358,6 +1386,8 @@ static int mov_write_audio_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContex
>          ret = mov_write_wave_tag(s, pb, track);
>      else if (track->tag == MKTAG('m','p','4','a'))
>          ret = mov_write_esds_tag(pb, track);
> +    else if (track->tag == MKTAG('i','a','m','f'))
> +        ret = mov_write_iacb_tag(mov->fc, pb, track);
>      else if (track->par->codec_id == AV_CODEC_ID_AMR_NB)
>          ret = mov_write_amr_tag(pb, track);
>      else if (track->par->codec_id == AV_CODEC_ID_AC3)
> @@ -2501,7 +2531,7 @@ static int mov_write_video_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContex
>  
>      if (track->mode == MODE_AVIF) {
>          mov_write_ccst_tag(pb);
> -        if (s->nb_streams > 0 && track == &mov->tracks[1])
> +        if (mov->nb_streams > 0 && track == &mov->tracks[1])
>              mov_write_aux_tag(pb, "auxi");
>      }
>  
> @@ -3096,9 +3126,9 @@ static int mov_write_iloc_tag(AVIOContext *pb, MOVMuxContext *mov, AVFormatConte
>      avio_wb32(pb, 0); /* Version & flags */
>      avio_w8(pb, (4 << 4) + 4); /* offset_size(4) and length_size(4) */
>      avio_w8(pb, 0); /* base_offset_size(4) and reserved(4) */
> -    avio_wb16(pb, s->nb_streams); /* item_count */
> +    avio_wb16(pb, mov->nb_streams); /* item_count */
>  
> -    for (int i = 0; i < s->nb_streams; i++) {
> +    for (int i = 0; i < mov->nb_streams; i++) {
>          avio_wb16(pb, i + 1); /* item_id */
>          avio_wb16(pb, 0); /* data_reference_index */
>          avio_wb16(pb, 1); /* extent_count */
> @@ -3117,9 +3147,9 @@ static int mov_write_iinf_tag(AVIOContext *pb, MOVMuxContext *mov, AVFormatConte
>      avio_wb32(pb, 0); /* size */
>      ffio_wfourcc(pb, "iinf");
>      avio_wb32(pb, 0); /* Version & flags */
> -    avio_wb16(pb, s->nb_streams); /* entry_count */
> +    avio_wb16(pb, mov->nb_streams); /* entry_count */
>  
> -    for (int i = 0; i < s->nb_streams; i++) {
> +    for (int i = 0; i < mov->nb_streams; i++) {
>          int64_t infe_pos = avio_tell(pb);
>          avio_wb32(pb, 0); /* size */
>          ffio_wfourcc(pb, "infe");
> @@ -3188,7 +3218,7 @@ static int mov_write_ipco_tag(AVIOContext *pb, MOVMuxContext *mov, AVFormatConte
>      int64_t pos = avio_tell(pb);
>      avio_wb32(pb, 0); /* size */
>      ffio_wfourcc(pb, "ipco");
> -    for (int i = 0; i < s->nb_streams; i++) {
> +    for (int i = 0; i < mov->nb_streams; i++) {
>          mov_write_ispe_tag(pb, mov, s, i);
>          mov_write_pixi_tag(pb, mov, s, i);
>          mov_write_av1c_tag(pb, &mov->tracks[i]);
> @@ -3206,9 +3236,9 @@ static int mov_write_ipma_tag(AVIOContext *pb, MOVMuxContext *mov, AVFormatConte
>      avio_wb32(pb, 0); /* size */
>      ffio_wfourcc(pb, "ipma");
>      avio_wb32(pb, 0); /* Version & flags */
> -    avio_wb32(pb, s->nb_streams); /* entry_count */
> +    avio_wb32(pb, mov->nb_streams); /* entry_count */
>  
> -    for (int i = 0, index = 1; i < s->nb_streams; i++) {
> +    for (int i = 0, index = 1; i < mov->nb_streams; i++) {
>          avio_wb16(pb, i + 1); /* item_ID */
>          avio_w8(pb, 4); /* association_count */
>  
> @@ -4185,7 +4215,7 @@ static int mov_write_covr(AVIOContext *pb, AVFormatContext *s)
>      int64_t pos = 0;
>      int i;
>  
> -    for (i = 0; i < s->nb_streams; i++) {
> +    for (i = 0; i < mov->nb_streams; i++) {
>          MOVTrack *trk = &mov->tracks[i];
>  
>          if (!is_cover_image(trk->st) || trk->cover_image->size <= 0)
> @@ -4332,7 +4362,7 @@ static int mov_write_meta_tag(AVIOContext *pb, MOVMuxContext *mov,
>          mov_write_pitm_tag(pb, 1);
>          mov_write_iloc_tag(pb, mov, s);
>          mov_write_iinf_tag(pb, mov, s);
> -        if (s->nb_streams > 1)
> +        if (mov->nb_streams > 1)
>              mov_write_iref_tag(pb, mov, s);
>          mov_write_iprp_tag(pb, mov, s);
>      } else {
> @@ -4583,16 +4613,17 @@ static int mov_setup_track_ids(MOVMuxContext *mov, AVFormatContext *s)
>  
>      if (mov->use_stream_ids_as_track_ids) {
>          int next_generated_track_id = 0;
> -        for (i = 0; i < s->nb_streams; i++) {
> -            if (s->streams[i]->id > next_generated_track_id)
> -                next_generated_track_id = s->streams[i]->id;
> +        for (i = 0; i < mov->nb_streams; i++) {
> +            AVStream *st = mov->tracks[i].st;
> +            if (st->id > next_generated_track_id)
> +                next_generated_track_id = st->id;
>          }
>  
>          for (i = 0; i < mov->nb_tracks; i++) {
>              if (mov->tracks[i].entry <= 0 && !(mov->flags & FF_MOV_FLAG_FRAGMENT))
>                  continue;
>  
> -            mov->tracks[i].track_id = i >= s->nb_streams ? ++next_generated_track_id : s->streams[i]->id;
> +            mov->tracks[i].track_id = i >= mov->nb_streams ? ++next_generated_track_id : mov->tracks[i].st->id;
>          }
>      } else {
>          for (i = 0; i < mov->nb_tracks; i++) {
> @@ -4629,7 +4660,7 @@ static int mov_write_moov_tag(AVIOContext *pb, MOVMuxContext *mov,
>      }
>  
>      if (mov->chapter_track)
> -        for (i = 0; i < s->nb_streams; i++) {
> +        for (i = 0; i < mov->nb_streams; i++) {
>              mov->tracks[i].tref_tag = MKTAG('c','h','a','p');
>              mov->tracks[i].tref_id  = mov->tracks[mov->chapter_track].track_id;
>          }
> @@ -4669,7 +4700,7 @@ static int mov_write_moov_tag(AVIOContext *pb, MOVMuxContext *mov,
>      for (i = 0; i < mov->nb_tracks; i++) {
>          if (mov->tracks[i].entry > 0 || mov->flags & FF_MOV_FLAG_FRAGMENT ||
>              mov->mode == MODE_AVIF) {
> -            int ret = mov_write_trak_tag(s, pb, mov, &(mov->tracks[i]), i < s->nb_streams ? s->streams[i] : NULL);
> +            int ret = mov_write_trak_tag(s, pb, mov, &(mov->tracks[i]), i < mov->nb_streams ? mov->tracks[i].st : NULL);
>              if (ret < 0)
>                  return ret;
>          }
> @@ -5463,8 +5494,8 @@ static int mov_write_ftyp_tag(AVIOContext *pb, AVFormatContext *s)
>      int has_h264 = 0, has_av1 = 0, has_video = 0, has_dolby = 0;
>      int i;
>  
> -    for (i = 0; i < s->nb_streams; i++) {
> -        AVStream *st = s->streams[i];
> +    for (i = 0; i < mov->nb_streams; i++) {
> +        AVStream *st = mov->tracks[i].st;
>          if (is_cover_image(st))
>              continue;
>          if (st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO)
> @@ -5639,8 +5670,8 @@ static int mov_write_identification(AVIOContext *pb, AVFormatContext *s)
>      mov_write_ftyp_tag(pb,s);
>      if (mov->mode == MODE_PSP) {
>          int video_streams_nb = 0, audio_streams_nb = 0, other_streams_nb = 0;
> -        for (i = 0; i < s->nb_streams; i++) {
> -            AVStream *st = s->streams[i];
> +        for (i = 0; i < mov->nb_streams; i++) {
> +            AVStream *st = mov->tracks[i].st;
>              if (is_cover_image(st))
>                  continue;
>              if (st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO)
> @@ -5827,7 +5858,7 @@ static int mov_write_squashed_packets(AVFormatContext *s)
>  {
>      MOVMuxContext *mov = s->priv_data;
>  
> -    for (int i = 0; i < s->nb_streams; i++) {
> +    for (int i = 0; i < mov->nb_streams; i++) {
>          MOVTrack *track = &mov->tracks[i];
>          int ret = AVERROR_BUG;
>  
> @@ -5868,7 +5899,7 @@ static int mov_flush_fragment(AVFormatContext *s, int force)
>      // of fragments was triggered automatically by an AVPacket, we
>      // already have reliable info for the end of that track, but other
>      // tracks may need to be filled in.
> -    for (i = 0; i < s->nb_streams; i++) {
> +    for (i = 0; i < mov->nb_streams; i++) {
>          MOVTrack *track = &mov->tracks[i];
>          if (!track->end_reliable) {
>              const AVPacket *pkt = ff_interleaved_peek(s, i);
> @@ -6069,10 +6100,8 @@ static int mov_auto_flush_fragment(AVFormatContext *s, int force)
>      return ret;
>  }
>  
> -static int check_pkt(AVFormatContext *s, AVPacket *pkt)
> +static int check_pkt(AVFormatContext *s, MOVTrack *trk, AVPacket *pkt)
>  {
> -    MOVMuxContext *mov = s->priv_data;
> -    MOVTrack *trk = &mov->tracks[pkt->stream_index];
>      int64_t ref;
>      uint64_t duration;
>  
> @@ -6110,15 +6139,21 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
>  {
>      MOVMuxContext *mov = s->priv_data;
>      AVIOContext *pb = s->pb;
> -    MOVTrack *trk = &mov->tracks[pkt->stream_index];
> -    AVCodecParameters *par = trk->par;
> +    MOVTrack *trk;
> +    AVCodecParameters *par;
>      AVProducerReferenceTime *prft;
>      unsigned int samples_in_chunk = 0;
>      int size = pkt->size, ret = 0, offset = 0;
>      size_t prft_size;
>      uint8_t *reformatted_data = NULL;
>  
> -    ret = check_pkt(s, pkt);
> +    if (pkt->stream_index < s->nb_streams)
> +        trk = s->streams[pkt->stream_index]->priv_data;
> +    else // Timecode or chapter
> +        trk = &mov->tracks[pkt->stream_index];
> +    par = trk->par;
> +
> +    ret = check_pkt(s, trk, pkt);
>      if (ret < 0)
>          return ret;
>  
> @@ -6208,7 +6243,7 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
>  
>      if (par->codec_id == AV_CODEC_ID_AAC && pkt->size > 2 &&
>          (AV_RB16(pkt->data) & 0xfff0) == 0xfff0) {
> -        if (!s->streams[pkt->stream_index]->nb_frames) {
> +        if (!trk->st->nb_frames) {
>              av_log(s, AV_LOG_ERROR, "Malformed AAC bitstream detected: "
>                     "use the audio bitstream filter 'aac_adtstoasc' to fix it "
>                     "('-bsf:a aac_adtstoasc' option with ffmpeg)\n");
> @@ -6470,18 +6505,18 @@ err:
>  static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt)
>  {
>      MOVMuxContext *mov = s->priv_data;
> -    MOVTrack *trk = &mov->tracks[pkt->stream_index];
> +    MOVTrack *trk = s->streams[pkt->stream_index]->priv_data;
>      AVCodecParameters *par = trk->par;
>      int64_t frag_duration = 0;
>      int size = pkt->size;
>  
> -    int ret = check_pkt(s, pkt);
> +    int ret = check_pkt(s, trk, pkt);
>      if (ret < 0)
>          return ret;
>  
>      if (mov->flags & FF_MOV_FLAG_FRAG_DISCONT) {
>          int i;
> -        for (i = 0; i < s->nb_streams; i++)
> +        for (i = 0; i < mov->nb_streams; i++)
>              mov->tracks[i].frag_discont = 1;
>          mov->flags &= ~FF_MOV_FLAG_FRAG_DISCONT;
>      }
> @@ -6523,7 +6558,7 @@ static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt)
>          return 0;             /* Discard 0 sized packets */
>      }
>  
> -    if (trk->entry && pkt->stream_index < s->nb_streams)
> +    if (trk->entry && pkt->stream_index < mov->nb_streams)
>          frag_duration = av_rescale_q(pkt->dts - trk->cluster[0].dts,
>                  s->streams[pkt->stream_index]->time_base,
>                  AV_TIME_BASE_Q);
> @@ -6578,17 +6613,45 @@ static int mov_write_subtitle_end_packet(AVFormatContext *s,
>      return ret;
>  }
>  
> +static int mov_filter_packet(AVFormatContext *s, MOVTrack *track, AVPacket *pkt)
> +{
> +    int ret;
> +
> +    if (!track->bsf)
> +        return 0;
> +
> +    ret = av_bsf_send_packet(track->bsf, pkt);
> +    if (ret < 0) {
> +        av_log(s, AV_LOG_ERROR,
> +                "Failed to send packet to filter %s for stream %d: %s\n",
> +                track->bsf->filter->name, pkt->stream_index, av_err2str(ret));
> +        return ret;
> +    }
> +
> +    return av_bsf_receive_packet(track->bsf, pkt);
> +}
> +
>  static int mov_write_packet(AVFormatContext *s, AVPacket *pkt)
>  {
>      MOVMuxContext *mov = s->priv_data;
>      MOVTrack *trk;
> +    int ret;
>  
>      if (!pkt) {
>          mov_flush_fragment(s, 1);
>          return 1;
>      }
>  
> -    trk = &mov->tracks[pkt->stream_index];
> +    trk = s->streams[pkt->stream_index]->priv_data;
> +
> +    ret = mov_filter_packet(s, trk, pkt);
> +    if (ret < 0) {
> +        if (ret == AVERROR(EAGAIN))
> +            return 0;
> +        av_log(s, AV_LOG_ERROR, "Error applying bitstream filters to an output "
> +                                "packet for stream #%d: %s\n", trk->st->index, av_err2str(ret));
> +        return ret;
> +    }
>  
>      if (is_cover_image(trk->st)) {
>          int ret;
> @@ -6789,12 +6852,12 @@ static int mov_create_chapter_track(AVFormatContext *s, int tracknum)
>  }
>  
>  
> -static int mov_check_timecode_track(AVFormatContext *s, AVTimecode *tc, int src_index, const char *tcstr)
> +static int mov_check_timecode_track(AVFormatContext *s, AVTimecode *tc, AVStream *src_st, const char *tcstr)
>  {
>      int ret;
>  
>      /* compute the frame number */
> -    ret = av_timecode_init_from_string(tc, s->streams[src_index]->avg_frame_rate, tcstr, s);
> +    ret = av_timecode_init_from_string(tc, src_st->avg_frame_rate, tcstr, s);
>      return ret;
>  }
>  
> @@ -6802,7 +6865,7 @@ static int mov_create_timecode_track(AVFormatContext *s, int index, int src_inde
>  {
>      MOVMuxContext *mov  = s->priv_data;
>      MOVTrack *track     = &mov->tracks[index];
> -    AVStream *src_st    = s->streams[src_index];
> +    AVStream *src_st    = mov->tracks[src_index].st;
>      uint8_t data[4];
>      AVPacket *pkt = mov->pkt;
>      AVRational rate = src_st->avg_frame_rate;
> @@ -6862,8 +6925,8 @@ static void enable_tracks(AVFormatContext *s)
>          first[i] = -1;
>      }
>  
> -    for (i = 0; i < s->nb_streams; i++) {
> -        AVStream *st = s->streams[i];
> +    for (i = 0; i < mov->nb_streams; i++) {
> +        AVStream *st = mov->tracks[i].st;
>  
>          if (st->codecpar->codec_type <= AVMEDIA_TYPE_UNKNOWN ||
>              st->codecpar->codec_type >= AVMEDIA_TYPE_NB ||
> @@ -6897,6 +6960,9 @@ static void mov_free(AVFormatContext *s)
>      MOVMuxContext *mov = s->priv_data;
>      int i;
>  
> +    for (i = 0; i < s->nb_streams; i++)
> +        s->streams[i]->priv_data = NULL;
> +
>      if (!mov->tracks)
>          return;
>  
> @@ -6927,6 +6993,7 @@ static void mov_free(AVFormatContext *s)
>          ffio_free_dyn_buf(&track->mdat_buf);
>  
>          avpriv_packet_list_free(&track->squashed_packet_queue);
> +        av_bsf_free(&track->bsf);
>      }
>  
>      av_freep(&mov->tracks);
> @@ -6999,6 +7066,92 @@ static int mov_create_dvd_sub_decoder_specific_info(MOVTrack *track,
>      return 0;
>  }
>  
> +static int mov_init_iamf_track(AVFormatContext *s)
> +{
> +    MOVMuxContext *mov = s->priv_data;
> +    MOVTrack *track = &mov->tracks[0]; // IAMF if present is always the first track
> +    const AVBitStreamFilter *filter;
> +    AVBPrint bprint;
> +    AVStream *first_st = NULL;
> +    char *args;
> +    int nb_audio_elements = 0, nb_mix_presentations = 0;
> +    int ret;
> +
> +    for (int i = 0; i < s->nb_stream_groups; i++) {
> +        const AVStreamGroup *stg = s->stream_groups[i];
> +
> +        if (stg->type == AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT)
> +            nb_audio_elements++;
> +        if (stg->type == AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION)
> +            nb_mix_presentations++;
> +    }
> +
> +    if (!nb_audio_elements && !nb_mix_presentations)
> +        return 0;
> +
> +    if ((nb_audio_elements < 1 && nb_audio_elements > 2) || nb_mix_presentations < 1) {
> +        av_log(s, AV_LOG_ERROR, "There must be >= 1 and <= 2 IAMF_AUDIO_ELEMENT and at least "
> +                                "one IAMF_MIX_PRESENTATION stream groups to write a IMAF track\n");
> +        return AVERROR(EINVAL);
> +    }
> +
> +    track->iamf = av_mallocz(sizeof(*track->iamf));
> +    if (!track->iamf)
> +        return AVERROR(ENOMEM);
> +
> +    av_bprint_init(&bprint, 0, AV_BPRINT_SIZE_UNLIMITED);
> +
> +    for (int i = 0; i < s->nb_stream_groups; i++) {
> +        const AVStreamGroup *stg = s->stream_groups[i];
> +
> +        switch(stg->type) {
> +        case AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT:
> +            if (!first_st)
> +                first_st = stg->streams[0];
> +
> +            for (int j = 0; j < stg->nb_streams; j++) {
> +                av_bprintf(&bprint, "%d=%d%s", s->streams[j]->index, s->streams[j]->id,
> +                                               j < (stg->nb_streams - 1) ? ":" : "");
> +                s->streams[j]->priv_data = track;
> +            }
> +
> +            ret = ff_iamf_add_audio_element(track->iamf, stg, s);
> +            break;
> +        case AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION:
> +            ret = ff_iamf_add_mix_presentation(track->iamf, stg, s);
> +            break;
> +        default:
> +            av_assert0(0);
> +        }
> +        if (ret < 0)
> +            return ret;
> +    }
> +
> +    av_bprint_finalize(&bprint, &args);
> +
> +    filter = av_bsf_get_by_name("iamf_frame_merge");
> +    if (!filter) {
> +        av_log(s, AV_LOG_ERROR, "iamf_frame_merge bitstream filter "
> +               "not found. This is a bug, please report it.\n");
> +        return AVERROR_BUG;
> +    }
> +
> +    ret = av_bsf_alloc(filter, &track->bsf);
> +    if (ret < 0)
> +        return ret;
> +
> +    ret = avcodec_parameters_copy(track->bsf->par_in, first_st->codecpar);
> +    if (ret < 0)
> +        return ret;
> +
> +    av_opt_set(track->bsf->priv_data, "index_mapping", args, 0);
> +    av_opt_set_int(track->bsf->priv_data, "out_index", first_st->index, 0);
> +
> +    track->tag = MKTAG('i','a','m','f');
> +
> +    return av_bsf_init(track->bsf);
> +}
> +
>  static int mov_init(AVFormatContext *s)
>  {
>      MOVMuxContext *mov = s->priv_data;
> @@ -7136,7 +7289,37 @@ static int mov_init(AVFormatContext *s)
>          s->streams[0]->disposition |= AV_DISPOSITION_DEFAULT;
>      }
>  
> -    mov->nb_tracks = s->nb_streams;
> +    for (i = 0; i < s->nb_stream_groups; i++) {
> +        AVStreamGroup *stg = s->stream_groups[i];
> +
> +        if (stg->type != AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT)
> +            continue;
> +
> +        for (int j = 0; j < stg->nb_streams; j++) {
> +            AVStream *st = stg->streams[j];
> +
> +            if (st->priv_data) {
> +                av_log(s, AV_LOG_ERROR, "Stream %d is present in more than one Stream Group of type "
> +                                        "IAMF Audio Element\n", j);
> +                return AVERROR(EINVAL);
> +            }
> +            st->priv_data = st;
> +        }
> +
> +        if (!mov->nb_tracks) // We support one track for the entire IAMF structure
> +            mov->nb_tracks++;
> +    }
> +
> +    for (i = 0; i < s->nb_streams; i++) {
> +        AVStream *st = s->streams[i];
> +        if (st->priv_data)
> +            continue;
> +        st->priv_data = st;
> +        mov->nb_tracks++;
> +    }
> +
> +    mov->nb_streams = mov->nb_tracks;
> +
>      if (mov->mode & (MODE_MP4|MODE_MOV|MODE_IPOD) && s->nb_chapters)
>          mov->chapter_track = mov->nb_tracks++;
>  
> @@ -7162,7 +7345,7 @@ static int mov_init(AVFormatContext *s)
>              if (st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO &&
>                  (t || (t=av_dict_get(st->metadata, "timecode", NULL, 0)))) {
>                  AVTimecode tc;
> -                ret = mov_check_timecode_track(s, &tc, i, t->value);
> +                ret = mov_check_timecode_track(s, &tc, st, t->value);
>                  if (ret >= 0)
>                      mov->nb_meta_tmcd++;
>              }
> @@ -7211,18 +7394,33 @@ static int mov_init(AVFormatContext *s)
>          }
>      }
>  
> +    ret = mov_init_iamf_track(s);
> +    if (ret < 0)
> +        return ret;
> +
> +    for (int j = 0, i = 0; j < s->nb_streams; j++) {
> +        AVStream *st = s->streams[j];
> +
> +        if (st != st->priv_data)
> +            continue;
> +        st->priv_data = &mov->tracks[i++];
> +    }
> +
>      for (i = 0; i < s->nb_streams; i++) {
>          AVStream *st= s->streams[i];
> -        MOVTrack *track= &mov->tracks[i];
> +        MOVTrack *track = st->priv_data;
>          AVDictionaryEntry *lang = av_dict_get(st->metadata, "language", NULL,0);
>  
> -        track->st  = st;
> -        track->par = st->codecpar;
> +        if (!track->st) {
> +            track->st  = st;
> +            track->par = st->codecpar;
> +        }
>          track->language = ff_mov_iso639_to_lang(lang?lang->value:"und", mov->mode!=MODE_MOV);
>          if (track->language < 0)
>              track->language = 32767;  // Unspecified Macintosh language code
>          track->mode = mov->mode;
> -        track->tag  = mov_find_codec_tag(s, track);
> +        if (!track->tag)
> +            track->tag  = mov_find_codec_tag(s, track);
>          if (!track->tag) {
>              av_log(s, AV_LOG_ERROR, "Could not find tag for codec %s in stream #%d, "
>                     "codec not currently supported in container\n",
> @@ -7414,25 +7612,26 @@ static int mov_write_header(AVFormatContext *s)
>  {
>      AVIOContext *pb = s->pb;
>      MOVMuxContext *mov = s->priv_data;
> -    int i, ret, hint_track = 0, tmcd_track = 0, nb_tracks = s->nb_streams;
> +    int i, ret, hint_track = 0, tmcd_track = 0, nb_tracks = mov->nb_streams;
>  
>      if (mov->mode & (MODE_MP4|MODE_MOV|MODE_IPOD) && s->nb_chapters)
>          nb_tracks++;
>  
>      if (mov->flags & FF_MOV_FLAG_RTP_HINT) {
>          hint_track = nb_tracks;
> -        for (i = 0; i < s->nb_streams; i++)
> -            if (rtp_hinting_needed(s->streams[i]))
> +        for (i = 0; i < mov->nb_streams; i++) {
> +            if (rtp_hinting_needed(mov->tracks[i].st))
>                  nb_tracks++;
> +        }
>      }
>  
>      if (mov->nb_meta_tmcd)
>          tmcd_track = nb_tracks;
>  
> -    for (i = 0; i < s->nb_streams; i++) {
> +    for (i = 0; i < mov->nb_streams; i++) {
>          int j;
> -        AVStream *st= s->streams[i];
> -        MOVTrack *track= &mov->tracks[i];
> +        MOVTrack *track = &mov->tracks[i];
> +        AVStream *st = track->st;
>  
>          /* copy extradata if it exists */
>          if (st->codecpar->extradata_size) {
> @@ -7454,8 +7653,8 @@ static int mov_write_header(AVFormatContext *s)
>                                        &(AVChannelLayout)AV_CHANNEL_LAYOUT_MONO))
>              continue;
>  
> -        for (j = 0; j < s->nb_streams; j++) {
> -            AVStream *stj= s->streams[j];
> +        for (j = 0; j < mov->nb_streams; j++) {
> +            AVStream *stj= mov->tracks[j].st;
>              MOVTrack *trackj= &mov->tracks[j];
>              if (j == i)
>                  continue;
> @@ -7518,8 +7717,8 @@ static int mov_write_header(AVFormatContext *s)
>              return ret;
>  
>      if (mov->flags & FF_MOV_FLAG_RTP_HINT) {
> -        for (i = 0; i < s->nb_streams; i++) {
> -            if (rtp_hinting_needed(s->streams[i])) {
> +        for (i = 0; i < mov->nb_streams; i++) {
> +            if (rtp_hinting_needed(mov->tracks[i].st)) {
>                  if ((ret = ff_mov_init_hinting(s, hint_track, i)) < 0)
>                      return ret;
>                  hint_track++;
> @@ -7531,8 +7730,8 @@ static int mov_write_header(AVFormatContext *s)
>          const AVDictionaryEntry *t, *global_tcr = av_dict_get(s->metadata,
>                                                                "timecode", NULL, 0);
>          /* Initialize the tmcd tracks */
> -        for (i = 0; i < s->nb_streams; i++) {
> -            AVStream *st = s->streams[i];
> +        for (i = 0; i < mov->nb_streams; i++) {
> +            AVStream *st = mov->tracks[i].st;
>              t = global_tcr;
>  
>              if (st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
> @@ -7541,7 +7740,7 @@ static int mov_write_header(AVFormatContext *s)
>                      t = av_dict_get(st->metadata, "timecode", NULL, 0);
>                  if (!t)
>                      continue;
> -                if (mov_check_timecode_track(s, &tc, i, t->value) < 0)
> +                if (mov_check_timecode_track(s, &tc, st, t->value) < 0)
>                      continue;
>                  if ((ret = mov_create_timecode_track(s, tmcd_track, i, tc)) < 0)
>                      return ret;
> @@ -7662,7 +7861,7 @@ static int mov_write_trailer(AVFormatContext *s)
>      int64_t moov_pos;
>  
>      if (mov->need_rewrite_extradata) {
> -        for (i = 0; i < s->nb_streams; i++) {
> +        for (i = 0; i < mov->nb_streams; i++) {
>              MOVTrack *track = &mov->tracks[i];
>              AVCodecParameters *par = track->par;
>  
> @@ -7802,7 +8001,7 @@ static int avif_write_trailer(AVFormatContext *s)
>      if (mov->moov_written) return 0;
>  
>      mov->is_animated_avif = s->streams[0]->nb_frames > 1;
> -    if (mov->is_animated_avif && s->nb_streams > 1) {
> +    if (mov->is_animated_avif && mov->nb_streams > 1) {
>          // For animated avif with alpha channel, we need to write a tref tag
>          // with type "auxl".
>          mov->tracks[1].tref_tag = MKTAG('a', 'u', 'x', 'l');
> @@ -7812,7 +8011,7 @@ static int avif_write_trailer(AVFormatContext *s)
>      mov_write_meta_tag(pb, mov, s);
>  
>      moov_size = get_moov_size(s);
> -    for (i = 0; i < s->nb_streams; i++)
> +    for (i = 0; i < mov->nb_tracks; i++)
>          mov->tracks[i].data_offset = avio_tell(pb) + moov_size + 8;
>  
>      if (mov->is_animated_avif) {
> @@ -7834,7 +8033,7 @@ static int avif_write_trailer(AVFormatContext *s)
>  
>      // write extent offsets.
>      pos_backup = avio_tell(pb);
> -    for (i = 0; i < s->nb_streams; i++) {
> +    for (i = 0; i < mov->nb_streams; i++) {
>          if (extent_offsets[i] != (uint32_t)extent_offsets[i]) {
>              av_log(s, AV_LOG_ERROR, "extent offset does not fit in 32 bits\n");
>              return AVERROR_INVALIDDATA;
> diff --git a/libavformat/movenc.h b/libavformat/movenc.h
> index 60363198c9..fee3e759e0 100644
> --- a/libavformat/movenc.h
> +++ b/libavformat/movenc.h
> @@ -25,7 +25,9 @@
>  #define AVFORMAT_MOVENC_H
>  
>  #include "avformat.h"
> +#include "iamf.h"
>  #include "movenccenc.h"
> +#include "libavcodec/bsf.h"

There is no need to include these here, as you don't need complete
types. This has the added benefit of forcing you to actually include the
files where you are using them (namely in movenc.c, where you forgot to
include bsf.h).
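
Roughly what I mean, as a sketch (not the actual patch; only the member
declarations and the movenc.c include list would change):

/* movenc.h: pointer members only need the struct tags declared */
struct AVBSFContext;
struct IAMFContext;

typedef struct MOVTrack {
    /* ... existing fields ... */
    struct AVBSFContext *bsf;
    struct IAMFContext  *iamf;
} MOVTrack;

/* movenc.c: include the headers with the complete types where they
 * are actually dereferenced */
#include "iamf_writer.h"
#include "libavcodec/bsf.h"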

>  #include "libavcodec/packet_internal.h"
>  
>  #define MOV_FRAG_INFO_ALLOC_INCREMENT 64
> @@ -170,6 +172,10 @@ typedef struct MOVTrack {
>      unsigned int squash_fragment_samples_to_one; //< flag to note formats where all samples for a fragment are to be squashed
>  
>      PacketList squashed_packet_queue;
> +
> +    AVBSFContext *bsf;
> +
> +    IAMFContext *iamf;
>  } MOVTrack;
>  
>  typedef enum {
> @@ -188,6 +194,7 @@ typedef struct MOVMuxContext {
>      const AVClass *av_class;
>      int     mode;
>      int64_t time;
> +    int     nb_streams;
>      int     nb_tracks;
>      int     nb_meta_tmcd;  ///< number of new created tmcd track based on metadata (aka not data copy)
>      int     chapter_track; ///< qt chapter track number
James Almer Feb. 5, 2024, 3:08 p.m. UTC | #2
On 2/3/2024 11:50 AM, Andreas Rheinhardt wrote:
>> diff --git a/libavformat/movenc.h b/libavformat/movenc.h
>> index 60363198c9..fee3e759e0 100644
>> --- a/libavformat/movenc.h
>> +++ b/libavformat/movenc.h
>> @@ -25,7 +25,9 @@
>>   #define AVFORMAT_MOVENC_H
>>   
>>   #include "avformat.h"
>> +#include "iamf.h"
>>   #include "movenccenc.h"
>> +#include "libavcodec/bsf.h"
> 
> There is no need to include these here, as you don't need complete
> types. This has the added benefit of forcing you to actually include the
> files where you are using them (namely in movenc.c, where you forgot to
> include bsf.h).

Ok, fixed locally.

Will push the set soon.
James Almer Feb. 5, 2024, 3:12 p.m. UTC | #3
On 2/5/2024 12:12 PM, Andreas Rheinhardt wrote:
> James Almer:
>> On 2/3/2024 11:50 AM, Andreas Rheinhardt wrote:
>>>> diff --git a/libavformat/movenc.h b/libavformat/movenc.h
>>>> index 60363198c9..fee3e759e0 100644
>>>> --- a/libavformat/movenc.h
>>>> +++ b/libavformat/movenc.h
>>>> @@ -25,7 +25,9 @@
>>>>    #define AVFORMAT_MOVENC_H
>>>>      #include "avformat.h"
>>>> +#include "iamf.h"
>>>>    #include "movenccenc.h"
>>>> +#include "libavcodec/bsf.h"
>>>
>>> There is no need to include these here, as you don't need complete
>>> types. This has the added benefit of forcing you to actually include the
>>> files where you are using them (namely in movenc.c, where you forgot to
>>> include bsf.h).
>>
>> Ok, fixed locally.
>>
>> Will push the set soon.
> 
> It seems you have not noticed my objection to the first version of your set.
> 
> - Andreas

Can you link to it?
Andreas Rheinhardt Feb. 5, 2024, 3:12 p.m. UTC | #4
James Almer:
> On 2/3/2024 11:50 AM, Andreas Rheinhardt wrote:
>>> diff --git a/libavformat/movenc.h b/libavformat/movenc.h
>>> index 60363198c9..fee3e759e0 100644
>>> --- a/libavformat/movenc.h
>>> +++ b/libavformat/movenc.h
>>> @@ -25,7 +25,9 @@
>>>   #define AVFORMAT_MOVENC_H
>>>     #include "avformat.h"
>>> +#include "iamf.h"
>>>   #include "movenccenc.h"
>>> +#include "libavcodec/bsf.h"
>>
>> There is no need to include these here, as you don't need complete
>> types. This has the added benefit of forcing you to actually include the
>> files where you are using them (namely in movenc.c, where you forgot to
>> include bsf.h).
> 
> Ok, fixed locally.
> 
> Will push the set soon.

It seems you have not noticed my objection to the first version of your set.

- Andreas
Andreas Rheinhardt Feb. 5, 2024, 3:28 p.m. UTC | #5
James Almer:
> On 2/5/2024 12:12 PM, Andreas Rheinhardt wrote:
>> James Almer:
>>> On 2/3/2024 11:50 AM, Andreas Rheinhardt wrote:
>>>>> diff --git a/libavformat/movenc.h b/libavformat/movenc.h
>>>>> index 60363198c9..fee3e759e0 100644
>>>>> --- a/libavformat/movenc.h
>>>>> +++ b/libavformat/movenc.h
>>>>> @@ -25,7 +25,9 @@
>>>>>    #define AVFORMAT_MOVENC_H
>>>>>      #include "avformat.h"
>>>>> +#include "iamf.h"
>>>>>    #include "movenccenc.h"
>>>>> +#include "libavcodec/bsf.h"
>>>>
>>>> There is no need to include these here, as you don't need complete
>>>> types. This has the added benefit of forcing you to actually include
>>>> the
>>>> files where you are using them (namely in movenc.c, where you forgot to
>>>> include bsf.h).
>>>
>>> Ok, fixed locally.
>>>
>>> Will push the set soon.
>>
>> It seems you have not noticed my objection to the first version of
>> your set.
>>
>> - Andreas
> 
> Can you link to it?

Sorry, it was v2:
https://ffmpeg.org/pipermail/ffmpeg-devel/2024-February/320722.html

- Andreas
James Almer Feb. 5, 2024, 3:28 p.m. UTC | #6
On 2/5/2024 12:28 PM, Andreas Rheinhardt wrote:
> James Almer:
>> On 2/5/2024 12:12 PM, Andreas Rheinhardt wrote:
>>> James Almer:
>>>> On 2/3/2024 11:50 AM, Andreas Rheinhardt wrote:
>>>>>> diff --git a/libavformat/movenc.h b/libavformat/movenc.h
>>>>>> index 60363198c9..fee3e759e0 100644
>>>>>> --- a/libavformat/movenc.h
>>>>>> +++ b/libavformat/movenc.h
>>>>>> @@ -25,7 +25,9 @@
>>>>>>     #define AVFORMAT_MOVENC_H
>>>>>>       #include "avformat.h"
>>>>>> +#include "iamf.h"
>>>>>>     #include "movenccenc.h"
>>>>>> +#include "libavcodec/bsf.h"
>>>>>
>>>>> There is no need to include these here, as you don't need complete
>>>>> types. This has the added benefit of forcing you to actually include
>>>>> the
>>>>> files where you are using them (namely in movenc.c, where you forgot to
>>>>> include bsf.h).
>>>>
>>>> Ok, fixed locally.
>>>>
>>>> Will push the set soon.
>>>
>>> It seems you have not noticed my objection to the first version of
>>> your set.
>>>
>>> - Andreas
>>
>> Can you link to it?
> 
> Sorry, it was v2:
> https://ffmpeg.org/pipermail/ffmpeg-devel/2024-February/320722.html
> 
> - Andreas

I removed the codec list from the split bsf like you asked, and 
explained what the bsfs do in the documentation.
Andreas Rheinhardt Feb. 5, 2024, 4:18 p.m. UTC | #7
James Almer:
> On 2/5/2024 12:28 PM, Andreas Rheinhardt wrote:
>> James Almer:
>>> On 2/5/2024 12:12 PM, Andreas Rheinhardt wrote:
>>>> James Almer:
>>>>> On 2/3/2024 11:50 AM, Andreas Rheinhardt wrote:
>>>>>>> diff --git a/libavformat/movenc.h b/libavformat/movenc.h
>>>>>>> index 60363198c9..fee3e759e0 100644
>>>>>>> --- a/libavformat/movenc.h
>>>>>>> +++ b/libavformat/movenc.h
>>>>>>> @@ -25,7 +25,9 @@
>>>>>>>     #define AVFORMAT_MOVENC_H
>>>>>>>       #include "avformat.h"
>>>>>>> +#include "iamf.h"
>>>>>>>     #include "movenccenc.h"
>>>>>>> +#include "libavcodec/bsf.h"
>>>>>>
>>>>>> There is no need to include these here, as you don't need complete
>>>>>> types. This has the added benefit of forcing you to actually include
>>>>>> the
>>>>>> files where you are using them (namely in movenc.c, where you
>>>>>> forgot to
>>>>>> include bsf.h).
>>>>>
>>>>> Ok, fixed locally.
>>>>>
>>>>> Will push the set soon.
>>>>
>>>> It seems you have not noticed my objection to the first version of
>>>> your set.
>>>>
>>>> - Andreas
>>>
>>> Can you link to it?
>>
>> Sorry, it was v2:
>> https://ffmpeg.org/pipermail/ffmpeg-devel/2024-February/320722.html
>>
>> - Andreas
> 
> I removed the codec list from the split bsf like you asked, and
> explained what the bsfs do in the documentation.

For those few codecs where different framings are common and supported
by us, the muxers convert the given framing to the needs of the output
format; decoders also support the various framings. This of course only
works if they can decide which packetization the input uses; it is
possible for the cases we support.

If you allow that packets can contain OBU encapsulated data for
arbitrary codec ids (even if only intended for a few of them), then this
packetization would become officially allowed and we would have to adapt
our muxers and decoders accordingly. Which is just not possible, because
there is just no information available to base this decision on.

There is a second complication with iamf_frame_split_bsf: Up until now,
BSFs only passed the stream index value through. But with this BSF the
output may have multiple ids even when the input only had one. I am
pretty sure that this will surprise and break many users. I don't know
whether ffmpeg is among them (if a user inserts the BSF manually).

In fact, for this BSF the stream_index that the output packet gets is
determined by the offset as well as the packet data alone. The only way
for the demuxer to know these numbers is if it has already parsed the
packet data before and added streams according to this. Looking at 3/6
it seems that this is indeed what's happening (which is wasteful). But
only partially: The iamf_descriptors data is checked and streams are
created according to it, but the data read via av_append_packet() is not
checked at all. What guarantees that it can't contain
IAMF_OBU_IA_AUDIO_ELEMENT elements (which trigger adding new ids and
therefore lead to an increment in the potential output stream_index)?
Also notice that your 3/6 uses the pkt's stream_index coming out of the
BSF without any check.

- Andreas
James Almer Feb. 5, 2024, 4:51 p.m. UTC | #8
On 2/5/2024 1:18 PM, Andreas Rheinhardt wrote:
> James Almer:
>> On 2/5/2024 12:28 PM, Andreas Rheinhardt wrote:
>>> James Almer:
>>>> On 2/5/2024 12:12 PM, Andreas Rheinhardt wrote:
>>>>> James Almer:
>>>>>> On 2/3/2024 11:50 AM, Andreas Rheinhardt wrote:
>>>>>>>> diff --git a/libavformat/movenc.h b/libavformat/movenc.h
>>>>>>>> index 60363198c9..fee3e759e0 100644
>>>>>>>> --- a/libavformat/movenc.h
>>>>>>>> +++ b/libavformat/movenc.h
>>>>>>>> @@ -25,7 +25,9 @@
>>>>>>>>      #define AVFORMAT_MOVENC_H
>>>>>>>>        #include "avformat.h"
>>>>>>>> +#include "iamf.h"
>>>>>>>>      #include "movenccenc.h"
>>>>>>>> +#include "libavcodec/bsf.h"
>>>>>>>
>>>>>>> There is no need to include these here, as you don't need complete
>>>>>>> types. This has the added benefit of forcing you to actually include
>>>>>>> the
>>>>>>> files where you are using them (namely in movenc.c, where you
>>>>>>> forgot to
>>>>>>> include bsf.h).
>>>>>>
>>>>>> Ok, fixed locally.
>>>>>>
>>>>>> Will push the set soon.
>>>>>
>>>>> It seems you have not noticed my objection to the first version of
>>>>> your set.
>>>>>
>>>>> - Andreas
>>>>
>>>> Can you link to it?
>>>
>>> Sorry, it was v2:
>>> https://ffmpeg.org/pipermail/ffmpeg-devel/2024-February/320722.html
>>>
>>> - Andreas
>>
>> I removed the codec list from the split bsf like you asked, and
>> explained what the bsfs do in the documentation.
> 
> For those few codecs where different framings are common and supported
> by us, the muxers convert the given framing to the needs of the output
> format; decoders also support the various framings. This of course only
> works if they can decide which packetization the input uses; it is
> possible for the cases we support.
> 
> If you allow that packets can contain OBU encapsulated data for
> arbitrary codec ids (even if only intended for a few of them), then this
> packetization would become officially allowed and we would have to adapt
> our muxers and decoders accordingly. Which is just not possible, because
> there is just no information available to base this decision on.

So you want me to state that these bsfs should not be used at all by 
library users, and that they are meant exclusively to be inserted by the 
mov muxer and demuxer?

> 
> There is a second complication with iamf_frame_split_bsf: Up until now,
> BSFs only passed the stream index value through. But with this BSF the
> output may have multiple ids even when the input only had one. I am
> pretty sure that this will surprise and break many users. I don't know
> whether ffmpeg is among them (if a user inserts the BSF manually).

This would be addressed by forbidding (or declaring unsupported) the 
usage of the filters by external callers.

> 
> In fact, for this BSF the stream_index that the output packet gets is
> determined by the offset as well as the packet data alone. The only way
> for the demuxer to know these numbers is if it has already parsed the
> packet data before and added streams according to this. Looking at 3/6

I think you mean 4/6.

> it seems that this is indeed what's happening (which is wasteful). But

Packets are not being parsed, only the descriptors in the relevant mp4 
sample entry.

> only partially: The iamf_descriptors data is checked and streams are
> created according to it, but the data read via av_append_packet() is not
> checked at all. What guarantees that it can't contain
> IAMF_OBU_IA_AUDIO_ELEMENT elements (which trigger adding new ids and
> therefore lead to an increment in the potential output stream_index)?
> Also notice that your 3/6 uses the pkt's stream_index coming out of the
> BSF without any check.

Again 4/6. And I can add a check for stream_index.
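
Something like this, just a sketch (receive_iamf_packet is a made up name
for whatever wrapper the demuxer ends up using to pull packets from the bsf):

static int receive_iamf_packet(AVFormatContext *s, AVBSFContext *bsf, AVPacket *pkt)
{
    int ret = av_bsf_receive_packet(bsf, pkt);
    if (ret < 0)
        return ret;
    /* don't blindly trust the index the split bsf assigned */
    if ((unsigned)pkt->stream_index >= s->nb_streams) {
        av_packet_unref(pkt);
        return AVERROR_INVALIDDATA;
    }
    return 0;
}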

I could make the split filter only parse descriptors once, and pass
them to it immediately after av_bsf_init(), so if packets have
descriptors, an error will be returned (The spec disallows descriptors 
in samples).
There's also the bsf's input codecpar extradata. I could use that instead.
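
Rough sketch of that idea (the helper name and where the descriptor buffer
comes from are made up; in practice this would live in the mov demuxer):

#include <string.h>

#include "libavcodec/bsf.h"
#include "libavcodec/defs.h"
#include "libavutil/error.h"
#include "libavutil/mem.h"

/* Hand the descriptor OBUs to the split bsf once, via par_in->extradata,
 * so it never has to look for them inside the samples. */
static int init_iamf_split_bsf(AVBSFContext **pbsf, const AVCodecParameters *par,
                               const uint8_t *descriptors, int size)
{
    const AVBitStreamFilter *filter = av_bsf_get_by_name("iamf_frame_split");
    AVBSFContext *bsf = NULL;
    int ret;

    if (!filter)
        return AVERROR_BUG;

    ret = av_bsf_alloc(filter, &bsf);
    if (ret < 0)
        return ret;

    ret = avcodec_parameters_copy(bsf->par_in, par);
    if (ret < 0)
        goto fail;

    /* replace whatever extradata got copied with the descriptors */
    av_freep(&bsf->par_in->extradata);
    bsf->par_in->extradata = av_mallocz(size + AV_INPUT_BUFFER_PADDING_SIZE);
    if (!bsf->par_in->extradata) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }
    memcpy(bsf->par_in->extradata, descriptors, size);
    bsf->par_in->extradata_size = size;

    ret = av_bsf_init(bsf);
    if (ret < 0)
        goto fail;

    *pbsf = bsf;
    return 0;
fail:
    av_bsf_free(&bsf);
    return ret;
}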
Andreas Rheinhardt Feb. 5, 2024, 5:06 p.m. UTC | #9
James Almer:
> On 2/5/2024 1:18 PM, Andreas Rheinhardt wrote:
>> James Almer:
>>> On 2/5/2024 12:28 PM, Andreas Rheinhardt wrote:
>>>> James Almer:
>>>>> On 2/5/2024 12:12 PM, Andreas Rheinhardt wrote:
>>>>>> James Almer:
>>>>>>> On 2/3/2024 11:50 AM, Andreas Rheinhardt wrote:
>>>>>>>>> diff --git a/libavformat/movenc.h b/libavformat/movenc.h
>>>>>>>>> index 60363198c9..fee3e759e0 100644
>>>>>>>>> --- a/libavformat/movenc.h
>>>>>>>>> +++ b/libavformat/movenc.h
>>>>>>>>> @@ -25,7 +25,9 @@
>>>>>>>>>      #define AVFORMAT_MOVENC_H
>>>>>>>>>        #include "avformat.h"
>>>>>>>>> +#include "iamf.h"
>>>>>>>>>      #include "movenccenc.h"
>>>>>>>>> +#include "libavcodec/bsf.h"
>>>>>>>>
>>>>>>>> There is no need to include these here, as you don't need complete
>>>>>>>> types. This has the added benefit of forcing you to actually
>>>>>>>> include
>>>>>>>> the
>>>>>>>> files where you are using them (namely in movenc.c, where you
>>>>>>>> forgot to
>>>>>>>> include bsf.h).
>>>>>>>
>>>>>>> Ok, fixed locally.
>>>>>>>
>>>>>>> Will push the set soon.
>>>>>>
>>>>>> It seems you have not noticed my objection to the first version of
>>>>>> your set.
>>>>>>
>>>>>> - Andreas
>>>>>
>>>>> Can you link to it?
>>>>
>>>> Sorry, it was v2:
>>>> https://ffmpeg.org/pipermail/ffmpeg-devel/2024-February/320722.html
>>>>
>>>> - Andreas
>>>
>>> I removed the codec list from the split bsf like you asked, and
>>> explained what the bsfs do in the documentation.
>>
>> For those few codecs where different framings are common and supported
>> by us, the muxers convert the given framing to the needs of the output
>> format; decoders also support the various framings. This of course only
>> works if they can decide which packetization the input uses; it is
>> possible for the cases we support.
>>
>> If you allow that packets can contain OBU encapsulated data for
>> arbitrary codec ids (even if only intended for a few of them), then this
>> packetization would become officially allowed and we would have to adapt
>> our muxers and decoders accordingly. Which is just not possible, because
>> there is just no information available to base this decision on.
> 
> So you want me to state that these bsfs should not be used at all by
> library users, and that they are meant exclusively to be inserted by the
> mov muxer and demuxer?
> 

Actually, I don't think that this should be done in a BSF at all. For
the reasons outlined above.

>>
>> There is a second complication with iamf_frame_split_bsf: Up until now,
>> BSFs only passed the stream index value through. But with this BSF the
>> output may have multiple ids even when the input only had one. I am
>> pretty sure that this will surprise and break many users. I don't know
>> whether ffmpeg is among them (if a user inserts the BSF manually).
> 
> This would be addressed by forbidding (or declaring unsupported) the
> usage of the filters by external callers.
> 

So a BSF for only one caller (lavf)?

>>
>> In fact, for this BSF the stream_index that the output packet gets is
>> determined by the offset as well as the packet data alone. The only way
>> for the demuxer to know these numbers is if it has already parsed the
>> packet data before and added streams according to this. Looking at 3/6
> 
> I think you mean 4/6.
> 
>> it seems that this is indeed what's happening (which is wasteful). But
> 
> Packets are not being parsed, only the descriptors in the relevant mp4
> sample entry.
> 

And it happens twice, which is wasteful.

>> only partially: The iamf_descriptors data is checked and streams are
>> created according to it, but the data read via av_append_packet() is not
>> checked at all. What guarantees that it can't contain
>> IAMF_OBU_IA_AUDIO_ELEMENT elements (which trigger adding new ids and
>> therefore lead to an increment in the potential output stream_index)?
>> Also notice that your 3/6 uses the pkt's stream_index coming out of the
>> BSF without any check.
> 
> Again 4/6. And I can add a check for stream_index.
> 

I think we should never end up in a scenario where this can happen.

> I could make the split filter only parse descriptors once, and pass
> them to it immediately after av_bsf_init(), so if packets have
> descriptors, an error will be returned (The spec disallows descriptors
> in samples).
> There's also the bsf's input codecpar extradata. I could use that instead.

If one were to use a BSF for this, then sending the descriptor via
extradata would be the way to go.

- Andreas
James Almer Feb. 5, 2024, 5:25 p.m. UTC | #10
On 2/5/2024 2:06 PM, Andreas Rheinhardt wrote:
> James Almer:
>> On 2/5/2024 1:18 PM, Andreas Rheinhardt wrote:
>>> James Almer:
>>>> On 2/5/2024 12:28 PM, Andreas Rheinhardt wrote:
>>>>> James Almer:
>>>>>> On 2/5/2024 12:12 PM, Andreas Rheinhardt wrote:
>>>>>>> James Almer:
>>>>>>>> On 2/3/2024 11:50 AM, Andreas Rheinhardt wrote:
>>>>>>>>>> diff --git a/libavformat/movenc.h b/libavformat/movenc.h
>>>>>>>>>> index 60363198c9..fee3e759e0 100644
>>>>>>>>>> --- a/libavformat/movenc.h
>>>>>>>>>> +++ b/libavformat/movenc.h
>>>>>>>>>> @@ -25,7 +25,9 @@
>>>>>>>>>>       #define AVFORMAT_MOVENC_H
>>>>>>>>>>         #include "avformat.h"
>>>>>>>>>> +#include "iamf.h"
>>>>>>>>>>       #include "movenccenc.h"
>>>>>>>>>> +#include "libavcodec/bsf.h"
>>>>>>>>>
>>>>>>>>> There is no need to include these here, as you don't need complete
>>>>>>>>> types. This has the added benefit of forcing you to actually
>>>>>>>>> include
>>>>>>>>> the
>>>>>>>>> files where you are using them (namely in movenc.c, where you
>>>>>>>>> forgot to
>>>>>>>>> include bsf.h).
>>>>>>>>
>>>>>>>> Ok, fixed locally.
>>>>>>>>
>>>>>>>> Will push the set soon.
>>>>>>>
>>>>>>> It seems you have not noticed my objection to the first version of
>>>>>>> your set.
>>>>>>>
>>>>>>> - Andreas
>>>>>>
>>>>>> Can you link to it?
>>>>>
>>>>> Sorry, it was v2:
>>>>> https://ffmpeg.org/pipermail/ffmpeg-devel/2024-February/320722.html
>>>>>
>>>>> - Andreas
>>>>
>>>> I removed the codec list from the split bsf like you asked, and
>>>> explained what the bsfs do in the documentation.
>>>
>>> For those few codecs where different framings are common and supported
>>> by us, the muxers convert the given framing to the needs of the output
>>> format; decoders also support the various framings. This of course only
>>> works if they can decide which packetization the input uses; it is
>>> possible for the cases we support.
>>>
>>> If you allow that packets can contain OBU encapsulated data for
>>> arbitrary codec ids (even if only intended for a few of them), then this
>>> packetization would become officially allowed and we would have to adapt
>>> our muxers and decoders accordingly. Which is just not possible, because
>>> there is just no information available to base this decision on.
>>
>> So you want me to state that these bsfs should not be used at all by
>> library users, and that they are meant exclusively to be inserted by the
>> mov muxer and demuxer?
>>
> 
> Actually, I don't think that this should be done in a BSF at all. For
> the reasons outlined above.

I have to disagree. I'm using the packet filtering API to filter packets.
Saying that it shouldn't be a bsf because it's mainly useful for 
internal libav* modules is not a good argument. External users can use 
it too if they want to, seeing there are lavc users other than lavf, and 
other mp4 demuxers could benefit from it. And we have bsfs, like 
vp9_superframe_split, that have virtually no use whatsoever for callers 
other than our native decoders (even though other decoders could benefit 
from it too).

These are packetizer and depacketizer filters. We're not standardizing 
any kind of encapsulation for specific codecs that other modules will have
to look for.

> 
>>>
>>> There is a second complication with iamf_frame_split_bsf: Up until now,
>>> BSFs only passed the stream index value through. But with this BSF the
>>> output may have multiple ids even when the input only had one. I am
>>> pretty sure that this will surprise and break many users. I don't know
>>> whether ffmpeg is among them (if a user inserts the BSF manually).
>>
>> This would be addressed by forbidding (or declaring unsupported) the
>> usage of the filters by external callers.
>>
> 
> So a BSF for only one caller (lavf)?
> 
>>>
>>> In fact, for this BSF the stream_index that the output packet gets is
>>> determined by the offset as well as the packet data alone. The only way
>>> for the demuxer to know these numbers is if it has already parsed the
>>> packet data before and added streams according to this. Looking at 3/6
>>
>> I think you mean 4/6.
>>
>>> it seems that this is indeed what's happening (which is wasteful). But
>>
>> Packets are not being parsed, only the descriptors in the relevant mp4
>> sample entry.
>>
> 
> And it happens twice, which is wasteful.

Muxer and bsf are separate independent modules. It's expected to be the 
case.

> 
>>> only partially: The iamf_descriptors data is checked and streams are
>>> created according to it, but the data read via av_append_packet() is not
>>> checked at all. What guarantees that it can't contain
>>> IAMF_OBU_IA_AUDIO_ELEMENT elements (which trigger adding new ids and
>>> therefore lead to an increment in the potential output stream_index)?
>>> Also notice that your 3/6 uses the pkt's stream_index coming out of the
>>> BSF without any check.
>>
>> Again 4/6. And I can add a check for stream_index.
>>
> 
>> I think we should never end up in a scenario where this can happen.
> 
>> I could make the split filter only parse descriptors once, and pass
>> them to it immediately after av_bsf_init(), so if packets have
>> descriptors, an error will be returned (The spec disallows descriptors
>> in samples).
>> There's also the bsf's input codecpar extradata. I could use that instead.
> 
> If one were to use a BSF for this, then sending the descriptor via
> extradata would be the way to go.
> 
> - Andreas
diff mbox series

Patch

diff --git a/configure b/configure
index 42ba5ec502..6cdd101487 100755
--- a/configure
+++ b/configure
@@ -3547,7 +3547,7 @@  mlp_demuxer_select="mlp_parser"
 mmf_muxer_select="riffenc"
 mov_demuxer_select="iso_media riffdec iamf_frame_split_bsf"
 mov_demuxer_suggest="zlib"
-mov_muxer_select="iso_media riffenc rtpenc_chain vp9_superframe_bsf aac_adtstoasc_bsf ac3_parser"
+mov_muxer_select="iso_media riffenc rtpenc_chain vp9_superframe_bsf aac_adtstoasc_bsf iamf_frame_merge_bsf ac3_parser"
 mp3_demuxer_select="mpegaudio_parser"
 mp3_muxer_select="mpegaudioheader"
 mp4_muxer_select="mov_muxer"
diff --git a/libavformat/movenc.c b/libavformat/movenc.c
index b724bd5ebc..dfa8b6b04e 100644
--- a/libavformat/movenc.c
+++ b/libavformat/movenc.c
@@ -32,6 +32,7 @@ 
 #include "dovi_isom.h"
 #include "riff.h"
 #include "avio.h"
+#include "iamf_writer.h"
 #include "isom.h"
 #include "av1.h"
 #include "avc.h"
@@ -47,6 +48,7 @@ 
 #include "libavcodec/raw.h"
 #include "internal.h"
 #include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/csp.h"
 #include "libavutil/intfloat.h"
@@ -316,6 +318,32 @@  static int mov_write_sdtp_tag(AVIOContext *pb, MOVTrack *track)
     return update_size(pb, pos);
 }
 
+static int mov_write_iacb_tag(AVFormatContext *s, AVIOContext *pb, MOVTrack *track)
+{
+    AVIOContext *dyn_bc;
+    int64_t pos = avio_tell(pb);
+    uint8_t *dyn_buf = NULL;
+    int dyn_size;
+    int ret = avio_open_dyn_buf(&dyn_bc);
+    if (ret < 0)
+        return ret;
+
+    avio_wb32(pb, 0);
+    ffio_wfourcc(pb, "iacb");
+    avio_w8(pb, 1); // configurationVersion
+
+    ret = ff_iamf_write_descriptors(track->iamf, dyn_bc, s);
+    if (ret < 0)
+        return ret;
+
+    dyn_size = avio_close_dyn_buf(dyn_bc, &dyn_buf);
+    ffio_write_leb(pb, dyn_size);
+    avio_write(pb, dyn_buf, dyn_size);
+    av_free(dyn_buf);
+
+    return update_size(pb, pos);
+}
+
 static int mov_write_amr_tag(AVIOContext *pb, MOVTrack *track)
 {
     avio_wb32(pb, 0x11); /* size */
@@ -1358,6 +1386,8 @@  static int mov_write_audio_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContex
         ret = mov_write_wave_tag(s, pb, track);
     else if (track->tag == MKTAG('m','p','4','a'))
         ret = mov_write_esds_tag(pb, track);
+    else if (track->tag == MKTAG('i','a','m','f'))
+        ret = mov_write_iacb_tag(mov->fc, pb, track);
     else if (track->par->codec_id == AV_CODEC_ID_AMR_NB)
         ret = mov_write_amr_tag(pb, track);
     else if (track->par->codec_id == AV_CODEC_ID_AC3)
@@ -2501,7 +2531,7 @@  static int mov_write_video_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContex
 
     if (track->mode == MODE_AVIF) {
         mov_write_ccst_tag(pb);
-        if (s->nb_streams > 0 && track == &mov->tracks[1])
+        if (mov->nb_streams > 0 && track == &mov->tracks[1])
             mov_write_aux_tag(pb, "auxi");
     }
 
@@ -3096,9 +3126,9 @@  static int mov_write_iloc_tag(AVIOContext *pb, MOVMuxContext *mov, AVFormatConte
     avio_wb32(pb, 0); /* Version & flags */
     avio_w8(pb, (4 << 4) + 4); /* offset_size(4) and length_size(4) */
     avio_w8(pb, 0); /* base_offset_size(4) and reserved(4) */
-    avio_wb16(pb, s->nb_streams); /* item_count */
+    avio_wb16(pb, mov->nb_streams); /* item_count */
 
-    for (int i = 0; i < s->nb_streams; i++) {
+    for (int i = 0; i < mov->nb_streams; i++) {
         avio_wb16(pb, i + 1); /* item_id */
         avio_wb16(pb, 0); /* data_reference_index */
         avio_wb16(pb, 1); /* extent_count */
@@ -3117,9 +3147,9 @@  static int mov_write_iinf_tag(AVIOContext *pb, MOVMuxContext *mov, AVFormatConte
     avio_wb32(pb, 0); /* size */
     ffio_wfourcc(pb, "iinf");
     avio_wb32(pb, 0); /* Version & flags */
-    avio_wb16(pb, s->nb_streams); /* entry_count */
+    avio_wb16(pb, mov->nb_streams); /* entry_count */
 
-    for (int i = 0; i < s->nb_streams; i++) {
+    for (int i = 0; i < mov->nb_streams; i++) {
         int64_t infe_pos = avio_tell(pb);
         avio_wb32(pb, 0); /* size */
         ffio_wfourcc(pb, "infe");
@@ -3188,7 +3218,7 @@  static int mov_write_ipco_tag(AVIOContext *pb, MOVMuxContext *mov, AVFormatConte
     int64_t pos = avio_tell(pb);
     avio_wb32(pb, 0); /* size */
     ffio_wfourcc(pb, "ipco");
-    for (int i = 0; i < s->nb_streams; i++) {
+    for (int i = 0; i < mov->nb_streams; i++) {
         mov_write_ispe_tag(pb, mov, s, i);
         mov_write_pixi_tag(pb, mov, s, i);
         mov_write_av1c_tag(pb, &mov->tracks[i]);
@@ -3206,9 +3236,9 @@  static int mov_write_ipma_tag(AVIOContext *pb, MOVMuxContext *mov, AVFormatConte
     avio_wb32(pb, 0); /* size */
     ffio_wfourcc(pb, "ipma");
     avio_wb32(pb, 0); /* Version & flags */
-    avio_wb32(pb, s->nb_streams); /* entry_count */
+    avio_wb32(pb, mov->nb_streams); /* entry_count */
 
-    for (int i = 0, index = 1; i < s->nb_streams; i++) {
+    for (int i = 0, index = 1; i < mov->nb_streams; i++) {
         avio_wb16(pb, i + 1); /* item_ID */
         avio_w8(pb, 4); /* association_count */
 
@@ -4185,7 +4215,7 @@  static int mov_write_covr(AVIOContext *pb, AVFormatContext *s)
     int64_t pos = 0;
     int i;
 
-    for (i = 0; i < s->nb_streams; i++) {
+    for (i = 0; i < mov->nb_streams; i++) {
         MOVTrack *trk = &mov->tracks[i];
 
         if (!is_cover_image(trk->st) || trk->cover_image->size <= 0)
@@ -4332,7 +4362,7 @@  static int mov_write_meta_tag(AVIOContext *pb, MOVMuxContext *mov,
         mov_write_pitm_tag(pb, 1);
         mov_write_iloc_tag(pb, mov, s);
         mov_write_iinf_tag(pb, mov, s);
-        if (s->nb_streams > 1)
+        if (mov->nb_streams > 1)
             mov_write_iref_tag(pb, mov, s);
         mov_write_iprp_tag(pb, mov, s);
     } else {
@@ -4583,16 +4613,17 @@  static int mov_setup_track_ids(MOVMuxContext *mov, AVFormatContext *s)
 
     if (mov->use_stream_ids_as_track_ids) {
         int next_generated_track_id = 0;
-        for (i = 0; i < s->nb_streams; i++) {
-            if (s->streams[i]->id > next_generated_track_id)
-                next_generated_track_id = s->streams[i]->id;
+        for (i = 0; i < mov->nb_streams; i++) {
+            AVStream *st = mov->tracks[i].st;
+            if (st->id > next_generated_track_id)
+                next_generated_track_id = st->id;
         }
 
         for (i = 0; i < mov->nb_tracks; i++) {
             if (mov->tracks[i].entry <= 0 && !(mov->flags & FF_MOV_FLAG_FRAGMENT))
                 continue;
 
-            mov->tracks[i].track_id = i >= s->nb_streams ? ++next_generated_track_id : s->streams[i]->id;
+            mov->tracks[i].track_id = i >= mov->nb_streams ? ++next_generated_track_id : mov->tracks[i].st->id;
         }
     } else {
         for (i = 0; i < mov->nb_tracks; i++) {
@@ -4629,7 +4660,7 @@  static int mov_write_moov_tag(AVIOContext *pb, MOVMuxContext *mov,
     }
 
     if (mov->chapter_track)
-        for (i = 0; i < s->nb_streams; i++) {
+        for (i = 0; i < mov->nb_streams; i++) {
             mov->tracks[i].tref_tag = MKTAG('c','h','a','p');
             mov->tracks[i].tref_id  = mov->tracks[mov->chapter_track].track_id;
         }
@@ -4669,7 +4700,7 @@  static int mov_write_moov_tag(AVIOContext *pb, MOVMuxContext *mov,
     for (i = 0; i < mov->nb_tracks; i++) {
         if (mov->tracks[i].entry > 0 || mov->flags & FF_MOV_FLAG_FRAGMENT ||
             mov->mode == MODE_AVIF) {
-            int ret = mov_write_trak_tag(s, pb, mov, &(mov->tracks[i]), i < s->nb_streams ? s->streams[i] : NULL);
+            int ret = mov_write_trak_tag(s, pb, mov, &(mov->tracks[i]), i < mov->nb_streams ? mov->tracks[i].st : NULL);
             if (ret < 0)
                 return ret;
         }
@@ -5463,8 +5494,8 @@  static int mov_write_ftyp_tag(AVIOContext *pb, AVFormatContext *s)
     int has_h264 = 0, has_av1 = 0, has_video = 0, has_dolby = 0;
     int i;
 
-    for (i = 0; i < s->nb_streams; i++) {
-        AVStream *st = s->streams[i];
+    for (i = 0; i < mov->nb_streams; i++) {
+        AVStream *st = mov->tracks[i].st;
         if (is_cover_image(st))
             continue;
         if (st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO)
@@ -5639,8 +5670,8 @@  static int mov_write_identification(AVIOContext *pb, AVFormatContext *s)
     mov_write_ftyp_tag(pb,s);
     if (mov->mode == MODE_PSP) {
         int video_streams_nb = 0, audio_streams_nb = 0, other_streams_nb = 0;
-        for (i = 0; i < s->nb_streams; i++) {
-            AVStream *st = s->streams[i];
+        for (i = 0; i < mov->nb_streams; i++) {
+            AVStream *st = mov->tracks[i].st;
             if (is_cover_image(st))
                 continue;
             if (st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO)
@@ -5827,7 +5858,7 @@  static int mov_write_squashed_packets(AVFormatContext *s)
 {
     MOVMuxContext *mov = s->priv_data;
 
-    for (int i = 0; i < s->nb_streams; i++) {
+    for (int i = 0; i < mov->nb_streams; i++) {
         MOVTrack *track = &mov->tracks[i];
         int ret = AVERROR_BUG;
 
@@ -5868,7 +5899,7 @@  static int mov_flush_fragment(AVFormatContext *s, int force)
     // of fragments was triggered automatically by an AVPacket, we
     // already have reliable info for the end of that track, but other
     // tracks may need to be filled in.
-    for (i = 0; i < s->nb_streams; i++) {
+    for (i = 0; i < mov->nb_streams; i++) {
         MOVTrack *track = &mov->tracks[i];
         if (!track->end_reliable) {
             const AVPacket *pkt = ff_interleaved_peek(s, i);
@@ -6069,10 +6100,8 @@  static int mov_auto_flush_fragment(AVFormatContext *s, int force)
     return ret;
 }
 
-static int check_pkt(AVFormatContext *s, AVPacket *pkt)
+static int check_pkt(AVFormatContext *s, MOVTrack *trk, AVPacket *pkt)
 {
-    MOVMuxContext *mov = s->priv_data;
-    MOVTrack *trk = &mov->tracks[pkt->stream_index];
     int64_t ref;
     uint64_t duration;
 
@@ -6110,15 +6139,21 @@  int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
 {
     MOVMuxContext *mov = s->priv_data;
     AVIOContext *pb = s->pb;
-    MOVTrack *trk = &mov->tracks[pkt->stream_index];
-    AVCodecParameters *par = trk->par;
+    MOVTrack *trk;
+    AVCodecParameters *par;
     AVProducerReferenceTime *prft;
     unsigned int samples_in_chunk = 0;
     int size = pkt->size, ret = 0, offset = 0;
     size_t prft_size;
     uint8_t *reformatted_data = NULL;
 
-    ret = check_pkt(s, pkt);
+    if (pkt->stream_index < s->nb_streams)
+        trk = s->streams[pkt->stream_index]->priv_data;
+    else // Timecode or chapter
+        trk = &mov->tracks[pkt->stream_index];
+    par = trk->par;
+
+    ret = check_pkt(s, trk, pkt);
     if (ret < 0)
         return ret;
 
@@ -6208,7 +6243,7 @@  int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
 
     if (par->codec_id == AV_CODEC_ID_AAC && pkt->size > 2 &&
         (AV_RB16(pkt->data) & 0xfff0) == 0xfff0) {
-        if (!s->streams[pkt->stream_index]->nb_frames) {
+        if (!trk->st->nb_frames) {
             av_log(s, AV_LOG_ERROR, "Malformed AAC bitstream detected: "
                    "use the audio bitstream filter 'aac_adtstoasc' to fix it "
                    "('-bsf:a aac_adtstoasc' option with ffmpeg)\n");
@@ -6470,18 +6505,18 @@  err:
 static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt)
 {
     MOVMuxContext *mov = s->priv_data;
-    MOVTrack *trk = &mov->tracks[pkt->stream_index];
+    MOVTrack *trk = s->streams[pkt->stream_index]->priv_data;
     AVCodecParameters *par = trk->par;
     int64_t frag_duration = 0;
     int size = pkt->size;
 
-    int ret = check_pkt(s, pkt);
+    int ret = check_pkt(s, trk, pkt);
     if (ret < 0)
         return ret;
 
     if (mov->flags & FF_MOV_FLAG_FRAG_DISCONT) {
         int i;
-        for (i = 0; i < s->nb_streams; i++)
+        for (i = 0; i < mov->nb_streams; i++)
             mov->tracks[i].frag_discont = 1;
         mov->flags &= ~FF_MOV_FLAG_FRAG_DISCONT;
     }
@@ -6523,7 +6558,7 @@  static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt)
         return 0;             /* Discard 0 sized packets */
     }
 
-    if (trk->entry && pkt->stream_index < s->nb_streams)
+    if (trk->entry && pkt->stream_index < mov->nb_streams)
         frag_duration = av_rescale_q(pkt->dts - trk->cluster[0].dts,
                 s->streams[pkt->stream_index]->time_base,
                 AV_TIME_BASE_Q);
@@ -6578,17 +6613,45 @@  static int mov_write_subtitle_end_packet(AVFormatContext *s,
     return ret;
 }
 
+static int mov_filter_packet(AVFormatContext *s, MOVTrack *track, AVPacket *pkt)
+{
+    int ret;
+
+    if (!track->bsf)
+        return 0;
+
+    ret = av_bsf_send_packet(track->bsf, pkt);
+    if (ret < 0) {
+        av_log(s, AV_LOG_ERROR,
+                "Failed to send packet to filter %s for stream %d: %s\n",
+                track->bsf->filter->name, pkt->stream_index, av_err2str(ret));
+        return ret;
+    }
+
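+    /* May return AVERROR(EAGAIN) when the filter needs more input packets
+     * before it can output a merged one; the caller treats that as
+     * "no output yet". */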
+    return av_bsf_receive_packet(track->bsf, pkt);
+}
+
 static int mov_write_packet(AVFormatContext *s, AVPacket *pkt)
 {
     MOVMuxContext *mov = s->priv_data;
     MOVTrack *trk;
+    int ret;
 
     if (!pkt) {
         mov_flush_fragment(s, 1);
         return 1;
     }
 
-    trk = &mov->tracks[pkt->stream_index];
+    trk = s->streams[pkt->stream_index]->priv_data;
+
+    ret = mov_filter_packet(s, trk, pkt);
+    if (ret < 0) {
+        if (ret == AVERROR(EAGAIN))
+            return 0;
+        av_log(s, AV_LOG_ERROR, "Error applying bitstream filters to an output "
+                                "packet for stream #%d: %s\n", trk->st->index, av_err2str(ret));
+        return ret;
+    }
 
     if (is_cover_image(trk->st)) {
         int ret;
@@ -6789,12 +6852,12 @@  static int mov_create_chapter_track(AVFormatContext *s, int tracknum)
 }
 
 
-static int mov_check_timecode_track(AVFormatContext *s, AVTimecode *tc, int src_index, const char *tcstr)
+static int mov_check_timecode_track(AVFormatContext *s, AVTimecode *tc, AVStream *src_st, const char *tcstr)
 {
     int ret;
 
     /* compute the frame number */
-    ret = av_timecode_init_from_string(tc, s->streams[src_index]->avg_frame_rate, tcstr, s);
+    ret = av_timecode_init_from_string(tc, src_st->avg_frame_rate, tcstr, s);
     return ret;
 }
 
@@ -6802,7 +6865,7 @@  static int mov_create_timecode_track(AVFormatContext *s, int index, int src_inde
 {
     MOVMuxContext *mov  = s->priv_data;
     MOVTrack *track     = &mov->tracks[index];
-    AVStream *src_st    = s->streams[src_index];
+    AVStream *src_st    = mov->tracks[src_index].st;
     uint8_t data[4];
     AVPacket *pkt = mov->pkt;
     AVRational rate = src_st->avg_frame_rate;
@@ -6862,8 +6925,8 @@  static void enable_tracks(AVFormatContext *s)
         first[i] = -1;
     }
 
-    for (i = 0; i < s->nb_streams; i++) {
-        AVStream *st = s->streams[i];
+    for (i = 0; i < mov->nb_streams; i++) {
+        AVStream *st = mov->tracks[i].st;
 
         if (st->codecpar->codec_type <= AVMEDIA_TYPE_UNKNOWN ||
             st->codecpar->codec_type >= AVMEDIA_TYPE_NB ||
@@ -6897,6 +6960,9 @@  static void mov_free(AVFormatContext *s)
     MOVMuxContext *mov = s->priv_data;
     int i;
 
+    for (i = 0; i < s->nb_streams; i++)
+        s->streams[i]->priv_data = NULL;
+
     if (!mov->tracks)
         return;
 
@@ -6927,6 +6993,7 @@  static void mov_free(AVFormatContext *s)
         ffio_free_dyn_buf(&track->mdat_buf);
 
         avpriv_packet_list_free(&track->squashed_packet_queue);
+        av_bsf_free(&track->bsf);
     }
 
     av_freep(&mov->tracks);
@@ -6999,6 +7066,92 @@  static int mov_create_dvd_sub_decoder_specific_info(MOVTrack *track,
     return 0;
 }
 
+static int mov_init_iamf_track(AVFormatContext *s)
+{
+    MOVMuxContext *mov = s->priv_data;
+    MOVTrack *track = &mov->tracks[0]; // The IAMF track, if present, is always the first one
+    const AVBitStreamFilter *filter;
+    AVBPrint bprint;
+    AVStream *first_st = NULL;
+    char *args;
+    int nb_audio_elements = 0, nb_mix_presentations = 0;
+    int ret;
+
+    for (int i = 0; i < s->nb_stream_groups; i++) {
+        const AVStreamGroup *stg = s->stream_groups[i];
+
+        if (stg->type == AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT)
+            nb_audio_elements++;
+        if (stg->type == AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION)
+            nb_mix_presentations++;
+    }
+
+    if (!nb_audio_elements && !nb_mix_presentations)
+        return 0;
+
+    if (nb_audio_elements < 1 || nb_audio_elements > 2 || nb_mix_presentations < 1) {
+        av_log(s, AV_LOG_ERROR, "There must be one or two IAMF_AUDIO_ELEMENT stream groups and "
+                                "at least one IAMF_MIX_PRESENTATION stream group to write an IAMF track\n");
+        return AVERROR(EINVAL);
+    }
+
+    track->iamf = av_mallocz(sizeof(*track->iamf));
+    if (!track->iamf)
+        return AVERROR(ENOMEM);
+
+    av_bprint_init(&bprint, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    for (int i = 0; i < s->nb_stream_groups; i++) {
+        const AVStreamGroup *stg = s->stream_groups[i];
+
+        switch (stg->type) {
+        case AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT:
+            if (!first_st)
+                first_st = stg->streams[0];
+
+            for (int j = 0; j < stg->nb_streams; j++) {
+                av_bprintf(&bprint, "%d=%d%s", stg->streams[j]->index, stg->streams[j]->id,
+                                               j < (stg->nb_streams - 1) ? ":" : "");
+                stg->streams[j]->priv_data = track;
+            }
+
+            ret = ff_iamf_add_audio_element(track->iamf, stg, s);
+            break;
+        case AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION:
+            ret = ff_iamf_add_mix_presentation(track->iamf, stg, s);
+            break;
+        default:
+            av_assert0(0);
+        }
+        if (ret < 0) {
+            av_bprint_finalize(&bprint, NULL);
+            return ret;
+        }
+    }
+
+    ret = av_bprint_finalize(&bprint, &args);
+    if (ret < 0)
+        return ret;
+
+    filter = av_bsf_get_by_name("iamf_frame_merge");
+    if (!filter) {
+        av_log(s, AV_LOG_ERROR, "iamf_frame_merge bitstream filter "
+               "not found. This is a bug, please report it.\n");
+        return AVERROR_BUG;
+    }
+
+    ret = av_bsf_alloc(filter, &track->bsf);
+    if (ret < 0)
+        return ret;
+
+    ret = avcodec_parameters_copy(track->bsf->par_in, first_st->codecpar);
+    if (ret < 0)
+        return ret;
+
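+    /* index_mapping pairs each stream index with its AVStream id as built above;
+     * out_index makes the merged packets carry first_st's index, so they resolve
+     * back to this track through the stream's priv_data. */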
+    av_opt_set(track->bsf->priv_data, "index_mapping", args, 0);
+    av_opt_set_int(track->bsf->priv_data, "out_index", first_st->index, 0);
+    av_freep(&args);
+
+    track->tag = MKTAG('i','a','m','f');
+
+    return av_bsf_init(track->bsf);
+}
+
 static int mov_init(AVFormatContext *s)
 {
     MOVMuxContext *mov = s->priv_data;
@@ -7136,7 +7289,37 @@  static int mov_init(AVFormatContext *s)
         s->streams[0]->disposition |= AV_DISPOSITION_DEFAULT;
     }
 
-    mov->nb_tracks = s->nb_streams;
+    for (i = 0; i < s->nb_stream_groups; i++) {
+        AVStreamGroup *stg = s->stream_groups[i];
+
+        if (stg->type != AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT)
+            continue;
+
+        for (int j = 0; j < stg->nb_streams; j++) {
+            AVStream *st = stg->streams[j];
+
+            if (st->priv_data) {
+                av_log(s, AV_LOG_ERROR, "Stream %d is present in more than one Stream Group of type "
+                                        "IAMF Audio Element\n", j);
+                return AVERROR(EINVAL);
+            }
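+            /* Mark the stream as claimed; mov_init_iamf_track() will point it
+             * at the shared IAMF MOVTrack later. */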
+            st->priv_data = st;
+        }
+
+        if (!mov->nb_tracks) // We support one track for the entire IAMF structure
+            mov->nb_tracks++;
+    }
+
+    for (i = 0; i < s->nb_streams; i++) {
+        AVStream *st = s->streams[i];
+        if (st->priv_data)
+            continue;
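+        /* Streams outside any IAMF audio element each get a track of their own;
+         * the marker is swapped for the actual MOVTrack pointer further down. */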
+        st->priv_data = st;
+        mov->nb_tracks++;
+    }
+
+    mov->nb_streams = mov->nb_tracks;
+
     if (mov->mode & (MODE_MP4|MODE_MOV|MODE_IPOD) && s->nb_chapters)
         mov->chapter_track = mov->nb_tracks++;
 
@@ -7162,7 +7345,7 @@  static int mov_init(AVFormatContext *s)
             if (st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO &&
                 (t || (t=av_dict_get(st->metadata, "timecode", NULL, 0)))) {
                 AVTimecode tc;
-                ret = mov_check_timecode_track(s, &tc, i, t->value);
+                ret = mov_check_timecode_track(s, &tc, st, t->value);
                 if (ret >= 0)
                     mov->nb_meta_tmcd++;
             }
@@ -7211,18 +7394,33 @@  static int mov_init(AVFormatContext *s)
         }
     }
 
+    ret = mov_init_iamf_track(s);
+    if (ret < 0)
+        return ret;
+
+    for (int j = 0, i = 0; j < s->nb_streams; j++) {
+        AVStream *st = s->streams[j];
+
+        if (st != st->priv_data)
+            continue;
+        st->priv_data = &mov->tracks[i++];
+    }
+
     for (i = 0; i < s->nb_streams; i++) {
         AVStream *st= s->streams[i];
-        MOVTrack *track= &mov->tracks[i];
+        MOVTrack *track = st->priv_data;
         AVDictionaryEntry *lang = av_dict_get(st->metadata, "language", NULL,0);
 
-        track->st  = st;
-        track->par = st->codecpar;
+        if (!track->st) {
+            track->st  = st;
+            track->par = st->codecpar;
+        }
         track->language = ff_mov_iso639_to_lang(lang?lang->value:"und", mov->mode!=MODE_MOV);
         if (track->language < 0)
             track->language = 32767;  // Unspecified Macintosh language code
         track->mode = mov->mode;
-        track->tag  = mov_find_codec_tag(s, track);
+        if (!track->tag)
+            track->tag  = mov_find_codec_tag(s, track);
         if (!track->tag) {
             av_log(s, AV_LOG_ERROR, "Could not find tag for codec %s in stream #%d, "
                    "codec not currently supported in container\n",
@@ -7414,25 +7612,26 @@  static int mov_write_header(AVFormatContext *s)
 {
     AVIOContext *pb = s->pb;
     MOVMuxContext *mov = s->priv_data;
-    int i, ret, hint_track = 0, tmcd_track = 0, nb_tracks = s->nb_streams;
+    int i, ret, hint_track = 0, tmcd_track = 0, nb_tracks = mov->nb_streams;
 
     if (mov->mode & (MODE_MP4|MODE_MOV|MODE_IPOD) && s->nb_chapters)
         nb_tracks++;
 
     if (mov->flags & FF_MOV_FLAG_RTP_HINT) {
         hint_track = nb_tracks;
-        for (i = 0; i < s->nb_streams; i++)
-            if (rtp_hinting_needed(s->streams[i]))
+        for (i = 0; i < mov->nb_streams; i++) {
+            if (rtp_hinting_needed(mov->tracks[i].st))
                 nb_tracks++;
+        }
     }
 
     if (mov->nb_meta_tmcd)
         tmcd_track = nb_tracks;
 
-    for (i = 0; i < s->nb_streams; i++) {
+    for (i = 0; i < mov->nb_streams; i++) {
         int j;
-        AVStream *st= s->streams[i];
-        MOVTrack *track= &mov->tracks[i];
+        MOVTrack *track = &mov->tracks[i];
+        AVStream *st = track->st;
 
         /* copy extradata if it exists */
         if (st->codecpar->extradata_size) {
@@ -7454,8 +7653,8 @@  static int mov_write_header(AVFormatContext *s)
                                       &(AVChannelLayout)AV_CHANNEL_LAYOUT_MONO))
             continue;
 
-        for (j = 0; j < s->nb_streams; j++) {
-            AVStream *stj= s->streams[j];
+        for (j = 0; j < mov->nb_streams; j++) {
+            AVStream *stj= mov->tracks[j].st;
             MOVTrack *trackj= &mov->tracks[j];
             if (j == i)
                 continue;
@@ -7518,8 +7717,8 @@  static int mov_write_header(AVFormatContext *s)
             return ret;
 
     if (mov->flags & FF_MOV_FLAG_RTP_HINT) {
-        for (i = 0; i < s->nb_streams; i++) {
-            if (rtp_hinting_needed(s->streams[i])) {
+        for (i = 0; i < mov->nb_streams; i++) {
+            if (rtp_hinting_needed(mov->tracks[i].st)) {
                 if ((ret = ff_mov_init_hinting(s, hint_track, i)) < 0)
                     return ret;
                 hint_track++;
@@ -7531,8 +7730,8 @@  static int mov_write_header(AVFormatContext *s)
         const AVDictionaryEntry *t, *global_tcr = av_dict_get(s->metadata,
                                                               "timecode", NULL, 0);
         /* Initialize the tmcd tracks */
-        for (i = 0; i < s->nb_streams; i++) {
-            AVStream *st = s->streams[i];
+        for (i = 0; i < mov->nb_streams; i++) {
+            AVStream *st = mov->tracks[i].st;
             t = global_tcr;
 
             if (st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
@@ -7541,7 +7740,7 @@  static int mov_write_header(AVFormatContext *s)
                     t = av_dict_get(st->metadata, "timecode", NULL, 0);
                 if (!t)
                     continue;
-                if (mov_check_timecode_track(s, &tc, i, t->value) < 0)
+                if (mov_check_timecode_track(s, &tc, st, t->value) < 0)
                     continue;
                 if ((ret = mov_create_timecode_track(s, tmcd_track, i, tc)) < 0)
                     return ret;
@@ -7662,7 +7861,7 @@  static int mov_write_trailer(AVFormatContext *s)
     int64_t moov_pos;
 
     if (mov->need_rewrite_extradata) {
-        for (i = 0; i < s->nb_streams; i++) {
+        for (i = 0; i < mov->nb_streams; i++) {
             MOVTrack *track = &mov->tracks[i];
             AVCodecParameters *par = track->par;
 
@@ -7802,7 +8001,7 @@  static int avif_write_trailer(AVFormatContext *s)
     if (mov->moov_written) return 0;
 
     mov->is_animated_avif = s->streams[0]->nb_frames > 1;
-    if (mov->is_animated_avif && s->nb_streams > 1) {
+    if (mov->is_animated_avif && mov->nb_streams > 1) {
         // For animated avif with alpha channel, we need to write a tref tag
         // with type "auxl".
         mov->tracks[1].tref_tag = MKTAG('a', 'u', 'x', 'l');
@@ -7812,7 +8011,7 @@  static int avif_write_trailer(AVFormatContext *s)
     mov_write_meta_tag(pb, mov, s);
 
     moov_size = get_moov_size(s);
-    for (i = 0; i < s->nb_streams; i++)
+    for (i = 0; i < mov->nb_tracks; i++)
         mov->tracks[i].data_offset = avio_tell(pb) + moov_size + 8;
 
     if (mov->is_animated_avif) {
@@ -7834,7 +8033,7 @@  static int avif_write_trailer(AVFormatContext *s)
 
     // write extent offsets.
     pos_backup = avio_tell(pb);
-    for (i = 0; i < s->nb_streams; i++) {
+    for (i = 0; i < mov->nb_streams; i++) {
         if (extent_offsets[i] != (uint32_t)extent_offsets[i]) {
             av_log(s, AV_LOG_ERROR, "extent offset does not fit in 32 bits\n");
             return AVERROR_INVALIDDATA;
diff --git a/libavformat/movenc.h b/libavformat/movenc.h
index 60363198c9..fee3e759e0 100644
--- a/libavformat/movenc.h
+++ b/libavformat/movenc.h
@@ -25,7 +25,9 @@ 
 #define AVFORMAT_MOVENC_H
 
 #include "avformat.h"
+#include "iamf.h"
 #include "movenccenc.h"
+#include "libavcodec/bsf.h"
 #include "libavcodec/packet_internal.h"
 
 #define MOV_FRAG_INFO_ALLOC_INCREMENT 64
@@ -170,6 +172,10 @@  typedef struct MOVTrack {
     unsigned int squash_fragment_samples_to_one; //< flag to note formats where all samples for a fragment are to be squashed
 
     PacketList squashed_packet_queue;
+
+    AVBSFContext *bsf;
+
+    IAMFContext *iamf;
 } MOVTrack;
 
 typedef enum {
@@ -188,6 +194,7 @@  typedef struct MOVMuxContext {
     const AVClass *av_class;
     int     mode;
     int64_t time;
+    int     nb_streams;
     int     nb_tracks;
     int     nb_meta_tmcd;  ///< number of new created tmcd track based on metadata (aka not data copy)
     int     chapter_track; ///< qt chapter track number