diff mbox series

[FFmpeg-devel,4/5] avformat/mov: Add support for exporting Video Extension Usage info

Message ID 20240610184408.68171-5-derek.buitenhuis@gmail.com
State New
Headers show
Series Apple Spatial Metadata | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 fail Make fate failed

Commit Message

Derek Buitenhuis June 10, 2024, 6:44 p.m. UTC
This box is provided by files created by the Apple Vision Pro, as well
as the iPhone 15+ when capture for Vision Pro is enabled.

The boxes are a mix of things documented by Apple in some PDFs, their
API docs, and reverse engineering. Ideally we will have a real spec
one day.

Links:
  * https://developer.apple.com/av-foundation/Stereo-Video-ISOBMFF-Extensions.pdf
  * https://developer.apple.com/documentation/videotoolbox/kvtcompressionpropertykey_horizontaldisparityadjustment
  * https://developer.apple.com/documentation/videotoolbox/kvtcompressionpropertykey_stereocamerabaseline
  * https://developer.apple.com/documentation/videotoolbox/kvtcompressionpropertykey_heroeye

Signed-off-by: Derek Buitenhuis <derek.buitenhuis@gmail.com>
---
 libavformat/mov.c | 279 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 279 insertions(+)

Comments

James Almer June 10, 2024, 7:38 p.m. UTC | #1
On 6/10/2024 3:44 PM, Derek Buitenhuis wrote:
> +static int mov_read_vexu(MOVContext *c, AVIOContext *pb, MOVAtom atom)
> +{
> +    int size;
> +    int64_t remaining;
> +    uint32_t tag;
> +
> +    if (c->fc->nb_streams < 1)
> +        return 0;
> +
> +    if (atom.size < 8) {
> +        av_log(c->fc, AV_LOG_ERROR, "Empty video extension usage box\n");
> +        return AVERROR_INVALIDDATA;
> +    }
> +
> +    remaining = atom.size;
> +    while (remaining > 0) {

Maybe this loop should call mov_read_default, with proj and eyes added 
to mov_default_parse_table[]. Although i don't know if eyes may show up 
as child for other parent boxes or not.
At least with proj, i see it can be a child for sv3d, where only prhd is 
expected as a child box in turn. But it shouldn't a problem to add a 
mov_read_proj that handles both prhd and prji for this purpose.

> +        size = avio_rb32(pb);
> +        if (size < 8 || size > remaining ) {
> +            av_log(c->fc, AV_LOG_ERROR, "Invalid child size in vexu box\n");
> +            return AVERROR_INVALIDDATA;
> +        }
> +
> +        tag = avio_rl32(pb);
> +        switch (tag) {
> +        case MKTAG('p','r','o','j'): {
> +            MOVAtom proj = { tag, size - 8 };
> +            int ret = mov_read_vexu_proj(c, pb, proj);
> +            if (ret < 0)
> +                return ret;
> +            break;
> +        }
> +        case MKTAG('e','y','e','s'): {
> +            MOVAtom eyes = { tag, size - 8 };
> +            int ret = mov_read_eyes(c, pb, eyes);
> +            if (ret < 0)
> +                return ret;
> +            break;
> +        }
> +        default:
> +            av_log(c->fc, AV_LOG_WARNING, "Unknown tag in vexu: 0x%08X\n", tag);
> +            avio_skip(pb, size - 8);
> +            break;
> +        }
> +        remaining -= size;
> +    }
> +
> +    if (remaining != 0) {
> +        av_log(c->fc, AV_LOG_ERROR, "Broken vexu box\n");
> +        return AVERROR_INVALIDDATA;
> +    }
> +
> +    return 0;
> +}
Derek Buitenhuis June 10, 2024, 7:45 p.m. UTC | #2
On 6/10/2024 8:38 PM, James Almer wrote:
>> +    remaining = atom.size;
>> +    while (remaining > 0) {
> 
> Maybe this loop should call mov_read_default, with proj and eyes added 
> to mov_default_parse_table[]. Although i don't know if eyes may show up 
> as child for other parent boxes or not.
> At least with proj, i see it can be a child for sv3d, where only prhd is 
> expected as a child box in turn. But it shouldn't a problem to add a 
> mov_read_proj that handles both prhd and prji for this purpose.

Well, the proj box that that's in sv3d and this proj box are unrelated -
they both exist because Apple and Google don't play nice with eachother,
and both defined their own incompatible projection boxes, which exist in
entirely different parts of the file, so it seemed kind of wrong to handle
it that way...

The other reason I left it this way is because it left it easier for us to
implement the 'must' box if we wanted to, which needs to keep track of what
boxes have been seen inside the 'vexu' box, and its inner boxes.

To my knowledge eyes (and indeed all the other boxes inside the 'vexu' box besides
the 'proj' box) only appear in the 'vexu' box.

- Derek
diff mbox series

Patch

diff --git a/libavformat/mov.c b/libavformat/mov.c
index 160e9626d7..b164bb0adb 100644
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@@ -6477,6 +6477,284 @@  static int mov_read_sv3d(MOVContext *c, AVIOContext *pb, MOVAtom atom)
     return 0;
 }
 
+static int mov_read_vexu_proj(MOVContext *c, AVIOContext *pb, MOVAtom atom)
+{
+    AVStream *st;
+    MOVStreamContext *sc;
+    int size;
+    uint32_t tag;
+    enum AVSphericalProjection projection;
+
+    if (c->fc->nb_streams < 1)
+        return 0;
+
+    st = c->fc->streams[c->fc->nb_streams - 1];
+    sc = st->priv_data;
+
+    if (atom.size != 16) {
+        av_log(c->fc, AV_LOG_ERROR, "Invalid size for proj box: %"PRIu64"\n", atom.size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    size = avio_rb32(pb);
+    if (size != 16) {
+        av_log(c->fc, AV_LOG_ERROR, "Invalid size for prji box: %d\n", size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    tag = avio_rl32(pb);
+    if (tag != MKTAG('p','r','j','i')) {
+        av_log(c->fc, AV_LOG_ERROR, "Invalid child box of proj box: 0x%08X\n", tag);
+        return AVERROR_INVALIDDATA;
+    }
+
+    avio_skip(pb, 1); // version
+    avio_skip(pb, 3); // flags
+
+    tag = avio_rl32(pb);
+    switch (tag) {
+    case MKTAG('r','e','c','t'):
+        projection = AV_SPHERICAL_RECTANGULAR;
+        break;
+    case MKTAG('e','q','u','i'):
+        projection = AV_SPHERICAL_EQUIRECTANGULAR;
+        break;
+    case MKTAG('h','e','q','u'):
+        projection = AV_SPHERICAL_HALF_EQUIRECTANGULAR;
+        break;
+    case MKTAG('f','i','s','h'):
+        projection = AV_SPHERICAL_FISHEYE;
+        break;
+    default:
+        av_log(c->fc, AV_LOG_ERROR, "Invalid projection type in prji box: 0x%08X\n", tag);
+        return AVERROR_INVALIDDATA;
+    }
+
+    sc->spherical = av_spherical_alloc(&sc->spherical_size);
+    if (!sc->spherical)
+        return AVERROR(ENOMEM);
+
+    sc->spherical->projection = projection;
+
+    return 0;
+}
+
+static int mov_read_eyes(MOVContext *c, AVIOContext *pb, MOVAtom atom)
+{
+    AVStream *st;
+    MOVStreamContext *sc;
+    int size, flags = 0;
+    int64_t remaining;
+    uint32_t tag, baseline = 0;
+    enum AVStereo3DView view = AV_STEREO3D_VIEW_PACKED;
+    enum AVStereo3DPrimaryEye primary_eye = AV_PRIMARY_EYE_NONE;
+    int32_t horizontal_disparity_adjustment = 0;
+
+    if (c->fc->nb_streams < 1)
+        return 0;
+
+    st = c->fc->streams[c->fc->nb_streams - 1];
+    sc = st->priv_data;
+
+    remaining = atom.size;
+    while (remaining > 0) {
+        size = avio_rb32(pb);
+        if (size < 8 || size > remaining ) {
+            av_log(c->fc, AV_LOG_ERROR, "Invalid child size in eyes box\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        tag = avio_rl32(pb);
+        switch (tag) {
+        case MKTAG('s','t','r','i'): {
+            int has_right, has_left;
+            uint8_t tmp;
+            if (size != 13) {
+                av_log(c->fc, AV_LOG_ERROR, "Invalid size of stri box: %d\n", size);
+                return AVERROR_INVALIDDATA;
+            }
+            avio_skip(pb, 1); // version
+            avio_skip(pb, 3); // flags
+
+            tmp = avio_r8(pb);
+
+            // eye_views_reversed
+            if (tmp & 8) {
+                flags |= AV_STEREO3D_FLAG_INVERT;
+            }
+            // has_additional_views
+            if (tmp & 4) {
+                // skip...
+            }
+
+            has_right = tmp & 2; // has_right_eye_view
+            has_left = tmp & 1; // has_left_eye_view
+
+            if (has_left && has_right)
+                view = AV_STEREO3D_VIEW_PACKED;
+            else if (has_left)
+                view = AV_STEREO3D_VIEW_LEFT;
+            else if (has_right)
+                view = AV_STEREO3D_VIEW_RIGHT;
+            break;
+        }
+        case MKTAG('h','e','r','o'): {
+            int tmp;
+            if (size != 13) {
+                av_log(c->fc, AV_LOG_ERROR, "Invalid size of hero box: %d\n", size);
+                return AVERROR_INVALIDDATA;
+            }
+            avio_skip(pb, 1); // version
+            avio_skip(pb, 3); // flags
+
+            tmp = avio_r8(pb);
+            if (tmp == 0)
+                primary_eye = AV_PRIMARY_EYE_NONE;
+            else if (tmp == 1)
+                primary_eye = AV_PRIMARY_EYE_LEFT;
+            else if (tmp == 2)
+                primary_eye = AV_PRIMARY_EYE_RIGHT;
+            else
+                av_log(c->fc, AV_LOG_WARNING, "Unknown hero eye type: %d\n", tmp);
+
+            break;
+        }
+        case MKTAG('c','a','m','s'): {
+            uint32_t subtag;
+            int subsize;
+            if (size != 24) {
+                av_log(c->fc, AV_LOG_ERROR, "Invalid size of cams box: %d\n", size);
+                return AVERROR_INVALIDDATA;
+            }
+
+            subsize = avio_rb32(pb);
+            if (subsize != 16) {
+                av_log(c->fc, AV_LOG_ERROR, "Invalid size of blin box: %d\n", size);
+                return AVERROR_INVALIDDATA;
+            }
+
+            subtag = avio_rl32(pb);
+            if (subtag != MKTAG('b','l','i','n')) {
+                av_log(c->fc, AV_LOG_ERROR, "Expected blin box, got 0x%08X\n", subtag);
+                return AVERROR_INVALIDDATA;
+            }
+
+            avio_skip(pb, 1); // version
+            avio_skip(pb, 3); // flags
+
+            baseline = avio_rb32(pb);
+
+            break;
+        }
+        case MKTAG('c','m','f','y'): {
+            uint32_t subtag;
+            int subsize;
+            if (size != 24) {
+                av_log(c->fc, AV_LOG_ERROR, "Invalid size of cmfy box: %d\n", size);
+                return AVERROR_INVALIDDATA;
+            }
+
+            subsize = avio_rb32(pb);
+            if (subsize != 16) {
+                av_log(c->fc, AV_LOG_ERROR, "Invalid size of dadj box: %d\n", size);
+                return AVERROR_INVALIDDATA;
+            }
+
+            subtag = avio_rl32(pb);
+            if (subtag != MKTAG('d','a','d','j')) {
+                av_log(c->fc, AV_LOG_ERROR, "Expected dadj box, got 0x%08X\n", subtag);
+                return AVERROR_INVALIDDATA;
+            }
+
+            avio_skip(pb, 1); // version
+            avio_skip(pb, 3); // flags
+
+            horizontal_disparity_adjustment = (int32_t) avio_rb32(pb);
+
+            break;
+        }
+        default:
+            av_log(c->fc, AV_LOG_WARNING, "Unknown tag in eyes: 0x%08X\n", tag);
+            avio_skip(pb, size - 8);
+            break;
+        }
+        remaining -= size;
+    }
+
+    if (remaining != 0) {
+        av_log(c->fc, AV_LOG_ERROR, "Broken eyes box\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (!sc->stereo3d) {
+        sc->stereo3d = av_stereo3d_alloc();
+        if (!sc->stereo3d)
+            return AVERROR(ENOMEM);
+    }
+
+    sc->stereo3d->flags                           = flags;
+    sc->stereo3d->view                            = view;
+    sc->stereo3d->primary_eye                     = primary_eye;
+    sc->stereo3d->baseline                        = baseline;
+    sc->stereo3d->horizontal_disparity_adjustment = horizontal_disparity_adjustment;
+
+    return 0;
+}
+
+static int mov_read_vexu(MOVContext *c, AVIOContext *pb, MOVAtom atom)
+{
+    int size;
+    int64_t remaining;
+    uint32_t tag;
+
+    if (c->fc->nb_streams < 1)
+        return 0;
+
+    if (atom.size < 8) {
+        av_log(c->fc, AV_LOG_ERROR, "Empty video extension usage box\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    remaining = atom.size;
+    while (remaining > 0) {
+        size = avio_rb32(pb);
+        if (size < 8 || size > remaining ) {
+            av_log(c->fc, AV_LOG_ERROR, "Invalid child size in vexu box\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        tag = avio_rl32(pb);
+        switch (tag) {
+        case MKTAG('p','r','o','j'): {
+            MOVAtom proj = { tag, size - 8 };
+            int ret = mov_read_vexu_proj(c, pb, proj);
+            if (ret < 0)
+                return ret;
+            break;
+        }
+        case MKTAG('e','y','e','s'): {
+            MOVAtom eyes = { tag, size - 8 };
+            int ret = mov_read_eyes(c, pb, eyes);
+            if (ret < 0)
+                return ret;
+            break;
+        }
+        default:
+            av_log(c->fc, AV_LOG_WARNING, "Unknown tag in vexu: 0x%08X\n", tag);
+            avio_skip(pb, size - 8);
+            break;
+        }
+        remaining -= size;
+    }
+
+    if (remaining != 0) {
+        av_log(c->fc, AV_LOG_ERROR, "Broken vexu box\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
 static int mov_parse_uuid_spherical(MOVStreamContext *sc, AVIOContext *pb, size_t len)
 {
     int ret = 0;
@@ -8595,6 +8873,7 @@  static const MOVParseTableEntry mov_default_parse_table[] = {
 { MKTAG('d','f','L','a'), mov_read_dfla },
 { MKTAG('s','t','3','d'), mov_read_st3d }, /* stereoscopic 3D video box */
 { MKTAG('s','v','3','d'), mov_read_sv3d }, /* spherical video box */
+{ MKTAG('v','e','x','u'), mov_read_vexu }, /* video extension usage */
 { MKTAG('d','O','p','s'), mov_read_dops },
 { MKTAG('d','m','l','p'), mov_read_dmlp },
 { MKTAG('S','m','D','m'), mov_read_smdm },