diff mbox series

[FFmpeg-devel] avcodec/libjxldec: emit proper PTS to decoded AVFrame

Message ID 20231208173106.165084-1-leo.izen@gmail.com
State New
Headers show
Series [FFmpeg-devel] avcodec/libjxldec: emit proper PTS to decoded AVFrame | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Leo Izen Dec. 8, 2023, 5:31 p.m. UTC
If a sequence of JXL images is encapsulated in a container that has PTS
information, we should use the PTS information from the container. At
this time there is no container that does this, but if JPEG XL support
is ever added to NUT, AVTransport, or some other container, this commit
should allow the PTS information those containers provide to work as
expected.

Signed-off-by: Leo Izen <leo.izen@gmail.com>
---
 libavcodec/libjxldec.c | 77 +++++++++++++++++++++++++++++++-----------
 1 file changed, 57 insertions(+), 20 deletions(-)

Comments

Anton Khirnov Dec. 14, 2023, 8:28 a.m. UTC | #1
Quoting Leo Izen (2023-12-08 18:31:06)
> If a sequence of JXL images is encapsulated in a container that has PTS
> information, we should use the PTS information from the container. At
> this time there is no container that does this, but if JPEG XL support
> is ever added to NUT, AVTransport, or some other container, this commit
> should allow the PTS information those containers provide to work as
> expected.
> 
> Signed-off-by: Leo Izen <leo.izen@gmail.com>
> ---
>  libavcodec/libjxldec.c | 77 +++++++++++++++++++++++++++++++-----------
>  1 file changed, 57 insertions(+), 20 deletions(-)
> 
> diff --git a/libavcodec/libjxldec.c b/libavcodec/libjxldec.c
> index 002740d9c1..494060ac8c 100644
> --- a/libavcodec/libjxldec.c
> +++ b/libavcodec/libjxldec.c
> @@ -370,6 +370,7 @@ static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
>  
>      while (1) {
>          size_t remaining;
> +        JxlFrameHeader header;
>  
>          if (!pkt->size) {
>              av_packet_unref(pkt);
> @@ -428,13 +429,16 @@ static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
>              }
>              if ((ret = ff_set_dimensions(avctx, ctx->basic_info.xsize, ctx->basic_info.ysize)) < 0)
>                  return ret;
> +            /*
> +             * If animation is present, we use the timebase provided by
> +             *    the animated image itself.
> +             * If the image is not animated, we use ctx->pts
> +             *    to refer to the frame number, not an actual
> +             *    PTS value, thus we may leave ctx->timebase unset.
> +             */
>              if (ctx->basic_info.have_animation)
>                  ctx->timebase = av_make_q(ctx->basic_info.animation.tps_denominator,
>                                            ctx->basic_info.animation.tps_numerator);
> -            else if (avctx->pkt_timebase.num)
> -                ctx->timebase = avctx->pkt_timebase;
> -            else
> -                ctx->timebase = AV_TIME_BASE_Q;
>              continue;
>          case JXL_DEC_COLOR_ENCODING:
>              av_log(avctx, AV_LOG_DEBUG, "COLOR_ENCODING event emitted\n");
> @@ -462,23 +466,24 @@ static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
>  #endif
>              continue;
>          case JXL_DEC_FRAME:
> +            /* Frame here refers to the Frame bundle, not a decoded picture */
>              av_log(avctx, AV_LOG_DEBUG, "FRAME event emitted\n");
> -            if (!ctx->basic_info.have_animation || ctx->prev_is_last) {
> +            if (ctx->prev_is_last) {
> +                /*
> +                 * The last frame sent was tagged as "is_last" which
> +                 * means this is a new image file altogether.
> +                 */
>                  ctx->frame->pict_type = AV_PICTURE_TYPE_I;
>                  ctx->frame->flags |= AV_FRAME_FLAG_KEY;
>              }
> -            if (ctx->basic_info.have_animation) {
> -                JxlFrameHeader header;
> -                if (JxlDecoderGetFrameHeader(ctx->decoder, &header) != JXL_DEC_SUCCESS) {
> -                    av_log(avctx, AV_LOG_ERROR, "Bad libjxl dec frame event\n");
> -                    return AVERROR_EXTERNAL;
> -                }
> -                ctx->prev_is_last = header.is_last;
> -                ctx->frame_duration = header.duration;
> -            } else {
> -                ctx->prev_is_last = 1;
> -                ctx->frame_duration = 1;
> +            if (JxlDecoderGetFrameHeader(ctx->decoder, &header) != JXL_DEC_SUCCESS) {
> +                av_log(avctx, AV_LOG_ERROR, "Bad libjxl dec frame event\n");
> +                return AVERROR_EXTERNAL;
>              }
> +            ctx->prev_is_last = header.is_last;
> +            /* zero duration for animation means the frame is not presented */
> +            if (ctx->basic_info.have_animation && header.duration)
> +                ctx->frame_duration = header.duration;
>              continue;
>          case JXL_DEC_FULL_IMAGE:
>              /* full image is one frame, even if animated */
> @@ -490,12 +495,44 @@ static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
>                  /* ownership is transfered, and it is not ref-ed */
>                  ctx->iccp = NULL;
>              }
> -            if (avctx->pkt_timebase.num) {
> -                ctx->frame->pts = av_rescale_q(ctx->pts, ctx->timebase, avctx->pkt_timebase);
> -                ctx->frame->duration = av_rescale_q(ctx->frame_duration, ctx->timebase, avctx->pkt_timebase);
> +            if (ctx->basic_info.have_animation) {
> +                if (avctx->pkt_timebase.num) {
> +                    /*
> +                     * ideally, the demuxer set avctx->pkt_timebase to equal the animation's timebase
> +                     * or something strictly finer. This is true about the jpegxl_anim demuxer.
> +                     */
> +                    ctx->frame->pts = av_rescale_q(ctx->pts, ctx->timebase, avctx->pkt_timebase);
> +                    ctx->frame->duration = av_rescale_q(ctx->frame_duration, ctx->timebase, avctx->pkt_timebase);
> +                } else {
> +                    /*
> +                     * If we don't know the container timebase, we have to set the frame->timebase,
> +                     * even if it is currently ignored by most users. We don't have permission
> +                     * to set avctx->pkt_timebase.
> +                     */
> +                    ctx->frame->time_base = ctx->timebase;
> +                    ctx->frame->pts = ctx->pts;
> +                    ctx->frame->duration = ctx->frame_duration;
> +                }
> +            } else if (avctx->pkt_timebase.num) {
> +                if (pkt->pts != AV_NOPTS_VALUE) {
> +                    /* The container has provided the PTS for us, so we don't need to count frames. */
> +                    ctx->frame->pts = pkt->pts;
> +                } else {
> +                    /*
> +                     * The demuxer has provided us with a timebase, but not with PTS information.
> +                     * We use 1/1 as a dummy timebase, for 1fps as a dummy framerate, and set the
> +                     * PTS based on frame count.
> +                     */
> +                    const AVRational dummy = {.num = 1, .den = 1};
> +                    ctx->frame->pts = av_rescale_q(ctx->pts, dummy, avctx->pkt_timebase);
> +                }
> +                ctx->frame->duration = pkt->duration;
>              } else {
> +                /*
> +                 * There is no timing information. Set the frame PTS to frame counter.
> +                 */
>                  ctx->frame->pts = ctx->pts;
> -                ctx->frame->duration = ctx->frame_duration;
> +                ctx->frame->duration = 0;

This logic seems shady to me. The decoder should mess with pts as little
as possible and whenever it can just copy the packet value to the frame.
Any codec-level timestamps should not be trusted.

Now this does not work when a single packet decodes into multiple
frames, then you have to add increments of frame duration to the
original packet pts. But you should still preserve the original value as
the base - it might not start at 0.

Also, decoders are not allowed to set AVFrame.time_base. And you should
probably set AVCodecContext.framerate.
Leo Izen Dec. 14, 2023, 11:33 p.m. UTC | #2
On 12/14/23 03:28, Anton Khirnov wrote:
> Quoting Leo Izen (2023-12-08 18:31:06)
>> If a sequence of JXL images is encapsulated in a container that has PTS
>> information, we should use the PTS information from the container. At
>> this time there is no container that does this, but if JPEG XL support
>> is ever added to NUT, AVTransport, or some other container, this commit
>> should allow the PTS information those containers provide to work as
>> expected.
>>
>> Signed-off-by: Leo Izen <leo.izen@gmail.com>
>> ---
>>   libavcodec/libjxldec.c | 77 +++++++++++++++++++++++++++++++-----------
>>   1 file changed, 57 insertions(+), 20 deletions(-)
>>
>> diff --git a/libavcodec/libjxldec.c b/libavcodec/libjxldec.c
>> index 002740d9c1..494060ac8c 100644
>> --- a/libavcodec/libjxldec.c
>> +++ b/libavcodec/libjxldec.c
>> @@ -370,6 +370,7 @@ static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
>>   
>>       while (1) {
>>           size_t remaining;
>> +        JxlFrameHeader header;
>>   
>>           if (!pkt->size) {
>>               av_packet_unref(pkt);
>> @@ -428,13 +429,16 @@ static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
>>               }
>>               if ((ret = ff_set_dimensions(avctx, ctx->basic_info.xsize, ctx->basic_info.ysize)) < 0)
>>                   return ret;
>> +            /*
>> +             * If animation is present, we use the timebase provided by
>> +             *    the animated image itself.
>> +             * If the image is not animated, we use ctx->pts
>> +             *    to refer to the frame number, not an actual
>> +             *    PTS value, thus we may leave ctx->timebase unset.
>> +             */
>>               if (ctx->basic_info.have_animation)
>>                   ctx->timebase = av_make_q(ctx->basic_info.animation.tps_denominator,
>>                                             ctx->basic_info.animation.tps_numerator);
>> -            else if (avctx->pkt_timebase.num)
>> -                ctx->timebase = avctx->pkt_timebase;
>> -            else
>> -                ctx->timebase = AV_TIME_BASE_Q;
>>               continue;
>>           case JXL_DEC_COLOR_ENCODING:
>>               av_log(avctx, AV_LOG_DEBUG, "COLOR_ENCODING event emitted\n");
>> @@ -462,23 +466,24 @@ static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
>>   #endif
>>               continue;
>>           case JXL_DEC_FRAME:
>> +            /* Frame here refers to the Frame bundle, not a decoded picture */
>>               av_log(avctx, AV_LOG_DEBUG, "FRAME event emitted\n");
>> -            if (!ctx->basic_info.have_animation || ctx->prev_is_last) {
>> +            if (ctx->prev_is_last) {
>> +                /*
>> +                 * The last frame sent was tagged as "is_last" which
>> +                 * means this is a new image file altogether.
>> +                 */
>>                   ctx->frame->pict_type = AV_PICTURE_TYPE_I;
>>                   ctx->frame->flags |= AV_FRAME_FLAG_KEY;
>>               }
>> -            if (ctx->basic_info.have_animation) {
>> -                JxlFrameHeader header;
>> -                if (JxlDecoderGetFrameHeader(ctx->decoder, &header) != JXL_DEC_SUCCESS) {
>> -                    av_log(avctx, AV_LOG_ERROR, "Bad libjxl dec frame event\n");
>> -                    return AVERROR_EXTERNAL;
>> -                }
>> -                ctx->prev_is_last = header.is_last;
>> -                ctx->frame_duration = header.duration;
>> -            } else {
>> -                ctx->prev_is_last = 1;
>> -                ctx->frame_duration = 1;
>> +            if (JxlDecoderGetFrameHeader(ctx->decoder, &header) != JXL_DEC_SUCCESS) {
>> +                av_log(avctx, AV_LOG_ERROR, "Bad libjxl dec frame event\n");
>> +                return AVERROR_EXTERNAL;
>>               }
>> +            ctx->prev_is_last = header.is_last;
>> +            /* zero duration for animation means the frame is not presented */
>> +            if (ctx->basic_info.have_animation && header.duration)
>> +                ctx->frame_duration = header.duration;
>>               continue;
>>           case JXL_DEC_FULL_IMAGE:
>>               /* full image is one frame, even if animated */
>> @@ -490,12 +495,44 @@ static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
>>                   /* ownership is transfered, and it is not ref-ed */
>>                   ctx->iccp = NULL;
>>               }
>> -            if (avctx->pkt_timebase.num) {
>> -                ctx->frame->pts = av_rescale_q(ctx->pts, ctx->timebase, avctx->pkt_timebase);
>> -                ctx->frame->duration = av_rescale_q(ctx->frame_duration, ctx->timebase, avctx->pkt_timebase);
>> +            if (ctx->basic_info.have_animation) {
>> +                if (avctx->pkt_timebase.num) {
>> +                    /*
>> +                     * ideally, the demuxer set avctx->pkt_timebase to equal the animation's timebase
>> +                     * or something strictly finer. This is true about the jpegxl_anim demuxer.
>> +                     */
>> +                    ctx->frame->pts = av_rescale_q(ctx->pts, ctx->timebase, avctx->pkt_timebase);
>> +                    ctx->frame->duration = av_rescale_q(ctx->frame_duration, ctx->timebase, avctx->pkt_timebase);
>> +                } else {
>> +                    /*
>> +                     * If we don't know the container timebase, we have to set the frame->timebase,
>> +                     * even if it is currently ignored by most users. We don't have permission
>> +                     * to set avctx->pkt_timebase.
>> +                     */
>> +                    ctx->frame->time_base = ctx->timebase;
>> +                    ctx->frame->pts = ctx->pts;
>> +                    ctx->frame->duration = ctx->frame_duration;
>> +                }
>> +            } else if (avctx->pkt_timebase.num) {
>> +                if (pkt->pts != AV_NOPTS_VALUE) {
>> +                    /* The container has provided the PTS for us, so we don't need to count frames. */
>> +                    ctx->frame->pts = pkt->pts;
>> +                } else {
>> +                    /*
>> +                     * The demuxer has provided us with a timebase, but not with PTS information.
>> +                     * We use 1/1 as a dummy timebase, for 1fps as a dummy framerate, and set the
>> +                     * PTS based on frame count.
>> +                     */
>> +                    const AVRational dummy = {.num = 1, .den = 1};
>> +                    ctx->frame->pts = av_rescale_q(ctx->pts, dummy, avctx->pkt_timebase);
>> +                }
>> +                ctx->frame->duration = pkt->duration;
>>               } else {
>> +                /*
>> +                 * There is no timing information. Set the frame PTS to frame counter.
>> +                 */
>>                   ctx->frame->pts = ctx->pts;
>> -                ctx->frame->duration = ctx->frame_duration;
>> +                ctx->frame->duration = 0;
> 
> This logic seems shady to me.

Which part, specifically? The animated logic, or the non-animated logic?

> The decoder should mess with pts as little
> as possible and whenever it can just copy the packet value to the frame.
> Any codec-level timestamps should not be trusted.

In the case of animated JXL, codec-level timestamps are all that's 
available because the only demuxer is jpegxl_anim, which doesn't 
packetize the individual frames.

> 
> Now this does not work when a single packet decodes into multiple
> frames, then you have to add increments of frame duration to the
> original packet pts. But you should still preserve the original value as
> the base - it might not start at 0.

I see what you're saying, but in the case where one packet decodes into 
multiple frames in the non-animated stream, we don't have any way to 
properly differentiate the PTS of those frames. In the animated case, 
this makes sense to me though.

> 
> Also, decoders are not allowed to set AVFrame.time_base. And you should
> probably set AVCodecContext.framerate.

Animated JXL uses frame-delay and has no inherent framerate. I'm not 
sure what we'd set it to. I can remove the setting of AVFrame->time_base 
though.

- Leo Izen (Traneptora)
Anton Khirnov Dec. 18, 2023, 5:05 p.m. UTC | #3
Quoting Leo Izen (2023-12-15 00:33:33)
> On 12/14/23 03:28, Anton Khirnov wrote:
> > This logic seems shady to me.
> 
> Which part, specifically? The animated logic, or the non-animated logic?

Aspects of both looked questionable to me (which doesn't necessarily
mean it's wrong)

> > The decoder should mess with pts as little
> > as possible and whenever it can just copy the packet value to the frame.
> > Any codec-level timestamps should not be trusted.
> 
> In the case of animated JXL, codec-level timestamps are all that's 
> available because the only demuxer is jpegxl_anim, which doesn't 
> packetize the individual frames.

That may change in the future. And you shouldn't assume the caller is
necessarily using lavf for demuxing, if you can help it.

> > 
> > Now this does not work when a single packet decodes into multiple
> > frames, then you have to add increments of frame duration to the
> > original packet pts. But you should still preserve the original value as
> > the base - it might not start at 0.
> 
> I see what you're saying, but in the case where one packet decodes into 
> multiple frames in the non-animated stream, we don't have any way to 
> properly differentiate the PTS of those frames.

When does that happen?

And sure, I accept that when there's no other option you might have to
take some liberties. I'm just saying it should be done as little as
possible.
diff mbox series

Patch

diff --git a/libavcodec/libjxldec.c b/libavcodec/libjxldec.c
index 002740d9c1..494060ac8c 100644
--- a/libavcodec/libjxldec.c
+++ b/libavcodec/libjxldec.c
@@ -370,6 +370,7 @@  static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
 
     while (1) {
         size_t remaining;
+        JxlFrameHeader header;
 
         if (!pkt->size) {
             av_packet_unref(pkt);
@@ -428,13 +429,16 @@  static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
             }
             if ((ret = ff_set_dimensions(avctx, ctx->basic_info.xsize, ctx->basic_info.ysize)) < 0)
                 return ret;
+            /*
+             * If animation is present, we use the timebase provided by
+             *    the animated image itself.
+             * If the image is not animated, we use ctx->pts
+             *    to refer to the frame number, not an actual
+             *    PTS value, thus we may leave ctx->timebase unset.
+             */
             if (ctx->basic_info.have_animation)
                 ctx->timebase = av_make_q(ctx->basic_info.animation.tps_denominator,
                                           ctx->basic_info.animation.tps_numerator);
-            else if (avctx->pkt_timebase.num)
-                ctx->timebase = avctx->pkt_timebase;
-            else
-                ctx->timebase = AV_TIME_BASE_Q;
             continue;
         case JXL_DEC_COLOR_ENCODING:
             av_log(avctx, AV_LOG_DEBUG, "COLOR_ENCODING event emitted\n");
@@ -462,23 +466,24 @@  static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
 #endif
             continue;
         case JXL_DEC_FRAME:
+            /* Frame here refers to the Frame bundle, not a decoded picture */
             av_log(avctx, AV_LOG_DEBUG, "FRAME event emitted\n");
-            if (!ctx->basic_info.have_animation || ctx->prev_is_last) {
+            if (ctx->prev_is_last) {
+                /*
+                 * The last frame sent was tagged as "is_last" which
+                 * means this is a new image file altogether.
+                 */
                 ctx->frame->pict_type = AV_PICTURE_TYPE_I;
                 ctx->frame->flags |= AV_FRAME_FLAG_KEY;
             }
-            if (ctx->basic_info.have_animation) {
-                JxlFrameHeader header;
-                if (JxlDecoderGetFrameHeader(ctx->decoder, &header) != JXL_DEC_SUCCESS) {
-                    av_log(avctx, AV_LOG_ERROR, "Bad libjxl dec frame event\n");
-                    return AVERROR_EXTERNAL;
-                }
-                ctx->prev_is_last = header.is_last;
-                ctx->frame_duration = header.duration;
-            } else {
-                ctx->prev_is_last = 1;
-                ctx->frame_duration = 1;
+            if (JxlDecoderGetFrameHeader(ctx->decoder, &header) != JXL_DEC_SUCCESS) {
+                av_log(avctx, AV_LOG_ERROR, "Bad libjxl dec frame event\n");
+                return AVERROR_EXTERNAL;
             }
+            ctx->prev_is_last = header.is_last;
+            /* zero duration for animation means the frame is not presented */
+            if (ctx->basic_info.have_animation && header.duration)
+                ctx->frame_duration = header.duration;
             continue;
         case JXL_DEC_FULL_IMAGE:
             /* full image is one frame, even if animated */
@@ -490,12 +495,44 @@  static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
                 /* ownership is transfered, and it is not ref-ed */
                 ctx->iccp = NULL;
             }
-            if (avctx->pkt_timebase.num) {
-                ctx->frame->pts = av_rescale_q(ctx->pts, ctx->timebase, avctx->pkt_timebase);
-                ctx->frame->duration = av_rescale_q(ctx->frame_duration, ctx->timebase, avctx->pkt_timebase);
+            if (ctx->basic_info.have_animation) {
+                if (avctx->pkt_timebase.num) {
+                    /*
+                     * ideally, the demuxer set avctx->pkt_timebase to equal the animation's timebase
+                     * or something strictly finer. This is true about the jpegxl_anim demuxer.
+                     */
+                    ctx->frame->pts = av_rescale_q(ctx->pts, ctx->timebase, avctx->pkt_timebase);
+                    ctx->frame->duration = av_rescale_q(ctx->frame_duration, ctx->timebase, avctx->pkt_timebase);
+                } else {
+                    /*
+                     * If we don't know the container timebase, we have to set the frame->timebase,
+                     * even if it is currently ignored by most users. We don't have permission
+                     * to set avctx->pkt_timebase.
+                     */
+                    ctx->frame->time_base = ctx->timebase;
+                    ctx->frame->pts = ctx->pts;
+                    ctx->frame->duration = ctx->frame_duration;
+                }
+            } else if (avctx->pkt_timebase.num) {
+                if (pkt->pts != AV_NOPTS_VALUE) {
+                    /* The container has provided the PTS for us, so we don't need to count frames. */
+                    ctx->frame->pts = pkt->pts;
+                } else {
+                    /*
+                     * The demuxer has provided us with a timebase, but not with PTS information.
+                     * We use 1/1 as a dummy timebase, for 1fps as a dummy framerate, and set the
+                     * PTS based on frame count.
+                     */
+                    const AVRational dummy = {.num = 1, .den = 1};
+                    ctx->frame->pts = av_rescale_q(ctx->pts, dummy, avctx->pkt_timebase);
+                }
+                ctx->frame->duration = pkt->duration;
             } else {
+                /*
+                 * There is no timing information. Set the frame PTS to frame counter.
+                 */
                 ctx->frame->pts = ctx->pts;
-                ctx->frame->duration = ctx->frame_duration;
+                ctx->frame->duration = 0;
             }
             ctx->pts += ctx->frame_duration;
             av_frame_move_ref(frame, ctx->frame);