
[FFmpeg-devel,v7,3/7] avcodec/webp_parser: parse each frame into one packet

Message ID 20231206013505.57654-4-thilo.borgmann@mail.de
State New
Series webp: add support for animated WebP decoding

Checks

Context               Check    Description
andriy/make_x86       success  Make finished
andriy/make_fate_x86  success  Make fate finished

Commit Message

Thilo Borgmann Dec. 6, 2023, 1:35 a.m. UTC
---
 libavcodec/webp_parser.c | 130 +++++++++++++++++++++++++++------------
 1 file changed, 89 insertions(+), 41 deletions(-)

Comments

Andreas Rheinhardt Dec. 7, 2023, 5:42 p.m. UTC | #1
Thilo Borgmann via ffmpeg-devel:
> ---
>  libavcodec/webp_parser.c | 130 +++++++++++++++++++++++++++------------
>  1 file changed, 89 insertions(+), 41 deletions(-)
> 

According to
https://developers.google.com/speed/webp/docs/riff_container#extended_file_format
metadata chunks are stored after the image data; if you split the data
into packets, then the metadata will only become available for the very
last frame, although it pertains to all of them. This makes your
approach unworkable.

Additionally, the WebP muxer expects animations to be contained in a
single packet, hence the warnings from Michael.

- Andreas
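
For reference, the layout Andreas points to can be made visible by walking the top-level RIFF chunks of an extended WebP file in order. The following standalone sketch (not part of the patch; it only prints each chunk's FourCC and size) shows how the EXIF/XMP metadata chunks come after the ANMF/VP8/VP8L image-data chunks:

/* Minimal standalone sketch (illustration only): walk the top-level RIFF
 * chunks of a WebP file and print them in file order.  In the extended
 * file format, metadata chunks (EXIF, XMP ) follow the image data
 * (ANMF/VP8 /VP8L), which is why per-frame packets would not carry the
 * metadata. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t rd_le32(const uint8_t *p)
{
    return p[0] | p[1] << 8 | p[2] << 16 | (uint32_t)p[3] << 24;
}

int main(int argc, char **argv)
{
    uint8_t hdr[12], chdr[8];
    FILE *f;

    if (argc < 2 || !(f = fopen(argv[1], "rb")))
        return 1;
    /* RIFF header: "RIFF", little-endian file size, "WEBP" */
    if (fread(hdr, 1, 12, f) != 12 ||
        memcmp(hdr, "RIFF", 4) || memcmp(hdr + 8, "WEBP", 4)) {
        fclose(f);
        return 1;
    }
    /* Each chunk: 4-byte FourCC, little-endian 32-bit size, payload
     * padded to an even number of bytes. */
    while (fread(chdr, 1, 8, f) == 8) {
        uint32_t size = rd_le32(chdr + 4);
        printf("%.4s  %u bytes\n", (const char *)chdr, (unsigned)size);
        if (fseek(f, size + (size & 1), SEEK_CUR) < 0)
            break;
    }
    fclose(f);
    return 0;
}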
Cosmin Stejerean Dec. 8, 2023, 2:02 a.m. UTC | #2
> On Dec 7, 2023, at 9:42 AM, Andreas Rheinhardt <andreas.rheinhardt@outlook.com> wrote:
> 
> According to
> https://developers.google.com/speed/webp/docs/riff_container#extended_file_format
> metadata chunks are stored after the image data; if you split the data
> into packets, then the metadata will only become available for the very
> last frame, although it pertains to all of them. This makes your
> approach unworkable.
> 
> Additionally, the WebP muxer expects animations to be contained in a
> single packet, hence the warnings from Michael.

What would be a better approach here, keeping the whole animation in a single packet and decoding multiple frames from it, essentially moving the logic that splits the frames from the parser to the decoder?

- Cosmin
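
From the API user's point of view, the approach Cosmin describes would just be the usual send/receive loop, where one packet can yield several frames. A minimal sketch (assuming avctx is an opened WebP decoder and pkt holds the whole animation; error handling trimmed):

/* Sketch (assumptions: avctx is an opened decoder, pkt holds the whole
 * WebP animation): one avcodec_send_packet() may be followed by several
 * successful avcodec_receive_frame() calls. */
#include <libavcodec/avcodec.h>
#include <libavutil/frame.h>

static int decode_animation(AVCodecContext *avctx, const AVPacket *pkt)
{
    AVFrame *frame = av_frame_alloc();
    int ret;

    if (!frame)
        return AVERROR(ENOMEM);

    ret = avcodec_send_packet(avctx, pkt);
    if (ret < 0)
        goto end;

    /* Drain every frame the decoder produced from this single packet. */
    while ((ret = avcodec_receive_frame(avctx, frame)) >= 0) {
        /* ... use frame ... */
        av_frame_unref(frame);
    }
    if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
        ret = 0;

end:
    av_frame_free(&frame);
    return ret;
}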
Thilo Borgmann Dec. 12, 2023, 12:14 p.m. UTC | #3
Am 08.12.23 um 03:02 schrieb Cosmin Stejerean via ffmpeg-devel:
> 
> 
>> On Dec 7, 2023, at 9:42 AM, Andreas Rheinhardt <andreas.rheinhardt@outlook.com> wrote:
>>
>> According to
>> https://developers.google.com/speed/webp/docs/riff_container#extended_file_format
>> metadata chunks are stored after the image data; if you split the data
>> into packets, then the metadata will only become available for the very
>> last frame, although it pertains to all of them. This makes your
>> approach unworkable.
>>
>> Additionally, the WebP muxer expects animations to be contained in a
>> single packet, hence the warnings from Michael.
> 
> What would be a better approach here, keeping the whole animation in a single packet and decoding multiple frames from it, essentially moving the logic that splits the frames from the parser to the decoder?

Nah, nothing to do here. The whole parser patch can be dropped; I think Josef wanted this to get more out of frame threading (more packets -> more decoder calls -> more parallelism). The old, untouched parser code appears fine as is.

The demuxer was also made overly complicated for the same reason (and that is what causes the warning Michael found). Going to strip that wrong approach from the demuxer as well.

Sending v8 shortly.

Thanks,
Thilo

Patch

diff --git a/libavcodec/webp_parser.c b/libavcodec/webp_parser.c
index bd5f94dac5..da853bb1f5 100644
--- a/libavcodec/webp_parser.c
+++ b/libavcodec/webp_parser.c
@@ -25,13 +25,17 @@ 
 
 #include "libavutil/bswap.h"
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 
 #include "parser.h"
 
 typedef struct WebPParseContext {
     ParseContext pc;
+    int frame;
+    int first_frame;
     uint32_t fsize;
-    uint32_t remaining_size;
+    uint32_t remaining_file_size;
+    uint32_t remaining_tag_size;
 } WebPParseContext;
 
 static int webp_parse(AVCodecParserContext *s, AVCodecContext *avctx,
@@ -41,62 +45,106 @@  static int webp_parse(AVCodecParserContext *s, AVCodecContext *avctx,
     WebPParseContext *ctx = s->priv_data;
     uint64_t state = ctx->pc.state64;
     int next = END_NOT_FOUND;
-    int i = 0;
+    int i, len;
 
-    *poutbuf      = NULL;
-    *poutbuf_size = 0;
-
-restart:
-    if (ctx->pc.frame_start_found <= 8) {
-        for (; i < buf_size; i++) {
+    for (i = 0; i < buf_size;) {
+        if (ctx->remaining_tag_size) {
+            /* consuming tag */
+            len = FFMIN(ctx->remaining_tag_size, buf_size - i);
+            i += len;
+            ctx->remaining_tag_size -= len;
+            ctx->remaining_file_size -= len;
+        } else {
+            /* scan for the next tag or file */
             state = (state << 8) | buf[i];
-            if (ctx->pc.frame_start_found == 0) {
-                if ((state >> 32) == MKBETAG('R', 'I', 'F', 'F')) {
-                    ctx->fsize = av_bswap32(state);
-                    if (ctx->fsize > 15 && ctx->fsize <= UINT32_MAX - 10) {
-                        ctx->pc.frame_start_found = 1;
-                        ctx->fsize += 8;
+            i++;
+
+            if (!ctx->remaining_file_size) {
+                /* scan for the next file */
+                if (ctx->pc.frame_start_found == 4) {
+                    ctx->pc.frame_start_found = 0;
+                    if ((uint32_t) state == MKBETAG('W', 'E', 'B', 'P')) {
+                        if (ctx->frame || i != 12) {
+                            ctx->frame = 0;
+                            next = i - 12;
+                            state = 0;
+                            ctx->pc.frame_start_found = 0;
+                            break;
+                        }
+                        ctx->remaining_file_size = ctx->fsize - 4;
+                        ctx->first_frame = 1;
+                        continue;
                     }
                 }
-            } else if (ctx->pc.frame_start_found == 8) {
-                if ((state >> 32) != MKBETAG('W', 'E', 'B', 'P')) {
+                if (ctx->pc.frame_start_found == 0) {
+                    if ((state >> 32) == MKBETAG('R', 'I', 'F', 'F')) {
+                        ctx->fsize = av_bswap32(state);
+                        if (ctx->fsize > 15 && ctx->fsize <= UINT32_MAX - 10) {
+                            ctx->fsize += (ctx->fsize & 1);
+                            ctx->pc.frame_start_found = 1;
+                        }
+                    }
+                } else
+                    ctx->pc.frame_start_found++;
+            } else {
+                /* read the next tag */
+                ctx->remaining_file_size--;
+                if (ctx->remaining_file_size == 0) {
                     ctx->pc.frame_start_found = 0;
                     continue;
                 }
                 ctx->pc.frame_start_found++;
-                ctx->remaining_size = ctx->fsize + i - 15;
-                if (ctx->pc.index + i > 15) {
-                    next = i - 15;
-                    state = 0;
+                if (ctx->pc.frame_start_found < 8)
+                    continue;
+
+                switch (state >> 32) {
+                case MKBETAG('A', 'N', 'M', 'F'):
+                case MKBETAG('V', 'P', '8', ' '):
+                case MKBETAG('V', 'P', '8', 'L'):
+                    if (ctx->frame) {
+                        ctx->frame = 0;
+                        next = i - 8;
+                        state = 0;
+                        ctx->pc.frame_start_found = 0;
+                        goto flush;
+                    }
+                    ctx->frame = 1;
+                    break;
+                default:
                     break;
-                } else {
-                    ctx->pc.state64 = 0;
-                    goto restart;
                 }
-            } else if (ctx->pc.frame_start_found)
-                ctx->pc.frame_start_found++;
-        }
-        ctx->pc.state64 = state;
-    } else {
-        if (ctx->remaining_size) {
-            i = FFMIN(ctx->remaining_size, buf_size);
-            ctx->remaining_size -= i;
-            if (ctx->remaining_size)
-                goto flush;
 
-            ctx->pc.frame_start_found = 0;
-            goto restart;
+                ctx->remaining_tag_size = av_bswap32(state);
+                ctx->remaining_tag_size += ctx->remaining_tag_size & 1;
+                if (ctx->remaining_tag_size > ctx->remaining_file_size) {
+                    /* this might be truncated remains before end of file */
+                    ctx->remaining_tag_size = ctx->remaining_file_size;
+                }
+                ctx->pc.frame_start_found = 0;
+                state = 0;
+            }
         }
     }
-
 flush:
-    if (ff_combine_frame(&ctx->pc, next, &buf, &buf_size) < 0)
+    ctx->pc.state64 = state;
+
+    if (ff_combine_frame(&ctx->pc, next, &buf, &buf_size) < 0) {
+        *poutbuf      = NULL;
+        *poutbuf_size = 0;
         return buf_size;
+    }
 
-    if (next != END_NOT_FOUND && next < 0)
-        ctx->pc.frame_start_found = FFMAX(ctx->pc.frame_start_found - i - 1, 0);
-    else
-        ctx->pc.frame_start_found = 0;
+    // Extremely simplified key frame detection:
+    // - the first frame (containing headers) is marked as a key frame
+    // - other frames are marked as non-key frames
+    if (ctx->first_frame) {
+        ctx->first_frame = 0;
+        s->pict_type = AV_PICTURE_TYPE_I;
+        s->key_frame = 1;
+    } else {
+        s->pict_type = AV_PICTURE_TYPE_P;
+        s->key_frame = 0;
+    }
 
     *poutbuf      = buf;
     *poutbuf_size = buf_size;
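
For context, a parser like this one is exercised through the generic parser API. A minimal sketch of the usual av_parser_parse2() loop (assuming data/data_size already hold the raw WebP bytes; names are illustrative and error handling is trimmed):

/* Sketch: feed raw WebP bytes through the parser.  Each av_parser_parse2()
 * call consumes some input and, once a complete unit has been combined,
 * returns it in pkt_data/pkt_size. */
#include <libavcodec/avcodec.h>

static void feed_parser(const uint8_t *data, int data_size)
{
    const AVCodec *codec      = avcodec_find_decoder(AV_CODEC_ID_WEBP);
    AVCodecContext *avctx     = avcodec_alloc_context3(codec);
    AVCodecParserContext *pc  = av_parser_init(AV_CODEC_ID_WEBP);

    while (data_size > 0) {
        uint8_t *pkt_data = NULL;
        int pkt_size = 0;
        int used = av_parser_parse2(pc, avctx, &pkt_data, &pkt_size,
                                    data, data_size,
                                    AV_NOPTS_VALUE, AV_NOPTS_VALUE, 0);
        data      += used;
        data_size -= used;
        if (pkt_size > 0) {
            /* pkt_data/pkt_size now hold one parsed packet: with this
             * patch a single frame, without it the whole file. */
        }
    }

    av_parser_close(pc);
    avcodec_free_context(&avctx);
}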