diff mbox series

[FFmpeg-devel,v2,1/2] libavcodec: add support for animated WebP decoding

Message ID 20200729120550.16828-1-josef@pex.com
State New
Headers show
Series [FFmpeg-devel,v2,1/2] libavcodec: add support for animated WebP decoding
Related show

Checks

Context Check Description
andriy/default pending
andriy/make success Make finished
andriy/make_fate fail Make fate failed

Commit Message

Zlomek, Josef July 29, 2020, 12:05 p.m. UTC
Fixes: 4907

Adds support for decoding of animated WebP.

The WebP parser now splits the input stream into packets that each contain one frame.

The WebP decoder adds the animation related features according to the specs:
https://developers.google.com/speed/webp/docs/riff_container#animation
The frames of the animation may be smaller than the image canvas.
Therefore, the frame is decoded to a temporary frame,
then it is blended into the canvas, the canvas is copied to the output frame,
and finally the frame is disposed from the canvas.

Output to AV_PIX_FMT_YUVA420P/AV_PIX_FMT_YUV420P is still supported.
The background color is specified only as BGRA in the WebP file,
so it is converted to YUVA when a YUV format is output.

Signed-off-by: Josef Zlomek <josef@pex.com>
---
 Changelog                |   1 +
 libavcodec/codec_desc.c  |   3 +-
 libavcodec/webp.c        | 790 ++++++++++++++++++++++++++++++++++++---
 libavcodec/webp.h        |  44 +++
 libavcodec/webp_parser.c | 128 ++++---
 5 files changed, 863 insertions(+), 103 deletions(-)
 create mode 100644 libavcodec/webp.h

Comments

Lynne July 29, 2020, 3:43 p.m. UTC | #1
Jul 29, 2020, 14:05 by josef@pex.com:

> Fixes: 4907
>
> Adds support for decoding of animated WebP.
> +
> +    ff_thread_release_buffer(s->avctx, &s->canvas_frame);
> +    ret = ff_thread_get_buffer(s->avctx, &s->canvas_frame, AV_GET_BUFFER_FLAG_REF);
> +    if (ret < 0)
> +        return ret;
> +
> +    if (canvas->format == AV_PIX_FMT_ARGB) {
> +        height = canvas->height;
> +        memset(canvas->data[0], 0, height * canvas->linesize[0]);
>

At least some of those image-wide memsets can be replaced with
av_image_fill_black.



> +// divide by 255 and round to nearest
> +// apply a fast variant: (X+127)/255 = ((X+127)*257+257)>>16 = ((X+128)*257)>>16
> +#define FAST_DIV255(x) ((((x) + 128) * 257) >> 16)
>

If you don't need nearest rounding a div would likely be faster, especially at this range.



> +        if (canvas->format == AV_PIX_FMT_ARGB) {
> +            width  = s->width;
> +            height = s->height;
> +            pos_x  = s->pos_x;
> +            pos_y  = s->pos_y;
> +
> +            for (y = 0; y < height; y++) {
> +                const uint8_t *src = frame->data[0] + y * frame->linesize[0];
> +                uint8_t *dst = canvas->data[0] + (y + pos_y) * canvas->linesize[0] + pos_x * sizeof(uint32_t);
> +                for (x = 0; x < width; x++) {
> +                    int src_alpha = src[0];
> +                    int dst_alpha = dst[0];
> +
> +                    if (src_alpha == 255) {
> +                        memcpy(dst, src, 4);
> +                    } else if (dst_alpha == 255) {
> +                        dst[0] = 255;
> +                        dst[1] = FAST_DIV255(src[1] * src_alpha + dst[1] * (255 - src_alpha));
> +                        dst[2] = FAST_DIV255(src[2] * src_alpha + dst[2] * (255 - src_alpha));
> +                        dst[3] = FAST_DIV255(src[3] * src_alpha + dst[3] * (255 - src_alpha));
> +                    } else if (src_alpha + dst_alpha == 0) {
> +                        memset(dst, 0, 4);
> +                    } else {
> +                        int dst_alpha2 = dst_alpha - FAST_DIV255(src_alpha * dst_alpha);
> +                        int blend_alpha = src_alpha + dst_alpha2;
> +                        av_assert0(blend_alpha);
>

Why is there an assert here or in fact anywhere in the pixel path?
We really don't want to abort() in a library if someone sends some wrong data.



> +                    // calculate the average alpha of the tile
> +                    int src_alpha = 0;
> +                    int dst_alpha = 0;
> +                    for (yy = 0; yy < tile_h; yy++) {
> +                        for (xx = 0; xx < tile_w; xx++) {
> +                            src_alpha += frame->data[plane_a][(y * tile_h + yy) * frame->linesize[plane_a] + (x * tile_w + xx)];
> +                            dst_alpha += canvas->data[plane_a][((y + pos_y) * tile_h + yy) * canvas->linesize[plane_a] + ((x + pos_x) * tile_w + xx)];
> +                        }
> +                    }
> +                    src_alpha = RSHIFT(src_alpha, desc->log2_chroma_w + desc->log2_chroma_h);
> +                    dst_alpha = RSHIFT(dst_alpha, desc->log2_chroma_w + desc->log2_chroma_h);
>

This is some pretty horrible and iffy code to see in a decoder, sadly no choice
but to live with it. And blame google. A lot. For everything.



> +                    if (src_alpha == 255) {
> +                        *dst_u = *src_u;
> +                        *dst_v = *src_v;
> +                    } else if (dst_alpha == 255) {
> +                        *dst_u = FAST_DIV255(*src_u * src_alpha + *dst_u * (255 - src_alpha));
> +                        *dst_v = FAST_DIV255(*src_v * src_alpha + *dst_v * (255 - src_alpha));
> +                    } else if (src_alpha + dst_alpha == 0) {
> +                        *dst_u = s->transparent_yuva[1];
> +                        *dst_v = s->transparent_yuva[2];
> +                    } else {
> +                        int dst_alpha2 = dst_alpha - FAST_DIV255(src_alpha * dst_alpha);
> +                        int blend_alpha = src_alpha + dst_alpha2;
> +                        av_assert0(blend_alpha);
> +
> +                        *dst_u = ROUNDED_DIV(*src_u * src_alpha + *dst_u * dst_alpha2, blend_alpha);
> +                        *dst_v = ROUNDED_DIV(*src_v * src_alpha + *dst_v * dst_alpha2, blend_alpha);
> +                    }
>

Are you sure a branch is faster here? A mispredicted branch costs something like 30 cycles,
while the FAST_DIV255 macro seems to have fewer instructions than that.



> +        for (y = 0; y < height; y++) {
> +            const uint8_t *src = canvas->data[0] + y * canvas->linesize[0];
> +            uint8_t *dst = frame->data[0] + y * frame->linesize[0];
> +            for (x = 0; x < width; x++) {
> +                int src_alpha = src[0];
> +
> +                if (src_alpha == 255) {
> +                    memcpy(dst, src, 4);
> +                } else if (background_alpha == 255) {
> +                    dst[0] = 255;
> +                    dst[1] = FAST_DIV255(src[1] * src_alpha + s->background_argb[1] * (255 - src_alpha));
> +                    dst[2] = FAST_DIV255(src[2] * src_alpha + s->background_argb[2] * (255 - src_alpha));
> +                    dst[3] = FAST_DIV255(src[3] * src_alpha + s->background_argb[3] * (255 - src_alpha));
> +                } else if (src_alpha + background_alpha == 0) {
> +                    memset(dst, 0, 4);
> +                } else {
> +                    int dst_alpha2 = background_alpha - FAST_DIV255(src_alpha * background_alpha);
> +                    int blend_alpha = src_alpha + dst_alpha2;
> +                    av_assert0(blend_alpha);
> +
> +                    dst[0] = blend_alpha;
> +                    dst[1] = ROUNDED_DIV(src[1] * src_alpha + s->background_argb[1] * dst_alpha2, blend_alpha);
> +                    dst[2] = ROUNDED_DIV(src[2] * src_alpha + s->background_argb[2] * dst_alpha2, blend_alpha);
> +                    dst[3] = ROUNDED_DIV(src[3] * src_alpha + s->background_argb[3] * dst_alpha2, blend_alpha);
> +                }
> +                src += 4;
> +                dst += 4;
> +            }
> +        }
>

Could you abstract some blending code in the function? I feel like it's repeated many times.



>  AVCodec ff_webp_decoder = {
>  .name           = "webp",
>  .long_name      = NULL_IF_CONFIG_SMALL("WebP image"),
>  .type           = AVMEDIA_TYPE_VIDEO,
>  .id             = AV_CODEC_ID_WEBP,
>  .priv_data_size = sizeof(WebPContext),
> +    .update_thread_context = ONLY_IF_THREADS_ENABLED(webp_update_thread_context),
> +    .init           = webp_decode_init,
>  .decode         = webp_decode_frame,
>  .close          = webp_decode_close,
>  .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
> +    .caps_internal  = FF_CODEC_CAP_ALLOCATE_PROGRESS,
>  };
>

Don't you need to add a flush function to enable proper seeking?
It gets called when you seek. You won't get any data from in between the seek start and end.

The blending code really needs some abstraction. It's 80% of the whole patch.
From the few glimpses of non-blending code I saw, that part looked okay.
Zlomek, Josef July 29, 2020, 4:19 p.m. UTC | #2
On Wed, Jul 29, 2020 at 5:43 PM Lynne <dev@lynne.ee> wrote:

> Jul 29, 2020, 14:05 by josef@pex.com:
>
> > +    if (canvas->format == AV_PIX_FMT_ARGB) {
> > +        height = canvas->height;
> > +        memset(canvas->data[0], 0, height * canvas->linesize[0]);
> >
>
> At least some of those image-wide memsets can be replaced with
> av_image_fill_black.
>

For the canvas, transparent black is needed (alpha = 0),
but av_image_fill_black sets alpha = 255.
The background color has to be blended in the end when generating the
output frame,
otherwise one example file would not be decoded correctly.

> > +// divide by 255 and round to nearest
> > +// apply a fast variant: (X+127)/255 = ((X+127)*257+257)>>16 = ((X+128)*257)>>16
> > +#define FAST_DIV255(x) ((((x) + 128) * 257) >> 16)
> >
>
> If you don't need nearest rounding a div would likely be faster,
> especially at this range.
>

I did not expect that div by 255 was fast. I'll do some benchmarks.
I took this macro from libavcodec/pngdec.c

Thank you also for the rest of the suggestions, I'll implement them.

Josef
diff mbox series

Patch

diff --git a/Changelog b/Changelog
index c37ffa82e1..51268221a9 100644
--- a/Changelog
+++ b/Changelog
@@ -9,6 +9,7 @@  version <next>:
 - VDPAU accelerated HEVC 10/12bit decoding
 - ADPCM IMA Ubisoft APM encoder
 - Rayman 2 APM muxer
+- animated WebP parser/decoder
 
 
 version 4.3:
diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
index ced00bd34c..947682f7c4 100644
--- a/libavcodec/codec_desc.c
+++ b/libavcodec/codec_desc.c
@@ -1251,8 +1251,7 @@  static const AVCodecDescriptor codec_descriptors[] = {
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "webp",
         .long_name = NULL_IF_CONFIG_SMALL("WebP"),
-        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
-                     AV_CODEC_PROP_LOSSLESS,
+        .props     = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS,
         .mime_types= MT("image/webp"),
     },
     {
diff --git a/libavcodec/webp.c b/libavcodec/webp.c
index c6d0206846..43c00990ec 100644
--- a/libavcodec/webp.c
+++ b/libavcodec/webp.c
@@ -35,12 +35,15 @@ 
  * Exif metadata
  * ICC profile
  *
+ * @author Josef Zlomek, Pexeso Inc. <josef@pex.com>
+ * Animation
+ *
  * Unimplemented:
- *   - Animation
  *   - XMP metadata
  */
 
 #include "libavutil/imgutils.h"
+#include "libavutil/colorspace.h"
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
@@ -50,12 +53,7 @@ 
 #include "internal.h"
 #include "thread.h"
 #include "vp8.h"
-
-#define VP8X_FLAG_ANIMATION             0x02
-#define VP8X_FLAG_XMP_METADATA          0x04
-#define VP8X_FLAG_EXIF_METADATA         0x08
-#define VP8X_FLAG_ALPHA                 0x10
-#define VP8X_FLAG_ICC                   0x20
+#include "webp.h"
 
 #define MAX_PALETTE_SIZE                256
 #define MAX_CACHE_BITS                  11
@@ -188,6 +186,8 @@  typedef struct ImageContext {
 typedef struct WebPContext {
     VP8Context v;                       /* VP8 Context used for lossy decoding */
     GetBitContext gb;                   /* bitstream reader for main image chunk */
+    ThreadFrame canvas_frame;           /* ThreadFrame for canvas */
+    AVFrame *frame;                     /* AVFrame for decoded frame */
     AVFrame *alpha_frame;               /* AVFrame for alpha data decompressed from VP8L */
     AVCodecContext *avctx;              /* parent AVCodecContext */
     int initialized;                    /* set once the VP8 context is initialized */
@@ -198,9 +198,23 @@  typedef struct WebPContext {
     int alpha_data_size;                /* alpha chunk data size */
     int has_exif;                       /* set after an EXIF chunk has been processed */
     int has_iccp;                       /* set after an ICCP chunk has been processed */
-    int width;                          /* image width */
-    int height;                         /* image height */
-    int lossless;                       /* indicates lossless or lossy */
+    int vp8x_flags;                     /* global flags from VP8X chunk */
+    int canvas_width;                   /* canvas width */
+    int canvas_height;                  /* canvas height */
+    int anmf_flags;                     /* frame flags from ANMF chunk */
+    int width;                          /* frame width */
+    int height;                         /* frame height */
+    int pos_x;                          /* frame position X */
+    int pos_y;                          /* frame position Y */
+    int prev_anmf_flags;                /* previous frame flags from ANMF chunk */
+    int prev_width;                     /* previous frame width */
+    int prev_height;                    /* previous frame height */
+    int prev_pos_x;                     /* previous frame position X */
+    int prev_pos_y;                     /* previous frame position Y */
+    int await_progress;                 /* value of progress to wait for */
+    uint8_t background_argb[4];         /* background color in ARGB format */
+    uint8_t background_yuva[4];         /* background color in YUVA format */
+    uint8_t transparent_yuva[4];        /* transparent black in YUVA format */
 
     int nb_transforms;                  /* number of transforms */
     enum TransformType transforms[4];   /* transformations used in the image, in order */
@@ -612,6 +626,9 @@  static int decode_entropy_coded_image(WebPContext *s, enum ImageRole role,
     if (ret < 0)
         return ret;
 
+    if (role == IMAGE_ROLE_ARGB && !img->is_alpha_primary)
+        ff_thread_finish_setup(s->avctx);
+
     if (get_bits1(&s->gb)) {
         img->color_cache_bits = get_bits(&s->gb, 4);
         if (img->color_cache_bits < 1 || img->color_cache_bits > 11) {
@@ -1100,7 +1117,7 @@  static int apply_color_indexing_transform(WebPContext *s)
     return 0;
 }
 
-static void update_canvas_size(AVCodecContext *avctx, int w, int h)
+static void update_frame_size(AVCodecContext *avctx, int w, int h)
 {
     WebPContext *s = avctx->priv_data;
     if (s->width && s->width != w) {
@@ -1123,7 +1140,6 @@  static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
     int w, h, ret, i, used;
 
     if (!is_alpha_chunk) {
-        s->lossless = 1;
         avctx->pix_fmt = AV_PIX_FMT_ARGB;
     }
 
@@ -1140,7 +1156,7 @@  static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
         w = get_bits(&s->gb, 14) + 1;
         h = get_bits(&s->gb, 14) + 1;
 
-        update_canvas_size(avctx, w, h);
+        update_frame_size(avctx, w, h);
 
         ret = ff_set_dimensions(avctx, s->width, s->height);
         if (ret < 0)
@@ -1338,7 +1354,6 @@  static int vp8_lossy_decode_frame(AVCodecContext *avctx, AVFrame *p,
         s->v.actually_webp = 1;
     }
     avctx->pix_fmt = s->has_alpha ? AV_PIX_FMT_YUVA420P : AV_PIX_FMT_YUV420P;
-    s->lossless = 0;
 
     if (data_size > INT_MAX) {
         av_log(avctx, AV_LOG_ERROR, "unsupported chunk size\n");
@@ -1356,7 +1371,7 @@  static int vp8_lossy_decode_frame(AVCodecContext *avctx, AVFrame *p,
     if (!*got_frame)
         return AVERROR_INVALIDDATA;
 
-    update_canvas_size(avctx, avctx->width, avctx->height);
+    update_frame_size(avctx, avctx->width, avctx->height);
 
     if (s->has_alpha) {
         ret = vp8_lossy_decode_alpha(avctx, p, s->alpha_data,
@@ -1367,42 +1382,533 @@  static int vp8_lossy_decode_frame(AVCodecContext *avctx, AVFrame *p,
     return ret;
 }
 
-static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
-                             AVPacket *avpkt)
+static int init_canvas_frame(WebPContext *s, int format, int key_frame)
 {
-    AVFrame * const p = data;
-    WebPContext *s = avctx->priv_data;
-    GetByteContext gb;
+    AVFrame *canvas = s->canvas_frame.f;
+    int height;
     int ret;
-    uint32_t chunk_type, chunk_size;
-    int vp8x_flags = 0;
 
-    s->avctx     = avctx;
-    s->width     = 0;
-    s->height    = 0;
-    *got_frame   = 0;
-    s->has_alpha = 0;
-    s->has_exif  = 0;
-    s->has_iccp  = 0;
-    bytestream2_init(&gb, avpkt->data, avpkt->size);
+    // canvas is needed only for animation
+    if (!(s->vp8x_flags & VP8X_FLAG_ANIMATION))
+        return 0;
 
-    if (bytestream2_get_bytes_left(&gb) < 12)
-        return AVERROR_INVALIDDATA;
+    // avoid init for non-key frames whose format and size did not change
+    if (!key_frame &&
+        canvas->data[0] &&
+        canvas->format == format &&
+        canvas->width  == s->canvas_width &&
+        canvas->height == s->canvas_height)
+        return 0;
 
-    if (bytestream2_get_le32(&gb) != MKTAG('R', 'I', 'F', 'F')) {
-        av_log(avctx, AV_LOG_ERROR, "missing RIFF tag\n");
-        return AVERROR_INVALIDDATA;
+    s->avctx->pix_fmt = format;
+    canvas->format    = format;
+    canvas->width     = s->canvas_width;
+    canvas->height    = s->canvas_height;
+
+    // VP8 decoder changed the width and height in AVCodecContext.
+    // Change it back to the canvas size.
+    ret = ff_set_dimensions(s->avctx, s->canvas_width, s->canvas_height);
+    if (ret < 0)
+        return ret;
+
+    ff_thread_release_buffer(s->avctx, &s->canvas_frame);
+    ret = ff_thread_get_buffer(s->avctx, &s->canvas_frame, AV_GET_BUFFER_FLAG_REF);
+    if (ret < 0)
+        return ret;
+
+    if (canvas->format == AV_PIX_FMT_ARGB) {
+        height = canvas->height;
+        memset(canvas->data[0], 0, height * canvas->linesize[0]);
+    } else /* if (canvas->format == AV_PIX_FMT_YUVA420P) */ {
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(canvas->format);
+        int component;
+        int plane;
+
+        for (component = 0; component < desc->nb_components; component++) {
+            plane  = desc->comp[component].plane;
+            height = canvas->height;
+
+            if (component == 1 || component == 2)
+                height = AV_CEIL_RSHIFT(height, desc->log2_chroma_h);
+
+            memset(canvas->data[plane], s->transparent_yuva[component],
+                   height * canvas->linesize[plane]);
+        }
     }
 
-    chunk_size = bytestream2_get_le32(&gb);
-    if (bytestream2_get_bytes_left(&gb) < chunk_size)
-        return AVERROR_INVALIDDATA;
+    return 0;
+}
 
-    if (bytestream2_get_le32(&gb) != MKTAG('W', 'E', 'B', 'P')) {
-        av_log(avctx, AV_LOG_ERROR, "missing WEBP tag\n");
-        return AVERROR_INVALIDDATA;
+// divide by 255 and round to nearest
+// apply a fast variant: (X+127)/255 = ((X+127)*257+257)>>16 = ((X+128)*257)>>16
+#define FAST_DIV255(x) ((((x) + 128) * 257) >> 16)
+
+static int blend_frame_into_canvas(WebPContext *s)
+{
+    AVFrame *canvas = s->canvas_frame.f;
+    AVFrame *frame = s->frame;
+    int ret;
+    int x, y;
+    int width, height;
+    int pos_x, pos_y;
+
+    ret = av_frame_copy_props(canvas, frame);
+    if (ret < 0)
+        return ret;
+
+    if ((s->anmf_flags & ANMF_BLENDING_METHOD) == ANMF_BLENDING_METHOD_OVERWRITE
+        || frame->format == AV_PIX_FMT_YUV420P) {
+        // do not blend, overwrite
+
+        if (canvas->format == AV_PIX_FMT_ARGB) {
+            width  = s->width;
+            height = s->height;
+            pos_x  = s->pos_x;
+            pos_y  = s->pos_y;
+
+            for (y = 0; y < height; y++) {
+                const uint32_t *src = (uint32_t *) (frame->data[0] + y * frame->linesize[0]);
+                uint32_t *dst = (uint32_t *) (canvas->data[0] + (y + pos_y) * canvas->linesize[0]) + pos_x;
+                memcpy(dst, src, width * sizeof(uint32_t));
+            }
+        } else /* if (canvas->format == AV_PIX_FMT_YUVA420P) */ {
+            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+            int component;
+            int plane;
+
+            for (component = 0; component < desc->nb_components; component++) {
+                plane  = desc->comp[component].plane;
+                width  = s->width;
+                height = s->height;
+                pos_x  = s->pos_x;
+                pos_y  = s->pos_y;
+                if (component == 1 || component == 2) {
+                    width  = AV_CEIL_RSHIFT(width,  desc->log2_chroma_w);
+                    height = AV_CEIL_RSHIFT(height, desc->log2_chroma_h);
+                    pos_x  = AV_CEIL_RSHIFT(pos_x,  desc->log2_chroma_w);
+                    pos_y  = AV_CEIL_RSHIFT(pos_y,  desc->log2_chroma_h);
+                }
+
+                for (y = 0; y < height; y++) {
+                    const uint8_t *src = frame->data[plane] + y * frame->linesize[plane];
+                    uint8_t *dst = canvas->data[plane] + (y + pos_y) * canvas->linesize[plane] + pos_x;
+                    memcpy(dst, src, width);
+                }
+            }
+
+            if (desc->nb_components < 4) {
+                // frame does not have alpha, set alpha to 255
+                desc = av_pix_fmt_desc_get(canvas->format);
+                plane  = desc->comp[3].plane;
+                width  = s->width;
+                height = s->height;
+                pos_x  = s->pos_x;
+                pos_y  = s->pos_y;
+
+                for (y = 0; y < height; y++) {
+                    uint8_t *dst = canvas->data[plane] + (y + pos_y) * canvas->linesize[plane] + pos_x;
+                    memset(dst, 255, width);
+                }
+            }
+        }
+    } else {
+        // alpha blending
+
+        if (canvas->format == AV_PIX_FMT_ARGB) {
+            width  = s->width;
+            height = s->height;
+            pos_x  = s->pos_x;
+            pos_y  = s->pos_y;
+
+            for (y = 0; y < height; y++) {
+                const uint8_t *src = frame->data[0] + y * frame->linesize[0];
+                uint8_t *dst = canvas->data[0] + (y + pos_y) * canvas->linesize[0] + pos_x * sizeof(uint32_t);
+                for (x = 0; x < width; x++) {
+                    int src_alpha = src[0];
+                    int dst_alpha = dst[0];
+
+                    if (src_alpha == 255) {
+                        memcpy(dst, src, 4);
+                    } else if (dst_alpha == 255) {
+                        dst[0] = 255;
+                        dst[1] = FAST_DIV255(src[1] * src_alpha + dst[1] * (255 - src_alpha));
+                        dst[2] = FAST_DIV255(src[2] * src_alpha + dst[2] * (255 - src_alpha));
+                        dst[3] = FAST_DIV255(src[3] * src_alpha + dst[3] * (255 - src_alpha));
+                    } else if (src_alpha + dst_alpha == 0) {
+                        memset(dst, 0, 4);
+                    } else {
+                        int dst_alpha2 = dst_alpha - FAST_DIV255(src_alpha * dst_alpha);
+                        int blend_alpha = src_alpha + dst_alpha2;
+                        av_assert0(blend_alpha);
+
+                        dst[0] = blend_alpha;
+                        dst[1] = ROUNDED_DIV(src[1] * src_alpha + dst[1] * dst_alpha2, blend_alpha);
+                        dst[2] = ROUNDED_DIV(src[2] * src_alpha + dst[2] * dst_alpha2, blend_alpha);
+                        dst[3] = ROUNDED_DIV(src[3] * src_alpha + dst[3] * dst_alpha2, blend_alpha);
+                    }
+                    src += 4;
+                    dst += 4;
+                }
+            }
+        } else /* if (canvas->format == AV_PIX_FMT_YUVA420P) */ {
+            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+            int plane_y, plane_u, plane_v, plane_a;
+            int tile_w;
+            int tile_h;
+
+            av_assert0(desc->nb_components >= 4);
+            plane_y = desc->comp[0].plane;
+            plane_u = desc->comp[1].plane;
+            plane_v = desc->comp[2].plane;
+            plane_a = desc->comp[3].plane;
+
+            // first, blend U & V planes, because the later step modifies alpha plane
+            width  = AV_CEIL_RSHIFT(s->width,  desc->log2_chroma_w);
+            height = AV_CEIL_RSHIFT(s->height, desc->log2_chroma_h);
+            pos_x  = AV_CEIL_RSHIFT(s->pos_x,  desc->log2_chroma_w);
+            pos_y  = AV_CEIL_RSHIFT(s->pos_y,  desc->log2_chroma_h);
+            tile_w = 1 << desc->log2_chroma_w;
+            tile_h = 1 << desc->log2_chroma_h;
+
+            for (y = 0; y < height; y++) {
+                const uint8_t *src_u = frame->data[plane_u] + y * frame->linesize[plane_u];
+                const uint8_t *src_v = frame->data[plane_v] + y * frame->linesize[plane_v];
+                uint8_t *dst_u = canvas->data[plane_u] + (y + pos_y) * canvas->linesize[plane_u] + pos_x;
+                uint8_t *dst_v = canvas->data[plane_v] + (y + pos_y) * canvas->linesize[plane_v] + pos_x;
+                for (x = 0; x < width; x++) {
+                    int xx, yy;
+
+                    // calculate the average alpha of the tile
+                    int src_alpha = 0;
+                    int dst_alpha = 0;
+                    for (yy = 0; yy < tile_h; yy++) {
+                        for (xx = 0; xx < tile_w; xx++) {
+                            src_alpha += frame->data[plane_a][(y * tile_h + yy) * frame->linesize[plane_a] + (x * tile_w + xx)];
+                            dst_alpha += canvas->data[plane_a][((y + pos_y) * tile_h + yy) * canvas->linesize[plane_a] + ((x + pos_x) * tile_w + xx)];
+                        }
+                    }
+                    src_alpha = RSHIFT(src_alpha, desc->log2_chroma_w + desc->log2_chroma_h);
+                    dst_alpha = RSHIFT(dst_alpha, desc->log2_chroma_w + desc->log2_chroma_h);
+
+                    if (src_alpha == 255) {
+                        *dst_u = *src_u;
+                        *dst_v = *src_v;
+                    } else if (dst_alpha == 255) {
+                        *dst_u = FAST_DIV255(*src_u * src_alpha + *dst_u * (255 - src_alpha));
+                        *dst_v = FAST_DIV255(*src_v * src_alpha + *dst_v * (255 - src_alpha));
+                    } else if (src_alpha + dst_alpha == 0) {
+                        *dst_u = s->transparent_yuva[1];
+                        *dst_v = s->transparent_yuva[2];
+                    } else {
+                        int dst_alpha2 = dst_alpha - FAST_DIV255(src_alpha * dst_alpha);
+                        int blend_alpha = src_alpha + dst_alpha2;
+                        av_assert0(blend_alpha);
+
+                        *dst_u = ROUNDED_DIV(*src_u * src_alpha + *dst_u * dst_alpha2, blend_alpha);
+                        *dst_v = ROUNDED_DIV(*src_v * src_alpha + *dst_v * dst_alpha2, blend_alpha);
+                    }
+                    src_u++;
+                    src_v++;
+                    dst_u++;
+                    dst_v++;
+                }
+            }
+
+            // then blend Y & A planes
+            width  = s->width;
+            height = s->height;
+            pos_x  = s->pos_x;
+            pos_y  = s->pos_y;
+
+            for (y = 0; y < height; y++) {
+                const uint8_t *src_y = frame->data[plane_y] + y * frame->linesize[plane_y];
+                const uint8_t *src_a = frame->data[plane_a] + y * frame->linesize[plane_a];
+                uint8_t *dst_y = canvas->data[plane_y] + (y + pos_y) * canvas->linesize[plane_y] + pos_x;
+                uint8_t *dst_a = canvas->data[plane_a] + (y + pos_y) * canvas->linesize[plane_a] + pos_x;
+                for (x = 0; x < width; x++) {
+                    int src_alpha = *src_a;
+                    int dst_alpha = *dst_a;
+
+                    if (src_alpha == 255) {
+                        *dst_y = *src_y;
+                        *dst_a = 255;
+                    } else if (dst_alpha == 255) {
+                        *dst_y = FAST_DIV255(*src_y * src_alpha + *dst_y * (255 - src_alpha));
+                        *dst_a = 255;
+                    } else if (src_alpha + dst_alpha == 0) {
+                        *dst_y = s->transparent_yuva[0];
+                        *dst_a = 0;
+                    } else {
+                        int dst_alpha2 = dst_alpha - FAST_DIV255(src_alpha * dst_alpha);
+                        int blend_alpha = src_alpha + dst_alpha2;
+                        av_assert0(blend_alpha);
+
+                        *dst_y = ROUNDED_DIV(*src_y * src_alpha + *dst_y * dst_alpha2, blend_alpha);
+                        *dst_a = blend_alpha;
+                    }
+                    src_y++;
+                    src_a++;
+                    dst_y++;
+                    dst_a++;
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int copy_canvas_to_frame(WebPContext *s, AVFrame *frame, int key_frame)
+{
+    AVFrame *canvas = s->canvas_frame.f;
+    int ret;
+    int x, y;
+    int width, height;
+
+    // VP8 decoder changed the width and height in AVCodecContext.
+    // Change it back to the canvas size.
+    ret = ff_set_dimensions(s->avctx, canvas->width, canvas->height);
+    if (ret < 0)
+        return ret;
+
+    s->avctx->pix_fmt = canvas->format;
+    frame->format     = canvas->format;
+    frame->width      = canvas->width;
+    frame->height     = canvas->height;
+
+    ret = av_frame_get_buffer(frame, 0);
+    if (ret < 0)
+        return ret;
+
+    ret = av_frame_copy_props(frame, canvas);
+    if (ret < 0)
+        return ret;
+
+    // blend the canvas with the background color into the output frame
+    if (canvas->format == AV_PIX_FMT_ARGB) {
+        int background_alpha = s->background_argb[0];
+
+        width  = canvas->width;
+        height = canvas->height;
+
+        for (y = 0; y < height; y++) {
+            const uint8_t *src = canvas->data[0] + y * canvas->linesize[0];
+            uint8_t *dst = frame->data[0] + y * frame->linesize[0];
+            for (x = 0; x < width; x++) {
+                int src_alpha = src[0];
+
+                if (src_alpha == 255) {
+                    memcpy(dst, src, 4);
+                } else if (background_alpha == 255) {
+                    dst[0] = 255;
+                    dst[1] = FAST_DIV255(src[1] * src_alpha + s->background_argb[1] * (255 - src_alpha));
+                    dst[2] = FAST_DIV255(src[2] * src_alpha + s->background_argb[2] * (255 - src_alpha));
+                    dst[3] = FAST_DIV255(src[3] * src_alpha + s->background_argb[3] * (255 - src_alpha));
+                } else if (src_alpha + background_alpha == 0) {
+                    memset(dst, 0, 4);
+                } else {
+                    int dst_alpha2 = background_alpha - FAST_DIV255(src_alpha * background_alpha);
+                    int blend_alpha = src_alpha + dst_alpha2;
+                    av_assert0(blend_alpha);
+
+                    dst[0] = blend_alpha;
+                    dst[1] = ROUNDED_DIV(src[1] * src_alpha + s->background_argb[1] * dst_alpha2, blend_alpha);
+                    dst[2] = ROUNDED_DIV(src[2] * src_alpha + s->background_argb[2] * dst_alpha2, blend_alpha);
+                    dst[3] = ROUNDED_DIV(src[3] * src_alpha + s->background_argb[3] * dst_alpha2, blend_alpha);
+                }
+                src += 4;
+                dst += 4;
+            }
+        }
+    } else /* if (canvas->format == AV_PIX_FMT_YUVA420P) */ {
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+        int plane_y, plane_u, plane_v, plane_a;
+        int tile_w;
+        int tile_h;
+        int background_alpha = s->background_yuva[3];
+
+        av_assert0(desc->nb_components >= 4);
+        plane_y = desc->comp[0].plane;
+        plane_u = desc->comp[1].plane;
+        plane_v = desc->comp[2].plane;
+        plane_a = desc->comp[3].plane;
+
+        // blend U & V planes
+        width  = AV_CEIL_RSHIFT(canvas->width,  desc->log2_chroma_w);
+        height = AV_CEIL_RSHIFT(canvas->height, desc->log2_chroma_h);
+        tile_w = 1 << desc->log2_chroma_w;
+        tile_h = 1 << desc->log2_chroma_h;
+
+        for (y = 0; y < height; y++) {
+            const uint8_t *src_u = canvas->data[plane_u] + y * canvas->linesize[plane_u];
+            const uint8_t *src_v = canvas->data[plane_v] + y * canvas->linesize[plane_v];
+            uint8_t *dst_u = frame->data[plane_u] + y * frame->linesize[plane_u];
+            uint8_t *dst_v = frame->data[plane_v] + y * frame->linesize[plane_v];
+            for (x = 0; x < width; x++) {
+                int xx, yy;
+
+                // calculate the average alpha of the tile
+                int src_alpha = 0;
+                for (yy = 0; yy < tile_h; yy++) {
+                    for (xx = 0; xx < tile_w; xx++) {
+                        src_alpha += canvas->data[plane_a][(y * tile_h + yy) * canvas->linesize[plane_a] + (x * tile_w + xx)];
+                    }
+                }
+                src_alpha = RSHIFT(src_alpha, desc->log2_chroma_w + desc->log2_chroma_h);
+
+                if (src_alpha == 255) {
+                    *dst_u = *src_u;
+                    *dst_v = *src_v;
+                } else if (background_alpha == 255) {
+                    *dst_u = FAST_DIV255(*src_u * src_alpha + s->background_yuva[1] * (255 - src_alpha));
+                    *dst_v = FAST_DIV255(*src_v * src_alpha + s->background_yuva[2] * (255 - src_alpha));
+                } else if (src_alpha + background_alpha == 0) {
+                    *dst_u = s->transparent_yuva[1];
+                    *dst_v = s->transparent_yuva[2];
+                } else {
+                    int dst_alpha2 = background_alpha - FAST_DIV255(src_alpha * background_alpha);
+                    int blend_alpha = src_alpha + dst_alpha2;
+                    av_assert0(blend_alpha);
+
+                    *dst_u = ROUNDED_DIV(*src_u * src_alpha + s->background_yuva[1] * dst_alpha2, blend_alpha);
+                    *dst_v = ROUNDED_DIV(*src_v * src_alpha + s->background_yuva[2] * dst_alpha2, blend_alpha);
+                }
+                src_u++;
+                src_v++;
+                dst_u++;
+                dst_v++;
+            }
+        }
+
+        // blend Y & A planes
+        width  = canvas->width;
+        height = canvas->height;
+
+        for (y = 0; y < height; y++) {
+            const uint8_t *src_y = canvas->data[plane_y] + y * canvas->linesize[plane_y];
+            const uint8_t *src_a = canvas->data[plane_a] + y * canvas->linesize[plane_a];
+            uint8_t *dst_y = frame->data[plane_y] + y * frame->linesize[plane_y];
+            uint8_t *dst_a = frame->data[plane_a] + y * frame->linesize[plane_a];
+            for (x = 0; x < width; x++) {
+                int src_alpha = *src_a;
+
+                if (src_alpha == 255) {
+                    *dst_y = *src_y;
+                    *dst_a = 255;
+                } else if (background_alpha == 255) {
+                    *dst_y = FAST_DIV255(*src_y * src_alpha + s->background_yuva[0] * (255 - src_alpha));
+                    *dst_a = 255;
+                } else if (src_alpha + background_alpha == 0) {
+                    *dst_y = s->transparent_yuva[0];
+                    *dst_a = 0;
+                } else {
+                    int dst_alpha2 = background_alpha - FAST_DIV255(src_alpha * background_alpha);
+                    int blend_alpha = src_alpha + dst_alpha2;
+                    av_assert0(blend_alpha);
+
+                    *dst_y = ROUNDED_DIV(*src_y * src_alpha + s->background_yuva[0] * dst_alpha2, blend_alpha);
+                    *dst_a = blend_alpha;
+                }
+                src_y++;
+                src_a++;
+                dst_y++;
+                dst_a++;
+            }
+        }
+    }
+
+    if (key_frame) {
+        frame->pict_type = AV_PICTURE_TYPE_I;
+        frame->key_frame = 1;
+    } else {
+        frame->pict_type = AV_PICTURE_TYPE_P;
+        frame->key_frame = 0;
+    }
+
+    return 0;
+}
+
+// Apply the disposal method of the previous frame (s->prev_*) to the canvas.
+// Nothing is done unless that method is "dispose to background". Returns 0.
+static int dispose_prev_frame_in_canvas(WebPContext *s)
+{
+    AVFrame *canvas = s->canvas_frame.f;
+    const AVPixFmtDescriptor *desc;
+    int c, row;
+
+    // only "dispose to background" touches the canvas
+    if ((s->prev_anmf_flags & ANMF_DISPOSAL_METHOD) != ANMF_DISPOSAL_METHOD_BACKGROUND)
+        return 0;
+
+    if (canvas->format == AV_PIX_FMT_ARGB) {
+        // packed 4-byte pixels: zero the rectangle of the previous frame
+        const int x0 = s->prev_pos_x, y0 = s->prev_pos_y;
+        const int w  = s->prev_width, h  = s->prev_height;
+
+        for (row = 0; row < h; row++) {
+            uint32_t *dst = (uint32_t *) (canvas->data[0] + (row + y0) * canvas->linesize[0]) + x0;
+            memset(dst, 0, w * sizeof(uint32_t));
+        }
+        return 0;
+    }
+
+    /* otherwise canvas->format == AV_PIX_FMT_YUVA420P */
+    desc = av_pix_fmt_desc_get(canvas->format);
+    for (c = 0; c < desc->nb_components; c++) {
+        const int plane = desc->comp[c].plane;
+        int x0 = s->prev_pos_x, y0 = s->prev_pos_y;
+        int w  = s->prev_width, h  = s->prev_height;
+
+        // components 1 and 2 are subsampled: scale the rectangle, rounding up
+        if (c == 1 || c == 2) {
+            x0 = AV_CEIL_RSHIFT(x0, desc->log2_chroma_w);
+            y0 = AV_CEIL_RSHIFT(y0, desc->log2_chroma_h);
+            w  = AV_CEIL_RSHIFT(w,  desc->log2_chroma_w);
+            h  = AV_CEIL_RSHIFT(h,  desc->log2_chroma_h);
+        }
+
+        // fill the rectangle with this component's "transparent black" value
+        for (row = 0; row < h; row++) {
+            uint8_t *dst = canvas->data[plane] + (row + y0) * canvas->linesize[plane] + x0;
+            memset(dst, s->transparent_yuva[c], w);
+        }
+    }
+
+    return 0;
+}
+
+static av_cold int webp_decode_init(AVCodecContext *avctx)
+{
+    WebPContext *s = avctx->priv_data;
+    // Decoder init: returns 0, or AVERROR(ENOMEM) if a frame cannot be allocated.
+    s->avctx = avctx;
+    s->canvas_frame.f = av_frame_alloc(); // canvas of the animation, freed in webp_decode_close()
+    s->frame = av_frame_alloc();          // frame the VP8/VP8L decoders decode into
+    if (!s->canvas_frame.f || !s->frame) {
+        av_frame_free(&s->canvas_frame.f); // release whichever of the two allocations succeeded
+        av_frame_free(&s->frame);
+        return AVERROR(ENOMEM);
     }
 
+    // convert transparent black from RGBA to YUVA
+    s->transparent_yuva[0] = RGB_TO_Y_CCIR(0, 0, 0);
+    s->transparent_yuva[1] = RGB_TO_U_CCIR(0, 0, 0, 0);
+    s->transparent_yuva[2] = RGB_TO_V_CCIR(0, 0, 0, 0);
+    s->transparent_yuva[3] = 0; // alpha component
+
+    return 0;
+}
+
+static int decode_frame_common(AVCodecContext *avctx, uint8_t *data, int size,
+                               int *got_frame, int key_frame)
+{
+    WebPContext *s = avctx->priv_data;
+    GetByteContext gb;
+    int ret;
+    uint32_t chunk_type, chunk_size;
+
+    bytestream2_init(&gb, data, size);
+
     while (bytestream2_get_bytes_left(&gb) > 8) {
         char chunk_str[5] = { 0 };
 
@@ -1412,6 +1918,10 @@  static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             return AVERROR_INVALIDDATA;
         chunk_size += chunk_size & 1;
 
+        // we need to dive into RIFF chunk
+        if (chunk_type == MKTAG('R', 'I', 'F', 'F'))
+            chunk_size = 4;
+
         if (bytestream2_get_bytes_left(&gb) < chunk_size) {
            /* we seem to be running out of data, but it could also be that the
               bitstream has trailing junk leading to bogus chunk_size. */
@@ -1419,10 +1929,26 @@  static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         }
 
         switch (chunk_type) {
+        case MKTAG('R', 'I', 'F', 'F'):
+            if (bytestream2_get_le32(&gb) != MKTAG('W', 'E', 'B', 'P')) {
+                av_log(avctx, AV_LOG_ERROR, "missing WEBP tag\n");
+                return AVERROR_INVALIDDATA;
+            }
+            s->vp8x_flags    = 0;
+            s->canvas_width  = 0;
+            s->canvas_height = 0;
+            s->has_exif      = 0;
+            s->has_iccp      = 0;
+            ff_thread_release_buffer(avctx, &s->canvas_frame);
+            break;
         case MKTAG('V', 'P', '8', ' '):
             if (!*got_frame) {
-                ret = vp8_lossy_decode_frame(avctx, p, got_frame,
-                                             avpkt->data + bytestream2_tell(&gb),
+                ret = init_canvas_frame(s, AV_PIX_FMT_YUVA420P, key_frame);
+                if (ret < 0)
+                    return ret;
+
+                ret = vp8_lossy_decode_frame(avctx, s->frame, got_frame,
+                                             data + bytestream2_tell(&gb),
                                              chunk_size);
                 if (ret < 0)
                     return ret;
@@ -1431,8 +1957,12 @@  static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             break;
         case MKTAG('V', 'P', '8', 'L'):
             if (!*got_frame) {
-                ret = vp8_lossless_decode_frame(avctx, p, got_frame,
-                                                avpkt->data + bytestream2_tell(&gb),
+                ret = init_canvas_frame(s, AV_PIX_FMT_ARGB, key_frame);
+                if (ret < 0)
+                    return ret;
+
+                ret = vp8_lossless_decode_frame(avctx, s->frame, got_frame,
+                                                data + bytestream2_tell(&gb),
                                                 chunk_size, 0);
                 if (ret < 0)
                     return ret;
@@ -1441,14 +1971,16 @@  static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             bytestream2_skip(&gb, chunk_size);
             break;
         case MKTAG('V', 'P', '8', 'X'):
-            if (s->width || s->height || *got_frame) {
+            if (s->canvas_width || s->canvas_height || *got_frame) {
                 av_log(avctx, AV_LOG_ERROR, "Canvas dimensions are already set\n");
                 return AVERROR_INVALIDDATA;
             }
-            vp8x_flags = bytestream2_get_byte(&gb);
+            s->vp8x_flags = bytestream2_get_byte(&gb);
             bytestream2_skip(&gb, 3);
             s->width  = bytestream2_get_le24(&gb) + 1;
             s->height = bytestream2_get_le24(&gb) + 1;
+            s->canvas_width  = s->width;
+            s->canvas_height = s->height;
             ret = av_image_check_size(s->width, s->height, 0, avctx);
             if (ret < 0)
                 return ret;
@@ -1456,7 +1988,7 @@  static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         case MKTAG('A', 'L', 'P', 'H'): {
             int alpha_header, filter_m, compression;
 
-            if (!(vp8x_flags & VP8X_FLAG_ALPHA)) {
+            if (!(s->vp8x_flags & VP8X_FLAG_ALPHA)) {
                 av_log(avctx, AV_LOG_WARNING,
                        "ALPHA chunk present, but alpha bit not set in the "
                        "VP8X header\n");
@@ -1465,8 +1997,9 @@  static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 av_log(avctx, AV_LOG_ERROR, "invalid ALPHA chunk size\n");
                 return AVERROR_INVALIDDATA;
             }
+
             alpha_header       = bytestream2_get_byte(&gb);
-            s->alpha_data      = avpkt->data + bytestream2_tell(&gb);
+            s->alpha_data      = data + bytestream2_tell(&gb);
             s->alpha_data_size = chunk_size - 1;
             bytestream2_skip(&gb, s->alpha_data_size);
 
@@ -1493,14 +2026,13 @@  static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 av_log(avctx, AV_LOG_VERBOSE, "Ignoring extra EXIF chunk\n");
                 goto exif_end;
             }
-            if (!(vp8x_flags & VP8X_FLAG_EXIF_METADATA))
+            if (!(s->vp8x_flags & VP8X_FLAG_EXIF_METADATA))
                 av_log(avctx, AV_LOG_WARNING,
                        "EXIF chunk present, but Exif bit not set in the "
                        "VP8X header\n");
 
             s->has_exif = 1;
-            bytestream2_init(&exif_gb, avpkt->data + exif_offset,
-                             avpkt->size - exif_offset);
+            bytestream2_init(&exif_gb, data + exif_offset, size - exif_offset);
             if (ff_tdecode_header(&exif_gb, &le, &ifd_offset) < 0) {
                 av_log(avctx, AV_LOG_ERROR, "invalid TIFF header "
                        "in Exif data\n");
@@ -1528,21 +2060,62 @@  exif_end:
                 bytestream2_skip(&gb, chunk_size);
                 break;
             }
-            if (!(vp8x_flags & VP8X_FLAG_ICC))
+            if (!(s->vp8x_flags & VP8X_FLAG_ICC))
                 av_log(avctx, AV_LOG_WARNING,
                        "ICCP chunk present, but ICC Profile bit not set in the "
                        "VP8X header\n");
 
             s->has_iccp = 1;
-            sd = av_frame_new_side_data(p, AV_FRAME_DATA_ICC_PROFILE, chunk_size);
+            sd = av_frame_new_side_data(s->frame, AV_FRAME_DATA_ICC_PROFILE, chunk_size);
             if (!sd)
                 return AVERROR(ENOMEM);
 
             bytestream2_get_buffer(&gb, sd->data, chunk_size);
             break;
         }
-        case MKTAG('A', 'N', 'I', 'M'):
+        case MKTAG('A', 'N', 'I', 'M'): {
+            int a, r, g, b;
+            if (!(s->vp8x_flags & VP8X_FLAG_ANIMATION)) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "ANIM chunk present, but animation bit not set in the "
+                       "VP8X header\n");
+            }
+            // background is stored as BGRA, we need ARGB
+            s->background_argb[3] = b = bytestream2_get_byte(&gb);
+            s->background_argb[2] = g = bytestream2_get_byte(&gb);
+            s->background_argb[1] = r = bytestream2_get_byte(&gb);
+            s->background_argb[0] = a = bytestream2_get_byte(&gb);
+
+            // convert the background color to YUVA
+            s->background_yuva[0] = RGB_TO_Y_CCIR(r, g, b);
+            s->background_yuva[1] = RGB_TO_U_CCIR(r, g, b, 0);
+            s->background_yuva[2] = RGB_TO_V_CCIR(r, g, b, 0);
+            s->background_yuva[3] = a;
+
+            bytestream2_skip(&gb, 2); // loop count is ignored
+            break;
+        }
         case MKTAG('A', 'N', 'M', 'F'):
+            if (!(s->vp8x_flags & VP8X_FLAG_ANIMATION)) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "ANMF chunk present, but animation bit not set in the "
+                       "VP8X header\n");
+            }
+            s->pos_x      = bytestream2_get_le24(&gb) * 2;
+            s->pos_y      = bytestream2_get_le24(&gb) * 2;
+            s->width      = bytestream2_get_le24(&gb) + 1;
+            s->height     = bytestream2_get_le24(&gb) + 1;
+            bytestream2_skip(&gb, 3);   // duration
+            s->anmf_flags = bytestream2_get_byte(&gb);
+
+            if (s->width  + s->pos_x > s->canvas_width ||
+                s->height + s->pos_y > s->canvas_height) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "frame does not fit into canvas\n");
+                return AVERROR_INVALIDDATA;
+            }
+            s->vp8x_flags |= VP8X_FLAG_ANIMATION;
+            break;
         case MKTAG('X', 'M', 'P', ' '):
             AV_WL32(chunk_str, chunk_type);
             av_log(avctx, AV_LOG_WARNING, "skipping unsupported chunk: %s\n",
@@ -1558,31 +2131,130 @@  exif_end:
         }
     }
 
-    if (!*got_frame) {
-        av_log(avctx, AV_LOG_ERROR, "image data not found\n");
-        return AVERROR_INVALIDDATA;
+    return size;
+}
+
+static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
+                             AVPacket *avpkt)
+{
+    AVFrame * const p = data; // output frame
+    WebPContext *s = avctx->priv_data;
+    int ret;
+    int key_frame = avpkt->flags & AV_PKT_FLAG_KEY;
+    // Decodes one packet into p; returns avpkt->size or a negative error from the helpers.
+    *got_frame   = 0;
+
+    if (key_frame) {
+        // The canvas is passed from one thread to another in a sequence
+        // starting with a key frame followed by non-key frames.
+        // The key frame reports progress 1,
+        // the N-th non-key frame awaits progress N = s->await_progress
+        // and reports progress N + 1.
+        s->await_progress = 0;
+    }
+
+    // reset the frame params
+    s->anmf_flags = 0;
+    s->width      = 0;
+    s->height     = 0;
+    s->pos_x      = 0;
+    s->pos_y      = 0;
+    s->has_alpha  = 0;
+
+    ret = decode_frame_common(avctx, avpkt->data, avpkt->size, got_frame, key_frame);
+    if (ret < 0)
+        goto end;
+
+    if (*got_frame) {
+        if (!(s->vp8x_flags & VP8X_FLAG_ANIMATION)) {
+            // no animation, output the decoded frame
+            av_frame_move_ref(p, s->frame);
+        } else {
+            if (!key_frame) {
+                ff_thread_await_progress(&s->canvas_frame, s->await_progress, 0);
+                // the previous frame is done with the canvas: apply its disposal method
+                ret = dispose_prev_frame_in_canvas(s);
+                if (ret < 0)
+                    goto end;
+            }
+
+            ret = blend_frame_into_canvas(s); // draw the decoded frame (s->frame) onto the canvas
+            if (ret < 0)
+                goto end;
+
+            ret = copy_canvas_to_frame(s, p, key_frame); // fill the output frame from the canvas
+            if (ret < 0)
+                goto end;
+            // hand the canvas over to the thread decoding the next frame
+            ff_thread_report_progress(&s->canvas_frame, s->await_progress + 1, 0);
+        }
+
+        p->pts = avpkt->pts;
     }
 
-    return avpkt->size;
+    ret = avpkt->size;
+    // NOTE(review): error paths reach "end" without reporting progress - confirm no thread can wait forever
+end:
+    av_frame_unref(s->frame); // s->frame only holds data while one packet is being decoded
+    return ret;
 }
 
 static av_cold int webp_decode_close(AVCodecContext *avctx)
 {
     WebPContext *s = avctx->priv_data;
 
+    ff_thread_release_buffer(avctx, &s->canvas_frame); // drop the canvas buffer before freeing its AVFrame
+    av_frame_free(&s->canvas_frame.f);                 // the two frames allocated in webp_decode_init()
+    av_frame_free(&s->frame);
+
     if (s->initialized)
         return ff_vp8_decode_free(avctx);
 
     return 0;
 }
 
+#if HAVE_THREADS
+/* Frame-threading hook: hand the canvas and the animation state of src over to dst. */
+static int webp_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
+{
+    WebPContext *from = src->priv_data;
+    WebPContext *to   = dst->priv_data;
+
+    if (dst == src)
+        return 0;
+    // replace dst's canvas reference with a new reference to src's canvas, if src has one
+    ff_thread_release_buffer(dst, &to->canvas_frame);
+    if (from->canvas_frame.f->data[0]) {
+        int ret = ff_thread_ref_frame(&to->canvas_frame, &from->canvas_frame);
+        if (ret < 0)
+            return ret;
+    }
+    to->vp8x_flags    = from->vp8x_flags;
+    to->canvas_width  = from->canvas_width;
+    to->canvas_height = from->canvas_height;
+    memcpy(to->background_argb, from->background_argb, sizeof(from->background_argb));
+    memcpy(to->background_yuva, from->background_yuva, sizeof(from->background_yuva));
+    // the frame parameters of src become the "previous frame" parameters of dst
+    to->prev_anmf_flags = from->anmf_flags;
+    to->prev_width      = from->width;
+    to->prev_height     = from->height;
+    to->prev_pos_x      = from->pos_x;
+    to->prev_pos_y      = from->pos_y;
+    to->await_progress  = from->await_progress + 1;
+    return 0;
+}
+#endif
+
 AVCodec ff_webp_decoder = {
     .name           = "webp",
     .long_name      = NULL_IF_CONFIG_SMALL("WebP image"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_WEBP,
     .priv_data_size = sizeof(WebPContext),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(webp_update_thread_context), // passes the canvas between frame threads
+    .init           = webp_decode_init, // allocates the frames released in webp_decode_close
     .decode         = webp_decode_frame,
     .close          = webp_decode_close,
     .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .caps_internal  = FF_CODEC_CAP_ALLOCATE_PROGRESS, // canvas_frame is used with ff_thread_await/report_progress()
 };
diff --git a/libavcodec/webp.h b/libavcodec/webp.h
new file mode 100644
index 0000000000..ad9f1e23b2
--- /dev/null
+++ b/libavcodec/webp.h
@@ -0,0 +1,44 @@ 
+/*
+ * WebP image format definitions
+ * Copyright (c) 2020 Pexeso Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * WebP image format definitions.
+ */
+
+#ifndef AVCODEC_WEBP_H
+#define AVCODEC_WEBP_H
+
+#define VP8X_FLAG_ANIMATION             0x02 /* VP8X flags byte: file is an animation */
+#define VP8X_FLAG_XMP_METADATA          0x04 /* VP8X flags byte: XMP metadata present */
+#define VP8X_FLAG_EXIF_METADATA         0x08 /* VP8X flags byte: Exif metadata present */
+#define VP8X_FLAG_ALPHA                 0x10 /* VP8X flags byte: image uses alpha */
+#define VP8X_FLAG_ICC                   0x20 /* VP8X flags byte: ICC profile present */
+
+#define ANMF_DISPOSAL_METHOD            0x01 /* mask of the disposal bit in the ANMF flags byte */
+#define ANMF_DISPOSAL_METHOD_UNCHANGED  0x00 /* leave the canvas as it is after the frame */
+#define ANMF_DISPOSAL_METHOD_BACKGROUND 0x01 /* clear the frame rectangle after the frame */
+
+#define ANMF_BLENDING_METHOD            0x02 /* mask of the blending bit in the ANMF flags byte */
+#define ANMF_BLENDING_METHOD_ALPHA      0x00 /* alpha-blend the frame onto the canvas */
+#define ANMF_BLENDING_METHOD_OVERWRITE  0x02 /* overwrite the canvas without blending */
+
+#endif /* AVCODEC_WEBP_H */
diff --git a/libavcodec/webp_parser.c b/libavcodec/webp_parser.c
index fdb7c38350..7fb14b9249 100644
--- a/libavcodec/webp_parser.c
+++ b/libavcodec/webp_parser.c
@@ -25,13 +25,17 @@ 
 
 #include "libavutil/bswap.h"
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 
 #include "parser.h"
 
 typedef struct WebPParseContext {
     ParseContext pc;
+    int frame;                    /* the tag being consumed carries frame data (ANMF, VP8 or VP8L) */
+    int first_frame;              /* the next packet is the first of a file and is marked as key frame */
     uint32_t fsize;
-    uint32_t remaining_size;
+    uint32_t remaining_file_size; /* bytes of the current RIFF file not yet consumed */
+    uint32_t remaining_tag_size;  /* bytes of the current tag not yet consumed */
 } WebPParseContext;
 
 static int webp_parse(AVCodecParserContext *s, AVCodecContext *avctx,
@@ -41,62 +45,102 @@  static int webp_parse(AVCodecParserContext *s, AVCodecContext *avctx,
     WebPParseContext *ctx = s->priv_data;
     uint64_t state = ctx->pc.state64;
     int next = END_NOT_FOUND;
-    int i = 0;
+    int i, len;
 
-    *poutbuf      = NULL;
-    *poutbuf_size = 0;
-
-restart:
-    if (ctx->pc.frame_start_found <= 8) {
-        for (; i < buf_size; i++) {
+    for (i = 0; i < buf_size;) {
+        if (ctx->remaining_tag_size) {
+            /* consuming tag */
+            len = FFMIN(ctx->remaining_tag_size, buf_size - i);
+            i += len;
+            ctx->remaining_tag_size -= len;
+            ctx->remaining_file_size -= len;
+        } else if (ctx->frame) {
+            /* consumed tag containing frame, flush it */
+            next = i;
+            ctx->frame = 0;
+            break;
+        } else {
+            /* scan for the next tag or file */
             state = (state << 8) | buf[i];
-            if (ctx->pc.frame_start_found == 0) {
-                if ((state >> 32) == MKBETAG('R', 'I', 'F', 'F')) {
-                    ctx->fsize = av_bswap32(state);
-                    if (ctx->fsize > 15 && ctx->fsize <= UINT32_MAX - 10) {
-                        ctx->pc.frame_start_found = 1;
-                        ctx->fsize += 8;
+            i++;
+
+            if (!ctx->remaining_file_size) {
+                /* scan for the next file */
+                if (ctx->pc.frame_start_found == 4) {
+                    ctx->pc.frame_start_found = 0;
+                    if ((uint32_t) state == MKBETAG('W', 'E', 'B', 'P')) {
+                        if (i != 12) {
+                            next = i - 12;
+                            state = 0;
+                            break;
+                        }
+                        ctx->remaining_file_size = ctx->fsize - 4;
+                        ctx->first_frame = 1;
+                        continue;
                     }
                 }
-            } else if (ctx->pc.frame_start_found == 8) {
-                if ((state >> 32) != MKBETAG('W', 'E', 'B', 'P')) {
+                if (ctx->pc.frame_start_found == 0) {
+                    if ((state >> 32) == MKBETAG('R', 'I', 'F', 'F')) {
+                        ctx->fsize = av_bswap32(state);
+                        if (ctx->fsize > 15 && ctx->fsize <= UINT32_MAX - 10) {
+                            ctx->fsize += (ctx->fsize & 1);
+                            ctx->pc.frame_start_found = 1;
+                        }
+                    }
+                } else
+                    ctx->pc.frame_start_found++;
+            } else {
+                /* read the next tag */
+                ctx->remaining_file_size--;
+                if (ctx->remaining_file_size == 0) {
                     ctx->pc.frame_start_found = 0;
                     continue;
                 }
                 ctx->pc.frame_start_found++;
-                ctx->remaining_size = ctx->fsize + i - 15;
-                if (ctx->pc.index + i > 15) {
-                    next = i - 15;
-                    state = 0;
-                    break;
-                } else {
-                    ctx->pc.state64 = 0;
-                    goto restart;
+                if (ctx->pc.frame_start_found < 8)
+                    continue;
+
+                switch (state >> 32) {
+                    case MKBETAG('A', 'N', 'M', 'F'):
+                    case MKBETAG('V', 'P', '8', ' '):
+                    case MKBETAG('V', 'P', '8', 'L'):
+                        ctx->frame = 1;
+                        break;
+                    default:
+                        ctx->frame = 0;
+                        break;
                 }
-            } else if (ctx->pc.frame_start_found)
-                ctx->pc.frame_start_found++;
-        }
-        ctx->pc.state64 = state;
-    } else {
-        if (ctx->remaining_size) {
-            i = FFMIN(ctx->remaining_size, buf_size);
-            ctx->remaining_size -= i;
-            if (ctx->remaining_size)
-                goto flush;
 
-            ctx->pc.frame_start_found = 0;
-            goto restart;
+                ctx->remaining_tag_size = av_bswap32(state);
+                ctx->remaining_tag_size += ctx->remaining_tag_size & 1;
+                if (ctx->remaining_tag_size > ctx->remaining_file_size) {
+                    /* this is probably trash at the end of file */
+                    ctx->remaining_tag_size = ctx->remaining_file_size;
+                }
+                ctx->pc.frame_start_found = 0;
+                state = 0;
+            }
         }
     }
+    ctx->pc.state64 = state;
 
-flush:
-    if (ff_combine_frame(&ctx->pc, next, &buf, &buf_size) < 0)
+    if (ff_combine_frame(&ctx->pc, next, &buf, &buf_size) < 0) {
+        *poutbuf      = NULL;
+        *poutbuf_size = 0;
         return buf_size;
+    }
 
-    if (next != END_NOT_FOUND && next < 0)
-        ctx->pc.frame_start_found = FFMAX(ctx->pc.frame_start_found - i - 1, 0);
-    else
-        ctx->pc.frame_start_found = 0;
+    // Extremely simplified key frame detection:
+    // - the first frame (containing headers) is marked as a key frame
+    // - other frames are marked as non-key frames
+    if (ctx->first_frame) {
+        ctx->first_frame = 0;
+        s->pict_type = AV_PICTURE_TYPE_I;
+        s->key_frame = 1;
+    } else {
+        s->pict_type = AV_PICTURE_TYPE_P;
+        s->key_frame = 0;
+    }
 
     *poutbuf      = buf;
     *poutbuf_size = buf_size;