diff mbox series

[FFmpeg-devel,v2] lavfi/qsvvpp: support async depth

Message ID 20210315051036.4767-1-fei.w.wang@intel.com
State New
Headers show
Series [FFmpeg-devel,v2] lavfi/qsvvpp: support async depth
Related show

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Wang, Fei W March 15, 2021, 5:10 a.m. UTC
Async depth allows the QSV filter to cache a few frames and avoids the
forced switch and end of the filter task frame by frame. This change
improves performance in some multi-task cases, for example 1:N
transcoding (decode + vpp + encode) with all QSV plugins.

Signed-off-by: Fei Wang <fei.w.wang@intel.com>
---
Change: combine used and queued into queued in QSVFrame.

 libavfilter/qsvvpp.c             | 153 ++++++++++++++++++-------------
 libavfilter/qsvvpp.h             |  41 ++++++++-
 libavfilter/vf_deinterlace_qsv.c |  14 +--
 libavfilter/vf_vpp_qsv.c         |  75 ++++++++++++---
 4 files changed, 193 insertions(+), 90 deletions(-)

Comments

Linjie Fu March 21, 2021, 10:10 a.m. UTC | #1
Hi Fei,

On Mon, Mar 15, 2021 at 1:13 PM Fei Wang <fei.w.wang@intel.com> wrote:
>
> Async depth will allow qsv filter cache few frames, and avoid force
> switch and end filter task frame by frame. This change will improve
> performance for some multi-task case, for example 1:N transcode(
> decode + vpp + encode) with all QSV plugins.

Async depth support for qsv vpp is valuable for the performance of the
whole qsv pipeline, since both decoding and encoding already support
async_depth.

Hence, would you please elaborate on the details of the performance
improvement for the whole pipeline?
(For example: before/after this patch, cmdline, platform, fps ...)

> Signed-off-by: Fei Wang <fei.w.wang@intel.com>
> ---
> Change: combine used and queued into queued in QSVFrame.
>
>  libavfilter/qsvvpp.c             | 153 ++++++++++++++++++-------------
>  libavfilter/qsvvpp.h             |  41 ++++++++-
>  libavfilter/vf_deinterlace_qsv.c |  14 +--
>  libavfilter/vf_vpp_qsv.c         |  75 ++++++++++++---
>  4 files changed, 193 insertions(+), 90 deletions(-)
>
> diff --git a/libavfilter/qsvvpp.c b/libavfilter/qsvvpp.c
> index f216b3f248..e7c7a12cfa 100644
> --- a/libavfilter/qsvvpp.c
> +++ b/libavfilter/qsvvpp.c
> @@ -27,6 +27,7 @@
>  #include "libavutil/hwcontext_qsv.h"
>  #include "libavutil/time.h"
>  #include "libavutil/pixdesc.h"
> +#include "libavutil/fifo.h"

This seems to be redundant, since you're adding fifo.h in qsvvpp.h as well.

>  #include "internal.h"
>  #include "qsvvpp.h"
> @@ -37,37 +38,6 @@
>  #define IS_OPAQUE_MEMORY(mode) (mode & MFX_MEMTYPE_OPAQUE_FRAME)
>  #define IS_SYSTEM_MEMORY(mode) (mode & MFX_MEMTYPE_SYSTEM_MEMORY)
>
> -typedef struct QSVFrame {
> -    AVFrame          *frame;
> -    mfxFrameSurface1 *surface;
> -    mfxFrameSurface1  surface_internal;  /* for system memory */
> -    struct QSVFrame  *next;
> -} QSVFrame;
> -
> -/* abstract struct for all QSV filters */
> -struct QSVVPPContext {
> -    mfxSession          session;
> -    int (*filter_frame) (AVFilterLink *outlink, AVFrame *frame);/* callback */
> -    enum AVPixelFormat  out_sw_format;   /* Real output format */
> -    mfxVideoParam       vpp_param;
> -    mfxFrameInfo       *frame_infos;     /* frame info for each input */
> -
> -    /* members related to the input/output surface */
> -    int                 in_mem_mode;
> -    int                 out_mem_mode;
> -    QSVFrame           *in_frame_list;
> -    QSVFrame           *out_frame_list;
> -    int                 nb_surface_ptrs_in;
> -    int                 nb_surface_ptrs_out;
> -    mfxFrameSurface1  **surface_ptrs_in;
> -    mfxFrameSurface1  **surface_ptrs_out;
> -
> -    /* MFXVPP extern parameters */
> -    mfxExtOpaqueSurfaceAlloc opaque_alloc;
> -    mfxExtBuffer      **ext_buffers;
> -    int                 nb_ext_buffers;
> -};
> -
>  static const mfxHandleType handle_types[] = {
>      MFX_HANDLE_VA_DISPLAY,
>      MFX_HANDLE_D3D9_DEVICE_MANAGER,
> @@ -336,9 +306,11 @@ static int fill_frameinfo_by_link(mfxFrameInfo *frameinfo, AVFilterLink *link)
>  static void clear_unused_frames(QSVFrame *list)
>  {
>      while (list) {
> -        if (list->surface && !list->surface->Data.Locked) {
> -            list->surface = NULL;
> +        /* list->queued==1 means the frame is not cached in VPP
> +         * process any more, it can be released to pool. */
> +        if ((list->queued == 1) && !list->surface.Data.Locked) {
>              av_frame_free(&list->frame);
> +            list->queued = 0;
>          }
>          list = list->next;
>      }
> @@ -361,8 +333,10 @@ static QSVFrame *get_free_frame(QSVFrame **list)
>      QSVFrame *out = *list;
>
>      for (; out; out = out->next) {
> -        if (!out->surface)
> +        if (!out->queued) {
> +            out->queued = 1;
>              break;
> +        }
>      }
>
>      if (!out) {
> @@ -371,8 +345,9 @@ static QSVFrame *get_free_frame(QSVFrame **list)
>              av_log(NULL, AV_LOG_ERROR, "Can't alloc new output frame.\n");
>              return NULL;
>          }
> -        out->next  = *list;
> -        *list      = out;
> +        out->queued = 1;
> +        out->next   = *list;
> +        *list       = out;
>      }
>
>      return out;
> @@ -402,7 +377,7 @@ static QSVFrame *submit_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *p
>              return NULL;
>          }
>          qsv_frame->frame   = av_frame_clone(picref);
> -        qsv_frame->surface = (mfxFrameSurface1 *)qsv_frame->frame->data[3];
> +        qsv_frame->surface = *(mfxFrameSurface1 *)qsv_frame->frame->data[3];

The type of surface in struct QSVFrame would be changed from
mfxFrameSurface1 * to mfxFrameSurface1, and surface_internal would be
removed.
IMO separating the related structure changes into a separate
commit would make this more explicit, since they are not closely related
to the implementation of the async fifo.

- linjie
Wang, Fei W March 24, 2021, 7:58 a.m. UTC | #2
On Sun, 2021-03-21 at 18:10 +0800, Linjie Fu wrote:
> Hi Fei,
> 
> On Mon, Mar 15, 2021 at 1:13 PM Fei Wang <fei.w.wang@intel.com>
> wrote:
> > 
> > Async depth will allow qsv filter cache few frames, and avoid force
> > switch and end filter task frame by frame. This change will improve
> > performance for some multi-task case, for example 1:N transcode(
> > decode + vpp + encode) with all QSV plugins.
> 
> Async depth support for qsv vpp is valuable for the performance of
> whole qsv pipeline, since both decoding/encoding have already
> supported the async_depth.
> 
> Hence, would you please help to elaborate more about the details
> about
> the performance improvement for the whole pipeline?
> (For examples,  before/after this patch, cmdline, platform and the
> fps ...)

Will add some data in my next version.
> 
> > Signed-off-by: Fei Wang <fei.w.wang@intel.com>
> > ---
> > Change: combine used and queued into queued in QSVFrame.
> > 
> >  libavfilter/qsvvpp.c             | 153 ++++++++++++++++++---------
> > ----
> >  libavfilter/qsvvpp.h             |  41 ++++++++-
> >  libavfilter/vf_deinterlace_qsv.c |  14 +--
> >  libavfilter/vf_vpp_qsv.c         |  75 ++++++++++++---
> >  4 files changed, 193 insertions(+), 90 deletions(-)
> > 
> > diff --git a/libavfilter/qsvvpp.c b/libavfilter/qsvvpp.c
> > index f216b3f248..e7c7a12cfa 100644
> > --- a/libavfilter/qsvvpp.c
> > +++ b/libavfilter/qsvvpp.c
> > @@ -27,6 +27,7 @@
> >  #include "libavutil/hwcontext_qsv.h"
> >  #include "libavutil/time.h"
> >  #include "libavutil/pixdesc.h"
> > +#include "libavutil/fifo.h"
> 
> This seems to be redundant, since you're adding fifo.h in qsvvpp.h as
> well.

Thanks, will remove this line.
> 
> >  #include "internal.h"
> >  #include "qsvvpp.h"
> > @@ -37,37 +38,6 @@
> >  #define IS_OPAQUE_MEMORY(mode) (mode & MFX_MEMTYPE_OPAQUE_FRAME)
> >  #define IS_SYSTEM_MEMORY(mode) (mode & MFX_MEMTYPE_SYSTEM_MEMORY)
> > 
> > -typedef struct QSVFrame {
> > -    AVFrame          *frame;
> > -    mfxFrameSurface1 *surface;
> > -    mfxFrameSurface1  surface_internal;  /* for system memory */
> > -    struct QSVFrame  *next;
> > -} QSVFrame;
> > -
> > -/* abstract struct for all QSV filters */
> > -struct QSVVPPContext {
> > -    mfxSession          session;
> > -    int (*filter_frame) (AVFilterLink *outlink, AVFrame *frame);/*
> > callback */
> > -    enum AVPixelFormat  out_sw_format;   /* Real output format */
> > -    mfxVideoParam       vpp_param;
> > -    mfxFrameInfo       *frame_infos;     /* frame info for each
> > input */
> > -
> > -    /* members related to the input/output surface */
> > -    int                 in_mem_mode;
> > -    int                 out_mem_mode;
> > -    QSVFrame           *in_frame_list;
> > -    QSVFrame           *out_frame_list;
> > -    int                 nb_surface_ptrs_in;
> > -    int                 nb_surface_ptrs_out;
> > -    mfxFrameSurface1  **surface_ptrs_in;
> > -    mfxFrameSurface1  **surface_ptrs_out;
> > -
> > -    /* MFXVPP extern parameters */
> > -    mfxExtOpaqueSurfaceAlloc opaque_alloc;
> > -    mfxExtBuffer      **ext_buffers;
> > -    int                 nb_ext_buffers;
> > -};
> > -
> >  static const mfxHandleType handle_types[] = {
> >      MFX_HANDLE_VA_DISPLAY,
> >      MFX_HANDLE_D3D9_DEVICE_MANAGER,
> > @@ -336,9 +306,11 @@ static int fill_frameinfo_by_link(mfxFrameInfo
> > *frameinfo, AVFilterLink *link)
> >  static void clear_unused_frames(QSVFrame *list)
> >  {
> >      while (list) {
> > -        if (list->surface && !list->surface->Data.Locked) {
> > -            list->surface = NULL;
> > +        /* list->queued==1 means the frame is not cached in VPP
> > +         * process any more, it can be released to pool. */
> > +        if ((list->queued == 1) && !list->surface.Data.Locked) {
> >              av_frame_free(&list->frame);
> > +            list->queued = 0;
> >          }
> >          list = list->next;
> >      }
> > @@ -361,8 +333,10 @@ static QSVFrame *get_free_frame(QSVFrame
> > **list)
> >      QSVFrame *out = *list;
> > 
> >      for (; out; out = out->next) {
> > -        if (!out->surface)
> > +        if (!out->queued) {
> > +            out->queued = 1;
> >              break;
> > +        }
> >      }
> > 
> >      if (!out) {
> > @@ -371,8 +345,9 @@ static QSVFrame *get_free_frame(QSVFrame
> > **list)
> >              av_log(NULL, AV_LOG_ERROR, "Can't alloc new output
> > frame.\n");
> >              return NULL;
> >          }
> > -        out->next  = *list;
> > -        *list      = out;
> > +        out->queued = 1;
> > +        out->next   = *list;
> > +        *list       = out;
> >      }
> > 
> >      return out;
> > @@ -402,7 +377,7 @@ static QSVFrame *submit_frame(QSVVPPContext *s,
> > AVFilterLink *inlink, AVFrame *p
> >              return NULL;
> >          }
> >          qsv_frame->frame   = av_frame_clone(picref);
> > -        qsv_frame->surface = (mfxFrameSurface1 *)qsv_frame->frame-
> > >data[3];
> > +        qsv_frame->surface = *(mfxFrameSurface1 *)qsv_frame-
> > >frame->data[3];
> 
> The type of surface in struct QSVFrame  would be changed fron
> *mfxFrameSurface1 to mfxFrameSurface1, and surface_internal would be
> removed.
> IMO separating the related changes for the structures into a single
> commit would make it more explicit, since it's not closely related
> with the implemetation of async fifo.

Not exactly. If we keep using the previous mfxFrameSurface1 *surface in
QSVFrame here, the input surface will point to the same surface address
that was created and pushed to VPP by the QSV decoder, which means any
change made to the surface in VPP is reflected back to the decoder. Once
the async depth part is added to VPP, the surface is handed to MSDK and
surface.Data.Locked is set to 1 (or the reference count is incremented).
The flag is set back to 0 (or the reference count is decremented) only
after MFXVideoCORE_SyncOperation is called, N (async_depth) frames later.
If you check the decoder, a frame can be released only when
cur->surface.Data.Locked == 0. So the surface will not be released by the
decoder in time, and the decoder will not be able to find an available
frame. So the best way is to define surface as mfxFrameSurface1, which is
the same as what the QSV encoder does.


> 
> - linjie
diff mbox series

Patch

diff --git a/libavfilter/qsvvpp.c b/libavfilter/qsvvpp.c
index f216b3f248..e7c7a12cfa 100644
--- a/libavfilter/qsvvpp.c
+++ b/libavfilter/qsvvpp.c
@@ -27,6 +27,7 @@ 
 #include "libavutil/hwcontext_qsv.h"
 #include "libavutil/time.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/fifo.h"
 
 #include "internal.h"
 #include "qsvvpp.h"
@@ -37,37 +38,6 @@ 
 #define IS_OPAQUE_MEMORY(mode) (mode & MFX_MEMTYPE_OPAQUE_FRAME)
 #define IS_SYSTEM_MEMORY(mode) (mode & MFX_MEMTYPE_SYSTEM_MEMORY)
 
-typedef struct QSVFrame {
-    AVFrame          *frame;
-    mfxFrameSurface1 *surface;
-    mfxFrameSurface1  surface_internal;  /* for system memory */
-    struct QSVFrame  *next;
-} QSVFrame;
-
-/* abstract struct for all QSV filters */
-struct QSVVPPContext {
-    mfxSession          session;
-    int (*filter_frame) (AVFilterLink *outlink, AVFrame *frame);/* callback */
-    enum AVPixelFormat  out_sw_format;   /* Real output format */
-    mfxVideoParam       vpp_param;
-    mfxFrameInfo       *frame_infos;     /* frame info for each input */
-
-    /* members related to the input/output surface */
-    int                 in_mem_mode;
-    int                 out_mem_mode;
-    QSVFrame           *in_frame_list;
-    QSVFrame           *out_frame_list;
-    int                 nb_surface_ptrs_in;
-    int                 nb_surface_ptrs_out;
-    mfxFrameSurface1  **surface_ptrs_in;
-    mfxFrameSurface1  **surface_ptrs_out;
-
-    /* MFXVPP extern parameters */
-    mfxExtOpaqueSurfaceAlloc opaque_alloc;
-    mfxExtBuffer      **ext_buffers;
-    int                 nb_ext_buffers;
-};
-
 static const mfxHandleType handle_types[] = {
     MFX_HANDLE_VA_DISPLAY,
     MFX_HANDLE_D3D9_DEVICE_MANAGER,
@@ -336,9 +306,11 @@  static int fill_frameinfo_by_link(mfxFrameInfo *frameinfo, AVFilterLink *link)
 static void clear_unused_frames(QSVFrame *list)
 {
     while (list) {
-        if (list->surface && !list->surface->Data.Locked) {
-            list->surface = NULL;
+        /* list->queued==1 means the frame is not cached in VPP
+         * process any more, it can be released to pool. */
+        if ((list->queued == 1) && !list->surface.Data.Locked) {
             av_frame_free(&list->frame);
+            list->queued = 0;
         }
         list = list->next;
     }
@@ -361,8 +333,10 @@  static QSVFrame *get_free_frame(QSVFrame **list)
     QSVFrame *out = *list;
 
     for (; out; out = out->next) {
-        if (!out->surface)
+        if (!out->queued) {
+            out->queued = 1;
             break;
+        }
     }
 
     if (!out) {
@@ -371,8 +345,9 @@  static QSVFrame *get_free_frame(QSVFrame **list)
             av_log(NULL, AV_LOG_ERROR, "Can't alloc new output frame.\n");
             return NULL;
         }
-        out->next  = *list;
-        *list      = out;
+        out->queued = 1;
+        out->next   = *list;
+        *list       = out;
     }
 
     return out;
@@ -402,7 +377,7 @@  static QSVFrame *submit_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *p
             return NULL;
         }
         qsv_frame->frame   = av_frame_clone(picref);
-        qsv_frame->surface = (mfxFrameSurface1 *)qsv_frame->frame->data[3];
+        qsv_frame->surface = *(mfxFrameSurface1 *)qsv_frame->frame->data[3];
     } else {
         /* make a copy if the input is not padded as libmfx requires */
         if (picref->height & 31 || picref->linesize[0] & 31) {
@@ -425,27 +400,26 @@  static QSVFrame *submit_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *p
             qsv_frame->frame = av_frame_clone(picref);
 
         if (map_frame_to_surface(qsv_frame->frame,
-                                &qsv_frame->surface_internal) < 0) {
+                                 &qsv_frame->surface) < 0) {
             av_log(ctx, AV_LOG_ERROR, "Unsupported frame.\n");
             return NULL;
         }
-        qsv_frame->surface = &qsv_frame->surface_internal;
     }
 
-    qsv_frame->surface->Info           = s->frame_infos[FF_INLINK_IDX(inlink)];
-    qsv_frame->surface->Data.TimeStamp = av_rescale_q(qsv_frame->frame->pts,
+    qsv_frame->surface.Info           = s->frame_infos[FF_INLINK_IDX(inlink)];
+    qsv_frame->surface.Data.TimeStamp = av_rescale_q(qsv_frame->frame->pts,
                                                       inlink->time_base, default_tb);
 
-    qsv_frame->surface->Info.PicStruct =
+    qsv_frame->surface.Info.PicStruct =
             !qsv_frame->frame->interlaced_frame ? MFX_PICSTRUCT_PROGRESSIVE :
             (qsv_frame->frame->top_field_first ? MFX_PICSTRUCT_FIELD_TFF :
                                                  MFX_PICSTRUCT_FIELD_BFF);
     if (qsv_frame->frame->repeat_pict == 1)
-        qsv_frame->surface->Info.PicStruct |= MFX_PICSTRUCT_FIELD_REPEATED;
+        qsv_frame->surface.Info.PicStruct |= MFX_PICSTRUCT_FIELD_REPEATED;
     else if (qsv_frame->frame->repeat_pict == 2)
-        qsv_frame->surface->Info.PicStruct |= MFX_PICSTRUCT_FRAME_DOUBLING;
+        qsv_frame->surface.Info.PicStruct |= MFX_PICSTRUCT_FRAME_DOUBLING;
     else if (qsv_frame->frame->repeat_pict == 4)
-        qsv_frame->surface->Info.PicStruct |= MFX_PICSTRUCT_FRAME_TRIPLING;
+        qsv_frame->surface.Info.PicStruct |= MFX_PICSTRUCT_FRAME_TRIPLING;
 
     return qsv_frame;
 }
@@ -476,7 +450,7 @@  static QSVFrame *query_frame(QSVVPPContext *s, AVFilterLink *outlink)
             return NULL;
         }
 
-        out_frame->surface = (mfxFrameSurface1 *)out_frame->frame->data[3];
+        out_frame->surface = *(mfxFrameSurface1 *)out_frame->frame->data[3];
     } else {
         /* Get a frame with aligned dimensions.
          * Libmfx need system memory being 128x64 aligned */
@@ -490,14 +464,12 @@  static QSVFrame *query_frame(QSVVPPContext *s, AVFilterLink *outlink)
         out_frame->frame->height = outlink->h;
 
         ret = map_frame_to_surface(out_frame->frame,
-                                  &out_frame->surface_internal);
+                                   &out_frame->surface);
         if (ret < 0)
             return NULL;
-
-        out_frame->surface = &out_frame->surface_internal;
     }
 
-    out_frame->surface->Info = s->vpp_param.vpp.Out;
+    out_frame->surface.Info = s->vpp_param.vpp.Out;
 
     return out_frame;
 }
@@ -666,6 +638,16 @@  static int init_vpp_session(AVFilterContext *avctx, QSVVPPContext *s)
     return 0;
 }
 
+static unsigned int qsv_fifo_item_size(void)
+{
+    return sizeof(mfxSyncPoint*) + sizeof(QSVFrame*);
+}
+
+static unsigned int qsv_fifo_size(const AVFifoBuffer* fifo)
+{
+    return  av_fifo_size(fifo)/qsv_fifo_item_size();
+}
+
 int ff_qsvvpp_create(AVFilterContext *avctx, QSVVPPContext **vpp, QSVVPPParam *param)
 {
     int i;
@@ -738,7 +720,17 @@  int ff_qsvvpp_create(AVFilterContext *avctx, QSVVPPContext **vpp, QSVVPPParam *p
         s->vpp_param.ExtParam    = param->ext_buf;
     }
 
-    s->vpp_param.AsyncDepth = 1;
+    s->got_frame = 0;
+
+    /** keep fifo size at least 1. Even when async_depth is 0, fifo is used. */
+    s->async_fifo  = av_fifo_alloc((param->async_depth + 1) * qsv_fifo_item_size());
+    s->async_depth = param->async_depth;
+    if (!s->async_fifo) {
+        ret = AVERROR(ENOMEM);
+        goto failed;
+    }
+
+    s->vpp_param.AsyncDepth = param->async_depth;
 
     if (IS_SYSTEM_MEMORY(s->in_mem_mode))
         s->vpp_param.IOPattern |= MFX_IOPATTERN_IN_SYSTEM_MEMORY;
@@ -793,6 +785,7 @@  int ff_qsvvpp_free(QSVVPPContext **vpp)
     av_freep(&s->surface_ptrs_out);
     av_freep(&s->ext_buffers);
     av_freep(&s->frame_infos);
+    av_fifo_free(s->async_fifo);
     av_freep(vpp);
 
     return 0;
@@ -803,9 +796,29 @@  int ff_qsvvpp_filter_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *picr
     AVFilterContext  *ctx     = inlink->dst;
     AVFilterLink     *outlink = ctx->outputs[0];
     mfxSyncPoint      sync;
-    QSVFrame         *in_frame, *out_frame;
+    QSVFrame         *in_frame, *out_frame, *tmp;
     int               ret, filter_ret;
 
+    while (s->eof && qsv_fifo_size(s->async_fifo)) {
+        av_fifo_generic_read(s->async_fifo, &tmp, sizeof(tmp), NULL);
+        av_fifo_generic_read(s->async_fifo, &sync, sizeof(sync), NULL);
+        if (MFXVideoCORE_SyncOperation(s->session, sync, 1000) < 0)
+            av_log(ctx, AV_LOG_WARNING, "Sync failed.\n");
+
+        filter_ret = s->filter_frame(outlink, tmp->frame);
+        if (filter_ret < 0) {
+            av_frame_free(&tmp->frame);
+            ret = filter_ret;
+            break;
+        }
+        tmp->queued--;
+        s->got_frame = 1;
+        tmp->frame = NULL;
+    };
+
+    if (!picref)
+        return 0;
+
     in_frame = submit_frame(s, inlink, picref);
     if (!in_frame) {
         av_log(ctx, AV_LOG_ERROR, "Failed to submit frame on input[%d]\n",
@@ -821,8 +834,8 @@  int ff_qsvvpp_filter_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *picr
         }
 
         do {
-            ret = MFXVideoVPP_RunFrameVPPAsync(s->session, in_frame->surface,
-                                               out_frame->surface, NULL, &sync);
+            ret = MFXVideoVPP_RunFrameVPPAsync(s->session, &in_frame->surface,
+                                               &out_frame->surface, NULL, &sync);
             if (ret == MFX_WRN_DEVICE_BUSY)
                 av_usleep(500);
         } while (ret == MFX_WRN_DEVICE_BUSY);
@@ -833,20 +846,32 @@  int ff_qsvvpp_filter_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *picr
                 ret = AVERROR(EAGAIN);
             break;
         }
+        out_frame->frame->pts = av_rescale_q(out_frame->surface.Data.TimeStamp,
+                                             default_tb, outlink->time_base);
 
-        if (MFXVideoCORE_SyncOperation(s->session, sync, 1000) < 0)
-            av_log(ctx, AV_LOG_WARNING, "Sync failed.\n");
+        out_frame->queued++;
+        av_fifo_generic_write(s->async_fifo, &out_frame, sizeof(out_frame), NULL);
+        av_fifo_generic_write(s->async_fifo, &sync, sizeof(sync), NULL);
 
-        out_frame->frame->pts = av_rescale_q(out_frame->surface->Data.TimeStamp,
-                                             default_tb, outlink->time_base);
 
-        filter_ret = s->filter_frame(outlink, out_frame->frame);
-        if (filter_ret < 0) {
-            av_frame_free(&out_frame->frame);
-            ret = filter_ret;
-            break;
+        if (qsv_fifo_size(s->async_fifo) > s->async_depth) {
+            av_fifo_generic_read(s->async_fifo, &tmp, sizeof(tmp), NULL);
+            av_fifo_generic_read(s->async_fifo, &sync, sizeof(sync), NULL);
+
+            if (MFXVideoCORE_SyncOperation(s->session, sync, 1000) < 0)
+                av_log(ctx, AV_LOG_WARNING, "Sync failed.\n");
+
+            filter_ret = s->filter_frame(outlink, tmp->frame);
+            if (filter_ret < 0) {
+                av_frame_free(&tmp->frame);
+                ret = filter_ret;
+                break;
+            }
+
+            tmp->queued--;
+            s->got_frame = 1;
+            tmp->frame = NULL;
         }
-        out_frame->frame = NULL;
     } while(ret == MFX_ERR_MORE_SURFACE);
 
     return ret;
diff --git a/libavfilter/qsvvpp.h b/libavfilter/qsvvpp.h
index b4baeedf9e..26be0d8ea8 100644
--- a/libavfilter/qsvvpp.h
+++ b/libavfilter/qsvvpp.h
@@ -27,6 +27,7 @@ 
 #include <mfx/mfxvideo.h>
 
 #include "avfilter.h"
+#include "libavutil/fifo.h"
 
 #define FF_INLINK_IDX(link)  ((int)((link)->dstpad - (link)->dst->input_pads))
 #define FF_OUTLINK_IDX(link) ((int)((link)->srcpad - (link)->src->output_pads))
@@ -39,7 +40,43 @@ 
     ((MFX_VERSION.Major > (MAJOR)) ||                           \
     (MFX_VERSION.Major == (MAJOR) && MFX_VERSION.Minor >= (MINOR)))
 
-typedef struct QSVVPPContext QSVVPPContext;
+#define VPP_ASYNC_DEPTH_DEFAULT 1
+
+typedef struct QSVFrame {
+    AVFrame          *frame;
+    mfxFrameSurface1 surface;
+    struct QSVFrame  *next;
+    int queued;
+} QSVFrame;
+
+typedef struct QSVVPPContext {
+    mfxSession          session;
+    int (*filter_frame) (AVFilterLink *outlink, AVFrame *frame); /**< callback */
+    enum AVPixelFormat  out_sw_format;   /**< Real output format */
+    mfxVideoParam       vpp_param;
+    mfxFrameInfo       *frame_infos;     /**< frame info for each input */
+
+    /** members related to the input/output surface */
+    int                 in_mem_mode;
+    int                 out_mem_mode;
+    QSVFrame           *in_frame_list;
+    QSVFrame           *out_frame_list;
+    int                 nb_surface_ptrs_in;
+    int                 nb_surface_ptrs_out;
+    mfxFrameSurface1  **surface_ptrs_in;
+    mfxFrameSurface1  **surface_ptrs_out;
+
+    /** MFXVPP extern parameters */
+    mfxExtOpaqueSurfaceAlloc opaque_alloc;
+    mfxExtBuffer      **ext_buffers;
+    int                 nb_ext_buffers;
+
+    int got_frame;
+    int async_depth;
+    int eof;
+    /** order with frame_out, sync */
+    AVFifoBuffer *async_fifo;
+} QSVVPPContext;
 
 typedef struct QSVVPPCrop {
     int in_idx;        ///< Input index
@@ -60,6 +97,8 @@  typedef struct QSVVPPParam {
     /* Crop information for each input, if needed */
     int num_crop;
     QSVVPPCrop *crop;
+
+   int async_depth;
 } QSVVPPParam;
 
 /* create and initialize the QSV session */
diff --git a/libavfilter/vf_deinterlace_qsv.c b/libavfilter/vf_deinterlace_qsv.c
index 89a282f99e..34feb616ab 100644
--- a/libavfilter/vf_deinterlace_qsv.c
+++ b/libavfilter/vf_deinterlace_qsv.c
@@ -47,14 +47,6 @@  enum {
     QSVDEINT_MORE_INPUT,
 };
 
-typedef struct QSVFrame {
-    AVFrame *frame;
-    mfxFrameSurface1 surface;
-    int used;
-
-    struct QSVFrame *next;
-} QSVFrame;
-
 typedef struct QSVDeintContext {
     const AVClass *class;
 
@@ -376,7 +368,7 @@  static void clear_unused_frames(QSVDeintContext *s)
     while (cur) {
         if (!cur->surface.Data.Locked) {
             av_frame_free(&cur->frame);
-            cur->used = 0;
+            cur->queued = 0;
         }
         cur = cur->next;
     }
@@ -391,7 +383,7 @@  static int get_free_frame(QSVDeintContext *s, QSVFrame **f)
     frame = s->work_frames;
     last  = &s->work_frames;
     while (frame) {
-        if (!frame->used) {
+        if (!frame->queued) {
             *f = frame;
             return 0;
         }
@@ -453,7 +445,7 @@  static int submit_frame(AVFilterContext *ctx, AVFrame *frame,
                                               (AVRational){1, 90000});
 
     *surface = &qf->surface;
-    qf->used = 1;
+    qf->queued = 1;
 
     return 0;
 }
diff --git a/libavfilter/vf_vpp_qsv.c b/libavfilter/vf_vpp_qsv.c
index 5d57707455..83bdf1276c 100644
--- a/libavfilter/vf_vpp_qsv.c
+++ b/libavfilter/vf_vpp_qsv.c
@@ -32,6 +32,7 @@ 
 #include "formats.h"
 #include "internal.h"
 #include "avfilter.h"
+#include "filters.h"
 #include "libavcodec/avcodec.h"
 #include "libavformat/avformat.h"
 
@@ -93,6 +94,9 @@  typedef struct VPPContext{
     char *cx, *cy, *cw, *ch;
     char *ow, *oh;
     char *output_format_str;
+
+    int async_depth;
+    int eof;
 } VPPContext;
 
 static const AVOption options[] = {
@@ -128,6 +132,7 @@  static const AVOption options[] = {
     { "h",      "Output video height", OFFSET(oh), AV_OPT_TYPE_STRING, { .str="w*ch/cw" }, 0, 255, .flags = FLAGS },
     { "height", "Output video height", OFFSET(oh), AV_OPT_TYPE_STRING, { .str="w*ch/cw" }, 0, 255, .flags = FLAGS },
     { "format", "Output pixel format", OFFSET(output_format_str), AV_OPT_TYPE_STRING, { .str = "same" }, .flags = FLAGS },
+    { "async_depth", "Internal parallelization depth, the higher the value the higher the latency.", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = VPP_ASYNC_DEPTH_DEFAULT }, 0, INT_MAX, .flags = FLAGS },
 
     { NULL }
 };
@@ -303,6 +308,7 @@  static int config_output(AVFilterLink *outlink)
     param.filter_frame  = NULL;
     param.num_ext_buf   = 0;
     param.ext_buf       = ext_buf;
+    param.async_depth   = vpp->async_depth;
 
     if (inlink->format == AV_PIX_FMT_QSV) {
          if (!inlink->hw_frames_ctx || !inlink->hw_frames_ctx->data)
@@ -467,23 +473,64 @@  static int config_output(AVFilterLink *outlink)
     return 0;
 }
 
-static int filter_frame(AVFilterLink *inlink, AVFrame *picref)
+static int activate(AVFilterContext *ctx)
 {
-    int              ret = 0;
-    AVFilterContext  *ctx = inlink->dst;
-    VPPContext       *vpp = inlink->dst->priv;
-    AVFilterLink     *outlink = ctx->outputs[0];
-
-    if (vpp->qsv) {
-        ret = ff_qsvvpp_filter_frame(vpp->qsv, inlink, picref);
-        av_frame_free(&picref);
+    AVFilterLink *inlink = ctx->inputs[0];
+    AVFilterLink *outlink = ctx->outputs[0];
+    VPPContext *s =ctx->priv;
+    QSVVPPContext *qsv = s->qsv;
+    AVFrame *in = NULL;
+    int ret, status;
+    int64_t pts;
+
+    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
+
+    if (!s->eof) {
+        ret = ff_inlink_consume_frame(inlink, &in);
+        if (ret < 0)
+            return ret;
+
+        if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
+            if (status == AVERROR_EOF) {
+                s->eof = 1;
+            }
+        }
+    }
+
+    if (qsv) {
+        if (in || s->eof) {
+            qsv->eof = s->eof;
+            ret = ff_qsvvpp_filter_frame(qsv, inlink, in);
+            av_frame_free(&in);
+
+            if (s->eof) {
+                ff_outlink_set_status(outlink, status, pts);
+                return 0;
+            }
+
+            if (qsv->got_frame) {
+                qsv->got_frame = 0;
+                return ret;
+            }
+        }
     } else {
-        if (picref->pts != AV_NOPTS_VALUE)
-            picref->pts = av_rescale_q(picref->pts, inlink->time_base, outlink->time_base);
-        ret = ff_filter_frame(outlink, picref);
+        if (in) {
+            if (in->pts != AV_NOPTS_VALUE)
+                in->pts = av_rescale_q(in->pts, inlink->time_base, outlink->time_base);
+
+            ret = ff_filter_frame(outlink, in);
+            return ret;
+        }
     }
 
-    return ret;
+    if (s->eof) {
+        ff_outlink_set_status(outlink, status, pts);
+        return 0;
+    } else {
+        FF_FILTER_FORWARD_WANTED(outlink, inlink);
+    }
+
+    return FFERROR_NOT_READY;
 }
 
 static int query_formats(AVFilterContext *ctx)
@@ -531,7 +578,6 @@  static const AVFilterPad vpp_inputs[] = {
         .name          = "default",
         .type          = AVMEDIA_TYPE_VIDEO,
         .config_props  = config_input,
-        .filter_frame  = filter_frame,
     },
     { NULL }
 };
@@ -554,6 +600,7 @@  AVFilter ff_vf_vpp_qsv = {
     .uninit        = vpp_uninit,
     .inputs        = vpp_inputs,
     .outputs       = vpp_outputs,
+    .activate      = activate,
     .priv_class    = &vpp_class,
     .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
 };