diff mbox series

[FFmpeg-devel,3/3] libavcodec/vaapi_encode: Add async_depth to vaapi_encoder to increase performance

Message ID 20211027085705.4114165-3-wenbin.chen@intel.com
State New
Headers show
Series [FFmpeg-devel,1/3] libavcodec/vaapi_encode: Change the way to call async to increase performance | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Wenbin Chen Oct. 27, 2021, 8:57 a.m. UTC
Add async_depth to increase the encoder's performance. Reuse encode_fifo
as the async buffer. The encoder submits all reordered frames to the
hardware and then checks the fifo size. If the fifo holds fewer than
async_depth frames and the frame at the head of the fifo is not ready,
it returns AVERROR(EAGAIN) to request more frames.

In my environment, 1080p transcoding (no B-frames) with -async_depth=4
improves performance by 20%.
Asynchronous operation increases performance but also introduces frame
delay.

Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
---
 libavcodec/vaapi_encode.c | 20 +++++++++++++++-----
 libavcodec/vaapi_encode.h | 12 ++++++++++--
 2 files changed, 25 insertions(+), 7 deletions(-)

Comments

Wenbin Chen Nov. 1, 2021, 2:14 a.m. UTC | #1
> Add async_depth to increase encoder's performance. Reuse encode_fifo as
> async buffer. Encoder puts all reordered frame to HW and then check
> fifo size. If fifo < async_depth and the top frame is not ready, it will
> return AVERROR(EAGAIN) to require more frames.
> 
> 1080p transcoding (no B frames) with -async_depth=4 can increase 20%
> performance on my environment.
> The async increases performance but also introduces frame delay.
> 
> Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
> ---
>  libavcodec/vaapi_encode.c | 20 +++++++++++++++-----
>  libavcodec/vaapi_encode.h | 12 ++++++++++--
>  2 files changed, 25 insertions(+), 7 deletions(-)
> 
> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> index db0ae136a1..616fb7c089 100644
> --- a/libavcodec/vaapi_encode.c
> +++ b/libavcodec/vaapi_encode.c
> @@ -1158,7 +1158,8 @@ static int
> vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame)
>          if (ctx->input_order == ctx->decode_delay)
>              ctx->dts_pts_diff = pic->pts - ctx->first_pts;
>          if (ctx->output_delay > 0)
> -            ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic->pts;
> +            ctx->ts_ring[ctx->input_order %
> +                        (3 * ctx->output_delay + ctx->async_depth)] = pic->pts;
> 
>          pic->display_order = ctx->input_order;
>          ++ctx->input_order;
> @@ -1212,7 +1213,8 @@ int
> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
>              return AVERROR(EAGAIN);
>      }
> 
> -    while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES *
> sizeof(VAAPIEncodePicture *)) {
> +    while (av_fifo_size(ctx->encode_fifo) <
> +            MAX_ASYNC_DEPTH * sizeof(VAAPIEncodePicture *)) {
>          pic = NULL;
>          err = vaapi_encode_pick_next(avctx, &pic);
>          if (err < 0)
> @@ -1234,6 +1236,14 @@ int
> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
>      if (!av_fifo_size(ctx->encode_fifo))
>          return err;
> 
> +    if (av_fifo_size(ctx->encode_fifo) < ctx->async_depth *
> sizeof(VAAPIEncodePicture *) &&
> +        !ctx->end_of_stream) {
> +        av_fifo_generic_peek(ctx->encode_fifo, &pic, sizeof(pic), NULL);
> +        err = vaapi_encode_wait(avctx, pic, 0);
> +        if (err < 0)
> +            return err;
> +    }
> +
>      av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL);
>      ctx->encode_order = pic->encode_order + 1;
> 
> @@ -1252,7 +1262,7 @@ int
> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
>              pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff;
>      } else {
>          pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) %
> -                                (3 * ctx->output_delay)];
> +                                (3 * ctx->output_delay + ctx->async_depth)];
>      }
>      av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64"
> dts %"PRId64".\n",
>             pkt->pts, pkt->dts);
> @@ -2566,8 +2576,8 @@ av_cold int ff_vaapi_encode_init(AVCodecContext
> *avctx)
>          }
>      }
> 
> -    ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) *
> -                                      sizeof(VAAPIEncodePicture *));
> +    ctx->encode_fifo = av_fifo_alloc(MAX_ASYNC_DEPTH *
> +                                     sizeof(VAAPIEncodePicture *));
>      if (!ctx->encode_fifo)
>          return AVERROR(ENOMEM);
> 
> diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
> index 89fe8de466..1bf5d7c337 100644
> --- a/libavcodec/vaapi_encode.h
> +++ b/libavcodec/vaapi_encode.h
> @@ -48,6 +48,7 @@ enum {
>      MAX_TILE_ROWS          = 22,
>      // A.4.1: table A.6 allows at most 20 tile columns for any level.
>      MAX_TILE_COLS          = 20,
> +    MAX_ASYNC_DEPTH        = 64,
>  };
> 
>  extern const AVCodecHWConfigInternal *const
> ff_vaapi_encode_hw_configs[];
> @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext {
>      // Timestamp handling.
>      int64_t         first_pts;
>      int64_t         dts_pts_diff;
> -    int64_t         ts_ring[MAX_REORDER_DELAY * 3];
> +    int64_t         ts_ring[MAX_REORDER_DELAY * 3 +
> +                            MAX_ASYNC_DEPTH];
> 
>      // Slice structure.
>      int slice_block_rows;
> @@ -348,6 +350,8 @@ typedef struct VAAPIEncodeContext {
>      AVFrame         *frame;
> 
>      AVFifoBuffer *encode_fifo;
> +
> +    int async_depth;
>  } VAAPIEncodeContext;
> 
>  enum {
> @@ -458,7 +462,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx);
>      { "b_depth", \
>        "Maximum B-frame reference depth", \
>        OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \
> -      { .i64 = 1 }, 1, INT_MAX, FLAGS }
> +      { .i64 = 1 }, 1, INT_MAX, FLAGS }, \
> +    { "async_depth", "Maximum processing parallelism. " \
> +      "Increase this to improve single channel performance", \
> +      OFFSET(common.async_depth), AV_OPT_TYPE_INT, \
> +      { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS }
> 
>  #define VAAPI_ENCODE_RC_MODE(name, desc) \
>      { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \
> --
> 2.25.1

ping
Ed Martin Dec. 24, 2021, 11:22 p.m. UTC | #2
On 10/31/21 22:14, Chen, Wenbin wrote:
>> Add async_depth to increase encoder's performance. Reuse encode_fifo as
>> async buffer. Encoder puts all reordered frame to HW and then check
>> fifo size. If fifo < async_depth and the top frame is not ready, it will
>> return AVERROR(EAGAIN) to require more frames.
>>
>> 1080p transcoding (no B frames) with -async_depth=4 can increase 20%
>> performance on my environment.
>> The async increases performance but also introduces frame delay.
>>
>> Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
>> ---
>>   libavcodec/vaapi_encode.c | 20 +++++++++++++++-----
>>   libavcodec/vaapi_encode.h | 12 ++++++++++--
>>   2 files changed, 25 insertions(+), 7 deletions(-)
>>
>> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
>> index db0ae136a1..616fb7c089 100644
>> --- a/libavcodec/vaapi_encode.c
>> +++ b/libavcodec/vaapi_encode.c
>> @@ -1158,7 +1158,8 @@ static int
>> vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame)
>>           if (ctx->input_order == ctx->decode_delay)
>>               ctx->dts_pts_diff = pic->pts - ctx->first_pts;
>>           if (ctx->output_delay > 0)
>> -            ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic->pts;
>> +            ctx->ts_ring[ctx->input_order %
>> +                        (3 * ctx->output_delay + ctx->async_depth)] = pic->pts;
>>
>>           pic->display_order = ctx->input_order;
>>           ++ctx->input_order;
>> @@ -1212,7 +1213,8 @@ int
>> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
>>               return AVERROR(EAGAIN);
>>       }
>>
>> -    while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES *
>> sizeof(VAAPIEncodePicture *)) {
>> +    while (av_fifo_size(ctx->encode_fifo) <
>> +            MAX_ASYNC_DEPTH * sizeof(VAAPIEncodePicture *)) {
>>           pic = NULL;
>>           err = vaapi_encode_pick_next(avctx, &pic);
>>           if (err < 0)
>> @@ -1234,6 +1236,14 @@ int
>> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
>>       if (!av_fifo_size(ctx->encode_fifo))
>>           return err;
>>
>> +    if (av_fifo_size(ctx->encode_fifo) < ctx->async_depth *
>> sizeof(VAAPIEncodePicture *) &&
>> +        !ctx->end_of_stream) {
>> +        av_fifo_generic_peek(ctx->encode_fifo, &pic, sizeof(pic), NULL);
>> +        err = vaapi_encode_wait(avctx, pic, 0);
>> +        if (err < 0)
>> +            return err;
>> +    }
>> +
>>       av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL);
>>       ctx->encode_order = pic->encode_order + 1;
>>
>> @@ -1252,7 +1262,7 @@ int
>> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
>>               pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff;
>>       } else {
>>           pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) %
>> -                                (3 * ctx->output_delay)];
>> +                                (3 * ctx->output_delay + ctx->async_depth)];
>>       }
>>       av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64"
>> dts %"PRId64".\n",
>>              pkt->pts, pkt->dts);
>> @@ -2566,8 +2576,8 @@ av_cold int ff_vaapi_encode_init(AVCodecContext
>> *avctx)
>>           }
>>       }
>>
>> -    ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) *
>> -                                      sizeof(VAAPIEncodePicture *));
>> +    ctx->encode_fifo = av_fifo_alloc(MAX_ASYNC_DEPTH *
>> +                                     sizeof(VAAPIEncodePicture *));
>>       if (!ctx->encode_fifo)
>>           return AVERROR(ENOMEM);
>>
>> diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
>> index 89fe8de466..1bf5d7c337 100644
>> --- a/libavcodec/vaapi_encode.h
>> +++ b/libavcodec/vaapi_encode.h
>> @@ -48,6 +48,7 @@ enum {
>>       MAX_TILE_ROWS          = 22,
>>       // A.4.1: table A.6 allows at most 20 tile columns for any level.
>>       MAX_TILE_COLS          = 20,
>> +    MAX_ASYNC_DEPTH        = 64,
>>   };
>>
>>   extern const AVCodecHWConfigInternal *const
>> ff_vaapi_encode_hw_configs[];
>> @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext {
>>       // Timestamp handling.
>>       int64_t         first_pts;
>>       int64_t         dts_pts_diff;
>> -    int64_t         ts_ring[MAX_REORDER_DELAY * 3];
>> +    int64_t         ts_ring[MAX_REORDER_DELAY * 3 +
>> +                            MAX_ASYNC_DEPTH];
>>
>>       // Slice structure.
>>       int slice_block_rows;
>> @@ -348,6 +350,8 @@ typedef struct VAAPIEncodeContext {
>>       AVFrame         *frame;
>>
>>       AVFifoBuffer *encode_fifo;
>> +
>> +    int async_depth;
>>   } VAAPIEncodeContext;
>>
>>   enum {
>> @@ -458,7 +462,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx);
>>       { "b_depth", \
>>         "Maximum B-frame reference depth", \
>>         OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \
>> -      { .i64 = 1 }, 1, INT_MAX, FLAGS }
>> +      { .i64 = 1 }, 1, INT_MAX, FLAGS }, \
>> +    { "async_depth", "Maximum processing parallelism. " \
>> +      "Increase this to improve single channel performance", \
>> +      OFFSET(common.async_depth), AV_OPT_TYPE_INT, \
>> +      { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS }
>>
>>   #define VAAPI_ENCODE_RC_MODE(name, desc) \
>>       { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \
>> --
>> 2.25.1
> ping

I tested this patchset and I can confirm that it solves my bug, which I
had thought was a Mesa bug
(https://gitlab.freedesktop.org/mesa/mesa/-/issues/1235)


I would love it if this feature were incorporated into FFmpeg.
Dennis Mungai Dec. 25, 2021, 5:49 a.m. UTC | #3
On Sat, 25 Dec 2021, 02:23 Ed Martin, <lists@edman007.com> wrote:

> On 10/31/21 22:14, Chen, Wenbin wrote:
> >> Add async_depth to increase encoder's performance. Reuse encode_fifo as
> >> async buffer. Encoder puts all reordered frame to HW and then check
> >> fifo size. If fifo < async_depth and the top frame is not ready, it will
> >> return AVERROR(EAGAIN) to require more frames.
> >>
> >> 1080p transcoding (no B frames) with -async_depth=4 can increase 20%
> >> performance on my environment.
> >> The async increases performance but also introduces frame delay.
> >>
> >> Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
> >> ---
> >>   libavcodec/vaapi_encode.c | 20 +++++++++++++++-----
> >>   libavcodec/vaapi_encode.h | 12 ++++++++++--
> >>   2 files changed, 25 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> >> index db0ae136a1..616fb7c089 100644
> >> --- a/libavcodec/vaapi_encode.c
> >> +++ b/libavcodec/vaapi_encode.c
> >> @@ -1158,7 +1158,8 @@ static int
> >> vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame)
> >>           if (ctx->input_order == ctx->decode_delay)
> >>               ctx->dts_pts_diff = pic->pts - ctx->first_pts;
> >>           if (ctx->output_delay > 0)
> >> -            ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] =
> pic->pts;
> >> +            ctx->ts_ring[ctx->input_order %
> >> +                        (3 * ctx->output_delay + ctx->async_depth)] =
> pic->pts;
> >>
> >>           pic->display_order = ctx->input_order;
> >>           ++ctx->input_order;
> >> @@ -1212,7 +1213,8 @@ int
> >> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
> >>               return AVERROR(EAGAIN);
> >>       }
> >>
> >> -    while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES *
> >> sizeof(VAAPIEncodePicture *)) {
> >> +    while (av_fifo_size(ctx->encode_fifo) <
> >> +            MAX_ASYNC_DEPTH * sizeof(VAAPIEncodePicture *)) {
> >>           pic = NULL;
> >>           err = vaapi_encode_pick_next(avctx, &pic);
> >>           if (err < 0)
> >> @@ -1234,6 +1236,14 @@ int
> >> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
> >>       if (!av_fifo_size(ctx->encode_fifo))
> >>           return err;
> >>
> >> +    if (av_fifo_size(ctx->encode_fifo) < ctx->async_depth *
> >> sizeof(VAAPIEncodePicture *) &&
> >> +        !ctx->end_of_stream) {
> >> +        av_fifo_generic_peek(ctx->encode_fifo, &pic, sizeof(pic),
> NULL);
> >> +        err = vaapi_encode_wait(avctx, pic, 0);
> >> +        if (err < 0)
> >> +            return err;
> >> +    }
> >> +
> >>       av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL);
> >>       ctx->encode_order = pic->encode_order + 1;
> >>
> >> @@ -1252,7 +1262,7 @@ int
> >> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
> >>               pkt->dts = ctx->ts_ring[pic->encode_order] -
> ctx->dts_pts_diff;
> >>       } else {
> >>           pkt->dts = ctx->ts_ring[(pic->encode_order -
> ctx->decode_delay) %
> >> -                                (3 * ctx->output_delay)];
> >> +                                (3 * ctx->output_delay +
> ctx->async_depth)];
> >>       }
> >>       av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64"
> >> dts %"PRId64".\n",
> >>              pkt->pts, pkt->dts);
> >> @@ -2566,8 +2576,8 @@ av_cold int ff_vaapi_encode_init(AVCodecContext
> >> *avctx)
> >>           }
> >>       }
> >>
> >> -    ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) *
> >> -                                      sizeof(VAAPIEncodePicture *));
> >> +    ctx->encode_fifo = av_fifo_alloc(MAX_ASYNC_DEPTH *
> >> +                                     sizeof(VAAPIEncodePicture *));
> >>       if (!ctx->encode_fifo)
> >>           return AVERROR(ENOMEM);
> >>
> >> diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
> >> index 89fe8de466..1bf5d7c337 100644
> >> --- a/libavcodec/vaapi_encode.h
> >> +++ b/libavcodec/vaapi_encode.h
> >> @@ -48,6 +48,7 @@ enum {
> >>       MAX_TILE_ROWS          = 22,
> >>       // A.4.1: table A.6 allows at most 20 tile columns for any level.
> >>       MAX_TILE_COLS          = 20,
> >> +    MAX_ASYNC_DEPTH        = 64,
> >>   };
> >>
> >>   extern const AVCodecHWConfigInternal *const
> >> ff_vaapi_encode_hw_configs[];
> >> @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext {
> >>       // Timestamp handling.
> >>       int64_t         first_pts;
> >>       int64_t         dts_pts_diff;
> >> -    int64_t         ts_ring[MAX_REORDER_DELAY * 3];
> >> +    int64_t         ts_ring[MAX_REORDER_DELAY * 3 +
> >> +                            MAX_ASYNC_DEPTH];
> >>
> >>       // Slice structure.
> >>       int slice_block_rows;
> >> @@ -348,6 +350,8 @@ typedef struct VAAPIEncodeContext {
> >>       AVFrame         *frame;
> >>
> >>       AVFifoBuffer *encode_fifo;
> >> +
> >> +    int async_depth;
> >>   } VAAPIEncodeContext;
> >>
> >>   enum {
> >> @@ -458,7 +462,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx);
> >>       { "b_depth", \
> >>         "Maximum B-frame reference depth", \
> >>         OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \
> >> -      { .i64 = 1 }, 1, INT_MAX, FLAGS }
> >> +      { .i64 = 1 }, 1, INT_MAX, FLAGS }, \
> >> +    { "async_depth", "Maximum processing parallelism. " \
> >> +      "Increase this to improve single channel performance", \
> >> +      OFFSET(common.async_depth), AV_OPT_TYPE_INT, \
> >> +      { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS }
> >>
> >>   #define VAAPI_ENCODE_RC_MODE(name, desc) \
> >>       { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name },
> \
> >> --
> >> 2.25.1
> > ping
>
> I tested this patchset and  I can confirm that it solves my bug that I
> thought was a mesa bug
> (https://gitlab.freedesktop.org/mesa/mesa/-/issues/1235)
>
>
> I would love if this feature is incorporated into ffmpeg


>
>
Indeed, this is the only patch that makes AMD GPUs usable with VAAPI.
diff mbox series

Patch

diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
index db0ae136a1..616fb7c089 100644
--- a/libavcodec/vaapi_encode.c
+++ b/libavcodec/vaapi_encode.c
@@ -1158,7 +1158,8 @@  static int vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame)
         if (ctx->input_order == ctx->decode_delay)
             ctx->dts_pts_diff = pic->pts - ctx->first_pts;
         if (ctx->output_delay > 0)
-            ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic->pts;
+            ctx->ts_ring[ctx->input_order %
+                        (3 * ctx->output_delay + ctx->async_depth)] = pic->pts;
 
         pic->display_order = ctx->input_order;
         ++ctx->input_order;
@@ -1212,7 +1213,8 @@  int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
             return AVERROR(EAGAIN);
     }
 
-    while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES * sizeof(VAAPIEncodePicture *)) {
+    while (av_fifo_size(ctx->encode_fifo) <
+            MAX_ASYNC_DEPTH * sizeof(VAAPIEncodePicture *)) {
         pic = NULL;
         err = vaapi_encode_pick_next(avctx, &pic);
         if (err < 0)
@@ -1234,6 +1236,14 @@  int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
     if (!av_fifo_size(ctx->encode_fifo))
         return err;
 
+    if (av_fifo_size(ctx->encode_fifo) < ctx->async_depth * sizeof(VAAPIEncodePicture *) &&
+        !ctx->end_of_stream) {
+        av_fifo_generic_peek(ctx->encode_fifo, &pic, sizeof(pic), NULL);
+        err = vaapi_encode_wait(avctx, pic, 0);
+        if (err < 0)
+            return err;
+    }
+
     av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL);
     ctx->encode_order = pic->encode_order + 1;
 
@@ -1252,7 +1262,7 @@  int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
             pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff;
     } else {
         pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) %
-                                (3 * ctx->output_delay)];
+                                (3 * ctx->output_delay + ctx->async_depth)];
     }
     av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" dts %"PRId64".\n",
            pkt->pts, pkt->dts);
@@ -2566,8 +2576,8 @@  av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
         }
     }
 
-    ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) *
-                                      sizeof(VAAPIEncodePicture *));
+    ctx->encode_fifo = av_fifo_alloc(MAX_ASYNC_DEPTH *
+                                     sizeof(VAAPIEncodePicture *));
     if (!ctx->encode_fifo)
         return AVERROR(ENOMEM);
 
diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
index 89fe8de466..1bf5d7c337 100644
--- a/libavcodec/vaapi_encode.h
+++ b/libavcodec/vaapi_encode.h
@@ -48,6 +48,7 @@  enum {
     MAX_TILE_ROWS          = 22,
     // A.4.1: table A.6 allows at most 20 tile columns for any level.
     MAX_TILE_COLS          = 20,
+    MAX_ASYNC_DEPTH        = 64,
 };
 
 extern const AVCodecHWConfigInternal *const ff_vaapi_encode_hw_configs[];
@@ -298,7 +299,8 @@  typedef struct VAAPIEncodeContext {
     // Timestamp handling.
     int64_t         first_pts;
     int64_t         dts_pts_diff;
-    int64_t         ts_ring[MAX_REORDER_DELAY * 3];
+    int64_t         ts_ring[MAX_REORDER_DELAY * 3 +
+                            MAX_ASYNC_DEPTH];
 
     // Slice structure.
     int slice_block_rows;
@@ -348,6 +350,8 @@  typedef struct VAAPIEncodeContext {
     AVFrame         *frame;
 
     AVFifoBuffer *encode_fifo;
+
+    int async_depth;
 } VAAPIEncodeContext;
 
 enum {
@@ -458,7 +462,11 @@  int ff_vaapi_encode_close(AVCodecContext *avctx);
     { "b_depth", \
       "Maximum B-frame reference depth", \
       OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \
-      { .i64 = 1 }, 1, INT_MAX, FLAGS }
+      { .i64 = 1 }, 1, INT_MAX, FLAGS }, \
+    { "async_depth", "Maximum processing parallelism. " \
+      "Increase this to improve single channel performance", \
+      OFFSET(common.async_depth), AV_OPT_TYPE_INT, \
+      { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS }
 
 #define VAAPI_ENCODE_RC_MODE(name, desc) \
     { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \