diff mbox series

[FFmpeg-devel,V3,3/3] libavcodec/vaapi_encode: Add async_depth to vaapi_encoder to increase performance

Message ID 20220208030549.340748-3-wenbin.chen@intel.com
State New
Headers show
Series [FFmpeg-devel,V3,1/3] libavcodec/vaapi_encode: Add new API adaption to vaapi_encode | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished
andriy/make_aarch64_jetson success Make finished
andriy/make_fate_aarch64_jetson success Make fate finished

Commit Message

Chen, Wenbin Feb. 8, 2022, 3:05 a.m. UTC
Add async_depth to increase the encoder's performance. Reuse encode_fifo as
the async buffer. The encoder submits all reordered frames to the HW and then
checks the fifo size. If the fifo holds fewer than async_depth frames and the
top frame is not ready, it returns AVERROR(EAGAIN) to request more frames.

1080p transcoding (no B frames) with -async_depth=4 can improve performance
by 20% in my environment.
Asynchronous encoding increases performance but also introduces frame delay.

Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
---
 libavcodec/vaapi_encode.c | 16 ++++++++++++----
 libavcodec/vaapi_encode.h | 12 ++++++++++--
 2 files changed, 22 insertions(+), 6 deletions(-)

Comments

Chen, Wenbin Feb. 9, 2022, 6:22 a.m. UTC | #1
> Add async_depth to increase encoder's performance. Reuse encode_fifo as
> async buffer. Encoder puts all reordered frame to HW and then check
> fifo size. If fifo < async_depth and the top frame is not ready, it will
> return AVERROR(EAGAIN) to require more frames.
> 
> 1080p transcoding (no B frames) with -async_depth=4 can increase 20%
> performance on my environment.
> The async increases performance but also introduces frame delay.
> 
> Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
> ---
>  libavcodec/vaapi_encode.c | 16 ++++++++++++----
>  libavcodec/vaapi_encode.h | 12 ++++++++++--
>  2 files changed, 22 insertions(+), 6 deletions(-)
> 
> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> index 15ddbbaa4a..432abf31f7 100644
> --- a/libavcodec/vaapi_encode.c
> +++ b/libavcodec/vaapi_encode.c
> @@ -1158,7 +1158,8 @@ static int
> vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame)
>          if (ctx->input_order == ctx->decode_delay)
>              ctx->dts_pts_diff = pic->pts - ctx->first_pts;
>          if (ctx->output_delay > 0)
> -            ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic->pts;
> +            ctx->ts_ring[ctx->input_order %
> +                        (3 * ctx->output_delay + ctx->async_depth)] = pic->pts;
> 
>          pic->display_order = ctx->input_order;
>          ++ctx->input_order;
> @@ -1214,7 +1215,7 @@ int
> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
> 
>  #if VA_CHECK_VERSION(1, 9, 0)
>      if (ctx->has_sync_buffer_func) {
> -        while (av_fifo_can_read(ctx->encode_fifo) <=
> MAX_PICTURE_REFERENCES) {
> +        while (av_fifo_can_read(ctx->encode_fifo) <= MAX_ASYNC_DEPTH) {

There is a mistake here: I should use "<" instead of "<=". Alternatively, I can
use av_fifo_can_write() instead. I will update it.

>              pic = NULL;
>              err = vaapi_encode_pick_next(avctx, &pic);
>              if (err < 0)
> @@ -1232,6 +1233,13 @@ int
> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
>          }
>          if (!av_fifo_can_read(ctx->encode_fifo))
>              return err;
> +        if (av_fifo_can_read(ctx->encode_fifo) < ctx->async_depth &&
> +            !ctx->end_of_stream) {
> +            av_fifo_peek(ctx->encode_fifo, &pic, 1, 0);
> +            err = vaapi_encode_wait(avctx, pic, 0);
> +            if (err < 0)
> +                return err;
> +        }
>          av_fifo_read(ctx->encode_fifo, &pic, 1);
>          ctx->encode_order = pic->encode_order + 1;
>      } else
> @@ -1267,7 +1275,7 @@ int
> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
>              pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff;
>      } else {
>          pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) %
> -                                (3 * ctx->output_delay)];
> +                                (3 * ctx->output_delay + ctx->async_depth)];
>      }
>      av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64"
> dts %"PRId64".\n",
>             pkt->pts, pkt->dts);
> @@ -2588,7 +2596,7 @@ av_cold int ff_vaapi_encode_init(AVCodecContext
> *avctx)
>      vas = vaSyncBuffer(ctx->hwctx->display, 0, 0);
>      if (vas != VA_STATUS_ERROR_UNIMPLEMENTED) {
>          ctx->has_sync_buffer_func = 1;
> -        ctx->encode_fifo = av_fifo_alloc2(MAX_PICTURE_REFERENCES + 1,
> +        ctx->encode_fifo = av_fifo_alloc2(MAX_ASYNC_DEPTH,
>                                            sizeof(VAAPIEncodePicture *),
>                                            0);
>          if (!ctx->encode_fifo)
> diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
> index d33a486cb8..691521387d 100644
> --- a/libavcodec/vaapi_encode.h
> +++ b/libavcodec/vaapi_encode.h
> @@ -48,6 +48,7 @@ enum {
>      MAX_TILE_ROWS          = 22,
>      // A.4.1: table A.6 allows at most 20 tile columns for any level.
>      MAX_TILE_COLS          = 20,
> +    MAX_ASYNC_DEPTH        = 64,
>  };
> 
>  extern const AVCodecHWConfigInternal *const
> ff_vaapi_encode_hw_configs[];
> @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext {
>      // Timestamp handling.
>      int64_t         first_pts;
>      int64_t         dts_pts_diff;
> -    int64_t         ts_ring[MAX_REORDER_DELAY * 3];
> +    int64_t         ts_ring[MAX_REORDER_DELAY * 3 +
> +                            MAX_ASYNC_DEPTH];
> 
>      // Slice structure.
>      int slice_block_rows;
> @@ -350,6 +352,8 @@ typedef struct VAAPIEncodeContext {
>      AVFifo *encode_fifo;
>      //Whether the driver support vaSyncBuffer
>      int has_sync_buffer_func;
> +    //Max number of frame buffered in encoder.
> +    int async_depth;
>  } VAAPIEncodeContext;
> 
>  enum {
> @@ -460,7 +464,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx);
>      { "b_depth", \
>        "Maximum B-frame reference depth", \
>        OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \
> -      { .i64 = 1 }, 1, INT_MAX, FLAGS }
> +      { .i64 = 1 }, 1, INT_MAX, FLAGS }, \
> +    { "async_depth", "Maximum processing parallelism. " \
> +      "Increase this to improve single channel performance", \
> +      OFFSET(common.async_depth), AV_OPT_TYPE_INT, \
> +      { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS }
> 
>  #define VAAPI_ENCODE_RC_MODE(name, desc) \
>      { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \
> --
> 2.32.0
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Xiang, Haihao Feb. 11, 2022, 4:43 a.m. UTC | #2
> Add async_depth to increase encoder's performance. Reuse encode_fifo as
> async buffer. Encoder puts all reordered frame to HW and then check
> fifo size. If fifo < async_depth and the top frame is not ready, it will
> return AVERROR(EAGAIN) to require more frames.
> 
> 1080p transcoding (no B frames) with -async_depth=4 can increase 20%
> performance on my environment.
> The async increases performance but also introduces frame delay.
> 
> Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
> ---
>  libavcodec/vaapi_encode.c | 16 ++++++++++++----
>  libavcodec/vaapi_encode.h | 12 ++++++++++--
>  2 files changed, 22 insertions(+), 6 deletions(-)
> 
> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> index 15ddbbaa4a..432abf31f7 100644
> --- a/libavcodec/vaapi_encode.c
> +++ b/libavcodec/vaapi_encode.c
> @@ -1158,7 +1158,8 @@ static int vaapi_encode_send_frame(AVCodecContext
> *avctx, AVFrame *frame)
>          if (ctx->input_order == ctx->decode_delay)
>              ctx->dts_pts_diff = pic->pts - ctx->first_pts;
>          if (ctx->output_delay > 0)
> -            ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic-
> >pts;
> +            ctx->ts_ring[ctx->input_order %
> +                        (3 * ctx->output_delay + ctx->async_depth)] = pic-
> >pts;
>  
>          pic->display_order = ctx->input_order;
>          ++ctx->input_order;
> @@ -1214,7 +1215,7 @@ int ff_vaapi_encode_receive_packet(AVCodecContext
> *avctx, AVPacket *pkt)
>  
>  #if VA_CHECK_VERSION(1, 9, 0)
>      if (ctx->has_sync_buffer_func) {
> -        while (av_fifo_can_read(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES)
> {
> +        while (av_fifo_can_read(ctx->encode_fifo) <= MAX_ASYNC_DEPTH) {
>              pic = NULL;
>              err = vaapi_encode_pick_next(avctx, &pic);
>              if (err < 0)
> @@ -1232,6 +1233,13 @@ int ff_vaapi_encode_receive_packet(AVCodecContext
> *avctx, AVPacket *pkt)
>          }
>          if (!av_fifo_can_read(ctx->encode_fifo))
>              return err;
> +        if (av_fifo_can_read(ctx->encode_fifo) < ctx->async_depth &&
> +            !ctx->end_of_stream) {
> +            av_fifo_peek(ctx->encode_fifo, &pic, 1, 0);
> +            err = vaapi_encode_wait(avctx, pic, 0);
> +            if (err < 0)
> +                return err;
> +        }
>          av_fifo_read(ctx->encode_fifo, &pic, 1);
>          ctx->encode_order = pic->encode_order + 1;
>      } else
> @@ -1267,7 +1275,7 @@ int ff_vaapi_encode_receive_packet(AVCodecContext
> *avctx, AVPacket *pkt)
>              pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff;
>      } else {
>          pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) %
> -                                (3 * ctx->output_delay)];
> +                                (3 * ctx->output_delay + ctx->async_depth)];
>      }
>      av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" dts
> %"PRId64".\n",
>             pkt->pts, pkt->dts);
> @@ -2588,7 +2596,7 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
>      vas = vaSyncBuffer(ctx->hwctx->display, 0, 0);
>      if (vas != VA_STATUS_ERROR_UNIMPLEMENTED) {
>          ctx->has_sync_buffer_func = 1;
> -        ctx->encode_fifo = av_fifo_alloc2(MAX_PICTURE_REFERENCES + 1,
> +        ctx->encode_fifo = av_fifo_alloc2(MAX_ASYNC_DEPTH,
>                                            sizeof(VAAPIEncodePicture *),
>                                            0);


>          if (!ctx->encode_fifo)
> diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
> index d33a486cb8..691521387d 100644
> --- a/libavcodec/vaapi_encode.h
> +++ b/libavcodec/vaapi_encode.h
> @@ -48,6 +48,7 @@ enum {
>      MAX_TILE_ROWS          = 22,
>      // A.4.1: table A.6 allows at most 20 tile columns for any level.
>      MAX_TILE_COLS          = 20,
> +    MAX_ASYNC_DEPTH        = 64,
>  };
>  
>  extern const AVCodecHWConfigInternal *const ff_vaapi_encode_hw_configs[];
> @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext {
>      // Timestamp handling.
>      int64_t         first_pts;
>      int64_t         dts_pts_diff;
> -    int64_t         ts_ring[MAX_REORDER_DELAY * 3];
> +    int64_t         ts_ring[MAX_REORDER_DELAY * 3 +
> +                            MAX_ASYNC_DEPTH];
>  
>      // Slice structure.
>      int slice_block_rows;
> @@ -350,6 +352,8 @@ typedef struct VAAPIEncodeContext {
>      AVFifo *encode_fifo;
>      //Whether the driver support vaSyncBuffer
>      int has_sync_buffer_func;
> +    //Max number of frame buffered in encoder.
> +    int async_depth;
>  } VAAPIEncodeContext;
>  
>  enum {
> @@ -460,7 +464,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx);
>      { "b_depth", \
>        "Maximum B-frame reference depth", \
>        OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \
> -      { .i64 = 1 }, 1, INT_MAX, FLAGS }
> +      { .i64 = 1 }, 1, INT_MAX, FLAGS }, \
> +    { "async_depth", "Maximum processing parallelism. " \
> +      "Increase this to improve single channel performance", \

async_depth is not available if vaSyncBuffer is not implemented, so it would be
better to add a note about this to the help string.

Thanks
Haihao


> +      OFFSET(common.async_depth), AV_OPT_TYPE_INT, \
> +      { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS }
>  
>  #define VAAPI_ENCODE_RC_MODE(name, desc) \
>      { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \
diff mbox series

Patch

diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
index 15ddbbaa4a..432abf31f7 100644
--- a/libavcodec/vaapi_encode.c
+++ b/libavcodec/vaapi_encode.c
@@ -1158,7 +1158,8 @@  static int vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame)
         if (ctx->input_order == ctx->decode_delay)
             ctx->dts_pts_diff = pic->pts - ctx->first_pts;
         if (ctx->output_delay > 0)
-            ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic->pts;
+            ctx->ts_ring[ctx->input_order %
+                        (3 * ctx->output_delay + ctx->async_depth)] = pic->pts;
 
         pic->display_order = ctx->input_order;
         ++ctx->input_order;
@@ -1214,7 +1215,7 @@  int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
 
 #if VA_CHECK_VERSION(1, 9, 0)
     if (ctx->has_sync_buffer_func) {
-        while (av_fifo_can_read(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES) {
+        while (av_fifo_can_read(ctx->encode_fifo) <= MAX_ASYNC_DEPTH) {
             pic = NULL;
             err = vaapi_encode_pick_next(avctx, &pic);
             if (err < 0)
@@ -1232,6 +1233,13 @@  int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
         }
         if (!av_fifo_can_read(ctx->encode_fifo))
             return err;
+        if (av_fifo_can_read(ctx->encode_fifo) < ctx->async_depth &&
+            !ctx->end_of_stream) {
+            av_fifo_peek(ctx->encode_fifo, &pic, 1, 0);
+            err = vaapi_encode_wait(avctx, pic, 0);
+            if (err < 0)
+                return err;
+        }
         av_fifo_read(ctx->encode_fifo, &pic, 1);
         ctx->encode_order = pic->encode_order + 1;
     } else
@@ -1267,7 +1275,7 @@  int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
             pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff;
     } else {
         pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) %
-                                (3 * ctx->output_delay)];
+                                (3 * ctx->output_delay + ctx->async_depth)];
     }
     av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" dts %"PRId64".\n",
            pkt->pts, pkt->dts);
@@ -2588,7 +2596,7 @@  av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
     vas = vaSyncBuffer(ctx->hwctx->display, 0, 0);
     if (vas != VA_STATUS_ERROR_UNIMPLEMENTED) {
         ctx->has_sync_buffer_func = 1;
-        ctx->encode_fifo = av_fifo_alloc2(MAX_PICTURE_REFERENCES + 1,
+        ctx->encode_fifo = av_fifo_alloc2(MAX_ASYNC_DEPTH,
                                           sizeof(VAAPIEncodePicture *),
                                           0);
         if (!ctx->encode_fifo)
diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
index d33a486cb8..691521387d 100644
--- a/libavcodec/vaapi_encode.h
+++ b/libavcodec/vaapi_encode.h
@@ -48,6 +48,7 @@  enum {
     MAX_TILE_ROWS          = 22,
     // A.4.1: table A.6 allows at most 20 tile columns for any level.
     MAX_TILE_COLS          = 20,
+    MAX_ASYNC_DEPTH        = 64,
 };
 
 extern const AVCodecHWConfigInternal *const ff_vaapi_encode_hw_configs[];
@@ -298,7 +299,8 @@  typedef struct VAAPIEncodeContext {
     // Timestamp handling.
     int64_t         first_pts;
     int64_t         dts_pts_diff;
-    int64_t         ts_ring[MAX_REORDER_DELAY * 3];
+    int64_t         ts_ring[MAX_REORDER_DELAY * 3 +
+                            MAX_ASYNC_DEPTH];
 
     // Slice structure.
     int slice_block_rows;
@@ -350,6 +352,8 @@  typedef struct VAAPIEncodeContext {
     AVFifo *encode_fifo;
     //Whether the driver support vaSyncBuffer
     int has_sync_buffer_func;
+    //Max number of frame buffered in encoder.
+    int async_depth;
 } VAAPIEncodeContext;
 
 enum {
@@ -460,7 +464,11 @@  int ff_vaapi_encode_close(AVCodecContext *avctx);
     { "b_depth", \
       "Maximum B-frame reference depth", \
       OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \
-      { .i64 = 1 }, 1, INT_MAX, FLAGS }
+      { .i64 = 1 }, 1, INT_MAX, FLAGS }, \
+    { "async_depth", "Maximum processing parallelism. " \
+      "Increase this to improve single channel performance", \
+      OFFSET(common.async_depth), AV_OPT_TYPE_INT, \
+      { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS }
 
 #define VAAPI_ENCODE_RC_MODE(name, desc) \
     { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \