diff mbox series

[FFmpeg-devel,v4,2/2] libavcodec/vaapi_encode: Add async_depth to vaapi_encoder to increase performance

Message ID 20220218020757.834409-2-wenbin.chen@intel.com
State New
Headers show
Series [FFmpeg-devel,v4,1/2] libavcodec/vaapi_encode: Add new API adaption to vaapi_encode | expand

Checks

Context Check Description
yinshiyou/make_fate_loongarch64 success Make fate finished
yinshiyou/make_loongarch64 warning New warnings during build
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished
andriy/make_aarch64_jetson success Make finished
andriy/make_fate_aarch64_jetson success Make fate finished
andriy/make_armv7_RPi4 success Make finished
andriy/make_fate_armv7_RPi4 success Make fate finished

Commit Message

Wenbin Chen Feb. 18, 2022, 2:07 a.m. UTC
From: Wenbin Chen <wenbin.chen-at-intel.com@ffmpeg.org>

Fix: #7706. After commit 5fdcf85bbffe7451c2, vaapi encoder's performance
decrease. The reason is that vaRenderPicture() and vaSyncBuffer() are
called at the same time (vaRenderPicture() always followed by a
vaSyncBuffer()). Now I changed them to be called in a asynchronous way,
which will make better use of hardware.
Async_depth is added to increase encoder's performance. The frames that
are sent to hardware are stored in a fifo. Encoder will sync output
after async fifo is full.

Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
Signed-off-by: Haihao Xiang <haihao.xiang@intel.com>
---
 doc/encoders.texi         |  6 ++++
 libavcodec/vaapi_encode.c | 65 +++++++++++++++++++++++++++++++--------
 libavcodec/vaapi_encode.h | 16 ++++++++--
 3 files changed, 72 insertions(+), 15 deletions(-)
diff mbox series

Patch

diff --git a/doc/encoders.texi b/doc/encoders.texi
index bfb6c7eef6..6bac2b7f28 100644
--- a/doc/encoders.texi
+++ b/doc/encoders.texi
@@ -3600,6 +3600,12 @@  will refer only to P- or I-frames.  When set to greater values multiple layers
 of B-frames will be present, frames in each layer only referring to frames in
 higher layers.
 
+@item async_depth
+Maximum processing parallelism. Increase this to improve single channel
+performance. This option doesn't work if driver doesn't implement vaSyncBuffer
+function. Please make sure there are enough hw_frames allocated if a large
+number of async_depth is used.
+
 @item rc_mode
 Set the rate control mode to use.  A given driver may only support a subset of
 modes.
diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
index 335a8e450a..8c6e881702 100644
--- a/libavcodec/vaapi_encode.c
+++ b/libavcodec/vaapi_encode.c
@@ -965,8 +965,10 @@  static int vaapi_encode_pick_next(AVCodecContext *avctx,
     if (!pic && ctx->end_of_stream) {
         --b_counter;
         pic = ctx->pic_end;
-        if (pic->encode_issued)
+        if (pic->encode_complete)
             return AVERROR_EOF;
+        else if (pic->encode_issued)
+            return AVERROR(EAGAIN);
     }
 
     if (!pic) {
@@ -1137,7 +1139,8 @@  static int vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame)
         if (ctx->input_order == ctx->decode_delay)
             ctx->dts_pts_diff = pic->pts - ctx->first_pts;
         if (ctx->output_delay > 0)
-            ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic->pts;
+            ctx->ts_ring[ctx->input_order %
+                        (3 * ctx->output_delay + ctx->async_depth)] = pic->pts;
 
         pic->display_order = ctx->input_order;
         ++ctx->input_order;
@@ -1191,18 +1194,47 @@  int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
             return AVERROR(EAGAIN);
     }
 
-    pic = NULL;
-    err = vaapi_encode_pick_next(avctx, &pic);
-    if (err < 0)
-        return err;
-    av_assert0(pic);
+    if (ctx->has_sync_buffer_func) {
+        pic = NULL;
+
+        if (av_fifo_can_write(ctx->encode_fifo)) {
+            err = vaapi_encode_pick_next(avctx, &pic);
+            if (!err) {
+                av_assert0(pic);
+                pic->encode_order = ctx->encode_order +
+                    av_fifo_can_read(ctx->encode_fifo);
+                err = vaapi_encode_issue(avctx, pic);
+                if (err < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
+                    return err;
+                }
+                av_fifo_write(ctx->encode_fifo, &pic, 1);
+            }
+        }
 
-    pic->encode_order = ctx->encode_order++;
+        if (!av_fifo_can_read(ctx->encode_fifo))
+            return err;
 
-    err = vaapi_encode_issue(avctx, pic);
-    if (err < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
-        return err;
+        // More frames can be buffered
+        if (av_fifo_can_write(ctx->encode_fifo) && !ctx->end_of_stream)
+            return AVERROR(EAGAIN);
+
+        av_fifo_read(ctx->encode_fifo, &pic, 1);
+        ctx->encode_order = pic->encode_order + 1;
+    } else {
+        pic = NULL;
+        err = vaapi_encode_pick_next(avctx, &pic);
+        if (err < 0)
+            return err;
+        av_assert0(pic);
+
+        pic->encode_order = ctx->encode_order++;
+
+        err = vaapi_encode_issue(avctx, pic);
+        if (err < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
+            return err;
+        }
     }
 
     err = vaapi_encode_output(avctx, pic, pkt);
@@ -1220,7 +1252,7 @@  int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
             pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff;
     } else {
         pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) %
-                                (3 * ctx->output_delay)];
+                                (3 * ctx->output_delay + ctx->async_depth)];
     }
     av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" dts %"PRId64".\n",
            pkt->pts, pkt->dts);
@@ -2541,6 +2573,12 @@  av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
     vas = vaSyncBuffer(ctx->hwctx->display, VA_INVALID_ID, 0);
     if (vas != VA_STATUS_ERROR_UNIMPLEMENTED) {
         ctx->has_sync_buffer_func = 1;
+        ctx->encode_fifo = av_fifo_alloc2(ctx->async_depth,
+                                          sizeof(VAAPIEncodePicture *),
+                                          0);
+        if (!ctx->encode_fifo)
+            return AVERROR(ENOMEM);
+    }
 #endif
 
     return 0;
@@ -2580,6 +2618,7 @@  av_cold int ff_vaapi_encode_close(AVCodecContext *avctx)
 
     av_freep(&ctx->codec_sequence_params);
     av_freep(&ctx->codec_picture_params);
+    av_fifo_freep2(&ctx->encode_fifo);
 
     av_buffer_unref(&ctx->recon_frames_ref);
     av_buffer_unref(&ctx->input_frames_ref);
diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
index 29d9e9b91c..1b40819c69 100644
--- a/libavcodec/vaapi_encode.h
+++ b/libavcodec/vaapi_encode.h
@@ -29,6 +29,7 @@ 
 
 #include "libavutil/hwcontext.h"
 #include "libavutil/hwcontext_vaapi.h"
+#include "libavutil/fifo.h"
 
 #include "avcodec.h"
 #include "hwconfig.h"
@@ -47,6 +48,7 @@  enum {
     MAX_TILE_ROWS          = 22,
     // A.4.1: table A.6 allows at most 20 tile columns for any level.
     MAX_TILE_COLS          = 20,
+    MAX_ASYNC_DEPTH        = 64,
 };
 
 extern const AVCodecHWConfigInternal *const ff_vaapi_encode_hw_configs[];
@@ -297,7 +299,8 @@  typedef struct VAAPIEncodeContext {
     // Timestamp handling.
     int64_t         first_pts;
     int64_t         dts_pts_diff;
-    int64_t         ts_ring[MAX_REORDER_DELAY * 3];
+    int64_t         ts_ring[MAX_REORDER_DELAY * 3 +
+                            MAX_ASYNC_DEPTH];
 
     // Slice structure.
     int slice_block_rows;
@@ -348,6 +351,10 @@  typedef struct VAAPIEncodeContext {
 
     // Whether the driver support vaSyncBuffer
     int             has_sync_buffer_func;
+    // Store buffered pic
+    AVFifo          *encode_fifo;
+    // Max number of frame buffered in encoder.
+    int             async_depth;
 } VAAPIEncodeContext;
 
 enum {
@@ -458,7 +465,12 @@  int ff_vaapi_encode_close(AVCodecContext *avctx);
     { "b_depth", \
       "Maximum B-frame reference depth", \
       OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \
-      { .i64 = 1 }, 1, INT_MAX, FLAGS }
+      { .i64 = 1 }, 1, INT_MAX, FLAGS }, \
+    { "async_depth", "Maximum processing parallelism. " \
+      "Increase this to improve single channel performance. This option " \
+      "doesn't work if driver doesn't implement vaSyncBuffer function.", \
+      OFFSET(common.async_depth), AV_OPT_TYPE_INT, \
+      { .i64 = 2 }, 1, MAX_ASYNC_DEPTH, FLAGS }
 
 #define VAAPI_ENCODE_RC_MODE(name, desc) \
     { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \