Message ID | 20211027085705.4114165-3-wenbin.chen@intel.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,1/3] libavcodec/vaapi_encode: Change the way to call async to increase performance | expand |
Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
andriy/make_ppc | success | Make finished |
andriy/make_fate_ppc | success | Make fate finished |
> Add async_depth to increase encoder's performance. Reuse encode_fifo as > async buffer. Encoder puts all reordered frame to HW and then check > fifo size. If fifo < async_depth and the top frame is not ready, it will > return AVERROR(EAGAIN) to require more frames. > > 1080p transcoding (no B frames) with -async_depth=4 can increase 20% > performance on my environment. > The async increases performance but also introduces frame delay. > > Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> > --- > libavcodec/vaapi_encode.c | 20 +++++++++++++++----- > libavcodec/vaapi_encode.h | 12 ++++++++++-- > 2 files changed, 25 insertions(+), 7 deletions(-) > > diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c > index db0ae136a1..616fb7c089 100644 > --- a/libavcodec/vaapi_encode.c > +++ b/libavcodec/vaapi_encode.c > @@ -1158,7 +1158,8 @@ static int > vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame) > if (ctx->input_order == ctx->decode_delay) > ctx->dts_pts_diff = pic->pts - ctx->first_pts; > if (ctx->output_delay > 0) > - ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic->pts; > + ctx->ts_ring[ctx->input_order % > + (3 * ctx->output_delay + ctx->async_depth)] = pic->pts; > > pic->display_order = ctx->input_order; > ++ctx->input_order; > @@ -1212,7 +1213,8 @@ int > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > return AVERROR(EAGAIN); > } > > - while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES * > sizeof(VAAPIEncodePicture *)) { > + while (av_fifo_size(ctx->encode_fifo) < > + MAX_ASYNC_DEPTH * sizeof(VAAPIEncodePicture *)) { > pic = NULL; > err = vaapi_encode_pick_next(avctx, &pic); > if (err < 0) > @@ -1234,6 +1236,14 @@ int > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > if (!av_fifo_size(ctx->encode_fifo)) > return err; > > + if (av_fifo_size(ctx->encode_fifo) < ctx->async_depth * > sizeof(VAAPIEncodePicture *) && > + !ctx->end_of_stream) { > + 
av_fifo_generic_peek(ctx->encode_fifo, &pic, sizeof(pic), NULL); > + err = vaapi_encode_wait(avctx, pic, 0); > + if (err < 0) > + return err; > + } > + > av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL); > ctx->encode_order = pic->encode_order + 1; > > @@ -1252,7 +1262,7 @@ int > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff; > } else { > pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) % > - (3 * ctx->output_delay)]; > + (3 * ctx->output_delay + ctx->async_depth)]; > } > av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" > dts %"PRId64".\n", > pkt->pts, pkt->dts); > @@ -2566,8 +2576,8 @@ av_cold int ff_vaapi_encode_init(AVCodecContext > *avctx) > } > } > > - ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) * > - sizeof(VAAPIEncodePicture *)); > + ctx->encode_fifo = av_fifo_alloc(MAX_ASYNC_DEPTH * > + sizeof(VAAPIEncodePicture *)); > if (!ctx->encode_fifo) > return AVERROR(ENOMEM); > > diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h > index 89fe8de466..1bf5d7c337 100644 > --- a/libavcodec/vaapi_encode.h > +++ b/libavcodec/vaapi_encode.h > @@ -48,6 +48,7 @@ enum { > MAX_TILE_ROWS = 22, > // A.4.1: table A.6 allows at most 20 tile columns for any level. > MAX_TILE_COLS = 20, > + MAX_ASYNC_DEPTH = 64, > }; > > extern const AVCodecHWConfigInternal *const > ff_vaapi_encode_hw_configs[]; > @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext { > // Timestamp handling. > int64_t first_pts; > int64_t dts_pts_diff; > - int64_t ts_ring[MAX_REORDER_DELAY * 3]; > + int64_t ts_ring[MAX_REORDER_DELAY * 3 + > + MAX_ASYNC_DEPTH]; > > // Slice structure. 
> int slice_block_rows; > @@ -348,6 +350,8 @@ typedef struct VAAPIEncodeContext { > AVFrame *frame; > > AVFifoBuffer *encode_fifo; > + > + int async_depth; > } VAAPIEncodeContext; > > enum { > @@ -458,7 +462,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx); > { "b_depth", \ > "Maximum B-frame reference depth", \ > OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \ > - { .i64 = 1 }, 1, INT_MAX, FLAGS } > + { .i64 = 1 }, 1, INT_MAX, FLAGS }, \ > + { "async_depth", "Maximum processing parallelism. " \ > + "Increase this to improve single channel performance", \ > + OFFSET(common.async_depth), AV_OPT_TYPE_INT, \ > + { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS } > > #define VAAPI_ENCODE_RC_MODE(name, desc) \ > { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \ > -- > 2.25.1 ping
On 10/31/21 22:14, Chen, Wenbin wrote: >> Add async_depth to increase encoder's performance. Reuse encode_fifo as >> async buffer. Encoder puts all reordered frame to HW and then check >> fifo size. If fifo < async_depth and the top frame is not ready, it will >> return AVERROR(EAGAIN) to require more frames. >> >> 1080p transcoding (no B frames) with -async_depth=4 can increase 20% >> performance on my environment. >> The async increases performance but also introduces frame delay. >> >> Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> >> --- >> libavcodec/vaapi_encode.c | 20 +++++++++++++++----- >> libavcodec/vaapi_encode.h | 12 ++++++++++-- >> 2 files changed, 25 insertions(+), 7 deletions(-) >> >> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c >> index db0ae136a1..616fb7c089 100644 >> --- a/libavcodec/vaapi_encode.c >> +++ b/libavcodec/vaapi_encode.c >> @@ -1158,7 +1158,8 @@ static int >> vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame) >> if (ctx->input_order == ctx->decode_delay) >> ctx->dts_pts_diff = pic->pts - ctx->first_pts; >> if (ctx->output_delay > 0) >> - ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic->pts; >> + ctx->ts_ring[ctx->input_order % >> + (3 * ctx->output_delay + ctx->async_depth)] = pic->pts; >> >> pic->display_order = ctx->input_order; >> ++ctx->input_order; >> @@ -1212,7 +1213,8 @@ int >> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) >> return AVERROR(EAGAIN); >> } >> >> - while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES * >> sizeof(VAAPIEncodePicture *)) { >> + while (av_fifo_size(ctx->encode_fifo) < >> + MAX_ASYNC_DEPTH * sizeof(VAAPIEncodePicture *)) { >> pic = NULL; >> err = vaapi_encode_pick_next(avctx, &pic); >> if (err < 0) >> @@ -1234,6 +1236,14 @@ int >> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) >> if (!av_fifo_size(ctx->encode_fifo)) >> return err; >> >> + if (av_fifo_size(ctx->encode_fifo) < ctx->async_depth * >> 
sizeof(VAAPIEncodePicture *) && >> + !ctx->end_of_stream) { >> + av_fifo_generic_peek(ctx->encode_fifo, &pic, sizeof(pic), NULL); >> + err = vaapi_encode_wait(avctx, pic, 0); >> + if (err < 0) >> + return err; >> + } >> + >> av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL); >> ctx->encode_order = pic->encode_order + 1; >> >> @@ -1252,7 +1262,7 @@ int >> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) >> pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff; >> } else { >> pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) % >> - (3 * ctx->output_delay)]; >> + (3 * ctx->output_delay + ctx->async_depth)]; >> } >> av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" >> dts %"PRId64".\n", >> pkt->pts, pkt->dts); >> @@ -2566,8 +2576,8 @@ av_cold int ff_vaapi_encode_init(AVCodecContext >> *avctx) >> } >> } >> >> - ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) * >> - sizeof(VAAPIEncodePicture *)); >> + ctx->encode_fifo = av_fifo_alloc(MAX_ASYNC_DEPTH * >> + sizeof(VAAPIEncodePicture *)); >> if (!ctx->encode_fifo) >> return AVERROR(ENOMEM); >> >> diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h >> index 89fe8de466..1bf5d7c337 100644 >> --- a/libavcodec/vaapi_encode.h >> +++ b/libavcodec/vaapi_encode.h >> @@ -48,6 +48,7 @@ enum { >> MAX_TILE_ROWS = 22, >> // A.4.1: table A.6 allows at most 20 tile columns for any level. >> MAX_TILE_COLS = 20, >> + MAX_ASYNC_DEPTH = 64, >> }; >> >> extern const AVCodecHWConfigInternal *const >> ff_vaapi_encode_hw_configs[]; >> @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext { >> // Timestamp handling. >> int64_t first_pts; >> int64_t dts_pts_diff; >> - int64_t ts_ring[MAX_REORDER_DELAY * 3]; >> + int64_t ts_ring[MAX_REORDER_DELAY * 3 + >> + MAX_ASYNC_DEPTH]; >> >> // Slice structure. 
>> int slice_block_rows; >> @@ -348,6 +350,8 @@ typedef struct VAAPIEncodeContext { >> AVFrame *frame; >> >> AVFifoBuffer *encode_fifo; >> + >> + int async_depth; >> } VAAPIEncodeContext; >> >> enum { >> @@ -458,7 +462,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx); >> { "b_depth", \ >> "Maximum B-frame reference depth", \ >> OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \ >> - { .i64 = 1 }, 1, INT_MAX, FLAGS } >> + { .i64 = 1 }, 1, INT_MAX, FLAGS }, \ >> + { "async_depth", "Maximum processing parallelism. " \ >> + "Increase this to improve single channel performance", \ >> + OFFSET(common.async_depth), AV_OPT_TYPE_INT, \ >> + { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS } >> >> #define VAAPI_ENCODE_RC_MODE(name, desc) \ >> { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \ >> -- >> 2.25.1 > ping I tested this patchset and I can confirm that it solves my bug that I thought was a mesa bug (https://gitlab.freedesktop.org/mesa/mesa/-/issues/1235) I would love if this feature is incorporated into ffmpeg
On Sat, 25 Dec 2021, 02:23 Ed Martin, <lists@edman007.com> wrote: > On 10/31/21 22:14, Chen, Wenbin wrote: > >> Add async_depth to increase encoder's performance. Reuse encode_fifo as > >> async buffer. Encoder puts all reordered frame to HW and then check > >> fifo size. If fifo < async_depth and the top frame is not ready, it will > >> return AVERROR(EAGAIN) to require more frames. > >> > >> 1080p transcoding (no B frames) with -async_depth=4 can increase 20% > >> performance on my environment. > >> The async increases performance but also introduces frame delay. > >> > >> Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> > >> --- > >> libavcodec/vaapi_encode.c | 20 +++++++++++++++----- > >> libavcodec/vaapi_encode.h | 12 ++++++++++-- > >> 2 files changed, 25 insertions(+), 7 deletions(-) > >> > >> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c > >> index db0ae136a1..616fb7c089 100644 > >> --- a/libavcodec/vaapi_encode.c > >> +++ b/libavcodec/vaapi_encode.c > >> @@ -1158,7 +1158,8 @@ static int > >> vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame) > >> if (ctx->input_order == ctx->decode_delay) > >> ctx->dts_pts_diff = pic->pts - ctx->first_pts; > >> if (ctx->output_delay > 0) > >> - ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = > pic->pts; > >> + ctx->ts_ring[ctx->input_order % > >> + (3 * ctx->output_delay + ctx->async_depth)] = > pic->pts; > >> > >> pic->display_order = ctx->input_order; > >> ++ctx->input_order; > >> @@ -1212,7 +1213,8 @@ int > >> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > >> return AVERROR(EAGAIN); > >> } > >> > >> - while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES * > >> sizeof(VAAPIEncodePicture *)) { > >> + while (av_fifo_size(ctx->encode_fifo) < > >> + MAX_ASYNC_DEPTH * sizeof(VAAPIEncodePicture *)) { > >> pic = NULL; > >> err = vaapi_encode_pick_next(avctx, &pic); > >> if (err < 0) > >> @@ -1234,6 +1236,14 @@ int > >> 
ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > >> if (!av_fifo_size(ctx->encode_fifo)) > >> return err; > >> > >> + if (av_fifo_size(ctx->encode_fifo) < ctx->async_depth * > >> sizeof(VAAPIEncodePicture *) && > >> + !ctx->end_of_stream) { > >> + av_fifo_generic_peek(ctx->encode_fifo, &pic, sizeof(pic), > NULL); > >> + err = vaapi_encode_wait(avctx, pic, 0); > >> + if (err < 0) > >> + return err; > >> + } > >> + > >> av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL); > >> ctx->encode_order = pic->encode_order + 1; > >> > >> @@ -1252,7 +1262,7 @@ int > >> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > >> pkt->dts = ctx->ts_ring[pic->encode_order] - > ctx->dts_pts_diff; > >> } else { > >> pkt->dts = ctx->ts_ring[(pic->encode_order - > ctx->decode_delay) % > >> - (3 * ctx->output_delay)]; > >> + (3 * ctx->output_delay + > ctx->async_depth)]; > >> } > >> av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" > >> dts %"PRId64".\n", > >> pkt->pts, pkt->dts); > >> @@ -2566,8 +2576,8 @@ av_cold int ff_vaapi_encode_init(AVCodecContext > >> *avctx) > >> } > >> } > >> > >> - ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) * > >> - sizeof(VAAPIEncodePicture *)); > >> + ctx->encode_fifo = av_fifo_alloc(MAX_ASYNC_DEPTH * > >> + sizeof(VAAPIEncodePicture *)); > >> if (!ctx->encode_fifo) > >> return AVERROR(ENOMEM); > >> > >> diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h > >> index 89fe8de466..1bf5d7c337 100644 > >> --- a/libavcodec/vaapi_encode.h > >> +++ b/libavcodec/vaapi_encode.h > >> @@ -48,6 +48,7 @@ enum { > >> MAX_TILE_ROWS = 22, > >> // A.4.1: table A.6 allows at most 20 tile columns for any level. > >> MAX_TILE_COLS = 20, > >> + MAX_ASYNC_DEPTH = 64, > >> }; > >> > >> extern const AVCodecHWConfigInternal *const > >> ff_vaapi_encode_hw_configs[]; > >> @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext { > >> // Timestamp handling. 
> >> int64_t first_pts; > >> int64_t dts_pts_diff; > >> - int64_t ts_ring[MAX_REORDER_DELAY * 3]; > >> + int64_t ts_ring[MAX_REORDER_DELAY * 3 + > >> + MAX_ASYNC_DEPTH]; > >> > >> // Slice structure. > >> int slice_block_rows; > >> @@ -348,6 +350,8 @@ typedef struct VAAPIEncodeContext { > >> AVFrame *frame; > >> > >> AVFifoBuffer *encode_fifo; > >> + > >> + int async_depth; > >> } VAAPIEncodeContext; > >> > >> enum { > >> @@ -458,7 +462,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx); > >> { "b_depth", \ > >> "Maximum B-frame reference depth", \ > >> OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \ > >> - { .i64 = 1 }, 1, INT_MAX, FLAGS } > >> + { .i64 = 1 }, 1, INT_MAX, FLAGS }, \ > >> + { "async_depth", "Maximum processing parallelism. " \ > >> + "Increase this to improve single channel performance", \ > >> + OFFSET(common.async_depth), AV_OPT_TYPE_INT, \ > >> + { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS } > >> > >> #define VAAPI_ENCODE_RC_MODE(name, desc) \ > >> { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, > \ > >> -- > >> 2.25.1 > > ping > > I tested this patchset and I can confirm that it solves my bug that I > thought was a mesa bug > (https://gitlab.freedesktop.org/mesa/mesa/-/issues/1235) > > > I would love if this feature is incorporated into ffmpeg > > > Indeed, this is the only patch that makes AMD GPUs usable with VAAPI.
diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c index db0ae136a1..616fb7c089 100644 --- a/libavcodec/vaapi_encode.c +++ b/libavcodec/vaapi_encode.c @@ -1158,7 +1158,8 @@ static int vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame) if (ctx->input_order == ctx->decode_delay) ctx->dts_pts_diff = pic->pts - ctx->first_pts; if (ctx->output_delay > 0) - ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic->pts; + ctx->ts_ring[ctx->input_order % + (3 * ctx->output_delay + ctx->async_depth)] = pic->pts; pic->display_order = ctx->input_order; ++ctx->input_order; @@ -1212,7 +1213,8 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) return AVERROR(EAGAIN); } - while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES * sizeof(VAAPIEncodePicture *)) { + while (av_fifo_size(ctx->encode_fifo) < + MAX_ASYNC_DEPTH * sizeof(VAAPIEncodePicture *)) { pic = NULL; err = vaapi_encode_pick_next(avctx, &pic); if (err < 0) @@ -1234,6 +1236,14 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) if (!av_fifo_size(ctx->encode_fifo)) return err; + if (av_fifo_size(ctx->encode_fifo) < ctx->async_depth * sizeof(VAAPIEncodePicture *) && + !ctx->end_of_stream) { + av_fifo_generic_peek(ctx->encode_fifo, &pic, sizeof(pic), NULL); + err = vaapi_encode_wait(avctx, pic, 0); + if (err < 0) + return err; + } + av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL); ctx->encode_order = pic->encode_order + 1; @@ -1252,7 +1262,7 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff; } else { pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) % - (3 * ctx->output_delay)]; + (3 * ctx->output_delay + ctx->async_depth)]; } av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" dts %"PRId64".\n", pkt->pts, pkt->dts); @@ -2566,8 +2576,8 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx) } } - 
ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) * - sizeof(VAAPIEncodePicture *)); + ctx->encode_fifo = av_fifo_alloc(MAX_ASYNC_DEPTH * + sizeof(VAAPIEncodePicture *)); if (!ctx->encode_fifo) return AVERROR(ENOMEM); diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h index 89fe8de466..1bf5d7c337 100644 --- a/libavcodec/vaapi_encode.h +++ b/libavcodec/vaapi_encode.h @@ -48,6 +48,7 @@ enum { MAX_TILE_ROWS = 22, // A.4.1: table A.6 allows at most 20 tile columns for any level. MAX_TILE_COLS = 20, + MAX_ASYNC_DEPTH = 64, }; extern const AVCodecHWConfigInternal *const ff_vaapi_encode_hw_configs[]; @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext { // Timestamp handling. int64_t first_pts; int64_t dts_pts_diff; - int64_t ts_ring[MAX_REORDER_DELAY * 3]; + int64_t ts_ring[MAX_REORDER_DELAY * 3 + + MAX_ASYNC_DEPTH]; // Slice structure. int slice_block_rows; @@ -348,6 +350,8 @@ typedef struct VAAPIEncodeContext { AVFrame *frame; AVFifoBuffer *encode_fifo; + + int async_depth; } VAAPIEncodeContext; enum { @@ -458,7 +462,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx); { "b_depth", \ "Maximum B-frame reference depth", \ OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \ - { .i64 = 1 }, 1, INT_MAX, FLAGS } + { .i64 = 1 }, 1, INT_MAX, FLAGS }, \ + { "async_depth", "Maximum processing parallelism. " \ + "Increase this to improve single channel performance", \ + OFFSET(common.async_depth), AV_OPT_TYPE_INT, \ + { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS } #define VAAPI_ENCODE_RC_MODE(name, desc) \ { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \
Add an async_depth option to increase the encoder's performance. Reuse encode_fifo as the async buffer. The encoder submits all reordered frames to the HW and then checks the fifo size. If the fifo holds fewer than async_depth frames and the top frame is not ready, it returns AVERROR(EAGAIN) to request more frames. 1080p transcoding (no B-frames) with -async_depth=4 can improve performance by 20% in my environment. Asynchronous operation increases performance but also introduces frame delay. Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> --- libavcodec/vaapi_encode.c | 20 +++++++++++++++----- libavcodec/vaapi_encode.h | 12 ++++++++++-- 2 files changed, 25 insertions(+), 7 deletions(-)