Message ID | 20220208030549.340748-3-wenbin.chen@intel.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,V3,1/3] libavcodec/vaapi_encode: Add new API adaption to vaapi_encode | expand |
Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
andriy/make_ppc | success | Make finished |
andriy/make_fate_ppc | success | Make fate finished |
andriy/make_aarch64_jetson | success | Make finished |
andriy/make_fate_aarch64_jetson | success | Make fate finished |
> Add async_depth to increase encoder's performance. Reuse encode_fifo as > async buffer. Encoder puts all reordered frame to HW and then check > fifo size. If fifo < async_depth and the top frame is not ready, it will > return AVERROR(EAGAIN) to require more frames. > > 1080p transcoding (no B frames) with -async_depth=4 can increase 20% > performance on my environment. > The async increases performance but also introduces frame delay. > > Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> > --- > libavcodec/vaapi_encode.c | 16 ++++++++++++---- > libavcodec/vaapi_encode.h | 12 ++++++++++-- > 2 files changed, 22 insertions(+), 6 deletions(-) > > diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c > index 15ddbbaa4a..432abf31f7 100644 > --- a/libavcodec/vaapi_encode.c > +++ b/libavcodec/vaapi_encode.c > @@ -1158,7 +1158,8 @@ static int > vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame) > if (ctx->input_order == ctx->decode_delay) > ctx->dts_pts_diff = pic->pts - ctx->first_pts; > if (ctx->output_delay > 0) > - ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic->pts; > + ctx->ts_ring[ctx->input_order % > + (3 * ctx->output_delay + ctx->async_depth)] = pic->pts; > > pic->display_order = ctx->input_order; > ++ctx->input_order; > @@ -1214,7 +1215,7 @@ int > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > > #if VA_CHECK_VERSION(1, 9, 0) > if (ctx->has_sync_buffer_func) { > - while (av_fifo_can_read(ctx->encode_fifo) <= > MAX_PICTURE_REFERENCES) { > + while (av_fifo_can_read(ctx->encode_fifo) <= MAX_ASYNC_DEPTH) { This is a mistake: I should use "<" instead of "<=", or alternatively use av_fifo_can_write(). I will update it. 
> pic = NULL; > err = vaapi_encode_pick_next(avctx, &pic); > if (err < 0) > @@ -1232,6 +1233,13 @@ int > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > } > if (!av_fifo_can_read(ctx->encode_fifo)) > return err; > + if (av_fifo_can_read(ctx->encode_fifo) < ctx->async_depth && > + !ctx->end_of_stream) { > + av_fifo_peek(ctx->encode_fifo, &pic, 1, 0); > + err = vaapi_encode_wait(avctx, pic, 0); > + if (err < 0) > + return err; > + } > av_fifo_read(ctx->encode_fifo, &pic, 1); > ctx->encode_order = pic->encode_order + 1; > } else > @@ -1267,7 +1275,7 @@ int > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff; > } else { > pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) % > - (3 * ctx->output_delay)]; > + (3 * ctx->output_delay + ctx->async_depth)]; > } > av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" > dts %"PRId64".\n", > pkt->pts, pkt->dts); > @@ -2588,7 +2596,7 @@ av_cold int ff_vaapi_encode_init(AVCodecContext > *avctx) > vas = vaSyncBuffer(ctx->hwctx->display, 0, 0); > if (vas != VA_STATUS_ERROR_UNIMPLEMENTED) { > ctx->has_sync_buffer_func = 1; > - ctx->encode_fifo = av_fifo_alloc2(MAX_PICTURE_REFERENCES + 1, > + ctx->encode_fifo = av_fifo_alloc2(MAX_ASYNC_DEPTH, > sizeof(VAAPIEncodePicture *), > 0); > if (!ctx->encode_fifo) > diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h > index d33a486cb8..691521387d 100644 > --- a/libavcodec/vaapi_encode.h > +++ b/libavcodec/vaapi_encode.h > @@ -48,6 +48,7 @@ enum { > MAX_TILE_ROWS = 22, > // A.4.1: table A.6 allows at most 20 tile columns for any level. > MAX_TILE_COLS = 20, > + MAX_ASYNC_DEPTH = 64, > }; > > extern const AVCodecHWConfigInternal *const > ff_vaapi_encode_hw_configs[]; > @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext { > // Timestamp handling. 
> int64_t first_pts; > int64_t dts_pts_diff; > - int64_t ts_ring[MAX_REORDER_DELAY * 3]; > + int64_t ts_ring[MAX_REORDER_DELAY * 3 + > + MAX_ASYNC_DEPTH]; > > // Slice structure. > int slice_block_rows; > @@ -350,6 +352,8 @@ typedef struct VAAPIEncodeContext { > AVFifo *encode_fifo; > //Whether the driver support vaSyncBuffer > int has_sync_buffer_func; > + //Max number of frame buffered in encoder. > + int async_depth; > } VAAPIEncodeContext; > > enum { > @@ -460,7 +464,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx); > { "b_depth", \ > "Maximum B-frame reference depth", \ > OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \ > - { .i64 = 1 }, 1, INT_MAX, FLAGS } > + { .i64 = 1 }, 1, INT_MAX, FLAGS }, \ > + { "async_depth", "Maximum processing parallelism. " \ > + "Increase this to improve single channel performance", \ > + OFFSET(common.async_depth), AV_OPT_TYPE_INT, \ > + { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS } > > #define VAAPI_ENCODE_RC_MODE(name, desc) \ > { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \ > -- > 2.32.0 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> Add async_depth to increase encoder's performance. Reuse encode_fifo as > async buffer. Encoder puts all reordered frame to HW and then check > fifo size. If fifo < async_depth and the top frame is not ready, it will > return AVERROR(EAGAIN) to require more frames. > > 1080p transcoding (no B frames) with -async_depth=4 can increase 20% > performance on my environment. > The async increases performance but also introduces frame delay. > > Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> > --- > libavcodec/vaapi_encode.c | 16 ++++++++++++---- > libavcodec/vaapi_encode.h | 12 ++++++++++-- > 2 files changed, 22 insertions(+), 6 deletions(-) > > diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c > index 15ddbbaa4a..432abf31f7 100644 > --- a/libavcodec/vaapi_encode.c > +++ b/libavcodec/vaapi_encode.c > @@ -1158,7 +1158,8 @@ static int vaapi_encode_send_frame(AVCodecContext > *avctx, AVFrame *frame) > if (ctx->input_order == ctx->decode_delay) > ctx->dts_pts_diff = pic->pts - ctx->first_pts; > if (ctx->output_delay > 0) > - ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic- > >pts; > + ctx->ts_ring[ctx->input_order % > + (3 * ctx->output_delay + ctx->async_depth)] = pic- > >pts; > > pic->display_order = ctx->input_order; > ++ctx->input_order; > @@ -1214,7 +1215,7 @@ int ff_vaapi_encode_receive_packet(AVCodecContext > *avctx, AVPacket *pkt) > > #if VA_CHECK_VERSION(1, 9, 0) > if (ctx->has_sync_buffer_func) { > - while (av_fifo_can_read(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES) > { > + while (av_fifo_can_read(ctx->encode_fifo) <= MAX_ASYNC_DEPTH) { > pic = NULL; > err = vaapi_encode_pick_next(avctx, &pic); > if (err < 0) > @@ -1232,6 +1233,13 @@ int ff_vaapi_encode_receive_packet(AVCodecContext > *avctx, AVPacket *pkt) > } > if (!av_fifo_can_read(ctx->encode_fifo)) > return err; > + if (av_fifo_can_read(ctx->encode_fifo) < ctx->async_depth && > + !ctx->end_of_stream) { > + av_fifo_peek(ctx->encode_fifo, &pic, 1, 0); > + err = 
vaapi_encode_wait(avctx, pic, 0); > + if (err < 0) > + return err; > + } > av_fifo_read(ctx->encode_fifo, &pic, 1); > ctx->encode_order = pic->encode_order + 1; > } else > @@ -1267,7 +1275,7 @@ int ff_vaapi_encode_receive_packet(AVCodecContext > *avctx, AVPacket *pkt) > pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff; > } else { > pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) % > - (3 * ctx->output_delay)]; > + (3 * ctx->output_delay + ctx->async_depth)]; > } > av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" dts > %"PRId64".\n", > pkt->pts, pkt->dts); > @@ -2588,7 +2596,7 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx) > vas = vaSyncBuffer(ctx->hwctx->display, 0, 0); > if (vas != VA_STATUS_ERROR_UNIMPLEMENTED) { > ctx->has_sync_buffer_func = 1; > - ctx->encode_fifo = av_fifo_alloc2(MAX_PICTURE_REFERENCES + 1, > + ctx->encode_fifo = av_fifo_alloc2(MAX_ASYNC_DEPTH, > sizeof(VAAPIEncodePicture *), > 0); > if (!ctx->encode_fifo) > diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h > index d33a486cb8..691521387d 100644 > --- a/libavcodec/vaapi_encode.h > +++ b/libavcodec/vaapi_encode.h > @@ -48,6 +48,7 @@ enum { > MAX_TILE_ROWS = 22, > // A.4.1: table A.6 allows at most 20 tile columns for any level. > MAX_TILE_COLS = 20, > + MAX_ASYNC_DEPTH = 64, > }; > > extern const AVCodecHWConfigInternal *const ff_vaapi_encode_hw_configs[]; > @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext { > // Timestamp handling. > int64_t first_pts; > int64_t dts_pts_diff; > - int64_t ts_ring[MAX_REORDER_DELAY * 3]; > + int64_t ts_ring[MAX_REORDER_DELAY * 3 + > + MAX_ASYNC_DEPTH]; > > // Slice structure. > int slice_block_rows; > @@ -350,6 +352,8 @@ typedef struct VAAPIEncodeContext { > AVFifo *encode_fifo; > //Whether the driver support vaSyncBuffer > int has_sync_buffer_func; > + //Max number of frame buffered in encoder. 
> + int async_depth; > } VAAPIEncodeContext; > > enum { > @@ -460,7 +464,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx); > { "b_depth", \ > "Maximum B-frame reference depth", \ > OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \ > - { .i64 = 1 }, 1, INT_MAX, FLAGS } > + { .i64 = 1 }, 1, INT_MAX, FLAGS }, \ > + { "async_depth", "Maximum processing parallelism. " \ > + "Increase this to improve single channel performance", \ async_depth is not available if vaSyncBuffer is not implemented; it would be better to mention this in the help string. Thanks, Haihao > + OFFSET(common.async_depth), AV_OPT_TYPE_INT, \ > + { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS } > > #define VAAPI_ENCODE_RC_MODE(name, desc) \ > { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \
diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c index 15ddbbaa4a..432abf31f7 100644 --- a/libavcodec/vaapi_encode.c +++ b/libavcodec/vaapi_encode.c @@ -1158,7 +1158,8 @@ static int vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame) if (ctx->input_order == ctx->decode_delay) ctx->dts_pts_diff = pic->pts - ctx->first_pts; if (ctx->output_delay > 0) - ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic->pts; + ctx->ts_ring[ctx->input_order % + (3 * ctx->output_delay + ctx->async_depth)] = pic->pts; pic->display_order = ctx->input_order; ++ctx->input_order; @@ -1214,7 +1215,7 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) #if VA_CHECK_VERSION(1, 9, 0) if (ctx->has_sync_buffer_func) { - while (av_fifo_can_read(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES) { + while (av_fifo_can_read(ctx->encode_fifo) <= MAX_ASYNC_DEPTH) { pic = NULL; err = vaapi_encode_pick_next(avctx, &pic); if (err < 0) @@ -1232,6 +1233,13 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) } if (!av_fifo_can_read(ctx->encode_fifo)) return err; + if (av_fifo_can_read(ctx->encode_fifo) < ctx->async_depth && + !ctx->end_of_stream) { + av_fifo_peek(ctx->encode_fifo, &pic, 1, 0); + err = vaapi_encode_wait(avctx, pic, 0); + if (err < 0) + return err; + } av_fifo_read(ctx->encode_fifo, &pic, 1); ctx->encode_order = pic->encode_order + 1; } else @@ -1267,7 +1275,7 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff; } else { pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) % - (3 * ctx->output_delay)]; + (3 * ctx->output_delay + ctx->async_depth)]; } av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" dts %"PRId64".\n", pkt->pts, pkt->dts); @@ -2588,7 +2596,7 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx) vas = vaSyncBuffer(ctx->hwctx->display, 0, 0); if (vas != VA_STATUS_ERROR_UNIMPLEMENTED) { 
ctx->has_sync_buffer_func = 1; - ctx->encode_fifo = av_fifo_alloc2(MAX_PICTURE_REFERENCES + 1, + ctx->encode_fifo = av_fifo_alloc2(MAX_ASYNC_DEPTH, sizeof(VAAPIEncodePicture *), 0); if (!ctx->encode_fifo) diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h index d33a486cb8..691521387d 100644 --- a/libavcodec/vaapi_encode.h +++ b/libavcodec/vaapi_encode.h @@ -48,6 +48,7 @@ enum { MAX_TILE_ROWS = 22, // A.4.1: table A.6 allows at most 20 tile columns for any level. MAX_TILE_COLS = 20, + MAX_ASYNC_DEPTH = 64, }; extern const AVCodecHWConfigInternal *const ff_vaapi_encode_hw_configs[]; @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext { // Timestamp handling. int64_t first_pts; int64_t dts_pts_diff; - int64_t ts_ring[MAX_REORDER_DELAY * 3]; + int64_t ts_ring[MAX_REORDER_DELAY * 3 + + MAX_ASYNC_DEPTH]; // Slice structure. int slice_block_rows; @@ -350,6 +352,8 @@ typedef struct VAAPIEncodeContext { AVFifo *encode_fifo; //Whether the driver support vaSyncBuffer int has_sync_buffer_func; + //Max number of frame buffered in encoder. + int async_depth; } VAAPIEncodeContext; enum { @@ -460,7 +464,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx); { "b_depth", \ "Maximum B-frame reference depth", \ OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \ - { .i64 = 1 }, 1, INT_MAX, FLAGS } + { .i64 = 1 }, 1, INT_MAX, FLAGS }, \ + { "async_depth", "Maximum processing parallelism. " \ + "Increase this to improve single channel performance", \ + OFFSET(common.async_depth), AV_OPT_TYPE_INT, \ + { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS } #define VAAPI_ENCODE_RC_MODE(name, desc) \ { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \
Add async_depth to increase the encoder's performance. Reuse encode_fifo as the async buffer. The encoder submits all reordered frames to the HW and then checks the fifo size. If the fifo holds fewer than async_depth frames and the top frame is not ready, it returns AVERROR(EAGAIN) to request more frames. 1080p transcoding (no B frames) with -async_depth=4 can increase performance by 20% in my environment. Asynchronous encoding increases performance but also introduces frame delay. Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> --- libavcodec/vaapi_encode.c | 16 ++++++++++++---- libavcodec/vaapi_encode.h | 12 ++++++++++-- 2 files changed, 22 insertions(+), 6 deletions(-)