Message ID | 20191107163203.6799-1-linjie.fu@intel.com |
---|---|
State | New |
Headers | show |
> -----Original Message----- > From: Fu, Linjie <linjie.fu@intel.com> > Sent: Friday, November 8, 2019 00:32 > To: ffmpeg-devel@ffmpeg.org > Cc: Fu, Linjie <linjie.fu@intel.com> > Subject: [PATCH] lavc/vaapi_encode: Async the encoding and output > procedure of encoder > > Currently, vaapi encodes a pic if all its references are ready, > and then outputs it immediately by calling vaapi_encode_output. > > However, while working on output procedure, hardware is be able to > cope with encoding tasks in the meantime to have the better performance. > > So a more efficient way is to send all the pics with available refs to > hardware to allow encoding while output. > > It's what vaapi originally did before the regression, and the performance > could be improved for ~20%. > > CMD: > ffmpeg -hwaccel vaapi -vaapi_device /dev/dri/renderD128 > -hwaccel_output_format vaapi -i bbb_sunflower_1080p_30fps_normal.mp4 > -c:v h264_vaapi -f h264 -y /dev/null > > Source: > https://download.blender.org/demo/movies/BBB/ > > Before: > ~164 fps > After: > ~198 fps > > Fix #7706. 
> > Signed-off-by: Linjie Fu <linjie.fu@intel.com> > --- > libavcodec/vaapi_encode.c | 27 +++++++++++++++++++-------- > 1 file changed, 19 insertions(+), 8 deletions(-) > > diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c > index 3be9159d37..aceb268315 100644 > --- a/libavcodec/vaapi_encode.c > +++ b/libavcodec/vaapi_encode.c > @@ -1109,17 +1109,28 @@ int > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > return AVERROR(EAGAIN); > } > > +pick_next: > pic = NULL; > err = vaapi_encode_pick_next(avctx, &pic); > - if (err < 0) > - return err; > - av_assert0(pic); > + if (!err) { > + av_assert0(pic); > > - pic->encode_order = ctx->encode_order++; > + pic->encode_order = ctx->encode_order++; > > - err = vaapi_encode_issue(avctx, pic); > - if (err < 0) { > - av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); > + err = vaapi_encode_issue(avctx, pic); > + if (err < 0) { > + av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); > + return err; > + } > + goto pick_next; > + } else if (err == AVERROR(EAGAIN)) { > + for (pic = ctx->pic_start; pic; pic = pic->next) > + if (pic->encode_issued && !pic->encode_complete && > + pic->encode_order == ctx->output_order) > + break; > + if (!pic) > + return err; > + } else { > return err; > } > > @@ -1143,7 +1154,7 @@ int > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" > dts %"PRId64".\n", > pkt->pts, pkt->dts); > > - ctx->output_order = pic->encode_order; > + ctx->output_order++; > vaapi_encode_clear_old(avctx); > > return 0; Ping. - linjie
> -----Original Message----- > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Fu, > Linjie > Sent: Monday, November 11, 2019 17:43 > To: ffmpeg-devel@ffmpeg.org > Subject: Re: [FFmpeg-devel] [PATCH] lavc/vaapi_encode: Async the > encoding and output procedure of encoder > > > -----Original Message----- > > From: Fu, Linjie <linjie.fu@intel.com> > > Sent: Friday, November 8, 2019 00:32 > > To: ffmpeg-devel@ffmpeg.org > > Cc: Fu, Linjie <linjie.fu@intel.com> > > Subject: [PATCH] lavc/vaapi_encode: Async the encoding and output > > procedure of encoder > > > > Currently, vaapi encodes a pic if all its references are ready, > > and then outputs it immediately by calling vaapi_encode_output. > > > > However, while working on output procedure, hardware is be able to > > cope with encoding tasks in the meantime to have the better performance. > > > > So a more efficient way is to send all the pics with available refs to > > hardware to allow encoding while output. > > > > It's what vaapi originally did before the regression, and the performance > > could be improved for ~20%. > > > > CMD: > > ffmpeg -hwaccel vaapi -vaapi_device /dev/dri/renderD128 > > -hwaccel_output_format vaapi -i > bbb_sunflower_1080p_30fps_normal.mp4 > > -c:v h264_vaapi -f h264 -y /dev/null > > > > Source: > > https://download.blender.org/demo/movies/BBB/ > > > > Before: > > ~164 fps > > After: > > ~198 fps > > > > Fix #7706. 
> > > > Signed-off-by: Linjie Fu <linjie.fu@intel.com> > > --- > > libavcodec/vaapi_encode.c | 27 +++++++++++++++++++-------- > > 1 file changed, 19 insertions(+), 8 deletions(-) > > > > diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c > > index 3be9159d37..aceb268315 100644 > > --- a/libavcodec/vaapi_encode.c > > +++ b/libavcodec/vaapi_encode.c > > @@ -1109,17 +1109,28 @@ int > > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > > return AVERROR(EAGAIN); > > } > > > > +pick_next: > > pic = NULL; > > err = vaapi_encode_pick_next(avctx, &pic); > > - if (err < 0) > > - return err; > > - av_assert0(pic); > > + if (!err) { > > + av_assert0(pic); > > > > - pic->encode_order = ctx->encode_order++; > > + pic->encode_order = ctx->encode_order++; > > > > - err = vaapi_encode_issue(avctx, pic); > > - if (err < 0) { > > - av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); > > + err = vaapi_encode_issue(avctx, pic); > > + if (err < 0) { > > + av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); > > + return err; > > + } > > + goto pick_next; > > + } else if (err == AVERROR(EAGAIN)) { > > + for (pic = ctx->pic_start; pic; pic = pic->next) > > + if (pic->encode_issued && !pic->encode_complete && > > + pic->encode_order == ctx->output_order) > > + break; > > + if (!pic) > > + return err; > > + } else { > > return err; > > } > > > > @@ -1143,7 +1154,7 @@ int > > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > > av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" > > dts %"PRId64".\n", > > pkt->pts, pkt->dts); > > > > - ctx->output_order = pic->encode_order; > > + ctx->output_order++; > > vaapi_encode_clear_old(avctx); > > > > return 0; > > Ping. Ping for this. Any advice or comment would be appreciated. Thanks, Linjie
On Thu, 14 Nov 2019, 18:29 Fu, Linjie, <linjie.fu@intel.com> wrote: > > -----Original Message----- > > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Fu, > > Linjie > > Sent: Monday, November 11, 2019 17:43 > > To: ffmpeg-devel@ffmpeg.org > > Subject: Re: [FFmpeg-devel] [PATCH] lavc/vaapi_encode: Async the > > encoding and output procedure of encoder > > > > > -----Original Message----- > > > From: Fu, Linjie <linjie.fu@intel.com> > > > Sent: Friday, November 8, 2019 00:32 > > > To: ffmpeg-devel@ffmpeg.org > > > Cc: Fu, Linjie <linjie.fu@intel.com> > > > Subject: [PATCH] lavc/vaapi_encode: Async the encoding and output > > > procedure of encoder > > > > > > Currently, vaapi encodes a pic if all its references are ready, > > > and then outputs it immediately by calling vaapi_encode_output. > > > > > > However, while working on output procedure, hardware is be able to > > > cope with encoding tasks in the meantime to have the better > performance. > > > > > > So a more efficient way is to send all the pics with available refs to > > > hardware to allow encoding while output. > > > > > > It's what vaapi originally did before the regression, and the > performance > > > could be improved for ~20%. > > > > > > CMD: > > > ffmpeg -hwaccel vaapi -vaapi_device /dev/dri/renderD128 > > > -hwaccel_output_format vaapi -i > > bbb_sunflower_1080p_30fps_normal.mp4 > > > -c:v h264_vaapi -f h264 -y /dev/null > > > > > > Source: > > > https://download.blender.org/demo/movies/BBB/ > > > > > > Before: > > > ~164 fps > > > After: > > > ~198 fps > > > > > > Fix #7706. 
> > > > > > Signed-off-by: Linjie Fu <linjie.fu@intel.com> > > > --- > > > libavcodec/vaapi_encode.c | 27 +++++++++++++++++++-------- > > > 1 file changed, 19 insertions(+), 8 deletions(-) > > > > > > diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c > > > index 3be9159d37..aceb268315 100644 > > > --- a/libavcodec/vaapi_encode.c > > > +++ b/libavcodec/vaapi_encode.c > > > @@ -1109,17 +1109,28 @@ int > > > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > > > return AVERROR(EAGAIN); > > > } > > > > > > +pick_next: > > > pic = NULL; > > > err = vaapi_encode_pick_next(avctx, &pic); > > > - if (err < 0) > > > - return err; > > > - av_assert0(pic); > > > + if (!err) { > > > + av_assert0(pic); > > > > > > - pic->encode_order = ctx->encode_order++; > > > + pic->encode_order = ctx->encode_order++; > > > > > > - err = vaapi_encode_issue(avctx, pic); > > > - if (err < 0) { > > > - av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); > > > + err = vaapi_encode_issue(avctx, pic); > > > + if (err < 0) { > > > + av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); > > > + return err; > > > + } > > > + goto pick_next; > > > + } else if (err == AVERROR(EAGAIN)) { > > > + for (pic = ctx->pic_start; pic; pic = pic->next) > > > + if (pic->encode_issued && !pic->encode_complete && > > > + pic->encode_order == ctx->output_order) > > > + break; > > > + if (!pic) > > > + return err; > > > + } else { > > > return err; > > > } > > > > > > @@ -1143,7 +1154,7 @@ int > > > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > > > av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" > > > dts %"PRId64".\n", > > > pkt->pts, pkt->dts); > > > > > > - ctx->output_order = pic->encode_order; > > > + ctx->output_order++; > > > vaapi_encode_clear_old(avctx); > > > > > > return 0; > > > > Ping. > > Ping for this. > Any advice or comment would be appreciated. 
> > Thanks, > Linjie > The drop is even more severe on AMD hardware with VAAPI.
On Sun, 17 Nov 2019, 06:36 Dennis Mungai, <dmngaie@gmail.com> wrote: > > > On Thu, 14 Nov 2019, 18:29 Fu, Linjie, <linjie.fu@intel.com> wrote: > >> > -----Original Message----- >> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Fu, >> > Linjie >> > Sent: Monday, November 11, 2019 17:43 >> > To: ffmpeg-devel@ffmpeg.org >> > Subject: Re: [FFmpeg-devel] [PATCH] lavc/vaapi_encode: Async the >> > encoding and output procedure of encoder >> > >> > > -----Original Message----- >> > > From: Fu, Linjie <linjie.fu@intel.com> >> > > Sent: Friday, November 8, 2019 00:32 >> > > To: ffmpeg-devel@ffmpeg.org >> > > Cc: Fu, Linjie <linjie.fu@intel.com> >> > > Subject: [PATCH] lavc/vaapi_encode: Async the encoding and output >> > > procedure of encoder >> > > >> > > Currently, vaapi encodes a pic if all its references are ready, >> > > and then outputs it immediately by calling vaapi_encode_output. >> > > >> > > However, while working on output procedure, hardware is be able to >> > > cope with encoding tasks in the meantime to have the better >> performance. >> > > >> > > So a more efficient way is to send all the pics with available refs to >> > > hardware to allow encoding while output. >> > > >> > > It's what vaapi originally did before the regression, and the >> performance >> > > could be improved for ~20%. >> > > >> > > CMD: >> > > ffmpeg -hwaccel vaapi -vaapi_device /dev/dri/renderD128 >> > > -hwaccel_output_format vaapi -i >> > bbb_sunflower_1080p_30fps_normal.mp4 >> > > -c:v h264_vaapi -f h264 -y /dev/null >> > > >> > > Source: >> > > https://download.blender.org/demo/movies/BBB/ >> > > >> > > Before: >> > > ~164 fps >> > > After: >> > > ~198 fps >> > > >> > > Fix #7706. 
>> > > >> > > Signed-off-by: Linjie Fu <linjie.fu@intel.com> >> > > --- >> > > libavcodec/vaapi_encode.c | 27 +++++++++++++++++++-------- >> > > 1 file changed, 19 insertions(+), 8 deletions(-) >> > > >> > > diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c >> > > index 3be9159d37..aceb268315 100644 >> > > --- a/libavcodec/vaapi_encode.c >> > > +++ b/libavcodec/vaapi_encode.c >> > > @@ -1109,17 +1109,28 @@ int >> > > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) >> > > return AVERROR(EAGAIN); >> > > } >> > > >> > > +pick_next: >> > > pic = NULL; >> > > err = vaapi_encode_pick_next(avctx, &pic); >> > > - if (err < 0) >> > > - return err; >> > > - av_assert0(pic); >> > > + if (!err) { >> > > + av_assert0(pic); >> > > >> > > - pic->encode_order = ctx->encode_order++; >> > > + pic->encode_order = ctx->encode_order++; >> > > >> > > - err = vaapi_encode_issue(avctx, pic); >> > > - if (err < 0) { >> > > - av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); >> > > + err = vaapi_encode_issue(avctx, pic); >> > > + if (err < 0) { >> > > + av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); >> > > + return err; >> > > + } >> > > + goto pick_next; >> > > + } else if (err == AVERROR(EAGAIN)) { >> > > + for (pic = ctx->pic_start; pic; pic = pic->next) >> > > + if (pic->encode_issued && !pic->encode_complete && >> > > + pic->encode_order == ctx->output_order) >> > > + break; >> > > + if (!pic) >> > > + return err; >> > > + } else { >> > > return err; >> > > } >> > > >> > > @@ -1143,7 +1154,7 @@ int >> > > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) >> > > av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" >> > > dts %"PRId64".\n", >> > > pkt->pts, pkt->dts); >> > > >> > > - ctx->output_order = pic->encode_order; >> > > + ctx->output_order++; >> > > vaapi_encode_clear_old(avctx); >> > > >> > > return 0; >> > >> > Ping. >> >> Ping for this. >> Any advice or comment would be appreciated. 
>> >> Thanks, >> Linjie >> > > The drop is even more severe on AMD hardware with VAAPI. > Without this patch, perf regression on AMD VAAPI is almost 40%. On Intel hardware (HSW+) it's closer to 30% in the worst case scenarios. > >
On 07/11/2019 16:32, Linjie Fu wrote: > Currently, vaapi encodes a pic if all its references are ready, > and then outputs it immediately by calling vaapi_encode_output. > > However, while working on output procedure, hardware is be able to > cope with encoding tasks in the meantime to have the better performance. > > So a more efficient way is to send all the pics with available refs to > hardware to allow encoding while output. > > It's what vaapi originally did before the regression, and the performance > could be improved for ~20%. > > CMD: > ffmpeg -hwaccel vaapi -vaapi_device /dev/dri/renderD128 > -hwaccel_output_format vaapi -i bbb_sunflower_1080p_30fps_normal.mp4 > -c:v h264_vaapi -f h264 -y /dev/null > > Source: > https://download.blender.org/demo/movies/BBB/ > > Before: > ~164 fps > After: > ~198 fps > > Fix #7706. > > Signed-off-by: Linjie Fu <linjie.fu@intel.com> > --- > libavcodec/vaapi_encode.c | 27 +++++++++++++++++++-------- > 1 file changed, 19 insertions(+), 8 deletions(-) > > diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c > index 3be9159d37..aceb268315 100644 > --- a/libavcodec/vaapi_encode.c > +++ b/libavcodec/vaapi_encode.c > @@ -1109,17 +1109,28 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > return AVERROR(EAGAIN); > } > > +pick_next: > pic = NULL; > err = vaapi_encode_pick_next(avctx, &pic); > - if (err < 0) > - return err; > - av_assert0(pic); > + if (!err) { > + av_assert0(pic); > > - pic->encode_order = ctx->encode_order++; > + pic->encode_order = ctx->encode_order++; > > - err = vaapi_encode_issue(avctx, pic); > - if (err < 0) { > - av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); > + err = vaapi_encode_issue(avctx, pic); > + if (err < 0) { > + av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); > + return err; > + } > + goto pick_next; > + } else if (err == AVERROR(EAGAIN)) { > + for (pic = ctx->pic_start; pic; pic = pic->next) > + if (pic->encode_issued && 
!pic->encode_complete && > + pic->encode_order == ctx->output_order) > + break; > + if (!pic) > + return err; > + } else { > return err; > } > > @@ -1143,7 +1154,7 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" dts %"PRId64".\n", > pkt->pts, pkt->dts); > > - ctx->output_order = pic->encode_order; > + ctx->output_order++; > vaapi_encode_clear_old(avctx); > > return 0; > The sync in the same receive call here is required for correctness because of how VAAPI syncs to the input surface (consider the interactions of a split followed by two encoders using the same surfaces). I didn't realise that for a long time, hence the error existing in earlier versions. Relatedly, this change would significantly increase latency on Intel platforms because vaEndPicture() is mostly synchronous there, and you're now calling it multiple times before returning anything. More generally, though, the API definition here is just really stupid. If you're interested it would be much better to fix this in the API - sync-to-output of some kind would make far more sense, as would some kind of event-based system (extra points if it can interop via fds to normal poll() and Vulkan). - Mark
> Mark Thompson: > Sent: Monday, November 18, 2019 07:14 > To: ffmpeg-devel@ffmpeg.org > Subject: Re: [FFmpeg-devel] [PATCH] lavc/vaapi_encode: Async the > encoding and output procedure of encoder > > On 07/11/2019 16:32, Linjie Fu wrote: > > Currently, vaapi encodes a pic if all its references are ready, > > and then outputs it immediately by calling vaapi_encode_output. > > > > However, while working on output procedure, hardware is be able to > > cope with encoding tasks in the meantime to have the better performance. > > > > So a more efficient way is to send all the pics with available refs to > > hardware to allow encoding while output. > > > > It's what vaapi originally did before the regression, and the performance > > could be improved for ~20%. > > > > CMD: > > ffmpeg -hwaccel vaapi -vaapi_device /dev/dri/renderD128 > > -hwaccel_output_format vaapi -i > bbb_sunflower_1080p_30fps_normal.mp4 > > -c:v h264_vaapi -f h264 -y /dev/null > > > > Source: > > https://download.blender.org/demo/movies/BBB/ > > > > Before: > > ~164 fps > > After: > > ~198 fps > > > > Fix #7706. > > > > Signed-off-by: Linjie Fu <linjie.fu@intel.com> > > --- > > The sync in the same receive call here is required for correctness because of > the how VAAPI syncs to the input surface (consider the interactions of a split > followed by two encoders using the same surfaces). I didn't realise that for a > long time, hence the error existing in earlier versions. Indeed, you're right: the 1:N encoding procedure suffers from this. The proper fix is to provide a new API to sync the coded buffer, which is independent for each encoding task, instead of syncing the shared input surface. (See comments below.) > > Relatedly, this change would significantly increase latency on Intel platforms > because vaEndPicture() is mostly synchronous there, and you're now calling > it multiple times before returning anything. 
As for the latency, how about adding an option like "-async 0/1" to specify the user requirement: for live streaming, the user may use -async 0 to minimize the latency; for video on demand, the user may use -async 1 to maximize the performance. > > More generally, though, the API definition here is just really stupid. If you're > interested it would be much better to fix this in the API - sync-to-output of > some kind would make far more sense, as would some kind of event-based > system (extra points if it can interop via fds to normal poll() and Vulkan). > We have a proposal[1] to libva to introduce a new function that synchronizes the encoder on the output bitstream, and we are looking forward to your comments. - Linjie [1] <https://github.com/intel/libva/pull/408>
diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c index 3be9159d37..aceb268315 100644 --- a/libavcodec/vaapi_encode.c +++ b/libavcodec/vaapi_encode.c @@ -1109,17 +1109,28 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) return AVERROR(EAGAIN); } +pick_next: pic = NULL; err = vaapi_encode_pick_next(avctx, &pic); - if (err < 0) - return err; - av_assert0(pic); + if (!err) { + av_assert0(pic); - pic->encode_order = ctx->encode_order++; + pic->encode_order = ctx->encode_order++; - err = vaapi_encode_issue(avctx, pic); - if (err < 0) { - av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); + err = vaapi_encode_issue(avctx, pic); + if (err < 0) { + av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); + return err; + } + goto pick_next; + } else if (err == AVERROR(EAGAIN)) { + for (pic = ctx->pic_start; pic; pic = pic->next) + if (pic->encode_issued && !pic->encode_complete && + pic->encode_order == ctx->output_order) + break; + if (!pic) + return err; + } else { return err; } @@ -1143,7 +1154,7 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" dts %"PRId64".\n", pkt->pts, pkt->dts); - ctx->output_order = pic->encode_order; + ctx->output_order++; vaapi_encode_clear_old(avctx); return 0;
Currently, vaapi encodes a pic if all its references are ready, and then outputs it immediately by calling vaapi_encode_output. However, while working on the output procedure, the hardware is able to cope with encoding tasks in the meantime, giving better performance. So a more efficient way is to send all the pics with available refs to hardware to allow encoding during output. It's what vaapi originally did before the regression, and the performance could be improved by ~20%. CMD: ffmpeg -hwaccel vaapi -vaapi_device /dev/dri/renderD128 -hwaccel_output_format vaapi -i bbb_sunflower_1080p_30fps_normal.mp4 -c:v h264_vaapi -f h264 -y /dev/null Source: https://download.blender.org/demo/movies/BBB/ Before: ~164 fps After: ~198 fps Fix #7706. Signed-off-by: Linjie Fu <linjie.fu@intel.com> --- libavcodec/vaapi_encode.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-)