Message ID | 20211027085705.4114165-1-wenbin.chen@intel.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,1/3] libavcodec/vaapi_encode: Change the way to call async to increase performance | expand |
Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
andriy/make_ppc | success | Make finished |
andriy/make_fate_ppc | success | Make fate finished |
> Fix: #7706. After commit 5fdcf85bbffe7451c2, vaapi encoder's performance > decrease. The reason is that vaRenderPicture() and vaSyncSurface() are > called at the same time (vaRenderPicture() always followed by a > vaSyncSurface()). When we encode stream with B frames, we need buffer to > reorder frames, so we can send serveral frames to HW at once to increase > performance. Now I changed them to be called in a > asynchronous way, which will make better use of hardware. > 1080p transcoding increases about 17% fps on my environment. > > Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> > --- > libavcodec/vaapi_encode.c | 41 ++++++++++++++++++++++++++++----------- > libavcodec/vaapi_encode.h | 3 +++ > 2 files changed, 33 insertions(+), 11 deletions(-) > > diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c > index ec054ae701..5927849233 100644 > --- a/libavcodec/vaapi_encode.c > +++ b/libavcodec/vaapi_encode.c > @@ -951,8 +951,10 @@ static int vaapi_encode_pick_next(AVCodecContext > *avctx, > if (!pic && ctx->end_of_stream) { > --b_counter; > pic = ctx->pic_end; > - if (pic->encode_issued) > + if (pic->encode_complete) > return AVERROR_EOF; > + else if (pic->encode_issued) > + return AVERROR(EAGAIN); > } > > if (!pic) { > @@ -1177,20 +1179,31 @@ int > ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) > return AVERROR(EAGAIN); > } > > - pic = NULL; > - err = vaapi_encode_pick_next(avctx, &pic); > - if (err < 0) > - return err; > - av_assert0(pic); > + while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES * > sizeof(VAAPIEncodePicture *)) { > + pic = NULL; > + err = vaapi_encode_pick_next(avctx, &pic); > + if (err < 0) > + break; > + av_assert0(pic); > > - pic->encode_order = ctx->encode_order++; > + pic->encode_order = ctx->encode_order + > + (av_fifo_size(ctx->encode_fifo) / sizeof(VAAPIEncodePicture > *)); > > - err = vaapi_encode_issue(avctx, pic); > - if (err < 0) { > - av_log(avctx, AV_LOG_ERROR, "Encode failed: 
%d.\n", err); > - return err; > + err = vaapi_encode_issue(avctx, pic); > + if (err < 0) { > + av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); > + return err; > + } > + > + av_fifo_generic_write(ctx->encode_fifo, &pic, sizeof(pic), NULL); > } > > + if (!av_fifo_size(ctx->encode_fifo)) > + return err; > + > + av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL); > + ctx->encode_order = pic->encode_order + 1; > + > err = vaapi_encode_output(avctx, pic, pkt); > if (err < 0) { > av_log(avctx, AV_LOG_ERROR, "Output failed: %d.\n", err); > @@ -2520,6 +2533,11 @@ av_cold int > ff_vaapi_encode_init(AVCodecContext *avctx) > } > } > > + ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) * > + sizeof(VAAPIEncodePicture *)); > + if (!ctx->encode_fifo) > + return AVERROR(ENOMEM); > + > return 0; > > fail: > @@ -2552,6 +2570,7 @@ av_cold int > ff_vaapi_encode_close(AVCodecContext *avctx) > > av_freep(&ctx->codec_sequence_params); > av_freep(&ctx->codec_picture_params); > + av_fifo_freep(&ctx->encode_fifo); > > av_buffer_unref(&ctx->recon_frames_ref); > av_buffer_unref(&ctx->input_frames_ref); > diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h > index b41604a883..89fe8de466 100644 > --- a/libavcodec/vaapi_encode.h > +++ b/libavcodec/vaapi_encode.h > @@ -29,6 +29,7 @@ > > #include "libavutil/hwcontext.h" > #include "libavutil/hwcontext_vaapi.h" > +#include "libavutil/fifo.h" > > #include "avcodec.h" > #include "hwconfig.h" > @@ -345,6 +346,8 @@ typedef struct VAAPIEncodeContext { > int roi_warned; > > AVFrame *frame; > + > + AVFifoBuffer *encode_fifo; > } VAAPIEncodeContext; > > enum { > -- > 2.25.1 ping
On Wed, 2021-10-27 at 16:57 +0800, Wenbin Chen wrote: > Fix: #7706. After commit 5fdcf85bbffe7451c2, vaapi encoder's performance > decrease. The reason is that vaRenderPicture() and vaSyncSurface() are > called at the same time (vaRenderPicture() always followed by a > vaSyncSurface()). When we encode stream with B frames, we need buffer to > reorder frames, so we can send serveral frames to HW at once to increase > performance. Now I changed them to be called in a > asynchronous way, which will make better use of hardware. > 1080p transcoding increases about 17% fps on my environment. Could you provide your command ? I'd like to have a try. Thanks Haihao > > Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> > --- > libavcodec/vaapi_encode.c | 41 ++++++++++++++++++++++++++++----------- > libavcodec/vaapi_encode.h | 3 +++ > 2 files changed, 33 insertions(+), 11 deletions(-) > > diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c > index ec054ae701..5927849233 100644 > --- a/libavcodec/vaapi_encode.c > +++ b/libavcodec/vaapi_encode.c > @@ -951,8 +951,10 @@ static int vaapi_encode_pick_next(AVCodecContext *avctx, > if (!pic && ctx->end_of_stream) { > --b_counter; > pic = ctx->pic_end; > - if (pic->encode_issued) > + if (pic->encode_complete) > return AVERROR_EOF; > + else if (pic->encode_issued) > + return AVERROR(EAGAIN); > } > > if (!pic) { > @@ -1177,20 +1179,31 @@ int ff_vaapi_encode_receive_packet(AVCodecContext > *avctx, AVPacket *pkt) > return AVERROR(EAGAIN); > } > > - pic = NULL; > - err = vaapi_encode_pick_next(avctx, &pic); > - if (err < 0) > - return err; > - av_assert0(pic); > + while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES * > sizeof(VAAPIEncodePicture *)) { > + pic = NULL; > + err = vaapi_encode_pick_next(avctx, &pic); > + if (err < 0) > + break; > + av_assert0(pic); > > - pic->encode_order = ctx->encode_order++; > + pic->encode_order = ctx->encode_order + > + (av_fifo_size(ctx->encode_fifo) / > 
sizeof(VAAPIEncodePicture *)); > > - err = vaapi_encode_issue(avctx, pic); > - if (err < 0) { > - av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); > - return err; > + err = vaapi_encode_issue(avctx, pic); > + if (err < 0) { > + av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); > + return err; > + } > + > + av_fifo_generic_write(ctx->encode_fifo, &pic, sizeof(pic), NULL); > } > > + if (!av_fifo_size(ctx->encode_fifo)) > + return err; > + > + av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL); > + ctx->encode_order = pic->encode_order + 1; > + > err = vaapi_encode_output(avctx, pic, pkt); > if (err < 0) { > av_log(avctx, AV_LOG_ERROR, "Output failed: %d.\n", err); > @@ -2520,6 +2533,11 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx) > } > } > > + ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) * > + sizeof(VAAPIEncodePicture *)); > + if (!ctx->encode_fifo) > + return AVERROR(ENOMEM); > + > return 0; > > fail: > @@ -2552,6 +2570,7 @@ av_cold int ff_vaapi_encode_close(AVCodecContext *avctx) > > av_freep(&ctx->codec_sequence_params); > av_freep(&ctx->codec_picture_params); > + av_fifo_freep(&ctx->encode_fifo); > > av_buffer_unref(&ctx->recon_frames_ref); > av_buffer_unref(&ctx->input_frames_ref); > diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h > index b41604a883..89fe8de466 100644 > --- a/libavcodec/vaapi_encode.h > +++ b/libavcodec/vaapi_encode.h > @@ -29,6 +29,7 @@ > > #include "libavutil/hwcontext.h" > #include "libavutil/hwcontext_vaapi.h" > +#include "libavutil/fifo.h" > > #include "avcodec.h" > #include "hwconfig.h" > @@ -345,6 +346,8 @@ typedef struct VAAPIEncodeContext { > int roi_warned; > > AVFrame *frame; > + > + AVFifoBuffer *encode_fifo; > } VAAPIEncodeContext; > > enum {
> On Wed, 2021-10-27 at 16:57 +0800, Wenbin Chen wrote: > > Fix: #7706. After commit 5fdcf85bbffe7451c2, vaapi encoder's performance > > decrease. The reason is that vaRenderPicture() and vaSyncSurface() are > > called at the same time (vaRenderPicture() always followed by a > > vaSyncSurface()). When we encode stream with B frames, we need buffer > to > > reorder frames, so we can send serveral frames to HW at once to increase > > performance. Now I changed them to be called in a > > asynchronous way, which will make better use of hardware. > > 1080p transcoding increases about 17% fps on my environment. > > Could you provide your command ? I'd like to have a try. > > Thanks > Haihao Here is my command: ffmpeg -hwaccel vaapi -hwaccel_output_format vaapi -i input.264 -c:v h264_vaapi output.264 Thanks Wenbin > > > > > Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> > > --- > > libavcodec/vaapi_encode.c | 41 ++++++++++++++++++++++++++++----------- > > libavcodec/vaapi_encode.h | 3 +++ > > 2 files changed, 33 insertions(+), 11 deletions(-) > > > > diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c > > index ec054ae701..5927849233 100644 > > --- a/libavcodec/vaapi_encode.c > > +++ b/libavcodec/vaapi_encode.c > > @@ -951,8 +951,10 @@ static int > vaapi_encode_pick_next(AVCodecContext *avctx, > > if (!pic && ctx->end_of_stream) { > > --b_counter; > > pic = ctx->pic_end; > > - if (pic->encode_issued) > > + if (pic->encode_complete) > > return AVERROR_EOF; > > + else if (pic->encode_issued) > > + return AVERROR(EAGAIN); > > } > > > > if (!pic) { > > @@ -1177,20 +1179,31 @@ int > ff_vaapi_encode_receive_packet(AVCodecContext > > *avctx, AVPacket *pkt) > > return AVERROR(EAGAIN); > > } > > > > - pic = NULL; > > - err = vaapi_encode_pick_next(avctx, &pic); > > - if (err < 0) > > - return err; > > - av_assert0(pic); > > + while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES * > > sizeof(VAAPIEncodePicture *)) { > > + pic = NULL; > > + err = 
vaapi_encode_pick_next(avctx, &pic); > > + if (err < 0) > > + break; > > + av_assert0(pic); > > > > - pic->encode_order = ctx->encode_order++; > > + pic->encode_order = ctx->encode_order + > > + (av_fifo_size(ctx->encode_fifo) / > > sizeof(VAAPIEncodePicture *)); > > > > - err = vaapi_encode_issue(avctx, pic); > > - if (err < 0) { > > - av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); > > - return err; > > + err = vaapi_encode_issue(avctx, pic); > > + if (err < 0) { > > + av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); > > + return err; > > + } > > + > > + av_fifo_generic_write(ctx->encode_fifo, &pic, sizeof(pic), NULL); > > } > > > > + if (!av_fifo_size(ctx->encode_fifo)) > > + return err; > > + > > + av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL); > > + ctx->encode_order = pic->encode_order + 1; > > + > > err = vaapi_encode_output(avctx, pic, pkt); > > if (err < 0) { > > av_log(avctx, AV_LOG_ERROR, "Output failed: %d.\n", err); > > @@ -2520,6 +2533,11 @@ av_cold int > ff_vaapi_encode_init(AVCodecContext *avctx) > > } > > } > > > > + ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) * > > + sizeof(VAAPIEncodePicture *)); > > + if (!ctx->encode_fifo) > > + return AVERROR(ENOMEM); > > + > > return 0; > > > > fail: > > @@ -2552,6 +2570,7 @@ av_cold int > ff_vaapi_encode_close(AVCodecContext *avctx) > > > > av_freep(&ctx->codec_sequence_params); > > av_freep(&ctx->codec_picture_params); > > + av_fifo_freep(&ctx->encode_fifo); > > > > av_buffer_unref(&ctx->recon_frames_ref); > > av_buffer_unref(&ctx->input_frames_ref); > > diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h > > index b41604a883..89fe8de466 100644 > > --- a/libavcodec/vaapi_encode.h > > +++ b/libavcodec/vaapi_encode.h > > @@ -29,6 +29,7 @@ > > > > #include "libavutil/hwcontext.h" > > #include "libavutil/hwcontext_vaapi.h" > > +#include "libavutil/fifo.h" > > > > #include "avcodec.h" > > #include "hwconfig.h" > > @@ -345,6 +346,8 @@ typedef 
struct VAAPIEncodeContext { > > int roi_warned; > > > > AVFrame *frame; > > + > > + AVFifoBuffer *encode_fifo; > > } VAAPIEncodeContext; > > > > enum {
On 27/10/2021 09:57, Wenbin Chen wrote: > Fix: #7706. After commit 5fdcf85bbffe7451c2, vaapi encoder's performance > decrease. The reason is that vaRenderPicture() and vaSyncSurface() are > called at the same time (vaRenderPicture() always followed by a > vaSyncSurface()). When we encode stream with B frames, we need buffer to > reorder frames, so we can send serveral frames to HW at once to increase > performance. Now I changed them to be called in a > asynchronous way, which will make better use of hardware. > 1080p transcoding increases about 17% fps on my environment. > > Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> > --- > libavcodec/vaapi_encode.c | 41 ++++++++++++++++++++++++++++----------- > libavcodec/vaapi_encode.h | 3 +++ > 2 files changed, 33 insertions(+), 11 deletions(-) The API does not allow this behaviour. For some bizarre reason (I think a badly-written example combined with the Intel driver being synchronous in vaEndPicture() for a long time), the sync to a surface is to the /input/ surface of an encode rather than the output surface. That means you can't have multiple encodes outstanding on the same surface and expect to sync usefully, because the only argument to vaSyncSurface() is the surface to sync to without anything about the associated context. Therefore trying to make it asynchronous like this falls down when input surfaces might appear multiple times, or might be used in the input of multiple encoders, because you can't tell whether your sync means the thing you actually wanted to finish has finished. (The commit you point to above as having decreased performance fixed this bug, since it became much more visible with decoupled send/receive.) So: put this change after the switch to syncing on output buffers (since that operation does make sense for this), and leave the existing behaviour for cases where you have to sync on the input surface. - Mark
> On 27/10/2021 09:57, Wenbin Chen wrote: > > Fix: #7706. After commit 5fdcf85bbffe7451c2, vaapi encoder's performance > > decrease. The reason is that vaRenderPicture() and vaSyncSurface() are > > called at the same time (vaRenderPicture() always followed by a > > vaSyncSurface()). When we encode stream with B frames, we need buffer > to > > reorder frames, so we can send serveral frames to HW at once to increase > > performance. Now I changed them to be called in a > > asynchronous way, which will make better use of hardware. > > 1080p transcoding increases about 17% fps on my environment. > > > > Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> > > --- > > libavcodec/vaapi_encode.c | 41 ++++++++++++++++++++++++++++----------- > > libavcodec/vaapi_encode.h | 3 +++ > > 2 files changed, 33 insertions(+), 11 deletions(-) > > The API does not allow this behaviour. > > For some bizarre reason (I think a badly-written example combined with the > Intel driver being synchronous in vaEndPicture() for a long time), the sync to > a surface is to the /input/ surface of an encode rather than the output > surface. > > That means you can't have multiple encodes outstanding on the same > surface and expect to sync usefully, because the only argument to > vaSyncSurface() is the surface to sync to without anything about the > associated context. > > Therefore trying to make it asynchronous like this falls down when input > surfaces might appear multiple times, or might be used in the input of > multiple encoders, because you can't tell whether your sync means the thing > you actually wanted to finish has finished. > > (The commit you point to above as having decreased performance fixed this > bug, since it became much more visible with decoupled send/receive.) > > So: put this change after the switch to syncing on output buffers (since that > operation does make sense for this), and leave the existing behaviour for > cases where you have to sync on the input surface. 
> > - Mark Thanks for your advice. It makes sense to me. I will update the patches Best Regards Wenbin > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c index ec054ae701..5927849233 100644 --- a/libavcodec/vaapi_encode.c +++ b/libavcodec/vaapi_encode.c @@ -951,8 +951,10 @@ static int vaapi_encode_pick_next(AVCodecContext *avctx, if (!pic && ctx->end_of_stream) { --b_counter; pic = ctx->pic_end; - if (pic->encode_issued) + if (pic->encode_complete) return AVERROR_EOF; + else if (pic->encode_issued) + return AVERROR(EAGAIN); } if (!pic) { @@ -1177,20 +1179,31 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt) return AVERROR(EAGAIN); } - pic = NULL; - err = vaapi_encode_pick_next(avctx, &pic); - if (err < 0) - return err; - av_assert0(pic); + while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES * sizeof(VAAPIEncodePicture *)) { + pic = NULL; + err = vaapi_encode_pick_next(avctx, &pic); + if (err < 0) + break; + av_assert0(pic); - pic->encode_order = ctx->encode_order++; + pic->encode_order = ctx->encode_order + + (av_fifo_size(ctx->encode_fifo) / sizeof(VAAPIEncodePicture *)); - err = vaapi_encode_issue(avctx, pic); - if (err < 0) { - av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); - return err; + err = vaapi_encode_issue(avctx, pic); + if (err < 0) { + av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err); + return err; + } + + av_fifo_generic_write(ctx->encode_fifo, &pic, sizeof(pic), NULL); } + if (!av_fifo_size(ctx->encode_fifo)) + return err; + + av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL); + ctx->encode_order = pic->encode_order + 1; + err = vaapi_encode_output(avctx, pic, pkt); if (err < 0) { av_log(avctx, AV_LOG_ERROR, "Output failed: %d.\n", err); @@ -2520,6 +2533,11 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx) } } + ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) * + sizeof(VAAPIEncodePicture *)); + if (!ctx->encode_fifo) + return AVERROR(ENOMEM); + return 0; fail: @@ -2552,6 +2570,7 @@ av_cold int ff_vaapi_encode_close(AVCodecContext *avctx) 
av_freep(&ctx->codec_sequence_params); av_freep(&ctx->codec_picture_params); + av_fifo_freep(&ctx->encode_fifo); av_buffer_unref(&ctx->recon_frames_ref); av_buffer_unref(&ctx->input_frames_ref); diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h index b41604a883..89fe8de466 100644 --- a/libavcodec/vaapi_encode.h +++ b/libavcodec/vaapi_encode.h @@ -29,6 +29,7 @@ #include "libavutil/hwcontext.h" #include "libavutil/hwcontext_vaapi.h" +#include "libavutil/fifo.h" #include "avcodec.h" #include "hwconfig.h" @@ -345,6 +346,8 @@ typedef struct VAAPIEncodeContext { int roi_warned; AVFrame *frame; + + AVFifoBuffer *encode_fifo; } VAAPIEncodeContext; enum {
Fix: #7706. After commit 5fdcf85bbffe7451c2, the vaapi encoder's performance decreased. The reason is that vaRenderPicture() and vaSyncSurface() are called at the same time (vaRenderPicture() is always followed by a vaSyncSurface()). When we encode a stream with B frames, we need a buffer to reorder frames, so we can send several frames to the HW at once to increase performance. Now I have changed them to be called in an asynchronous way, which makes better use of the hardware. 1080p transcoding gains about 17% fps in my environment. Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> --- libavcodec/vaapi_encode.c | 41 ++++++++++++++++++++++++++++----------- libavcodec/vaapi_encode.h | 3 +++ 2 files changed, 33 insertions(+), 11 deletions(-)