From patchwork Mon Feb 11 22:41:04 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Shaofei Wang X-Patchwork-Id: 12040 Return-Path: X-Original-To: patchwork@ffaux-bg.ffmpeg.org Delivered-To: patchwork@ffaux-bg.ffmpeg.org Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by ffaux.localdomain (Postfix) with ESMTP id E3A534491C8 for ; Mon, 11 Feb 2019 11:42:09 +0200 (EET) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id CB07C68A571; Mon, 11 Feb 2019 11:42:09 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mga12.intel.com (mga12.intel.com [192.55.52.136]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 71C2068A55B for ; Mon, 11 Feb 2019 11:42:02 +0200 (EET) X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga004.fm.intel.com ([10.253.24.48]) by fmsmga106.fm.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 11 Feb 2019 01:41:59 -0800 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.58,358,1544515200"; d="scan'208";a="143235897" Received: from unknown (HELO localhost.localdomain.bj.intel.com) ([172.16.113.186]) by fmsmga004.fm.intel.com with ESMTP; 11 Feb 2019 01:41:58 -0800 From: Shaofei Wang To: ffmpeg-devel@ffmpeg.org Date: Mon, 11 Feb 2019 17:41:04 -0500 Message-Id: <1549924864-26466-1-git-send-email-shaofei.wang@intel.com> X-Mailer: git-send-email 1.8.3.1 Subject: [FFmpeg-devel] [PATCH v4] Improved the performance of 1 decode + N filter graphs and adaptive bitrate. X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: michael@niedermayer.cc, , george@nsup.org, Shaofei Wang MIME-Version: 1.0 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" It enabled multiple filter graph concurrency, which bring above about 4%~20% improvement in some 1:N scenarios by CPU or GPU acceleration Below are some test cases and comparison as reference. (Hardware platform: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz) (Software: Intel iHD driver - 16.9.00100, CentOS 7) For 1:N transcode by GPU acceleration with vaapi: ./ffmpeg -vaapi_device /dev/dri/renderD128 -hwaccel vaapi \ -hwaccel_output_format vaapi \ -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ -vf "scale_vaapi=1280:720" -c:v h264_vaapi -f null /dev/null \ -vf "scale_vaapi=720:480" -c:v h264_vaapi -f null /dev/null test results: 2 encoders 5 encoders 10 encoders Improved 6.1% 6.9% 5.5% For 1:N transcode by GPU acceleration with QSV: ./ffmpeg -hwaccel qsv -c:v h264_qsv \ -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ -vf "scale_qsv=1280:720:format=nv12" -c:v h264_qsv -f null /dev/null \ -vf "scale_qsv=720:480:format=nv12" -c:v h264_qsv -f null /dev/null test results: 2 encoders 5 encoders 10 encoders Improved 6% 4% 15% For Intel GPU acceleration case, 1 decode to N scaling, by QSV: ./ffmpeg -hwaccel qsv -c:v h264_qsv \ -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ -vf "scale_qsv=1280:720:format=nv12,hwdownload" -pix_fmt nv12 -f null /dev/null \ -vf "scale_qsv=720:480:format=nv12,hwdownload" -pix_fmt nv12 -f null /dev/null test results: 2 scale 5 scale 10 scale Improved 12% 21% 21% For CPU only 1 decode to N scaling: ./ffmpeg -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ -vf "scale=1280:720" -pix_fmt nv12 -f null /dev/null \ -vf "scale=720:480" -pix_fmt nv12 -f null /dev/null test results: 2 scale 5 scale 10 scale Improved 25% 107% 148% Signed-off-by: Wang, Shaofei Reviewed-by: Zhao, Jun --- fftools/ffmpeg.c | 112 +++++++++++++++++++++++++++++++++++++++++++++--- fftools/ffmpeg.h | 14 ++++++ fftools/ffmpeg_filter.c | 4 ++ 3 files changed, 124 insertions(+), 6 deletions(-) diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c index 544f1a1..67b1a2a 100644 --- a/fftools/ffmpeg.c +++ b/fftools/ffmpeg.c @@ -1419,13 +1419,18 @@ static void finish_output_stream(OutputStream *ost) * * @return 0 for success, <0 for severe errors */ -static int reap_filters(int flush) +static int reap_filters(int flush, InputFilter * ifilter) { AVFrame *filtered_frame = NULL; int i; - /* Reap all buffers present in the buffer sinks */ + /* Reap all buffers present in the buffer sinks or just reap specified + * input filter buffer */ for (i = 0; i < nb_output_streams; i++) { + if (ifilter) { + if (ifilter != output_streams[i]->filter->graph->inputs[0]) + continue; + } OutputStream *ost = output_streams[i]; OutputFile *of = output_files[ost->file_index]; AVFilterContext *filter; @@ -2179,7 +2184,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame) } } - ret = reap_filters(1); + ret = HAVE_THREADS ? reap_filters(1, ifilter) : reap_filters(1, NULL); + if (ret < 0 && ret != AVERROR_EOF) { av_log(NULL, AV_LOG_ERROR, "Error while filtering: %s\n", av_err2str(ret)); return ret; @@ -2208,6 +2214,14 @@ static int ifilter_send_eof(InputFilter *ifilter, int64_t pts) ifilter->eof = 1; +#if HAVE_THREADS + ifilter->waited_frm = NULL; + pthread_mutex_lock(&ifilter->process_mutex); + ifilter->t_end = 1; + pthread_cond_signal(&ifilter->process_cond); + pthread_mutex_unlock(&ifilter->process_mutex); + pthread_join(ifilter->f_thread, NULL); +#endif if (ifilter->filter) { ret = av_buffersrc_close(ifilter->filter, pts, AV_BUFFERSRC_FLAG_PUSH); if (ret < 0) @@ -2252,12 +2266,95 @@ static int decode(AVCodecContext *avctx, AVFrame *frame, int *got_frame, AVPacke return 0; } +#if HAVE_THREADS +static void *filter_pipeline(void *arg) +{ + InputFilter *fl = arg; + AVFrame *frm; + int ret; + while(1) { + pthread_mutex_lock(&fl->process_mutex); + while (fl->waited_frm == NULL && !fl->t_end) + pthread_cond_wait(&fl->process_cond, &fl->process_mutex); + pthread_mutex_unlock(&fl->process_mutex); + + if (fl->t_end) break; + + frm = fl->waited_frm; + ret = ifilter_send_frame(fl, frm); + if (ret < 0) { + av_log(NULL, AV_LOG_ERROR, + "Failed to inject frame into filter network: %s\n", av_err2str(ret)); + } else { + ret = reap_filters(0, fl); + } + fl->t_error = ret; + + pthread_mutex_lock(&fl->finish_mutex); + fl->waited_frm = NULL; + pthread_cond_signal(&fl->finish_cond); + pthread_mutex_unlock(&fl->finish_mutex); + + if (ret < 0) + break; + } + return fl; +} +#endif + static int send_frame_to_filters(InputStream *ist, AVFrame *decoded_frame) { int i, ret; AVFrame *f; av_assert1(ist->nb_filters > 0); /* ensure ret is initialized */ +#if HAVE_THREADS + for (i = 0; i < ist->nb_filters; i++) { + //it will use abr_pipeline mode by default + if (i < ist->nb_filters - 1) { + f = &ist->filters[i]->input_frm; + ret = av_frame_ref(f, decoded_frame); + if (ret < 0) + break; + } else + f = decoded_frame; + + if (!ist->filters[i]->b_abr_thread_init) { + if ((ret = pthread_create(&ist->filters[i]->f_thread, NULL, filter_pipeline, + ist->filters[i]))) { + av_log(NULL, AV_LOG_ERROR, + "pthread_create failed: %s. Try to increase `ulimit -v` or \ + decrease `ulimit -s`.\n", strerror(ret)); + return AVERROR(ret); + } + pthread_mutex_init(&ist->filters[i]->process_mutex, NULL); + pthread_mutex_init(&ist->filters[i]->finish_mutex, NULL); + pthread_cond_init(&ist->filters[i]->process_cond, NULL); + pthread_cond_init(&ist->filters[i]->finish_cond, NULL); + ist->filters[i]->t_end = 0; + ist->filters[i]->t_error = 0; + ist->filters[i]->b_abr_thread_init = 1; + } + + pthread_mutex_lock(&ist->filters[i]->process_mutex); + ist->filters[i]->waited_frm = f; + pthread_cond_signal(&ist->filters[i]->process_cond); + pthread_mutex_unlock(&ist->filters[i]->process_mutex); + } + + for (i = 0; i < ist->nb_filters; i++) { + pthread_mutex_lock(&ist->filters[i]->finish_mutex); + while(ist->filters[i]->waited_frm != NULL) + pthread_cond_wait(&ist->filters[i]->finish_cond, &ist->filters[i]->finish_mutex); + pthread_mutex_unlock(&ist->filters[i]->finish_mutex); + } + for (i = 0; i < ist->nb_filters; i++) { + if (ist->filters[i]->t_error < 0) { + ret = ist->filters[i]->t_error; + break; + } + } +#else for (i = 0; i < ist->nb_filters; i++) { if (i < ist->nb_filters - 1) { f = ist->filter_frame; @@ -2266,6 +2363,7 @@ static int send_frame_to_filters(InputStream *ist, AVFrame *decoded_frame) break; } else f = decoded_frame; + ret = ifilter_send_frame(ist->filters[i], f); if (ret == AVERROR_EOF) ret = 0; /* ignore */ @@ -2275,6 +2373,8 @@ static int send_frame_to_filters(InputStream *ist, AVFrame *decoded_frame) break; } } +#endif + return ret; } @@ -4537,10 +4637,10 @@ static int transcode_from_filter(FilterGraph *graph, InputStream **best_ist) *best_ist = NULL; ret = avfilter_graph_request_oldest(graph->graph); if (ret >= 0) - return reap_filters(0); + return reap_filters(0, NULL); if (ret == AVERROR_EOF) { - ret = reap_filters(1); + ret = reap_filters(1, NULL); for (i = 0; i < graph->nb_outputs; i++) close_output_stream(graph->outputs[i]->ost); return ret; @@ -4642,7 +4742,7 @@ static int transcode_step(void) if (ret < 0) return ret == AVERROR_EOF ? 0 : ret; - return reap_filters(0); + return HAVE_THREADS ? ret : reap_filters(0, NULL); } /* diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h index eb1eaf6..9a8e776 100644 --- a/fftools/ffmpeg.h +++ b/fftools/ffmpeg.h @@ -253,6 +253,20 @@ typedef struct InputFilter { AVBufferRef *hw_frames_ctx; + // for abr pipeline + int b_abr_thread_init; +#if HAVE_THREADS + AVFrame *waited_frm; + AVFrame input_frm; + pthread_t f_thread; + pthread_cond_t process_cond; + pthread_cond_t finish_cond; + pthread_mutex_t process_mutex; + pthread_mutex_t finish_mutex; + int t_end; + int t_error; +#endif + int eof; } InputFilter; diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c index 6518d50..5d1e521 100644 --- a/fftools/ffmpeg_filter.c +++ b/fftools/ffmpeg_filter.c @@ -197,6 +197,7 @@ DEF_CHOOSE_FORMAT(channel_layouts, uint64_t, channel_layout, channel_layouts, 0, int init_simple_filtergraph(InputStream *ist, OutputStream *ost) { FilterGraph *fg = av_mallocz(sizeof(*fg)); + int i; if (!fg) exit_program(1); @@ -225,6 +226,9 @@ int init_simple_filtergraph(InputStream *ist, OutputStream *ost) GROW_ARRAY(ist->filters, ist->nb_filters); ist->filters[ist->nb_filters - 1] = fg->inputs[0]; + for (i = 0; i < ist->nb_filters; i++) + ist->filters[i]->b_abr_thread_init = 0; + GROW_ARRAY(filtergraphs, nb_filtergraphs); filtergraphs[nb_filtergraphs - 1] = fg;