diff mbox

[FFmpeg-devel,v3] Improved the performance of 1 decode + N filter graphs and adaptive bitrate.

Message ID 1547673427-16566-1-git-send-email-shaofei.wang@intel.com
State Superseded
Headers show

Commit Message

Shaofei Wang Jan. 16, 2019, 9:17 p.m. UTC
Add a new option "-abr_pipeline".
It enables multiple filter graphs to run concurrently, which brings roughly a
4%~20% improvement in some 1:N scenarios with CPU or GPU acceleration.

Below are some test cases and comparison as reference.
(Hardware platform: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz)
(Software: Intel iHD driver - 16.9.00100, CentOS 7)

For 1:N transcode by GPU acceleration with vaapi:
./ffmpeg -vaapi_device /dev/dri/renderD128 -hwaccel vaapi \
    -hwaccel_output_format vaapi \
    -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
    -vf "scale_vaapi=1280:720" -c:v h264_vaapi -f null /dev/null \
    -vf "scale_vaapi=720:480" -c:v h264_vaapi -f null /dev/null \
    -abr_pipeline

    test results:
                2 encoders 5 encoders 10 encoders
    Improved       6.1%    6.9%       5.5%

For 1:N transcode by GPU acceleration with QSV:
./ffmpeg -hwaccel qsv -c:v h264_qsv \
    -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
    -vf "scale_qsv=1280:720:format=nv12" -c:v h264_qsv -f null /dev/null \
    -vf "scale_qsv=720:480:format=nv12" -c:v h264_qsv -f null /dev/null

    test results:
                2 encoders  5 encoders 10 encoders
    Improved       6%       4%         15%

For Intel GPU acceleration case, 1 decode to N scaling, by QSV:
./ffmpeg -hwaccel qsv -c:v h264_qsv \
    -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
    -vf "scale_qsv=1280:720:format=nv12,hwdownload" -pix_fmt nv12 -f null /dev/null \
    -vf "scale_qsv=720:480:format=nv12,hwdownload" -pix_fmt nv12 -f null /dev/null

    test results:
                2 scale  5 scale   10 scale
    Improved       12%     21%        21%

For CPU only 1 decode to N scaling:
./ffmpeg -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
    -vf "scale=1280:720" -pix_fmt nv12 -f null /dev/null \
    -vf "scale=720:480" -pix_fmt nv12 -f null /dev/null \
    -abr_pipeline

    test results:
                2 scale  5 scale   10 scale
    Improved       25%    107%       148%

Signed-off-by: Wang, Shaofei <shaofei.wang@intel.com>
Reviewed-by: Zhao, Jun <jun.zhao@intel.com>
---
 fftools/ffmpeg.c        | 228 ++++++++++++++++++++++++++++++++++++++++++++----
 fftools/ffmpeg.h        |  15 ++++
 fftools/ffmpeg_filter.c |   4 +
 fftools/ffmpeg_opt.c    |   6 +-
 4 files changed, 237 insertions(+), 16 deletions(-)

Comments

Guo, Yejun Jan. 17, 2019, 1:24 a.m. UTC | #1
> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf

> Of Shaofei Wang

> Sent: Thursday, January 17, 2019 5:17 AM

> To: ffmpeg-devel@ffmpeg.org

> Cc: michael@niedermayer.cc; atomnuker@gmail.com; cus@passwd.hu;

> Wang, Shaofei <shaofei.wang@intel.com>; ceffmpeg@gmail.com

> Subject: [FFmpeg-devel] [PATCH v3] Improved the performance of 1 decode

> + N filter graphs and adaptive bitrate.

> 

> With new option "-abr_pipeline"

> It enabled multiple filter graph concurrency, which bring obove about

> 4%~20% improvement in some 1:N scenarios by CPU or GPU acceleration

> 

> Below are some test cases and comparison as reference.

> (Hardware platform: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz)

> (Software: Intel iHD driver - 16.9.00100, CentOS 7)

> 

> For 1:N transcode by GPU acceleration with vaapi:

> ./ffmpeg -vaapi_device /dev/dri/renderD128 -hwaccel vaapi \

>     -hwaccel_output_format vaapi \

>     -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \

>     -vf "scale_vaapi=1280:720" -c:v h264_vaapi -f null /dev/null \

>     -vf "scale_vaapi=720:480" -c:v h264_vaapi -f null /dev/null \

>     -abr_pipeline

> 

>     test results:

>                 2 encoders 5 encoders 10 encoders

>     Improved       6.1%    6.9%       5.5%

> 

> For 1:N transcode by GPU acceleration with QSV:

> ./ffmpeg -hwaccel qsv -c:v h264_qsv \

>     -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \

>     -vf "scale_qsv=1280:720:format=nv12" -c:v h264_qsv -f null /dev/null \

>     -vf "scale_qsv=720:480:format=nv12" -c:v h264_qsv -f null /dev/null

> 

>     test results:

>                 2 encoders  5 encoders 10 encoders

>     Improved       6%       4%         15%

> 

> For Intel GPU acceleration case, 1 decode to N scaling, by QSV:

> ./ffmpeg -hwaccel qsv -c:v h264_qsv \

>     -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \

>     -vf "scale_qsv=1280:720:format=nv12,hwdownload" -pix_fmt nv12 -f null

> /dev/null \

>     -vf "scale_qsv=720:480:format=nv12,hwdownload" -pix_fmt nv12 -f null

> /dev/null

> 

>     test results:

>                 2 scale  5 scale   10 scale

>     Improved       12%     21%        21%

> 

> For CPU only 1 decode to N scaling:

> ./ffmpeg -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \

>     -vf "scale=1280:720" -pix_fmt nv12 -f null /dev/null \

>     -vf "scale=720:480" -pix_fmt nv12 -f null /dev/null \

>     -abr_pipeline

> 

>     test results:

>                 2 scale  5 scale   10 scale

>     Improved       25%    107%       148%

> 

> Signed-off-by: Wang, Shaofei <shaofei.wang@intel.com>

> Reviewed-by: Zhao, Jun <jun.zhao@intel.com>

> ---

>  fftools/ffmpeg.c        | 228

> ++++++++++++++++++++++++++++++++++++++++++++----

>  fftools/ffmpeg.h        |  15 ++++

>  fftools/ffmpeg_filter.c |   4 +

>  fftools/ffmpeg_opt.c    |   6 +-

>  4 files changed, 237 insertions(+), 16 deletions(-)

> 

> diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c

> index 544f1a1..7dbff15 100644

> --- a/fftools/ffmpeg.c

> +++ b/fftools/ffmpeg.c

> @@ -1523,6 +1523,109 @@ static int reap_filters(int flush)

>      return 0;

>  }

> 

> +static int pipeline_reap_filters(int flush, InputFilter * ifilter)

> +{

> +    AVFrame *filtered_frame = NULL;

> +    int i;

> +

> +    for (i = 0; i < nb_output_streams; i++) {

> +        if (ifilter == output_streams[i]->filter->graph->inputs[0]) break;

> +    }

> +    OutputStream *ost = output_streams[i];

> +    OutputFile    *of = output_files[ost->file_index];

> +    AVFilterContext *filter;

> +    AVCodecContext *enc = ost->enc_ctx;

> +    int ret = 0;

> +

> +    if (!ost->filter || !ost->filter->graph->graph)

> +        return 0;

> +    filter = ost->filter->filter;

> +

> +    if (!ost->initialized) {

> +        char error[1024] = "";

> +        ret = init_output_stream(ost, error, sizeof(error));

> +        if (ret < 0) {

> +            av_log(NULL, AV_LOG_ERROR, "Error initializing output stream %d:%d

> -- %s\n",

> +                   ost->file_index, ost->index, error);

> +            exit_program(1);


imo, it's not good to exit the program.

> +        }

> +    }

> +

> +    if (!ost->filtered_frame && !(ost->filtered_frame = av_frame_alloc()))

> +        return AVERROR(ENOMEM);

> +    filtered_frame = ost->filtered_frame;

> +

> +    while (1) {

> +        double float_pts = AV_NOPTS_VALUE; // this is identical to

> filtered_frame.pts but with higher precision

> +        ret = av_buffersink_get_frame_flags(filter, filtered_frame,

> +                                           AV_BUFFERSINK_FLAG_NO_REQUEST);

> +        if (ret < 0) {

> +            if (ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) {

> +                av_log(NULL, AV_LOG_WARNING,

> +                       "Error in av_buffersink_get_frame_flags(): %s\n",

> av_err2str(ret));

> +            } else if (flush && ret == AVERROR_EOF) {

> +                if (av_buffersink_get_type(filter) == AVMEDIA_TYPE_VIDEO)

> +                    do_video_out(of, ost, NULL, AV_NOPTS_VALUE);

> +            }

> +            break;

> +        }

> +        if (ost->finished) {

> +            av_frame_unref(filtered_frame);

> +            continue;

> +        }

> +        if (filtered_frame->pts != AV_NOPTS_VALUE) {

> +            int64_t start_time = (of->start_time == AV_NOPTS_VALUE) ? 0 : of-

> >start_time;

> +            AVRational filter_tb = av_buffersink_get_time_base(filter);

> +            AVRational tb = enc->time_base;

> +            int extra_bits = av_clip(29 - av_log2(tb.den), 0, 16);

> +

> +            tb.den <<= extra_bits;

> +            float_pts =

> +                av_rescale_q(filtered_frame->pts, filter_tb, tb) -

> +                av_rescale_q(start_time, AV_TIME_BASE_Q, tb);

> +            float_pts /= 1 << extra_bits;

> +            // avoid exact midoints to reduce the chance of rounding differences,

> this can be removed in case the fps code is changed to work with integers

> +            float_pts += FFSIGN(float_pts) * 1.0 / (1<<17);

> +

> +            filtered_frame->pts =

> +                av_rescale_q(filtered_frame->pts, filter_tb, enc->time_base) -

> +                av_rescale_q(start_time, AV_TIME_BASE_Q, enc->time_base);

> +        }

> +

> +        switch (av_buffersink_get_type(filter)) {

> +        case AVMEDIA_TYPE_VIDEO:

> +            if (!ost->frame_aspect_ratio.num)

> +                enc->sample_aspect_ratio = filtered_frame->sample_aspect_ratio;

> +

> +            if (debug_ts) {

> +                av_log(NULL, AV_LOG_INFO, "filter -> pts:%s pts_time:%s exact:%f

> time_base:%d/%d\n",

> +                        av_ts2str(filtered_frame->pts), av_ts2timestr(filtered_frame-

> >pts, &enc->time_base),

> +                        float_pts,

> +                        enc->time_base.num, enc->time_base.den);

> +            }

> +

> +            do_video_out(of, ost, filtered_frame, float_pts);

> +            break;

> +        case AVMEDIA_TYPE_AUDIO:

> +            if (!(enc->codec->capabilities & AV_CODEC_CAP_PARAM_CHANGE)

> &&

> +                enc->channels != filtered_frame->channels) {

> +                av_log(NULL, AV_LOG_ERROR,

> +                       "Audio filter graph output is not normalized and encoder does

> not support parameter changes\n");

> +                break;

> +            }

> +            do_audio_out(of, ost, filtered_frame);

> +            break;

> +        default:

> +            // TODO support subtitle filters

> +            av_assert0(0);


maybe better to return AVERROR_PATCHWELCOME?

> +        }

> +

> +        av_frame_unref(filtered_frame);

> +    }

> +

> +    return 0;

> +}

> +

>  static void print_final_stats(int64_t total_size)

>  {

>      uint64_t video_size = 0, audio_size = 0, extra_size = 0, other_size = 0;

> @@ -2179,7 +2282,8 @@ static int ifilter_send_frame(InputFilter *ifilter,

> AVFrame *frame)

>              }

>          }

> 

> -        ret = reap_filters(1);

> +        ret = abr_pipeline ? pipeline_reap_filters(1, ifilter) : reap_filters(1);

> +

>          if (ret < 0 && ret != AVERROR_EOF) {

>              av_log(NULL, AV_LOG_ERROR, "Error while filtering: %s\n",

> av_err2str(ret));

>              return ret;

> @@ -2208,6 +2312,16 @@ static int ifilter_send_eof(InputFilter *ifilter,

> int64_t pts)

> 

>      ifilter->eof = 1;

> 

> +#if HAVE_THREADS

> +    if (abr_pipeline) {

> +        ifilter->waited_frm = NULL;

> +        pthread_mutex_lock(&ifilter->process_mutex);

> +        ifilter->t_end = 1;

> +        pthread_cond_signal(&ifilter->process_cond);

> +        pthread_mutex_unlock(&ifilter->process_mutex);

> +        pthread_join(ifilter->f_thread, NULL);

> +    }

> +#endif

>      if (ifilter->filter) {

>          ret = av_buffersrc_close(ifilter->filter, pts,

> AV_BUFFERSRC_FLAG_PUSH);

>          if (ret < 0)

> @@ -2252,6 +2366,41 @@ static int decode(AVCodecContext *avctx,

> AVFrame *frame, int *got_frame, AVPacke

>      return 0;

>  }

> 

> +#if HAVE_THREADS

> +static void *filter_pipeline(void *arg)

> +{

> +    InputFilter *fl = arg;

> +    AVFrame *frm;

> +    int ret;

> +    while(1) {

> +        pthread_mutex_lock(&fl->process_mutex);

> +        while (fl->waited_frm == NULL && !fl->t_end)

> +            pthread_cond_wait(&fl->process_cond, &fl->process_mutex);

> +        pthread_mutex_unlock(&fl->process_mutex);

> +

> +        if (fl->t_end) break;

> +

> +        frm = fl->waited_frm;

> +        ret = ifilter_send_frame(fl, frm);

> +        if (ret < 0) {

> +            av_log(NULL, AV_LOG_ERROR,

> +                   "Failed to inject frame into filter network: %s\n", av_err2str(ret));

> +        } else {

> +            ret = pipeline_reap_filters(0, fl);

> +        }

> +        fl->t_error = ret;

> +

> +        pthread_mutex_lock(&fl->finish_mutex);

> +        fl->waited_frm = NULL;

> +        pthread_cond_signal(&fl->finish_cond);

> +        pthread_mutex_unlock(&fl->finish_mutex);

> +

> +        if (ret < 0)

> +            break;

> +    }

> +    return fl;

> +}

> +#endif

>  static int send_frame_to_filters(InputStream *ist, AVFrame

> *decoded_frame)

>  {

>      int i, ret;

> @@ -2259,22 +2408,71 @@ static int send_frame_to_filters(InputStream *ist,

> AVFrame *decoded_frame)

> 

>      av_assert1(ist->nb_filters > 0); /* ensure ret is initialized */

>      for (i = 0; i < ist->nb_filters; i++) {

> -        if (i < ist->nb_filters - 1) {

> -            f = ist->filter_frame;

> -            ret = av_frame_ref(f, decoded_frame);

> -            if (ret < 0)

> +        if (!abr_pipeline) {

> +            if (i < ist->nb_filters - 1) {

> +                f = ist->filter_frame;

> +                ret = av_frame_ref(f, decoded_frame);

> +                if (ret < 0)

> +                    break;

> +            } else

> +                f = decoded_frame;

> +

> +                ret = ifilter_send_frame(ist->filters[i], f);

> +                if (ret == AVERROR_EOF)

> +                    ret = 0; /* ignore */

> +                if (ret < 0) {

> +                    av_log(NULL, AV_LOG_ERROR,

> +                           "Failed to inject frame into filter network: %s\n",

> av_err2str(ret));

> +                    break;

> +                }

> +        } else {

> +#if HAVE_THREADS

> +            if (i < ist->nb_filters - 1) {

> +                f = &ist->filters[i]->input_frm;

> +                ret = av_frame_ref(f, decoded_frame);

> +                if (ret < 0)

> +                    break;

> +            } else

> +                f = decoded_frame;

> +

> +            if (!ist->filters[i]->b_abr_thread_init) {

> +                if ((ret = pthread_create(&ist->filters[i]->f_thread, NULL,

> filter_pipeline, ist->filters[i]))) {

> +                    av_log(NULL, AV_LOG_ERROR, "pthread_create failed: %s. Try to

> increase `ulimit -v` or decrease `ulimit -s`.\n", strerror(ret));

> +                    return AVERROR(ret);

> +                }

> +                pthread_mutex_init(&ist->filters[i]->process_mutex, NULL);

> +                pthread_mutex_init(&ist->filters[i]->finish_mutex, NULL);

> +                pthread_cond_init(&ist->filters[i]->process_cond, NULL);

> +                pthread_cond_init(&ist->filters[i]->finish_cond, NULL);

> +                ist->filters[i]->t_end = 0;

> +                ist->filters[i]->t_error = 0;

> +                ist->filters[i]->b_abr_thread_init = 1;

> +            }

> +

> +            pthread_mutex_lock(&ist->filters[i]->process_mutex);

> +            ist->filters[i]->waited_frm = f;

> +            pthread_cond_signal(&ist->filters[i]->process_cond);

> +            pthread_mutex_unlock(&ist->filters[i]->process_mutex);

> +#endif

> +        }

> +    }

> +#if HAVE_THREADS

> +    if (abr_pipeline) {

> +        for (i = 0; i < ist->nb_filters; i++) {

> +            pthread_mutex_lock(&ist->filters[i]->finish_mutex);

> +            while(ist->filters[i]->waited_frm != NULL)

> +                pthread_cond_wait(&ist->filters[i]->finish_cond, &ist->filters[i]-

> >finish_mutex);

> +            pthread_mutex_unlock(&ist->filters[i]->finish_mutex);

> +        }

> +        for (i = 0; i < ist->nb_filters; i++) {

> +            if (ist->filters[i]->t_error < 0) {

> +                ret = ist->filters[i]->t_error;

>                  break;

> -        } else

> -            f = decoded_frame;

> -        ret = ifilter_send_frame(ist->filters[i], f);

> -        if (ret == AVERROR_EOF)

> -            ret = 0; /* ignore */

> -        if (ret < 0) {

> -            av_log(NULL, AV_LOG_ERROR,

> -                   "Failed to inject frame into filter network: %s\n", av_err2str(ret));

> -            break;

> +            }

>          }

>      }

> +#endif

> +

>      return ret;

>  }

> 

> @@ -4642,7 +4840,7 @@ static int transcode_step(void)

>      if (ret < 0)

>          return ret == AVERROR_EOF ? 0 : ret;

> 

> -    return reap_filters(0);

> +    return abr_pipeline ? 0 : reap_filters(0);

>  }

> 

>  /*

> diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h

> index eb1eaf6..110306a 100644

> --- a/fftools/ffmpeg.h

> +++ b/fftools/ffmpeg.h

> @@ -253,6 +253,20 @@ typedef struct InputFilter {

> 

>      AVBufferRef *hw_frames_ctx;

> 

> +    // for abr pipeline

> +    int b_abr_thread_init;

> +#if HAVE_THREADS

> +    AVFrame *waited_frm;

> +    AVFrame input_frm;

> +    pthread_t f_thread;

> +    pthread_cond_t process_cond;

> +    pthread_cond_t finish_cond;

> +    pthread_mutex_t process_mutex;

> +    pthread_mutex_t finish_mutex;

> +    int t_end;

> +    int t_error;

> +#endif

> +

>      int eof;

>  } InputFilter;

> 

> @@ -606,6 +620,7 @@ extern int frame_bits_per_raw_sample;

>  extern AVIOContext *progress_avio;

>  extern float max_error_rate;

>  extern char *videotoolbox_pixfmt;

> +extern int abr_pipeline;

> 

>  extern int filter_nbthreads;

>  extern int filter_complex_nbthreads;

> diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c

> index 6518d50..8f14fbc 100644

> --- a/fftools/ffmpeg_filter.c

> +++ b/fftools/ffmpeg_filter.c

> @@ -197,6 +197,7 @@ DEF_CHOOSE_FORMAT(channel_layouts, uint64_t,

> channel_layout, channel_layouts, 0,

>  int init_simple_filtergraph(InputStream *ist, OutputStream *ost)

>  {

>      FilterGraph *fg = av_mallocz(sizeof(*fg));

> +    int i;

> 

>      if (!fg)

>          exit_program(1);

> @@ -225,6 +226,9 @@ int init_simple_filtergraph(InputStream *ist,

> OutputStream *ost)

>      GROW_ARRAY(ist->filters, ist->nb_filters);

>      ist->filters[ist->nb_filters - 1] = fg->inputs[0];

> 

> +    if (abr_pipeline)

> +        for (i = 0; i < ist->nb_filters; i++)

> +            ist->filters[i]->b_abr_thread_init = 0;

>      GROW_ARRAY(filtergraphs, nb_filtergraphs);

>      filtergraphs[nb_filtergraphs - 1] = fg;

> 

> diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c

> index d4851a2..fa5a556 100644

> --- a/fftools/ffmpeg_opt.c

> +++ b/fftools/ffmpeg_opt.c

> @@ -110,6 +110,7 @@ float max_error_rate  = 2.0/3;

>  int filter_nbthreads = 0;

>  int filter_complex_nbthreads = 0;

>  int vstats_version = 2;

> +int abr_pipeline      = 0;

> 

> 

>  static int intra_only         = 0;

> @@ -3502,7 +3503,10 @@ const OptionDef options[] = {

>          "set the maximum number of queued packets from the demuxer" },

>      { "find_stream_info", OPT_BOOL | OPT_PERFILE | OPT_INPUT |

> OPT_EXPERT, { &find_stream_info },

>          "read and decode the streams to fill missing information with

> heuristics" },

> -

> +#if HAVE_THREADS

> +    { "abr_pipeline",    OPT_BOOL,                                    { &abr_pipeline },

> +        "adaptive bitrate pipeline (1 decode to N filter graphs, and 1 to N

> transcode" },

> +#endif

>      /* video options */

>      { "vframes",      OPT_VIDEO | HAS_ARG  | OPT_PERFILE | OPT_OUTPUT,

> { .func_arg = opt_video_frames },

>          "set the number of video frames to output", "number" },

> --

> 1.8.3.1

> 

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Shaofei Wang Jan. 17, 2019, 7:35 a.m. UTC | #2
> From: Guo, Yejun

> Sent: Thursday, January 17, 2019 9:25 AM

> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>

> Cc: michael@niedermayer.cc; atomnuker@gmail.com; cus@passwd.hu;

> Wang, Shaofei <shaofei.wang@intel.com>; ceffmpeg@gmail.com

> Subject: RE: [FFmpeg-devel] [PATCH v3] Improved the performance of 1

> decode + N filter graphs and adaptive bitrate.

> 

> 

> > +static int pipeline_reap_filters(int flush, InputFilter * ifilter) {

> > +    AVFrame *filtered_frame = NULL;

> > +    int i;

> > +

> > +    for (i = 0; i < nb_output_streams; i++) {

> > +        if (ifilter == output_streams[i]->filter->graph->inputs[0]) break;

> > +    }

> > +    OutputStream *ost = output_streams[i];

> > +    OutputFile    *of = output_files[ost->file_index];

> > +    AVFilterContext *filter;

> > +    AVCodecContext *enc = ost->enc_ctx;

> > +    int ret = 0;

> > +

> > +    if (!ost->filter || !ost->filter->graph->graph)

> > +        return 0;

> > +    filter = ost->filter->filter;

> > +

> > +    if (!ost->initialized) {

> > +        char error[1024] = "";

> > +        ret = init_output_stream(ost, error, sizeof(error));

> > +        if (ret < 0) {

> > +            av_log(NULL, AV_LOG_ERROR, "Error initializing output

> > + stream %d:%d

> > -- %s\n",

> > +                   ost->file_index, ost->index, error);

> > +            exit_program(1);

> 

> imo, it's not good to exit the program.

Any reason? These lines are similar to the ones in reap_filters() (line 1445).

> > +        }

> > +    }

> > +

> > +    if (!ost->filtered_frame && !(ost->filtered_frame = av_frame_alloc()))

> > +        return AVERROR(ENOMEM);

> > +    filtered_frame = ost->filtered_frame;

> > +

> > +    while (1) {

> > +        double float_pts = AV_NOPTS_VALUE; // this is identical to

> > filtered_frame.pts but with higher precision

> > +        ret = av_buffersink_get_frame_flags(filter, filtered_frame,

> > +

> AV_BUFFERSINK_FLAG_NO_REQUEST);

> > +        if (ret < 0) {

> > +            if (ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) {

> > +                av_log(NULL, AV_LOG_WARNING,

> > +                       "Error in av_buffersink_get_frame_flags():

> > + %s\n",

> > av_err2str(ret));

> > +            } else if (flush && ret == AVERROR_EOF) {

> > +                if (av_buffersink_get_type(filter) ==

> AVMEDIA_TYPE_VIDEO)

> > +                    do_video_out(of, ost, NULL, AV_NOPTS_VALUE);

> > +            }

> > +            break;

> > +        }

> > +        if (ost->finished) {

> > +            av_frame_unref(filtered_frame);

> > +            continue;

> > +        }

> > +        if (filtered_frame->pts != AV_NOPTS_VALUE) {

> > +            int64_t start_time = (of->start_time == AV_NOPTS_VALUE) ?

> > + 0 : of-

> > >start_time;

> > +            AVRational filter_tb = av_buffersink_get_time_base(filter);

> > +            AVRational tb = enc->time_base;

> > +            int extra_bits = av_clip(29 - av_log2(tb.den), 0, 16);

> > +

> > +            tb.den <<= extra_bits;

> > +            float_pts =

> > +                av_rescale_q(filtered_frame->pts, filter_tb, tb) -

> > +                av_rescale_q(start_time, AV_TIME_BASE_Q, tb);

> > +            float_pts /= 1 << extra_bits;

> > +            // avoid exact midoints to reduce the chance of rounding

> > + differences,

> > this can be removed in case the fps code is changed to work with

> > integers

> > +            float_pts += FFSIGN(float_pts) * 1.0 / (1<<17);

> > +

> > +            filtered_frame->pts =

> > +                av_rescale_q(filtered_frame->pts, filter_tb,

> enc->time_base) -

> > +                av_rescale_q(start_time, AV_TIME_BASE_Q,

> enc->time_base);

> > +        }

> > +

> > +        switch (av_buffersink_get_type(filter)) {

> > +        case AVMEDIA_TYPE_VIDEO:

> > +            if (!ost->frame_aspect_ratio.num)

> > +                enc->sample_aspect_ratio =

> > + filtered_frame->sample_aspect_ratio;

> > +

> > +            if (debug_ts) {

> > +                av_log(NULL, AV_LOG_INFO, "filter -> pts:%s

> > + pts_time:%s exact:%f

> > time_base:%d/%d\n",

> > +                        av_ts2str(filtered_frame->pts),

> > + av_ts2timestr(filtered_frame-

> > >pts, &enc->time_base),

> > +                        float_pts,

> > +                        enc->time_base.num, enc->time_base.den);

> > +            }

> > +

> > +            do_video_out(of, ost, filtered_frame, float_pts);

> > +            break;

> > +        case AVMEDIA_TYPE_AUDIO:

> > +            if (!(enc->codec->capabilities &

> > + AV_CODEC_CAP_PARAM_CHANGE)

> > &&

> > +                enc->channels != filtered_frame->channels) {

> > +                av_log(NULL, AV_LOG_ERROR,

> > +                       "Audio filter graph output is not normalized

> > + and encoder does

> > not support parameter changes\n");

> > +                break;

> > +            }

> > +            do_audio_out(of, ost, filtered_frame);

> > +            break;

> > +        default:

> > +            // TODO support subtitle filters

> > +            av_assert0(0);

> 

> maybe better to return AVERROR_PATCHWELCOME?

Is it expected to terminate the program here? This is also similar to the existing code in reap_filters().
Guo, Yejun Jan. 17, 2019, 7:57 a.m. UTC | #3
> -----Original Message-----

> From: Wang, Shaofei

> Sent: Thursday, January 17, 2019 3:36 PM

> To: Guo, Yejun <yejun.guo@intel.com>; FFmpeg development discussions

> and patches <ffmpeg-devel@ffmpeg.org>

> Cc: michael@niedermayer.cc; atomnuker@gmail.com; cus@passwd.hu;

> ceffmpeg@gmail.com

> Subject: RE: [FFmpeg-devel] [PATCH v3] Improved the performance of 1

> decode + N filter graphs and adaptive bitrate.

> 

> > From: Guo, Yejun

> > Sent: Thursday, January 17, 2019 9:25 AM

> > To: FFmpeg development discussions and patches

> > <ffmpeg-devel@ffmpeg.org>

> > Cc: michael@niedermayer.cc; atomnuker@gmail.com; cus@passwd.hu;

> Wang,

> > Shaofei <shaofei.wang@intel.com>; ceffmpeg@gmail.com

> > Subject: RE: [FFmpeg-devel] [PATCH v3] Improved the performance of 1

> > decode + N filter graphs and adaptive bitrate.

> >

> >

> > > +static int pipeline_reap_filters(int flush, InputFilter * ifilter) {

> > > +    AVFrame *filtered_frame = NULL;

> > > +    int i;

> > > +

> > > +    for (i = 0; i < nb_output_streams; i++) {

> > > +        if (ifilter == output_streams[i]->filter->graph->inputs[0]) break;

> > > +    }

> > > +    OutputStream *ost = output_streams[i];

> > > +    OutputFile    *of = output_files[ost->file_index];

> > > +    AVFilterContext *filter;

> > > +    AVCodecContext *enc = ost->enc_ctx;

> > > +    int ret = 0;

> > > +

> > > +    if (!ost->filter || !ost->filter->graph->graph)

> > > +        return 0;

> > > +    filter = ost->filter->filter;

> > > +

> > > +    if (!ost->initialized) {

> > > +        char error[1024] = "";

> > > +        ret = init_output_stream(ost, error, sizeof(error));

> > > +        if (ret < 0) {

> > > +            av_log(NULL, AV_LOG_ERROR, "Error initializing output

> > > + stream %d:%d

> > > -- %s\n",

> > > +                   ost->file_index, ost->index, error);

> > > +            exit_program(1);

> >

> > imo, it's not good to exit the program.

> Any reason? These lines are similar as them in reap_filters(). Line 1445.


I'm just wondering, in general, whether the program should exit in the middle of processing, given that the function does have a return value for reporting an error code.

> 

> > > +        }

> > > +    }

> > > +

> > > +    if (!ost->filtered_frame && !(ost->filtered_frame = av_frame_alloc()))

> > > +        return AVERROR(ENOMEM);

> > > +    filtered_frame = ost->filtered_frame;

> > > +

> > > +    while (1) {

> > > +        double float_pts = AV_NOPTS_VALUE; // this is identical to

> > > filtered_frame.pts but with higher precision

> > > +        ret = av_buffersink_get_frame_flags(filter, filtered_frame,

> > > +

> > AV_BUFFERSINK_FLAG_NO_REQUEST);

> > > +        if (ret < 0) {

> > > +            if (ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) {

> > > +                av_log(NULL, AV_LOG_WARNING,

> > > +                       "Error in av_buffersink_get_frame_flags():

> > > + %s\n",

> > > av_err2str(ret));

> > > +            } else if (flush && ret == AVERROR_EOF) {

> > > +                if (av_buffersink_get_type(filter) ==

> > AVMEDIA_TYPE_VIDEO)

> > > +                    do_video_out(of, ost, NULL, AV_NOPTS_VALUE);

> > > +            }

> > > +            break;

> > > +        }

> > > +        if (ost->finished) {

> > > +            av_frame_unref(filtered_frame);

> > > +            continue;

> > > +        }

> > > +        if (filtered_frame->pts != AV_NOPTS_VALUE) {

> > > +            int64_t start_time = (of->start_time == AV_NOPTS_VALUE) ?

> > > + 0 : of-

> > > >start_time;

> > > +            AVRational filter_tb = av_buffersink_get_time_base(filter);

> > > +            AVRational tb = enc->time_base;

> > > +            int extra_bits = av_clip(29 - av_log2(tb.den), 0, 16);

> > > +

> > > +            tb.den <<= extra_bits;

> > > +            float_pts =

> > > +                av_rescale_q(filtered_frame->pts, filter_tb, tb) -

> > > +                av_rescale_q(start_time, AV_TIME_BASE_Q, tb);

> > > +            float_pts /= 1 << extra_bits;

> > > +            // avoid exact midoints to reduce the chance of

> > > + rounding differences,

> > > this can be removed in case the fps code is changed to work with

> > > integers

> > > +            float_pts += FFSIGN(float_pts) * 1.0 / (1<<17);

> > > +

> > > +            filtered_frame->pts =

> > > +                av_rescale_q(filtered_frame->pts, filter_tb,

> > enc->time_base) -

> > > +                av_rescale_q(start_time, AV_TIME_BASE_Q,

> > enc->time_base);

> > > +        }

> > > +

> > > +        switch (av_buffersink_get_type(filter)) {

> > > +        case AVMEDIA_TYPE_VIDEO:

> > > +            if (!ost->frame_aspect_ratio.num)

> > > +                enc->sample_aspect_ratio =

> > > + filtered_frame->sample_aspect_ratio;

> > > +

> > > +            if (debug_ts) {

> > > +                av_log(NULL, AV_LOG_INFO, "filter -> pts:%s

> > > + pts_time:%s exact:%f

> > > time_base:%d/%d\n",

> > > +                        av_ts2str(filtered_frame->pts),

> > > + av_ts2timestr(filtered_frame-

> > > >pts, &enc->time_base),

> > > +                        float_pts,

> > > +                        enc->time_base.num, enc->time_base.den);

> > > +            }

> > > +

> > > +            do_video_out(of, ost, filtered_frame, float_pts);

> > > +            break;

> > > +        case AVMEDIA_TYPE_AUDIO:

> > > +            if (!(enc->codec->capabilities &

> > > + AV_CODEC_CAP_PARAM_CHANGE)

> > > &&

> > > +                enc->channels != filtered_frame->channels) {

> > > +                av_log(NULL, AV_LOG_ERROR,

> > > +                       "Audio filter graph output is not normalized

> > > + and encoder does

> > > not support parameter changes\n");

> > > +                break;

> > > +            }

> > > +            do_audio_out(of, ost, filtered_frame);

> > > +            break;

> > > +        default:

> > > +            // TODO support subtitle filters

> > > +            av_assert0(0);

> >

> > maybe better to return AVERROR_PATCHWELCOME?

> Is it expected to terminate the program here? It also the similar as previous

> code in reap_filters()


imho, the code could be improved by adding more description here, instead of a bare assert(0).


anyway, just wait for maintainer's comment.

>
Michael Niedermayer Jan. 17, 2019, 12:30 p.m. UTC | #4
On Wed, Jan 16, 2019 at 04:17:07PM -0500, Shaofei Wang wrote:
> With new option "-abr_pipeline"
> It enabled multiple filter graph concurrency, which bring obove about
> 4%~20% improvement in some 1:N scenarios by CPU or GPU acceleration
> 
> Below are some test cases and comparison as reference.
> (Hardware platform: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz)
> (Software: Intel iHD driver - 16.9.00100, CentOS 7)
> 
> For 1:N transcode by GPU acceleration with vaapi:
> ./ffmpeg -vaapi_device /dev/dri/renderD128 -hwaccel vaapi \
>     -hwaccel_output_format vaapi \
>     -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
>     -vf "scale_vaapi=1280:720" -c:v h264_vaapi -f null /dev/null \
>     -vf "scale_vaapi=720:480" -c:v h264_vaapi -f null /dev/null \
>     -abr_pipeline
> 
>     test results:
>                 2 encoders 5 encoders 10 encoders
>     Improved       6.1%    6.9%       5.5%
> 
> For 1:N transcode by GPU acceleration with QSV:
> ./ffmpeg -hwaccel qsv -c:v h264_qsv \
>     -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
>     -vf "scale_qsv=1280:720:format=nv12" -c:v h264_qsv -f null /dev/null \
>     -vf "scale_qsv=720:480:format=nv12" -c:v h264_qsv -f null /dev/null
> 
>     test results:
>                 2 encoders  5 encoders 10 encoders
>     Improved       6%       4%         15%
> 
> For Intel GPU acceleration case, 1 decode to N scaling, by QSV:
> ./ffmpeg -hwaccel qsv -c:v h264_qsv \
>     -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
>     -vf "scale_qsv=1280:720:format=nv12,hwdownload" -pix_fmt nv12 -f null /dev/null \
>     -vf "scale_qsv=720:480:format=nv12,hwdownload" -pix_fmt nv12 -f null /dev/null
> 
>     test results:
>                 2 scale  5 scale   10 scale
>     Improved       12%     21%        21%
> 
> For CPU only 1 decode to N scaling:
> ./ffmpeg -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
>     -vf "scale=1280:720" -pix_fmt nv12 -f null /dev/null \
>     -vf "scale=720:480" -pix_fmt nv12 -f null /dev/null \
>     -abr_pipeline
> 
>     test results:
>                 2 scale  5 scale   10 scale
>     Improved       25%    107%       148%
> 
> Signed-off-by: Wang, Shaofei <shaofei.wang@intel.com>
> Reviewed-by: Zhao, Jun <jun.zhao@intel.com>
> ---
>  fftools/ffmpeg.c        | 228 ++++++++++++++++++++++++++++++++++++++++++++----
>  fftools/ffmpeg.h        |  15 ++++
>  fftools/ffmpeg_filter.c |   4 +
>  fftools/ffmpeg_opt.c    |   6 +-
>  4 files changed, 237 insertions(+), 16 deletions(-)

Looking at this, I see a lot of duplicated code and a lot of ifdefs.

If I look at one of the duplicated functions, I see:

@@ -1,10 +1,11 @@
-static int reap_filters(int flush)
+static int pipeline_reap_filters(int flush, InputFilter * ifilter)
 {
     AVFrame *filtered_frame = NULL;
     int i;
 
-    /* Reap all buffers present in the buffer sinks */
     for (i = 0; i < nb_output_streams; i++) {
+        if (ifilter == output_streams[i]->filter->graph->inputs[0]) break;
+    }
         OutputStream *ost = output_streams[i];
         OutputFile    *of = output_files[ost->file_index];
         AVFilterContext *filter;
@@ -12,7 +13,7 @@
         int ret = 0;
 
         if (!ost->filter || !ost->filter->graph->graph)
-            continue;
+        return 0;
         filter = ost->filter->filter;
 
         if (!ost->initialized) {
@@ -25,9 +26,8 @@
             }
         }
 
-        if (!ost->filtered_frame && !(ost->filtered_frame = av_frame_alloc())) {
+    if (!ost->filtered_frame && !(ost->filtered_frame = av_frame_alloc()))
             return AVERROR(ENOMEM);
-        }
         filtered_frame = ost->filtered_frame;
 
         while (1) {
@@ -97,7 +97,6 @@
 
             av_frame_unref(filtered_frame);
         }
-    }
 
     return 0;
 }
\ No newline at end of file


This is basically the same function, just copied and pasted with 2 lines
changed, one unrelated cosmetic change, and the code calling it placed
under an ifdef.

This is not ok

Also, IIRC Nicolas knows this part of the codebase best, so it probably
makes sense for him to comment. But as far as my opinion
goes, I would prefer to avoid duplicate code paths or ifdefs.
They have a lot of disadvantages: they make maintenance harder and
also make testing harder, as only one of several alternative paths
would be tested in each individual test, ...

So what I would really like to see is this being done in a cleaner
way: preferably one code path when possible, and best results by default,
with no need to manually enable the fast path.

Also, the question of scalability should be considered. I am not saying
that it requires any change, but some thought should be given to what
happens if there are 1000 outputs or just 1, and whether the change
makes sense for such cases too.

thanks

[...]
Shaofei Wang Jan. 21, 2019, 8:19 a.m. UTC | #5
> -----Original Message-----
> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
> Michael Niedermayer
> Sent: Thursday, January 17, 2019 8:30 PM
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Cc: Nicolas George <george@nsup.org>
> Subject: Re: [FFmpeg-devel] [PATCH v3] Improved the performance of 1
> decode + N filter graphs and adaptive bitrate.
> 
> On Wed, Jan 16, 2019 at 04:17:07PM -0500, Shaofei Wang wrote:
> > With new option "-abr_pipeline"
> > It enabled multiple filter graph concurrency, which bring obove about
> > 4%~20% improvement in some 1:N scenarios by CPU or GPU acceleration
> >
> > Below are some test cases and comparison as reference.
> > (Hardware platform: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz)
> > (Software: Intel iHD driver - 16.9.00100, CentOS 7)
> >
> > For 1:N transcode by GPU acceleration with vaapi:
> > ./ffmpeg -vaapi_device /dev/dri/renderD128 -hwaccel vaapi \
> >     -hwaccel_output_format vaapi \
> >     -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> >     -vf "scale_vaapi=1280:720" -c:v h264_vaapi -f null /dev/null \
> >     -vf "scale_vaapi=720:480" -c:v h264_vaapi -f null /dev/null \
> >     -abr_pipeline
> >
> >     test results:
> >                 2 encoders 5 encoders 10 encoders
> >     Improved       6.1%    6.9%       5.5%
> >
> > For 1:N transcode by GPU acceleration with QSV:
> > ./ffmpeg -hwaccel qsv -c:v h264_qsv \
> >     -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> >     -vf "scale_qsv=1280:720:format=nv12" -c:v h264_qsv -f null /dev/null
> \
> >     -vf "scale_qsv=720:480:format=nv12" -c:v h264_qsv -f null
> > /dev/null
> >
> >     test results:
> >                 2 encoders  5 encoders 10 encoders
> >     Improved       6%       4%         15%
> >
> > For Intel GPU acceleration case, 1 decode to N scaling, by QSV:
> > ./ffmpeg -hwaccel qsv -c:v h264_qsv \
> >     -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> >     -vf "scale_qsv=1280:720:format=nv12,hwdownload" -pix_fmt nv12 -f
> null /dev/null \
> >     -vf "scale_qsv=720:480:format=nv12,hwdownload" -pix_fmt nv12 -f
> > null /dev/null
> >
> >     test results:
> >                 2 scale  5 scale   10 scale
> >     Improved       12%     21%        21%
> >
> > For CPU only 1 decode to N scaling:
> > ./ffmpeg -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> >     -vf "scale=1280:720" -pix_fmt nv12 -f null /dev/null \
> >     -vf "scale=720:480" -pix_fmt nv12 -f null /dev/null \
> >     -abr_pipeline
> >
> >     test results:
> >                 2 scale  5 scale   10 scale
> >     Improved       25%    107%       148%
> >
> > Signed-off-by: Wang, Shaofei <shaofei.wang@intel.com>
> > Reviewed-by: Zhao, Jun <jun.zhao@intel.com>
> > ---
> >  fftools/ffmpeg.c        | 228
> ++++++++++++++++++++++++++++++++++++++++++++----
> >  fftools/ffmpeg.h        |  15 ++++
> >  fftools/ffmpeg_filter.c |   4 +
> >  fftools/ffmpeg_opt.c    |   6 +-
> >  4 files changed, 237 insertions(+), 16 deletions(-)
> 
> Looking at this i see alot of duplicated code and alot of ifdefs
Since I didn't want to change the function interface of reap_filters(), a non-looping
reap function was created.
I will base it on reap_filters() in the next patch to avoid the duplicated lines.

> Preferably one codepath when possible, and best results by default no need to
> manually enable the fast path.
If the option to disable/enable the fast path is not needed by users, I'll remove it. But
before that, here are some reasons for keeping it:
1. It gives users more choice: they can decide whether to use it depending on their case.
Otherwise we would need to implement the 'strategies' that decide for users when to
enable/disable the fast path.
2. It makes it easy to compare the results and determine which path is best.

Thanks
Michael Niedermayer Jan. 21, 2019, 8:45 p.m. UTC | #6
On Mon, Jan 21, 2019 at 08:19:38AM +0000, Wang, Shaofei wrote:
> > -----Original Message-----
> > From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
> > Michael Niedermayer
> > Sent: Thursday, January 17, 2019 8:30 PM
> > To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> > Cc: Nicolas George <george@nsup.org>
> > Subject: Re: [FFmpeg-devel] [PATCH v3] Improved the performance of 1
> > decode + N filter graphs and adaptive bitrate.
> > 
> > On Wed, Jan 16, 2019 at 04:17:07PM -0500, Shaofei Wang wrote:
> > > With new option "-abr_pipeline"
> > > It enabled multiple filter graph concurrency, which bring obove about
> > > 4%~20% improvement in some 1:N scenarios by CPU or GPU acceleration
> > >
> > > Below are some test cases and comparison as reference.
> > > (Hardware platform: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz)
> > > (Software: Intel iHD driver - 16.9.00100, CentOS 7)
> > >
> > > For 1:N transcode by GPU acceleration with vaapi:
> > > ./ffmpeg -vaapi_device /dev/dri/renderD128 -hwaccel vaapi \
> > >     -hwaccel_output_format vaapi \
> > >     -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> > >     -vf "scale_vaapi=1280:720" -c:v h264_vaapi -f null /dev/null \
> > >     -vf "scale_vaapi=720:480" -c:v h264_vaapi -f null /dev/null \
> > >     -abr_pipeline
> > >
> > >     test results:
> > >                 2 encoders 5 encoders 10 encoders
> > >     Improved       6.1%    6.9%       5.5%
> > >
> > > For 1:N transcode by GPU acceleration with QSV:
> > > ./ffmpeg -hwaccel qsv -c:v h264_qsv \
> > >     -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> > >     -vf "scale_qsv=1280:720:format=nv12" -c:v h264_qsv -f null /dev/null
> > \
> > >     -vf "scale_qsv=720:480:format=nv12" -c:v h264_qsv -f null
> > > /dev/null
> > >
> > >     test results:
> > >                 2 encoders  5 encoders 10 encoders
> > >     Improved       6%       4%         15%
> > >
> > > For Intel GPU acceleration case, 1 decode to N scaling, by QSV:
> > > ./ffmpeg -hwaccel qsv -c:v h264_qsv \
> > >     -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> > >     -vf "scale_qsv=1280:720:format=nv12,hwdownload" -pix_fmt nv12 -f
> > null /dev/null \
> > >     -vf "scale_qsv=720:480:format=nv12,hwdownload" -pix_fmt nv12 -f
> > > null /dev/null
> > >
> > >     test results:
> > >                 2 scale  5 scale   10 scale
> > >     Improved       12%     21%        21%
> > >
> > > For CPU only 1 decode to N scaling:
> > > ./ffmpeg -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> > >     -vf "scale=1280:720" -pix_fmt nv12 -f null /dev/null \
> > >     -vf "scale=720:480" -pix_fmt nv12 -f null /dev/null \
> > >     -abr_pipeline
> > >
> > >     test results:
> > >                 2 scale  5 scale   10 scale
> > >     Improved       25%    107%       148%
> > >
> > > Signed-off-by: Wang, Shaofei <shaofei.wang@intel.com>
> > > Reviewed-by: Zhao, Jun <jun.zhao@intel.com>
> > > ---
> > >  fftools/ffmpeg.c        | 228
> > ++++++++++++++++++++++++++++++++++++++++++++----
> > >  fftools/ffmpeg.h        |  15 ++++
> > >  fftools/ffmpeg_filter.c |   4 +
> > >  fftools/ffmpeg_opt.c    |   6 +-
> > >  4 files changed, 237 insertions(+), 16 deletions(-)
> > 
> > Looking at this i see alot of duplicated code and alot of ifdefs
> Since I didn't want to change the function interface of reap_filters(), a none-loop reap
> function generated.
> Will change it base on the reap_filters() to avoid duplicated lines in the next patch.
> 
> > Preferably one codepath when possible, and best results by default no need to
> > manually enable the fast path.
> If disable/enable the fast path option is not needed for users, i'll remove it. But before
> that, there are some reasons:
> 1. it provide more choice for user to decide whether to use it depend on their cases, 
> otherwise we need to implement the 'strategies' for users to decide when to enable/disable
> the fast path.
> 2. it's easy to compare the result to make sure which is the best

It's fine if users have the option to tune it, but IMHO it should "just work"
well by default.

thanks

[...]
diff mbox

Patch

diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
index 544f1a1..7dbff15 100644
--- a/fftools/ffmpeg.c
+++ b/fftools/ffmpeg.c
@@ -1523,6 +1523,109 @@  static int reap_filters(int flush)
     return 0;
 }
 
+static int pipeline_reap_filters(int flush, InputFilter * ifilter)
+{
+    AVFrame *filtered_frame = NULL;
+    int i;
+
+    for (i = 0; i < nb_output_streams; i++) {
+        if (ifilter == output_streams[i]->filter->graph->inputs[0]) break;
+    }
+    OutputStream *ost = output_streams[i];
+    OutputFile    *of = output_files[ost->file_index];
+    AVFilterContext *filter;
+    AVCodecContext *enc = ost->enc_ctx;
+    int ret = 0;
+
+    if (!ost->filter || !ost->filter->graph->graph)
+        return 0;
+    filter = ost->filter->filter;
+
+    if (!ost->initialized) {
+        char error[1024] = "";
+        ret = init_output_stream(ost, error, sizeof(error));
+        if (ret < 0) {
+            av_log(NULL, AV_LOG_ERROR, "Error initializing output stream %d:%d -- %s\n",
+                   ost->file_index, ost->index, error);
+            exit_program(1);
+        }
+    }
+
+    if (!ost->filtered_frame && !(ost->filtered_frame = av_frame_alloc()))
+        return AVERROR(ENOMEM);
+    filtered_frame = ost->filtered_frame;
+
+    while (1) {
+        double float_pts = AV_NOPTS_VALUE; // this is identical to filtered_frame.pts but with higher precision
+        ret = av_buffersink_get_frame_flags(filter, filtered_frame,
+                                           AV_BUFFERSINK_FLAG_NO_REQUEST);
+        if (ret < 0) {
+            if (ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) {
+                av_log(NULL, AV_LOG_WARNING,
+                       "Error in av_buffersink_get_frame_flags(): %s\n", av_err2str(ret));
+            } else if (flush && ret == AVERROR_EOF) {
+                if (av_buffersink_get_type(filter) == AVMEDIA_TYPE_VIDEO)
+                    do_video_out(of, ost, NULL, AV_NOPTS_VALUE);
+            }
+            break;
+        }
+        if (ost->finished) {
+            av_frame_unref(filtered_frame);
+            continue;
+        }
+        if (filtered_frame->pts != AV_NOPTS_VALUE) {
+            int64_t start_time = (of->start_time == AV_NOPTS_VALUE) ? 0 : of->start_time;
+            AVRational filter_tb = av_buffersink_get_time_base(filter);
+            AVRational tb = enc->time_base;
+            int extra_bits = av_clip(29 - av_log2(tb.den), 0, 16);
+
+            tb.den <<= extra_bits;
+            float_pts =
+                av_rescale_q(filtered_frame->pts, filter_tb, tb) -
+                av_rescale_q(start_time, AV_TIME_BASE_Q, tb);
+            float_pts /= 1 << extra_bits;
+            // avoid exact midoints to reduce the chance of rounding differences, this can be removed in case the fps code is changed to work with integers
+            float_pts += FFSIGN(float_pts) * 1.0 / (1<<17);
+
+            filtered_frame->pts =
+                av_rescale_q(filtered_frame->pts, filter_tb, enc->time_base) -
+                av_rescale_q(start_time, AV_TIME_BASE_Q, enc->time_base);
+        }
+
+        switch (av_buffersink_get_type(filter)) {
+        case AVMEDIA_TYPE_VIDEO:
+            if (!ost->frame_aspect_ratio.num)
+                enc->sample_aspect_ratio = filtered_frame->sample_aspect_ratio;
+
+            if (debug_ts) {
+                av_log(NULL, AV_LOG_INFO, "filter -> pts:%s pts_time:%s exact:%f time_base:%d/%d\n",
+                        av_ts2str(filtered_frame->pts), av_ts2timestr(filtered_frame->pts, &enc->time_base),
+                        float_pts,
+                        enc->time_base.num, enc->time_base.den);
+            }
+
+            do_video_out(of, ost, filtered_frame, float_pts);
+            break;
+        case AVMEDIA_TYPE_AUDIO:
+            if (!(enc->codec->capabilities & AV_CODEC_CAP_PARAM_CHANGE) &&
+                enc->channels != filtered_frame->channels) {
+                av_log(NULL, AV_LOG_ERROR,
+                       "Audio filter graph output is not normalized and encoder does not support parameter changes\n");
+                break;
+            }
+            do_audio_out(of, ost, filtered_frame);
+            break;
+        default:
+            // TODO support subtitle filters
+            av_assert0(0);
+        }
+
+        av_frame_unref(filtered_frame);
+    }
+
+    return 0;
+}
+
 static void print_final_stats(int64_t total_size)
 {
     uint64_t video_size = 0, audio_size = 0, extra_size = 0, other_size = 0;
@@ -2179,7 +2282,8 @@  static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame)
             }
         }
 
-        ret = reap_filters(1);
+        ret = abr_pipeline ? pipeline_reap_filters(1, ifilter) : reap_filters(1);
+
         if (ret < 0 && ret != AVERROR_EOF) {
             av_log(NULL, AV_LOG_ERROR, "Error while filtering: %s\n", av_err2str(ret));
             return ret;
@@ -2208,6 +2312,16 @@  static int ifilter_send_eof(InputFilter *ifilter, int64_t pts)
 
     ifilter->eof = 1;
 
+#if HAVE_THREADS
+    if (abr_pipeline) {
+        ifilter->waited_frm = NULL;
+        pthread_mutex_lock(&ifilter->process_mutex);
+        ifilter->t_end = 1;
+        pthread_cond_signal(&ifilter->process_cond);
+        pthread_mutex_unlock(&ifilter->process_mutex);
+        pthread_join(ifilter->f_thread, NULL);
+    }
+#endif
     if (ifilter->filter) {
         ret = av_buffersrc_close(ifilter->filter, pts, AV_BUFFERSRC_FLAG_PUSH);
         if (ret < 0)
@@ -2252,6 +2366,41 @@  static int decode(AVCodecContext *avctx, AVFrame *frame, int *got_frame, AVPacke
     return 0;
 }
 
+#if HAVE_THREADS
+static void *filter_pipeline(void *arg)
+{
+    InputFilter *fl = arg;
+    AVFrame *frm;
+    int ret;
+    while(1) {
+        pthread_mutex_lock(&fl->process_mutex);
+        while (fl->waited_frm == NULL && !fl->t_end)
+            pthread_cond_wait(&fl->process_cond, &fl->process_mutex);
+        pthread_mutex_unlock(&fl->process_mutex);
+
+        if (fl->t_end) break;
+
+        frm = fl->waited_frm;
+        ret = ifilter_send_frame(fl, frm);
+        if (ret < 0) {
+            av_log(NULL, AV_LOG_ERROR,
+                   "Failed to inject frame into filter network: %s\n", av_err2str(ret));
+        } else {
+            ret = pipeline_reap_filters(0, fl);
+        }
+        fl->t_error = ret;
+
+        pthread_mutex_lock(&fl->finish_mutex);
+        fl->waited_frm = NULL;
+        pthread_cond_signal(&fl->finish_cond);
+        pthread_mutex_unlock(&fl->finish_mutex);
+
+        if (ret < 0)
+            break;
+    }
+    return fl;
+}
+#endif
 static int send_frame_to_filters(InputStream *ist, AVFrame *decoded_frame)
 {
     int i, ret;
@@ -2259,22 +2408,71 @@  static int send_frame_to_filters(InputStream *ist, AVFrame *decoded_frame)
 
     av_assert1(ist->nb_filters > 0); /* ensure ret is initialized */
     for (i = 0; i < ist->nb_filters; i++) {
-        if (i < ist->nb_filters - 1) {
-            f = ist->filter_frame;
-            ret = av_frame_ref(f, decoded_frame);
-            if (ret < 0)
+        if (!abr_pipeline) {
+            if (i < ist->nb_filters - 1) {
+                f = ist->filter_frame;
+                ret = av_frame_ref(f, decoded_frame);
+                if (ret < 0)
+                    break;
+            } else
+                f = decoded_frame;
+
+                ret = ifilter_send_frame(ist->filters[i], f);
+                if (ret == AVERROR_EOF)
+                    ret = 0; /* ignore */
+                if (ret < 0) {
+                    av_log(NULL, AV_LOG_ERROR,
+                           "Failed to inject frame into filter network: %s\n", av_err2str(ret));
+                    break;
+                }
+        } else {
+#if HAVE_THREADS
+            if (i < ist->nb_filters - 1) {
+                f = &ist->filters[i]->input_frm;
+                ret = av_frame_ref(f, decoded_frame);
+                if (ret < 0)
+                    break;
+            } else
+                f = decoded_frame;
+
+            if (!ist->filters[i]->b_abr_thread_init) {
+                if ((ret = pthread_create(&ist->filters[i]->f_thread, NULL, filter_pipeline, ist->filters[i]))) {
+                    av_log(NULL, AV_LOG_ERROR, "pthread_create failed: %s. Try to increase `ulimit -v` or decrease `ulimit -s`.\n", strerror(ret));
+                    return AVERROR(ret);
+                }
+                pthread_mutex_init(&ist->filters[i]->process_mutex, NULL);
+                pthread_mutex_init(&ist->filters[i]->finish_mutex, NULL);
+                pthread_cond_init(&ist->filters[i]->process_cond, NULL);
+                pthread_cond_init(&ist->filters[i]->finish_cond, NULL);
+                ist->filters[i]->t_end = 0;
+                ist->filters[i]->t_error = 0;
+                ist->filters[i]->b_abr_thread_init = 1;
+            }
+
+            pthread_mutex_lock(&ist->filters[i]->process_mutex);
+            ist->filters[i]->waited_frm = f;
+            pthread_cond_signal(&ist->filters[i]->process_cond);
+            pthread_mutex_unlock(&ist->filters[i]->process_mutex);
+#endif
+        }
+    }
+#if HAVE_THREADS
+    if (abr_pipeline) {
+        for (i = 0; i < ist->nb_filters; i++) {
+            pthread_mutex_lock(&ist->filters[i]->finish_mutex);
+            while(ist->filters[i]->waited_frm != NULL)
+                pthread_cond_wait(&ist->filters[i]->finish_cond, &ist->filters[i]->finish_mutex);
+            pthread_mutex_unlock(&ist->filters[i]->finish_mutex);
+        }
+        for (i = 0; i < ist->nb_filters; i++) {
+            if (ist->filters[i]->t_error < 0) {
+                ret = ist->filters[i]->t_error;
                 break;
-        } else
-            f = decoded_frame;
-        ret = ifilter_send_frame(ist->filters[i], f);
-        if (ret == AVERROR_EOF)
-            ret = 0; /* ignore */
-        if (ret < 0) {
-            av_log(NULL, AV_LOG_ERROR,
-                   "Failed to inject frame into filter network: %s\n", av_err2str(ret));
-            break;
+            }
         }
     }
+#endif
+
     return ret;
 }
 
@@ -4642,7 +4840,7 @@  static int transcode_step(void)
     if (ret < 0)
         return ret == AVERROR_EOF ? 0 : ret;
 
-    return reap_filters(0);
+    return abr_pipeline ? 0 : reap_filters(0);
 }
 
 /*
diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h
index eb1eaf6..110306a 100644
--- a/fftools/ffmpeg.h
+++ b/fftools/ffmpeg.h
@@ -253,6 +253,20 @@  typedef struct InputFilter {
 
     AVBufferRef *hw_frames_ctx;
 
+    // for abr pipeline
+    int b_abr_thread_init;
+#if HAVE_THREADS
+    AVFrame *waited_frm;
+    AVFrame input_frm;
+    pthread_t f_thread;
+    pthread_cond_t process_cond;
+    pthread_cond_t finish_cond;
+    pthread_mutex_t process_mutex;
+    pthread_mutex_t finish_mutex;
+    int t_end;
+    int t_error;
+#endif
+
     int eof;
 } InputFilter;
 
@@ -606,6 +620,7 @@  extern int frame_bits_per_raw_sample;
 extern AVIOContext *progress_avio;
 extern float max_error_rate;
 extern char *videotoolbox_pixfmt;
+extern int abr_pipeline;
 
 extern int filter_nbthreads;
 extern int filter_complex_nbthreads;
diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c
index 6518d50..8f14fbc 100644
--- a/fftools/ffmpeg_filter.c
+++ b/fftools/ffmpeg_filter.c
@@ -197,6 +197,7 @@  DEF_CHOOSE_FORMAT(channel_layouts, uint64_t, channel_layout, channel_layouts, 0,
 int init_simple_filtergraph(InputStream *ist, OutputStream *ost)
 {
     FilterGraph *fg = av_mallocz(sizeof(*fg));
+    int i;
 
     if (!fg)
         exit_program(1);
@@ -225,6 +226,9 @@  int init_simple_filtergraph(InputStream *ist, OutputStream *ost)
     GROW_ARRAY(ist->filters, ist->nb_filters);
     ist->filters[ist->nb_filters - 1] = fg->inputs[0];
 
+    if (abr_pipeline)
+        for (i = 0; i < ist->nb_filters; i++)
+            ist->filters[i]->b_abr_thread_init = 0;
     GROW_ARRAY(filtergraphs, nb_filtergraphs);
     filtergraphs[nb_filtergraphs - 1] = fg;
 
diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c
index d4851a2..fa5a556 100644
--- a/fftools/ffmpeg_opt.c
+++ b/fftools/ffmpeg_opt.c
@@ -110,6 +110,7 @@  float max_error_rate  = 2.0/3;
 int filter_nbthreads = 0;
 int filter_complex_nbthreads = 0;
 int vstats_version = 2;
+int abr_pipeline      = 0;
 
 
 static int intra_only         = 0;
@@ -3502,7 +3503,10 @@  const OptionDef options[] = {
         "set the maximum number of queued packets from the demuxer" },
     { "find_stream_info", OPT_BOOL | OPT_PERFILE | OPT_INPUT | OPT_EXPERT, { &find_stream_info },
         "read and decode the streams to fill missing information with heuristics" },
-
+#if HAVE_THREADS
+    { "abr_pipeline",    OPT_BOOL,                                    { &abr_pipeline },
+        "adaptive bitrate pipeline (1 decode to N filter graphs, and 1 to N transcode" },
+#endif
     /* video options */
     { "vframes",      OPT_VIDEO | HAS_ARG  | OPT_PERFILE | OPT_OUTPUT,           { .func_arg = opt_video_frames },
         "set the number of video frames to output", "number" },