diff mbox series

[FFmpeg-devel,11/27] fftools/ffmpeg_enc: move fps conversion code to ffmpeg_filter

Message ID 20230919191044.18873-12-anton@khirnov.net
State New
Headers show
Series [FFmpeg-devel,01/27] fftools/ffmpeg: move derivation of frame duration from filter framerate | expand

Commit Message

Anton Khirnov Sept. 19, 2023, 7:10 p.m. UTC
Its function is analogous to that of the fps filter, so filtering is a
more appropriate place for this.

The main practical reason for this move is that it places the encoding
sync queue right at the boundary between filters and encoders. This will
be important when switching to threaded scheduling, as the sync queue
involves multiple streams and will thus need to do nontrivial
inter-thread synchronization.

In addition to framerate conversion, the closely-related
* encoder timebase selection
* application of the start_time offset
are also moved to filtering.
---
 fftools/ffmpeg.c        |   6 +-
 fftools/ffmpeg.h        |   7 +-
 fftools/ffmpeg_enc.c    | 370 +++-------------------------------
 fftools/ffmpeg_filter.c | 432 ++++++++++++++++++++++++++++++++++++++--
 4 files changed, 442 insertions(+), 373 deletions(-)
diff mbox series

Patch

diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
index a854589bef..7c33b56cd3 100644
--- a/fftools/ffmpeg.c
+++ b/fftools/ffmpeg.c
@@ -536,7 +536,7 @@  static void print_report(int is_last_report, int64_t timer_start, int64_t cur_ti
             av_bprintf(&buf_script, "stream_%d_%d_q=%.1f\n",
                        ost->file_index, ost->index, q);
         }
-        if (!vid && ost->type == AVMEDIA_TYPE_VIDEO) {
+        if (!vid && ost->type == AVMEDIA_TYPE_VIDEO && ost->filter) {
             float fps;
             uint64_t frame_number = atomic_load(&ost->packets_written);
 
@@ -550,8 +550,8 @@  static void print_report(int is_last_report, int64_t timer_start, int64_t cur_ti
             if (is_last_report)
                 av_bprintf(&buf, "L");
 
-            nb_frames_dup  = ost->nb_frames_dup;
-            nb_frames_drop = ost->nb_frames_drop;
+            nb_frames_dup  = ost->filter->nb_frames_dup;
+            nb_frames_drop = ost->filter->nb_frames_drop;
 
             vid = 1;
         }
diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h
index eaa663e718..15790d3e0c 100644
--- a/fftools/ffmpeg.h
+++ b/fftools/ffmpeg.h
@@ -302,6 +302,9 @@  typedef struct OutputFilter {
 
     /* pts of the last frame received from this filter, in AV_TIME_BASE_Q */
     int64_t last_pts;
+
+    uint64_t nb_frames_dup;
+    uint64_t nb_frames_drop;
 } OutputFilter;
 
 typedef struct FilterGraph {
@@ -536,10 +539,6 @@  typedef struct OutputStream {
     Encoder *enc;
     AVCodecContext *enc_ctx;
 
-    uint64_t nb_frames_dup;
-    uint64_t nb_frames_drop;
-    int64_t last_dropped;
-
     /* video only */
     AVRational frame_rate;
     AVRational max_frame_rate;
diff --git a/fftools/ffmpeg_enc.c b/fftools/ffmpeg_enc.c
index 9aee18bfe1..321554ab5c 100644
--- a/fftools/ffmpeg_enc.c
+++ b/fftools/ffmpeg_enc.c
@@ -36,29 +36,9 @@ 
 
 #include "libavcodec/avcodec.h"
 
-// FIXME private header, used for mid_pred()
-#include "libavcodec/mathops.h"
-
 #include "libavformat/avformat.h"
 
-typedef struct FPSConvContext {
-    AVFrame *last_frame;
-    /* number of frames emitted by the video-encoding sync code */
-    int64_t frame_number;
-    /* history of nb_frames_prev, i.e. the number of times the
-     * previous frame was duplicated by vsync code in recent
-     * do_video_out() calls */
-    int64_t frames_prev_hist[3];
-
-    uint64_t dup_warning;
-} FPSConvContext;
-
 struct Encoder {
-    /* predicted pts of the next frame to be encoded */
-    int64_t next_pts;
-
-    FPSConvContext fps;
-
     AVFrame *sq_frame;
 
     // packet for receiving encoded output
@@ -80,7 +60,6 @@  void enc_free(Encoder **penc)
     if (!enc)
         return;
 
-    av_frame_free(&enc->fps.last_frame);
     av_frame_free(&enc->sq_frame);
 
     av_packet_free(&enc->pkt);
@@ -98,14 +77,6 @@  int enc_alloc(Encoder **penc, const AVCodec *codec)
     if (!enc)
         return AVERROR(ENOMEM);
 
-    if (codec->type == AVMEDIA_TYPE_VIDEO) {
-        enc->fps.last_frame = av_frame_alloc();
-        if (!enc->fps.last_frame)
-            goto fail;
-
-        enc->fps.dup_warning = 1000;
-    }
-
     enc->pkt = av_packet_alloc();
     if (!enc->pkt)
         goto fail;
@@ -194,98 +165,6 @@  static int set_encoder_id(OutputFile *of, OutputStream *ost)
     return 0;
 }
 
-static int enc_choose_timebase(OutputStream *ost, AVFrame *frame)
-{
-    const OutputFile *of = output_files[ost->file_index];
-    AVCodecContext  *enc = ost->enc_ctx;
-    AVRational        tb = (AVRational){ 0, 0 };
-    AVRational fr;
-    FrameData *fd;
-
-    if (ost->type == AVMEDIA_TYPE_SUBTITLE) {
-        if (ost->enc_timebase.num)
-            av_log(ost, AV_LOG_WARNING,
-                   "-enc_time_base not supported for subtitles, ignoring\n");
-        enc->time_base = AV_TIME_BASE_Q;
-        return 0;
-    }
-
-    fd = frame_data(frame);
-
-    // apply -enc_time_base
-    if (ost->enc_timebase.num == ENC_TIME_BASE_DEMUX &&
-        (fd->dec.tb.num <= 0 || fd->dec.tb.den <= 0)) {
-        av_log(ost, AV_LOG_ERROR,
-               "Demuxing timebase not available - cannot use it for encoding\n");
-        return AVERROR(EINVAL);
-    }
-
-    switch (ost->enc_timebase.num) {
-    case 0:                                            break;
-    case ENC_TIME_BASE_DEMUX:  tb = fd->dec.tb;        break;
-    case ENC_TIME_BASE_FILTER: tb = frame->time_base;  break;
-    default:                   tb = ost->enc_timebase; break;
-    }
-
-    if (ost->type == AVMEDIA_TYPE_AUDIO) {
-        enc->time_base = tb.num ? tb : (AVRational){ 1, frame->sample_rate };
-        return 0;
-    }
-
-    fr = ost->frame_rate;
-    if (!fr.num)
-        fr = fd->frame_rate_filter;
-
-    if (ost->is_cfr) {
-        if (!fr.num && !ost->max_frame_rate.num) {
-            fr = (AVRational){25, 1};
-            av_log(ost, AV_LOG_WARNING,
-                   "No information "
-                   "about the input framerate is available. Falling "
-                   "back to a default value of 25fps. Use the -r option "
-                   "if you want a different framerate.\n");
-        }
-
-        if (ost->max_frame_rate.num &&
-            (av_q2d(fr) > av_q2d(ost->max_frame_rate) ||
-            !fr.den))
-            fr = ost->max_frame_rate;
-    }
-
-    if (fr.num > 0) {
-        if (enc->codec->supported_framerates && !ost->force_fps) {
-            int idx = av_find_nearest_q_idx(fr, enc->codec->supported_framerates);
-            fr = enc->codec->supported_framerates[idx];
-        }
-        // reduce frame rate for mpeg4 to be within the spec limits
-        if (enc->codec_id == AV_CODEC_ID_MPEG4) {
-            av_reduce(&fr.num, &fr.den,
-                      fr.num, fr.den, 65535);
-        }
-    }
-
-    if (av_q2d(fr) > 1e3 && ost->vsync_method != VSYNC_PASSTHROUGH &&
-        (ost->vsync_method == VSYNC_CFR || ost->vsync_method == VSYNC_VSCFR ||
-        (ost->vsync_method == VSYNC_AUTO && !(of->format->flags & AVFMT_VARIABLE_FPS)))){
-        av_log(ost, AV_LOG_WARNING, "Frame rate very high for a muxer not efficiently supporting it.\n"
-                                    "Please consider specifying a lower framerate, a different muxer or "
-                                    "setting vsync/fps_mode to vfr\n");
-    }
-
-    enc->framerate = fr;
-
-    ost->st->avg_frame_rate = fr;
-
-    if (!(tb.num > 0 && tb.den > 0))
-        tb = av_inv_q(fr);
-    if (!(tb.num > 0 && tb.den > 0))
-        tb = frame->time_base;
-
-    enc->time_base = tb;
-
-    return 0;
-}
-
 int enc_open(OutputStream *ost, AVFrame *frame)
 {
     InputStream *ist = ost->ist;
@@ -317,10 +196,11 @@  int enc_open(OutputStream *ost, AVFrame *frame)
         dec_ctx = ist->dec_ctx;
     }
 
-    ret = enc_choose_timebase(ost, frame);
-    if (ret < 0) {
-        av_log(ost, AV_LOG_ERROR, "Could not choose a time base for encoding\n");
-        return AVERROR(EINVAL);
+    // the timebase is chosen by filtering code
+    if (ost->type == AVMEDIA_TYPE_AUDIO || ost->type == AVMEDIA_TYPE_VIDEO) {
+        enc_ctx->time_base      = frame->time_base;
+        enc_ctx->framerate      = fd->frame_rate_filter;
+        ost->st->avg_frame_rate = fd->frame_rate_filter;
     }
 
     switch (enc_ctx->codec_type) {
@@ -383,6 +263,11 @@  int enc_open(OutputStream *ost, AVFrame *frame)
         break;
         }
     case AVMEDIA_TYPE_SUBTITLE:
+        if (ost->enc_timebase.num)
+            av_log(ost, AV_LOG_WARNING,
+                   "-enc_time_base not supported for subtitles, ignoring\n");
+        enc_ctx->time_base = AV_TIME_BASE_Q;
+
         if (!enc_ctx->width) {
             enc_ctx->width     = ost->ist->par->width;
             enc_ctx->height    = ost->ist->par->height;
@@ -765,9 +650,6 @@  static int encode_frame(OutputFile *of, OutputStream *ost, AVFrame *frame)
 
         if (frame->sample_aspect_ratio.num && !ost->frame_aspect_ratio.num)
             enc->sample_aspect_ratio = frame->sample_aspect_ratio;
-    } else if (ost->last_dropped) {
-        ost->nb_frames_drop++;
-        ost->last_dropped = 0;
     }
 
     update_benchmark(NULL);
@@ -892,7 +774,6 @@  static int submit_encode_frame(OutputFile *of, OutputStream *ost,
 static int do_audio_out(OutputFile *of, OutputStream *ost,
                         AVFrame *frame)
 {
-    Encoder          *e = ost->enc;
     AVCodecContext *enc = ost->enc_ctx;
     int ret;
 
@@ -903,183 +784,15 @@  static int do_audio_out(OutputFile *of, OutputStream *ost,
         return 0;
     }
 
-    if (frame->pts == AV_NOPTS_VALUE)
-        frame->pts = e->next_pts;
-    else {
-        int64_t start_time = (of->start_time == AV_NOPTS_VALUE) ? 0 : of->start_time;
-        frame->pts =
-            av_rescale_q(frame->pts, frame->time_base, enc->time_base) -
-            av_rescale_q(start_time, AV_TIME_BASE_Q,   enc->time_base);
-    }
-    frame->time_base = enc->time_base;
-    frame->duration  = av_rescale_q(frame->nb_samples, (AVRational){1, frame->sample_rate},
-                                    enc->time_base);
-
     if (!check_recording_time(ost, frame->pts, frame->time_base))
         return 0;
 
-    e->next_pts = frame->pts + frame->nb_samples;
-
     ret = submit_encode_frame(of, ost, frame);
     return (ret < 0 && ret != AVERROR_EOF) ? ret : 0;
 }
 
-static double adjust_frame_pts_to_encoder_tb(AVFrame *frame, AVRational tb_dst,
-                                             int64_t start_time)
-{
-    double float_pts = AV_NOPTS_VALUE; // this is identical to frame.pts but with higher precision
-
-    AVRational        tb = tb_dst;
-    AVRational filter_tb = frame->time_base;
-    const int extra_bits = av_clip(29 - av_log2(tb.den), 0, 16);
-
-    if (frame->pts == AV_NOPTS_VALUE)
-        goto early_exit;
-
-    tb.den <<= extra_bits;
-    float_pts = av_rescale_q(frame->pts, filter_tb, tb) -
-                av_rescale_q(start_time, AV_TIME_BASE_Q, tb);
-    float_pts /= 1 << extra_bits;
-    // when float_pts is not exactly an integer,
-    // avoid exact midpoints to reduce the chance of rounding differences, this
-    // can be removed in case the fps code is changed to work with integers
-    if (float_pts != llrint(float_pts))
-        float_pts += FFSIGN(float_pts) * 1.0 / (1<<17);
-
-    frame->pts = av_rescale_q(frame->pts, filter_tb, tb_dst) -
-                 av_rescale_q(start_time, AV_TIME_BASE_Q, tb_dst);
-    frame->time_base = tb_dst;
-
-early_exit:
-
-    if (debug_ts) {
-        av_log(NULL, AV_LOG_INFO, "filter -> pts:%s pts_time:%s exact:%f time_base:%d/%d\n",
-               frame ? av_ts2str(frame->pts) : "NULL",
-               av_ts2timestr(frame->pts, &tb_dst),
-               float_pts, tb_dst.num, tb_dst.den);
-    }
-
-    return float_pts;
-}
-
-/* Convert frame timestamps to the encoder timebase and decide how many times
- * should this (and possibly previous) frame be repeated in order to conform to
- * desired target framerate (if any).
- */
-static void video_sync_process(OutputFile *of, OutputStream *ost, AVFrame *frame,
-                               int64_t *nb_frames, int64_t *nb_frames_prev)
-{
-    Encoder *e = ost->enc;
-    FPSConvContext *fps = &e->fps;
-    AVCodecContext *enc = ost->enc_ctx;
-    double delta0, delta, sync_ipts, duration;
-
-    if (!frame) {
-        *nb_frames_prev = *nb_frames = mid_pred(fps->frames_prev_hist[0],
-                                                fps->frames_prev_hist[1],
-                                                fps->frames_prev_hist[2]);
-        goto finish;
-    }
-
-    duration = lrintf(frame->duration * av_q2d(frame->time_base) / av_q2d(enc->time_base));
-
-    sync_ipts = adjust_frame_pts_to_encoder_tb(frame, enc->time_base,
-                                               of->start_time == AV_NOPTS_VALUE ? 0 : of->start_time);
-    /* delta0 is the "drift" between the input frame and
-     * where it would fall in the output. */
-    delta0 = sync_ipts - e->next_pts;
-    delta  = delta0 + duration;
-
-    // tracks the number of times the PREVIOUS frame should be duplicated,
-    // mostly for variable framerate (VFR)
-    *nb_frames_prev = 0;
-    /* by default, we output a single frame */
-    *nb_frames = 1;
-
-    if (delta0 < 0 &&
-        delta > 0 &&
-        ost->vsync_method != VSYNC_PASSTHROUGH &&
-        ost->vsync_method != VSYNC_DROP) {
-        if (delta0 < -0.6) {
-            av_log(ost, AV_LOG_VERBOSE, "Past duration %f too large\n", -delta0);
-        } else
-            av_log(ost, AV_LOG_DEBUG, "Clipping frame in rate conversion by %f\n", -delta0);
-        sync_ipts = e->next_pts;
-        duration += delta0;
-        delta0 = 0;
-    }
-
-    switch (ost->vsync_method) {
-    case VSYNC_VSCFR:
-        if (fps->frame_number == 0 && delta0 >= 0.5) {
-            av_log(ost, AV_LOG_DEBUG, "Not duplicating %d initial frames\n", (int)lrintf(delta0));
-            delta = duration;
-            delta0 = 0;
-            e->next_pts = llrint(sync_ipts);
-        }
-    case VSYNC_CFR:
-        // FIXME set to 0.5 after we fix some dts/pts bugs like in avidec.c
-        if (frame_drop_threshold && delta < frame_drop_threshold && fps->frame_number) {
-            *nb_frames = 0;
-        } else if (delta < -1.1)
-            *nb_frames = 0;
-        else if (delta > 1.1) {
-            *nb_frames = llrintf(delta);
-            if (delta0 > 1.1)
-                *nb_frames_prev = llrintf(delta0 - 0.6);
-        }
-        frame->duration = 1;
-        break;
-    case VSYNC_VFR:
-        if (delta <= -0.6)
-            *nb_frames = 0;
-        else if (delta > 0.6)
-            e->next_pts = llrint(sync_ipts);
-        frame->duration = duration;
-        break;
-    case VSYNC_DROP:
-    case VSYNC_PASSTHROUGH:
-        frame->duration = duration;
-        e->next_pts = llrint(sync_ipts);
-        break;
-    default:
-        av_assert0(0);
-    }
-
-finish:
-    memmove(fps->frames_prev_hist + 1,
-            fps->frames_prev_hist,
-            sizeof(fps->frames_prev_hist[0]) * (FF_ARRAY_ELEMS(fps->frames_prev_hist) - 1));
-    fps->frames_prev_hist[0] = *nb_frames_prev;
-
-    if (*nb_frames_prev == 0 && ost->last_dropped) {
-        ost->nb_frames_drop++;
-        av_log(ost, AV_LOG_VERBOSE,
-               "*** dropping frame %"PRId64" at ts %"PRId64"\n",
-               fps->frame_number, fps->last_frame->pts);
-    }
-    if (*nb_frames > (*nb_frames_prev && ost->last_dropped) + (*nb_frames > *nb_frames_prev)) {
-        if (*nb_frames > dts_error_threshold * 30) {
-            av_log(ost, AV_LOG_ERROR, "%"PRId64" frame duplication too large, skipping\n", *nb_frames - 1);
-            ost->nb_frames_drop++;
-            *nb_frames = 0;
-            return;
-        }
-        ost->nb_frames_dup += *nb_frames - (*nb_frames_prev && ost->last_dropped) - (*nb_frames > *nb_frames_prev);
-        av_log(ost, AV_LOG_VERBOSE, "*** %"PRId64" dup!\n", *nb_frames - 1);
-        if (ost->nb_frames_dup > fps->dup_warning) {
-            av_log(ost, AV_LOG_WARNING, "More than %"PRIu64" frames duplicated\n", fps->dup_warning);
-            fps->dup_warning *= 10;
-        }
-    }
-
-    ost->last_dropped = *nb_frames == *nb_frames_prev && frame;
-    ost->kf.dropped_keyframe |= ost->last_dropped && (frame->flags & AV_FRAME_FLAG_KEY);
-}
-
 static enum AVPictureType forced_kf_apply(void *logctx, KeyframeForceCtx *kf,
-                                          AVRational tb, const AVFrame *in_picture,
-                                          int dup_idx)
+                                          AVRational tb, const AVFrame *in_picture)
 {
     double pts_time;
 
@@ -1113,11 +826,8 @@  static enum AVPictureType forced_kf_apply(void *logctx, KeyframeForceCtx *kf,
             kf->expr_const_values[FKF_N_FORCED]     += 1;
             goto force_keyframe;
         }
-    } else if (kf->type == KF_FORCE_SOURCE && !dup_idx) {
-        int dropped_keyframe = kf->dropped_keyframe;
-        kf->dropped_keyframe = 0;
-        if ((in_picture->flags & AV_FRAME_FLAG_KEY) || dropped_keyframe)
-            goto force_keyframe;
+    } else if (kf->type == KF_FORCE_SOURCE && (in_picture->flags & AV_FRAME_FLAG_KEY)) {
+        goto force_keyframe;
     }
 
     return AV_PICTURE_TYPE_NONE;
@@ -1128,58 +838,26 @@  force_keyframe:
 }
 
 /* May modify/reset frame */
-static int do_video_out(OutputFile *of, OutputStream *ost, AVFrame *frame)
+static int do_video_out(OutputFile *of, OutputStream *ost, AVFrame *in_picture)
 {
     int ret;
-    Encoder *e = ost->enc;
     AVCodecContext *enc = ost->enc_ctx;
-    int64_t nb_frames, nb_frames_prev, i;
 
-    video_sync_process(of, ost, frame,
-                       &nb_frames, &nb_frames_prev);
+    if (!check_recording_time(ost, in_picture->pts, ost->enc_ctx->time_base))
+        return 0;
 
-    /* duplicates frame if needed */
-    for (i = 0; i < nb_frames; i++) {
-        AVFrame *in_picture;
-
-        if (i < nb_frames_prev && e->fps.last_frame->buf[0]) {
-            in_picture = e->fps.last_frame;
-        } else
-            in_picture = frame;
-
-        if (!in_picture)
-            return 0;
-
-        in_picture->pts = e->next_pts;
-
-        if (!check_recording_time(ost, in_picture->pts, ost->enc_ctx->time_base))
-            return 0;
-
-        in_picture->quality = enc->global_quality;
-        in_picture->pict_type = forced_kf_apply(ost, &ost->kf, enc->time_base, in_picture, i);
+    in_picture->quality = enc->global_quality;
+    in_picture->pict_type = forced_kf_apply(ost, &ost->kf, enc->time_base, in_picture);
 
 #if FFMPEG_OPT_TOP
-        if (ost->top_field_first >= 0) {
-            in_picture->flags &= ~AV_FRAME_FLAG_TOP_FIELD_FIRST;
-            in_picture->flags |= AV_FRAME_FLAG_TOP_FIELD_FIRST * (!!ost->top_field_first);
-        }
+    if (ost->top_field_first >= 0) {
+        in_picture->flags &= ~AV_FRAME_FLAG_TOP_FIELD_FIRST;
+        in_picture->flags |= AV_FRAME_FLAG_TOP_FIELD_FIRST * (!!ost->top_field_first);
+    }
 #endif
 
-        ret = submit_encode_frame(of, ost, in_picture);
-        if (ret == AVERROR_EOF)
-            break;
-        else if (ret < 0)
-            return ret;
-
-        e->next_pts++;
-        e->fps.frame_number++;
-    }
-
-    av_frame_unref(e->fps.last_frame);
-    if (frame)
-        av_frame_move_ref(e->fps.last_frame, frame);
-
-    return 0;
+    ret = submit_encode_frame(of, ost, in_picture);
+    return (ret == AVERROR_EOF) ? 0 : ret;
 }
 
 int enc_frame(OutputStream *ost, AVFrame *frame)
diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c
index 804b9de3dc..92f6a6236d 100644
--- a/fftools/ffmpeg_filter.c
+++ b/fftools/ffmpeg_filter.c
@@ -38,6 +38,9 @@ 
 #include "libavutil/samplefmt.h"
 #include "libavutil/timestamp.h"
 
+// FIXME private header, used for mid_pred()
+#include "libavcodec/mathops.h"
+
 typedef struct FilterGraphPriv {
     FilterGraph fg;
 
@@ -54,6 +57,8 @@  typedef struct FilterGraphPriv {
 
     // frame for temporarily holding output from the filtergraph
     AVFrame *frame;
+    // frame for sending output to the encoder
+    AVFrame *frame_enc;
 } FilterGraphPriv;
 
 static FilterGraphPriv *fgp_from_fg(FilterGraph *fg)
@@ -134,6 +139,26 @@  static InputFilterPriv *ifp_from_ifilter(InputFilter *ifilter)
     return (InputFilterPriv*)ifilter;
 }
 
+typedef struct FPSConvContext {
+    AVFrame *last_frame;
+    /* number of frames emitted by the video-encoding sync code */
+    int64_t frame_number;
+    /* history of nb_frames_prev, i.e. the number of times the
+     * previous frame was duplicated by vsync code in recent
+     * do_video_out() calls */
+    int64_t frames_prev_hist[3];
+
+    uint64_t dup_warning;
+
+    int               last_dropped;
+    int               dropped_keyframe;
+
+    AVRational        framerate;
+    AVRational        framerate_max;
+    const AVRational *framerate_supported;
+    int               framerate_clip;
+} FPSConvContext;
+
 typedef struct OutputFilterPriv {
     OutputFilter        ofilter;
 
@@ -145,7 +170,13 @@  typedef struct OutputFilterPriv {
     int sample_rate;
     AVChannelLayout ch_layout;
 
-    AVRational time_base;
+    // time base in which the output is sent to our downstream
+    // does not need to match the filtersink's timebase
+    AVRational tb_out;
+    // at least one frame with the above timebase was sent
+    // to our downstream, so it cannot change anymore
+    int        tb_out_locked;
+
     AVRational sample_aspect_ratio;
 
     // those are only set if no format is specified and the encoder gives us multiple options
@@ -154,6 +185,12 @@  typedef struct OutputFilterPriv {
     const AVChannelLayout *ch_layouts;
     const int *sample_rates;
 
+    AVRational enc_timebase;
+    // offset for output timestamps, in AV_TIME_BASE_Q
+    int64_t ts_offset;
+    int64_t next_pts;
+    FPSConvContext fps;
+
     // set to 1 after at least one frame passed through this output
     int got_frame;
 } OutputFilterPriv;
@@ -627,6 +664,7 @@  static int set_channel_layout(OutputFilterPriv *f, OutputStream *ost)
 
 int ofilter_bind_ost(OutputFilter *ofilter, OutputStream *ost)
 {
+    const OutputFile  *of = output_files[ost->file_index];
     OutputFilterPriv *ofp = ofp_from_ofilter(ofilter);
     FilterGraph  *fg = ofilter->graph;
     FilterGraphPriv *fgp = fgp_from_fg(fg);
@@ -637,6 +675,9 @@  int ofilter_bind_ost(OutputFilter *ofilter, OutputStream *ost)
     ofilter->ost = ost;
     av_freep(&ofilter->linklabel);
 
+    ofp->ts_offset     = of->start_time == AV_NOPTS_VALUE ? 0 : of->start_time;
+    ofp->enc_timebase = ost->enc_timebase;
+
     switch (ost->enc_ctx->codec_type) {
     case AVMEDIA_TYPE_VIDEO:
         ofp->width      = ost->enc_ctx->width;
@@ -673,6 +714,21 @@  int ofilter_bind_ost(OutputFilter *ofilter, OutputStream *ost)
 
         fgp->disable_conversions |= ost->keep_pix_fmt;
 
+        ofp->fps.last_frame = av_frame_alloc();
+        if (!ofp->fps.last_frame)
+            return AVERROR(ENOMEM);
+
+        ofp->fps.framerate           = ost->frame_rate;
+        ofp->fps.framerate_max       = ost->max_frame_rate;
+        ofp->fps.framerate_supported = ost->force_fps ?
+                                       NULL : c->supported_framerates;
+
+        // reduce frame rate for mpeg4 to be within the spec limits
+        if (c->id == AV_CODEC_ID_MPEG4)
+            ofp->fps.framerate_clip = 65535;
+
+        ofp->fps.dup_warning         = 1000;
+
         break;
     case AVMEDIA_TYPE_AUDIO:
         if (ost->enc_ctx->sample_fmt != AV_SAMPLE_FMT_NONE) {
@@ -777,6 +833,8 @@  void fg_free(FilterGraph **pfg)
         OutputFilter *ofilter = fg->outputs[j];
         OutputFilterPriv *ofp = ofp_from_ofilter(ofilter);
 
+        av_frame_free(&ofp->fps.last_frame);
+
         av_freep(&ofilter->linklabel);
         av_freep(&ofilter->name);
         av_channel_layout_uninit(&ofp->ch_layout);
@@ -786,6 +844,7 @@  void fg_free(FilterGraph **pfg)
     av_freep(&fgp->graph_desc);
 
     av_frame_free(&fgp->frame);
+    av_frame_free(&fgp->frame_enc);
 
     av_freep(pfg);
 }
@@ -828,8 +887,9 @@  int fg_create(FilterGraph **pfg, char *graph_desc)
 
     snprintf(fgp->log_name, sizeof(fgp->log_name), "fc#%d", fg->index);
 
-    fgp->frame = av_frame_alloc();
-    if (!fgp->frame)
+    fgp->frame     = av_frame_alloc();
+    fgp->frame_enc = av_frame_alloc();
+    if (!fgp->frame || !fgp->frame_enc)
         return AVERROR(ENOMEM);
 
     /* this graph is only used for determining the kinds of inputs
@@ -1630,7 +1690,16 @@  static int configure_filtergraph(FilterGraph *fg)
         ofp->width  = av_buffersink_get_w(sink);
         ofp->height = av_buffersink_get_h(sink);
 
-        ofp->time_base           = av_buffersink_get_time_base(sink);
+        // If the timing parameters are not locked yet, get the tentative values
+        // here but don't lock them. They will only be used if no output frames
+        // are ever produced.
+        if (!ofp->tb_out_locked) {
+            AVRational fr = av_buffersink_get_frame_rate(sink);
+            if (ofp->fps.framerate.num <= 0 && ofp->fps.framerate.den <= 0 &&
+                fr.num > 0 && fr.den > 0)
+                ofp->fps.framerate = fr;
+            ofp->tb_out = av_buffersink_get_time_base(sink);
+        }
         ofp->sample_aspect_ratio = av_buffersink_get_sample_aspect_ratio(sink);
 
         ofp->sample_rate    = av_buffersink_get_sample_rate(sink);
@@ -1765,6 +1834,313 @@  void fg_send_command(FilterGraph *fg, double time, const char *target,
     }
 }
 
+static int choose_out_timebase(OutputFilterPriv *ofp, AVFrame *frame)
+{
+    OutputFilter *ofilter = &ofp->ofilter;
+    FPSConvContext   *fps = &ofp->fps;
+    AVRational        tb = (AVRational){ 0, 0 };
+    AVRational fr;
+    FrameData *fd;
+
+    fd = frame_data(frame);
+
+    // apply -enc_time_base
+    if (ofp->enc_timebase.num == ENC_TIME_BASE_DEMUX &&
+        (fd->dec.tb.num <= 0 || fd->dec.tb.den <= 0)) {
+        av_log(ofilter->ost, AV_LOG_ERROR,
+               "Demuxing timebase not available - cannot use it for encoding\n");
+        return AVERROR(EINVAL);
+    }
+
+    switch (ofp->enc_timebase.num) {
+    case 0:                                            break;
+    case ENC_TIME_BASE_DEMUX:  tb = fd->dec.tb;        break;
+    case ENC_TIME_BASE_FILTER: tb = frame->time_base;  break;
+    default:                   tb = ofp->enc_timebase; break;
+    }
+
+    if (ofilter->type == AVMEDIA_TYPE_AUDIO) {
+        tb = tb.num ? tb : (AVRational){ 1, frame->sample_rate };
+        goto finish;
+    }
+
+    fr = fps->framerate;
+    if (!fr.num) {
+        AVRational fr_sink = av_buffersink_get_frame_rate(ofp->filter);
+        if (fr_sink.num > 0 && fr_sink.den > 0)
+            fr = fr_sink;
+    }
+
+    if (ofilter->ost->is_cfr) {
+        if (!fr.num && !fps->framerate_max.num) {
+            fr = (AVRational){25, 1};
+            av_log(ofilter->ost, AV_LOG_WARNING,
+                   "No information "
+                   "about the input framerate is available. Falling "
+                   "back to a default value of 25fps. Use the -r option "
+                   "if you want a different framerate.\n");
+        }
+
+        if (fps->framerate_max.num &&
+            (av_q2d(fr) > av_q2d(fps->framerate_max) ||
+            !fr.den))
+            fr = fps->framerate_max;
+    }
+
+    if (fr.num > 0) {
+        if (fps->framerate_supported) {
+            int idx = av_find_nearest_q_idx(fr, fps->framerate_supported);
+            fr = fps->framerate_supported[idx];
+        }
+        if (fps->framerate_clip) {
+            av_reduce(&fr.num, &fr.den,
+                      fr.num, fr.den, fps->framerate_clip);
+        }
+    }
+
+    if (!(tb.num > 0 && tb.den > 0))
+        tb = av_inv_q(fr);
+    if (!(tb.num > 0 && tb.den > 0))
+        tb = frame->time_base;
+
+finish:
+    ofp->tb_out        = tb;
+    fps->framerate     = fr;
+    ofp->tb_out_locked = 1;
+
+    return 0;
+}
+
+static double adjust_frame_pts_to_encoder_tb(AVFrame *frame, AVRational tb_dst,
+                                             int64_t start_time)
+{
+    double float_pts = AV_NOPTS_VALUE; // this is identical to frame.pts but with higher precision
+
+    AVRational        tb = tb_dst;
+    AVRational filter_tb = frame->time_base;
+    const int extra_bits = av_clip(29 - av_log2(tb.den), 0, 16);
+
+    if (frame->pts == AV_NOPTS_VALUE)
+        goto early_exit;
+
+    tb.den <<= extra_bits;
+    float_pts = av_rescale_q(frame->pts, filter_tb, tb) -
+                av_rescale_q(start_time, AV_TIME_BASE_Q, tb);
+    float_pts /= 1 << extra_bits;
+    // when float_pts is not exactly an integer,
+    // avoid exact midpoints to reduce the chance of rounding differences, this
+    // can be removed in case the fps code is changed to work with integers
+    if (float_pts != llrint(float_pts))
+        float_pts += FFSIGN(float_pts) * 1.0 / (1<<17);
+
+    frame->pts = av_rescale_q(frame->pts, filter_tb, tb_dst) -
+                 av_rescale_q(start_time, AV_TIME_BASE_Q, tb_dst);
+    frame->time_base = tb_dst;
+
+early_exit:
+
+    if (debug_ts) {
+        av_log(NULL, AV_LOG_INFO, "filter -> pts:%s pts_time:%s exact:%f time_base:%d/%d\n",
+               frame ? av_ts2str(frame->pts) : "NULL",
+               av_ts2timestr(frame->pts, &tb_dst),
+               float_pts, tb_dst.num, tb_dst.den);
+    }
+
+    return float_pts;
+}
+
+/* Convert frame timestamps to the encoder timebase and decide how many times
+ * should this (and possibly previous) frame be repeated in order to conform to
+ * desired target framerate (if any).
+ */
+static void video_sync_process(OutputFilterPriv *ofp, AVFrame *frame,
+                               int64_t *nb_frames, int64_t *nb_frames_prev)
+{
+    OutputFilter   *ofilter = &ofp->ofilter;
+    OutputStream       *ost = ofilter->ost;
+    FPSConvContext     *fps = &ofp->fps;
+    double delta0, delta, sync_ipts, duration;
+
+    if (!frame) {
+        *nb_frames_prev = *nb_frames = mid_pred(fps->frames_prev_hist[0],
+                                                fps->frames_prev_hist[1],
+                                                fps->frames_prev_hist[2]);
+
+        if (!*nb_frames && fps->last_dropped) {
+            ofilter->nb_frames_drop++;
+            fps->last_dropped++;
+        }
+
+        goto finish;
+    }
+
+    duration = lrintf(frame->duration * av_q2d(frame->time_base) / av_q2d(ofp->tb_out));
+
+    sync_ipts = adjust_frame_pts_to_encoder_tb(frame, ofp->tb_out, ofp->ts_offset);
+    /* delta0 is the "drift" between the input frame and
+     * where it would fall in the output. */
+    delta0 = sync_ipts - ofp->next_pts;
+    delta  = delta0 + duration;
+
+    // tracks the number of times the PREVIOUS frame should be duplicated,
+    // mostly for variable framerate (VFR)
+    *nb_frames_prev = 0;
+    /* by default, we output a single frame */
+    *nb_frames = 1;
+
+    if (delta0 < 0 &&
+        delta > 0 &&
+        ost->vsync_method != VSYNC_PASSTHROUGH &&
+        ost->vsync_method != VSYNC_DROP) {
+        if (delta0 < -0.6) {
+            av_log(ost, AV_LOG_VERBOSE, "Past duration %f too large\n", -delta0);
+        } else
+            av_log(ost, AV_LOG_DEBUG, "Clipping frame in rate conversion by %f\n", -delta0);
+        sync_ipts = ofp->next_pts;
+        duration += delta0;
+        delta0 = 0;
+    }
+
+    switch (ost->vsync_method) {
+    case VSYNC_VSCFR:
+        if (fps->frame_number == 0 && delta0 >= 0.5) {
+            av_log(ost, AV_LOG_DEBUG, "Not duplicating %d initial frames\n", (int)lrintf(delta0));
+            delta = duration;
+            delta0 = 0;
+            ofp->next_pts = llrint(sync_ipts);
+        }
+    case VSYNC_CFR:
+        // FIXME set to 0.5 after we fix some dts/pts bugs like in avidec.c
+        if (frame_drop_threshold && delta < frame_drop_threshold && fps->frame_number) {
+            *nb_frames = 0;
+        } else if (delta < -1.1)
+            *nb_frames = 0;
+        else if (delta > 1.1) {
+            *nb_frames = llrintf(delta);
+            if (delta0 > 1.1)
+                *nb_frames_prev = llrintf(delta0 - 0.6);
+        }
+        frame->duration = 1;
+        break;
+    case VSYNC_VFR:
+        if (delta <= -0.6)
+            *nb_frames = 0;
+        else if (delta > 0.6)
+            ofp->next_pts = llrint(sync_ipts);
+        frame->duration = duration;
+        break;
+    case VSYNC_DROP:
+    case VSYNC_PASSTHROUGH:
+        frame->duration = duration;
+        ofp->next_pts = llrint(sync_ipts);
+        break;
+    default:
+        av_assert0(0);
+    }
+
+finish:
+    memmove(fps->frames_prev_hist + 1,
+            fps->frames_prev_hist,
+            sizeof(fps->frames_prev_hist[0]) * (FF_ARRAY_ELEMS(fps->frames_prev_hist) - 1));
+    fps->frames_prev_hist[0] = *nb_frames_prev;
+
+    if (*nb_frames_prev == 0 && fps->last_dropped) {
+        ofilter->nb_frames_drop++;
+        av_log(ost, AV_LOG_VERBOSE,
+               "*** dropping frame %"PRId64" at ts %"PRId64"\n",
+               fps->frame_number, fps->last_frame->pts);
+    }
+    if (*nb_frames > (*nb_frames_prev && fps->last_dropped) + (*nb_frames > *nb_frames_prev)) {
+        if (*nb_frames > dts_error_threshold * 30) {
+            av_log(ost, AV_LOG_ERROR, "%"PRId64" frame duplication too large, skipping\n", *nb_frames - 1);
+            ofilter->nb_frames_drop++;
+            *nb_frames = 0;
+            return;
+        }
+        ofilter->nb_frames_dup += *nb_frames - (*nb_frames_prev && fps->last_dropped) - (*nb_frames > *nb_frames_prev);
+        av_log(ost, AV_LOG_VERBOSE, "*** %"PRId64" dup!\n", *nb_frames - 1);
+        if (ofilter->nb_frames_dup > fps->dup_warning) {
+            av_log(ost, AV_LOG_WARNING, "More than %"PRIu64" frames duplicated\n", fps->dup_warning);
+            fps->dup_warning *= 10;
+        }
+    }
+
+    fps->last_dropped = *nb_frames == *nb_frames_prev && frame;
+    fps->dropped_keyframe |= fps->last_dropped && (frame->flags & AV_FRAME_FLAG_KEY);
+}
+
+static int fg_output_frame(OutputFilterPriv *ofp, AVFrame *frame)
+{
+    FilterGraphPriv  *fgp = fgp_from_fg(ofp->ofilter.graph);
+    OutputStream     *ost = ofp->ofilter.ost;
+    AVFrame   *frame_prev = ofp->fps.last_frame;
+    enum AVMediaType type = ofp->ofilter.type;
+
+    int64_t nb_frames = 1, nb_frames_prev = 0;
+
+    if (type == AVMEDIA_TYPE_VIDEO)
+        video_sync_process(ofp, frame, &nb_frames, &nb_frames_prev);
+
+    for (int64_t i = 0; i < nb_frames; i++) {
+        AVFrame *frame_out;
+        int ret;
+
+        if (type == AVMEDIA_TYPE_VIDEO) {
+            AVFrame *frame_in = (i < nb_frames_prev && frame_prev->buf[0]) ?
+                                frame_prev : frame;
+            if (!frame_in)
+                break;
+
+            frame_out = fgp->frame_enc;
+            ret = av_frame_ref(frame_out, frame_in);
+            if (ret < 0)
+                return ret;
+
+            frame_out->pts = ofp->next_pts;
+
+            if (ofp->fps.dropped_keyframe) {
+                frame_out->flags |= AV_FRAME_FLAG_KEY;
+                ofp->fps.dropped_keyframe = 0;
+            }
+        } else {
+            frame->pts = (frame->pts == AV_NOPTS_VALUE) ? ofp->next_pts :
+                av_rescale_q(frame->pts,   frame->time_base, ofp->tb_out) -
+                av_rescale_q(ofp->ts_offset, AV_TIME_BASE_Q, ofp->tb_out);
+
+            frame->time_base = ofp->tb_out;
+            frame->duration  = av_rescale_q(frame->nb_samples,
+                                            (AVRational){ 1, frame->sample_rate },
+                                            ofp->tb_out);
+
+            ofp->next_pts = frame->pts + frame->duration;
+
+            frame_out = frame;
+        }
+
+        ret = enc_frame(ost, frame_out);
+        av_frame_unref(frame_out);
+        if (ret < 0)
+            return ret;
+
+        if (type == AVMEDIA_TYPE_VIDEO) {
+            ofp->fps.frame_number++;
+            ofp->next_pts++;
+
+            if (i == nb_frames_prev && frame)
+                frame->flags &= ~AV_FRAME_FLAG_KEY;
+        }
+
+        ofp->got_frame = 1;
+    }
+
+    if (frame && frame_prev) {
+        av_frame_unref(frame_prev);
+        av_frame_move_ref(frame_prev, frame);
+    }
+
+    return 0;
+}
+
 static int fg_output_step(OutputFilterPriv *ofp, int flush)
 {
     FilterGraphPriv    *fgp = fgp_from_fg(ofp->ofilter.graph);
@@ -1782,9 +2158,8 @@  static int fg_output_step(OutputFilterPriv *ofp, int flush)
                    "Error in av_buffersink_get_frame_flags(): %s\n", av_err2str(ret));
         } else if (flush && ret == AVERROR_EOF && ofp->got_frame &&
                    av_buffersink_get_type(filter) == AVMEDIA_TYPE_VIDEO) {
-            ret = enc_frame(ost, NULL);
-            if (ret < 0)
-                return ret;
+            ret = fg_output_frame(ofp, NULL);
+            return (ret < 0) ? ret : 1;
         }
 
         return 1;
@@ -1794,14 +2169,26 @@  static int fg_output_step(OutputFilterPriv *ofp, int flush)
         return 0;
     }
 
+    frame->time_base = av_buffersink_get_time_base(filter);
+
     if (frame->pts != AV_NOPTS_VALUE) {
-        AVRational tb = av_buffersink_get_time_base(filter);
-        ost->filter->last_pts = av_rescale_q(frame->pts, tb, AV_TIME_BASE_Q);
-        frame->time_base = tb;
+        ost->filter->last_pts = av_rescale_q(frame->pts, frame->time_base,
+                                             AV_TIME_BASE_Q);
 
         if (debug_ts)
             av_log(fgp, AV_LOG_INFO, "filter_raw -> pts:%s pts_time:%s time_base:%d/%d\n",
-                   av_ts2str(frame->pts), av_ts2timestr(frame->pts, &tb), tb.num, tb.den);
+                   av_ts2str(frame->pts), av_ts2timestr(frame->pts, &frame->time_base),
+                             frame->time_base.num, frame->time_base.den);
+    }
+
+    // Choose the output timebase the first time we get a frame.
+    if (!ofp->tb_out_locked) {
+        ret = choose_out_timebase(ofp, frame);
+        if (ret < 0) {
+            av_log(ost, AV_LOG_ERROR, "Could not choose an output time base\n");
+            av_frame_unref(frame);
+            return ret;
+        }
     }
 
     fd = frame_data(frame);
@@ -1816,22 +2203,20 @@  static int fg_output_step(OutputFilterPriv *ofp, int flush)
         fd->bits_per_raw_sample = 0;
 
     if (ost->type == AVMEDIA_TYPE_VIDEO) {
-        AVRational fr = av_buffersink_get_frame_rate(filter);
-        if (fr.num > 0 && fr.den > 0) {
-            fd->frame_rate_filter = fr;
-
-            if (!frame->duration)
+        if (!frame->duration) {
+            AVRational fr = av_buffersink_get_frame_rate(filter);
+            if (fr.num > 0 && fr.den > 0)
                 frame->duration = av_rescale_q(1, av_inv_q(fr), frame->time_base);
         }
+
+        fd->frame_rate_filter = ofp->fps.framerate;
     }
 
-    ret = enc_frame(ost, frame);
+    ret = fg_output_frame(ofp, frame);
     av_frame_unref(frame);
     if (ret < 0)
         return ret;
 
-    ofp->got_frame = 1;
-
     return 0;
 }
 
@@ -2098,8 +2483,9 @@  int fg_transcode_step(FilterGraph *graph, InputStream **best_ist)
             // at least initialize the encoder with a dummy frame
             if (!ofp->got_frame) {
                 AVFrame *frame = fgp->frame;
+                FrameData *fd;
 
-                frame->time_base   = ofp->time_base;
+                frame->time_base   = ofp->tb_out;
                 frame->format      = ofp->format;
 
                 frame->width               = ofp->width;
@@ -2113,6 +2499,12 @@  int fg_transcode_step(FilterGraph *graph, InputStream **best_ist)
                         return ret;
                 }
 
+                fd = frame_data(frame);
+                if (!fd)
+                    return AVERROR(ENOMEM);
+
+                fd->frame_rate_filter = ofp->fps.framerate;
+
                 av_assert0(!frame->buf[0]);
 
                 av_log(ofilter->ost, AV_LOG_WARNING,