diff mbox

[FFmpeg-devel] GSOC 2018 qualification task.

Message ID CADPmSuztunjz-k-wtmsxDxZNDwRRXCr0oqG4dogRz47YkT9pDw@mail.gmail.com
State Superseded
Headers show

Commit Message

ANURAG SINGH IIT BHU April 9, 2018, 2:59 a.m. UTC
This mail is regarding the qualification task assigned to me for the
GSOC project
in FFmpeg for automatic real-time subtitle generation using speech to text
translation ML model. My assigned task by Michael sir was writing a
ffmpeg-libavfilter filter which outputs a "Hello World minute: sec"
subtitle each second.

I have built a libavfilter filter named "hellosubs" using the existing
subtitle filter. hellosubs filter accepts a video file as input, along with
any subtitle file of any supported subtitle format of FFmpeg with any
number of entries (>0), i.e. any random subtitle file, as an
argument and writes "Hello World minute: sec" subtitle each second on the
video.

From 38fcf8c80f71a4186f03f33c9272b707390add67 Mon Sep 17 00:00:00 2001
From: ddosvulnerability <anurag.singh.phy15@iitbhu.ac.in>
Date: Fri, 6 Apr 2018 11:30:17 +0530
Subject: [PATCH] avfilter: add hellosub filter.

---

 libavfilter/Makefile       |   1 +
 libavfilter/allfilters.c   |   1 +
 libavfilter/vf_hellosubs.c | 463 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 465 insertions(+)
 create mode 100644 libavfilter/vf_hellosubs.c


+
+}
+                }
+            }
+        }
+
+    }
+
+end:
+    av_dict_free(&codec_opts);
+    avcodec_close(dec_ctx);
+    avcodec_free_context(&dec_ctx);
+    avformat_close_input(&fmt);
+    return ret;
+}
+
+AVFilter ff_vf_hellosubs = {
+    .name          = "hellosubs",
+    .description   = NULL_IF_CONFIG_SMALL("Render text hello world time subtitle onto input video using the libass library."),
+    .priv_size     = sizeof(AssContext),
+    .init          = init_hellosubs,
+    .uninit        = uninit,
+    .query_formats = query_formats,
+    .inputs        = ass_inputs,
+    .outputs       = ass_outputs,
+    .priv_class    = &hellosubs_class,
+};
+#endif

Comments

Michael Niedermayer April 9, 2018, 1:45 p.m. UTC | #1
Hi

On Mon, Apr 09, 2018 at 08:29:21AM +0530, ANURAG SINGH IIT BHU wrote:
> This mail is regarding the qualification task assigned to me for the
> GSOC project
> in FFmpeg for automatic real-time subtitle generation using speech to text
> translation ML model. My assigned task by Michael sir was writing a
> ffmpeg-libavfilter filter which outputs a "Hello World minute: sec"
> subtitle each second.

Yes
the exact task was to have the filter produce subtitle frames/packets and
then have these pass through the filter chain and into ffmpeg.
so that a subsequent filter could render them into the video or
ffmpeg store it in a file,
This would have required extending libavfilter to pass subtitles through
at least enough for this specific use case.

The time for this qualification task was very short as you contacted me rather
late.


> 
> I have built a libavfilter filter named "hellosubs" using the existing
> subtitle filter. hellosubs filter accepts a video file as input, along with
> any subtitle file of any supported subtitle format of FFmpeg with any
> number of entries (>0), i.e. any random subtitle file, as an
> argument and writes "Hello World minute: sec" subtitle each second on the
> video.

yes, i understand that given the limited time that was as much as could be
implemented.
I'll review this patch below


>  Makefile       |    1 
>  allfilters.c   |    1 
>  vf_hellosubs.c |  463 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 465 insertions(+)
> 73061db543833e745b2accee67d9cca3870c1996  0001-avfilter-add-hellosub-filter.patch
> From 38fcf8c80f71a4186f03f33c9272b707390add67 Mon Sep 17 00:00:00 2001
> From: ddosvulnerability <anurag.singh.phy15@iitbhu.ac.in>
> Date: Fri, 6 Apr 2018 11:30:17 +0530
> Subject: [PATCH] avfilter: add hellosub filter.
> 
> ---
>  
>  libavfilter/Makefile       |   1 +
>  libavfilter/allfilters.c   |   1 +
>  libavfilter/vf_hellosubs.c | 463 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 465 insertions(+)
>  create mode 100644 libavfilter/vf_hellosubs.c
> 
> 
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index a90ca30..770b1b5 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -331,6 +331,7 @@ OBJS-$(CONFIG_SSIM_FILTER)                   += vf_ssim.o framesync.o
>  OBJS-$(CONFIG_STEREO3D_FILTER)               += vf_stereo3d.o
>  OBJS-$(CONFIG_STREAMSELECT_FILTER)           += f_streamselect.o framesync.o
>  OBJS-$(CONFIG_SUBTITLES_FILTER)              += vf_subtitles.o
> +OBJS-$(CONFIG_HELLOSUBS_FILTER)              += vf_hellosubs.o
>  OBJS-$(CONFIG_SUPER2XSAI_FILTER)             += vf_super2xsai.o
>  OBJS-$(CONFIG_SWAPRECT_FILTER)               += vf_swaprect.o
>  OBJS-$(CONFIG_SWAPUV_FILTER)                 += vf_swapuv.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index 6eac828..a008908 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -322,6 +322,7 @@ extern AVFilter ff_vf_ssim;
>  extern AVFilter ff_vf_stereo3d;
>  extern AVFilter ff_vf_streamselect;
>  extern AVFilter ff_vf_subtitles;
> +extern AVFilter ff_vf_hellosubs;
>  extern AVFilter ff_vf_super2xsai;
>  extern AVFilter ff_vf_swaprect;
>  extern AVFilter ff_vf_swapuv;
> diff --git a/libavfilter/vf_hellosubs.c b/libavfilter/vf_hellosubs.c
> new file mode 100644
> index 0000000..7ba3a0e
> --- /dev/null
> +++ b/libavfilter/vf_hellosubs.c
> @@ -0,0 +1,463 @@
> +/*
> + * Copyright (c) 2011 Baptiste Coudurier
> + * Copyright (c) 2011 Stefano Sabatini
> + * Copyright (c) 2012 Clément Bœsch
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * Libass hellosubs burning filter.
> + *

> + * @see{http://www.matroska.org/technical/specs/hellosubs/ssa.html}

this looks like a search and replace error, this link does not work anymore


> + */
> +
> +#include <ass/ass.h>
> +
> +#include "config.h"
> +#if CONFIG_SUBTITLES_FILTER
> +# include "libavcodec/avcodec.h"
> +# include "libavformat/avformat.h"
> +#endif
> +#include "libavutil/avstring.h"
> +#include "libavutil/imgutils.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/parseutils.h"
> +#include "drawutils.h"
> +#include "avfilter.h"
> +#include "internal.h"
> +#include "formats.h"
> +#include "video.h"
> +#include <stdio.h>
> +
> +typedef struct AssContext {
> +    const AVClass *class;
> +    ASS_Library  *library;
> +    ASS_Renderer *renderer;
> +    ASS_Track    *track;
> +    char *filename;
> +    char *fontsdir;
> +    char *charenc;
> +    char *force_style;
> +    int stream_index;
> +    int alpha;
> +    uint8_t rgba_map[4];
> +    int     pix_step[4];       ///< steps per pixel for each plane of the main output
> +    int original_w, original_h;
> +    int shaping;
> +    FFDrawContext draw;
> +} AssContext;
> +
> +#define OFFSET(x) offsetof(AssContext, x)
> +#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
> +
> +#define COMMON_OPTIONS \
> +    {"filename",       "set the filename of file to read",                         OFFSET(filename),   AV_OPT_TYPE_STRING,     {.str = NULL},  CHAR_MIN, CHAR_MAX, FLAGS }, \
> +    {"f",              "set the filename of file to read",                         OFFSET(filename),   AV_OPT_TYPE_STRING,     {.str = NULL},  CHAR_MIN, CHAR_MAX, FLAGS }, \
> +    {"original_size",  "set the size of the original video (used to scale fonts)", OFFSET(original_w), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL},  CHAR_MIN, CHAR_MAX, FLAGS }, \
> +    {"fontsdir",       "set the directory containing the fonts to read",           OFFSET(fontsdir),   AV_OPT_TYPE_STRING,     {.str = NULL},  CHAR_MIN, CHAR_MAX, FLAGS }, \
> +    {"alpha",          "enable processing of alpha channel",                       OFFSET(alpha),      AV_OPT_TYPE_BOOL,       {.i64 = 0   },         0,        1, FLAGS }, \
> +
> +/* libass supports a log level ranging from 0 to 7 */
> +static const int ass_libavfilter_log_level_map[] = {
> +    [0] = AV_LOG_FATAL,     /* MSGL_FATAL */
> +    [1] = AV_LOG_ERROR,     /* MSGL_ERR */
> +    [2] = AV_LOG_WARNING,   /* MSGL_WARN */
> +    [3] = AV_LOG_WARNING,   /* <undefined> */
> +    [4] = AV_LOG_INFO,      /* MSGL_INFO */
> +    [5] = AV_LOG_INFO,      /* <undefined> */
> +    [6] = AV_LOG_VERBOSE,   /* MSGL_V */
> +    [7] = AV_LOG_DEBUG,     /* MSGL_DBG2 */
> +};
> +
> +static void ass_log(int ass_level, const char *fmt, va_list args, void *ctx)
> +{
> +    const int ass_level_clip = av_clip(ass_level, 0,
> +        FF_ARRAY_ELEMS(ass_libavfilter_log_level_map) - 1);
> +    const int level = ass_libavfilter_log_level_map[ass_level_clip];
> +
> +    av_vlog(ctx, level, fmt, args);
> +    av_log(ctx, level, "\n");
> +}
> +
> +static av_cold int init(AVFilterContext *ctx)
> +{
> +    AssContext *ass = ctx->priv;
> +
> +    if (!ass->filename) {
> +        av_log(ctx, AV_LOG_ERROR, "No filename provided!\n");
> +        return AVERROR(EINVAL);
> +    }
> +
> +    ass->library = ass_library_init();
> +    if (!ass->library) {
> +        av_log(ctx, AV_LOG_ERROR, "Could not initialize libass.\n");
> +        return AVERROR(EINVAL);
> +    }
> +    ass_set_message_cb(ass->library, ass_log, ctx);
> +
> +    ass_set_fonts_dir(ass->library, ass->fontsdir);
> +
> +    ass->renderer = ass_renderer_init(ass->library);
> +    if (!ass->renderer) {
> +        av_log(ctx, AV_LOG_ERROR, "Could not initialize libass renderer.\n");
> +        return AVERROR(EINVAL);
> +    }
> +
> +    return 0;
> +}
> +
> +static av_cold void uninit(AVFilterContext *ctx)
> +{
> +    AssContext *ass = ctx->priv;
> +
> +    if (ass->track)
> +        ass_free_track(ass->track);
> +    if (ass->renderer)
> +        ass_renderer_done(ass->renderer);
> +    if (ass->library)
> +        ass_library_done(ass->library);
> +}
> +
> +static int query_formats(AVFilterContext *ctx)
> +{
> +    return ff_set_common_formats(ctx, ff_draw_supported_pixel_formats(0));
> +}
> +
> +static int config_input(AVFilterLink *inlink)
> +{
> +    AssContext *ass = inlink->dst->priv;
> +
> +    ff_draw_init(&ass->draw, inlink->format, ass->alpha ? FF_DRAW_PROCESS_ALPHA : 0);
> +
> +    ass_set_frame_size  (ass->renderer, inlink->w, inlink->h);
> +    if (ass->original_w && ass->original_h)
> +        ass_set_aspect_ratio(ass->renderer, (double)inlink->w / inlink->h,
> +                             (double)ass->original_w / ass->original_h);
> +    if (ass->shaping != -1)
> +        ass_set_shaper(ass->renderer, ass->shaping);
> +
> +    return 0;
> +}
> +
> +/* libass stores an RGBA color in the format RRGGBBTT, where TT is the transparency level */
> +#define AR(c)  ( (c)>>24)
> +#define AG(c)  (((c)>>16)&0xFF)
> +#define AB(c)  (((c)>>8) &0xFF)
> +#define AA(c)  ((0xFF-(c)) &0xFF)
> +
> +static void overlay_ass_image(AssContext *ass, AVFrame *picref,
> +                              const ASS_Image *image)
> +{
> +    for (; image; image = image->next) {
> +        uint8_t rgba_color[] = {AR(image->color), AG(image->color), AB(image->color), AA(image->color)};
> +        FFDrawColor color;
> +        ff_draw_color(&ass->draw, &color, rgba_color);
> +        ff_blend_mask(&ass->draw, &color,
> +                      picref->data, picref->linesize,
> +                      picref->width, picref->height,
> +                      image->bitmap, image->stride, image->w, image->h,
> +                      3, 0, image->dst_x, image->dst_y);
> +    }
> +}
> +
> +static int filter_frame(AVFilterLink *inlink, AVFrame *picref)
> +{
> +    AVFilterContext *ctx = inlink->dst;
> +    AVFilterLink *outlink = ctx->outputs[0];
> +    AssContext *ass = ctx->priv;
> +    int detect_change = 0;
> +    double time_ms = picref->pts * av_q2d(inlink->time_base) * 1000;
> +    ASS_Image *image = ass_render_frame(ass->renderer, ass->track,
> +                                        time_ms, &detect_change);
> +
> +    if (detect_change)
> +        av_log(ctx, AV_LOG_DEBUG, "Change happened at time ms:%f\n", time_ms);
> +
> +    overlay_ass_image(ass, picref, image);
> +
> +    return ff_filter_frame(outlink, picref);
> +}
> +
> +static const AVFilterPad ass_inputs[] = {
> +    {
> +        .name             = "default",
> +        .type             = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame     = filter_frame,
> +        .config_props     = config_input,
> +        .needs_writable   = 1,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad ass_outputs[] = {
> +    {
> +        .name = "default",
> +        .type = AVMEDIA_TYPE_VIDEO,
> +    },
> +    { NULL }
> +};
> +
> +
> +

> +#if CONFIG_HELLOSUBS_FILTER

The conditional building of the filter should happen through the Makefile
there should be no need for a #if

also this filter fails to build without libass

    src/libavfilter/vf_hellosubs.c:30:21: fatal error: ass/ass.h: No such file or directory
    #include <ass/ass.h>
                        ^
    compilation terminated.

thus dependencies in configure must be missing


> +
> +static const AVOption hellosubs_options[] = {
> +    COMMON_OPTIONS
> +    {"charenc",      "set input character encoding", OFFSET(charenc),      AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, FLAGS},

> +    {"stream_index", "set stream index",             OFFSET(stream_index), AV_OPT_TYPE_INT,    { .i64 = -1 }, -1,       INT_MAX,  FLAGS},
> +    {"si",           "set stream index",             OFFSET(stream_index), AV_OPT_TYPE_INT,    { .i64 = -1 }, -1,       INT_MAX,  FLAGS},

As the hellosubs filter draws a fixed subtitle it should not need or have an input stream_index


> +    {"force_style",  "force subtitle style",         OFFSET(force_style),  AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, FLAGS},
> +    {NULL},
> +};
> +
> +static const char * const font_mimetypes[] = {
> +    "application/x-truetype-font",
> +    "application/vnd.ms-opentype",
> +    "application/x-font-ttf",
> +    NULL
> +};
> +
> +static int attachment_is_font(AVStream * st)
> +{
> +    const AVDictionaryEntry *tag = NULL;
> +    int n;
> +
> +    tag = av_dict_get(st->metadata, "mimetype", NULL, AV_DICT_MATCH_CASE);
> +
> +    if (tag) {
> +        for (n = 0; font_mimetypes[n]; n++) {
> +            if (av_strcasecmp(font_mimetypes[n], tag->value) == 0)
> +                return 1;
> +        }
> +    }
> +    return 0;
> +}
> +
> +AVFILTER_DEFINE_CLASS(hellosubs);
> +

> +static av_cold int init_hellosubs(AVFilterContext *ctx)
> +{

> +    int j, ret, sid;long int z=0;int t1=0;

Please use the enter key sometimes


> +    int k = 0;
> +    AVDictionary *codec_opts = NULL;

> +    AVFormatContext *fmt = NULL;

As the subtitles are generated by the filter, there should be no need for a demuxer


> +    AVCodecContext *dec_ctx = NULL;
> +    AVCodec *dec = NULL;
> +    const AVCodecDescriptor *dec_desc;
> +    AVStream *st;
> +    AVPacket pkt;
> +    AssContext *ass = ctx->priv;
> +
> +    /* Init libass */
> +    ret = init(ctx);
> +    if (ret < 0)
> +        return ret;
> +    ass->track = ass_new_track(ass->library);
> +    if (!ass->track) {
> +        av_log(ctx, AV_LOG_ERROR, "Could not create a libass track\n");
> +        return AVERROR(EINVAL);
> +    }
> +

> +    /* Open hellosubs file */

looks like a search and replace mistake


> +    ret = avformat_open_input(&fmt, ass->filename, NULL, NULL);
> +    if (ret < 0) {
> +        av_log(ctx, AV_LOG_ERROR, "Unable to open %s\n", ass->filename);
> +        
> +    }

no input subtitle file should be needed either


> +    
> +
> +    /* Locate hellosubs stream */
> +    if (ass->stream_index < 0)
> +        ret = av_find_best_stream(fmt, AVMEDIA_TYPE_SUBTITLE, -1, -1, NULL, 0);
> +    else {
> +        ret = -1;
> +        if (ass->stream_index < fmt->nb_streams) {
> +            for (j = 0; j < fmt->nb_streams; j++) {
> +                if (fmt->streams[j]->codecpar->codec_type == AVMEDIA_TYPE_SUBTITLE) {
> +                    if (ass->stream_index == k) {
> +                        ret = j;
> +                        break;
> +                    }
> +                    k++;
> +                }
> +            }
> +        }
> +    }
> +
> +    
> +    sid = ret;
> +    st = fmt->streams[sid];
> +

> +    /* Load attached fonts */
> +    for (j = 0; j < fmt->nb_streams; j++) {
> +        AVStream *st = fmt->streams[j];
> +        if (st->codecpar->codec_type == AVMEDIA_TYPE_ATTACHMENT &&
> +            attachment_is_font(st)) {
> +            const AVDictionaryEntry *tag = NULL;
> +            tag = av_dict_get(st->metadata, "filename", NULL,
> +                              AV_DICT_MATCH_CASE);
> +
> +            if (tag) {
> +                av_log(ctx, AV_LOG_DEBUG, "Loading attached font: %s\n",
> +                       tag->value);
> +                ass_add_font(ass->library, tag->value,
> +                             st->codecpar->extradata,
> +                             st->codecpar->extradata_size);
> +            } else {
> +                av_log(ctx, AV_LOG_WARNING,
> +                       "Font attachment has no filename, ignored.\n");
> +            }
> +        }
> +    }

I am not sure if fonts should be loaded through attachments of a subtitle file and
not separately (if there is support for loading fonts). I don't think this feature
is important at this early stage


> +
> +    /* Initialize fonts */
> +    ass_set_fonts(ass->renderer, NULL, NULL, 1, NULL, 1);
> +

> +    /* Open decoder */
> +    dec = avcodec_find_decoder(st->codecpar->codec_id);
> +    if (!dec) {
> +        av_log(ctx, AV_LOG_ERROR, "Failed to find subtitle codec %s\n",
> +               avcodec_get_name(st->codecpar->codec_id));
> +        return AVERROR(EINVAL);
> +    }
> +    dec_desc = avcodec_descriptor_get(st->codecpar->codec_id);
> +    if (dec_desc && !(dec_desc->props & AV_CODEC_PROP_TEXT_SUB)) {
> +        av_log(ctx, AV_LOG_ERROR,

> +               "Only text based hellosubs are currently supported\n");

What are "text based hellosubs" ?
this looks like a slightly sloppy search and replace


> +        return AVERROR_PATCHWELCOME;
> +    }
> +    if (ass->charenc)
> +        av_dict_set(&codec_opts, "sub_charenc", ass->charenc, 0);

> +    if (LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(57,26,100))
> +        av_dict_set(&codec_opts, "sub_text_format", "ass", 0);

this check is unneeded as the version should be impossible to be that old


> +
> +    dec_ctx = avcodec_alloc_context3(dec);
> +    if (!dec_ctx)
> +        return AVERROR(ENOMEM);
> +
> +    ret = avcodec_parameters_to_context(dec_ctx, st->codecpar);
> +    if (ret < 0)
> +        goto end;
> +
> +    /*
> +     * This is required by the decoding process in order to rescale the
> +     * timestamps: in the current API the decoded hellosubs have their pts
> +     * expressed in AV_TIME_BASE, and thus the lavc internals need to know the
> +     * stream time base in order to achieve the rescaling.
> +     *
> +     * That API is old and needs to be reworked to match behaviour with A/V.
> +     */
> +    dec_ctx->pkt_timebase = st->time_base;
> +
> +    ret = avcodec_open2(dec_ctx, NULL, &codec_opts);
> +    //if (ret < 0)
> +        //goto end;
> +
> +    if (ass->force_style) {
> +        char **list = NULL;
> +        char *temp = NULL;
> +        char *ptr = av_strtok(ass->force_style, ",", &temp);
> +        int i = 0;
> +        while (ptr) {
> +            av_dynarray_add(&list, &i, ptr);
> +            if (!list) {
> +                ret = AVERROR(ENOMEM);
> +                goto end;
> +            }
> +            ptr = av_strtok(NULL, ",", &temp);
> +        }
> +        av_dynarray_add(&list, &i, NULL);
> +        if (!list) {
> +            ret = AVERROR(ENOMEM);
> +            goto end;
> +        }
> +        ass_set_style_overrides(ass->library, list);
> +        av_free(list);
> +    }
> +    /* Decode hellosubs and push them into the renderer (libass) */
> +    if (dec_ctx->subtitle_header)
> +        ass_process_codec_private(ass->track,
> +                                  dec_ctx->subtitle_header,
> +                                  dec_ctx->subtitle_header_size);
> +    av_init_packet(&pkt);
> +    pkt.data = NULL;
> +    pkt.size = 0;

> +AVSubtitle sub = {0};
> +int got_subtitle;

indentation is inconsistent


> +
> +

> +    while (z<99999) {

that is certainly not correct, 99999 is arbitrary.
Also, this fundamentally cannot work: you cannot create all subtitles for
an unbounded stream at init.


> +	        
> +	{int e = avcodec_decode_subtitle2(dec_ctx, &sub, &got_subtitle, &pkt);}
> +        got_subtitle=1;
> +	ret=1;
> +         {
> +            
> +            
> +		 {
> +                 int64_t start_time;
> +                 int64_t duration;
> +                start_time=t1;duration=1000;
> +		 {
> +                    
> +                    
> +				
> +		char a[100];char am[100];char am1[100];char ass_line1[100];	 
> +		sprintf(a, "%ld",z);
> +		sprintf(am, "%d",t1/60000);
> +		sprintf(am1, "%d",(t1/1000)%60);
> +		
> +	strcat(a,",0,Default,,0,0,0,,Hello world ");
> +	strcat(a,am);
> +	strcat(a,":");
> +	strcat(a,am1);
> +	strcpy(ass_line1, a);

sprintf / strcat / strcpy are all risky security-wise: they don't check the space
in the output array

these should be using safer alternatives like snprintf

about testing, the filter works with ffmpeg.
With ffplay there is a significant AV sync problem
for example:
./ffplay matrixbench_mpeg2.mpg -vf hellosubs=fate-suite/sub/a9-misc.ssa

[...]

Thanks
Rostislav Pehlivanov April 9, 2018, 5:23 p.m. UTC | #2
On 9 April 2018 at 03:59, ANURAG SINGH IIT BHU <
anurag.singh.phy15@iitbhu.ac.in> wrote:

> This mail is regarding the qualification task assigned to me for the
> GSOC project
> in FFmpeg for automatic real-time subtitle generation using speech to text
> translation ML model.
>

i really don't think lavfi is the correct place for such code, nor that the
project's repo should contain such code at all.
This would need to be in another repo and a separate library.
Paul B Mahol April 9, 2018, 6:10 p.m. UTC | #3
On 4/9/18, Rostislav Pehlivanov <atomnuker@gmail.com> wrote:
> On 9 April 2018 at 03:59, ANURAG SINGH IIT BHU <
> anurag.singh.phy15@iitbhu.ac.in> wrote:
>
>> This mail is regarding the qualification task assigned to me for the
>> GSOC project
>> in FFmpeg for automatic real-time subtitle generation using speech to text
>> translation ML model.
>>
>
> i really don't think lavfi is the correct place for such code, nor that the
> project's repo should contain such code at all.
> This would need to be in another repo and a separate library.

Why? Are you against ocr filter too?

This is necessary for an A->S filter, once subtitles are supported by lavfi.
Rostislav Pehlivanov April 9, 2018, 11:25 p.m. UTC | #4
On 9 April 2018 at 19:10, Paul B Mahol <onemda@gmail.com> wrote:

> On 4/9/18, Rostislav Pehlivanov <atomnuker@gmail.com> wrote:
> > On 9 April 2018 at 03:59, ANURAG SINGH IIT BHU <
> > anurag.singh.phy15@iitbhu.ac.in> wrote:
> >
> >> This mail is regarding the qualification task assigned to me for the
> >> GSOC project
> >> in FFmpeg for automatic real-time subtitle generation using speech to
> text
> >> translation ML model.
> >>
> >
> > i really don't think lavfi is the correct place for such code, nor that
> the
> > project's repo should contain such code at all.
> > This would need to be in another repo and a separate library.
>
> Why? Are you against ocr filter too?
>

The OCR filter uses libtesseract so I'm fine with it. Like I said, as long
as the actual code to do it is in an external library I don't mind.
Mozilla recently released Deep Speech (https://github.com/mozilla/DeepSpeech)
which does pretty much exactly speech to text and is considered to have the
most accurate one out there. Someone just needs to convert the tensorflow
code to something more usable.
diff mbox

Patch

From 38fcf8c80f71a4186f03f33c9272b707390add67 Mon Sep 17 00:00:00 2001
From: ddosvulnerability <anurag.singh.phy15@iitbhu.ac.in>
Date: Fri, 6 Apr 2018 11:30:17 +0530
Subject: [PATCH] avfilter: add hellosub filter.

---
 
 libavfilter/Makefile       |   1 +
 libavfilter/allfilters.c   |   1 +
 libavfilter/vf_hellosubs.c | 463 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 465 insertions(+)
 create mode 100644 libavfilter/vf_hellosubs.c


diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index a90ca30..770b1b5 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -331,6 +331,7 @@  OBJS-$(CONFIG_SSIM_FILTER)                   += vf_ssim.o framesync.o
 OBJS-$(CONFIG_STEREO3D_FILTER)               += vf_stereo3d.o
 OBJS-$(CONFIG_STREAMSELECT_FILTER)           += f_streamselect.o framesync.o
 OBJS-$(CONFIG_SUBTITLES_FILTER)              += vf_subtitles.o
+OBJS-$(CONFIG_HELLOSUBS_FILTER)              += vf_hellosubs.o
 OBJS-$(CONFIG_SUPER2XSAI_FILTER)             += vf_super2xsai.o
 OBJS-$(CONFIG_SWAPRECT_FILTER)               += vf_swaprect.o
 OBJS-$(CONFIG_SWAPUV_FILTER)                 += vf_swapuv.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 6eac828..a008908 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -322,6 +322,7 @@  extern AVFilter ff_vf_ssim;
 extern AVFilter ff_vf_stereo3d;
 extern AVFilter ff_vf_streamselect;
 extern AVFilter ff_vf_subtitles;
+extern AVFilter ff_vf_hellosubs;
 extern AVFilter ff_vf_super2xsai;
 extern AVFilter ff_vf_swaprect;
 extern AVFilter ff_vf_swapuv;
diff --git a/libavfilter/vf_hellosubs.c b/libavfilter/vf_hellosubs.c
new file mode 100644
index 0000000..7ba3a0e
--- /dev/null
+++ b/libavfilter/vf_hellosubs.c
@@ -0,0 +1,463 @@ 
+/*
+ * Copyright (c) 2011 Baptiste Coudurier
+ * Copyright (c) 2011 Stefano Sabatini
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Libass "Hello World" subtitles burning filter.
+ *
+ * @see{http://www.matroska.org/technical/specs/subtitles/ssa.html}
+ */
+
+#include <ass/ass.h>
+
+#include "config.h"
+#if CONFIG_SUBTITLES_FILTER
+# include "libavcodec/avcodec.h"
+# include "libavformat/avformat.h"
+#endif
+#include "libavutil/avstring.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
+#include "libavutil/parseutils.h"
+#include "drawutils.h"
+#include "avfilter.h"
+#include "internal.h"
+#include "formats.h"
+#include "video.h"
+#include <stdio.h>
+
+/**
+ * Private context of the filter: libass handles, user options and the
+ * drawing state used to blend rendered subtitles onto frames.
+ */
+typedef struct AssContext {
+    const AVClass *class;
+    ASS_Library  *library;     ///< libass library handle, created in init()
+    ASS_Renderer *renderer;    ///< libass renderer, created in init()
+    ASS_Track    *track;       ///< track rendered by filter_frame()
+    char *filename;            ///< "filename" / "f" option
+    char *fontsdir;            ///< "fontsdir" option
+    char *charenc;             ///< "charenc" option
+    char *force_style;         ///< "force_style" option
+    int stream_index;          ///< "stream_index" / "si" option
+    int alpha;                 ///< "alpha" option
+    uint8_t rgba_map[4];
+    int     pix_step[4];       ///< steps per pixel for each plane of the main output
+    int original_w, original_h; ///< "original_size" option (the option is declared at original_w)
+    int shaping;               ///< handed to ass_set_shaper() unless it is -1
+    FFDrawContext draw;        ///< drawing context initialized in config_input()
+} AssContext;
+
+/* Byte offset of an AssContext field, as used by the AVOption tables. */
+#define OFFSET(x) offsetof(AssContext, x)
+/* Flags given to every option declared with these tables. */
+#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
+
+/* Option entries meant to open an option table. Every entry ends with a
+ * comma, so further entries can directly follow the macro invocation. */
+#define COMMON_OPTIONS \
+    {"filename",       "set the filename of file to read",                         OFFSET(filename),   AV_OPT_TYPE_STRING,     {.str = NULL},  CHAR_MIN, CHAR_MAX, FLAGS }, \
+    {"f",              "set the filename of file to read",                         OFFSET(filename),   AV_OPT_TYPE_STRING,     {.str = NULL},  CHAR_MIN, CHAR_MAX, FLAGS }, \
+    {"original_size",  "set the size of the original video (used to scale fonts)", OFFSET(original_w), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL},  CHAR_MIN, CHAR_MAX, FLAGS }, \
+    {"fontsdir",       "set the directory containing the fonts to read",           OFFSET(fontsdir),   AV_OPT_TYPE_STRING,     {.str = NULL},  CHAR_MIN, CHAR_MAX, FLAGS }, \
+    {"alpha",          "enable processing of alpha channel",                       OFFSET(alpha),      AV_OPT_TYPE_BOOL,       {.i64 = 0   },         0,        1, FLAGS }, \
+
+/* libass supports a log level ranging from 0 to 7.
+ * This table is indexed by the libass level and yields the av_log level
+ * used for the message; ass_log() clips the index into the table range. */
+static const int ass_libavfilter_log_level_map[] = {
+    [0] = AV_LOG_FATAL,     /* MSGL_FATAL */
+    [1] = AV_LOG_ERROR,     /* MSGL_ERR */
+    [2] = AV_LOG_WARNING,   /* MSGL_WARN */
+    [3] = AV_LOG_WARNING,   /* <undefined> */
+    [4] = AV_LOG_INFO,      /* MSGL_INFO */
+    [5] = AV_LOG_INFO,      /* <undefined> */
+    [6] = AV_LOG_VERBOSE,   /* MSGL_V */
+    [7] = AV_LOG_DEBUG,     /* MSGL_DBG2 */
+};
+
+/**
+ * Message callback registered with libass: forwards a libass message to
+ * the libavutil logger.
+ *
+ * @param ass_level libass message level; clipped to the range of
+ *                  ass_libavfilter_log_level_map before being translated
+ * @param fmt       printf-style format string of the message
+ * @param args      arguments matching fmt
+ * @param ctx       logging context passed through to av_vlog()/av_log()
+ */
+static void ass_log(int ass_level, const char *fmt, va_list args, void *ctx)
+{
+    const int last_idx = FF_ARRAY_ELEMS(ass_libavfilter_log_level_map) - 1;
+    const int idx      = av_clip(ass_level, 0, last_idx);
+    const int av_level = ass_libavfilter_log_level_map[idx];
+
+    av_vlog(ctx, av_level, fmt, args);
+    /* a newline is logged separately, right after the message itself */
+    av_log(ctx, av_level, "\n");
+}
+
+/**
+ * Common initialization: check that a filename was given, then create the
+ * libass library and renderer and hook up message logging and the fonts
+ * directory.
+ *
+ * @param ctx filter context whose private data is an AssContext
+ * @return 0 on success, AVERROR(EINVAL) if no filename was set or if libass
+ *         or its renderer could not be initialized
+ */
+static av_cold int init(AVFilterContext *ctx)
+{
+    AssContext *s = ctx->priv;
+
+    if (!s->filename) {
+        av_log(ctx, AV_LOG_ERROR, "No filename provided!\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (!(s->library = ass_library_init())) {
+        av_log(ctx, AV_LOG_ERROR, "Could not initialize libass.\n");
+        return AVERROR(EINVAL);
+    }
+
+    /* route libass messages through ass_log() with ctx as logging context */
+    ass_set_message_cb(s->library, ass_log, ctx);
+    ass_set_fonts_dir(s->library, s->fontsdir);
+
+    if (!(s->renderer = ass_renderer_init(s->library))) {
+        av_log(ctx, AV_LOG_ERROR, "Could not initialize libass renderer.\n");
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
+/**
+ * Release the libass objects held by the filter. Each handle is released
+ * only if it was set: the track first, then the renderer, then the library.
+ *
+ * @param ctx filter context whose private data is an AssContext
+ */
+static av_cold void uninit(AVFilterContext *ctx)
+{
+    AssContext *s = ctx->priv;
+
+    if (s->track) {
+        ass_free_track(s->track);
+    }
+    if (s->renderer) {
+        ass_renderer_done(s->renderer);
+    }
+    if (s->library) {
+        ass_library_done(s->library);
+    }
+}
+
+/**
+ * Advertise the pixel formats reported by ff_draw_supported_pixel_formats(0)
+ * as the common formats of the filter.
+ *
+ * @return the value returned by ff_set_common_formats()
+ */
+static int query_formats(AVFilterContext *ctx)
+{
+    AVFilterFormats *drawable = ff_draw_supported_pixel_formats(0);
+
+    return ff_set_common_formats(ctx, drawable);
+}
+
+/**
+ * Configure the input link: set up the drawing context for the negotiated
+ * pixel format and give the libass renderer the frame geometry.
+ *
+ * @param inlink input link being configured
+ * @return 0 on success, or the negative error code returned by
+ *         ff_draw_init() if the drawing context could not be set up
+ */
+static int config_input(AVFilterLink *inlink)
+{
+    AssContext *ass = inlink->dst->priv;
+    int ret;
+
+    /* the drawing context is used for every blend, so do not continue
+     * with an unusable one */
+    ret = ff_draw_init(&ass->draw, inlink->format,
+                       ass->alpha ? FF_DRAW_PROCESS_ALPHA : 0);
+    if (ret < 0) {
+        av_log(inlink->dst, AV_LOG_ERROR,
+               "Failed to initialize the drawing context.\n");
+        return ret;
+    }
+
+    ass_set_frame_size  (ass->renderer, inlink->w, inlink->h);
+    /* only when the "original_size" option provided both dimensions */
+    if (ass->original_w && ass->original_h)
+        ass_set_aspect_ratio(ass->renderer, (double)inlink->w / inlink->h,
+                             (double)ass->original_w / ass->original_h);
+    if (ass->shaping != -1)
+        ass_set_shaper(ass->renderer, ass->shaping);
+
+    return 0;
+}
+
+/* libass stores an RGBA color in the format RRGGBBTT, where TT is the transparency level.
+ * AR/AG/AB extract the red, green and blue bytes; AA turns the transparency
+ * byte into an alpha value by subtracting it from 0xFF. */
+#define AR(c)  ( (c)>>24)
+#define AG(c)  (((c)>>16)&0xFF)
+#define AB(c)  (((c)>>8) &0xFF)
+#define AA(c)  ((0xFF-(c)) &0xFF)
+
+/**
+ * Blend a linked list of libass images onto a frame.
+ *
+ * Each node supplies a bitmap used as a blending mask, one packed color and
+ * a destination position; the list is followed through the next pointers.
+ *
+ * @param ass    filter private context (provides the drawing context)
+ * @param picref frame that is drawn onto
+ * @param image  first image of the list, may be NULL (nothing is drawn)
+ */
+static void overlay_ass_image(AssContext *ass, AVFrame *picref,
+                              const ASS_Image *image)
+{
+    const ASS_Image *img = image;
+
+    while (img) {
+        FFDrawColor draw_color;
+        uint8_t rgba[4];
+
+        /* unpack the libass color into R, G, B, A bytes */
+        rgba[0] = AR(img->color);
+        rgba[1] = AG(img->color);
+        rgba[2] = AB(img->color);
+        rgba[3] = AA(img->color);
+
+        ff_draw_color(&ass->draw, &draw_color, rgba);
+        ff_blend_mask(&ass->draw, &draw_color,
+                      picref->data, picref->linesize,
+                      picref->width, picref->height,
+                      img->bitmap, img->stride, img->w, img->h,
+                      3, 0, img->dst_x, img->dst_y);
+
+        img = img->next;
+    }
+}
+
+/**
+ * Render the track at the timestamp of the incoming frame, blend the
+ * result onto the frame and pass the frame on to the first output.
+ *
+ * @param inlink link the frame arrived on
+ * @param picref frame to draw onto; it is handed to ff_filter_frame()
+ * @return the value returned by ff_filter_frame()
+ */
+static int filter_frame(AVFilterLink *inlink, AVFrame *picref)
+{
+    AVFilterContext *avctx = inlink->dst;
+    AssContext *s = avctx->priv;
+    /* frame timestamp converted from link time base units to milliseconds */
+    const double now_ms = picref->pts * av_q2d(inlink->time_base) * 1000;
+    int changed = 0;
+    ASS_Image *rendered;
+
+    rendered = ass_render_frame(s->renderer, s->track, now_ms, &changed);
+    if (changed)
+        av_log(avctx, AV_LOG_DEBUG, "Change happened at time ms:%f\n", now_ms);
+
+    overlay_ass_image(s, picref, rendered);
+
+    return ff_filter_frame(avctx->outputs[0], picref);
+}
+
+/* Input pads: a single video input, terminated by an empty entry. */
+static const AVFilterPad ass_inputs[] = {
+    {
+        .name             = "default",
+        .type             = AVMEDIA_TYPE_VIDEO,
+        .filter_frame     = filter_frame,
+        .config_props     = config_input,
+        /* filter_frame() blends directly onto the frame it receives */
+        .needs_writable   = 1,
+    },
+    { NULL }
+};
+
+/* Output pads: a single video output, terminated by an empty entry. */
+static const AVFilterPad ass_outputs[] = {
+    {
+        .name = "default",
+        .type = AVMEDIA_TYPE_VIDEO,
+    },
+    { NULL }
+};
+
+
+
+#if CONFIG_HELLOSUBS_FILTER
+
+/* Options of the hellosubs filter: the common options followed by the
+ * character encoding, the subtitle stream selection ("stream_index" and its
+ * alias "si", default -1) and the style override list. */
+static const AVOption hellosubs_options[] = {
+    COMMON_OPTIONS
+    {"charenc",      "set input character encoding", OFFSET(charenc),      AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, FLAGS},
+    {"stream_index", "set stream index",             OFFSET(stream_index), AV_OPT_TYPE_INT,    { .i64 = -1 }, -1,       INT_MAX,  FLAGS},
+    {"si",           "set stream index",             OFFSET(stream_index), AV_OPT_TYPE_INT,    { .i64 = -1 }, -1,       INT_MAX,  FLAGS},
+    {"force_style",  "force subtitle style",         OFFSET(force_style),  AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, FLAGS},
+    {NULL},
+};
+
+/* NULL-terminated list of the attachment MIME types treated as fonts by
+ * attachment_is_font(). */
+static const char * const font_mimetypes[] = {
+    "application/x-truetype-font",
+    "application/vnd.ms-opentype",
+    "application/x-font-ttf",
+    NULL
+};
+
+/**
+ * Tell whether a stream carries a font, judging by its "mimetype" metadata
+ * entry.
+ *
+ * @param st stream whose metadata is inspected
+ * @return 1 if the entry exists and matches, case-insensitively, one of
+ *         font_mimetypes; 0 otherwise
+ */
+static int attachment_is_font(AVStream * st)
+{
+    const AVDictionaryEntry *mime =
+        av_dict_get(st->metadata, "mimetype", NULL, AV_DICT_MATCH_CASE);
+    const char * const *candidate;
+
+    if (!mime)
+        return 0;
+
+    for (candidate = font_mimetypes; *candidate; candidate++)
+        if (!av_strcasecmp(*candidate, mime->value))
+            return 1;
+
+    return 0;
+}
+
+AVFILTER_DEFINE_CLASS(hellosubs);
+
+/**
+ * Initialize the hellosubs filter.
+ *
+ * Sets up libass, opens the file given by the "filename" option, selects one
+ * of its subtitle streams and opens the matching decoder so that the
+ * subtitle header exposed by the decoder (if any) can be handed to the
+ * libass track. Fonts attached to the file are registered with libass and
+ * the "force_style" overrides are applied. The subtitle events stored in
+ * the file are not used: the track is filled with generated events of one
+ * second each, reading "Hello world <minute>:<second>".
+ *
+ * Everything opened here is released before returning, on success as well
+ * as on failure; the libass objects stay in the context and are released by
+ * uninit().
+ *
+ * @param ctx filter context whose private data is an AssContext
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static av_cold int init_hellosubs(AVFilterContext *ctx)
+{
+    /* number and length of the generated events */
+    enum {
+        HELLO_EVENT_COUNT       = 99999,
+        HELLO_EVENT_DURATION_MS = 1000,
+    };
+    int j, ret, sid;
+    int k = 0;
+    long n;
+    AVDictionary *codec_opts = NULL;
+    AVFormatContext *fmt = NULL;
+    AVCodecContext *dec_ctx = NULL;
+    AVCodec *dec = NULL;
+    const AVCodecDescriptor *dec_desc;
+    AVStream *st;
+    AssContext *ass = ctx->priv;
+
+    /* Init libass */
+    ret = init(ctx);
+    if (ret < 0)
+        return ret;
+    ass->track = ass_new_track(ass->library);
+    if (!ass->track) {
+        av_log(ctx, AV_LOG_ERROR, "Could not create a libass track\n");
+        return AVERROR(EINVAL);
+    }
+
+    /* Open subtitles file */
+    ret = avformat_open_input(&fmt, ass->filename, NULL, NULL);
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to open %s\n", ass->filename);
+        goto end;
+    }
+
+    /* Locate subtitles stream */
+    if (ass->stream_index < 0)
+        ret = av_find_best_stream(fmt, AVMEDIA_TYPE_SUBTITLE, -1, -1, NULL, 0);
+    else {
+        /* stream_index counts subtitle streams only, not all streams */
+        ret = AVERROR_STREAM_NOT_FOUND;
+        if (ass->stream_index < fmt->nb_streams) {
+            for (j = 0; j < fmt->nb_streams; j++) {
+                if (fmt->streams[j]->codecpar->codec_type == AVMEDIA_TYPE_SUBTITLE) {
+                    if (ass->stream_index == k) {
+                        ret = j;
+                        break;
+                    }
+                    k++;
+                }
+            }
+        }
+    }
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to locate subtitle stream in %s\n",
+               ass->filename);
+        goto end;
+    }
+    sid = ret;
+    st = fmt->streams[sid];
+
+    /* Load attached fonts */
+    for (j = 0; j < fmt->nb_streams; j++) {
+        AVStream *att = fmt->streams[j];
+        if (att->codecpar->codec_type == AVMEDIA_TYPE_ATTACHMENT &&
+            attachment_is_font(att)) {
+            const AVDictionaryEntry *tag = NULL;
+            tag = av_dict_get(att->metadata, "filename", NULL,
+                              AV_DICT_MATCH_CASE);
+
+            if (tag) {
+                av_log(ctx, AV_LOG_DEBUG, "Loading attached font: %s\n",
+                       tag->value);
+                ass_add_font(ass->library, tag->value,
+                             att->codecpar->extradata,
+                             att->codecpar->extradata_size);
+            } else {
+                av_log(ctx, AV_LOG_WARNING,
+                       "Font attachment has no filename, ignored.\n");
+            }
+        }
+    }
+
+    /* Initialize fonts */
+    ass_set_fonts(ass->renderer, NULL, NULL, 1, NULL, 1);
+
+    /* Open decoder */
+    dec = avcodec_find_decoder(st->codecpar->codec_id);
+    if (!dec) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to find subtitle codec %s\n",
+               avcodec_get_name(st->codecpar->codec_id));
+        ret = AVERROR(EINVAL);
+        goto end;
+    }
+    dec_desc = avcodec_descriptor_get(st->codecpar->codec_id);
+    if (dec_desc && !(dec_desc->props & AV_CODEC_PROP_TEXT_SUB)) {
+        av_log(ctx, AV_LOG_ERROR,
+               "Only text based subtitles are currently supported\n");
+        ret = AVERROR_PATCHWELCOME;
+        goto end;
+    }
+    if (ass->charenc)
+        av_dict_set(&codec_opts, "sub_charenc", ass->charenc, 0);
+    if (LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(57,26,100))
+        av_dict_set(&codec_opts, "sub_text_format", "ass", 0);
+
+    dec_ctx = avcodec_alloc_context3(dec);
+    if (!dec_ctx) {
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+
+    ret = avcodec_parameters_to_context(dec_ctx, st->codecpar);
+    if (ret < 0)
+        goto end;
+
+    /*
+     * This is required by the decoding process in order to rescale the
+     * timestamps: in the current API the decoded subtitles have their pts
+     * expressed in AV_TIME_BASE, and thus the lavc internals need to know the
+     * stream time base in order to achieve the rescaling.
+     *
+     * That API is old and needs to be reworked to match behaviour with A/V.
+     */
+    dec_ctx->pkt_timebase = st->time_base;
+
+    ret = avcodec_open2(dec_ctx, NULL, &codec_opts);
+    if (ret < 0)
+        goto end;
+
+    if (ass->force_style) {
+        char **list = NULL;
+        char *temp = NULL;
+        char *ptr = av_strtok(ass->force_style, ",", &temp);
+        int i = 0;
+        while (ptr) {
+            av_dynarray_add(&list, &i, ptr);
+            if (!list) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+            ptr = av_strtok(NULL, ",", &temp);
+        }
+        /* the list handed to libass is NULL-terminated */
+        av_dynarray_add(&list, &i, NULL);
+        if (!list) {
+            ret = AVERROR(ENOMEM);
+            goto end;
+        }
+        ass_set_style_overrides(ass->library, list);
+        av_free(list);
+    }
+
+    /* Hand the subtitle header of the decoder to the renderer (libass) */
+    if (dec_ctx->subtitle_header)
+        ass_process_codec_private(ass->track,
+                                  dec_ctx->subtitle_header,
+                                  dec_ctx->subtitle_header_size);
+
+    /* Generate one "Hello world <minute>:<second>" event per second */
+    for (n = 0; n < HELLO_EVENT_COUNT; n++) {
+        const int64_t start_ms = (int64_t)n * HELLO_EVENT_DURATION_MS;
+        const int minute = (int)(start_ms / 60000);
+        const int second = (int)(start_ms / 1000 % 60);
+        char line[100];
+        int len;
+
+        /* leading field is the event number, followed by the remaining
+         * comma-separated fields and the text */
+        len = snprintf(line, sizeof(line),
+                       "%ld,0,Default,,0,0,0,,Hello world %d:%d",
+                       n, minute, second);
+        if (len < 0 || len >= (int)sizeof(line)) {
+            ret = AVERROR(EINVAL);
+            goto end;
+        }
+        ass_process_chunk(ass->track, line, len,
+                          start_ms, HELLO_EVENT_DURATION_MS);
+    }
+    ret = 0;
+
+end:
+    /* all of these accept a pointer to NULL, so every failure path above
+     * can land here regardless of how far initialization got */
+    av_dict_free(&codec_opts);
+    avcodec_free_context(&dec_ctx);
+    avformat_close_input(&fmt);
+    return ret;
+}
+
+/* Filter definition of "hellosubs": one video input, one video output,
+ * with AssContext as private data and hellosubs_class as option class. */
+AVFilter ff_vf_hellosubs = {
+    .name          = "hellosubs",
+    .description   = NULL_IF_CONFIG_SMALL("Render text hello world time subtitle onto input video using the libass library."),
+    .priv_size     = sizeof(AssContext),
+    .init          = init_hellosubs,
+    .uninit        = uninit,
+    .query_formats = query_formats,
+    .inputs        = ass_inputs,
+    .outputs       = ass_outputs,
+    .priv_class    = &hellosubs_class,
+};
+#endif
-- 
2.7.4