From 0cd4d92b98ecbf364891038b851740291cf75219 Mon Sep 17 00:00:00 2001
From: Jehan <jehan@girinstud.io>
Date: Wed, 28 Sep 2016 03:28:52 +0200
Subject: [PATCH] avformat: basic language support in SAMI subtitles.
Different languages are not yet extracted as separate subtitle tracks.
This first basic version simply uses the default language and ignore any
others. The spec says: "If the user (or author) has not explicitly
selected a language, the first Class (language) definition will be used
by default."
See: https://msdn.microsoft.com/en-us/library/ms971327.aspx
---
libavformat/realtextdec.c | 14 ++--
libavformat/samidec.c | 167 ++++++++++++++++++++++++++++++++++++++++++----
libavformat/subtitles.c | 22 +++++-
libavformat/subtitles.h | 5 +-
4 files changed, 187 insertions(+), 21 deletions(-)
@@ -85,10 +85,12 @@ static int realtext_read_header(AVFormatContext *s)
if (!av_strncasecmp(buf.str, "<window", 7)) {
/* save header to extradata */
- const char *p = ff_smil_get_attr_ptr(buf.str, "duration");
+ char *p = ff_smil_get_attr_ptr(buf.str, "duration");
- if (p)
+ if (p) {
duration = read_ts(p);
+ av_free(p);
+ }
st->codecpar->extradata = av_strdup(buf.str);
if (!st->codecpar->extradata) {
res = AVERROR(ENOMEM);
@@ -105,12 +107,16 @@ static int realtext_read_header(AVFormatContext *s)
goto end;
}
if (!merge) {
- const char *begin = ff_smil_get_attr_ptr(buf.str, "begin");
- const char *end = ff_smil_get_attr_ptr(buf.str, "end");
+ char *begin = ff_smil_get_attr_ptr(buf.str, "begin");
+ char *end = ff_smil_get_attr_ptr(buf.str, "end");
sub->pos = pos;
sub->pts = begin ? read_ts(begin) : 0;
sub->duration = end ? (read_ts(end) - sub->pts) : duration;
+ if (begin)
+ av_free(begin);
+ if (end)
+ av_free(end);
}
}
av_bprint_clear(&buf);
@@ -26,6 +26,7 @@
#include "avformat.h"
#include "internal.h"
+#include "regex.h"
#include "subtitles.h"
#include "libavcodec/internal.h"
#include "libavutil/avstring.h"
@@ -34,6 +35,10 @@
typedef struct {
FFDemuxSubtitlesQueue q;
+ char **lang_names;
+ char **lang_codes;
+ char **lang_ids;
+ int n_langs;
} SAMIContext;
static int sami_probe(AVProbeData *p)
@@ -51,11 +56,19 @@ static int sami_read_header(AVFormatContext *s)
SAMIContext *sami = s->priv_data;
AVStream *st = avformat_new_stream(s, NULL);
AVBPrint buf, hdr_buf;
+ int is_style = 0;
char c = 0;
int res = 0, got_first_sync_point = 0;
+ int ignore_sub;
FFTextReader tr;
+ int64_t last_pos;
+ int64_t last_pts;
+ int new_sub = 0;
+
ff_text_init_avio(s, &tr, s->pb);
+ sami->n_langs = 0;
+
if (!st)
return AVERROR(ENOMEM);
avpriv_set_pts_info(st, 64, 1, 1000);
@@ -65,10 +78,87 @@ static int sami_read_header(AVFormatContext *s)
av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
av_bprint_init(&hdr_buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+ /* Read header */
+ while (!ff_text_eof(&tr)) {
+ int header_end;
+ int n = ff_smil_extract_next_text_chunk(&tr, &buf, &c);
+
+ if (n == 0)
+ break;
+
+ header_end = !av_strncasecmp(buf.str, "</Head", 6);
+ if (header_end) {
+ av_bprint_clear(&buf);
+ break;
+ }
+
+ if (!av_strncasecmp(buf.str, "<Style", 6)) {
+ is_style = 1;
+ av_bprint_clear(&buf);
+ continue;
+ }
+
+ if (is_style) {
+ if (!av_strncasecmp(buf.str, "</Style", 7)) {
+ is_style = 0;
+ av_bprint_clear(&buf);
+ continue;
+ }
+ else {
+ regex_t regex;
+ regmatch_t matchptr[4];
+
+ if (!regcomp(®ex, "\\.\\([a-z]\\+\\)\\s*{\\s*Name:\\s*\\([^;}]\\+\\);\\s*Lang:\\s*\\([^;}]\\+\\)", REG_ICASE)) {
+ char *style_text = buf.str;
+
+ while (!regexec(®ex, style_text, 4, matchptr, REG_ICASE)) {
+ char **lang_names;
+ char **lang_codes;
+ char **lang_ids;
+
+ lang_names = av_realloc(sami->lang_names,
+ (sami->n_langs + 1) * sizeof(char*));
+ lang_codes = av_realloc(sami->lang_codes,
+ (sami->n_langs + 1) * sizeof(char*));
+ lang_ids = av_realloc(sami->lang_ids,
+ (sami->n_langs + 1) * sizeof(char*));
+ if (lang_names && lang_codes && lang_ids) {
+ char *lang_name;
+ char *lang_code;
+ char *lang_id;
+
+ lang_name = av_strndup(style_text + matchptr[2].rm_so,
+ matchptr[2].rm_eo - matchptr[2].rm_so);
+
+ lang_code = av_strndup(style_text + matchptr[3].rm_so,
+ matchptr[3].rm_eo - matchptr[3].rm_so);
+
+ lang_id = av_strndup(style_text + matchptr[1].rm_so,
+ matchptr[1].rm_eo - matchptr[1].rm_so);
+
+ lang_names[sami->n_langs] = lang_name;
+ lang_codes[sami->n_langs] = lang_code;
+ lang_ids[sami->n_langs++] = lang_id;
+
+ sami->lang_names = lang_names;
+ sami->lang_codes = lang_codes;
+ sami->lang_ids = lang_ids;
+ }
+ style_text = style_text + matchptr[0].rm_eo;
+ }
+ }
+ regfree(®ex);
+ }
+ }
+ av_bprint_clear(&buf);
+ }
+
+ /* Read body. */
while (!ff_text_eof(&tr)) {
AVPacket *sub;
const int64_t pos = ff_text_pos(&tr) - (c != 0);
- int is_sync, is_body, n = ff_smil_extract_next_text_chunk(&tr, &buf, &c);
+ int is_sync, is_body, is_p;
+ int n = ff_smil_extract_next_text_chunk(&tr, &buf, &c);
if (n == 0)
break;
@@ -80,22 +170,60 @@ static int sami_read_header(AVFormatContext *s)
}
is_sync = !av_strncasecmp(buf.str, "<SYNC", 5);
- if (is_sync)
+ if (is_sync) {
+ char *p = ff_smil_get_attr_ptr(buf.str, "Start");
+
got_first_sync_point = 1;
+ ignore_sub = 0;
+ new_sub = 1;
+
+ last_pos = pos;
+ last_pts = p ? strtol(p, NULL, 10) : 0;
+ if (p)
+ av_free(p);
+ }
+
+ is_p = !av_strncasecmp(buf.str, "<P", 2);
+ if (is_p) {
+ char *class = ff_smil_get_attr_ptr(buf.str, "Class");
+ if (class && !sami->n_langs) {
+ /* If no languages were set in <STYLE>, let's just
+ * assume that the first encountered Class is the
+ * default language. */
+ sami->lang_names = av_malloc(sizeof(char*));
+ sami->lang_codes = av_malloc(sizeof(char*));
+ sami->lang_ids = av_malloc(sizeof(char*));
+ sami->lang_names[0] = av_strdup(class);
+ sami->lang_codes[0] = av_strdup(class);
+ sami->lang_ids[0] = av_strdup(class);
+ sami->n_langs = 1;
+ }
+ if (class && av_strcasecmp(sami->lang_ids[0], class)) {
+ /* Ignore subtitle for a different language. */
+ ignore_sub = 1;
+ }
+ else {
+ ignore_sub = 0;
+ }
+ if (class)
+ av_free(class);
+ }
if (!got_first_sync_point) {
av_bprintf(&hdr_buf, "%s", buf.str);
} else {
- sub = ff_subtitles_queue_insert(&sami->q, buf.str, buf.len, !is_sync);
- if (!sub) {
- res = AVERROR(ENOMEM);
- goto end;
- }
- if (is_sync) {
- const char *p = ff_smil_get_attr_ptr(buf.str, "Start");
- sub->pos = pos;
- sub->pts = p ? strtol(p, NULL, 10) : 0;
- sub->duration = -1;
+ if (!ignore_sub && !is_sync) {
+ sub = ff_subtitles_queue_insert(&sami->q, buf.str, buf.len, !new_sub);
+ if (!sub) {
+ res = AVERROR(ENOMEM);
+ goto end;
+ }
+ if (new_sub) {
+ new_sub = 0;
+ sub->duration = -1;
+ sub->pos = last_pos;
+ sub->pts = last_pts;
+ }
}
}
av_bprint_clear(&buf);
@@ -130,6 +258,21 @@ static int sami_read_close(AVFormatContext *s)
{
SAMIContext *sami = s->priv_data;
ff_subtitles_queue_clean(&sami->q);
+ if (sami->lang_names) {
+ for (int i = 0; i < sami->n_langs; i++)
+ av_free(sami->lang_names[i]);
+ av_free(sami->lang_names);
+ }
+ if (sami->lang_codes) {
+ for (int i = 0; i < sami->n_langs; i++)
+ av_free(sami->lang_codes[i]);
+ av_free(sami->lang_codes);
+ }
+ if (sami->lang_ids) {
+ for (int i = 0; i < sami->n_langs; i++)
+ av_free(sami->lang_ids[i]);
+ av_free(sami->lang_ids);
+ }
return 0;
}
@@ -327,7 +327,7 @@ int ff_smil_extract_next_text_chunk(FFTextReader *tr, AVBPrint *buf, char *c)
return i;
}
-const char *ff_smil_get_attr_ptr(const char *s, const char *attr)
+char *ff_smil_get_attr_ptr(const char *s, const char *attr)
{
int in_quotes = 0;
const size_t len = strlen(attr);
@@ -341,8 +341,24 @@ const char *ff_smil_get_attr_ptr(const char *s, const char *attr)
}
while (av_isspace(*s))
s++;
- if (!av_strncasecmp(s, attr, len) && s[len] == '=')
- return s + len + 1 + (s[len + 1] == '"');
+ if (!av_strncasecmp(s, attr, len) && s[len] == '=') {
+ const char *value = NULL;
+ int quoted_value = 0;
+
+ quoted_value = (s[len + 1] == '"');
+ value = s + len + 1 + quoted_value;
+ s = value;
+
+ while (*s) {
+ if ((quoted_value && *s == '"') ||
+ (!quoted_value && (av_isspace(*s) || *s == '>')) ||
+ *s == '\0')
+ {
+ return av_strndup(value, s - value);
+ }
+ s++;
+ }
+ }
}
return NULL;
}
@@ -151,12 +151,13 @@ void ff_subtitles_queue_clean(FFDemuxSubtitlesQueue *q);
int ff_smil_extract_next_text_chunk(FFTextReader *tr, AVBPrint *buf, char *c);
/**
- * SMIL helper to point on the value of an attribute in the given tag.
+ * SMIL helper returning a newly allocated copy of the value of an
+ * attribute in the given tag.
*
* @param s SMIL tag ("<...>")
* @param attr the attribute to look for
*/
-const char *ff_smil_get_attr_ptr(const char *s, const char *attr);
+char *ff_smil_get_attr_ptr(const char *s, const char *attr);
/**
* @brief Same as ff_subtitles_read_text_chunk(), but read from an AVIOContext.
--
2.7.4