diff mbox

[FFmpeg-devel] lavf/mpegts: Convert service name and service provider to utf-8

Message ID CAB0OVGqjrLbpDM8vVLV_9v3tKe5XpZ_6-hfee3YTx4rSiMrW9g@mail.gmail.com
State Accepted
Headers show

Commit Message

Carl Eugen Hoyos Feb. 9, 2019, 5:39 p.m. UTC
2019-02-09 17:42 GMT+01:00, Marton Balint <cus@passwd.hu>:
> On Sat, 9 Feb 2019, Carl Eugen Hoyos wrote:
>
>> From 9033f0a18727a7a576c4cc06b9985d6d922d46ad Mon Sep 17 00:00:00 2001
>> From: Carl Eugen Hoyos <ceffmpeg@gmail.com>
>> Date: Sat, 9 Feb 2019 00:49:51 +0100
>> Subject: [PATCH] lavf/mpegts: Convert service_name and service_provider to
>>  utf-8.
>>
>> Fixes ticket #6320.
>> ---
>>  libavformat/mpegts.c |   48
>> ++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 48 insertions(+)
>>
>> Diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
>> Index b04fd7b..1e27500 100644
>> --- a/libavformat/mpegts.c
>> +++ b/libavformat/mpegts.c
>> @@ -37,6 +37,9 @@
>>  #include "avio_internal.h"
>>  #include "mpeg.h"
>>  #include "isom.h"
>> +#if CONFIG_ICONV
>> +#include <iconv.h>
>> +#endif
>>
>>  /* maximum size in which we look for synchronization if
>>   * synchronization is lost */
>> @@ -674,6 +677,51 @@ static char *getstr8(const uint8_t **pp, const
>> uint8_t *p_end)
>>          return NULL;
>>      if (len > p_end - p)
>>          return NULL;
>> +#if CONFIG_ICONV
>> +    if (len && *p < 0x20) {
>> +        char iso8859[] = "ISO-8859-00";
>> +        const char *encodings[] = {
>> +            "ISO6937", "ISO-8859-5", "ISO-8859-6", "ISO-8859-7",
>> "ISO-8859-8",
>> +            "ISO-8859-9", "ISO-8859-10", "ISO-8859-11", "",
>> "ISO-8859-13",
>> +            "ISO-8859-14", "ISO-8859-15", "", "", "", "",
>> +            "", "ISO-10646", "KSC_5601", "GB2312", "ISO-10646", "UTF-8",
>> "",
>> +            "", "", "", "", "", "", "", "", ""
>> +        };
>> +        iconv_t cd;
>> +        char *in, *out;
>> +        size_t inlen = len - 1, outlen = inlen * 6 + 1;
>> +        if (len >= 3 && p[0] == 0x10 && !p[1] && p[2] && p[2] <= 0xf &&
>> p[2] != 0xc) {
>> +            if (p[2] < 10) {
>> +                iso8859[9] += p[2];
>> +                iso8859[10] = 0;
>> +            } else {
>> +                iso8859[9]++;
>> +                iso8859[10] += p[2] - 10;
>> +            }
>
> I think this would be much more readable:
>
> char iso8859[16];
> snprintf(iso8859, sizeof(iso8859), "ISO-8859-%d", p[2]);

Definitely, new patch attached.

Thank you, Carl Eugen

Comments

Marton Balint Feb. 9, 2019, 10:14 p.m. UTC | #1
On Sat, 9 Feb 2019, Carl Eugen Hoyos wrote:

+#if CONFIG_ICONV
+    if (len && *p < 0x20) {
> +        const char *encodings[] = {
> +            "ISO6937", "ISO-8859-5", "ISO-8859-6", "ISO-8859-7", "ISO-8859-8",
> +            "ISO-8859-9", "ISO-8859-10", "ISO-8859-11", "", "ISO-8859-13",
> +            "ISO-8859-14", "ISO-8859-15", "", "", "", "",
> +            "", "ISO-10646", "KSC_5601", "GB2312", "ISO-10646", "UTF-8", "",
> +            "", "", "", "", "", "", "", "", ""

Instead of ISO-10646 you probably should use UCS-2BE, VLC is also doing this
and the specs refer to the basic multilingual plane. ISO-10646 alone is UCS-4
which uses 4 byte encoding, probably not what we want here.

Otherwise LGTM, thanks.

Marton
Carl Eugen Hoyos Feb. 9, 2019, 10:53 p.m. UTC | #2
2019-02-09 23:14 GMT+01:00, Marton Balint <cus@passwd.hu>:
>
>
> On Sat, 9 Feb 2019, Carl Eugen Hoyos wrote:
>
> +#if CONFIG_ICONV
> +    if (len && *p < 0x20) {
>> +        const char *encodings[] = {
>> +            "ISO6937", "ISO-8859-5", "ISO-8859-6", "ISO-8859-7",
>> "ISO-8859-8",
>> +            "ISO-8859-9", "ISO-8859-10", "ISO-8859-11", "",
>> "ISO-8859-13",
>> +            "ISO-8859-14", "ISO-8859-15", "", "", "", "",
>> +            "", "ISO-10646", "KSC_5601", "GB2312", "ISO-10646", "UTF-8",
>> "",
>> +            "", "", "", "", "", "", "", "", ""
>
> Instead of ISO-10646 you probably should use UCS-2BE, VLC
> is also doing this and the specs refer to the basic multilingual
> plane. ISO-10646 alone is UCS-4 which uses 4 byte encoding,
> probably not what we want here.

(Not "BIG-5"?)

> Otherwise LGTM, thanks.

Changed and pushed.

Thank you for the review, Carl Eugen
Jan Ekström Feb. 9, 2019, 11:10 p.m. UTC | #3
On Sat, Feb 9, 2019 at 7:39 PM Carl Eugen Hoyos <ceffmpeg@gmail.com> wrote:
>
> 2019-02-09 17:42 GMT+01:00, Marton Balint <cus@passwd.hu>:
> > On Sat, 9 Feb 2019, Carl Eugen Hoyos wrote:
> >
> >> From 9033f0a18727a7a576c4cc06b9985d6d922d46ad Mon Sep 17 00:00:00 2001
> >> From: Carl Eugen Hoyos <ceffmpeg@gmail.com>
> >> Date: Sat, 9 Feb 2019 00:49:51 +0100
> >> Subject: [PATCH] lavf/mpegts: Convert service_name and service_provider to
> >>  utf-8.
> >>
> >> Fixes ticket #6320.
> >> ---
> >>  libavformat/mpegts.c |   48
> >> ++++++++++++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 48 insertions(+)
> >>
> >> Diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
> >> Index b04fd7b..1e27500 100644
> >> --- a/libavformat/mpegts.c
> >> +++ b/libavformat/mpegts.c
> >> @@ -37,6 +37,9 @@
> >>  #include "avio_internal.h"
> >>  #include "mpeg.h"
> >>  #include "isom.h"
> >> +#if CONFIG_ICONV
> >> +#include <iconv.h>
> >> +#endif
> >>
> >>  /* maximum size in which we look for synchronization if
> >>   * synchronization is lost */
> >> @@ -674,6 +677,51 @@ static char *getstr8(const uint8_t **pp, const
> >> uint8_t *p_end)
> >>          return NULL;
> >>      if (len > p_end - p)
> >>          return NULL;
> >> +#if CONFIG_ICONV
> >> +    if (len && *p < 0x20) {
> >> +        char iso8859[] = "ISO-8859-00";
> >> +        const char *encodings[] = {
> >> +            "ISO6937", "ISO-8859-5", "ISO-8859-6", "ISO-8859-7",
> >> "ISO-8859-8",
> >> +            "ISO-8859-9", "ISO-8859-10", "ISO-8859-11", "",
> >> "ISO-8859-13",
> >> +            "ISO-8859-14", "ISO-8859-15", "", "", "", "",
> >> +            "", "ISO-10646", "KSC_5601", "GB2312", "ISO-10646", "UTF-8",
> >> "",
> >> +            "", "", "", "", "", "", "", "", ""
> >> +        };
> >> +        iconv_t cd;
> >> +        char *in, *out;
> >> +        size_t inlen = len - 1, outlen = inlen * 6 + 1;
> >> +        if (len >= 3 && p[0] == 0x10 && !p[1] && p[2] && p[2] <= 0xf &&
> >> p[2] != 0xc) {
> >> +            if (p[2] < 10) {
> >> +                iso8859[9] += p[2];
> >> +                iso8859[10] = 0;
> >> +            } else {
> >> +                iso8859[9]++;
> >> +                iso8859[10] += p[2] - 10;
> >> +            }
> >
> > I think this would be much more readable:
> >
> > char iso8859[16];
> > snprintf(iso8859, sizeof(iso8859), "ISO-8859-%d", p[2]);
>
> Definitely, new patch attached.
>

Idea-wise I like this. We generally try to promise that our metadata
is UTF-8, but with broadcast things we've not held up to that promise
too much :) . This fixes quite a bit of that, which is nice.

Checked that this doesn't seem to be breaking my future integration of
ARIB STD-B24 text decoding into UTF-8 looking at my set of samples on
hand. Just changes the place I'll have to integrate to as to not do a
double conversion.

In other words, good work.

Jan
Marton Balint Feb. 9, 2019, 11:23 p.m. UTC | #4
On Sat, 9 Feb 2019, Carl Eugen Hoyos wrote:

> 2019-02-09 23:14 GMT+01:00, Marton Balint <cus@passwd.hu>:
>>
>>
>> On Sat, 9 Feb 2019, Carl Eugen Hoyos wrote:
>>
>> +#if CONFIG_ICONV
>> +    if (len && *p < 0x20) {
>>> +        const char *encodings[] = {
>>> +            "ISO6937", "ISO-8859-5", "ISO-8859-6", "ISO-8859-7",
>>> "ISO-8859-8",
>>> +            "ISO-8859-9", "ISO-8859-10", "ISO-8859-11", "",
>>> "ISO-8859-13",
>>> +            "ISO-8859-14", "ISO-8859-15", "", "", "", "",
>>> +            "", "ISO-10646", "KSC_5601", "GB2312", "ISO-10646", "UTF-8",
>>> "",
>>> +            "", "", "", "", "", "", "", "", ""
>>
>> Instead of ISO-10646 you probably should use UCS-2BE, VLC
>> is also doing this and the specs refer to the basic multilingual
>> plane. ISO-10646 alone is UCS-4 which uses 4 byte encoding,
>> probably not what we want here.
>
> (Not "BIG-5"?)

No, see this VLC ticket:

https://trac.videolan.org/vlc/ticket/8235

Regards,
Marton
Carl Eugen Hoyos Feb. 9, 2019, 11:25 p.m. UTC | #5
2019-02-10 0:23 GMT+01:00, Marton Balint <cus@passwd.hu>:
>
> On Sat, 9 Feb 2019, Carl Eugen Hoyos wrote:
>
>> 2019-02-09 23:14 GMT+01:00, Marton Balint <cus@passwd.hu>:

>>> Instead of ISO-10646 you probably should use UCS-2BE, VLC
>>> is also doing this and the specs refer to the basic multilingual
>>> plane. ISO-10646 alone is UCS-4 which uses 4 byte encoding,
>>> probably not what we want here.
>>
>> (Not "BIG-5"?)
>
> No, see this VLC ticket:
>
> https://trac.videolan.org/vlc/ticket/8235

Thank you!

Carl Eugen
diff mbox

Patch

From 8ef77f9fb36c85247a9514fe18f4b48fd23f405c Mon Sep 17 00:00:00 2001
From: Carl Eugen Hoyos <ceffmpeg@gmail.com>
Date: Sat, 9 Feb 2019 18:38:13 +0100
Subject: [PATCH] lavf/mpegts: Convert service_name and service_provider to
 utf-8.

Fixes ticket #6320.
---
 libavformat/mpegts.c |   44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
index b04fd7b..108514c 100644
--- a/libavformat/mpegts.c
+++ b/libavformat/mpegts.c
@@ -37,6 +37,9 @@ 
 #include "avio_internal.h"
 #include "mpeg.h"
 #include "isom.h"
+#if CONFIG_ICONV
+#include <iconv.h>
+#endif
 
 /* maximum size in which we look for synchronization if
  * synchronization is lost */
@@ -674,6 +677,47 @@  static char *getstr8(const uint8_t **pp, const uint8_t *p_end)
         return NULL;
     if (len > p_end - p)
         return NULL;
+#if CONFIG_ICONV
+    if (len && *p < 0x20) {
+        const char *encodings[] = {
+            "ISO6937", "ISO-8859-5", "ISO-8859-6", "ISO-8859-7", "ISO-8859-8",
+            "ISO-8859-9", "ISO-8859-10", "ISO-8859-11", "", "ISO-8859-13",
+            "ISO-8859-14", "ISO-8859-15", "", "", "", "",
+            "", "ISO-10646", "KSC_5601", "GB2312", "ISO-10646", "UTF-8", "",
+            "", "", "", "", "", "", "", "", ""
+        };
+        iconv_t cd;
+        char *in, *out;
+        size_t inlen = len - 1, outlen = inlen * 6 + 1;
+        if (len >= 3 && p[0] == 0x10 && !p[1] && p[2] && p[2] <= 0xf && p[2] != 0xc) {
+            char iso8859[12];
+            snprintf(iso8859, sizeof(iso8859), "ISO-8859-%d", p[2]);
+            inlen -= 2;
+            in = (char *)p + 3;
+            cd = iconv_open("UTF-8", iso8859);
+        } else {
+            in = (char *)p + 1;
+            cd = iconv_open("UTF-8", encodings[*p]);
+        }
+        if (cd == (iconv_t)-1)
+            goto no_iconv;
+        str = out = av_malloc(outlen);
+        if (!str) {
+            iconv_close(cd);
+            return NULL;
+        }
+        if (iconv(cd, &in, &inlen, &out, &outlen) == -1) {
+            iconv_close(cd);
+            av_freep(&str);
+            goto no_iconv;
+        }
+        iconv_close(cd);
+        *out = 0;
+        *pp = p + len;
+        return str;
+    }
+no_iconv:
+#endif
     str = av_malloc(len + 1);
     if (!str)
         return NULL;
-- 
1.7.10.4