diff mbox series

[FFmpeg-devel,v4,1/4] avutil/{avstring, bprint}: add XML escaping from ffprobe to avutil

Message ID 20210122112058.49068-2-jeebjp@gmail.com
State Superseded
Headers show
Series Initial implementation of TTML encoding/muxing
Related show

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Jan Ekström Jan. 22, 2021, 11:20 a.m. UTC
From: Stefano Sabatini <stefasab@gmail.com>

---
 libavutil/avstring.h |  7 ++++---
 libavutil/bprint.c   | 15 +++++++++++++++
 tools/ffescape.c     |  7 ++++---
 3 files changed, 23 insertions(+), 6 deletions(-)

Comments

Nicolas George Jan. 25, 2021, 9:57 a.m. UTC | #1
Jan Ekström (12021-01-22):
> From: Stefano Sabatini <stefasab@gmail.com>
> 
> ---
>  libavutil/avstring.h |  7 ++++---
>  libavutil/bprint.c   | 15 +++++++++++++++
>  tools/ffescape.c     |  7 ++++---
>  3 files changed, 23 insertions(+), 6 deletions(-)

Thanks for this new version.

I think this patch and the next would be better merged.

> 
> diff --git a/libavutil/avstring.h b/libavutil/avstring.h
> index ee225585b3..189b4726a5 100644
> --- a/libavutil/avstring.h
> +++ b/libavutil/avstring.h
> @@ -321,9 +321,10 @@ int av_match_name(const char *name, const char *names);
>  char *av_append_path_component(const char *path, const char *component);
>  
>  enum AVEscapeMode {
> -    AV_ESCAPE_MODE_AUTO,      ///< Use auto-selected escaping mode.
> -    AV_ESCAPE_MODE_BACKSLASH, ///< Use backslash escaping.
> -    AV_ESCAPE_MODE_QUOTE,     ///< Use single-quote escaping.
> +    AV_ESCAPE_MODE_AUTO,          ///< Use auto-selected escaping mode.
> +    AV_ESCAPE_MODE_BACKSLASH,     ///< Use backslash escaping.
> +    AV_ESCAPE_MODE_QUOTE,         ///< Use single-quote escaping.

> +    AV_ESCAPE_MODE_XML_CHAR_DATA, ///< Use XML non-markup character data escaping.

It could be shorter: XML_TEXT, XML_ATTR_QUOT and XML_ATTR_APOS are clear
enough IMHO, and more convenient.

>  };
>  
>  /**
> diff --git a/libavutil/bprint.c b/libavutil/bprint.c
> index 2f059c5ba6..7cdbb75095 100644
> --- a/libavutil/bprint.c
> +++ b/libavutil/bprint.c
> @@ -283,6 +283,21 @@ void av_bprint_escape(AVBPrint *dstbuf, const char *src, const char *special_cha
>          av_bprint_chars(dstbuf, '\'', 1);
>          break;
>  
> +    case AV_ESCAPE_MODE_XML_CHAR_DATA:
> +        /* escape XML non-markup character data as per 2.4 */
> +        /*  [^<&]* - ([^<&]* ']]>' [^<&]*) */
> +        for (; *src; src++) {
> +            switch (*src) {
> +            case '&' : av_bprintf(dstbuf, "%s", "&amp;");  break;
> +            case '<' : av_bprintf(dstbuf, "%s", "&lt;");   break;
> +            case '>' : av_bprintf(dstbuf, "%s", "&gt;");   break;

> +            case '"' : av_bprintf(dstbuf, "%s", "&quot;"); break;
> +            case '\'': av_bprintf(dstbuf, "%s", "&apos;"); break;

Outside attributes, " and ' do not need to be quoted.

> +            default: av_bprint_chars(dstbuf, *src, 1);
> +            }
> +        }
> +        break;
> +
>      /* case AV_ESCAPE_MODE_BACKSLASH or unknown mode */
>      default:
>          /* \-escape characters */
> diff --git a/tools/ffescape.c b/tools/ffescape.c
> index 0530d28c6d..e18f1edaf9 100644
> --- a/tools/ffescape.c
> +++ b/tools/ffescape.c
> @@ -101,9 +101,10 @@ int main(int argc, char **argv)
>              break;
>          }
>          case 'm':
> -            if      (!strcmp(optarg, "auto"))      escape_mode = AV_ESCAPE_MODE_AUTO;
> -            else if (!strcmp(optarg, "backslash")) escape_mode = AV_ESCAPE_MODE_BACKSLASH;
> -            else if (!strcmp(optarg, "quote"))     escape_mode = AV_ESCAPE_MODE_QUOTE;
> +            if      (!strcmp(optarg, "auto"))          escape_mode = AV_ESCAPE_MODE_AUTO;
> +            else if (!strcmp(optarg, "backslash"))     escape_mode = AV_ESCAPE_MODE_BACKSLASH;
> +            else if (!strcmp(optarg, "quote"))         escape_mode = AV_ESCAPE_MODE_QUOTE;

> +            else if (!strcmp(optarg, "xml_char_data")) escape_mode = AV_ESCAPE_MODE_XML_CHAR_DATA;
>              else {
>                  av_log(NULL, AV_LOG_ERROR,
>                         "Invalid value '%s' for option -m, "

Regards,
Jan Ekström Jan. 25, 2021, 11:44 a.m. UTC | #2
On Mon, Jan 25, 2021 at 11:58 AM Nicolas George <george@nsup.org> wrote:
>
> Jan Ekström (12021-01-22):
> > From: Stefano Sabatini <stefasab@gmail.com>
> >
> > ---
> >  libavutil/avstring.h |  7 ++++---
> >  libavutil/bprint.c   | 15 +++++++++++++++
> >  tools/ffescape.c     |  7 ++++---
> >  3 files changed, 23 insertions(+), 6 deletions(-)
>
> Thanks for this new version.
>
> I think this patch and the next would be better merged.

For now I kept them separate since one is just moving Stefano's code,
and another adds new functionality that you requested.

> >
> > diff --git a/libavutil/avstring.h b/libavutil/avstring.h
> > index ee225585b3..189b4726a5 100644
> > --- a/libavutil/avstring.h
> > +++ b/libavutil/avstring.h
> > @@ -321,9 +321,10 @@ int av_match_name(const char *name, const char *names);
> >  char *av_append_path_component(const char *path, const char *component);
> >
> >  enum AVEscapeMode {
> > -    AV_ESCAPE_MODE_AUTO,      ///< Use auto-selected escaping mode.
> > -    AV_ESCAPE_MODE_BACKSLASH, ///< Use backslash escaping.
> > -    AV_ESCAPE_MODE_QUOTE,     ///< Use single-quote escaping.
> > +    AV_ESCAPE_MODE_AUTO,          ///< Use auto-selected escaping mode.
> > +    AV_ESCAPE_MODE_BACKSLASH,     ///< Use backslash escaping.
> > +    AV_ESCAPE_MODE_QUOTE,         ///< Use single-quote escaping.
>
> > +    AV_ESCAPE_MODE_XML_CHAR_DATA, ///< Use XML non-markup character data escaping.
>
> It could be shorter: XML_TEXT, XML_ATTR_QUOT and XML_ATTR_APOS are clear
> enough IMHO, and more convenient.
>

I really have no bigger bone with either naming, I just utilized:
1. How the XML spec calls the thing ("AttValue" thus becoming
ATT_VALUE, "CharData" becoming CHAR_DATA). I slightly would prefer to
keep this due to this making it easy to point towards what is actually
meant in the XML spec.
2. Single and double quotes are the wording that I am more acquainted
with, but I have no hard opinion so if you want _QUOT/_APOS, sure.

> >  };
> >
> >  /**
> > diff --git a/libavutil/bprint.c b/libavutil/bprint.c
> > index 2f059c5ba6..7cdbb75095 100644
> > --- a/libavutil/bprint.c
> > +++ b/libavutil/bprint.c
> > @@ -283,6 +283,21 @@ void av_bprint_escape(AVBPrint *dstbuf, const char *src, const char *special_cha
> >          av_bprint_chars(dstbuf, '\'', 1);
> >          break;
> >
> > +    case AV_ESCAPE_MODE_XML_CHAR_DATA:
> > +        /* escape XML non-markup character data as per 2.4 */
> > +        /*  [^<&]* - ([^<&]* ']]>' [^<&]*) */
> > +        for (; *src; src++) {
> > +            switch (*src) {
> > +            case '&' : av_bprintf(dstbuf, "%s", "&amp;");  break;
> > +            case '<' : av_bprintf(dstbuf, "%s", "&lt;");   break;
> > +            case '>' : av_bprintf(dstbuf, "%s", "&gt;");   break;
>
> > +            case '"' : av_bprintf(dstbuf, "%s", "&quot;"); break;
> > +            case '\'': av_bprintf(dstbuf, "%s", "&apos;"); break;
>
> Outside attributes, " and ' do not need to be quoted.
>

I think this is where I'm slightly going out of my comfort zone. This
is already existing code, and I just moved it out of ffprobe (where it
was already successfully merged and existed for a while). Additionally
the expression for char_data does at least reference single quotes. I
do see the paragraph at the end of 2.4 that notes

> To allow attribute values to contain both single and double quotes, the apostrophe or single-quote character (') may be represented as "&apos;", and the double-quote character (") as "&quot;"."

Of course, which seems specific to attribute values.

But at this point I'm 50/50 if it makes even sense for me to continue
this struggle or if this is an unclear message from you to me that I
should just utilize libxml2, which is already utilized in the DASH
demuxer. Mostly because there I at least can relatively surely know
that (probably) not only escaping, but also removal of invalid UTF-8
might be handled by it (which we currently do not do either with JSON
or XML output from ffprobe as far as I know). That way also I don't
have to attempt to be a nice person and attempt to re-utilize existing
code from ffprobe, nor do I have to touch ffprobe.

Jan
Nicolas George Jan. 25, 2021, 12:47 p.m. UTC | #3
Jan Ekström (12021-01-25):
> For now I kept them separate since one is just moving Stefano's code,
> and another adds new functionality that you requested.

Stefano's code was good enough for ffprobe, because it had to perform a
very precise task in a very constrained context, but as is, it is not
good enough for lavu, it needs to be adapted.

> I really have no bigger bone with either naming, I just utilized:
> 1. How the XML spec calls the thing ("AttValue" thus becoming
> ATT_VALUE, "CharData" becoming CHAR_DATA). I slightly would prefer to
> keep this due to this making it easy to point towards what is actually
> meant in the XML spec.

CharData is only used a few times in the abstract syntax of the spec, it
will have meaning only for people who are very familiar with it.

> 2. Single and double quotes are the wording that I am more acquainted
> with, but I have no hard opinion so if you want _QUOT/_APOS, sure.

I am fine with "double_quotes" and "single_quotes" or "dquot" and
"squot" too.

> > Outside attributes, " and ' do not need to be quoted.
> 
> I think this is where I'm slightly going out of my comfort zone. This
> is already existing code, and I just moved it out of ffprobe (where it
> was already successfully merged and existed for a while). Additionally
> the expression for char_data does at least reference single quotes. I
> do see the paragraph at the end of 2.4 that notes

Let us take a few steps back.

The spec says:

- < needs to be always escaped;

- ' needs to be escaped when between ';

- " needs to be escaped when between ".

Common sens and good practice say:

- characters that do not need to be escaped should not be escaped.

(With XML, we make an exception for that last rule for >.)

Therefore, we need three modes:

- one mode to use between ';

- one mode to use between ";

- one mode to use outside.

> > To allow attribute values to contain both single and double quotes, the apostrophe or single-quote character (') may be represented as "&apos;", and the double-quote character (") as "&quot;"."
> 
> Of course, which seems specific to attribute values.

> But at this point I'm 50/50 if it makes even sense for me to continue
> this struggle or if this is an unclear message from you to me that I
> should just utilize libxml2, which is already utilized in the DASH
> demuxer. Mostly because there I at least can relatively surely know
> that (probably) not only escaping, but also removal of invalid UTF-8
> might be handled by it (which we currently do not do either with JSON
> or XML output from ffprobe as far as I know). That way also I don't
> have to attempt to be a nice person and attempt to re-utilize existing
> code from ffprobe, nor do I have to touch ffprobe.

libxml2 has a terrible track records in terms of stability, including
security. We should strive to use it less, not more.

For parsers, we currently have no choice, because parsing XML is tricky.
I still intend to finish the limited XML parser that I started
implementing a few years ago, though.

On the other hand, writing good XML is quite easy, and that is what we
should be doing. Your code is almost ready, it just needs a few tweaks.

Regards,
Jan Ekström Feb. 1, 2021, 11:40 a.m. UTC | #4
On Mon, Jan 25, 2021 at 2:47 PM Nicolas George <george@nsup.org> wrote:
>
> Jan Ekström (12021-01-25):
> > For now I kept them separate since one is just moving Stefano's code,
> > and another adds new functionality that you requested.
>
> Stefano's code was good enough for ffprobe, because it had to perform a
> very precise task in a very constrained context, but as is, it is not
> good enough for lavu, it needs to be adapted.
>

I beg to differ, since it seemed to fully implement the cases noted in
2.4. Does escape double and single quotes unnecessarily in some cases,
yes. But not invalid.

But at this point I have wasted enough time with parts of this patch
set that I do not care about. I do wish that I had said to you earlier
that if you want to rework this stuff, you could do it yourself. I
just noticed the existing code and attempted to do the Right Thing and
move it into general code. I bet if I had not done it, nobody would
have bat an eye regarding it, since that's exactly how the existing
XML output things do things right now. I hope it is understandable why
it is annoying to me that instead of the code that I actually proposed
to add, the stuff that was *already* *there*s is the stuff that gets
now picked out.

> > I really have no bigger bone with either naming, I just utilized:
> > 1. How the XML spec calls the thing ("AttValue" thus becoming
> > ATT_VALUE, "CharData" becoming CHAR_DATA). I slightly would prefer to
> > keep this due to this making it easy to point towards what is actually
> > meant in the XML spec.
>
> CharData is only used a few times in the abstract syntax of the spec, it
> will have meaning only for people who are very familiar with it.
>

It also contains a nice definition of it in 2.4

"[Definition: All text that is not markup constitutes the character
data of the document.] "

> > 2. Single and double quotes are the wording that I am more acquainted
> > with, but I have no hard opinion so if you want _QUOT/_APOS, sure.
>
> I am fine with "double_quotes" and "single_quotes" or "dquot" and
> "squot" too.
>

This is where (for now) I go through the route of least resistance and post a v5

https://github.com/jeeb/ffmpeg/commits/ttml_encoder_v5

- I changed "quoted" to "quotes".
- I kept the names matching to the spec, esp. due to the definitions
in the spec in 2.3 and 2.4, as well as kept the fact that it is
escaping for the actual value of the attribute, not just "the
attribute".
- I removed the double and single quote escaping from the CHAR_DATA
escaping, as you requested.

For the record, I have no great interest in XML. If you wish to take
this stuff further, please do so. But at this point I am feeling
rather stumped that after the initial reviews which actually were
concerned of code that I wrote, most of the comments have been
regarding things that were already there in the code base.

Best regards,
Jan
Nicolas George Feb. 1, 2021, 11:52 a.m. UTC | #5
Jan Ekström (12021-02-01):
> I beg to differ, since it seemed to fully implement the cases noted in
> 2.4. Does escape double and single quotes unnecessarily in some cases,
> yes. But not invalid.

Not invalid, but not good enough.

> https://github.com/jeeb/ffmpeg/commits/ttml_encoder_v5
> 
> - I changed "quoted" to "quotes".

I strongly suggest you merge the commits, but I will not block on it.

> - I kept the names matching to the spec, esp. due to the definitions
> in the spec in 2.3 and 2.4, as well as kept the fact that it is
> escaping for the actual value of the attribute, not just "the
> attribute".

"ATT_VALUE" is a terrible part of the name, virtually impossible to
memorize. Nobody learns the names used by the XML spec by rote. Call
these modes:

AV_ESCAPE_MODE_XML
AV_ESCAPE_MODE_XML_DOUBLE_QUOTES
AV_ESCAPE_MODE_XML_SINGLE_QUOTES

and be done with it.

This is a public API, it needs to be good immediately.

> - I removed the double and single quote escaping from the CHAR_DATA
> escaping, as you requested.

Thank you very much.

> For the record, I have no great interest in XML. If you wish to take
> this stuff further, please do so. But at this point I am feeling
> rather stumped that after the initial reviews which actually were
> concerned of code that I wrote, most of the comments have been
> regarding things that were already there in the code base.

As for me, I am quite stumped that you wasted your time arguing
something you have no great interest in, instead of just following
advice and be done with it.

Regards,
Jan Ekström Feb. 1, 2021, 12:17 p.m. UTC | #6
On Mon, Feb 1, 2021 at 1:52 PM Nicolas George <george@nsup.org> wrote:
>
> Jan Ekström (12021-02-01):
> > I beg to differ, since it seemed to fully implement the cases noted in
> > 2.4. Does escape double and single quotes unnecessarily in some cases,
> > yes. But not invalid.
>
> Not invalid, but not good enough.
>
> > https://github.com/jeeb/ffmpeg/commits/ttml_encoder_v5
> >
> > - I changed "quoted" to "quotes".
>
> I strongly suggest you merge the commits, but I will not block on it.
>
> > - I kept the names matching to the spec, esp. due to the definitions
> > in the spec in 2.3 and 2.4, as well as kept the fact that it is
> > escaping for the actual value of the attribute, not just "the
> > attribute".
>
> "ATT_VALUE" is a terrible part of the name, virtually impossible to
> memorize. Nobody learns the names used by the XML spec by rote. Call
> these modes:
>
> AV_ESCAPE_MODE_XML
> AV_ESCAPE_MODE_XML_DOUBLE_QUOTES
> AV_ESCAPE_MODE_XML_SINGLE_QUOTES
>
> and be done with it.
>
> This is a public API, it needs to be good immediately.
>

I fear that those sound way, way too generic. "This escapes all XML"
definitely is not what that does.

And yes, even if it has the better definition in the Doxygen comment.

> > - I removed the double and single quote escaping from the CHAR_DATA
> > escaping, as you requested.
>
> Thank you very much.
>
> > For the record, I have no great interest in XML. If you wish to take
> > this stuff further, please do so. But at this point I am feeling
> > rather stumped that after the initial reviews which actually were
> > concerned of code that I wrote, most of the comments have been
> > regarding things that were already there in the code base.
>
> As for me, I am quite stumped that you wasted your time arguing
> something you have no great interest in, instead of just following
> advice and be done with it.
>

Well, my first reaction is to attempt to validate your requests. Then
when I'm not easily capable of doing that I become annoyed because
this code is not what I came here to receive change requests that I
cannot easily apply without blindly trusting you. Which I have done,
but only because that seems to have been the course of least
resistance. But yet, this is where we are.

Jan
Nicolas George Feb. 1, 2021, 12:25 p.m. UTC | #7
Jan Ekström (12021-02-01):
> I fear that those sound way, way too generic. "This escapes all XML"
> definitely is not what that does.
> 
> And yes, even if it has the better definition in the Doxygen comment.

Why? Do you expect to add something that "escapes all XML"? What does it
even mean?

These symbols are what developers will need to type. Making them long is
just a waste of time. Adding hard to remembers parts like "ATT" (is it
"ATTR"? or "ATTRIBUTE"?) is just adding insult to injury.

Make the documentation as clear as possible, but keep the symbols short
and easy to remember.

Regards,
Jan Ekström Feb. 1, 2021, 12:38 p.m. UTC | #8
On Mon, Feb 1, 2021 at 2:25 PM Nicolas George <george@nsup.org> wrote:
>
> Jan Ekström (12021-02-01):
> > I fear that those sound way, way too generic. "This escapes all XML"
> > definitely is not what that does.
> >
> > And yes, even if it has the better definition in the Doxygen comment.
>
> Why? Do you expect to add something that "escapes all XML"? What does it
> even mean?
>

No, I do not expect to. Just that as you made me add modifications to
this logic, I no longer wish to make it as simple as "XML escaping"
since at least in my opinion if nothing else - that would now include
those cases that have now been separated.

Of course I would *love* secondary opinions here. If other people
accept the naming then I will become less hesitant about them.

Also removing the attribute value part out of the attribute value
things seems like a dangerous bid. Same regarding others' opinions
with this as well.

> These symbols are what developers will need to type. Making them long is
> just a waste of time. Adding hard to remembers parts like "ATT" (is it
> "ATTR"? or "ATTRIBUTE"?) is just adding insult to injury.
>

I don't disagree with it being ugly, just that it is what it matches
in the spec. I wouldn't be against making it "ATTR_VALUE" or
"ATTRIBUTE_VALUE", but effectively does that improve anything?

Also for the record, I have clang-based auto-completion in most of my
editors, just for this. So that I can find things that I do not know
yet/recall.

> Make the documentation as clear as possible, but keep the symbols short
> and easy to remember.
>

As a general guide line I 100% agree with you. It's just that in this
case I'm not sure if removing details from the names like this is the
right way forward.

Jan
Nicolas George Feb. 1, 2021, 12:44 p.m. UTC | #9
Jan Ekström (12021-02-01):
> I don't disagree with it being ugly, just that it is what it matches
> in the spec.

Nobody reads the XML spec. Especially for something as trivial as the
syntax to escape attributes. People learn XML from tutorials and
examples, which will not have exactly the same wording as the spec.

I second your request of a third opinion.

Regards,
Anton Khirnov Feb. 19, 2021, 1:19 p.m. UTC | #10
Quoting Jan Ekström (2021-02-01 13:38:43)
> On Mon, Feb 1, 2021 at 2:25 PM Nicolas George <george@nsup.org> wrote:
> >
> > Jan Ekström (12021-02-01):
> > > I fear that those sound way, way too generic. "This escapes all XML"
> > > definitely is not what that does.
> > >
> > > And yes, even if it has the better definition in the Doxygen comment.
> >
> > Why? Do you expect to add something that "escapes all XML"? What does it
> > even mean?
> >
> 
> No, I do not expect to. Just that as you made me add modifications to
> this logic, I no longer wish to make it as simple as "XML escaping"
> since at least in my opinion if nothing else - that would now include
> those cases that have now been separated.
> 
> Of course I would *love* secondary opinions here. If other people
> accept the naming then I will become less hesitant about them.
> 
> Also removing the attribute value part out of the attribute value
> things seems like a dangerous bid. Same regarding others' opinions
> with this as well.

Why not have a single ESCAPE_MODE_XML mode and use flags to tweak what
specifically is or is not escaped?

> 
> > These symbols are what developers will need to type. Making them long is
> > just a waste of time. Adding hard to remembers parts like "ATT" (is it
> > "ATTR"? or "ATTRIBUTE"?) is just adding insult to injury.
> >
> 
> I don't disagree with it being ugly, just that it is what it matches
> in the spec. I wouldn't be against making it "ATTR_VALUE" or
> "ATTRIBUTE_VALUE", but effectively does that improve anything?

Mild pereference for "attr" here
Besides, doesn't everyone use auto-completion for identifiers?
Nicolas George Feb. 19, 2021, 1:24 p.m. UTC | #11
Anton Khirnov (12021-02-19):
> Why not have a single ESCAPE_MODE_XML mode and use flags to tweak what
> specifically is or is not escaped?

I forgot the flags argument were there. You are right, it is the best
choice: escape <&> by default, a flag to escape ' and a flag to escape
".

> Mild pereference for "attr" here
> Besides, doesn't everyone use auto-completion for identifiers?

Thanks you for your input. Typing the identifiers is not the only issue:
overly long lines are annoying to read the code too.

Regards,
diff mbox series

Patch

diff --git a/libavutil/avstring.h b/libavutil/avstring.h
index ee225585b3..189b4726a5 100644
--- a/libavutil/avstring.h
+++ b/libavutil/avstring.h
@@ -321,9 +321,10 @@  int av_match_name(const char *name, const char *names);
 char *av_append_path_component(const char *path, const char *component);
 
 enum AVEscapeMode {
-    AV_ESCAPE_MODE_AUTO,      ///< Use auto-selected escaping mode.
-    AV_ESCAPE_MODE_BACKSLASH, ///< Use backslash escaping.
-    AV_ESCAPE_MODE_QUOTE,     ///< Use single-quote escaping.
+    AV_ESCAPE_MODE_AUTO,          ///< Use auto-selected escaping mode.
+    AV_ESCAPE_MODE_BACKSLASH,     ///< Use backslash escaping.
+    AV_ESCAPE_MODE_QUOTE,         ///< Use single-quote escaping.
+    AV_ESCAPE_MODE_XML_CHAR_DATA, ///< Use XML non-markup character data escaping.
 };
 
 /**
diff --git a/libavutil/bprint.c b/libavutil/bprint.c
index 2f059c5ba6..7cdbb75095 100644
--- a/libavutil/bprint.c
+++ b/libavutil/bprint.c
@@ -283,6 +283,21 @@  void av_bprint_escape(AVBPrint *dstbuf, const char *src, const char *special_cha
         av_bprint_chars(dstbuf, '\'', 1);
         break;
 
+    case AV_ESCAPE_MODE_XML_CHAR_DATA:
+        /* escape XML non-markup character data as per 2.4 */
+        /*  [^<&]* - ([^<&]* ']]>' [^<&]*) */
+        for (; *src; src++) {
+            switch (*src) {
+            case '&' : av_bprintf(dstbuf, "%s", "&amp;");  break;
+            case '<' : av_bprintf(dstbuf, "%s", "&lt;");   break;
+            case '>' : av_bprintf(dstbuf, "%s", "&gt;");   break;
+            case '"' : av_bprintf(dstbuf, "%s", "&quot;"); break;
+            case '\'': av_bprintf(dstbuf, "%s", "&apos;"); break;
+            default: av_bprint_chars(dstbuf, *src, 1);
+            }
+        }
+        break;
+
     /* case AV_ESCAPE_MODE_BACKSLASH or unknown mode */
     default:
         /* \-escape characters */
diff --git a/tools/ffescape.c b/tools/ffescape.c
index 0530d28c6d..e18f1edaf9 100644
--- a/tools/ffescape.c
+++ b/tools/ffescape.c
@@ -101,9 +101,10 @@  int main(int argc, char **argv)
             break;
         }
         case 'm':
-            if      (!strcmp(optarg, "auto"))      escape_mode = AV_ESCAPE_MODE_AUTO;
-            else if (!strcmp(optarg, "backslash")) escape_mode = AV_ESCAPE_MODE_BACKSLASH;
-            else if (!strcmp(optarg, "quote"))     escape_mode = AV_ESCAPE_MODE_QUOTE;
+            if      (!strcmp(optarg, "auto"))          escape_mode = AV_ESCAPE_MODE_AUTO;
+            else if (!strcmp(optarg, "backslash"))     escape_mode = AV_ESCAPE_MODE_BACKSLASH;
+            else if (!strcmp(optarg, "quote"))         escape_mode = AV_ESCAPE_MODE_QUOTE;
+            else if (!strcmp(optarg, "xml_char_data")) escape_mode = AV_ESCAPE_MODE_XML_CHAR_DATA;
             else {
                 av_log(NULL, AV_LOG_ERROR,
                        "Invalid value '%s' for option -m, "