[FFmpeg-devel,2/5] lavu/bprint: add XML escaping

Submitted by Rodger Combs on April 12, 2017, 7:11 a.m.

Details

Message ID 20170412071127.60511-2-rodger.combs@gmail.com
State New
Headers show

Commit Message

Rodger Combs April 12, 2017, 7:11 a.m.
---
 libavutil/avstring.h | 28 ++++++++++++++++++++++++++++
 libavutil/bprint.c   | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+)

Comments

Nicolas George April 12, 2017, 1:23 p.m.
Le tridi 23 germinal, an CCXXV, Rodger Combs a écrit :
> ---
>  libavutil/avstring.h | 28 ++++++++++++++++++++++++++++
>  libavutil/bprint.c   | 43 +++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 71 insertions(+)
> 
> diff --git a/libavutil/avstring.h b/libavutil/avstring.h
> index 04d2695640..68b753a569 100644
> --- a/libavutil/avstring.h
> +++ b/libavutil/avstring.h
> @@ -314,6 +314,7 @@ enum AVEscapeMode {
>      AV_ESCAPE_MODE_AUTO,      ///< Use auto-selected escaping mode.
>      AV_ESCAPE_MODE_BACKSLASH, ///< Use backslash escaping.
>      AV_ESCAPE_MODE_QUOTE,     ///< Use single-quote escaping.
> +    AV_ESCAPE_MODE_XML,       ///< Use XML ampersand-escaping; requires UTF-8 input.
>  };
>  
>  /**
> @@ -334,6 +335,33 @@ enum AVEscapeMode {
>  #define AV_ESCAPE_FLAG_STRICT (1 << 1)
>  
>  /**
> + * In addition to the provided list, escape all characters outside the range of
> + * U+0020 to U+007E.
> + * This only applies to XML-escaping.
> + */
> +#define AV_ESCAPE_FLAG_NON_ASCII (1 << 2)
> +

> +/**
> + * In addition to the provided list, escape single or double quotes.
> + * This only applies to XML-escaping.
> + */

I think this Doxygen comment, as written, only attaches to
AV_ESCAPE_FLAG_ESCAPE_SINGLE_QUOTE; AV_ESCAPE_FLAG_ESCAPE_DOUBLE_QUOTE is
left undocumented.

> +#define AV_ESCAPE_FLAG_ESCAPE_SINGLE_QUOTE (1 << 3)
> +#define AV_ESCAPE_FLAG_ESCAPE_DOUBLE_QUOTE (1 << 4)

Maybe also add a combined convenience flag:

#define AV_ESCAPE_FLAG_ESCAPE_QUOTES (AV_ESCAPE_FLAG_ESCAPE_SINGLE_QUOTE|AV_ESCAPE_FLAG_ESCAPE_DOUBLE_QUOTE)

?

> +
> +/**
> + * Replace invalid UTF-8 characters with a U+FFFD REPLACEMENT CHARACTER, escaped
> + * if AV_ESCAPE_FLAG_NON_ASCII is set.
> + * This only applies to XML-escaping.
> + */
> +#define AV_ESCAPE_FLAG_REPLACE_INVALID_SEQUENCES (1 << 5)
> +
> +/**
> + * Replace invalid UTF-8 characters with a '?', overriding the previous flag.
> + * This only applies to XML-escaping.
> + */
> +#define AV_ESCAPE_FLAG_REPLACE_INVALID_ASCII (1 << 6)
> +
> +/**
>   * Escape string in src, and put the escaped string in an allocated
>   * string in *dst, which must be freed with av_free().
>   *
> diff --git a/libavutil/bprint.c b/libavutil/bprint.c
> index 652775bef9..8e44c57346 100644
> --- a/libavutil/bprint.c
> +++ b/libavutil/bprint.c
> @@ -302,5 +302,48 @@ void av_bprint_escape(AVBPrint *dstbuf, const char *src, const char *special_cha
>          }
>          av_bprint_chars(dstbuf, '\'', 1);
>          break;
> +
> +    case AV_ESCAPE_MODE_XML:
> +        /* &;-escape characters */
> +        while (*src) {
> +            uint8_t tmp;
> +            uint32_t cp;
> +            const char *src1 = src;
> +            GET_UTF8(cp, (uint8_t)*src++, goto err;);
> +
> +            if ((cp < 0xFF &&
> +                 ((special_chars && strchr(special_chars, cp)) ||
> +                  (flags & AV_ESCAPE_FLAG_WHITESPACE) && strchr(WHITESPACES, cp))) ||

> +                (!(flags & AV_ESCAPE_FLAG_STRICT) &&
> +                 (cp == '&' || cp == '<' || cp == '>')) ||

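'&' and '<' must always be escaped in XML, so escaping them should not
depend on AV_ESCAPE_FLAG_STRICT being unset. '>' can sometimes be left
alone (it only has to be escaped in the sequence "]]>"), but so few people
rely on that that I think it is not worth the effort.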

> +                ((flags & AV_ESCAPE_FLAG_ESCAPE_SINGLE_QUOTE) && cp == '\'') ||
> +                ((flags & AV_ESCAPE_FLAG_ESCAPE_DOUBLE_QUOTE) && cp == '"') ||
> +                ((flags & AV_ESCAPE_FLAG_NON_ASCII) && (cp < 0x20 || cp > 0x7e))) {
> +                switch (cp) {
> +                case '&' : av_bprintf(dstbuf, "&amp;");  break;
> +                case '<' : av_bprintf(dstbuf, "&lt;");   break;
> +                case '>' : av_bprintf(dstbuf, "&gt;");   break;
> +                case '"' : av_bprintf(dstbuf, "&quot;"); break;
> +                case '\'': av_bprintf(dstbuf, "&apos;"); break;
> +                default:   av_bprintf(dstbuf, "&#x%"PRIx32";", cp); break;
> +                }
> +            } else {
> +                PUT_UTF8(cp, tmp, av_bprint_chars(dstbuf, tmp, 1);)
> +            }
> +            continue;
> +        err:
> +            if (flags & AV_ESCAPE_FLAG_REPLACE_INVALID_ASCII) {
> +                av_bprint_chars(dstbuf, '?', 1);
> +            } else if (flags & AV_ESCAPE_FLAG_REPLACE_INVALID_SEQUENCES) {
> +                if (flags & AV_ESCAPE_FLAG_NON_ASCII)
> +                    av_bprintf(dstbuf, "\xEF\xBF\xBD");
> +                else
> +                    av_bprintf(dstbuf, "&#xfffd;");
> +            } else {
> +                while (src1 < src)
> +                    av_bprint_chars(dstbuf, *src1++, 1);
> +            }
> +        }
> +        break;
>      }
>  }

Regards,

Patch hide | download patch | download mbox

diff --git a/libavutil/avstring.h b/libavutil/avstring.h
index 04d2695640..68b753a569 100644
--- a/libavutil/avstring.h
+++ b/libavutil/avstring.h
@@ -314,6 +314,7 @@  enum AVEscapeMode {
     AV_ESCAPE_MODE_AUTO,      ///< Use auto-selected escaping mode.
     AV_ESCAPE_MODE_BACKSLASH, ///< Use backslash escaping.
     AV_ESCAPE_MODE_QUOTE,     ///< Use single-quote escaping.
+    AV_ESCAPE_MODE_XML,       ///< Use XML ampersand-escaping; requires UTF-8 input.
 };
 
 /**
@@ -334,6 +335,33 @@  enum AVEscapeMode {
 #define AV_ESCAPE_FLAG_STRICT (1 << 1)
 
 /**
+ * In addition to the provided list, escape all characters outside the range of
+ * U+0020 to U+007E.
+ * This only applies to XML-escaping.
+ */
+#define AV_ESCAPE_FLAG_NON_ASCII (1 << 2)
+
+/**
+ * In addition to the provided list, escape single or double quotes.
+ * This only applies to XML-escaping.
+ */
+#define AV_ESCAPE_FLAG_ESCAPE_SINGLE_QUOTE (1 << 3)
+#define AV_ESCAPE_FLAG_ESCAPE_DOUBLE_QUOTE (1 << 4)
+
+/**
+ * Replace invalid UTF-8 characters with a U+FFFD REPLACEMENT CHARACTER, escaped
+ * if AV_ESCAPE_FLAG_NON_ASCII is set.
+ * This only applies to XML-escaping.
+ */
+#define AV_ESCAPE_FLAG_REPLACE_INVALID_SEQUENCES (1 << 5)
+
+/**
+ * Replace invalid UTF-8 characters with a '?', overriding the previous flag.
+ * This only applies to XML-escaping.
+ */
+#define AV_ESCAPE_FLAG_REPLACE_INVALID_ASCII (1 << 6)
+
+/**
  * Escape string in src, and put the escaped string in an allocated
  * string in *dst, which must be freed with av_free().
  *
diff --git a/libavutil/bprint.c b/libavutil/bprint.c
index 652775bef9..8e44c57346 100644
--- a/libavutil/bprint.c
+++ b/libavutil/bprint.c
@@ -302,5 +302,48 @@  void av_bprint_escape(AVBPrint *dstbuf, const char *src, const char *special_cha
         }
         av_bprint_chars(dstbuf, '\'', 1);
         break;
+
+    case AV_ESCAPE_MODE_XML:
+        /* &;-escape characters */
+        while (*src) {
+            uint8_t tmp;
+            uint32_t cp;
+            const char *src1 = src;
+            GET_UTF8(cp, (uint8_t)*src++, goto err;);
+
+            if ((cp < 0xFF &&
+                 ((special_chars && strchr(special_chars, cp)) ||
+                  (flags & AV_ESCAPE_FLAG_WHITESPACE) && strchr(WHITESPACES, cp))) ||
+                (!(flags & AV_ESCAPE_FLAG_STRICT) &&
+                 (cp == '&' || cp == '<' || cp == '>')) ||
+                ((flags & AV_ESCAPE_FLAG_ESCAPE_SINGLE_QUOTE) && cp == '\'') ||
+                ((flags & AV_ESCAPE_FLAG_ESCAPE_DOUBLE_QUOTE) && cp == '"') ||
+                ((flags & AV_ESCAPE_FLAG_NON_ASCII) && (cp < 0x20 || cp > 0x7e))) {
+                switch (cp) {
+                case '&' : av_bprintf(dstbuf, "&amp;");  break;
+                case '<' : av_bprintf(dstbuf, "&lt;");   break;
+                case '>' : av_bprintf(dstbuf, "&gt;");   break;
+                case '"' : av_bprintf(dstbuf, "&quot;"); break;
+                case '\'': av_bprintf(dstbuf, "&apos;"); break;
+                default:   av_bprintf(dstbuf, "&#x%"PRIx32";", cp); break;
+                }
+            } else {
+                PUT_UTF8(cp, tmp, av_bprint_chars(dstbuf, tmp, 1);)
+            }
+            continue;
+        err:
+            if (flags & AV_ESCAPE_FLAG_REPLACE_INVALID_ASCII) {
+                av_bprint_chars(dstbuf, '?', 1);
+            } else if (flags & AV_ESCAPE_FLAG_REPLACE_INVALID_SEQUENCES) {
+                if (flags & AV_ESCAPE_FLAG_NON_ASCII)
+                    av_bprintf(dstbuf, "\xEF\xBF\xBD");
+                else
+                    av_bprintf(dstbuf, "&#xfffd;");
+            } else {
+                while (src1 < src)
+                    av_bprint_chars(dstbuf, *src1++, 1);
+            }
+        }
+        break;
     }
 }