diff mbox

[FFmpeg-devel,1/3] lavu/bprint: add XML escaping

Message ID 20160910084515.11048-1-rodger.combs@gmail.com
State Superseded
Headers show

Commit Message

Rodger Combs Sept. 10, 2016, 8:45 a.m. UTC
---
 libavutil/avstring.h | 28 ++++++++++++++++++++++++++++
 libavutil/bprint.c   | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+)
diff mbox

Patch

diff --git a/libavutil/avstring.h b/libavutil/avstring.h
index dd28769..8e97314 100644
--- a/libavutil/avstring.h
+++ b/libavutil/avstring.h
@@ -309,6 +309,7 @@  enum AVEscapeMode {
     AV_ESCAPE_MODE_AUTO,      ///< Use auto-selected escaping mode.
     AV_ESCAPE_MODE_BACKSLASH, ///< Use backslash escaping.
     AV_ESCAPE_MODE_QUOTE,     ///< Use single-quote escaping.
+    AV_ESCAPE_MODE_XML,       ///< Use XML ampersand-escaping; requires UTF-8 input.
 };
 
 /**
@@ -329,6 +330,33 @@  enum AVEscapeMode {
 #define AV_ESCAPE_FLAG_STRICT (1 << 1)
 
 /**
+ * In addition to the provided list, escape all characters outside the range of
+ * U+0020 to U+007E.
+ * This only applies to XML-escaping.
+ */
+#define AV_ESCAPE_FLAG_NON_ASCII (1 << 2)
+
+/**
+ * In addition to the provided list, escape single quotes (as &apos;) or
+ * double quotes (as &quot;), respectively. This only applies to XML-escaping.
+ */
+#define AV_ESCAPE_FLAG_ESCAPE_SINGLE_QUOTE (1 << 3)
+#define AV_ESCAPE_FLAG_ESCAPE_DOUBLE_QUOTE (1 << 4)
+
+/**
+ * Replace invalid UTF-8 characters with a U+FFFD REPLACEMENT CHARACTER, escaped
+ * if AV_ESCAPE_FLAG_NON_ASCII is set.
+ * This only applies to XML-escaping.
+ */
+#define AV_ESCAPE_FLAG_REPLACE_INVALID_SEQUENCES (1 << 5)
+
+/**
+ * Replace invalid UTF-8 sequences with a '?'; this takes precedence over
+ * AV_ESCAPE_FLAG_REPLACE_INVALID_SEQUENCES. This only applies to XML-escaping.
+ */
+#define AV_ESCAPE_FLAG_REPLACE_INVALID_ASCII (1 << 6)
+
+/**
  * Escape string in src, and put the escaped string in an allocated
  * string in *dst, which must be freed with av_free().
  *
diff --git a/libavutil/bprint.c b/libavutil/bprint.c
index 2f059c5..c6b9919 100644
--- a/libavutil/bprint.c
+++ b/libavutil/bprint.c
@@ -271,6 +271,49 @@  void av_bprint_escape(AVBPrint *dstbuf, const char *src, const char *special_cha
         mode = AV_ESCAPE_MODE_BACKSLASH; /* TODO: implement a heuristic */
 
     switch (mode) {
+    case AV_ESCAPE_MODE_XML:
+        /* &;-escape characters */
+        while (*src) {
+            uint8_t tmp;
+            uint32_t cp;
+            const char *src1 = src;
+            GET_UTF8(cp, (uint8_t)*src++, goto err;);
+
+            if ((cp < 0xFF &&
+                 ((special_chars && strchr(special_chars, cp)) ||
+                  (flags & AV_ESCAPE_FLAG_WHITESPACE) && strchr(WHITESPACES, cp))) ||
+                (!(flags & AV_ESCAPE_FLAG_STRICT) &&
+                 (cp == '&' || cp == '<' || cp == '>')) ||
+                ((flags & AV_ESCAPE_FLAG_ESCAPE_SINGLE_QUOTE) && cp == '\'') ||
+                ((flags & AV_ESCAPE_FLAG_ESCAPE_DOUBLE_QUOTE) && cp == '"') ||
+                ((flags & AV_ESCAPE_FLAG_NON_ASCII) && (cp < 0x20 || cp > 0x7e))) {
+                switch (cp) {
+                case '&' : av_bprintf(dstbuf, "&amp;");  break;
+                case '<' : av_bprintf(dstbuf, "&lt;");   break;
+                case '>' : av_bprintf(dstbuf, "&gt;");   break;
+                case '"' : av_bprintf(dstbuf, "&quot;"); break;
+                case '\'': av_bprintf(dstbuf, "&apos;"); break;
+                default:   av_bprintf(dstbuf, "&#x%"PRIx32";", cp); break;
+                }
+            } else {
+                PUT_UTF8(cp, tmp, av_bprint_chars(dstbuf, tmp, 1);)
+            }
+            continue;
+        err:
+            if (flags & AV_ESCAPE_FLAG_REPLACE_INVALID_ASCII) {
+                av_bprint_chars(dstbuf, '?', 1);
+            } else if (flags & AV_ESCAPE_FLAG_REPLACE_INVALID_SEQUENCES) {
+                if (flags & AV_ESCAPE_FLAG_NON_ASCII)
+                    av_bprintf(dstbuf, "\xEF\xBF\xBD");
+                else
+                    av_bprintf(dstbuf, "&#xfffd;");
+            } else {
+                while (src1 < src)
+                    av_bprint_chars(dstbuf, *src1++, 1);
+            }
+        }
+        break;
+
     case AV_ESCAPE_MODE_QUOTE:
         /* enclose the string between '' */
         av_bprint_chars(dstbuf, '\'', 1);