Message ID | 20170412071127.60511-2-rodger.combs@gmail.com |
---|---|
State | Withdrawn, archived |
Headers | show |
Le tridi 23 germinal, an CCXXV, Rodger Combs a écrit : > --- > libavutil/avstring.h | 28 ++++++++++++++++++++++++++++ > libavutil/bprint.c | 43 +++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 71 insertions(+) > > diff --git a/libavutil/avstring.h b/libavutil/avstring.h > index 04d2695640..68b753a569 100644 > --- a/libavutil/avstring.h > +++ b/libavutil/avstring.h > @@ -314,6 +314,7 @@ enum AVEscapeMode { > AV_ESCAPE_MODE_AUTO, ///< Use auto-selected escaping mode. > AV_ESCAPE_MODE_BACKSLASH, ///< Use backslash escaping. > AV_ESCAPE_MODE_QUOTE, ///< Use single-quote escaping. > + AV_ESCAPE_MODE_XML, ///< Use XML ampersand-escaping; requires UTF-8 input. > }; > > /** > @@ -334,6 +335,33 @@ enum AVEscapeMode { > #define AV_ESCAPE_FLAG_STRICT (1 << 1) > > /** > + * In addition to the provided list, escape all characters outside the range of > + * U+0020 to U+007E. > + * This only applies to XML-escaping. > + */ > +#define AV_ESCAPE_FLAG_NON_ASCII (1 << 2) > + > +/** > + * In addition to the provided list, escape single or double quotes. > + * This only applies to XML-escaping. > + */ I think this doxy comment, written like that, only applies to SINGLE. > +#define AV_ESCAPE_FLAG_ESCAPE_SINGLE_QUOTE (1 << 3) > +#define AV_ESCAPE_FLAG_ESCAPE_DOUBLE_QUOTE (1 << 4) Maybe also: #define AV_ESCAPE_FLAG_ESCAPE_QUOTES (AV_ESCAPE_FLAG_ESCAPE_SINGLE_QUOTE|AV_ESCAPE_FLAG_ESCAPE_DOUBLE_QUOTE) ? > + > +/** > + * Replace invalid UTF-8 characters with a U+FFFD REPLACEMENT CHARACTER, escaped > + * if AV_ESCAPE_FLAG_NON_ASCII is set. > + * This only applies to XML-escaping. > + */ > +#define AV_ESCAPE_FLAG_REPLACE_INVALID_SEQUENCES (1 << 5) > + > +/** > + * Replace invalid UTF-8 characters with a '?', overriding the previous flag. > + * This only applies to XML-escaping. > + */ > +#define AV_ESCAPE_FLAG_REPLACE_INVALID_ASCII (1 << 6) > + > +/** > * Escape string in src, and put the escaped string in an allocated > * string in *dst, which must be freed with av_free(). 
> * > diff --git a/libavutil/bprint.c b/libavutil/bprint.c > index 652775bef9..8e44c57346 100644 > --- a/libavutil/bprint.c > +++ b/libavutil/bprint.c > @@ -302,5 +302,48 @@ void av_bprint_escape(AVBPrint *dstbuf, const char *src, const char *special_cha > } > av_bprint_chars(dstbuf, '\'', 1); > break; > + > + case AV_ESCAPE_MODE_XML: > + /* &;-escape characters */ > + while (*src) { > + uint8_t tmp; > + uint32_t cp; > + const char *src1 = src; > + GET_UTF8(cp, (uint8_t)*src++, goto err;); > + > + if ((cp < 0xFF && > + ((special_chars && strchr(special_chars, cp)) || > + (flags & AV_ESCAPE_FLAG_WHITESPACE) && strchr(WHITESPACES, cp))) || > + (!(flags & AV_ESCAPE_FLAG_STRICT) && > + (cp == '&' || cp == '<' || cp == '>')) || & and < must always be encoded in XML. And > can sometimes be left alone, but so few people use it that I think it is not worth the effort. > + ((flags & AV_ESCAPE_FLAG_ESCAPE_SINGLE_QUOTE) && cp == '\'') || > + ((flags & AV_ESCAPE_FLAG_ESCAPE_DOUBLE_QUOTE) && cp == '"') || > + ((flags & AV_ESCAPE_FLAG_NON_ASCII) && (cp < 0x20 || cp > 0x7e))) { > + switch (cp) { > + case '&' : av_bprintf(dstbuf, "&amp;"); break; > + case '<' : av_bprintf(dstbuf, "&lt;"); break; > + case '>' : av_bprintf(dstbuf, "&gt;"); break; > + case '"' : av_bprintf(dstbuf, "&quot;"); break; > + case '\'': av_bprintf(dstbuf, "&apos;"); break; > + default: av_bprintf(dstbuf, "&#x%"PRIx32";", cp); break; > + } > + } else { > + PUT_UTF8(cp, tmp, av_bprint_chars(dstbuf, tmp, 1);) > + } > + continue; > + err: > + if (flags & AV_ESCAPE_FLAG_REPLACE_INVALID_ASCII) { > + av_bprint_chars(dstbuf, '?', 1); > + } else if (flags & AV_ESCAPE_FLAG_REPLACE_INVALID_SEQUENCES) { > + if (flags & AV_ESCAPE_FLAG_NON_ASCII) > + av_bprintf(dstbuf, "\xEF\xBF\xBD"); > + else > + av_bprintf(dstbuf, "&#xfffd;"); > + } else { > + while (src1 < src) > + av_bprint_chars(dstbuf, *src1++, 1); > + } > + } > + break; > } > } Regards,
diff --git a/libavutil/avstring.h b/libavutil/avstring.h index 04d2695640..68b753a569 100644 --- a/libavutil/avstring.h +++ b/libavutil/avstring.h @@ -314,6 +314,7 @@ enum AVEscapeMode { AV_ESCAPE_MODE_AUTO, ///< Use auto-selected escaping mode. AV_ESCAPE_MODE_BACKSLASH, ///< Use backslash escaping. AV_ESCAPE_MODE_QUOTE, ///< Use single-quote escaping. + AV_ESCAPE_MODE_XML, ///< Use XML ampersand-escaping; requires UTF-8 input. }; /** @@ -334,6 +335,33 @@ enum AVEscapeMode { #define AV_ESCAPE_FLAG_STRICT (1 << 1) /** + * In addition to the provided list, escape all characters outside the range of + * U+0020 to U+007E. + * This only applies to XML-escaping. + */ +#define AV_ESCAPE_FLAG_NON_ASCII (1 << 2) + +/** + * In addition to the provided list, escape single or double quotes. + * This only applies to XML-escaping. + */ +#define AV_ESCAPE_FLAG_ESCAPE_SINGLE_QUOTE (1 << 3) +#define AV_ESCAPE_FLAG_ESCAPE_DOUBLE_QUOTE (1 << 4) + +/** + * Replace invalid UTF-8 characters with a U+FFFD REPLACEMENT CHARACTER, escaped + * if AV_ESCAPE_FLAG_NON_ASCII is set. + * This only applies to XML-escaping. + */ +#define AV_ESCAPE_FLAG_REPLACE_INVALID_SEQUENCES (1 << 5) + +/** + * Replace invalid UTF-8 characters with a '?', overriding the previous flag. + * This only applies to XML-escaping. + */ +#define AV_ESCAPE_FLAG_REPLACE_INVALID_ASCII (1 << 6) + +/** * Escape string in src, and put the escaped string in an allocated * string in *dst, which must be freed with av_free(). 
* diff --git a/libavutil/bprint.c b/libavutil/bprint.c index 652775bef9..8e44c57346 100644 --- a/libavutil/bprint.c +++ b/libavutil/bprint.c @@ -302,5 +302,48 @@ void av_bprint_escape(AVBPrint *dstbuf, const char *src, const char *special_cha } av_bprint_chars(dstbuf, '\'', 1); break; + + case AV_ESCAPE_MODE_XML: + /* &;-escape characters */ + while (*src) { + uint8_t tmp; + uint32_t cp; + const char *src1 = src; + GET_UTF8(cp, (uint8_t)*src++, goto err;); + + if ((cp < 0xFF && + ((special_chars && strchr(special_chars, cp)) || + (flags & AV_ESCAPE_FLAG_WHITESPACE) && strchr(WHITESPACES, cp))) || + (!(flags & AV_ESCAPE_FLAG_STRICT) && + (cp == '&' || cp == '<' || cp == '>')) || + ((flags & AV_ESCAPE_FLAG_ESCAPE_SINGLE_QUOTE) && cp == '\'') || + ((flags & AV_ESCAPE_FLAG_ESCAPE_DOUBLE_QUOTE) && cp == '"') || + ((flags & AV_ESCAPE_FLAG_NON_ASCII) && (cp < 0x20 || cp > 0x7e))) { + switch (cp) { + case '&' : av_bprintf(dstbuf, "&amp;"); break; + case '<' : av_bprintf(dstbuf, "&lt;"); break; + case '>' : av_bprintf(dstbuf, "&gt;"); break; + case '"' : av_bprintf(dstbuf, "&quot;"); break; + case '\'': av_bprintf(dstbuf, "&apos;"); break; + default: av_bprintf(dstbuf, "&#x%"PRIx32";", cp); break; + } + } else { + PUT_UTF8(cp, tmp, av_bprint_chars(dstbuf, tmp, 1);) + } + continue; + err: + if (flags & AV_ESCAPE_FLAG_REPLACE_INVALID_ASCII) { + av_bprint_chars(dstbuf, '?', 1); + } else if (flags & AV_ESCAPE_FLAG_REPLACE_INVALID_SEQUENCES) { + if (flags & AV_ESCAPE_FLAG_NON_ASCII) + av_bprintf(dstbuf, "\xEF\xBF\xBD"); + else + av_bprintf(dstbuf, "&#xfffd;"); + } else { + while (src1 < src) + av_bprint_chars(dstbuf, *src1++, 1); + } + } + break; } }