diff mbox series

[FFmpeg-devel,WIP] libavutil: add a limited XML parser

Message ID 20220506141511.112227-1-george@nsup.org
State New
Headers show
Series [FFmpeg-devel,WIP] libavutil: add a limited XML parser | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Nicolas George May 6, 2022, 2:15 p.m. UTC
Signed-off-by: Nicolas George <george@nsup.org>
---
 libavutil/Makefile          |   3 +
 libavutil/tests/xmlparser.c |  93 ++++++
 libavutil/xmlparser.c       | 644 ++++++++++++++++++++++++++++++++++++
 libavutil/xmlparser.h       |  58 ++++
 4 files changed, 798 insertions(+)
 create mode 100644 libavutil/tests/xmlparser.c
 create mode 100644 libavutil/xmlparser.c
 create mode 100644 libavutil/xmlparser.h


I promised this quite some time ago; I finally got around to working
through the first block and making something. It is still a
work in progress, but I will have less time to work on it in the coming
weeks, so I might as well put it out there.

Currently, it can parse an XML file without declarations or comments
into events. The goal is to make it good enough to parse the files we
actually encounter in the field of multimedia, so that FFmpeg can do its
work without libxml2.

TODO:

- byte order mark
- <?xml...?>
- comments
- doctype declaration
- check matching begin/end tags
- freeing memory
- buffer compaction
- extensive testing
- use it in FFmpeg

If there are XML files you want me to make sure the parser can digest,
please send them to me.

If there are features you want me to make sure are possible, please let
me know.

If there are FFmpeg components that you want me to port to using this
API, please tell me precisely how I can test them. The real case I
intend to implement first is matroskachapters.dtd.

I think libavutil also needs an XML writer. And a JSON writer. Possibly
others. I want to work on it. But I refuse to do it unless we have a
good API to return the result: always returning malloc'ed buffers or a
similar obvious solution is wasteful and ugly. So please reply to this:
https://ffmpeg.org/pipermail/ffmpeg-devel/2021-December/290226.html

Regards,
diff mbox series

Patch

diff --git a/libavutil/Makefile b/libavutil/Makefile
index 81df3b0640..7dcf308e4a 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -84,6 +84,7 @@  HEADERS = adler32.h                                                     \
           twofish.h                                                     \
           version.h                                                     \
           video_enc_params.h                                            \
+          xmlparser.h                                                   \
           xtea.h                                                        \
           tea.h                                                         \
           tx.h                                                          \
@@ -168,6 +169,7 @@  OBJS = adler32.o                                                        \
        twofish.o                                                        \
        utils.o                                                          \
        xga_font_data.o                                                  \
+       xmlparser.o                                                      \
        xtea.o                                                           \
        tea.o                                                            \
        tx.o                                                             \
@@ -258,6 +260,7 @@  TESTPROGS = adler32                                                     \
             tree                                                        \
             twofish                                                     \
             utf8                                                        \
+            xmlparser                                                   \
             xtea                                                        \
             tea                                                         \
 
diff --git a/libavutil/tests/xmlparser.c b/libavutil/tests/xmlparser.c
new file mode 100644
index 0000000000..f6823b4074
--- /dev/null
+++ b/libavutil/tests/xmlparser.c
@@ -0,0 +1,93 @@ 
+/*
+ * Copyright (c) 2022 Nicolas George
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "libavutil/error.h"
+#include "libavutil/xmlparser.h"
+
+/**
+ * Print len bytes of text to stdout, writing bytes below 32 as \xNN
+ * escapes and every other byte unchanged.
+ */
+static void dump_text(const uint8_t *text, size_t len)
+{
+    size_t pos = 0;
+
+    while (pos < len) {
+        if (text[pos] < 32)
+            printf("\\x%02x", text[pos]);
+        else
+            printf("%c", text[pos]);
+        pos++;
+    }
+}
+
+/**
+ * Test driver: read a document from standard input, feed it to the parser
+ * in small chunks, then print every event indented by element depth.
+ *
+ * @return 0 on success; exits with status 1 if the parser reports an error
+ */
+int main(int argc, char **argv)
+{
+    AVXMLParser *xp;
+    AVXMLPEvent *ev;
+    uint8_t buf[20]; /* same element type as av_xmlp_add_data() expects */
+    size_t r;
+    int ret;
+    unsigned depth = 0;
+
+    ret = av_xmlp_alloc(&xp);
+    if (ret < 0) {
+        fprintf(stderr, "Failed to allocate parser: %s\n", av_err2str(ret));
+        exit(1);
+    }
+    /* Feed the input; the final call with r == 0 signals the end of input. */
+    while (1) {
+        r = fread(buf, 1, sizeof(buf), stdin);
+        ret = av_xmlp_add_data(xp, buf, r);
+        if (ret < 0) {
+            fprintf(stderr, "Failed to add data: %s\n", av_err2str(ret));
+            exit(1);
+        }
+        if (r == 0)
+            break;
+    }
+    /* Drain and print the events. */
+    while (1) {
+        ret = av_xmlp_get_event(xp, &ev);
+        if (ret == AVERROR_EOF)
+            break;
+        if (ret < 0) {
+            fprintf(stderr, "Error: %s\n", av_err2str(ret));
+            exit(1);
+        }
+        /* The '*' field width must be an int. */
+        printf("%*s", (int)(depth * 2), "");
+        switch (ev->type) {
+        case AV_XMLP_EV_EL_START:
+            printf("<%s>\n", ev->name);
+            depth++;
+            break;
+        case AV_XMLP_EV_EL_END:
+            depth--;
+            printf("</%s>\n", ev->name);
+            break;
+        case AV_XMLP_EV_EL_EMPTY:
+            printf("<%s/>\n", ev->name);
+            break;
+        case AV_XMLP_EV_ATTR:
+            printf("%s=", ev->name);
+            dump_text(ev->text, ev->text_len);
+            printf("\n");
+            break;
+        case AV_XMLP_EV_TEXT:
+            printf("\"");
+            dump_text(ev->text, ev->text_len);
+            printf("\"\n");
+            break;
+        }
+    }
+    return 0;
+}
diff --git a/libavutil/xmlparser.c b/libavutil/xmlparser.c
new file mode 100644
index 0000000000..87ef12510f
--- /dev/null
+++ b/libavutil/xmlparser.c
@@ -0,0 +1,644 @@ 
+/*
+ * Copyright (c) 2022 Nicolas George
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <string.h>
+
+#include "avassert.h"
+#include "common.h"
+#include "error.h"
+#include "log.h"
+#include "mem.h"
+
+#include "xmlparser.h"
+
+typedef enum Context { /* syntactic position of the tokenizer */
+    CONT_START,     /* not referenced in the visible code */
+    CONT_PROLOG,    /* not referenced in the visible code */
+    CONT_BASE,      /* element content; also the context set by av_xmlp_alloc() */
+    CONT_STAG,      /* inside a start tag, after its name */
+    CONT_ETAG,      /* inside an end tag */
+    CONT_ATTRIBUTE, /* after an attribute name: '=' then a quoted value */
+    CONT_AFTER,     /* after the end tag that brought depth back to 0 */
+} Context;
+
+typedef enum TokenType { /* kind of a token stored in the buffer */
+    TOK_STAG,  /* start tag; body is the element name */
+    TOK_ETAG,  /* end tag; body is the element name */
+    TOK_SETAG, /* empty-element tag ("<name/>") */
+    TOK_ATTR,  /* attribute; body is the name, a 0 byte, then the value */
+    TOK_TEXT,  /* character data */
+} TokenType;
+
+struct AVXMLParser {
+    int (*add_octet)(AVXMLParser *xp, unsigned c); /* input octet handler; receives (unsigned)-1 at end of input */
+    uint8_t *buf; /* token buffer */
+    const AVXMLPEntities *entities; /* table of named entities */
+    uint64_t in_offset; /* number of input octets accepted so far */
+    size_t buf_size; /* size of buf in bytes, as read by buf_grow() */
+    size_t buf_head; /* offset just past the last token written */
+    size_t buf_tail; /* offset of the next token av_xmlp_get_event() returns */
+    size_t buf_cur_token; /* offset of the token being built */
+    Context context;
+    int parse_error; /* AVERROR_INVALIDDATA once the input was found malformed */
+    int run_error; /* runtime failure such as AVERROR(ENOMEM) */
+    unsigned have_expect; /* if non-zero, the only character accepted next */
+    unsigned u8_codepoint; /* UTF-8 code point being assembled */
+
+    unsigned depth; /* number of currently open elements */
+    unsigned entity; /* value of the numeric character reference being read */
+    uint8_t u8_len; /* UTF-8 continuation octets still expected */
+    uint8_t have_quote; /* quote character of the attribute value being read, or 0 */
+    unsigned short ent_first; /* first candidate named entity (included) */
+    unsigned short ent_last; /* last candidate named entity (excluded) */
+    unsigned short have_amp_name; /* 1 + entity name characters matched; 0 outside a named reference */
+    unsigned have_lt : 1; /* '<' seen, kind of tag not known yet */
+    unsigned have_equal : 1; /* '=' seen after an attribute name */
+    unsigned have_name : 1; /* reading name characters into the current token */
+    unsigned have_amp : 1; /* '&' seen */
+    unsigned have_amp_num : 1; /* "&#" seen */
+    unsigned have_amp_dec : 1; /* reading a decimal character reference */
+    unsigned have_amp_hex : 1; /* reading a hexadecimal character reference */
+    unsigned have_text : 1; /* reading text or an attribute value */
+    unsigned eof : 1; /* end of input accepted */
+    AVXMLPEvent event; /* storage for the event returned by av_xmlp_get_event() */
+};
+
+typedef struct Entity Entity;
+
+struct Entity { /* one named entity */
+    const char *name; /* name as written between '&' and ';' */
+    const char *text; /* replacement text, 0-terminated */
+};
+
+struct AVXMLPEntities { /* table of named entities */
+    unsigned nb; /* number of entries */
+    Entity entities[0]; /* NOTE(review): entity_get() does not index this member; it uses the offset of predefined_entities.array */
+};
+
+/* https://www.w3.org/TR/xml/#sec-predefined-ent */
+static const struct { /* entity table installed by av_xmlp_alloc() */
+    AVXMLPEntities entities; /* header holding the entry count */
+    Entity array[5]; /* the entries; entity_get() finds them through this member's offset */
+} predefined_entities = {
+    .entities.nb = 5,
+    .array = {
+        /* must be sorted: the lookup in add_char() narrows a range of names */
+        { "amp",   "&"  },
+        { "apos",  "'"  },
+        { "gt",    ">"  },
+        { "lt",    "<"  },
+        { "quot",  "\"" },
+    },
+};
+
+/* Type of each token's size field and rounding unit. Can be changed to size_t if we need. */
+typedef unsigned toksize;
+
+/**
+ * Mark the input as malformed: store AVERROR_INVALIDDATA in parse_error
+ * and log the current input offset.
+ *
+ * @return always 0; the parse error is reported through xp->parse_error
+ */
+static int set_error(AVXMLParser *xp)
+{
+    xp->parse_error = AVERROR_INVALIDDATA;
+    av_log(0, 16, "set_error() at %"PRIu64"\n", xp->in_offset);
+    return 0;
+}
+
+/**
+ * Record a runtime error in the parser and log it.
+ *
+ * @param err error code to store in xp->run_error
+ * @return err
+ */
+static int set_run_error(AVXMLParser *xp, int err)
+{
+    xp->run_error = err;
+    av_log(0, 16, "set_run_error(%s)\n", av_err2str(err));
+    return xp->run_error;
+}
+
+/**
+ * Tell whether a <= c <= b, using a single unsigned comparison.
+ *
+ * @return 1 if c lies in the inclusive range [a, b], 0 otherwise
+ */
+static int between(unsigned c, unsigned a, unsigned b)
+{
+    unsigned span   = b - a;
+    unsigned offset = c - a; /* wraps to a huge value when c < a */
+
+    return offset <= span;
+}
+
+/**
+ * Tell whether a code point is XML white space.
+ * https://www.w3.org/TR/xml/#NT-S
+ *
+ * Production S is (#x20 | #x9 | #xD | #xA): space, tab, carriage return
+ * and line feed.
+ *
+ * @return non-zero if c is white space, 0 otherwise
+ */
+static int char_is_space(unsigned c)
+{
+    return c == ' ' || c == '\t' || c == '\r' || c == '\n';
+}
+
+/**
+ * Tell whether a code point may start an XML name.
+ * https://www.w3.org/TR/xml/#NT-NameStartChar
+ *
+ * @return 1 if c is a NameStartChar, 0 otherwise
+ */
+static int char_is_name_initial(unsigned c)
+{
+    /* Inclusive code point ranges accepted as the first character. */
+    static const unsigned ranges[][2] = {
+        { ':',     ':'     },
+        { '_',     '_'     },
+        { 'A',     'Z'     },
+        { 'a',     'z'     },
+        { 0xC0,    0xD6    },
+        { 0xD8,    0xF6    },
+        { 0xF8,    0x2FF   },
+        { 0x370,   0x37D   },
+        { 0x37F,   0x1FFF  },
+        { 0x200C,  0x200D  },
+        { 0x2070,  0x218F  },
+        { 0x2C00,  0x2FEF  },
+        { 0x3001,  0xD7FF  },
+        { 0xF900,  0xFDCF  },
+        { 0xFDF0,  0xFFFD  },
+        { 0x10000, 0xEFFFF },
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
+        if (between(c, ranges[i][0], ranges[i][1]))
+            return 1;
+    return 0;
+}
+
+/**
+ * Tell whether a code point may appear inside an XML name.
+ * https://www.w3.org/TR/xml/#NT-Name
+ *
+ * @return 1 if c is a NameChar, 0 otherwise
+ */
+static int char_is_name(unsigned c)
+{
+    /* Everything that can start a name can also continue it. */
+    if (char_is_name_initial(c))
+        return 1;
+    switch (c) {
+    case '-':
+    case '.':
+    case 0xB7:
+        return 1;
+    }
+    return between(c, '0', '9') ||
+           between(c, 0x0300, 0x036F) ||
+           between(c, 0x203F, 0x2040);
+}
+
+/**
+ * Make sure the token buffer can hold add bytes starting at offset ref,
+ * plus rounding to a multiple of sizeof(toksize).
+ *
+ * The allocated size is recorded in xp->buf_size so that later calls only
+ * reallocate when the buffer is actually too small.
+ *
+ * @param ref offset in the buffer where the data will be written
+ * @param add number of bytes needed at that offset
+ * @return 0 on success, the stored run error if one is already set,
+ *         AVERROR(ENOMEM) on overflow or allocation failure
+ */
+static int buf_grow(AVXMLParser *xp, size_t ref, size_t add)
+{
+    /* Both sizes are counted in units of sizeof(toksize) below. */
+    size_t new_size = xp->buf_size / sizeof(toksize);
+    size_t min_size = ref + add + sizeof(toksize) - 1;
+    uint8_t *new_buf;
+
+    if (xp->run_error)
+        return xp->run_error;
+    if (min_size < ref)
+        return set_run_error(xp, AVERROR(ENOMEM));
+    min_size /= sizeof(toksize);
+    /* Already large enough: nothing to do. */
+    if (new_size >= min_size)
+        return 0;
+    while (new_size < min_size) {
+        if (new_size > SIZE_MAX / sizeof(toksize) / 2)
+            return set_run_error(xp, AVERROR(ENOMEM));
+        new_size = ((new_size | 511) << 1) + 1;
+    }
+    new_size *= sizeof(toksize);
+    new_buf = av_realloc(xp->buf, new_size);
+    if (!new_buf)
+        return set_run_error(xp, AVERROR(ENOMEM));
+    xp->buf = new_buf;
+    xp->buf_size = new_size;
+    return 0;
+}
+
+/**
+ * Get a pointer to the size field of the token stored at offset off.
+ */
+static toksize *token_at_size(AVXMLParser *xp, size_t off)
+{
+    uint8_t *token = xp->buf + off;
+
+    return (toksize *)token;
+}
+
+/**
+ * Get a pointer to the type byte of the token stored at offset off; it
+ * directly follows the size field.
+ */
+static uint8_t *token_at_type(AVXMLParser *xp, size_t off)
+{
+    uint8_t *token = xp->buf + off;
+
+    return token + sizeof(toksize);
+}
+
+/**
+ * Get a pointer to the body of the token stored at offset off; it
+ * follows the size field and the type byte.
+ */
+static uint8_t *token_at_body(AVXMLParser *xp, size_t off)
+{
+    uint8_t *token = xp->buf + off;
+
+    return token + sizeof(toksize) + 1;
+}
+
+#define cur_token_size (*token_at_size(xp, xp->buf_cur_token)) /* size field of the token being built; expects a variable named xp */
+#define cur_token_type (*token_at_type(xp, xp->buf_cur_token)) /* type byte of the token being built */
+#define cur_token_body ( token_at_body(xp, xp->buf_cur_token)) /* pointer to the body of the token being built */
+
+/**
+ * Compute the offset of the token that follows the one stored at off.
+ *
+ * A token occupies its size field, one type byte, the body and a
+ * terminating 0, rounded up to a multiple of sizeof(toksize).
+ */
+static size_t token_at_next(AVXMLParser *xp, size_t off)
+{
+    const size_t unit = sizeof(toksize);
+    /* size + type + body + 0 + rounding */
+    size_t span = 2 * unit + 1 + *token_at_size(xp, off);
+
+    return off + span / unit * unit;
+}
+
+/**
+ * Close the token being built, if any: the current-token offset is moved
+ * to the head of the buffer, which makes the token visible to
+ * av_xmlp_get_event().
+ */
+static void token_end(AVXMLParser *xp)
+{
+    if (xp->buf_cur_token != xp->buf_head) {
+        /* The head must sit exactly at the end of the open token. */
+        av_assert0(xp->buf_head == token_at_next(xp, xp->buf_cur_token));
+        xp->buf_cur_token = xp->buf_head;
+    }
+}
+
+/**
+ * Close the current token and start a new, empty one of the given type at
+ * the head of the buffer.
+ *
+ * @return 0 on success, the stored run error otherwise
+ */
+static int token_new(AVXMLParser *xp, TokenType type)
+{
+    buf_grow(xp, xp->buf_head, sizeof(toksize) + 2);
+    if (!xp->run_error) {
+        token_end(xp);
+        /* An empty token takes two units: size, then type + 0 + padding. */
+        xp->buf_head += 2 * sizeof(toksize);
+        cur_token_body[0] = 0;
+        cur_token_type = type;
+        cur_token_size = 0;
+    }
+    return xp->run_error;
+}
+
+/**
+ * Make room for add more bytes, plus the terminating 0, at the end of the
+ * body of the current token.
+ *
+ * @return the result of buf_grow()
+ */
+static int token_grow(AVXMLParser *xp, toksize add)
+{
+    /* TODO check for overflow for toksize */
+    uint8_t *body_end = cur_token_body + cur_token_size;
+
+    return buf_grow(xp, body_end - xp->buf, add + 1);
+}
+
+/**
+ * Append code point c, encoded with PUT_UTF8, to the body of the current
+ * token, keep the body 0-terminated and move the buffer head accordingly.
+ *
+ * @return 0 on success, the stored run error otherwise
+ */
+static int token_add_char(AVXMLParser *xp, unsigned c)
+{
+    uint8_t *start, *end, octet;
+
+    /* TODO must be UTF-8! */
+    token_grow(xp, 6);
+    if (xp->run_error)
+        return xp->run_error;
+    start = cur_token_body + cur_token_size;
+    end   = start;
+    PUT_UTF8(c, octet, *(end++) = octet;);
+    *end = 0;
+    cur_token_size += end - start;
+    xp->buf_head = token_at_next(xp, xp->buf_cur_token);
+    return 0;
+}
+
+/**
+ * Get entry idx of an entity table.
+ *
+ * The entries are located at the same byte offset from the table header
+ * as predefined_entities.array is from predefined_entities.
+ */
+static const Entity *entity_get(const AVXMLPEntities *entities, unsigned idx)
+{
+    ptrdiff_t array_off = (char *)&predefined_entities.array -
+                          (char *)&predefined_entities;
+    const Entity *array;
+
+    av_assert0(idx < entities->nb);
+    array = (const Entity *)((char *)entities + array_off);
+    return &array[idx];
+}
+
+#include <stdio.h>
+
+/**
+ * Debugging helper: print every token stored in the buffer, from offset 0
+ * up to the buffer head, with its offset, type, size and escaped body.
+ */
+static void dump_tokens(AVXMLParser *xp)
+{
+    size_t off = 0;
+    toksize size, i;
+    uint8_t *body;
+
+    printf("Tokens:\n");
+    while (off < xp->buf_head) {
+        size = *token_at_size(xp, off);
+        body = token_at_body(xp, off);
+        /* size_t values are unsigned: print them with %zu, not %zd. */
+        printf("  at %zu: [0x%02x, %zu] \"", off, *token_at_type(xp, off), (size_t)size);
+        for (i = 0; i < size; i++) {
+            if (body[i] < 32 || body[i] == '"')
+                printf("\\x%02x", body[i]);
+            else
+                fputc(body[i], stdout);
+        }
+        printf("\"\n");
+        off = token_at_next(xp, off);
+    }
+    /* Walking token by token must land exactly on the head. */
+    av_assert0(off == xp->buf_head);
+    printf("  .\n");
+}
+
+static int add_char(AVXMLParser *xp, unsigned c) /* feed one decoded code point to the tokenizer state machine */
+{
+    if (xp->parse_error || xp->run_error) /* errors are sticky: further input is ignored */
+        return xp->run_error;
+    while (1) {
+        /* return   → case handled
+           continue → try again in the new state
+           break    → error */
+
+        if (0) { /* cosmetic */
+        } else if (xp->have_expect) { /* exactly one specific character is allowed here */
+            if (c != xp->have_expect)
+                return set_error(xp);
+            xp->have_expect = 0;
+            return xp->run_error;
+
+        } else if (xp->have_lt) { /* just after '<': decide between start tag and end tag */
+            av_assert0(xp->context == CONT_BASE);
+            if (char_is_name_initial(c)) {
+                xp->have_lt = 0;
+                token_new(xp, TOK_STAG);
+                token_add_char(xp, c);
+                xp->context = CONT_STAG;
+                xp->have_name = 1;
+                return xp->run_error;
+            }
+            if (c == '/') {
+                xp->have_lt = 0;
+                token_new(xp, TOK_ETAG);
+                xp->context = CONT_ETAG;
+                /* This will accept tags that begin with a non-initial
+                   character, but then it will not match the start tag. */
+                xp->have_name = 1;
+                return xp->run_error;
+            }
+
+        } else if (xp->have_name) { /* accumulating a name into the current token */
+            if (char_is_name(c)) {
+                token_add_char(xp, c);
+                return xp->run_error;
+            }
+            xp->have_name = 0; /* name finished: reprocess c in the enclosing context */
+            continue;
+
+        } else if (xp->have_amp) { /* just after '&': numeric or named reference */
+            xp->have_amp = 0;
+            xp->entity = 0;
+            if (c == '#') {
+                xp->have_amp_num = 1;
+                return xp->run_error;
+            }
+            xp->ent_first = 0;
+            xp->ent_last = xp->entities->nb;
+            xp->have_amp_name = 1;
+            continue;
+
+        } else if (xp->have_amp_num) { /* just after "&#": hexadecimal if 'x' follows, decimal otherwise */
+            xp->have_amp_num = 0;
+            if (c == 'x') {
+                xp->have_amp_hex = 1;
+                return xp->run_error;
+            }
+            xp->have_amp_dec = 1;
+            continue;
+
+        } else if (xp->have_amp_dec) { /* decimal digits until ';' */
+            if (between(c, '0', '9')) {
+                xp->entity = xp->entity * 10 + (c - '0');
+                return xp->run_error;
+            } else if (c == ';') {
+                xp->have_amp_dec = 0;
+                token_add_char(xp, xp->entity);
+                return xp->run_error;
+            }
+
+        } else if (xp->have_amp_name) { /* matching a named entity, one character at a time */
+            av_log(0, 16, "got %c at %d\n", c, xp->have_amp_name - 1);
+            if (c == ';') {
+                const Entity *e = entity_get(xp->entities, xp->ent_first);
+                const char *p;
+                if (e->name[xp->have_amp_name - 1]) /* candidate name is longer than what was read */
+                    return set_error(xp);
+                xp->have_amp_name = 0;
+                for (p = e->text; *p; p++)
+                    token_add_char(xp, *p);
+                return xp->run_error;
+            }
+            /* TODO use a dichotomy */
+            /* All entities between ent_first (included) and ent_last
+               (excluded) begin with the current have_amp_name-1 characters. */
+            while (xp->ent_first < xp->ent_last) {
+                const Entity *e = entity_get(xp->entities, xp->ent_first);
+                if (e->name[xp->have_amp_name - 1] >= c)
+                    break;
+                xp->ent_first++;
+            }
+            av_log(0, 16, "for %c, between %d and %d\n", c, xp->ent_first, xp->ent_last);
+            while (xp->ent_last > xp->ent_first) {
+                const Entity *e = entity_get(xp->entities, xp->ent_last - 1);
+                if (e->name[xp->have_amp_name - 1] <= c)
+                    break;
+                xp->ent_last--;
+            }
+            av_log(0, 16, "... between %d and %d\n", xp->ent_first, xp->ent_last);
+            if (xp->ent_last == xp->ent_first) /* no entity name matches */
+                return set_error(xp);
+            xp->have_amp_name++;
+            return xp->run_error;
+
+        } else if (xp->have_amp_hex) { /* hexadecimal digits until ';' */
+            if (between(c, '0', '9')) {
+                xp->entity = xp->entity * 16 + (c - '0');
+                return xp->run_error;
+            } else if (between(c & ~('a' - 'A'), 'A', 'F')) { /* clearing the case bit folds a-f onto A-F */
+                xp->entity = xp->entity * 16 + ((c & ~('a' - 'A')) - 'A' + 10);
+                return xp->run_error;
+            } else if (c == ';') {
+                xp->have_amp_hex = 0;
+                token_add_char(xp, xp->entity);
+                return xp->run_error;
+            }
+
+        } else if (xp->have_text) { /* inside character data or an attribute value */
+            if (c == '<' || (xp->have_quote && c == xp->have_quote)) {
+                xp->have_text = 0;
+                continue;
+            }
+            if (c == '&') {
+                xp->have_amp = 1;
+                return xp->run_error;
+            }
+            token_add_char(xp, c);
+            return xp->run_error;
+
+        } else if (xp->context == CONT_BASE) {
+            if (c == '<') {
+                xp->have_lt = 1;
+                return xp->run_error;
+            }
+            token_new(xp, TOK_TEXT);
+            xp->have_text = 1;
+            continue;
+
+        } else if (xp->context == CONT_STAG) {
+            /* https://www.w3.org/TR/xml/#NT-element */
+            if (char_is_space(c))
+                return xp->run_error;
+            if (char_is_name_initial(c)) {
+                /* TODO check for space before */
+                token_new(xp, TOK_ATTR);
+                token_add_char(xp, c);
+                xp->have_name = 1;
+                xp->context = CONT_ATTRIBUTE;
+                return xp->run_error;
+            }
+            if (c == '>') {
+                token_end(xp);
+                xp->depth++;
+                xp->context = CONT_BASE;
+                return xp->run_error;
+            }
+            if (c == '/') {
+                cur_token_type = TOK_SETAG;
+                token_end(xp);
+                xp->context = CONT_BASE;
+                xp->have_expect = '>';
+                return xp->run_error;
+            }
+
+        } else if (xp->context == CONT_ETAG) {
+            /* https://www.w3.org/TR/xml/#NT-ETag */
+            if (char_is_space(c))
+                return xp->run_error;
+            if (c == '>') {
+                /* Checking it matches the begin tag will be done when
+                   returning the events. */
+                token_end(xp);
+                xp->depth--;
+                xp->context = xp->depth ? CONT_BASE : CONT_AFTER;
+                return xp->run_error;
+            }
+
+        } else if (xp->context == CONT_ATTRIBUTE) {
+            if (!xp->have_equal) {
+                if (c == '=') {
+                    xp->have_equal = 1;
+                    return xp->run_error;
+                }
+            } else if (!xp->have_quote) {
+                if (c == '"' || c == '\'') {
+                    xp->have_quote = c;
+                    xp->have_text = 1;
+                    token_add_char(xp, 0); /* 0 byte separating the name from the value */
+                    return xp->run_error;
+                }
+            } else {
+                if (c == xp->have_quote) {
+                    xp->have_quote = 0;
+                    xp->have_equal = 0;
+                    xp->context = CONT_STAG;
+                    return xp->run_error;
+                }
+            }
+
+        } else if (xp->context == CONT_AFTER) { /* only white space is accepted here */
+            if (char_is_space(c))
+                return xp->run_error;
+        }
+        break;
+    }
+    av_log(0, 16, "unexpected case for %c\n", c);
+    return set_error(xp);
+}
+
+/**
+ * Handle the end of input: it is only valid in the CONT_AFTER context;
+ * anywhere else it is recorded as a parse error.
+ *
+ * @return xp->run_error, or the result of set_error() on a parse error
+ */
+static int add_eof(AVXMLParser *xp)
+{
+    if (!xp->parse_error && !xp->run_error) {
+        if (xp->context != CONT_AFTER)
+            return set_error(xp);
+        xp->eof = 1;
+    }
+    return xp->run_error;
+}
+
+/**
+ * Input handler for UTF-8: assemble octets into code points and pass each
+ * complete code point to add_char().
+ *
+ * @param c one input octet, or (unsigned)-1 to signal the end of input
+ * @return 0 while a sequence is incomplete, otherwise the result of
+ *         add_char(), add_eof() or set_error()
+ */
+static int add_octet_utf8(AVXMLParser *xp, unsigned c)
+{
+    unsigned len;
+
+    if (c == (unsigned)-1)
+        return add_eof(xp);
+
+    if (xp->u8_len) {
+        /* In the middle of a sequence: only 10xxxxxx is acceptable. */
+        if (c < 0x80 || c > 0xbf)
+            return set_error(xp);
+        xp->u8_len--;
+        xp->u8_codepoint |= (c & 0x3f) << (xp->u8_len * 6);
+        if (xp->u8_len)
+            return 0;
+        return add_char(xp, xp->u8_codepoint);
+    }
+
+    if (c < 0x80)
+        return add_char(xp, c);
+    /* A stray continuation octet, or 0xfe and above, cannot start a sequence. */
+    if (c < 0xc0 || c >= 0xfe)
+        return set_error(xp);
+
+    /* Number of continuation octets announced by the lead octet. */
+    if (c < 0xe0)
+        len = 1;
+    else if (c < 0xf0)
+        len = 2;
+    else if (c < 0xf8)
+        len = 3;
+    else if (c < 0xfc)
+        len = 4;
+    else
+        len = 5;
+    xp->u8_len = len;
+    xp->u8_codepoint = (c & (0x3f >> len)) << (6 * len);
+    return 0;
+}
+
+/**
+ * Allocate a zero-initialized parser that uses the predefined entities,
+ * starts in the CONT_BASE context and reads its input as UTF-8.
+ *
+ * @param rxp receives the new parser on success
+ * @return 0 on success, AVERROR(ENOMEM) if the allocation fails
+ */
+int av_xmlp_alloc(AVXMLParser **rxp)
+{
+    AVXMLParser *parser = av_calloc(1, sizeof(*parser));
+
+    if (!parser)
+        return AVERROR(ENOMEM);
+    parser->add_octet = add_octet_utf8; /* TODO probe, BOM */
+    parser->context   = CONT_BASE;
+    parser->entities  = &predefined_entities.entities;
+    *rxp = parser;
+    return 0;
+}
+
+/**
+ * Feed input octets to the parser; a call with size 0 signals the end of
+ * the input.
+ *
+ * Processing stops at the first parse or run error.
+ *
+ * @return xp->run_error if an error is or becomes set, 0 otherwise
+ */
+int av_xmlp_add_data(AVXMLParser *xp, uint8_t *buf, size_t size)
+{
+    size_t pos = 0;
+
+    if (xp->parse_error || xp->run_error)
+        return xp->run_error;
+    while (pos < size) {
+        xp->add_octet(xp, buf[pos++]);
+        if (xp->parse_error || xp->run_error)
+            return xp->run_error;
+        /* Only octets that were accepted are counted. */
+        xp->in_offset++;
+    }
+    if (size == 0) {
+        xp->add_octet(xp, -1);
+        if (xp->parse_error || xp->run_error)
+            return xp->run_error;
+    }
+    /* Disabled debugging aid. */
+    if (0 && size == 0)
+        dump_tokens(xp);
+    return 0;
+}
+
+/**
+ * Return the next completed token as an event.
+ *
+ * The event is stored inside the parser and its name and text pointers
+ * point into the token buffer.
+ *
+ * @param rev receives a pointer to the event on success
+ * @return 0 on success, the stored run or parse error if one is set,
+ *         AVERROR_EOF when the input ended and all tokens were returned,
+ *         AVERROR(EAGAIN) when no completed token is available yet
+ */
+int av_xmlp_get_event(AVXMLParser *xp, AVXMLPEvent **rev)
+{
+    AVXMLPEvent *ev = &xp->event;
+    size_t tok = xp->buf_tail;
+    toksize size;
+    TokenType type;
+    uint8_t *body;
+
+    if (xp->run_error < 0)
+        return xp->run_error;
+    if (xp->parse_error < 0)
+        return xp->parse_error;
+    /* The token being built is not returned until it is closed. */
+    if (tok == xp->buf_cur_token) {
+        if (xp->eof)
+            return AVERROR_EOF;
+        return AVERROR(EAGAIN);
+    }
+
+    size = *token_at_size(xp, tok);
+    type = *token_at_type(xp, tok);
+    body =  token_at_body(xp, tok);
+    memset(ev, 0, sizeof(*ev));
+    switch (type) {
+    case TOK_STAG:
+        ev->name = body;
+        ev->type = AV_XMLP_EV_EL_START;
+        break;
+
+    case TOK_ETAG:
+        /* TODO check consistency */
+        ev->name = body;
+        ev->type = AV_XMLP_EV_EL_END;
+        break;
+
+    case TOK_SETAG:
+        ev->name = body;
+        ev->type = AV_XMLP_EV_EL_EMPTY;
+        break;
+
+    case TOK_ATTR:
+        /* The body holds the name, a 0 byte, then the value. */
+        ev->name = body;
+        ev->text = body + strlen(body) + 1;
+        ev->text_len = body + size - ev->text;
+        ev->type = AV_XMLP_EV_ATTR;
+        break;
+
+    case TOK_TEXT:
+        ev->text = body;
+        ev->text_len = size;
+        ev->type = AV_XMLP_EV_TEXT;
+        break;
+
+    default:
+        av_assert0(!"reached");
+    }
+    xp->buf_tail = token_at_next(xp, tok);
+    *rev = ev;
+    return 0;
+}
diff --git a/libavutil/xmlparser.h b/libavutil/xmlparser.h
new file mode 100644
index 0000000000..f4fbf46ff3
--- /dev/null
+++ b/libavutil/xmlparser.h
@@ -0,0 +1,58 @@ 
+/*
+ * Limited XML parser
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_XMLPARSER_H
+#define AVUTIL_XMLPARSER_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct AVXMLParser AVXMLParser; /* parser state; the structure is not defined in this header */
+
+typedef enum AVXMLPEventType AVXMLPEventType;
+
+typedef struct AVXMLPEvent AVXMLPEvent;
+
+typedef struct AVXMLPEntities AVXMLPEntities; /* table of named entities; not defined in this header */
+
+enum AVXMLPEventType { /* kind of event returned by av_xmlp_get_event() */
+    AV_XMLP_EV_EL_START, /* start tag; name is set */
+    AV_XMLP_EV_EL_END,   /* end tag; name is set */
+    AV_XMLP_EV_EL_EMPTY, /* empty-element tag; name is set */
+    AV_XMLP_EV_ATTR,     /* attribute; name, text and text_len are set */
+    AV_XMLP_EV_TEXT,     /* character data; text and text_len are set */
+};
+
+struct AVXMLPEvent { /* one parsing event */
+    const uint8_t *name; /* element or attribute name, 0-terminated; NULL for text events */
+    const uint8_t *text; /* attribute value or character data; NULL for element events */
+    size_t text_len; /* length of text in bytes */
+    AVXMLPEventType type;
+};
+
+int av_xmlp_alloc(AVXMLParser **rxp); /* allocate a parser into *rxp; returns 0 or AVERROR(ENOMEM) */
+
+int av_xmlp_add_data(AVXMLParser *xp, uint8_t *buf, size_t size); /* feed size octets; size 0 signals the end of input */
+
+int av_xmlp_add_eof(AVXMLParser *xp); /* NOTE(review): no definition is visible in xmlparser.c in this patch */
+
+int av_xmlp_get_event(AVXMLParser *xp, AVXMLPEvent **ev); /* fetch the next event; AVERROR(EAGAIN) if none is ready, AVERROR_EOF at the end */
+
+#endif /* AVUTIL_XMLPARSER_H */