diff mbox series

[FFmpeg-devel,2/2] lavf/url: rewrite ff_make_absolute_url() using ff_url_decompose().

Message ID 20200730151009.118835-2-george@nsup.org
State Superseded
Headers show
Series [FFmpeg-devel,1/2] lavf/url: add ff_url_decompose().
Related show

Checks

Context Check Description
andriy/default pending
andriy/make success Make finished
andriy/make_fate success Make fate finished

Commit Message

Nicolas George July 30, 2020, 3:10 p.m. UTC
Also add and update some tests.

Change the semantic a little, because for filesytem paths
symlinks complicate things.
See the comments in the code for detail.

Signed-off-by: Nicolas George <george@nsup.org>
---
 libavformat/tests/url.c |  17 ++-
 libavformat/url.c       | 246 +++++++++++++++++++---------------------
 libavformat/url.h       |   4 +-
 tests/ref/fate/url      |  13 ++-
 4 files changed, 145 insertions(+), 135 deletions(-)

Comments

Marton Balint Aug. 5, 2020, 7:58 a.m. UTC | #1
On Thu, 30 Jul 2020, Nicolas George wrote:

> Also add and update some tests.
>
> Change the semantic a little, because for filesytem paths
> symlinks complicate things.
> See the comments in the code for detail.
>
> Signed-off-by: Nicolas George <george@nsup.org>
> ---
> libavformat/tests/url.c |  17 ++-
> libavformat/url.c       | 246 +++++++++++++++++++---------------------
> libavformat/url.h       |   4 +-
> tests/ref/fate/url      |  13 ++-
> 4 files changed, 145 insertions(+), 135 deletions(-)
>

[...]

> diff --git a/libavformat/url.c b/libavformat/url.c
> index 26aaab4019..0c82317134 100644
> --- a/libavformat/url.c
> +++ b/libavformat/url.c
> @@ -27,6 +27,7 @@
> #if CONFIG_NETWORK
> #include "network.h"
> #endif
> +#include "libavutil/avassert.h"
> #include "libavutil/avstring.h"
> 
> /**
> @@ -152,146 +153,131 @@ int ff_url_decompose(URLComponents *uc, const char *url, const char *end)
>     return 0;
> }
> 
> -static void trim_double_dot_url(char *buf, const char *rel, int size)
> +static int append_path(char *root, char *out_end, char **rout,
> +                       const char *in, const char *in_end)
> {
> +    char *out = *rout;
> +    const char *d, *next;
> +
> +    if (in < in_end && *in == '/')
> +        in++; /* already taken care of */
> +    while (in < in_end) {
> +        d = find_delim("/", in, in_end);
> +        next = d + (d < in_end && *d == '/');
> +        if (d - in == 1 && in[0] == '.') {
> +            /* skip */
> +        } else if (d - in == 2 && in[0] == '.' && in[1] == '.') {
> +            av_assert1(out[-1] == '/');
> +            if (out - root > 1)
> +                while (out > root && (--out)[-1] != '/');
> +        } else {
> +            if (out_end - out < next - in)
> +                return AVERROR(ENOMEM);
> +            memcpy(out, in, next - in);

memmove, because out and in can overlap or they can be the same when
buf == base in ff_make_absolute_url()

> +            out += next - in;
> +        }
> +        in = next;
>     }
> -
> -    if (!av_stristart(p, "/", NULL) && root != rel)
> -        av_strlcat(tmp_path, "/", size);
> -
> -    av_strlcat(tmp_path, p, size);
> -    /* start set buf after temp path process. */
> -    av_strlcpy(buf, rel, root - rel + 1);
> -
> -    if (!av_stristart(tmp_path, "/", NULL) && root != rel)
> -        av_strlcat(buf, "/", size);
> -
> -    av_strlcat(buf, tmp_path, size);
> +    *rout = out;
> +    return 0;
> }
> 
> -void ff_make_absolute_url(char *buf, int size, const char *base,
> +int ff_make_absolute_url(char *buf, int size, const char *base,
>                           const char *rel)

A problem with the function type change is that previously the caller 
assumed that even if there is not enough space, buf will be a 0 terminated 
string. This guarantee is now lost, so all usage of 
ff_make_absolute_url should also be updated to check error code, otherwise 
crashes might occur. Or we could re-introduce that guarantee, at 
least temporarily, and change type and add checks in a later patch.

> {
> +    URLComponents ub, uc;
> +    char *out, *out_end, *path;
> +    const char *keep, *base_path_end;
> +    int use_base_path, simplify_path = 0, ret;
> +
> +    /* This is tricky.
> +       For HTTP, http://server/site/page + ../media/file
> +       should resolve into http://media/file
> +       but for filesystem access, dir/playlist + ../media/file
> +       should resolve into dir/../media/file
> +       because dir could be a symlink, and .. points to
> +       the actual parent of the target directory.
> +
> +       We'll consider that URLs with an actual scheme and authority,
> +       i.e. starting with scheme://, need parent dir simplification,
> +       while bare paths or pseudo-URLs starting with proto: without
> +       the double slash do not.
> +     */
> +
> +    if (!size)
> +        return AVERROR(ENOMEM);
> +    out = buf;
> +    out_end = buf + size - 1;
> +
> +    if ((ret = ff_url_decompose(&ub, base, NULL) < 0) ||
> +        (ret = ff_url_decompose(&uc, rel,  NULL) < 0))
> +        return ret;
> +
> +    keep = ub.url;
> +#define KEEP(component, also) do { \
> +        if (uc.url_component_end_##component == uc.url && \
> +            ub.url_component_end_##component > keep) { \
> +            keep = ub.url_component_end_##component; \
> +            also \
> +        } \
> +    } while (0)
> +    KEEP(scheme, );
> +    KEEP(authority_full, simplify_path = 1;);
> +    KEEP(path,);
> +    KEEP(query,);
> +    KEEP(fragment,);
> +#undef KEEP
> +#define COPY(start, end) do { \
> +        size_t len = end - start; \
> +        if (len > out_end - out) \
> +            return AVERROR(ENOMEM); \
> +        memcpy(out, start, len); \

memmove because buffers can overlap here as well.

> +        out += len; \
> +    } while (0)
> +    COPY(ub.url, keep);
> +    COPY(uc.url, uc.path);
> +    if (keep > ub.path)
> +        simplify_path = 0;
> +
> +    use_base_path = URL_COMPONENT_HAVE(ub, path) && keep <= ub.path;
> +    if (uc.path > uc.url)
> +        use_base_path = 0;
> +    if (URL_COMPONENT_HAVE(uc, scheme))
> +        simplify_path = 0;
> +    if (URL_COMPONENT_HAVE(uc, authority))
> +        simplify_path = 1;

These last two checks should be moved up for better readability (to set 
simplify_path in one section)

[...]


> diff --git a/libavformat/url.h b/libavformat/url.h
> index ae27da5c73..e33edf0cb9 100644
> --- a/libavformat/url.h
> +++ b/libavformat/url.h
> @@ -312,8 +312,8 @@ int ff_url_join(char *str, int size, const char 
> *proto,
>  * @param base the base url, may be equal to buf.
>  * @param rel the new url, which is interpreted relative to base
>  */
> -void ff_make_absolute_url(char *buf, int size, const char *base,
> -                          const char *rel);
> +int ff_make_absolute_url(char *buf, int size, const char *base,
> +                         const char *rel);

We should add a reference to the function docs to the relevant RFC:
https://tools.ietf.org/html/rfc3986#section-5

[...]

> diff --git a/tests/ref/fate/url b/tests/ref/fate/url
> index 84cf85abdd..7998fb5be9 100644
> --- a/tests/ref/fate/url
> +++ b/tests/ref/fate/url
> @@ -46,9 +46,9 @@ http://[::1]:8080/dev/null =>
> Testing ff_make_absolute_url:
>                                             (null) baz                  => baz
>                                           /foo/bar baz                  => /foo/baz
> -                                          /foo/bar ../baz               => /baz
> +                                          /foo/bar ../baz               => /foo/../baz
>                                           /foo/bar /baz                 => /baz
> -                                          /foo/bar ../../../baz         => /baz
> +                                          /foo/bar ../../../baz         => /foo/../../../baz
>                                 http://server/foo/ baz                  => http://server/foo/baz
>                              http://server/foo/bar baz                  => http://server/foo/baz
>                                 http://server/foo/ ../baz               => http://server/baz
> @@ -62,6 +62,15 @@ Testing ff_make_absolute_url:
>                              http://server/foo/bar /../../../../../other/url => http://server/other/url
>                              http://server/foo/bar /test/../../../../../other/url => http://server/other/url
>                              http://server/foo/bar /test/../../test/../../../other/url => http://server/other/url
> +                             http://server/foo/bar file:../baz/qux      => file:../baz/qux
> +                           http://server/foo//bar/ ../../               => http://server/foo/
> +                                   file:../tmp/foo ../bar/              => file:../tmp/../bar/
> +                                   file:../tmp/foo file:../bar/         => file:../bar/
> +                             http://server/foo/bar ./                   => http://server/foo/
> +                             http://server/foo/bar .dotfile             => http://server/foo/.dotfile
> +                             http://server/foo/bar ..doubledotfile      => http://server/foo/..doubledotfile
> +                             http://server/foo/bar double..dotfile      => http://server/foo/double..dotfile
> +                             http://server/foo/bar doubledotfile..      => http://server/foo/doubledotfile..

It would probably make sense to add all tests which are in
https://tools.ietf.org/html/rfc3986#section-5.4

I did a quick check manually, and found one failing test:

http://a/b/c/d;p?q //g                  => http://g/

but it supposed to be "http://g". Not that it matters too much...

Good work overall, thanks!

Regards,
Marton
Nicolas George Aug. 5, 2020, 4:53 p.m. UTC | #2
Marton Balint (12020-08-05):
> memmove, because out and in can overlap or they can be the same when
> buf == base in ff_make_absolute_url()

Good catch. Fixed.

> A problem with the function type change is that previously the caller
> assumed that even if there is not enough space, buf will be a 0 terminated
> string. This guarantee is now lost, so all usage of ff_make_absolute_url
> should also be updated to check error code, otherwise crashes might occur.
> Or we could re-introduce that guarantee, at least temporarily, and change
> type and add checks in a later patch.

I do not like the idea of delaying the error check, but the invariant
must be preserved. I think the solution I have found is good.

> These last two checks should be moved up for better readability (to set
> simplify_path in one section)

Done. Unfortunately, I had to re-alter simplify_path after to handle the
last case. I just locally added /* No path at all, leave it */ after
sending the patch to make it more explicit.

> We should add a reference to the function docs to the relevant RFC:
> https://tools.ietf.org/html/rfc3986#section-5

Good idea.

> It would probably make sense to add all tests which are in
> https://tools.ietf.org/html/rfc3986#section-5.4

Done.

> I did a quick check manually, and found one failing test:
> 
> http://a/b/c/d;p?q //g                  => http://g/
> 
> but it supposed to be "http://g". Not that it matters too much...

It probably does not (is there a real protocol where an empty path makes
sense), but better handle it according to the official examples, it was
not hard.

Thanks for the comments. See the new version.

Regards,
diff mbox series

Patch

diff --git a/libavformat/tests/url.c b/libavformat/tests/url.c
index e7d259ab7d..ecd9725112 100644
--- a/libavformat/tests/url.c
+++ b/libavformat/tests/url.c
@@ -49,7 +49,13 @@  static void test_decompose(const char *url)
 static void test(const char *base, const char *rel)
 {
     char buf[200], buf2[200];
-    ff_make_absolute_url(buf, sizeof(buf), base, rel);
+    int ret;
+
+    ret = ff_make_absolute_url(buf, sizeof(buf), base, rel);
+    if (ret < 0) {
+        printf("%50s %-20s => error %s\n", base, rel, av_err2str(ret));
+        return;
+    }
     printf("%50s %-20s => %s\n", base, rel, buf);
     if (base) {
         /* Test in-buffer replacement */
@@ -104,6 +110,15 @@  int main(void)
     test("http://server/foo/bar", "/../../../../../other/url");
     test("http://server/foo/bar", "/test/../../../../../other/url");
     test("http://server/foo/bar", "/test/../../test/../../../other/url");
+    test("http://server/foo/bar", "file:../baz/qux");
+    test("http://server/foo//bar/", "../../");
+    test("file:../tmp/foo", "../bar/");
+    test("file:../tmp/foo", "file:../bar/");
+    test("http://server/foo/bar", "./");
+    test("http://server/foo/bar", ".dotfile");
+    test("http://server/foo/bar", "..doubledotfile");
+    test("http://server/foo/bar", "double..dotfile");
+    test("http://server/foo/bar", "doubledotfile..");
 
     printf("\nTesting av_url_split:\n");
     test2("/foo/bar");
diff --git a/libavformat/url.c b/libavformat/url.c
index 26aaab4019..0c82317134 100644
--- a/libavformat/url.c
+++ b/libavformat/url.c
@@ -27,6 +27,7 @@ 
 #if CONFIG_NETWORK
 #include "network.h"
 #endif
+#include "libavutil/avassert.h"
 #include "libavutil/avstring.h"
 
 /**
@@ -152,146 +153,131 @@  int ff_url_decompose(URLComponents *uc, const char *url, const char *end)
     return 0;
 }
 
-static void trim_double_dot_url(char *buf, const char *rel, int size)
+static int append_path(char *root, char *out_end, char **rout,
+                       const char *in, const char *in_end)
 {
-    const char *p = rel;
-    const char *root = rel;
-    char tmp_path[MAX_URL_SIZE] = {0, };
-    char *sep;
-    char *node;
-
-    /* Get the path root of the url which start by "://" */
-    if (p && (sep = strstr(p, "://"))) {
-        sep += 3;
-        root = strchr(sep, '/');
-        if (!root)
-            return;
-    }
-
-    /* set new current position if the root node is changed */
-    p = root;
-    while (p && (node = strstr(p, ".."))) {
-        av_strlcat(tmp_path, p, node - p + strlen(tmp_path));
-        p = node + 3;
-        sep = strrchr(tmp_path, '/');
-        if (sep)
-            sep[0] = '\0';
-        else
-            tmp_path[0] = '\0';
+    char *out = *rout;
+    const char *d, *next;
+
+    if (in < in_end && *in == '/')
+        in++; /* already taken care of */
+    while (in < in_end) {
+        d = find_delim("/", in, in_end);
+        next = d + (d < in_end && *d == '/');
+        if (d - in == 1 && in[0] == '.') {
+            /* skip */
+        } else if (d - in == 2 && in[0] == '.' && in[1] == '.') {
+            av_assert1(out[-1] == '/');
+            if (out - root > 1)
+                while (out > root && (--out)[-1] != '/');
+        } else {
+            if (out_end - out < next - in)
+                return AVERROR(ENOMEM);
+            memcpy(out, in, next - in);
+            out += next - in;
+        }
+        in = next;
     }
-
-    if (!av_stristart(p, "/", NULL) && root != rel)
-        av_strlcat(tmp_path, "/", size);
-
-    av_strlcat(tmp_path, p, size);
-    /* start set buf after temp path process. */
-    av_strlcpy(buf, rel, root - rel + 1);
-
-    if (!av_stristart(tmp_path, "/", NULL) && root != rel)
-        av_strlcat(buf, "/", size);
-
-    av_strlcat(buf, tmp_path, size);
+    *rout = out;
+    return 0;
 }
 
-void ff_make_absolute_url(char *buf, int size, const char *base,
+int ff_make_absolute_url(char *buf, int size, const char *base,
                           const char *rel)
 {
-    char *sep, *path_query;
-    char *root, *p;
-    char tmp_path[MAX_URL_SIZE];
-
-    memset(tmp_path, 0, sizeof(tmp_path));
-    /* Absolute path, relative to the current server */
-    if (base && strstr(base, "://") && rel[0] == '/') {
-        if (base != buf)
-            av_strlcpy(buf, base, size);
-        sep = strstr(buf, "://");
-        if (sep) {
-            /* Take scheme from base url */
-            if (rel[1] == '/') {
-                sep[1] = '\0';
-            } else {
-                /* Take scheme and host from base url */
-                sep += 3;
-                sep = strchr(sep, '/');
-                if (sep)
-                    *sep = '\0';
-            }
-        }
-        av_strlcat(buf, rel, size);
-        trim_double_dot_url(tmp_path, buf, size);
-        memset(buf, 0, size);
-        av_strlcpy(buf, tmp_path, size);
-        return;
-    }
-    /* If rel actually is an absolute url, just copy it */
-    if (!base || strstr(rel, "://") || rel[0] == '/') {
-        memset(buf, 0, size);
-        trim_double_dot_url(buf, rel, size);
-        return;
+    URLComponents ub, uc;
+    char *out, *out_end, *path;
+    const char *keep, *base_path_end;
+    int use_base_path, simplify_path = 0, ret;
+
+    /* This is tricky.
+       For HTTP, http://server/site/page + ../media/file
+       should resolve into http://media/file
+       but for filesystem access, dir/playlist + ../media/file
+       should resolve into dir/../media/file
+       because dir could be a symlink, and .. points to
+       the actual parent of the target directory.
+
+       We'll consider that URLs with an actual scheme and authority,
+       i.e. starting with scheme://, need parent dir simplification,
+       while bare paths or pseudo-URLs starting with proto: without
+       the double slash do not.
+     */
+
+    if (!size)
+        return AVERROR(ENOMEM);
+    out = buf;
+    out_end = buf + size - 1;
+
+    if ((ret = ff_url_decompose(&ub, base, NULL) < 0) ||
+        (ret = ff_url_decompose(&uc, rel,  NULL) < 0))
+        return ret;
+
+    keep = ub.url;
+#define KEEP(component, also) do { \
+        if (uc.url_component_end_##component == uc.url && \
+            ub.url_component_end_##component > keep) { \
+            keep = ub.url_component_end_##component; \
+            also \
+        } \
+    } while (0)
+    KEEP(scheme, );
+    KEEP(authority_full, simplify_path = 1;);
+    KEEP(path,);
+    KEEP(query,);
+    KEEP(fragment,);
+#undef KEEP
+#define COPY(start, end) do { \
+        size_t len = end - start; \
+        if (len > out_end - out) \
+            return AVERROR(ENOMEM); \
+        memcpy(out, start, len); \
+        out += len; \
+    } while (0)
+    COPY(ub.url, keep);
+    COPY(uc.url, uc.path);
+    if (keep > ub.path)
+        simplify_path = 0;
+
+    use_base_path = URL_COMPONENT_HAVE(ub, path) && keep <= ub.path;
+    if (uc.path > uc.url)
+        use_base_path = 0;
+    if (URL_COMPONENT_HAVE(uc, scheme))
+        simplify_path = 0;
+    if (URL_COMPONENT_HAVE(uc, authority))
+        simplify_path = 1;
+    if (URL_COMPONENT_HAVE(uc, path) && uc.path[0] == '/')
+        use_base_path = 0;
+    if (use_base_path) {
+        base_path_end = ub.url_component_end_path;
+        if (URL_COMPONENT_HAVE(uc, path))
+            while (base_path_end > ub.path && base_path_end[-1] != '/')
+                base_path_end--;
     }
-    if (base != buf)
-        av_strlcpy(buf, base, size);
-
-    /* Strip off any query string from base */
-    path_query = strchr(buf, '?');
-    if (path_query)
-        *path_query = '\0';
-
-    /* Is relative path just a new query part? */
-    if (rel[0] == '?') {
-        av_strlcat(buf, rel, size);
-        trim_double_dot_url(tmp_path, buf, size);
-        memset(buf, 0, size);
-        av_strlcpy(buf, tmp_path, size);
-        return;
-    }
-
-    root = p = buf;
-    /* Get the path root of the url which start by "://" */
-    if (p && strstr(p, "://")) {
-        sep = strstr(p, "://");
-        if (sep) {
-            sep += 3;
-            root = strchr(sep, '/');
-            if (!root)
-                return;
-        }
-    }
-
-    /* Remove the file name from the base url */
-    sep = strrchr(buf, '/');
-    if (sep && sep <= root)
-        sep = root;
-
-    if (sep)
-        sep[1] = '\0';
-    else
-        buf[0] = '\0';
-    while (av_strstart(rel, "..", NULL) && sep) {
-        /* Remove the path delimiter at the end */
-        if (sep > root) {
-            sep[0] = '\0';
-            sep = strrchr(buf, '/');
+    if (simplify_path) {
+        const char *root = "/";
+        COPY(root, root + 1);
+        path = out;
+        if (use_base_path) {
+            ret = append_path(path, out_end, &out, ub.path, base_path_end);
+            if (ret < 0)
+                return ret;
         }
-
-        /* If the next directory name to pop off is "..", break here */
-        if (!strcmp(sep ? &sep[1] : buf, "..")) {
-            /* Readd the slash we just removed */
-            av_strlcat(buf, "/", size);
-            break;
+        if (URL_COMPONENT_HAVE(uc, path)) {
+            ret = append_path(path, out_end, &out, uc.path, uc.url_component_end_path);
+            if (ret < 0)
+                return ret;
         }
-        /* Cut off the directory name */
-        if (sep)
-            sep[1] = '\0';
-        else
-            buf[0] = '\0';
-        rel += 3;
+    } else {
+        if (use_base_path)
+            COPY(ub.path, base_path_end);
+        COPY(uc.path, uc.url_component_end_path);
     }
-    av_strlcat(buf, rel, size);
-    trim_double_dot_url(tmp_path, buf, size);
-    memset(buf, 0, size);
-    av_strlcpy(buf, tmp_path, size);
+
+    COPY(uc.url_component_end_path, uc.end);
+#undef COPY
+    *out = 0;
+    return 0;
 }
 
 AVIODirEntry *ff_alloc_dir_entry(void)
diff --git a/libavformat/url.h b/libavformat/url.h
index ae27da5c73..e33edf0cb9 100644
--- a/libavformat/url.h
+++ b/libavformat/url.h
@@ -312,8 +312,8 @@  int ff_url_join(char *str, int size, const char *proto,
  * @param base the base url, may be equal to buf.
  * @param rel the new url, which is interpreted relative to base
  */
-void ff_make_absolute_url(char *buf, int size, const char *base,
-                          const char *rel);
+int ff_make_absolute_url(char *buf, int size, const char *base,
+                         const char *rel);
 
 /**
  * Allocate directory entry with default values.
diff --git a/tests/ref/fate/url b/tests/ref/fate/url
index 84cf85abdd..7998fb5be9 100644
--- a/tests/ref/fate/url
+++ b/tests/ref/fate/url
@@ -46,9 +46,9 @@  http://[::1]:8080/dev/null =>
 Testing ff_make_absolute_url:
                                             (null) baz                  => baz
                                           /foo/bar baz                  => /foo/baz
-                                          /foo/bar ../baz               => /baz
+                                          /foo/bar ../baz               => /foo/../baz
                                           /foo/bar /baz                 => /baz
-                                          /foo/bar ../../../baz         => /baz
+                                          /foo/bar ../../../baz         => /foo/../../../baz
                                 http://server/foo/ baz                  => http://server/foo/baz
                              http://server/foo/bar baz                  => http://server/foo/baz
                                 http://server/foo/ ../baz               => http://server/baz
@@ -62,6 +62,15 @@  Testing ff_make_absolute_url:
                              http://server/foo/bar /../../../../../other/url => http://server/other/url
                              http://server/foo/bar /test/../../../../../other/url => http://server/other/url
                              http://server/foo/bar /test/../../test/../../../other/url => http://server/other/url
+                             http://server/foo/bar file:../baz/qux      => file:../baz/qux
+                           http://server/foo//bar/ ../../               => http://server/foo/
+                                   file:../tmp/foo ../bar/              => file:../tmp/../bar/
+                                   file:../tmp/foo file:../bar/         => file:../bar/
+                             http://server/foo/bar ./                   => http://server/foo/
+                             http://server/foo/bar .dotfile             => http://server/foo/.dotfile
+                             http://server/foo/bar ..doubledotfile      => http://server/foo/..doubledotfile
+                             http://server/foo/bar double..dotfile      => http://server/foo/double..dotfile
+                             http://server/foo/bar doubledotfile..      => http://server/foo/doubledotfile..
 
 Testing av_url_split:
 /foo/bar                                                     =>                                                    -1 /foo/bar