diff mbox series

[FFmpeg-devel,v2] avformat: add a concat protocol that takes a line break delimited list of resources

Message ID 20210626165657.2066-1-jamrial@gmail.com
State Accepted
Headers show
Series [FFmpeg-devel,v2] avformat: add a concat protocol that takes a line break delimited list of resources
Related show

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

James Almer June 26, 2021, 4:56 p.m. UTC
Suggested-by: ffmpeg@fb.com
Signed-off-by: James Almer <jamrial@gmail.com>
---
Updated documentation, and line breaks can now be part of the filename.

 doc/protocols.texi      |  33 +++++++++
 libavformat/Makefile    |   1 +
 libavformat/concat.c    | 146 ++++++++++++++++++++++++++++++++++++++++
 libavformat/protocols.c |   1 +
 4 files changed, 181 insertions(+)

Comments

Nicolas George June 27, 2021, 6:26 p.m. UTC | #1
James Almer (12021-06-26):
> Suggested-by: ffmpeg@fb.com
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
> Updated documentation, and line breaks can now be part of the filename.
> 
>  doc/protocols.texi      |  33 +++++++++
>  libavformat/Makefile    |   1 +
>  libavformat/concat.c    | 146 ++++++++++++++++++++++++++++++++++++++++
>  libavformat/protocols.c |   1 +
>  4 files changed, 181 insertions(+)
> 
> diff --git a/doc/protocols.texi b/doc/protocols.texi
> index ccdfb6e439..11de674225 100644
> --- a/doc/protocols.texi
> +++ b/doc/protocols.texi
> @@ -215,6 +215,39 @@ ffplay concat:split1.mpeg\|split2.mpeg\|split3.mpeg
>  Note that you may need to escape the character "|" which is special for
>  many shells.
>  
> +@section concatf
> +
> +Physical concatenation protocol using a line break delimited list of
> +resources.
> +
> +Read and seek from many resources in sequence as if they were
> +a unique resource.
> +
> +A URL accepted by this protocol has the syntax:
> +@example
> +concatf:@var{URL}
> +@end example
> +
> +where @var{URL} is the url containing a line break delimited list of
> +resources to be concatenated, each one possibly specifying a distinct
> +protocol.
> +
> +For example to read a sequence of files @file{split1.mpeg},
> +@file{split2.mpeg}, @file{split3.mpeg} listed in separate lines within
> +a file @file{split.txt} with @command{ffplay} use the command:
> +@example
> +ffplay concatf:split.txt
> +@end example
> +Where @file{split.txt} contains the lines:
> +@example
> +split1.mpeg
> +split2.mpeg
> +split3.mpeg
> +@end example
> +

> +Note that if any of the entries in the list contain a line break as part
> +of their name, you'll need to escape it with a preceding "\" character.

This is not detailed and accurate enough. And it should be a link to the
common description of our escaping syntax anyway.

> +
>  @section crypto
>  
>  AES-encrypted stream reading protocol.
> diff --git a/libavformat/Makefile b/libavformat/Makefile
> index c9ef564523..caca95802a 100644
> --- a/libavformat/Makefile
> +++ b/libavformat/Makefile
> @@ -616,6 +616,7 @@ OBJS-$(CONFIG_APPLEHTTP_PROTOCOL)        += hlsproto.o
>  OBJS-$(CONFIG_BLURAY_PROTOCOL)           += bluray.o
>  OBJS-$(CONFIG_CACHE_PROTOCOL)            += cache.o
>  OBJS-$(CONFIG_CONCAT_PROTOCOL)           += concat.o
> +OBJS-$(CONFIG_CONCATF_PROTOCOL)          += concat.o
>  OBJS-$(CONFIG_CRYPTO_PROTOCOL)           += crypto.o
>  OBJS-$(CONFIG_DATA_PROTOCOL)             += data_uri.o
>  OBJS-$(CONFIG_FFRTMPCRYPT_PROTOCOL)      += rtmpcrypt.o rtmpdigest.o rtmpdh.o
> diff --git a/libavformat/concat.c b/libavformat/concat.c
> index 278afd997d..b66e3b9e01 100644
> --- a/libavformat/concat.c
> +++ b/libavformat/concat.c
> @@ -22,9 +22,11 @@
>   */
>  
>  #include "libavutil/avstring.h"
> +#include "libavutil/bprint.h"
>  #include "libavutil/mem.h"
>  
>  #include "avformat.h"
> +#include "avio_internal.h"
>  #include "url.h"
>  
>  #define AV_CAT_SEPARATOR "|"
> @@ -56,6 +58,7 @@ static av_cold int concat_close(URLContext *h)
>      return err < 0 ? -1 : 0;
>  }
>  
> +#if CONFIG_CONCAT_PROTOCOL
>  static av_cold int concat_open(URLContext *h, const char *uri, int flags)
>  {
>      char *node_uri = NULL;
> @@ -124,6 +127,7 @@ static av_cold int concat_open(URLContext *h, const char *uri, int flags)
>      data->total_size = total_size;
>      return err;
>  }
> +#endif
>  
>  static int concat_read(URLContext *h, unsigned char *buf, int size)
>  {
> @@ -188,6 +192,7 @@ static int64_t concat_seek(URLContext *h, int64_t pos, int whence)
>      return result;
>  }
>  
> +#if CONFIG_CONCAT_PROTOCOL
>  const URLProtocol ff_concat_protocol = {
>      .name           = "concat",
>      .url_open       = concat_open,
> @@ -197,3 +202,144 @@ const URLProtocol ff_concat_protocol = {
>      .priv_data_size = sizeof(struct concat_data),
>      .default_whitelist = "concat,file,subfile",
>  };
> +#endif
> +

> +#if CONFIG_CONCATF_PROTOCOL
> +// Custom ff_read_line_to_bprint() implementation where line breaks can be
> +// part of the line being read if escaped.
> +static int64_t read_line_to_bprint(AVIOContext *s, AVBPrint *bp)
> +{
> +    int len, end;
> +    int64_t read = 0;
> +    char tmp[1024];
> +    char c;
> +
> +    do {
> +        len = 0;
> +        do {
> +            char escape = c = avio_r8(s);
> +            if (c == '\\')
> +                c = avio_r8(s);
> +            end = (c == '\r' || c == '\n' || c == '\0');
> +            if (end && escape == '\\') {
> +                if (c != '\0') {
> +                    tmp[len++] = c;
> +                    end = 0;
> +                } else
> +                    tmp[len++] = escape;
> +            } else if (!end) {
> +                if (escape == '\\') {
> +                    tmp[len++] = escape;
> +                    avio_skip(s, -1);
> +                } else
> +                    tmp[len++] = c;
> +            }
> +        } while (!end && len < sizeof(tmp));
> +        av_bprint_append_data(bp, tmp, len);
> +        read += len;
> +    } while (!end);
> +
> +    if (c == '\r' && avio_r8(s) != '\n' && !avio_feof(s))
> +        avio_skip(s, -1);
> +
> +    if (!c && s->error)
> +        return s->error;
> +
> +    if (!c && !read && avio_feof(s))
> +        return AVERROR_EOF;
> +
> +    return read;
> +}

Re-implementing yet another de-escaping and splitting function is a
terrible idea, I am strongly against it.

It would be easier if we had a good string API already. Barring that, I
think you need to load the whole file and use the existing de-escaping
functions.

> +
> +static av_cold int concatf_open(URLContext *h, const char *uri, int flags)
> +{
> +    AVBPrint bp;
> +    struct concat_data  *data = h->priv_data;
> +    struct concat_nodes *nodes = NULL;
> +    AVIOContext *in = NULL;
> +    URLContext *uc;
> +    int64_t size, total_size = 0;
> +    unsigned int nodes_size = 0;
> +    size_t i = 0;
> +    int err = 0;
> +
> +    if (!av_strstart(uri, "concatf:", &uri)) {
> +        av_log(h, AV_LOG_ERROR, "URL %s lacks prefix\n", uri);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    /* handle input */
> +    if (!*uri)
> +        return AVERROR(ENOENT);
> +
> +    err = ffio_open_whitelist(&in, uri, AVIO_FLAG_READ, &h->interrupt_callback,
> +                              NULL, h->protocol_whitelist, h->protocol_blacklist);
> +    if (err < 0)
> +        return err;
> +
> +    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_UNLIMITED);
> +
> +    for (i = 0; !avio_feof(in); i++) {
> +        size_t len = i;
> +
> +        av_bprint_clear(&bp);
> +        if ((err = read_line_to_bprint(in, &bp)) <= 0) {
> +            if (err == 0 && i == 0)
> +                err = AVERROR_INVALIDDATA;
> +            else if (err == AVERROR_EOF)
> +                err = 0;
> +            break;
> +        }
> +
> +        if (++len == SIZE_MAX / sizeof(*nodes)) {
> +            err = AVERROR(ENAMETOOLONG);
> +            break;
> +        }
> +
> +        /* creating URLContext */
> +        err = ffurl_open_whitelist(&uc, bp.str, flags,
> +                                   &h->interrupt_callback, NULL, h->protocol_whitelist, h->protocol_blacklist, h);
> +        if (err < 0)
> +            break;
> +
> +        /* creating size */
> +        if ((size = ffurl_size(uc)) < 0) {
> +            ffurl_close(uc);
> +            err = AVERROR(ENOSYS);
> +            break;
> +        }
> +
> +        nodes = av_fast_realloc(data->nodes, &nodes_size, sizeof(*nodes) * len);
> +        if (!nodes) {
> +            ffurl_close(uc);
> +            err = AVERROR(ENOMEM);
> +            break;
> +        }
> +        data->nodes = nodes;
> +
> +        /* assembling */
> +        data->nodes[i].uc   = uc;
> +        data->nodes[i].size = size;
> +        total_size += size;
> +    }
> +    avio_closep(&in);
> +    av_bprint_finalize(&bp, NULL);
> +    data->length = i;
> +
> +    if (err < 0)
> +        concat_close(h);
> +
> +    data->total_size = total_size;
> +    return err;
> +}
> +
> +const URLProtocol ff_concatf_protocol = {
> +    .name           = "concatf",
> +    .url_open       = concatf_open,
> +    .url_read       = concat_read,
> +    .url_seek       = concat_seek,
> +    .url_close      = concat_close,
> +    .priv_data_size = sizeof(struct concat_data),
> +    .default_whitelist = "concatf,concat,file,subfile",
> +};
> +#endif
> diff --git a/libavformat/protocols.c b/libavformat/protocols.c
> index 4b6b1c8e98..7f08f151b6 100644
> --- a/libavformat/protocols.c
> +++ b/libavformat/protocols.c
> @@ -27,6 +27,7 @@ extern const URLProtocol ff_async_protocol;
>  extern const URLProtocol ff_bluray_protocol;
>  extern const URLProtocol ff_cache_protocol;
>  extern const URLProtocol ff_concat_protocol;
> +extern const URLProtocol ff_concatf_protocol;
>  extern const URLProtocol ff_crypto_protocol;
>  extern const URLProtocol ff_data_protocol;
>  extern const URLProtocol ff_ffrtmpcrypt_protocol;

Regards,
James Almer June 27, 2021, 6:51 p.m. UTC | #2
On 6/27/2021 3:26 PM, Nicolas George wrote:
> James Almer (12021-06-26):
>> Suggested-by: ffmpeg@fb.com
>> Signed-off-by: James Almer <jamrial@gmail.com>
>> ---
>> Updated documentation, and line breaks can now be part of the filename.
>>
>>   doc/protocols.texi      |  33 +++++++++
>>   libavformat/Makefile    |   1 +
>>   libavformat/concat.c    | 146 ++++++++++++++++++++++++++++++++++++++++
>>   libavformat/protocols.c |   1 +
>>   4 files changed, 181 insertions(+)
>>
>> diff --git a/doc/protocols.texi b/doc/protocols.texi
>> index ccdfb6e439..11de674225 100644
>> --- a/doc/protocols.texi
>> +++ b/doc/protocols.texi
>> @@ -215,6 +215,39 @@ ffplay concat:split1.mpeg\|split2.mpeg\|split3.mpeg
>>   Note that you may need to escape the character "|" which is special for
>>   many shells.
>>   
>> +@section concatf
>> +
>> +Physical concatenation protocol using a line break delimited list of
>> +resources.
>> +
>> +Read and seek from many resources in sequence as if they were
>> +a unique resource.
>> +
>> +A URL accepted by this protocol has the syntax:
>> +@example
>> +concatf:@var{URL}
>> +@end example
>> +
>> +where @var{URL} is the url containing a line break delimited list of
>> +resources to be concatenated, each one possibly specifying a distinct
>> +protocol.
>> +
>> +For example to read a sequence of files @file{split1.mpeg},
>> +@file{split2.mpeg}, @file{split3.mpeg} listed in separate lines within
>> +a file @file{split.txt} with @command{ffplay} use the command:
>> +@example
>> +ffplay concatf:split.txt
>> +@end example
>> +Where @file{split.txt} contains the lines:
>> +@example
>> +split1.mpeg
>> +split2.mpeg
>> +split3.mpeg
>> +@end example
>> +
> 
>> +Note that if any of the entries in the list contain a line break as part
>> +of their name, you'll need to escape it with a preceding "\" character.
> 
> This is not detailed and accurate enough. And it should be a link to the
> common description of our escaping syntax anyway.

How is it not accurate enough? If the file contains the following

foobar.h264
foo\
bar.h264

It will read the resources

foobar.h264

and

foo
bar.h264

But without that backslash, it will try to read foo and bar.h264 as 
separate resources because it interpreted the line break as a delimiter 
character.
I can add a line to the documentation stating that no other character 
needs to be escaped if that will make it more clear.

> 
>> +
>>   @section crypto
>>   
>>   AES-encrypted stream reading protocol.
>> diff --git a/libavformat/Makefile b/libavformat/Makefile
>> index c9ef564523..caca95802a 100644
>> --- a/libavformat/Makefile
>> +++ b/libavformat/Makefile
>> @@ -616,6 +616,7 @@ OBJS-$(CONFIG_APPLEHTTP_PROTOCOL)        += hlsproto.o
>>   OBJS-$(CONFIG_BLURAY_PROTOCOL)           += bluray.o
>>   OBJS-$(CONFIG_CACHE_PROTOCOL)            += cache.o
>>   OBJS-$(CONFIG_CONCAT_PROTOCOL)           += concat.o
>> +OBJS-$(CONFIG_CONCATF_PROTOCOL)          += concat.o
>>   OBJS-$(CONFIG_CRYPTO_PROTOCOL)           += crypto.o
>>   OBJS-$(CONFIG_DATA_PROTOCOL)             += data_uri.o
>>   OBJS-$(CONFIG_FFRTMPCRYPT_PROTOCOL)      += rtmpcrypt.o rtmpdigest.o rtmpdh.o
>> diff --git a/libavformat/concat.c b/libavformat/concat.c
>> index 278afd997d..b66e3b9e01 100644
>> --- a/libavformat/concat.c
>> +++ b/libavformat/concat.c
>> @@ -22,9 +22,11 @@
>>    */
>>   
>>   #include "libavutil/avstring.h"
>> +#include "libavutil/bprint.h"
>>   #include "libavutil/mem.h"
>>   
>>   #include "avformat.h"
>> +#include "avio_internal.h"
>>   #include "url.h"
>>   
>>   #define AV_CAT_SEPARATOR "|"
>> @@ -56,6 +58,7 @@ static av_cold int concat_close(URLContext *h)
>>       return err < 0 ? -1 : 0;
>>   }
>>   
>> +#if CONFIG_CONCAT_PROTOCOL
>>   static av_cold int concat_open(URLContext *h, const char *uri, int flags)
>>   {
>>       char *node_uri = NULL;
>> @@ -124,6 +127,7 @@ static av_cold int concat_open(URLContext *h, const char *uri, int flags)
>>       data->total_size = total_size;
>>       return err;
>>   }
>> +#endif
>>   
>>   static int concat_read(URLContext *h, unsigned char *buf, int size)
>>   {
>> @@ -188,6 +192,7 @@ static int64_t concat_seek(URLContext *h, int64_t pos, int whence)
>>       return result;
>>   }
>>   
>> +#if CONFIG_CONCAT_PROTOCOL
>>   const URLProtocol ff_concat_protocol = {
>>       .name           = "concat",
>>       .url_open       = concat_open,
>> @@ -197,3 +202,144 @@ const URLProtocol ff_concat_protocol = {
>>       .priv_data_size = sizeof(struct concat_data),
>>       .default_whitelist = "concat,file,subfile",
>>   };
>> +#endif
>> +
> 
>> +#if CONFIG_CONCATF_PROTOCOL
>> +// Custom ff_read_line_to_bprint() implementation where line breaks can be
>> +// part of the line being read if escaped.
>> +static int64_t read_line_to_bprint(AVIOContext *s, AVBPrint *bp)
>> +{
>> +    int len, end;
>> +    int64_t read = 0;
>> +    char tmp[1024];
>> +    char c;
>> +
>> +    do {
>> +        len = 0;
>> +        do {
>> +            char escape = c = avio_r8(s);
>> +            if (c == '\\')
>> +                c = avio_r8(s);
>> +            end = (c == '\r' || c == '\n' || c == '\0');
>> +            if (end && escape == '\\') {
>> +                if (c != '\0') {
>> +                    tmp[len++] = c;
>> +                    end = 0;
>> +                } else
>> +                    tmp[len++] = escape;
>> +            } else if (!end) {
>> +                if (escape == '\\') {
>> +                    tmp[len++] = escape;
>> +                    avio_skip(s, -1);
>> +                } else
>> +                    tmp[len++] = c;
>> +            }
>> +        } while (!end && len < sizeof(tmp));
>> +        av_bprint_append_data(bp, tmp, len);
>> +        read += len;
>> +    } while (!end);
>> +
>> +    if (c == '\r' && avio_r8(s) != '\n' && !avio_feof(s))
>> +        avio_skip(s, -1);
>> +
>> +    if (!c && s->error)
>> +        return s->error;
>> +
>> +    if (!c && !read && avio_feof(s))
>> +        return AVERROR_EOF;
>> +
>> +    return read;
>> +}
> 
> Re-implementing yet another de-escaping and splitting function is a
> terrible idea, I am strongly against it.
> 
> It would be easier if we had a good string API already. Barring that, I
> think you need to load the whole file and use the existing de-escaping
> functions.

The existing de-escaping functions will just remove backslashes, and the 
line break character will be parsed as a delimiter, which is why i wrote 
the above to ensure it is read as part of the filename.

The contents of the file are meant to be taken as is, and if you want 
the delimiter character to be part of the filename, you need to let the 
parser know about it. Every other character in the text file doesn't 
need to be escaped since this is not the command line where they could 
be interpreted as something else.

> 
>> +
>> +static av_cold int concatf_open(URLContext *h, const char *uri, int flags)
>> +{
>> +    AVBPrint bp;
>> +    struct concat_data  *data = h->priv_data;
>> +    struct concat_nodes *nodes = NULL;
>> +    AVIOContext *in = NULL;
>> +    URLContext *uc;
>> +    int64_t size, total_size = 0;
>> +    unsigned int nodes_size = 0;
>> +    size_t i = 0;
>> +    int err = 0;
>> +
>> +    if (!av_strstart(uri, "concatf:", &uri)) {
>> +        av_log(h, AV_LOG_ERROR, "URL %s lacks prefix\n", uri);
>> +        return AVERROR(EINVAL);
>> +    }
>> +
>> +    /* handle input */
>> +    if (!*uri)
>> +        return AVERROR(ENOENT);
>> +
>> +    err = ffio_open_whitelist(&in, uri, AVIO_FLAG_READ, &h->interrupt_callback,
>> +                              NULL, h->protocol_whitelist, h->protocol_blacklist);
>> +    if (err < 0)
>> +        return err;
>> +
>> +    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_UNLIMITED);
>> +
>> +    for (i = 0; !avio_feof(in); i++) {
>> +        size_t len = i;
>> +
>> +        av_bprint_clear(&bp);
>> +        if ((err = read_line_to_bprint(in, &bp)) <= 0) {
>> +            if (err == 0 && i == 0)
>> +                err = AVERROR_INVALIDDATA;
>> +            else if (err == AVERROR_EOF)
>> +                err = 0;
>> +            break;
>> +        }
>> +
>> +        if (++len == SIZE_MAX / sizeof(*nodes)) {
>> +            err = AVERROR(ENAMETOOLONG);
>> +            break;
>> +        }
>> +
>> +        /* creating URLContext */
>> +        err = ffurl_open_whitelist(&uc, bp.str, flags,
>> +                                   &h->interrupt_callback, NULL, h->protocol_whitelist, h->protocol_blacklist, h);
>> +        if (err < 0)
>> +            break;
>> +
>> +        /* creating size */
>> +        if ((size = ffurl_size(uc)) < 0) {
>> +            ffurl_close(uc);
>> +            err = AVERROR(ENOSYS);
>> +            break;
>> +        }
>> +
>> +        nodes = av_fast_realloc(data->nodes, &nodes_size, sizeof(*nodes) * len);
>> +        if (!nodes) {
>> +            ffurl_close(uc);
>> +            err = AVERROR(ENOMEM);
>> +            break;
>> +        }
>> +        data->nodes = nodes;
>> +
>> +        /* assembling */
>> +        data->nodes[i].uc   = uc;
>> +        data->nodes[i].size = size;
>> +        total_size += size;
>> +    }
>> +    avio_closep(&in);
>> +    av_bprint_finalize(&bp, NULL);
>> +    data->length = i;
>> +
>> +    if (err < 0)
>> +        concat_close(h);
>> +
>> +    data->total_size = total_size;
>> +    return err;
>> +}
>> +
>> +const URLProtocol ff_concatf_protocol = {
>> +    .name           = "concatf",
>> +    .url_open       = concatf_open,
>> +    .url_read       = concat_read,
>> +    .url_seek       = concat_seek,
>> +    .url_close      = concat_close,
>> +    .priv_data_size = sizeof(struct concat_data),
>> +    .default_whitelist = "concatf,concat,file,subfile",
>> +};
>> +#endif
>> diff --git a/libavformat/protocols.c b/libavformat/protocols.c
>> index 4b6b1c8e98..7f08f151b6 100644
>> --- a/libavformat/protocols.c
>> +++ b/libavformat/protocols.c
>> @@ -27,6 +27,7 @@ extern const URLProtocol ff_async_protocol;
>>   extern const URLProtocol ff_bluray_protocol;
>>   extern const URLProtocol ff_cache_protocol;
>>   extern const URLProtocol ff_concat_protocol;
>> +extern const URLProtocol ff_concatf_protocol;
>>   extern const URLProtocol ff_crypto_protocol;
>>   extern const URLProtocol ff_data_protocol;
>>   extern const URLProtocol ff_ffrtmpcrypt_protocol;
> 
> Regards,
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Nicolas George June 27, 2021, 7 p.m. UTC | #3
James Almer (12021-06-27):
> How is it not accurate enough?

Which characters must be escaped? Is it legal to escape another
character? Does it support single quotes like the other escaping
functions? Can you use byte or Unicode numbers?

Various escaping syntax have subtle differences about these points, and
this is something users need to know about, especially for scripts.

Anyway, our escaping syntax should be documented once well, with links
to it everywhere where needed, the description must not be duplicated
all over the place.

> I can add a line to the documentation stating that no other character needs
> to be escaped if that will make it more clear.

Make it a link to the unique good documentation of escaping.

If we do not have an unique good documentation of escaping, the it needs
to be written, and this is a good occasion. But I think we have it.

> The existing de-escaping functions will just remove backslashes, and the
> line break character will be parsed as a delimiter, which is why i wrote the
> above to ensure it is read as part of the filename.
> 
> The contents of the file are meant to be taken as is, and if you want the
> delimiter character to be part of the filename, you need to let the parser
> know about it. Every other character in the text file doesn't need to be
> escaped since this is not the command line where they could be interpreted
> as something else.

I know what the existing de-escaping functions do, and I know what they
can be used for, and I repeat: you can the parsing correctly with them
by reading the whole file at once.

But as it is, this code duplication is a big no.

Regards,
James Almer June 27, 2021, 7:32 p.m. UTC | #4
On 6/27/2021 4:00 PM, Nicolas George wrote:
> James Almer (12021-06-27):
>> How is it not accurate enough?
> 
> Which characters must be escaped? Is it legal to escape another
> character? Does it support single quotes like the other escaping
> functions? Can you use byte or Unicode numbers?

Only line breaks need to be escaped. I said as much and even suggested 
to make it more explicit in the documentation.
Every other character will be read and added verbatim into a string used 
as uri for a given resource that ffurl_open_whitelist() will then handle.

Create a text file with the following four entries

foo\
bar.h264
foo'bar.h264
foo\bar.h264
foobar.h264

Where only the line break is escaped to ensure it's not interpreted as a 
resource name delimiter, and they will all be read and open if they exist.

> 
> Various escaping syntax have subtle differences about these points, and
> this is something users need to know about, especially for scripts.
> 
> Anyway, our escaping syntax should be documented once well, with links
> to it everywhere where needed, the description must not be duplicated
> all over the place.
> 
>> I can add a line to the documentation stating that no other character needs
>> to be escaped if that will make it more clear.
> 
> Make it a link to the unique good documentation of escaping.

There's no need for any escaping other than the delimiter character, so 
wouldn't it be misleading linking to a document that will make you write 
your entries in the file in a way that will not work?
Said documentation will tell you that if you want to load foo'bar.h264 
you will need to write it as foo\'bar.h264, but that's not true here and 
will only make it attempt to load a file with all those characters as 
part of its name.

> 
> If we do not have an unique good documentation of escaping, the it needs
> to be written, and this is a good occasion. But I think we have it.
> 
>> The existing de-escaping functions will just remove backslashes, and the
>> line break character will be parsed as a delimiter, which is why i wrote the
>> above to ensure it is read as part of the filename.
>>
>> The contents of the file are meant to be taken as is, and if you want the
>> delimiter character to be part of the filename, you need to let the parser
>> know about it. Every other character in the text file doesn't need to be
>> escaped since this is not the command line where they could be interpreted
>> as something else.
> 
> I know what the existing de-escaping functions do, and I know what they
> can be used for, and I repeat: you can the parsing correctly with them
> by reading the whole file at once.
> 
> But as it is, this code duplication is a big no.

Wont your request to use escaping functions result in the need for the 
entries in the text file to start pointlessly escaping characters like 
quotes, backslashes and others for no reason, as if this were bash?

The protocol right now supports line breaks as part of the filename if 
you escape it like you asked me to support, alongside every other 
character without doing anything for them. What you're now asking from 
me is to require it to parse more characters than necessary as if this 
was the command line, when there's no reason to.

> 
> Regards,
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Nicolas George June 27, 2021, 7:40 p.m. UTC | #5
James Almer (12021-06-27):
> Only line breaks need to be escaped. I said as much and even suggested to
> make it more explicit in the documentation.
> Every other character will be read and added verbatim into a string used as
> uri for a given resource that ffurl_open_whitelist() will then handle.
> 
> Create a text file with the following four entries
> 
> foo\
> bar.h264
> foo'bar.h264
> foo\bar.h264
> foobar.h264
> 
> Where only the line break is escaped to ensure it's not interpreted as a
> resource name delimiter, and they will all be read and open if they exist.

I do not need you to explain this to me.

> There's no need for any escaping other than the delimiter character, so
> wouldn't it be misleading linking to a document that will make you write
> your entries in the file in a way that will not work?

The code and documentation must be compatible, of course.

> Said documentation will tell you that if you want to load foo'bar.h264 you
> will need to write it as foo\'bar.h264, but that's not true here and will

That should be true, see below.

> Wont your request to use escaping functions result in the need for the
> entries in the text file to start pointlessly escaping characters like
> quotes, backslashes and others for no reason, as if this were bash?

My request results in using the same syntax we have developed and tuned
for the rest of the project.

Having several subtly different rules for escaping in a single project
is an abominable idea.

Regards,
James Almer June 27, 2021, 8 p.m. UTC | #6
On 6/27/2021 4:40 PM, Nicolas George wrote:
> James Almer (12021-06-27):
>> Only line breaks need to be escaped. I said as much and even suggested to
>> make it more explicit in the documentation.
>> Every other character will be read and added verbatim into a string used as
>> uri for a given resource that ffurl_open_whitelist() will then handle.
>>
>> Create a text file with the following four entries
>>
>> foo\
>> bar.h264
>> foo'bar.h264
>> foo\bar.h264
>> foobar.h264
>>
>> Where only the line break is escaped to ensure it's not interpreted as a
>> resource name delimiter, and they will all be read and open if they exist.
> 
> I do not need you to explain this to me.

You asked what characters needed to be escaped, i answered and figured 
giving an example would help. If not for you, for someone else reading 
this thread.
I know well strings are your field of expertise and hardly mine. Don't 
wrongly assume the other person has ill intentions.

> 
>> There's no need for any escaping other than the delimiter character, so
>> wouldn't it be misleading linking to a document that will make you write
>> your entries in the file in a way that will not work?
> 
> The code and documentation must be compatible, of course.
> 
>> Said documentation will tell you that if you want to load foo'bar.h264 you
>> will need to write it as foo\'bar.h264, but that's not true here and will
> 
> That should be true, see below.
> 
>> Wont your request to use escaping functions result in the need for the
>> entries in the text file to start pointlessly escaping characters like
>> quotes, backslashes and others for no reason, as if this were bash?
> 
> My request results in using the same syntax we have developed and tuned
> for the rest of the project.
> 
> Having several subtly different rules for escaping in a single project
> is an abominable idea.

It's literally having a list of raw filenames in a file, where the 
delimiter character can be part of the filename and as such needs to be 
handled. I'm not writing a new language, or new escaping rules. I don't 
see how it's so unacceptable.

Can you point me in the right direction then? What functions should i 
use? How would the entries in the text file need to be written like in 
order for said functions to write all valid characters correctly in the 
output buffer? How will line breaks as part of the file name need be 
handled? A backslash before an actual line break like in this version, 
or will this mean the entries will need to look like something out of 
bash? If the latter, will these functions convert the sequence of the 
characters \ and n into a single 0x0A byte in the output buffer?

> 
> Regards,
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Nicolas George June 27, 2021, 8:16 p.m. UTC | #7
James Almer (12021-06-27):
> You asked

No, I did not ASK: I told what needs to be present in the documentation.

> I know well strings are your field of expertise and hardly mine. Don't
> wrongly assume the other person has ill intentions.

I am not assuming any ill intent from you. Why would you think that?
This is the second time in a few days that you accuse me of personal
stuff like that. Please re-calibrate your opinion of me.

> It's literally having a list of raw filenames in a file, where the delimiter
> character can be part of the filename and as such needs to be handled. I'm
> not writing a new language, or new escaping rules. I don't see how it's so
> unacceptable.

What you just described IS precisely a new set of escaping rules. It is
very very simple, but it is a set of rules. A new one, while we already
have one.

And of course, you are re-implementing it, which in itself cannot be
accepted.

> Can you point me in the right direction then? What functions should i use?

I already pointed you in the right direction: read the whole file, and
then use the standard parsing and de-escaping functions.

Your use of av_get_token() in the first patches was good. The flaw was
using them on separate lines instead of the whole file: you ended up
with both ff_read_line_to_bprint_overwrite() and av_get_token() handling
the new line characters, in a subtly different way. You needed to get
rid of one of them, the one that does only a small part of the work, to
keep the one that can do the whole work.

Regards,
James Almer June 27, 2021, 9:36 p.m. UTC | #8
On 6/27/2021 5:16 PM, Nicolas George wrote:
> James Almer (12021-06-27):
>> You asked
> 
> No, I did not ASK: I told what needs to be present in the documentation.

Your first paragraph started with "Which characters must be escaped?" 
among other questions. The part you quoted from my reply was an answer 
to that, including the example.

What is the documentation about escaping syntax i should link to, for 
that matter? The documentation for the other concat protocol simply 
states that special characters must be escaped, but no reference to any 
external document or section within our documentation about it.

> 
>> I know well strings are your field of expertise and hardly mine. Don't
>> wrongly assume the other person has ill intentions.
> 
> I am not assuming any ill intent from you. Why would you think that?

I probably read too much into what you said. Sorry.

> This is the second time in a few days that you accuse me of personal
> stuff like that. Please re-calibrate your opinion of me.
> 
>> It's literally having a list of raw filenames in a file, where the delimiter
>> character can be part of the filename and as such needs to be handled. I'm
>> not writing a new language, or new escaping rules. I don't see how it's so
>> unacceptable.
> 
> What you just described IS precisely a new set of escaping rules. It is
> very very simple, but it is a set of rules. A new one, while we already
> have one.
> 
> And of course, you are re-implementing it, which in itself cannot be
> accepted.
> 
>> Can you point me in the right direction then? What functions should i use?
> 
> I already pointed you in the right direction: read the whole file, and
> then use the standard parsing and de-escaping functions.
> 
> Your use of av_get_token() in the first patches was good. The flaw was
> using them on separate lines instead of the whole file: you ended up
> with both ff_read_line_to_bprint_overwrite() and av_get_token() handling
> the new line characters, in a subtly different way. You needed to get
> rid of one of them, the one that does only a small part of the work, to
> keep the one that can do the whole work.
> 
> Regards,
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Nicolas George June 28, 2021, 12:48 p.m. UTC | #9
James Almer (12021-06-27):
> Your first paragraph started with "Which characters must be escaped?"

Yes, but it was a follow-up to "the doc is not detailed enough". I trim
my mails, because it makes reading easier, but the context still
matters. Should I leave more of it, even for conversations over the
course of just a few hours?

> What is the documentation about escaping syntax i should link to, for that
> matter? The documentation for the other concat protocol simply states that
> special characters must be escaped, but no reference to any external
> document or section within our documentation about it.

doc/utils.texi starts with:

# @chapter Syntax
# @c man begin SYNTAX
# 
# This section documents the syntax and formats employed by the FFmpeg
# libraries and tools.
# 
# @anchor{quoting_and_escaping}
# @section Quoting and escaping
# 
# FFmpeg adopts the following quoting and escaping mechanism, unless
# explicitly specified. The following rules are applied:

It looks like exactly what we want.

> I probably read too much into what you said. Sorry.

No problem. I hope we can learn to not be irritated by each other's
style.

Regards,
diff mbox series

Patch

diff --git a/doc/protocols.texi b/doc/protocols.texi
index ccdfb6e439..11de674225 100644
--- a/doc/protocols.texi
+++ b/doc/protocols.texi
@@ -215,6 +215,39 @@  ffplay concat:split1.mpeg\|split2.mpeg\|split3.mpeg
 Note that you may need to escape the character "|" which is special for
 many shells.
 
+@section concatf
+
+Physical concatenation protocol using a line break delimited list of
+resources.
+
+Read and seek from many resources in sequence as if they were
+a unique resource.
+
+A URL accepted by this protocol has the syntax:
+@example
+concatf:@var{URL}
+@end example
+
+where @var{URL} is the url containing a line break delimited list of
+resources to be concatenated, each one possibly specifying a distinct
+protocol.
+
+For example to read a sequence of files @file{split1.mpeg},
+@file{split2.mpeg}, @file{split3.mpeg} listed in separate lines within
+a file @file{split.txt} with @command{ffplay} use the command:
+@example
+ffplay concatf:split.txt
+@end example
+Where @file{split.txt} contains the lines:
+@example
+split1.mpeg
+split2.mpeg
+split3.mpeg
+@end example
+
+Note that if any of the entries in the list contain a line break as part
+of their name, you'll need to escape it with a preceding "\" character.
+
 @section crypto
 
 AES-encrypted stream reading protocol.
diff --git a/libavformat/Makefile b/libavformat/Makefile
index c9ef564523..caca95802a 100644
--- a/libavformat/Makefile
+++ b/libavformat/Makefile
@@ -616,6 +616,7 @@  OBJS-$(CONFIG_APPLEHTTP_PROTOCOL)        += hlsproto.o
 OBJS-$(CONFIG_BLURAY_PROTOCOL)           += bluray.o
 OBJS-$(CONFIG_CACHE_PROTOCOL)            += cache.o
 OBJS-$(CONFIG_CONCAT_PROTOCOL)           += concat.o
+OBJS-$(CONFIG_CONCATF_PROTOCOL)          += concat.o
 OBJS-$(CONFIG_CRYPTO_PROTOCOL)           += crypto.o
 OBJS-$(CONFIG_DATA_PROTOCOL)             += data_uri.o
 OBJS-$(CONFIG_FFRTMPCRYPT_PROTOCOL)      += rtmpcrypt.o rtmpdigest.o rtmpdh.o
diff --git a/libavformat/concat.c b/libavformat/concat.c
index 278afd997d..b66e3b9e01 100644
--- a/libavformat/concat.c
+++ b/libavformat/concat.c
@@ -22,9 +22,11 @@ 
  */
 
 #include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
 #include "libavutil/mem.h"
 
 #include "avformat.h"
+#include "avio_internal.h"
 #include "url.h"
 
 #define AV_CAT_SEPARATOR "|"
@@ -56,6 +58,7 @@  static av_cold int concat_close(URLContext *h)
     return err < 0 ? -1 : 0;
 }
 
+#if CONFIG_CONCAT_PROTOCOL
 static av_cold int concat_open(URLContext *h, const char *uri, int flags)
 {
     char *node_uri = NULL;
@@ -124,6 +127,7 @@  static av_cold int concat_open(URLContext *h, const char *uri, int flags)
     data->total_size = total_size;
     return err;
 }
+#endif
 
 static int concat_read(URLContext *h, unsigned char *buf, int size)
 {
@@ -188,6 +192,7 @@  static int64_t concat_seek(URLContext *h, int64_t pos, int whence)
     return result;
 }
 
+#if CONFIG_CONCAT_PROTOCOL
 const URLProtocol ff_concat_protocol = {
     .name           = "concat",
     .url_open       = concat_open,
@@ -197,3 +202,144 @@  const URLProtocol ff_concat_protocol = {
     .priv_data_size = sizeof(struct concat_data),
     .default_whitelist = "concat,file,subfile",
 };
+#endif
+
+#if CONFIG_CONCATF_PROTOCOL
+// Custom ff_read_line_to_bprint() implementation where line breaks can be
+// part of the line being read if escaped.
+static int64_t read_line_to_bprint(AVIOContext *s, AVBPrint *bp)
+{
+    int len, end;
+    int64_t read = 0;
+    char tmp[1024];
+    char c;
+
+    do {
+        len = 0;
+        do {
+            char escape = c = avio_r8(s);
+            if (c == '\\')
+                c = avio_r8(s);
+            end = (c == '\r' || c == '\n' || c == '\0');
+            if (end && escape == '\\') {
+                if (c != '\0') {
+                    tmp[len++] = c;
+                    end = 0;
+                } else
+                    tmp[len++] = escape;
+            } else if (!end) {
+                if (escape == '\\') {
+                    tmp[len++] = escape;
+                    avio_skip(s, -1);
+                } else
+                    tmp[len++] = c;
+            }
+        } while (!end && len < sizeof(tmp));
+        av_bprint_append_data(bp, tmp, len);
+        read += len;
+    } while (!end);
+
+    if (c == '\r' && avio_r8(s) != '\n' && !avio_feof(s))
+        avio_skip(s, -1);
+
+    if (!c && s->error)
+        return s->error;
+
+    if (!c && !read && avio_feof(s))
+        return AVERROR_EOF;
+
+    return read;
+}
+
+static av_cold int concatf_open(URLContext *h, const char *uri, int flags)
+{
+    AVBPrint bp;
+    struct concat_data  *data = h->priv_data;
+    struct concat_nodes *nodes = NULL;
+    AVIOContext *in = NULL;
+    URLContext *uc;
+    int64_t size, total_size = 0;
+    unsigned int nodes_size = 0;
+    size_t i = 0;
+    int err = 0;
+
+    if (!av_strstart(uri, "concatf:", &uri)) {
+        av_log(h, AV_LOG_ERROR, "URL %s lacks prefix\n", uri);
+        return AVERROR(EINVAL);
+    }
+
+    /* handle input */
+    if (!*uri)
+        return AVERROR(ENOENT);
+
+    err = ffio_open_whitelist(&in, uri, AVIO_FLAG_READ, &h->interrupt_callback,
+                              NULL, h->protocol_whitelist, h->protocol_blacklist);
+    if (err < 0)
+        return err;
+
+    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    for (i = 0; !avio_feof(in); i++) {
+        size_t len = i;
+
+        av_bprint_clear(&bp);
+        if ((err = read_line_to_bprint(in, &bp)) <= 0) {
+            if (err == 0 && i == 0)
+                err = AVERROR_INVALIDDATA;
+            else if (err == AVERROR_EOF)
+                err = 0;
+            break;
+        }
+
+        if (++len == SIZE_MAX / sizeof(*nodes)) {
+            err = AVERROR(ENAMETOOLONG);
+            break;
+        }
+
+        /* creating URLContext */
+        err = ffurl_open_whitelist(&uc, bp.str, flags,
+                                   &h->interrupt_callback, NULL, h->protocol_whitelist, h->protocol_blacklist, h);
+        if (err < 0)
+            break;
+
+        /* creating size */
+        if ((size = ffurl_size(uc)) < 0) {
+            ffurl_close(uc);
+            err = AVERROR(ENOSYS);
+            break;
+        }
+
+        nodes = av_fast_realloc(data->nodes, &nodes_size, sizeof(*nodes) * len);
+        if (!nodes) {
+            ffurl_close(uc);
+            err = AVERROR(ENOMEM);
+            break;
+        }
+        data->nodes = nodes;
+
+        /* assembling */
+        data->nodes[i].uc   = uc;
+        data->nodes[i].size = size;
+        total_size += size;
+    }
+    avio_closep(&in);
+    av_bprint_finalize(&bp, NULL);
+    data->length = i;
+
+    if (err < 0)
+        concat_close(h);
+
+    data->total_size = total_size;
+    return err;
+}
+
+const URLProtocol ff_concatf_protocol = {
+    .name           = "concatf",
+    .url_open       = concatf_open,
+    .url_read       = concat_read,
+    .url_seek       = concat_seek,
+    .url_close      = concat_close,
+    .priv_data_size = sizeof(struct concat_data),
+    .default_whitelist = "concatf,concat,file,subfile",
+};
+#endif
diff --git a/libavformat/protocols.c b/libavformat/protocols.c
index 4b6b1c8e98..7f08f151b6 100644
--- a/libavformat/protocols.c
+++ b/libavformat/protocols.c
@@ -27,6 +27,7 @@  extern const URLProtocol ff_async_protocol;
 extern const URLProtocol ff_bluray_protocol;
 extern const URLProtocol ff_cache_protocol;
 extern const URLProtocol ff_concat_protocol;
+extern const URLProtocol ff_concatf_protocol;
 extern const URLProtocol ff_crypto_protocol;
 extern const URLProtocol ff_data_protocol;
 extern const URLProtocol ff_ffrtmpcrypt_protocol;