diff mbox series

[FFmpeg-devel,v17,1/5] libavutil: Add wchartoutf8(), wchartoansi(), utf8toansi() and getenv_utf8()

Message ID 20220617093141.9826-1-nil-admirari@mailo.com
State New
Headers show
Series [FFmpeg-devel,v17,1/5] libavutil: Add wchartoutf8(), wchartoansi(), utf8toansi() and getenv_utf8() | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Nil Admirari June 17, 2022, 9:31 a.m. UTC
wchartoutf8() converts strings returned by WinAPI into UTF-8,
which is FFmpeg's preferred encoding.

Some external dependencies, such as AviSynth, are still
not Unicode-enabled. utf8toansi() converts UTF-8 strings
into ANSI in two steps: UTF-8 -> wchar_t -> ANSI.
wchartoansi() is responsible for the second step of the conversion.
Conversion in just one step is not supported by WinAPI.

Since these character converting functions allocate the buffer
of necessary size, they also facilitate the removal of MAX_PATH limit
in places where fixed-size ANSI/WCHAR strings were used
as filename buffers.

getenv_utf8() wraps _wgetenv(), converting its input from
and its output to UTF-8. Unlike plain getenv(), getenv_utf8()
returns a newly allocated string that the caller must free.

Because of that, in places that only test the existence of
an environment variable or compare its value with a string
consisting entirely of ASCII characters, the use of plain getenv()
is still preferred. (libavutil/log.c check_color_terminal()
is an example of such a place.)

Plain getenv() is also preferred in UNIX-only code,
such as bktr.c, fbdev_common.c, oss.c in libavdevice
or af_ladspa.c in libavfilter.
---
 configure                  |  1 +
 libavutil/getenv_utf8.h    | 71 ++++++++++++++++++++++++++++++++++++++
 libavutil/wchar_filename.h | 51 +++++++++++++++++++++++++++
 3 files changed, 123 insertions(+)
 create mode 100644 libavutil/getenv_utf8.h

Comments

Soft Works June 17, 2022, 7:16 p.m. UTC | #1
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Nil
> Admirari
> Sent: Friday, June 17, 2022 11:32 AM
> To: ffmpeg-devel@ffmpeg.org
> Subject: [FFmpeg-devel] [PATCH v17 1/5] libavutil: Add wchartoutf8(),
> wchartoansi(), utf8toansi() and getenv_utf8()
> 
> wchartoutf8() converts strings returned by WinAPI into UTF-8,
> which is FFmpeg's preferred encoding.
> 
> Some external dependencies, such as AviSynth, are still
> not Unicode-enabled. utf8toansi() converts UTF-8 strings
> into ANSI in two steps: UTF-8 -> wchar_t -> ANSI.
> wchartoansi() is responsible for the second step of the conversion.
> Conversion in just one step is not supported by WinAPI.
> 
> Since these character converting functions allocate the buffer
> of necessary size, they also facilitate the removal of MAX_PATH limit
> in places where fixed-size ANSI/WCHAR strings were used
> as filename buffers.
> 
> getenv_utf8() wraps _wgetenv() converting its input from
> and its output to UTF-8. Compared to plain getenv(),
> getenv_utf8() requires a cleanup.
> 
> Because of that, in places that only test the existence of
> an environment variable or compare its value with a string
> consisting entirely of ASCII characters, the use of plain getenv()
> is still preferred. (libavutil/log.c check_color_terminal()
> is an example of such a place.)
> 
> Plain getenv() is also preferred in UNIX-only code,
> such as bktr.c, fbdev_common.c, oss.c in libavdevice
> or af_ladspa.c in libavfilter.
> ---
>  configure                  |  1 +
>  libavutil/getenv_utf8.h    | 71
> ++++++++++++++++++++++++++++++++++++++
>  libavutil/wchar_filename.h | 51 +++++++++++++++++++++++++++
>  3 files changed, 123 insertions(+)
>  create mode 100644 libavutil/getenv_utf8.h
> 
> diff --git a/configure b/configure
> index 3dca1c4bd3..fa37a74531 100755
> --- a/configure
> +++ b/configure
> @@ -2272,6 +2272,7 @@ SYSTEM_FUNCS="
>      fcntl
>      getaddrinfo
>      getauxval
> +    getenv
>      gethrtime
>      getopt
>      GetModuleHandle
> diff --git a/libavutil/getenv_utf8.h b/libavutil/getenv_utf8.h
> new file mode 100644
> index 0000000000..161e3e6202
> --- /dev/null
> +++ b/libavutil/getenv_utf8.h
> @@ -0,0 +1,71 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later
> version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#ifndef AVUTIL_GETENV_UTF8_H
> +#define AVUTIL_GETENV_UTF8_H
> +
> +#include <stdlib.h>
> +
> +#include "mem.h"
> +
> +#ifdef HAVE_GETENV
> +
> +#ifdef _WIN32
> +
> +#include "libavutil/wchar_filename.h"
> +
> +static inline char *getenv_utf8(const char *varname)
> +{
> +    wchar_t *varname_w, *var_w;
> +    char *var;
> +
> +    if (utf8towchar(varname, &varname_w))
> +        return NULL;
> +    if (!varname_w)
> +        return NULL;
> +
> +    var_w = _wgetenv(varname_w);
> +    av_free(varname_w);
> +
> +    if (!var_w)
> +        return NULL;
> +    if (wchartoutf8(var_w, &var))
> +        return NULL;
> +
> +    return var;
> +
> +    // No CP_ACP fallback compared to other *_utf8() functions:
> +    // non UTF-8 strings must not be returned.
> +}
> +
> +#else
> +
> +static inline char *getenv_utf8(const char *varname)
> +{
> +    return av_strdup(getenv(varname));
> +}
> +
> +#endif // _WIN32
> +
> +#else
> +
> +#define getenv_utf8(x) NULL
> +
> +#endif // HAVE_GETENV
> +
> +#endif // AVUTIL_GETENV_UTF8_H
> diff --git a/libavutil/wchar_filename.h b/libavutil/wchar_filename.h
> index f36d9dfea3..a6d71e52e5 100644
> --- a/libavutil/wchar_filename.h
> +++ b/libavutil/wchar_filename.h
> @@ -41,6 +41,57 @@ static inline int utf8towchar(const char
> *filename_utf8, wchar_t **filename_w)
>      return 0;
>  }
> 
> +av_warn_unused_result
> +static inline int wchartocp(unsigned int code_page, const wchar_t
> *filename_w,
> +                            char **filename)
> +{
> +    DWORD flags = code_page == CP_UTF8 ? WC_ERR_INVALID_CHARS : 0;
> +    int num_chars = WideCharToMultiByte(code_page, flags,
> filename_w, -1,
> +                                        NULL, 0, NULL, NULL);
> +    if (num_chars <= 0) {
> +        *filename = NULL;
> +        return 0;
> +    }
> +    *filename = av_malloc_array(num_chars, sizeof *filename);
> +    if (!*filename) {
> +        errno = ENOMEM;
> +        return -1;
> +    }
> +    WideCharToMultiByte(code_page, flags, filename_w, -1,
> +                        *filename, num_chars, NULL, NULL);
> +    return 0;
> +}
> +
> +av_warn_unused_result
> +static inline int wchartoutf8(const wchar_t *filename_w, char
> **filename)
> +{
> +    return wchartocp(CP_UTF8, filename_w, filename);
> +}
> +
> +av_warn_unused_result
> +static inline int wchartoansi(const wchar_t *filename_w, char
> **filename)
> +{
> +    return wchartocp(CP_ACP, filename_w, filename);
> +}
> +
> +av_warn_unused_result
> +static inline int utf8toansi(const char *filename_utf8, char
> **filename)
> +{
> +    wchar_t *filename_w = NULL;
> +    int ret = -1;
> +    if (utf8towchar(filename_utf8, &filename_w))
> +        return -1;
> +
> +    if (!filename_w) {
> +        *filename = NULL;
> +        return 0;
> +    }
> +
> +    ret = wchartoansi(filename_w, filename);
> +    av_free(filename_w);
> +    return ret;
> +}
> +
>  /**
>   * Checks for extended path prefixes for which normalization needs
> to be skipped.
>   * see .NET6: PathInternal.IsExtended()
> --


LGTM for the whole patchset. I didn't look at the getenv part, but I think
Martin did (or will do).

Thanks for all your effort (and patience)!

Best wishes,
softworkz
Martin Storsjö June 18, 2022, 10:21 p.m. UTC | #2
On Fri, 17 Jun 2022, Nil Admirari wrote:

> wchartoutf8() converts strings returned by WinAPI into UTF-8,
> which is FFmpeg's preferred encoding.
>
> Some external dependencies, such as AviSynth, are still
> not Unicode-enabled. utf8toansi() converts UTF-8 strings
> into ANSI in two steps: UTF-8 -> wchar_t -> ANSI.
> wchartoansi() is responsible for the second step of the conversion.
> Conversion in just one step is not supported by WinAPI.
>
> Since these character converting functions allocate the buffer
> of necessary size, they also facilitate the removal of MAX_PATH limit
> in places where fixed-size ANSI/WCHAR strings were used
> as filename buffers.
>
> getenv_utf8() wraps _wgetenv() converting its input from
> and its output to UTF-8. Compared to plain getenv(),
> getenv_utf8() requires a cleanup.
>
> Because of that, in places that only test the existence of
> an environment variable or compare its value with a string
> consisting entirely of ASCII characters, the use of plain getenv()
> is still preferred. (libavutil/log.c check_color_terminal()
> is an example of such a place.)
>
> Plain getenv() is also preferred in UNIX-only code,
> such as bktr.c, fbdev_common.c, oss.c in libavdevice
> or af_ladspa.c in libavfilter.
> ---
> configure                  |  1 +
> libavutil/getenv_utf8.h    | 71 ++++++++++++++++++++++++++++++++++++++
> libavutil/wchar_filename.h | 51 +++++++++++++++++++++++++++
> 3 files changed, 123 insertions(+)
> create mode 100644 libavutil/getenv_utf8.h

This looks generally good - as others seem to be ok with this and there 
doesn't seem to be any more objections, I can push this in a while. (I'm 
not familiar with the avisynth bits though, but it seems like there's 
agreement about it.)

> diff --git a/libavutil/getenv_utf8.h b/libavutil/getenv_utf8.h
> new file mode 100644
> index 0000000000..161e3e6202
> --- /dev/null
> +++ b/libavutil/getenv_utf8.h
> @@ -0,0 +1,71 @@
> +#ifndef AVUTIL_GETENV_UTF8_H
> +#define AVUTIL_GETENV_UTF8_H
> +
> +#include <stdlib.h>
> +
> +#include "mem.h"
> +
> +#ifdef HAVE_GETENV

Note that this should be #if HAVE_GETENV - these constants are always 
defined and evaluate to 0 or 1. No need to resend the patchset just for 
that. (I added an explicit #include "config.h" above here too, just to 
make it clearer.)

// Martin
Andreas Rheinhardt June 19, 2022, 4:58 a.m. UTC | #3
Nil Admirari:
> wchartoutf8() converts strings returned by WinAPI into UTF-8,
> which is FFmpeg's preferred encoding.
> 
> Some external dependencies, such as AviSynth, are still
> not Unicode-enabled. utf8toansi() converts UTF-8 strings
> into ANSI in two steps: UTF-8 -> wchar_t -> ANSI.
> wchartoansi() is responsible for the second step of the conversion.
> Conversion in just one step is not supported by WinAPI.
> 
> Since these character converting functions allocate the buffer
> of necessary size, they also facilitate the removal of MAX_PATH limit
> in places where fixed-size ANSI/WCHAR strings were used
> as filename buffers.
> 
> getenv_utf8() wraps _wgetenv() converting its input from
> and its output to UTF-8. Compared to plain getenv(),
> getenv_utf8() requires a cleanup.
> 
> Because of that, in places that only test the existence of
> an environment variable or compare its value with a string
> consisting entirely of ASCII characters, the use of plain getenv()
> is still preferred. (libavutil/log.c check_color_terminal()
> is an example of such a place.)
> 
> Plain getenv() is also preferred in UNIX-only code,
> such as bktr.c, fbdev_common.c, oss.c in libavdevice
> or af_ladspa.c in libavfilter.
> ---
>  configure                  |  1 +
>  libavutil/getenv_utf8.h    | 71 ++++++++++++++++++++++++++++++++++++++
>  libavutil/wchar_filename.h | 51 +++++++++++++++++++++++++++
>  3 files changed, 123 insertions(+)
>  create mode 100644 libavutil/getenv_utf8.h
> 
> diff --git a/configure b/configure
> index 3dca1c4bd3..fa37a74531 100755
> --- a/configure
> +++ b/configure
> @@ -2272,6 +2272,7 @@ SYSTEM_FUNCS="
>      fcntl
>      getaddrinfo
>      getauxval
> +    getenv
>      gethrtime
>      getopt
>      GetModuleHandle
> diff --git a/libavutil/getenv_utf8.h b/libavutil/getenv_utf8.h
> new file mode 100644
> index 0000000000..161e3e6202
> --- /dev/null
> +++ b/libavutil/getenv_utf8.h
> @@ -0,0 +1,71 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVUTIL_GETENV_UTF8_H
> +#define AVUTIL_GETENV_UTF8_H
> +
> +#include <stdlib.h>
> +
> +#include "mem.h"
> +
> +#ifdef HAVE_GETENV
> +
> +#ifdef _WIN32
> +
> +#include "libavutil/wchar_filename.h"
> +
> +static inline char *getenv_utf8(const char *varname)
> +{
> +    wchar_t *varname_w, *var_w;
> +    char *var;
> +
> +    if (utf8towchar(varname, &varname_w))
> +        return NULL;
> +    if (!varname_w)
> +        return NULL;
> +
> +    var_w = _wgetenv(varname_w);
> +    av_free(varname_w);
> +
> +    if (!var_w)
> +        return NULL;
> +    if (wchartoutf8(var_w, &var))
> +        return NULL;
> +
> +    return var;
> +
> +    // No CP_ACP fallback compared to other *_utf8() functions:
> +    // non UTF-8 strings must not be returned.
> +}
> +
> +#else
> +
> +static inline char *getenv_utf8(const char *varname)
> +{
> +    return av_strdup(getenv(varname));

This forces allocations and frees in scenarios where this is wholly
unnecessary. This can be avoided by adding a custom deallocator for
strings returned via getenv_utf8: Namely a define/wrapper around av_free
in the _WIN32 and a no-op else.

> +}
> +
> +#endif // _WIN32
> +
> +#else
> +
> +#define getenv_utf8(x) NULL
> +
> +#endif // HAVE_GETENV
> +
> +#endif // AVUTIL_GETENV_UTF8_H
> diff --git a/libavutil/wchar_filename.h b/libavutil/wchar_filename.h
> index f36d9dfea3..a6d71e52e5 100644
> --- a/libavutil/wchar_filename.h
> +++ b/libavutil/wchar_filename.h
> @@ -41,6 +41,57 @@ static inline int utf8towchar(const char *filename_utf8, wchar_t **filename_w)
>      return 0;
>  }
>  
> +av_warn_unused_result
> +static inline int wchartocp(unsigned int code_page, const wchar_t *filename_w,
> +                            char **filename)
> +{
> +    DWORD flags = code_page == CP_UTF8 ? WC_ERR_INVALID_CHARS : 0;
> +    int num_chars = WideCharToMultiByte(code_page, flags, filename_w, -1,
> +                                        NULL, 0, NULL, NULL);
> +    if (num_chars <= 0) {
> +        *filename = NULL;
> +        return 0;
> +    }
> +    *filename = av_malloc_array(num_chars, sizeof *filename);
> +    if (!*filename) {
> +        errno = ENOMEM;
> +        return -1;
> +    }
> +    WideCharToMultiByte(code_page, flags, filename_w, -1,
> +                        *filename, num_chars, NULL, NULL);
> +    return 0;
> +}
> +
> +av_warn_unused_result
> +static inline int wchartoutf8(const wchar_t *filename_w, char **filename)
> +{
> +    return wchartocp(CP_UTF8, filename_w, filename);
> +}
> +
> +av_warn_unused_result
> +static inline int wchartoansi(const wchar_t *filename_w, char **filename)
> +{
> +    return wchartocp(CP_ACP, filename_w, filename);
> +}
> +
> +av_warn_unused_result
> +static inline int utf8toansi(const char *filename_utf8, char **filename)
> +{
> +    wchar_t *filename_w = NULL;
> +    int ret = -1;
> +    if (utf8towchar(filename_utf8, &filename_w))
> +        return -1;
> +
> +    if (!filename_w) {
> +        *filename = NULL;
> +        return 0;
> +    }
> +
> +    ret = wchartoansi(filename_w, filename);
> +    av_free(filename_w);
> +    return ret;
> +}
> +
>  /**
>   * Checks for extended path prefixes for which normalization needs to be skipped.
>   * see .NET6: PathInternal.IsExtended()
Andreas Rheinhardt June 19, 2022, 6:43 a.m. UTC | #4
Martin Storsjö:
> On Sun, 19 Jun 2022, Soft Works wrote:
> 
>>
>>
>>> -----Original Message-----
>>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>>> Andreas Rheinhardt
>>> Sent: Sunday, June 19, 2022 6:59 AM
>>> To: ffmpeg-devel@ffmpeg.org
>>> Subject: Re: [FFmpeg-devel] [PATCH v17 1/5] libavutil: Add
>>> wchartoutf8(), wchartoansi(), utf8toansi() and getenv_utf8()
>>>
>>> Nil Admirari:
>>>> wchartoutf8() converts strings returned by WinAPI into UTF-8,
>>>> which is FFmpeg's preferred encoding.
>>>>
>>>> Some external dependencies, such as AviSynth, are still
>>>> not Unicode-enabled. utf8toansi() converts UTF-8 strings
>>>> into ANSI in two steps: UTF-8 -> wchar_t -> ANSI.
>>>> wchartoansi() is responsible for the second step of the conversion.
>>>> Conversion in just one step is not supported by WinAPI.
>>>>
>>>> Since these character converting functions allocate the buffer
>>>> of necessary size, they also facilitate the removal of MAX_PATH
>>> limit
>>>> in places where fixed-size ANSI/WCHAR strings were used
>>>> as filename buffers.
>>>>
>>>> getenv_utf8() wraps _wgetenv() converting its input from
>>>> and its output to UTF-8. Compared to plain getenv(),
>>>> getenv_utf8() requires a cleanup.
>>>>
>>>> Because of that, in places that only test the existence of
>>>> an environment variable or compare its value with a string
>>>> consisting entirely of ASCII characters, the use of plain getenv()
>>>> is still preferred. (libavutil/log.c check_color_terminal()
>>>> is an example of such a place.)
>>>>
>>>> Plain getenv() is also preferred in UNIX-only code,
>>>> such as bktr.c, fbdev_common.c, oss.c in libavdevice
>>>> or af_ladspa.c in libavfilter.
>>>> ---
>>>>  configure                  |  1 +
>>>>  libavutil/getenv_utf8.h    | 71
>>> ++++++++++++++++++++++++++++++++++++++
>>>>  libavutil/wchar_filename.h | 51 +++++++++++++++++++++++++++
>>>>  3 files changed, 123 insertions(+)
>>>>  create mode 100644 libavutil/getenv_utf8.h
>>>>
>>>> diff --git a/configure b/configure
>>>> index 3dca1c4bd3..fa37a74531 100755
>>>> --- a/configure
>>>> +++ b/configure
>>>> @@ -2272,6 +2272,7 @@ SYSTEM_FUNCS="
>>>>      fcntl
>>>>      getaddrinfo
>>>>      getauxval
>>>> +    getenv
>>>>      gethrtime
>>>>      getopt
>>>>      GetModuleHandle
>>>> diff --git a/libavutil/getenv_utf8.h b/libavutil/getenv_utf8.h
>>>> new file mode 100644
>>>> index 0000000000..161e3e6202
>>>> --- /dev/null
>>>> +++ b/libavutil/getenv_utf8.h
>>>> @@ -0,0 +1,71 @@
>>>> +/*
>>>> + * This file is part of FFmpeg.
>>>> + *
>>>> + * FFmpeg is free software; you can redistribute it and/or
>>>> + * modify it under the terms of the GNU Lesser General Public
>>>> + * License as published by the Free Software Foundation; either
>>>> + * version 2.1 of the License, or (at your option) any later
>>> version.
>>>> + *
>>>> + * FFmpeg is distributed in the hope that it will be useful,
>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>> GNU
>>>> + * Lesser General Public License for more details.
>>>> + *
>>>> + * You should have received a copy of the GNU Lesser General
>>> Public
>>>> + * License along with FFmpeg; if not, write to the Free Software
>>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>>> 02110-1301 USA
>>>> + */
>>>> +
>>>> +#ifndef AVUTIL_GETENV_UTF8_H
>>>> +#define AVUTIL_GETENV_UTF8_H
>>>> +
>>>> +#include <stdlib.h>
>>>> +
>>>> +#include "mem.h"
>>>> +
>>>> +#ifdef HAVE_GETENV
>>>> +
>>>> +#ifdef _WIN32
>>>> +
>>>> +#include "libavutil/wchar_filename.h"
>>>> +
>>>> +static inline char *getenv_utf8(const char *varname)
>>>> +{
>>>> +    wchar_t *varname_w, *var_w;
>>>> +    char *var;
>>>> +
>>>> +    if (utf8towchar(varname, &varname_w))
>>>> +        return NULL;
>>>> +    if (!varname_w)
>>>> +        return NULL;
>>>> +
>>>> +    var_w = _wgetenv(varname_w);
>>>> +    av_free(varname_w);
>>>> +
>>>> +    if (!var_w)
>>>> +        return NULL;
>>>> +    if (wchartoutf8(var_w, &var))
>>>> +        return NULL;
>>>> +
>>>> +    return var;
>>>> +
>>>> +    // No CP_ACP fallback compared to other *_utf8() functions:
>>>> +    // non UTF-8 strings must not be returned.
>>>> +}
>>>> +
>>>> +#else
>>>> +
>>>> +static inline char *getenv_utf8(const char *varname)
>>>> +{
>>>> +    return av_strdup(getenv(varname));
>>>
>>> This forces allocations and frees in scenarios where this is wholly
>>> unnecessary.
>>
>> Why do you think this is unnecessary? At least on Windows, there is
>> no guarantee regarding the lifetime of strings returned from
>> getenv(). In case when some other code would call _putenv to set the
>> env variable, this can cause the previously returned string to become
>> invalid without the caller being able to know.
> 
> Yes, if you would keep the return value from getenv for too long, while
> something else changes the environment in the same process, you'd have
> such an issue. But that hasn't been a concern so far - right? And isn't
> what we try to fix here.
> 

And if this were an issue, then I don't see what would preclude it from
already happening in getenv_utf8, e.g. in wchartoutf8 (or in
av_strdup() in the current non-Windows implementation).

- Andreas
Nil Admirari June 19, 2022, 11:49 a.m. UTC | #5
> Note that this should be #if HAVE_GETENV - these constants are always 
> defined and evaluate to 0 or 1. No need to resend the patchset just for 
> that. (I added an explicit #include "config.h" above here too, just to 
> make it clearer.)

Fixed: https://ffmpeg.org/pipermail/ffmpeg-devel/2022-June/297804.html
Nil Admirari June 19, 2022, 11:56 a.m. UTC | #6
> This forces allocations and frees in scenarios where this is wholly
> unnecessary. This can be avoided by adding a custom deallocator for
> strings returned via getenv_utf8: Namely a define/wrapper around av_free
> in the _WIN32 and a no-op else.

Done: https://ffmpeg.org/pipermail/ffmpeg-devel/2022-June/297804.html

Note, however, that the introduction of freeenv_utf8()
doubles allocations and deallocations in vf_frei0r.c on Windows:
https://ffmpeg.org/pipermail/ffmpeg-devel/2022-June/297802.html

These additional memory operations can be avoided only with a whole bunch
of new #ifdef _WIN32 and #if HAVE_GETENV, which haven't been done.
Andreas Rheinhardt June 20, 2022, 12:54 a.m. UTC | #7
nil-admirari@mailo.com:
>> This forces allocations and frees in scenarios where this is wholly
>> unnecessary. This can be avoided by adding a custom deallocator for
>> strings returned via getenv_utf8: Namely a define/wrapper around av_free
>> in the _WIN32 and a no-op else.
> 
> Done: https://ffmpeg.org/pipermail/ffmpeg-devel/2022-June/297804.html
> 

Thanks for this.

> Note, however, that the introduction of freeenv_utf8()
> doubles allocations and deallocations in vf_frei0r.c on Windows:
> https://ffmpeg.org/pipermail/ffmpeg-devel/2022-June/297802.html
> 
> These additional memory operations can be avoided only with a whole bunch
> of new #ifdef _WIN32 and #if HAVE_GETENV, which haven't been done.
> 

Or one reuses the #ifs from getenv_utf8.h.
https://github.com/mkver/FFmpeg/commits/getenv contains a version that
does this.

- Andreas
Nil Admirari June 20, 2022, 10:36 a.m. UTC | #8
> Or one reuses the #ifs from getenv_utf8.h.
> https://github.com/mkver/FFmpeg/commits/getenv contains a version that
> does this.

Introduced getenv_dup() and simplified #ifs a little:
https://ffmpeg.org/pipermail/ffmpeg-devel/2022-June/297841.html
diff mbox series

Patch

diff --git a/configure b/configure
index 3dca1c4bd3..fa37a74531 100755
--- a/configure
+++ b/configure
@@ -2272,6 +2272,7 @@  SYSTEM_FUNCS="
     fcntl
     getaddrinfo
     getauxval
+    getenv
     gethrtime
     getopt
     GetModuleHandle
diff --git a/libavutil/getenv_utf8.h b/libavutil/getenv_utf8.h
new file mode 100644
index 0000000000..161e3e6202
--- /dev/null
+++ b/libavutil/getenv_utf8.h
@@ -0,0 +1,71 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_GETENV_UTF8_H
+#define AVUTIL_GETENV_UTF8_H
+
+#include <stdlib.h>
+
+#include "mem.h"
+
+#ifdef HAVE_GETENV
+
+#ifdef _WIN32
+
+#include "libavutil/wchar_filename.h"
+
+static inline char *getenv_utf8(const char *varname)
+{
+    wchar_t *varname_w, *var_w;
+    char *var;
+
+    if (utf8towchar(varname, &varname_w))
+        return NULL;
+    if (!varname_w)
+        return NULL;
+
+    var_w = _wgetenv(varname_w);
+    av_free(varname_w);
+
+    if (!var_w)
+        return NULL;
+    if (wchartoutf8(var_w, &var))
+        return NULL;
+
+    return var;
+
+    // No CP_ACP fallback compared to other *_utf8() functions:
+    // non UTF-8 strings must not be returned.
+}
+
+#else
+
+static inline char *getenv_utf8(const char *varname)
+{
+    return av_strdup(getenv(varname));
+}
+
+#endif // _WIN32
+
+#else
+
+#define getenv_utf8(x) NULL
+
+#endif // HAVE_GETENV
+
+#endif // AVUTIL_GETENV_UTF8_H
diff --git a/libavutil/wchar_filename.h b/libavutil/wchar_filename.h
index f36d9dfea3..a6d71e52e5 100644
--- a/libavutil/wchar_filename.h
+++ b/libavutil/wchar_filename.h
@@ -41,6 +41,57 @@  static inline int utf8towchar(const char *filename_utf8, wchar_t **filename_w)
     return 0;
 }
 
+av_warn_unused_result
+static inline int wchartocp(unsigned int code_page, const wchar_t *filename_w,
+                            char **filename)
+{
+    DWORD flags = code_page == CP_UTF8 ? WC_ERR_INVALID_CHARS : 0;
+    int num_chars = WideCharToMultiByte(code_page, flags, filename_w, -1,
+                                        NULL, 0, NULL, NULL);
+    if (num_chars <= 0) {
+        *filename = NULL;
+        return 0;
+    }
+    *filename = av_malloc_array(num_chars, sizeof *filename);
+    if (!*filename) {
+        errno = ENOMEM;
+        return -1;
+    }
+    WideCharToMultiByte(code_page, flags, filename_w, -1,
+                        *filename, num_chars, NULL, NULL);
+    return 0;
+}
+
+av_warn_unused_result
+static inline int wchartoutf8(const wchar_t *filename_w, char **filename)
+{
+    return wchartocp(CP_UTF8, filename_w, filename);
+}
+
+av_warn_unused_result
+static inline int wchartoansi(const wchar_t *filename_w, char **filename)
+{
+    return wchartocp(CP_ACP, filename_w, filename);
+}
+
+av_warn_unused_result
+static inline int utf8toansi(const char *filename_utf8, char **filename)
+{
+    wchar_t *filename_w = NULL;
+    int ret = -1;
+    if (utf8towchar(filename_utf8, &filename_w))
+        return -1;
+
+    if (!filename_w) {
+        *filename = NULL;
+        return 0;
+    }
+
+    ret = wchartoansi(filename_w, filename);
+    av_free(filename_w);
+    return ret;
+}
+
 /**
  * Checks for extended path prefixes for which normalization needs to be skipped.
  * see .NET6: PathInternal.IsExtended()