diff mbox

[FFmpeg-devel,4/4] avformat/libopenmpt: Probe file format from file data if possible

Message ID 1515233228-16796-4-git-send-email-osmanx@problemloesungsmaschine.de
State Superseded
Headers show

Commit Message

Jörn Heusipp Jan. 6, 2018, 10:07 a.m. UTC
When building with libopenmpt 0.3, use the libopenmpt file header
probing functions for probing. libopenmpt probing functions are
allocation-free and designed to be as fast as possible.

For libopenmpt 0.2, or when libopenmpt 0.3 file header probing cannot
probe successfully due to too small probe buffer, test the filename
against the file extensions supported by the libopenmpt library that
is actually linked, instead of relying on a hard-coded file extension
list. File extension testing is also allocation-free and designed to
be fast in libopenmpt. Avoiding a hard-coded file extension list is
useful because later libopenmpt versions will likely add support for
more module file formats.

libopenmpt file header probing is tested regularly against the FATE
suite and other diverse file collections by libopenmpt upstream in
order to avoid false positives.

FATE passes with './configure --enable-libopenmpt' as well as with
'./configure --enable-libopenmpt --enable-libmodplug'.

As expected, I did not see any measurable performance difference
caused by libopenmpt file header probing when compared to the previous
pure file extension based format probing (using the following
synthetic test program (which tries to do nothing but exercise file
probing) on the complete FATE suite).

    // find ../fate/ -type f | xargs --no-run-if-empty ./probetest
    #include <stdio.h>
    #include "libavformat/avformat.h"
    #define BUFSIZE 2048
    static char buf[BUFSIZE + AVPROBE_PADDING_SIZE];
    int main(int argc, const char * * argv) {
        av_log_set_level(AV_LOG_WARNING);
        av_register_all();
        for (int i = 1; i < argc; ++i) {
            AVProbeData pd;
            FILE * f;
            size_t size;
            memset(&pd, 0, sizeof(AVProbeData));
            pd.filename = argv[i];
            memset(buf, 0, sizeof(buf));
            f = fopen(pd.filename, "rb");
            size = fread(buf, 1, BUFSIZE, f);
            fclose(f);
            pd.buf_size = size;
            av_probe_input_format(&pd, 1);
        }
        return 0;
    }

Signed-off-by: Jörn Heusipp <osmanx@problemloesungsmaschine.de>
---
 libavformat/libopenmpt.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

Comments

Carl Eugen Hoyos Jan. 6, 2018, 3:10 p.m. UTC | #1
2018-01-06 11:07 GMT+01:00 Jörn Heusipp <osmanx@problemloesungsmaschine.de>:
> When building with libopenmpt 0.3, use the libopenmpt file header
> probing functions for probing. libopenmpt probing functions are
> allocation-free and designed to be as fast as possible.
>
> For libopenmpt 0.2, or when libopenmpt 0.3 file header probing cannot
> probe successfully due to too small probe buffer, test the filename
> against the file extensions supported by the libopenmpt library that
> is actually linked, instead of relying on a hard-coded file extension
> list. File extension testing is also allocation-free and designed to
> be fast in libopenmpt. Avoiding a hard-coded file extension list is
> useful because later libopenmpt versions will likely add support for
> more module file formats.
>
> libopenmpt file header probing is tested regularly against the FATE
> suite and other diverse file collections by libopenmpt upstream in
> order to avoid false positives.

You could also test tools/probetest

> FATE passes with './configure --enable-libopenmpt' as well as with
> './configure --enable-libopenmpt --enable-libmodplug'.
>
> As expected, I did not see any measurable performance difference
> caused by libopenmpt file header probing when compared to the previous
> pure file extension based format probing (using the following
> synthetic test program (which tries to do nothing but exercise file
> probing) on the complete FATE suite).
>
>     // find ../fate/ -type f | xargs --no-run-if-empty ./probetest
>     #include <stdio.h>
>     #include "libavformat/avformat.h"
>     #define BUFSIZE 2048
>     static char buf[BUFSIZE + AVPROBE_PADDING_SIZE];
>     int main(int argc, const char * * argv) {
>         av_log_set_level(AV_LOG_WARNING);
>         av_register_all();
>         for (int i = 1; i < argc; ++i) {
>             AVProbeData pd;
>             FILE * f;
>             size_t size;
>             memset(&pd, 0, sizeof(AVProbeData));
>             pd.filename = argv[i];
>             memset(buf, 0, sizeof(buf));
>             f = fopen(pd.filename, "rb");
>             size = fread(buf, 1, BUFSIZE, f);
>             fclose(f);
>             pd.buf_size = size;
>             av_probe_input_format(&pd, 1);
>         }
>         return 0;
>     }
>
> Signed-off-by: Jörn Heusipp <osmanx@problemloesungsmaschine.de>
> ---
>  libavformat/libopenmpt.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 59 insertions(+)
>
> diff --git a/libavformat/libopenmpt.c b/libavformat/libopenmpt.c
> index 5efbdc4..a663b2e 100644
> --- a/libavformat/libopenmpt.c
> +++ b/libavformat/libopenmpt.c
> @@ -218,6 +218,64 @@ static int read_seek_openmpt(AVFormatContext *s, int stream_idx, int64_t ts, int
>      return 0;
>  }
>
> +static int read_probe_openmpt(AVProbeData * p)
> +{
> +    const int score_data      = AVPROBE_SCORE_MIME + 1;   /* 76 */
> +    const int score_ext       = AVPROBE_SCORE_EXTENSION;  /* 50 */
> +    const int score_ext_retry = AVPROBE_SCORE_RETRY;      /* 25 */
> +    const int score_retry     = AVPROBE_SCORE_RETRY / 2;  /* 12 */
> +    const int score_fail      = 0;                        /*  0 */
> +
> +    const char *ext;
> +    int probe_result;
> +    int score = score_fail;
> +
> +    if (p->filename) {
> +        ext = strrchr(p->filename, '.');
> +        if (ext && strlen(ext + 1) > 0) {
> +            ext++;  /* skip '.' */
> +            if (openmpt_is_extension_supported(ext) == 1)
> +                score = FFMAX(score, score_ext);
> +        }
> +    }
> +
> +#if OPENMPT_API_VERSION_AT_LEAST(0,3,0)
> +    if (p->buf && p->buf_size > 0) {
> +        probe_result = openmpt_probe_file_header_without_filesize(
> +                           OPENMPT_PROBE_FILE_HEADER_FLAGS_DEFAULT,
> +                           p->buf, p->buf_size,
> +                           &openmpt_logfunc, NULL, NULL, NULL, NULL, NULL);
> +        if (probe_result == OPENMPT_PROBE_FILE_HEADER_RESULT_FAILURE) {
> +            score = score_fail;

What's wrong with return 0;?

> +        } else if (probe_result == OPENMPT_PROBE_FILE_HEADER_RESULT_SUCCESS) {
> +            score = FFMAX(score, score_data);

What does OPENMPT_PROBE_FILE_HEADER_RESULT_SUCCESS mean?
Why not return MAX?

> +        } else if (probe_result == OPENMPT_PROBE_FILE_HEADER_RESULT_WANTMOREDATA) {

I believe this should return 0 but maybe you found that this is bad?

> +            if (score > score_fail) {
> +                /* known file extension */
> +                score = FFMAX(score, score_ext_retry);
> +            } else {
> +                /* unknown file extension */
> +                if (p->buf_size >= openmpt_probe_file_header_get_recommended_size()) {
> +                    /* We have already received the recommended amount of data
> +                     * and still cannot decide. Return a rather low score.
> +                     */
> +                    score = FFMAX(score, score_retry);
> +                } else {
> +                    /* The file extension is unknown and we have very few data
> +                     * bytes available. libopenmpt cannot decide anything here,
> +                     * and returning any score > 0 would result in successfull
> +                     * probing of random data.
> +                     */
> +                    score = score_fail;

This patch indicates that it may be a good idea to require libopenmpt 0.3,
when was it released, which distributions do not include it?

Carl Eugen
Jörn Heusipp Jan. 7, 2018, 2:40 p.m. UTC | #2
On 01/06/2018 04:10 PM, Carl Eugen Hoyos wrote:
> 2018-01-06 11:07 GMT+01:00 Jörn Heusipp <osmanx@problemloesungsmaschine.de>:

>> libopenmpt file header probing is tested regularly against the FATE
>> suite and other diverse file collections by libopenmpt upstream in
>> order to avoid false positives.
> 
> You could also test tools/probetest

I was not aware of that tool. Thanks for the suggestion.

It currently lists a failure related to libopenmpt:
Failure of libopenmpt probing code with score=76 type=0 p=FDC size=1024

Looking at tools/probetest.c, that would be a file with very few bits 
set. libopenmpt detects the random data in question as M15 .MOD files 
(original Amiga 15 samples .mod files), and there is sadly not much that 
can be done about that. There are perfectly valid real-world M15 .MOD 
files with only 73 bits set in the first 600 bytes 
(untouchables-station.mod, 
<https://modarchive.org/index.php?request=view_by_moduleid&query=104280>). 
The following up to 64*4*4*63 bytes could even be completely 0 (empty 
pattern data) for valid files (even without the file being totally 
silent). The generated random data that tools/probetest complains about 
here contains 46 bits set to 1 in the first 600 bytes. What follows are 
various other examples with up to 96 bits set to 1. Completely loading a 
file like that would very likely reject it (in particular, libopenmpt 
can deduce the expected file size from the sample headers and, with some 
slack to account for corrupted real-world examples, reject files with 
non-matching size), however, that is not possible when only probing the 
file header.
The libopenmpt API allows for the file header probing functions to 
return false-positives, however false-negatives are not allowed.

Performance numbers shown by tools/probetest are what I had expected 
(measured on an AMD A4-5000, during normal Xorg session (i.e. not 100% 
idle)):

   1110194971 cycles,           aa
  24986722468 cycles,          aac
  26418545168 cycles,          ac3
   1484717267 cycles,          acm
   1627888281 cycles,          act
   2109884646 cycles,          adp
   2316235992 cycles,          ads
   1244706028 cycles,          adx
   1132390431 cycles,          aea
   1729241354 cycles,         aiff
   1728288238 cycles,          aix
   2662531158 cycles,          amr
  16189546067 cycles,        amrnb
  10342883200 cycles,        amrwb
   1487752343 cycles,          anm
   2268900502 cycles,          apc
   1140814303 cycles,          ape
   2181170710 cycles,         apng
  18698762054 cycles,      aqtitle
   2656908730 cycles,          asf
   2402762967 cycles,        asf_o
  18148196647 cycles,          ass
   1392503829 cycles,          ast
   1774264703 cycles,           au
   1807159562 cycles,          avi
   1745391230 cycles,          avr
   1370939762 cycles,          avs
   1555620708 cycles,  bethsoftvid
   1459171160 cycles,          bfi
   2640635742 cycles,         bink
   2022320986 cycles,          bit
   1664933324 cycles,        bfstm
   1588023172 cycles,        brstm
   1769430536 cycles,          boa
   2294286860 cycles,          c93
   1022646071 cycles,          caf
   9063207678 cycles,    cavsvideo
   1898790300 cycles,         cdxl
   1037718383 cycles,         cine
   3358938768 cycles,       concat
   2367399953 cycles,        dcstr
   1795803759 cycles,          dfa
   1454750468 cycles,        dirac
   1167905836 cycles,        dnxhd
   2076678208 cycles,          dsf
   1226761232 cycles,       dsicin
   1157816261 cycles,          dss
  31466350564 cycles,          dts
   1357475606 cycles,        dtshd
  15626181281 cycles,           dv
  12227021709 cycles,       dvbsub
   1747998309 cycles,       dvbtxt
   1941371107 cycles,          dxa
   1988122049 cycles,           ea
   1395161162 cycles,     ea_cdata
  21013119067 cycles,         eac3
   1282697126 cycles,         epaf
   1658521102 cycles,          ffm
   2919787300 cycles,   ffmetadata
   3786264941 cycles,         fits
   2700385826 cycles,         flac
   1840776863 cycles,         flic
   1317695853 cycles,          flv
   1511756606 cycles,     live_flv
   1135064427 cycles,          4xm
   1830871233 cycles,          frm
   3011403748 cycles,          fsb
   1462985803 cycles,          gdv
   1708440935 cycles,         genh
   3480430744 cycles,          gif
   2533542048 cycles,          gsm
   2412598563 cycles,          gxf
  21637989787 cycles,         h261
  22268834035 cycles,         h263
  22135718754 cycles,         h264
  13939886275 cycles,         hevc
   1979375582 cycles, hls,applehttp
   1658646375 cycles,          hnm
   1507634977 cycles,          ico
   2534774499 cycles,        idcin
   1684324336 cycles,          idf
   1353664382 cycles,          iff
   2978779893 cycles,         ilbc
   1892353081 cycles,    alias_pix
   2456259645 cycles,  brender_pix
   2077466815 cycles,    ingenient
  11281657144 cycles,      ipmovie
   1840789384 cycles,        ircam
   2455541614 cycles,          iss
   1114518907 cycles,          iv8
   1750327098 cycles,          ivf
   3803895407 cycles,          ivr
  30510491919 cycles,      jacosub
   1271391143 cycles,           jv
   1504674165 cycles,        lmlm4
  28284647311 cycles,         loas
   2746771768 cycles,          lrc
   1630546444 cycles,          lvf
   2198871369 cycles,          lxf
  15210250791 cycles,          m4v
   2074024051 cycles, matroska,webm
   1756348463 cycles,        mgsts
  13894318111 cycles,     microdvd
  15146276963 cycles,        mjpeg
  13215378411 cycles,   mjpeg_2000
  21505153187 cycles,          mlp
   1623684275 cycles,          mlv
   2009009898 cycles,           mm
   1401453493 cycles,          mmf
   3614852044 cycles, mov,mp4,m4a,3gp,3g2,mj2
  37065167696 cycles,          mp3
   2003306237 cycles,          mpc
   1695842377 cycles,         mpc8
  20922947044 cycles,         mpeg
  26950626806 cycles,       mpegts
  12903395151 cycles,    mpegvideo
   1861191163 cycles,       mpjpeg
  11292546869 cycles,         mpl2
  10904909514 cycles,        mpsub
   2556705558 cycles,          msf
  14520727615 cycles,     msnwctcp
   1513345014 cycles,         mtaf
   1498181103 cycles,          mtv
   2100567692 cycles,         musx
   1398481833 cycles,           mv
   3839928046 cycles,          mxf
   1084340183 cycles,           nc
   2260039804 cycles,   nistsphere
   1557302811 cycles,          nsp
  14077588650 cycles,          nsv
  12804865958 cycles,          nut
   3498085105 cycles,          nuv
   2785399093 cycles,          ogg
   2800628120 cycles,          oma
   2241873172 cycles,          paf
  11630567717 cycles,          pjs
   1538360044 cycles,          pmp
   1966776985 cycles,          pva
   2051297210 cycles,          pvf
   1464824135 cycles,          qcp
   1395151376 cycles,          r3d
  13872717447 cycles,     realtext
   1648061451 cycles,     redspark
   1881530375 cycles,          rl2
   1865198787 cycles,           rm
   1848791502 cycles,          roq
   3141932957 cycles,          rpl
   2379252069 cycles,          rsd
  31146518791 cycles,        s337m
   7497815228 cycles,         sami
  24830800138 cycles,          sbg
  15351196732 cycles,          scc
   9758760073 cycles,          sdp
   2159674057 cycles,         sdr2
   1555316250 cycles,          sds
   1533405328 cycles,          sdx
   1681270049 cycles,     film_cpk
   2303851902 cycles,          shn
   1761647489 cycles,         siff
   1510520120 cycles,          smk
   2859907925 cycles,       smjpeg
   1643498999 cycles,        smush
   1545689291 cycles,          sol
   1912740702 cycles,          sox
  17486361594 cycles,        spdif
  20080502425 cycles,          srt
   2659637846 cycles,       psxstr
  17633213722 cycles,          stl
   8032855323 cycles,   subviewer1
   8572013351 cycles,    subviewer
   2043897951 cycles,          sup
   2980746200 cycles,         svag
   1617398584 cycles,          swf
   2842115745 cycles,          tak
   5320163051 cycles,  tedcaptions
   1884107745 cycles,          thp
   4320119922 cycles,       3dostr
   2018755118 cycles,   tiertexseq
   1714617022 cycles,          tmv
  21456317423 cycles,       truehd
   1050826275 cycles,          tta
   2065773077 cycles,          txd
   1577829281 cycles,           ty
   3450802460 cycles,          vag
  19179500628 cycles,          vc1
   1860036853 cycles,      vc1test
   2035593194 cycles,         vivo
   1518758455 cycles,          vmd
   2696860615 cycles,       vobsub
   2762235280 cycles,          voc
   1957794567 cycles,          vpk
  15280000639 cycles,      vplayer
   1763355055 cycles,          vqf
   1879310121 cycles,          w64
   1717961542 cycles,          wav
   2095837026 cycles,     wc3movie
   2960188092 cycles,       webvtt
   1922356839 cycles,        wsaud
   1978715237 cycles,          wsd
   1468438585 cycles,        wsvqa
   2668937770 cycles,          wtv
   3193222838 cycles,          wve
   1744694735 cycles,           wv
   1677278541 cycles,           xa
   1759862474 cycles,         xbin
   2077217647 cycles,          xmv
   2161496331 cycles,         xvag
   2330794326 cycles,         xwma
   1103137131 cycles,          yop
   2154690280 cycles, yuv4mpegpipe
   1842301899 cycles,     bmp_pipe
   2039875920 cycles,     dds_pipe
   1627504710 cycles,     dpx_pipe
   1463019740 cycles,     exr_pipe
   1539585051 cycles,     j2k_pipe
   1187861714 cycles,    jpeg_pipe
   1682815484 cycles,  jpegls_pipe
   1840465166 cycles,     pam_pipe
   1755858395 cycles,     pbm_pipe
   1211589601 cycles,     pcx_pipe
   2002446954 cycles,  pgmyuv_pipe
   1818965412 cycles,     pgm_pipe
   1654095834 cycles,  pictor_pipe
   1404252441 cycles,     png_pipe
   1211120882 cycles,     ppm_pipe
   1205883539 cycles,     psd_pipe
   1764091290 cycles,   qdraw_pipe
   1091809273 cycles,     sgi_pipe
   2994663150 cycles,     svg_pipe
   1348938514 cycles, sunrast_pipe
   1464347337 cycles,    tiff_pipe
   1142572756 cycles,    webp_pipe
   1412715104 cycles,     xpm_pipe
   3550700989 cycles,   libmodplug
109589637233 cycles,   libopenmpt

   2672917981           libopenmpt (per module format)

At first glance, libopenmpt looks huge here in comparison. However one 
should consider that libopenmpt internally has to probe for (currently) 
41 different module file formats, going through 41 separate probing 
functions internally.

Dividing 109589637233 by 41 gives 2672917981, which is in the ballpark 
of all other probing functions in ffmpeg.

>> +#if OPENMPT_API_VERSION_AT_LEAST(0,3,0)
>> +    if (p->buf && p->buf_size > 0) {
>> +        probe_result = openmpt_probe_file_header_without_filesize(
>> +                           OPENMPT_PROBE_FILE_HEADER_FLAGS_DEFAULT,
>> +                           p->buf, p->buf_size,
>> +                           &openmpt_logfunc, NULL, NULL, NULL, NULL, NULL);
>> +        if (probe_result == OPENMPT_PROBE_FILE_HEADER_RESULT_FAILURE) {
>> +            score = score_fail;
> 
> What's wrong with return 0;?

Nothing. If preferred, I can get rid of all score_* constants and use 0 
or AVPROBE_SCORE_* directly.

>> +        } else if (probe_result == OPENMPT_PROBE_FILE_HEADER_RESULT_SUCCESS) {
>> +            score = FFMAX(score, score_data);
> 
> What does OPENMPT_PROBE_FILE_HEADER_RESULT_SUCCESS mean?

It is documented as "OPENMPT_PROBE_FILE_HEADER_RESULT_SUCCESS: The file 
will most likely be supported by libopenmpt." (see 
<https://lib.openmpt.org/doc/group__libopenmpt__c.html#ga92cdc66eb529a8a4a67987b659ed3c5e>).
An ultimately precise answer is never possible as that would require 
actually trying to load the complete file in some cases:
  * Not all module file formats store feature flags in the file header.
  * Some module file formats provide very little file magic numbers, 
and/or file magic numbers at strange offsets (like at 1080 for M.K. .MOD).
  * Some formats store header-like information in the file footer, which 
is not accessible during probing.
  * The extreme case of M15 (original 15 samples Amiga .MOD files) 
provides absolutely no true file header or magic numbers. libopenmpt 
implements heuristics to reliably identify and probe even those, however 
there is only so much it can do.
  * Some container formats (Unreal Music .UMX, which can contain module 
music files) theoretically potentially require seeking to arbitrary 
locations in the file in order to determine the format.

> Why not return MAX?

For all the reasons listed above, even though libopenmpt tries to be as 
pessimistic as possible, false positives fundamentally cannot be avoided 
completely. As the libopenmpt probing logic is code outside of ffmpeg, 
the effects of such a false positive could potentially cause 
mis-detection of other formats supported by ffmpeg, which would not be 
immediately or easily fixable by ffmpeg itself. I used the lowest 
possible score that makes sense in order to reduce the risk of potential 
impact.
The probing result in this case is deduced from looking at the actual 
file data, as opposed to just trusting a mime-type which is external to 
the file and could be inconsistent/wrong, which is why I used a score 
higher than AVPROBE_SCORE_MIME.
I opted for AVPROBE_SCORE_MIME+1, which seemed reasonable to me.
Should I add a comment explaining the reasoning to the code?

>> +        } else if (probe_result == OPENMPT_PROBE_FILE_HEADER_RESULT_WANTMOREDATA) {
>  > I believe this should return 0 but maybe you found that this is bad?

Would 0 be semantically right here? 
OPENMPT_PROBE_FILE_HEADER_RESULT_WANTMOREDATA means that libopenmpt 
requires more data to come to any usable conclusion, which is what I 
thought AVPROBE_SCORE_RETRY would mean.
I do not see any particular problem with returning 0 in this case 
either, given the probing logic in av_probe_input_format() (and it would 
reduce the whole probe_result == 
OPENMPT_PROBE_FILE_HEADER_RESULT_WANTMOREDATA block to a single line). 
However, if client code directly calls .read_probe() on AVInputFormat 
ff_libopenmpt_demuxer, I think returning AVPROBE_SCORE_RETRY (or 
similar) makes more sense.

>> +            if (score > score_fail) {
>> +                /* known file extension */
>> +                score = FFMAX(score, score_ext_retry);
>> +            } else {
>> +                /* unknown file extension */
>> +                if (p->buf_size >= openmpt_probe_file_header_get_recommended_size()) {
>> +                    /* We have already received the recommended amount of data
>> +                     * and still cannot decide. Return a rather low score.
>> +                     */
>> +                    score = FFMAX(score, score_retry);
>> +                } else {
>> +                    /* The file extension is unknown and we have very few data
>> +                     * bytes available. libopenmpt cannot decide anything here,
>> +                     * and returning any score > 0 would result in successfull
>> +                     * probing of random data.
>> +                     */
>> +                    score = score_fail;
> 
> This patch indicates that it may be a good idea to require libopenmpt 0.3,

The amount of #ifdef needed to support 0.2 and 0.3 is rather small, I think.

I understand that the current (and future libopenmpt 0.2) way of solely 
relying on the file extension is far from optimal, but I do not see any 
reason to drop libopenmpt 0.2 support right now; in particular, 
continuing 0.2 support as is would be no regression. Additionally, 
libopenmpt 0.2 can be built with C++03 compilers while libopenmpt 0.3 
requires a C++11 compiler, thus, libopenmpt 0.3 cannot easily be built 
on older platforms.

libopenmpt 0.2 also allows for file probing, however the API and code 
path is very heavy-weight (goes through the normal file loader and 
discards unneeded data), which I fear would be way too heavy 
performance-wise for ffmpeg.

> when was it released, which distributions do not include it?

The first version of libopenmpt 0.3 was released 2017-09-28.

I am not aware of any stable, non-rolling distribution that ships 
libopenmpt 0.3 as of now.

Debian 9 has libopenmpt 0.2.7386~beta20.3-3+deb9u2
Ubuntu 17.10 has libopenmpt 0.2.8760~beta27-1
Ubuntu 16.04 LTS has no libopenmpt at all
even openSUSE Tumbleweed only has libopenmpt 0.2.8461~beta26
Debian Testing and Ubuntu Bionic both have libopenmpt 0.3.4.

I do not think ffmpeg should drop libopenmpt 0.2 support at the moment.


Regards,
Jörn
Carl Eugen Hoyos Jan. 7, 2018, 11:57 p.m. UTC | #3
2018-01-07 15:40 GMT+01:00 Jörn Heusipp <osmanx@problemloesungsmaschine.de>:
>
> On 01/06/2018 04:10 PM, Carl Eugen Hoyos wrote:
>>
>> 2018-01-06 11:07 GMT+01:00 Jörn Heusipp
>> <osmanx@problemloesungsmaschine.de>:
>
>
>>> libopenmpt file header probing is tested regularly against the FATE
>>> suite and other diverse file collections by libopenmpt upstream in
>>> order to avoid false positives.
>>
>>
>> You could also test tools/probetest
>
>
> I was not aware of that tool. Thanks for the suggestion.
>
> It currently lists a failure related to libopenmpt:
> Failure of libopenmpt probing code with score=76 type=0 p=FDC size=1024

I did not look at this closely but I suspect libopenmpt should return a
smaller score in this case.

A solution could be to never return a high value for the FFmpeg
probe function.

> Looking at tools/probetest.c, that would be a file with very few bits set.
> libopenmpt detects the random data in question as M15 .MOD files (original
> Amiga 15 samples .mod files), and there is sadly not much that can be done
> about that. There are perfectly valid real-world M15 .MOD files with only 73
> bits set in the first 600 bytes (untouchables-station.mod,
> <https://modarchive.org/index.php?request=view_by_moduleid&query=104280>).
> The following up to 64*4*4*63 bytes could even be completely 0 (empty
> pattern data) for valid files (even without the file being totally silent).
> The generated random data that tools/probetest complains about here contains
> 46 bits set to 1 in the first 600 bytes. What follows are various other
> examples with up to 96 bits set to 1. Completely loading a file like that
> would very likely reject it (in particular, libopenmpt can deduce the
> expected file size from the sample headers and, with some slack to account
> for corrupted real-world examples, reject files with non-matching size),
> however, that is not possible when only probing the file header.
> The libopenmpt API allows for the file header probing functions to return
> false-positives, however false-negatives are not allowed.
>
> Performance numbers shown by tools/probetest are what I had expected
> (measured on an AMD A4-5000, during normal Xorg session (i.e. not 100%
> idle)):
>
>   1110194971 cycles,           aa
>  24986722468 cycles,          aac
>  26418545168 cycles,          ac3
>   1484717267 cycles,          acm
>   1627888281 cycles,          act
>   2109884646 cycles,          adp
>   2316235992 cycles,          ads
>   1244706028 cycles,          adx
>   1132390431 cycles,          aea
>   1729241354 cycles,         aiff
>   1728288238 cycles,          aix
>   2662531158 cycles,          amr
>  16189546067 cycles,        amrnb
>  10342883200 cycles,        amrwb
>   1487752343 cycles,          anm
>   2268900502 cycles,          apc
>   1140814303 cycles,          ape
>   2181170710 cycles,         apng
>  18698762054 cycles,      aqtitle
>   2656908730 cycles,          asf
>   2402762967 cycles,        asf_o
>  18148196647 cycles,          ass
>   1392503829 cycles,          ast
>   1774264703 cycles,           au
>   1807159562 cycles,          avi
>   1745391230 cycles,          avr
>   1370939762 cycles,          avs
>   1555620708 cycles,  bethsoftvid
>   1459171160 cycles,          bfi
>   2640635742 cycles,         bink
>   2022320986 cycles,          bit
>   1664933324 cycles,        bfstm
>   1588023172 cycles,        brstm
>   1769430536 cycles,          boa
>   2294286860 cycles,          c93
>   1022646071 cycles,          caf
>   9063207678 cycles,    cavsvideo
>   1898790300 cycles,         cdxl
>   1037718383 cycles,         cine
>   3358938768 cycles,       concat
>   2367399953 cycles,        dcstr
>   1795803759 cycles,          dfa
>   1454750468 cycles,        dirac
>   1167905836 cycles,        dnxhd
>   2076678208 cycles,          dsf
>   1226761232 cycles,       dsicin
>   1157816261 cycles,          dss
>  31466350564 cycles,          dts
>   1357475606 cycles,        dtshd
>  15626181281 cycles,           dv
>  12227021709 cycles,       dvbsub
>   1747998309 cycles,       dvbtxt
>   1941371107 cycles,          dxa
>   1988122049 cycles,           ea
>   1395161162 cycles,     ea_cdata
>  21013119067 cycles,         eac3
>   1282697126 cycles,         epaf
>   1658521102 cycles,          ffm
>   2919787300 cycles,   ffmetadata
>   3786264941 cycles,         fits
>   2700385826 cycles,         flac
>   1840776863 cycles,         flic
>   1317695853 cycles,          flv
>   1511756606 cycles,     live_flv
>   1135064427 cycles,          4xm
>   1830871233 cycles,          frm
>   3011403748 cycles,          fsb
>   1462985803 cycles,          gdv
>   1708440935 cycles,         genh
>   3480430744 cycles,          gif
>   2533542048 cycles,          gsm
>   2412598563 cycles,          gxf
>  21637989787 cycles,         h261
>  22268834035 cycles,         h263
>  22135718754 cycles,         h264
>  13939886275 cycles,         hevc
>   1979375582 cycles, hls,applehttp
>   1658646375 cycles,          hnm
>   1507634977 cycles,          ico
>   2534774499 cycles,        idcin
>   1684324336 cycles,          idf
>   1353664382 cycles,          iff
>   2978779893 cycles,         ilbc
>   1892353081 cycles,    alias_pix
>   2456259645 cycles,  brender_pix
>   2077466815 cycles,    ingenient
>  11281657144 cycles,      ipmovie
>   1840789384 cycles,        ircam
>   2455541614 cycles,          iss
>   1114518907 cycles,          iv8
>   1750327098 cycles,          ivf
>   3803895407 cycles,          ivr
>  30510491919 cycles,      jacosub
>   1271391143 cycles,           jv
>   1504674165 cycles,        lmlm4
>  28284647311 cycles,         loas
>   2746771768 cycles,          lrc
>   1630546444 cycles,          lvf
>   2198871369 cycles,          lxf
>  15210250791 cycles,          m4v
>   2074024051 cycles, matroska,webm
>   1756348463 cycles,        mgsts
>  13894318111 cycles,     microdvd
>  15146276963 cycles,        mjpeg
>  13215378411 cycles,   mjpeg_2000
>  21505153187 cycles,          mlp
>   1623684275 cycles,          mlv
>   2009009898 cycles,           mm
>   1401453493 cycles,          mmf
>   3614852044 cycles, mov,mp4,m4a,3gp,3g2,mj2
>  37065167696 cycles,          mp3
>   2003306237 cycles,          mpc
>   1695842377 cycles,         mpc8
>  20922947044 cycles,         mpeg
>  26950626806 cycles,       mpegts
>  12903395151 cycles,    mpegvideo
>   1861191163 cycles,       mpjpeg
>  11292546869 cycles,         mpl2
>  10904909514 cycles,        mpsub
>   2556705558 cycles,          msf
>  14520727615 cycles,     msnwctcp
>   1513345014 cycles,         mtaf
>   1498181103 cycles,          mtv
>   2100567692 cycles,         musx
>   1398481833 cycles,           mv
>   3839928046 cycles,          mxf
>   1084340183 cycles,           nc
>   2260039804 cycles,   nistsphere
>   1557302811 cycles,          nsp
>  14077588650 cycles,          nsv
>  12804865958 cycles,          nut
>   3498085105 cycles,          nuv
>   2785399093 cycles,          ogg
>   2800628120 cycles,          oma
>   2241873172 cycles,          paf
>  11630567717 cycles,          pjs
>   1538360044 cycles,          pmp
>   1966776985 cycles,          pva
>   2051297210 cycles,          pvf
>   1464824135 cycles,          qcp
>   1395151376 cycles,          r3d
>  13872717447 cycles,     realtext
>   1648061451 cycles,     redspark
>   1881530375 cycles,          rl2
>   1865198787 cycles,           rm
>   1848791502 cycles,          roq
>   3141932957 cycles,          rpl
>   2379252069 cycles,          rsd
>  31146518791 cycles,        s337m
>   7497815228 cycles,         sami
>  24830800138 cycles,          sbg
>  15351196732 cycles,          scc
>   9758760073 cycles,          sdp
>   2159674057 cycles,         sdr2
>   1555316250 cycles,          sds
>   1533405328 cycles,          sdx
>   1681270049 cycles,     film_cpk
>   2303851902 cycles,          shn
>   1761647489 cycles,         siff
>   1510520120 cycles,          smk
>   2859907925 cycles,       smjpeg
>   1643498999 cycles,        smush
>   1545689291 cycles,          sol
>   1912740702 cycles,          sox
>  17486361594 cycles,        spdif
>  20080502425 cycles,          srt
>   2659637846 cycles,       psxstr
>  17633213722 cycles,          stl
>   8032855323 cycles,   subviewer1
>   8572013351 cycles,    subviewer
>   2043897951 cycles,          sup
>   2980746200 cycles,         svag
>   1617398584 cycles,          swf
>   2842115745 cycles,          tak
>   5320163051 cycles,  tedcaptions
>   1884107745 cycles,          thp
>   4320119922 cycles,       3dostr
>   2018755118 cycles,   tiertexseq
>   1714617022 cycles,          tmv
>  21456317423 cycles,       truehd
>   1050826275 cycles,          tta
>   2065773077 cycles,          txd
>   1577829281 cycles,           ty
>   3450802460 cycles,          vag
>  19179500628 cycles,          vc1
>   1860036853 cycles,      vc1test
>   2035593194 cycles,         vivo
>   1518758455 cycles,          vmd
>   2696860615 cycles,       vobsub
>   2762235280 cycles,          voc
>   1957794567 cycles,          vpk
>  15280000639 cycles,      vplayer
>   1763355055 cycles,          vqf
>   1879310121 cycles,          w64
>   1717961542 cycles,          wav
>   2095837026 cycles,     wc3movie
>   2960188092 cycles,       webvtt
>   1922356839 cycles,        wsaud
>   1978715237 cycles,          wsd
>   1468438585 cycles,        wsvqa
>   2668937770 cycles,          wtv
>   3193222838 cycles,          wve
>   1744694735 cycles,           wv
>   1677278541 cycles,           xa
>   1759862474 cycles,         xbin
>   2077217647 cycles,          xmv
>   2161496331 cycles,         xvag
>   2330794326 cycles,         xwma
>   1103137131 cycles,          yop
>   2154690280 cycles, yuv4mpegpipe
>   1842301899 cycles,     bmp_pipe
>   2039875920 cycles,     dds_pipe
>   1627504710 cycles,     dpx_pipe
>   1463019740 cycles,     exr_pipe
>   1539585051 cycles,     j2k_pipe
>   1187861714 cycles,    jpeg_pipe
>   1682815484 cycles,  jpegls_pipe
>   1840465166 cycles,     pam_pipe
>   1755858395 cycles,     pbm_pipe
>   1211589601 cycles,     pcx_pipe
>   2002446954 cycles,  pgmyuv_pipe
>   1818965412 cycles,     pgm_pipe
>   1654095834 cycles,  pictor_pipe
>   1404252441 cycles,     png_pipe
>   1211120882 cycles,     ppm_pipe
>   1205883539 cycles,     psd_pipe
>   1764091290 cycles,   qdraw_pipe
>   1091809273 cycles,     sgi_pipe
>   2994663150 cycles,     svg_pipe
>   1348938514 cycles, sunrast_pipe
>   1464347337 cycles,    tiff_pipe
>   1142572756 cycles,    webp_pipe
>   1412715104 cycles,     xpm_pipe
>   3550700989 cycles,   libmodplug

> 109589637233 cycles,   libopenmpt

This sadly may not be acceptable, others may want to comment.

>   2672917981           libopenmpt (per module format)
>
> At first glance, libopenmpt looks huge here in comparison. However one
> should consider that libopenmpt internally has to probe for (currently) 41
> different module file formats, going through 41 separate probing functions
> internally.
>
> Dividing 109589637233 by 41 gives 2672917981, which is in the ballpark of
> all other probing functions in ffmpeg.
>
>>> +#if OPENMPT_API_VERSION_AT_LEAST(0,3,0)
>>> +    if (p->buf && p->buf_size > 0) {
>>> +        probe_result = openmpt_probe_file_header_without_filesize(
>>> +                           OPENMPT_PROBE_FILE_HEADER_FLAGS_DEFAULT,
>>> +                           p->buf, p->buf_size,
>>> +                           &openmpt_logfunc, NULL, NULL, NULL, NULL,
>>> NULL);
>>> +        if (probe_result == OPENMPT_PROBE_FILE_HEADER_RESULT_FAILURE) {
>>> +            score = score_fail;
>>
>>
>> What's wrong with return 0;?
>
>
> Nothing. If preferred, I can get rid of all score_* constants and use 0 or
> AVPROBE_SCORE_* directly.

That sounds like a very good idea to me but please wait for others to comment.

>>> +        } else if (probe_result ==
>>> OPENMPT_PROBE_FILE_HEADER_RESULT_SUCCESS) {
>>> +            score = FFMAX(score, score_data);
>>
>>
>> What does OPENMPT_PROBE_FILE_HEADER_RESULT_SUCCESS mean?
>
>
> It is documented as "OPENMPT_PROBE_FILE_HEADER_RESULT_SUCCESS: The file will
> most likely be supported by libopenmpt." (see
> <https://lib.openmpt.org/doc/group__libopenmpt__c.html#ga92cdc66eb529a8a4a67987b659ed3c5e>).
> An ultimately precise answer is never possible as that would require
> actually trying to load the complete file in some cases:
>  * Not all module file formats store feature flags in the file header.
>  * Some module file formats provide very little file magic numbers, and/or
> file magic numbers at strange offsets (like at 1080 for M.K. .MOD).
>  * Some formats store header-like information in the file footer, which is
> not accessible during probing.
>  * The extreme case of M15 (original 15 samples Amiga .MOD files) provides
> absolutely no true file header or magic numbers. libopenmpt implements
> heuristics to reliably identify and probe even those, however there is only
> so much it can do.
>  * Some container formats (Unreal Music .UMX, which can contain module music
> files) theoretically potentially require seeking to arbitrary locations in
> the file in order to determine the format.
>
>> Why not return MAX?
>
> For all the reasons listed above, even though libopenmpt tries to be as
> pessimistic as possible, false positives fundamentally cannot be avoided
> completely. As the libopenmpt probing logic is code outside of ffmpeg, the
> effects of such a false positive could potentially cause mis-detection of
> other formats supported by ffmpeg, which would not be immediately or easily
> fixable by ffmpeg itself. I used the lowest possible score that makes sense
> in order to reduce the risk of potential impact.
> The probing result in this case is deduced from looking at the actual file
> data, as opposed to just trusting a mime-type which is external to the file
> and could be inconsistent/wrong, which is why I used a score higher than
> AVPROBE_SCORE_MIME.
> I opted for AVPROBE_SCORE_MIME+1, which seemed reasonable to me.
> Should I add a comment explaining the reasoning to the code?
>
>>> +        } else if (probe_result ==
>>> OPENMPT_PROBE_FILE_HEADER_RESULT_WANTMOREDATA) {
>>
>>  > I believe this should return 0 but maybe you found that this is bad?
>
>
> Would 0 be semantically right here?
> OPENMPT_PROBE_FILE_HEADER_RESULT_WANTMOREDATA means that libopenmpt requires
> more data to come to any usable conclusion, which is what I thought
> AVPROBE_SCORE_RETRY would mean.
> I do not see any particular problem with returning 0 in this case either,
> given the probing logic in av_probe_input_format() (and it would reduce the
> whole probe_result == OPENMPT_PROBE_FILE_HEADER_RESULT_WANTMOREDATA block to
> a single line). However, if client code directly calls .read_probe() on

> AVInputFormat ff_libopenmpt_demuxer, I think returning AVPROBE_SCORE_RETRY
> (or similar) makes more sense.

I agree.

>>> +            if (score > score_fail) {
>>> +                /* known file extension */
>>> +                score = FFMAX(score, score_ext_retry);
>>> +            } else {
>>> +                /* unknown file extension */
>>> +                if (p->buf_size >=
>>> openmpt_probe_file_header_get_recommended_size()) {
>>> +                    /* We have already received the recommended amount
>>> of data
>>> +                     * and still cannot decide. Return a rather low
>>> score.
>>> +                     */
>>> +                    score = FFMAX(score, score_retry);
>>> +                } else {
>>> +                    /* The file extension is unknown and we have very
>>> few data
>>> +                     * bytes available. libopenmpt cannot decide
>>> anything here,
>>> +                     * and returning any score > 0 would result in
>>> successfull
>>> +                     * probing of random data.
>>> +                     */
>>> +                    score = score_fail;
>>
>> This patch indicates that it may be a good idea to require libopenmpt 0.3,
>
> The amount of #ifdef needed to support 0.2 and 0.3 is rather small, I think.
>
> I understand that the current (and future libopenmpt 0.2) way of solely
> relying on the file extension is far from optimal, but I do not see any
> reason to drop libopenmpt 0.2 support right now; in particular, continuing
> 0.2 support as is would be no regression. Additionally, libopenmpt 0.2 can
> be built with C++03 compilers while libopenmpt 0.3 requires a C++11
> compiler, thus, libopenmpt 0.3 cannot easily be built on older platforms.
>
> libopenmpt 0.2 also allows for file probing, however the API and code path
> is very heavy-weight (goes through the normal file loader and discards
> unneeded data), which I fear would be way too heavy performance-wise for
> ffmpeg.
>
>> when was it released, which distributions do not include it?
>
> The first version of libopenmpt 0.3 was released 2017-09-28.
>
> I am not aware of any stable, non-rolling distribution that ships libopenmpt
> 0.3 as of now.

Then supporting only 0.3 is currently not a good option.

Carl Eugen
Jörn Heusipp Jan. 8, 2018, 7:25 p.m. UTC | #4
On 01/08/2018 12:57 AM, Carl Eugen Hoyos wrote:
> 2018-01-07 15:40 GMT+01:00 Jörn Heusipp <osmanx@problemloesungsmaschine.de>:
>> On 01/06/2018 04:10 PM, Carl Eugen Hoyos wrote:
>>> 2018-01-06 11:07 GMT+01:00 Jörn Heusipp
>>> <osmanx@problemloesungsmaschine.de>:


>>>> libopenmpt file header probing is tested regularly against the FATE
>>>> suite and other diverse file collections by libopenmpt upstream in
>>>> order to avoid false positives.
>>>
>>> You could also test tools/probetest
>>
>> I was not aware of that tool. Thanks for the suggestion.
>>
>> It currently lists a failure related to libopenmpt:
>> Failure of libopenmpt probing code with score=76 type=0 p=FDC size=1024
> 
> I did not look at this closely but I suspect libopenmpt should return a
> smaller score in this case.

We (libopenmpt developers) are currently considering making the 
heuristic for M15 files even stricter. The changes will land in 
libopenmpt 0.3.5.
libopenmpt 0.3.5 versus 0.3.4 or earlier can be distinguished at runtime 
via openmpt_get_library_version(). I would be fine with only doing 
probing via libopenmpt in FFmpeg starting with libopenmt 0.3.5 and 
relying on file extensions for earlier versions.

However, the data that tools/probetest.c generates here fundamentally 
does have a somewhat high probability of looking like a completely legit 
M15 file. False positives are not really avoidable completely no matter 
what libopenmpt does here. The failing data is synthetic, and I am not 
seeing any M15 false positives at all on real-world file collections 
(media and non-media files (tested on 1.2 million data and system files)).

> A solution could be to never return a high value for the FFmpeg
> probe function.

Maybe, but given what tools/probetest generates here, I somewhat doubt 
these examples have any real-world implication at all.
Anyway, in case a lower score is deemed to be useful, do you have any 
suggestions which score I should pick? AVPROBE_SCORE_EXTENSION or less 
would probably not be very useful, so what comes to mind for me is 
AVPROBE_SCORE_EXTENSION+1 (51).


>> Looking at tools/probetest.c, that would be a file with very few bits set.
>> libopenmpt detects the random data in question as M15 .MOD files (original
>> Amiga 15 samples .mod files), and there is sadly not much that can be done
>> about that. There are perfectly valid real-world M15 .MOD files with only 73
>> bits set in the first 600 bytes (untouchables-station.mod,
>> <https://modarchive.org/index.php?request=view_by_moduleid&query=104280>).
>> The following up to 64*4*4*63 bytes could even be completely 0 (empty
>> pattern data) for valid files (even without the file being totally silent).
>> The generated random data that tools/probetest complains about here contains
>> 46 bits set to 1 in the first 600 bytes. What follows are various other
>> examples with up to 96 bits set to 1. Completely loading a file like that
>> would very likely reject it (in particular, libopenmpt can deduce the
>> expected file size from the sample headers and, with some slack to account
>> for corrupted real-world examples, reject files with non-matching size),
>> however, that is not possible when only probing the file header.
>> The libopenmpt API allows for the file header probing functions to return
>> false-positives, however false-negatives are not allowed.
>>
>> Performance numbers shown by tools/probetest are what I had expected
>> (measured on an AMD A4-5000, during normal Xorg session (i.e. not 100%
>> idle)):
>>

[...]

>> 109589637233 cycles,   libopenmpt
> 
> This sadly may not be acceptable, others may want to comment.
> 
>>    2672917981           libopenmpt (per module format)
>>
>> At first glance, libopenmpt looks huge here in comparison. However one
>> should consider that libopenmpt internally has to probe for (currently) 41
>> different module file formats, going through 41 separate probing functions
>> internally.
>>
>> Dividing 109589637233 by 41 gives 2672917981, which is in the ballpark of
>> all other probing functions in ffmpeg.

What are your expectations for probing speed of 41 completely distinct 
file formats?
Even only h261,h263,h264,hevc,aac,ac3 (raw streams) combined take more 
time than libopenmpt takes for its 41 formats together.
All other FFmpeg probing functions combined (234 formats) take 
1201426924609 cycles. libopenmpt adds 109589637233 cycles for 41 
different file formats to that, which is about 10%. I do not think 
probing performance is in general that performance critical that would 
make 10% a problem, especially considering that for real-world use cases 
when probing a whole media library, the data also has to be read from 
storage in the first place.


Regards,
Jörn
Carl Eugen Hoyos Jan. 9, 2018, 1:30 a.m. UTC | #5
2018-01-08 20:25 GMT+01:00 Jörn Heusipp <osmanx@problemloesungsmaschine.de>:
>
> On 01/08/2018 12:57 AM, Carl Eugen Hoyos wrote:
>>
>> 2018-01-07 15:40 GMT+01:00 Jörn Heusipp
>> <osmanx@problemloesungsmaschine.de>:
>>>
>>> On 01/06/2018 04:10 PM, Carl Eugen Hoyos wrote:
>>>>
>>>> 2018-01-06 11:07 GMT+01:00 Jörn Heusipp
>>>> <osmanx@problemloesungsmaschine.de>:
>
>
>
>>>>> libopenmpt file header probing is tested regularly against the FATE
>>>>> suite and other diverse file collections by libopenmpt upstream in
>>>>> order to avoid false positives.
>>>>
>>>>
>>>> You could also test tools/probetest
>>>
>>>
>>> I was not aware of that tool. Thanks for the suggestion.
>>>
>>> It currently lists a failure related to libopenmpt:
>>> Failure of libopenmpt probing code with score=76 type=0 p=FDC size=1024
>>
>>
>> I did not look at this closely but I suspect libopenmpt should return a
>> smaller score in this case.
>
>
> We (libopenmpt developers) are currently considering making the heuristic
> for M15 files even stricter. The changes will land in libopenmpt 0.3.5.
> libopenmpt 0.3.5 versus 0.3.4 or earlier can be distinguished at runtime via
> openmpt_get_library_version(). I would be fine with only doing probing via
> libopenmpt in FFmpeg starting with libopenmt 0.3.5 and relying on file
> extensions for earlier versions.
>
> However, the data that tools/probetest.c generates here fundamentally does
> have a somewhat high probability of looking like a completely legit M15
> file. False positives are not really avoidable completely no matter what
> libopenmpt does here. The failing data is synthetic, and I am not seeing any
> M15 false positives at all on real-world file collections (media and
> non-media files (tested on 1.2 million data and system files)).
>
>> A solution could be to never return a high value for the FFmpeg
>> probe function.
>
>
> Maybe, but given what tools/probetest generates here, I somewhat doubt these
> examples have any real-world implication at all.
> Anyway, in case a lower score is deemed to be useful, do you have any
> suggestions which score I should pick? AVPROBE_SCORE_EXTENSION or less would
> probably not be very useful, so what comes to mind for me is
> AVPROBE_SCORE_EXTENSION+1 (51).

No real suggestion here, above was just an idea.

>>> Looking at tools/probetest.c, that would be a file with very few bits
>>> set.
>>> libopenmpt detects the random data in question as M15 .MOD files
>>> (original
>>> Amiga 15 samples .mod files), and there is sadly not much that can be
>>> done
>>> about that. There are perfectly valid real-world M15 .MOD files with only
>>> 73
>>> bits set in the first 600 bytes (untouchables-station.mod,
>>>
>>> <https://modarchive.org/index.php?request=view_by_moduleid&query=104280>).
>>> The following up to 64*4*4*63 bytes could even be completely 0 (empty
>>> pattern data) for valid files (even without the file being totally
>>> silent).
>>> The generated random data that tools/probetest complains about here
>>> contains
>>> 46 bits set to 1 in the first 600 bytes. What follows are various other
>>> examples with up to 96 bits set to 1. Completely loading a file like that
>>> would very likely reject it (in particular, libopenmpt can deduce the
>>> expected file size from the sample headers and, with some slack to
>>> account
>>> for corrupted real-world examples, reject files with non-matching size),
>>> however, that is not possible when only probing the file header.
>>> The libopenmpt API allows for the file header probing functions to return
>>> false-positives, however false-negatives are not allowed.
>>>
>>> Performance numbers shown by tools/probetest are what I had expected
>>> (measured on an AMD A4-5000, during normal Xorg session (i.e. not 100%
>>> idle)):
>>>
>
> [...]
>
>>> 109589637233 cycles,   libopenmpt
>>
>>
>> This sadly may not be acceptable, others may want to comment.
>>
>>>    2672917981           libopenmpt (per module format)
>>>
>>> At first glance, libopenmpt looks huge here in comparison. However one
>>> should consider that libopenmpt internally has to probe for (currently)
>>> 41
>>> different module file formats, going through 41 separate probing
>>> functions
>>> internally.
>>>
>>> Dividing 109589637233 by 41 gives 2672917981, which is in the ballpark of
>>> all other probing functions in ffmpeg.
>
>
> What are your expectations for probing speed of 41 completely distinct file
> formats?

My only expectation is that other FFmpeg developers comment, a
(imo strong) argument in your favour is that this will only apply if
an optional external library is activated at compile-time.

> Even only h261,h263,h264,hevc,aac,ac3 (raw streams) combined take more time
> than libopenmpt takes for its 41 formats together.

It is otoh imo not a useful argument to compare four of the most
common formats (we have to auto-detect them for mpeg-ts
recordings) to libopenmpt;-)

> All other FFmpeg probing functions combined (234 formats) take 1201426924609
> cycles. libopenmpt adds 109589637233 cycles for 41 different file formats to
> that, which is about 10%. I do not think probing performance is in general
> that performance critical that would make 10% a problem, especially
> considering that for real-world use cases when probing a whole media
> library, the data also has to be read from storage in the first place.

It is 10% for probetest, not sure if this compares well to real-world
files.

But if nobody else comments, I support your patch!

Carl Eugen
diff mbox

Patch

diff --git a/libavformat/libopenmpt.c b/libavformat/libopenmpt.c
index 5efbdc4..a663b2e 100644
--- a/libavformat/libopenmpt.c
+++ b/libavformat/libopenmpt.c
@@ -218,6 +218,64 @@  static int read_seek_openmpt(AVFormatContext *s, int stream_idx, int64_t ts, int
     return 0;
 }
 
+static int read_probe_openmpt(AVProbeData * p)
+{
+    const int score_data      = AVPROBE_SCORE_MIME + 1;   /* 76 */
+    const int score_ext       = AVPROBE_SCORE_EXTENSION;  /* 50 */
+    const int score_ext_retry = AVPROBE_SCORE_RETRY;      /* 25 */
+    const int score_retry     = AVPROBE_SCORE_RETRY / 2;  /* 12 */
+    const int score_fail      = 0;                        /*  0 */
+
+    const char *ext;
+    int probe_result;
+    int score = score_fail;
+
+    if (p->filename) {
+        ext = strrchr(p->filename, '.');
+        if (ext && strlen(ext + 1) > 0) {
+            ext++;  /* skip '.' */
+            if (openmpt_is_extension_supported(ext) == 1)
+                score = FFMAX(score, score_ext);
+        }
+    }
+
+#if OPENMPT_API_VERSION_AT_LEAST(0,3,0)
+    if (p->buf && p->buf_size > 0) {
+        probe_result = openmpt_probe_file_header_without_filesize(
+                           OPENMPT_PROBE_FILE_HEADER_FLAGS_DEFAULT,
+                           p->buf, p->buf_size,
+                           &openmpt_logfunc, NULL, NULL, NULL, NULL, NULL);
+        if (probe_result == OPENMPT_PROBE_FILE_HEADER_RESULT_FAILURE) {
+            score = score_fail;
+        } else if (probe_result == OPENMPT_PROBE_FILE_HEADER_RESULT_SUCCESS) {
+            score = FFMAX(score, score_data);
+        } else if (probe_result == OPENMPT_PROBE_FILE_HEADER_RESULT_WANTMOREDATA) {
+            if (score > score_fail) {
+                /* known file extension */
+                score = FFMAX(score, score_ext_retry);
+            } else {
+                /* unknown file extension */
+                if (p->buf_size >= openmpt_probe_file_header_get_recommended_size()) {
+                    /* We have already received the recommended amount of data
+                     * and still cannot decide. Return a rather low score.
+                     */
+                    score = FFMAX(score, score_retry);
+                } else {
+                    /* The file extension is unknown and we have very few data
+                     * bytes available. libopenmpt cannot decide anything here,
+                     * and returning any score > 0 would result in successfull
+                     * probing of random data.
+                     */
+                    score = score_fail;
+                }
+            }
+        }
+    }
+#endif
+
+    return score;
+}
+
 static const AVClass class_openmpt = {
     .class_name = "libopenmpt",
     .item_name  = av_default_item_name,
@@ -229,6 +287,7 @@  AVInputFormat ff_libopenmpt_demuxer = {
     .name           = "libopenmpt",
     .long_name      = NULL_IF_CONFIG_SMALL("Tracker formats (libopenmpt)"),
     .priv_data_size = sizeof(OpenMPTContext),
+    .read_probe     = read_probe_openmpt,
     .read_header    = read_header_openmpt,
     .read_packet    = read_packet_openmpt,
     .read_close     = read_close_openmpt,