diff mbox series

[FFmpeg-devel,v3,2/5] avfilter/af_volumedetect.c: Add 32bit float audio support

Message ID 20240702013354.14942-3-yigithanyigitdevel@gmail.com
State New
Headers show
Series avfilter/af_volumedetect.c: Add 32bit float audio | expand

Checks

Context Check Description
yinshiyou/make_fate_loongarch64 fail Make fate failed
yinshiyou/make_loongarch64 warning New warnings during build
andriy/make_fate_x86 success Make fate finished
andriy/make_x86 warning New warnings during build

Commit Message

Yigithan Yigit July 2, 2024, 1:33 a.m. UTC
---
 libavfilter/af_volumedetect.c | 139 ++++++++++++++++++++++++++--------
 1 file changed, 107 insertions(+), 32 deletions(-)

Comments

Rémi Denis-Courmont July 2, 2024, 5:46 a.m. UTC | #1
Le 2 juillet 2024 04:33:51 GMT+03:00, Yigithan Yigit <yigithanyigitdevel@gmail.com> a écrit :
>---
> libavfilter/af_volumedetect.c | 139 ++++++++++++++++++++++++++--------
> 1 file changed, 107 insertions(+), 32 deletions(-)

Did you try to compile this patch?

>
>diff --git a/libavfilter/af_volumedetect.c b/libavfilter/af_volumedetect.c
>index 327801a7f9..edd2d56f7a 100644
>--- a/libavfilter/af_volumedetect.c
>+++ b/libavfilter/af_volumedetect.c
>@@ -1,5 +1,6 @@
> /*
>  * Copyright (c) 2012 Nicolas George
>+ * Copyright (c) 2024 Yigithan Yigit - 32 Bit Float Audio Support
>  *
>  * This file is part of FFmpeg.
>  *
>@@ -20,48 +21,62 @@
> 
> #include "libavutil/channel_layout.h"
> #include "libavutil/avassert.h"
>+#include "libavutil/mem.h"
> #include "audio.h"
> #include "avfilter.h"
> #include "internal.h"
> 
>+#define MAX_DB_FLT 1024
> #define MAX_DB 91
>+#define HISTOGRAM_SIZE 0x10000
>+#define HISTOGRAM_SIZE_FLT (MAX_DB_FLT*2)
>+
>+typedef struct VolDetectContext VolDetectContext;
> 
> typedef struct VolDetectContext {
>-    /**
>-     * Number of samples at each PCM value.
>-     * histogram[0x8000 + i] is the number of samples at value i.
>-     * The extra element is there for symmetry.
>-     */
>-    uint64_t histogram[0x10001];
>+    uint64_t* histogram; ///< for integer number of samples at each PCM value, for float number of samples at each dB
>+    uint64_t nb_samples; ///< number of samples
>+    double sum2;         ///< sum of the squares of the samples
>+    double max;          ///< maximum sample value
>+    int is_float;        ///< true if the input is in floating point
>+    void (*process_samples)(VolDetectContext *vd, AVFrame *samples);
> } VolDetectContext;
> 
>-static inline double logdb(uint64_t v)
>+static inline double logdb(double v, enum AVSampleFormat sample_fmt)
> {
>-    double d = v / (double)(0x8000 * 0x8000);
>-    if (!v)
>-        return MAX_DB;
>-    return -log10(d) * 10;
>+    if (sample_fmt == AV_SAMPLE_FMT_FLT) {
>+        if (!v)
>+            return MAX_DB_FLT;
>+        return -log10(v) * 10;
>+    } else {
>+        double d = v / (double)(0x8000 * 0x8000);
>+        if (!v)
>+            return MAX_DB;
>+        return -log10(d) * 10;
>+    }
>+}
>+
>+static void update_float_stats(VolDetectContext *vd, float *audio_data)
>+{
>+    double sample;
>+    int idx;
>+    if(!isfinite(*audio_data) || isnan(*audio_data))
>+        return;
>+    sample = fabsf(*audio_data);
>+    if (sample > vd->max)
>+        vd->max = sample;
>+    vd->sum2 += sample * sample;
>+    idx = (int)floorf(logdb(sample * sample, AV_SAMPLE_FMT_FLT)) + MAX_DB_FLT;
>+    vd->histogram[idx]++;
>+    vd->nb_samples++;
> }
> 
> static int filter_frame(AVFilterLink *inlink, AVFrame *samples)
> {
>     AVFilterContext *ctx = inlink->dst;
>     VolDetectContext *vd = ctx->priv;
>-    int nb_samples  = samples->nb_samples;
>-    int nb_channels = samples->ch_layout.nb_channels;
>-    int nb_planes   = nb_channels;
>-    int plane, i;
>-    int16_t *pcm;
>-
>-    if (!av_sample_fmt_is_planar(samples->format)) {
>-        nb_samples *= nb_channels;
>-        nb_planes = 1;
>-    }
>-    for (plane = 0; plane < nb_planes; plane++) {
>-        pcm = (int16_t *)samples->extended_data[plane];
>-        for (i = 0; i < nb_samples; i++)
>-            vd->histogram[pcm[i] + 0x8000]++;
>-    }
>+
>+    vd->process_samples(vd, samples);
> 
>     return ff_filter_frame(inlink->dst->outputs[0], samples);
> }
>@@ -73,6 +88,20 @@ static void print_stats(AVFilterContext *ctx)
>     uint64_t nb_samples = 0, power = 0, nb_samples_shift = 0, sum = 0;
>     uint64_t histdb[MAX_DB + 1] = { 0 };
> 
>+    if (!vd->nb_samples)
>+        return;
>+    if (vd->is_float) {
>+        av_log(ctx, AV_LOG_INFO, "n_samples: %" PRId64 "\n", vd->nb_samples);
>+        av_log(ctx, AV_LOG_INFO, "mean_volume: %.1f dB\n", -logdb(vd->sum2 / vd->nb_samples, AV_SAMPLE_FMT_FLT));
>+        av_log(ctx, AV_LOG_INFO, "max_volume: %.1f dB\n", -2.0*logdb(vd->max, AV_SAMPLE_FMT_FLT));
>+        for (i = 0; i < HISTOGRAM_SIZE_FLT && !vd->histogram[i]; i++);
>+        for (; i >= 0 && sum < vd->nb_samples / 1000; i++) {
>+            if (!vd->histogram[i])
>+                continue;
>+            av_log(ctx, AV_LOG_INFO, "histogram_%ddb: %" PRId64 "\n", MAX_DB_FLT - i, vd->histogram[i]);
>+            sum += vd->histogram[i];
>+        }
>+    } else {
>     for (i = 0; i < 0x10000; i++)
>         nb_samples += vd->histogram[i];
>     av_log(ctx, AV_LOG_INFO, "n_samples: %"PRId64"\n", nb_samples);
>@@ -92,26 +121,61 @@ static void print_stats(AVFilterContext *ctx)
>         return;
>     power = (power + nb_samples_shift / 2) / nb_samples_shift;
>     av_assert0(power <= 0x8000 * 0x8000);
>-    av_log(ctx, AV_LOG_INFO, "mean_volume: %.1f dB\n", -logdb(power));
>+    av_log(ctx, AV_LOG_INFO, "mean_volume: %.1f dB\n", -logdb((double)power, AV_SAMPLE_FMT_S16));
> 
>     max_volume = 0x8000;
>     while (max_volume > 0 && !vd->histogram[0x8000 + max_volume] &&
>                              !vd->histogram[0x8000 - max_volume])
>         max_volume--;
>-    av_log(ctx, AV_LOG_INFO, "max_volume: %.1f dB\n", -logdb(max_volume * max_volume));
>+    av_log(ctx, AV_LOG_INFO, "max_volume: %.1f dB\n", -logdb((double)(max_volume * max_volume), AV_SAMPLE_FMT_S16));
> 
>     for (i = 0; i < 0x10000; i++)
>-        histdb[(int)logdb((i - 0x8000) * (i - 0x8000))] += vd->histogram[i];
>+        histdb[(int)logdb((double)(i - 0x8000) * (i - 0x8000), AV_SAMPLE_FMT_S16)] += vd->histogram[i];
>     for (i = 0; i <= MAX_DB && !histdb[i]; i++);
>     for (; i <= MAX_DB && sum < nb_samples / 1000; i++) {
>-        av_log(ctx, AV_LOG_INFO, "histogram_%ddb: %"PRId64"\n", i, histdb[i]);
>+        av_log(ctx, AV_LOG_INFO, "histogram_%ddb: %"PRId64"\n", -i, histdb[i]);
>         sum += histdb[i];
>     }
>+    }
>+}
>+
>+static int config_output(AVFilterLink *outlink)
>+{
>+    AVFilterContext *ctx = outlink->src;
>+    VolDetectContext *vd = ctx->priv;
>+    size_t histogram_size;
>+
>+    vd->is_float = outlink->format == AV_SAMPLE_FMT_FLT ||
>+                   outlink->format == AV_SAMPLE_FMT_FLTP;
>+
>+    if (!vd->is_float) {
>+        /*
>+        * Number of samples at each PCM value.
>+        * Only used for integer formats.
>+        * For 16 bit signed PCM there are 65536.
>+        * histogram[0x8000 + i] is the number of samples at value i.
>+        * The extra element is there for symmetry.
>+        */
>+        histogram_size = HISTOGRAM_SIZE + 1;
>+    } else {
>+        /*
>+        * The histogram is used to store the number of samples at each dB
>+        * instead of the number of samples at each PCM value.
>+        */
>+        histogram_size = HISTOGRAM_SIZE_FLT + 1;
>+    }
>+    vd->histogram = av_calloc(histogram_size, sizeof(uint64_t));
>+    if (!vd->histogram)
>+        return AVERROR(ENOMEM);
>+    return 0;
> }
> 
> static av_cold void uninit(AVFilterContext *ctx)
> {
>+    VolDetectContext *vd = ctx->priv;
>     print_stats(ctx);
>+    if (vd->histogram)
>+        av_freep(&vd->histogram);
> }
> 
> static const AVFilterPad volumedetect_inputs[] = {
>@@ -122,6 +186,14 @@ static const AVFilterPad volumedetect_inputs[] = {
>     },
> };
> 
>+static const AVFilterPad volumedetect_outputs[] = {
>+    {
>+        .name         = "default",
>+        .type         = AVMEDIA_TYPE_AUDIO,
>+        .config_props = config_output,
>+    },
>+};
>+
> const AVFilter ff_af_volumedetect = {
>     .name          = "volumedetect",
>     .description   = NULL_IF_CONFIG_SMALL("Detect audio volume."),
>@@ -129,6 +201,9 @@ const AVFilter ff_af_volumedetect = {
>     .uninit        = uninit,
>     .flags         = AVFILTER_FLAG_METADATA_ONLY,
>     FILTER_INPUTS(volumedetect_inputs),
>-    FILTER_OUTPUTS(ff_audio_default_filterpad),
>-    FILTER_SAMPLEFMTS(AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P),
>+    FILTER_OUTPUTS(volumedetect_outputs),
>+    FILTER_SAMPLEFMTS(AV_SAMPLE_FMT_S16,
>+                      AV_SAMPLE_FMT_S16P,
>+                      AV_SAMPLE_FMT_FLT,
>+                      AV_SAMPLE_FMT_FLTP),
> };
Rémi Denis-Courmont July 2, 2024, 5:49 a.m. UTC | #2
Le 2 juillet 2024 08:46:53 GMT+03:00, "Rémi Denis-Courmont" <remi@remlab.net> a écrit :
>
>
>Le 2 juillet 2024 04:33:51 GMT+03:00, Yigithan Yigit <yigithanyigitdevel@gmail.com> a écrit :
>>---
>> libavfilter/af_volumedetect.c | 139 ++++++++++++++++++++++++++--------
>> 1 file changed, 107 insertions(+), 32 deletions(-)
>
>Did you try to compile this patch?

Never mind, I misread.
Rémi Denis-Courmont July 2, 2024, 5:51 a.m. UTC | #3
Le 2 juillet 2024 04:33:51 GMT+03:00, Yigithan Yigit <yigithanyigitdevel@gmail.com> a écrit :
>---
> libavfilter/af_volumedetect.c | 139 ++++++++++++++++++++++++++--------
> 1 file changed, 107 insertions(+), 32 deletions(-)
>
>diff --git a/libavfilter/af_volumedetect.c b/libavfilter/af_volumedetect.c
>index 327801a7f9..edd2d56f7a 100644
>--- a/libavfilter/af_volumedetect.c
>+++ b/libavfilter/af_volumedetect.c
>@@ -1,5 +1,6 @@
> /*
>  * Copyright (c) 2012 Nicolas George
>+ * Copyright (c) 2024 Yigithan Yigit - 32 Bit Float Audio Support
>  *
>  * This file is part of FFmpeg.
>  *
>@@ -20,48 +21,62 @@
> 
> #include "libavutil/channel_layout.h"
> #include "libavutil/avassert.h"
>+#include "libavutil/mem.h"
> #include "audio.h"
> #include "avfilter.h"
> #include "internal.h"
> 
>+#define MAX_DB_FLT 1024
> #define MAX_DB 91
>+#define HISTOGRAM_SIZE 0x10000
>+#define HISTOGRAM_SIZE_FLT (MAX_DB_FLT*2)
>+
>+typedef struct VolDetectContext VolDetectContext;
> 
> typedef struct VolDetectContext {
>-    /**
>-     * Number of samples at each PCM value.
>-     * histogram[0x8000 + i] is the number of samples at value i.
>-     * The extra element is there for symmetry.
>-     */
>-    uint64_t histogram[0x10001];
>+    uint64_t* histogram; ///< for integer number of samples at each PCM value, for float number of samples at each dB
>+    uint64_t nb_samples; ///< number of samples
>+    double sum2;         ///< sum of the squares of the samples
>+    double max;          ///< maximum sample value
>+    int is_float;        ///< true if the input is in floating point
>+    void (*process_samples)(VolDetectContext *vd, AVFrame *samples);
> } VolDetectContext;
> 
>-static inline double logdb(uint64_t v)
>+static inline double logdb(double v, enum AVSampleFormat sample_fmt)
> {
>-    double d = v / (double)(0x8000 * 0x8000);
>-    if (!v)
>-        return MAX_DB;
>-    return -log10(d) * 10;
>+    if (sample_fmt == AV_SAMPLE_FMT_FLT) {

There's no point in doing this. You've already up-converted to double precision and do all the calculations in double precision. Maybe that's fine or maybe not, but either way, this doesn't look sensible.

>+        if (!v)
>+            return MAX_DB_FLT;
>+        return -log10(v) * 10;
>+    } else {
>+        double d = v / (double)(0x8000 * 0x8000);
>+        if (!v)
>+            return MAX_DB;
>+        return -log10(d) * 10;
>+    }
>+}
>+
>+static void update_float_stats(VolDetectContext *vd, float *audio_data)
>+{
>+    double sample;
>+    int idx;
>+    if(!isfinite(*audio_data) || isnan(*audio_data))
>+        return;
>+    sample = fabsf(*audio_data);
>+    if (sample > vd->max)
>+        vd->max = sample;
>+    vd->sum2 += sample * sample;
>+    idx = (int)floorf(logdb(sample * sample, AV_SAMPLE_FMT_FLT)) + MAX_DB_FLT;
>+    vd->histogram[idx]++;
>+    vd->nb_samples++;
> }
> 
> static int filter_frame(AVFilterLink *inlink, AVFrame *samples)
> {
>     AVFilterContext *ctx = inlink->dst;
>     VolDetectContext *vd = ctx->priv;
>-    int nb_samples  = samples->nb_samples;
>-    int nb_channels = samples->ch_layout.nb_channels;
>-    int nb_planes   = nb_channels;
>-    int plane, i;
>-    int16_t *pcm;
>-
>-    if (!av_sample_fmt_is_planar(samples->format)) {
>-        nb_samples *= nb_channels;
>-        nb_planes = 1;
>-    }
>-    for (plane = 0; plane < nb_planes; plane++) {
>-        pcm = (int16_t *)samples->extended_data[plane];
>-        for (i = 0; i < nb_samples; i++)
>-            vd->histogram[pcm[i] + 0x8000]++;
>-    }
>+
>+    vd->process_samples(vd, samples);
> 
>     return ff_filter_frame(inlink->dst->outputs[0], samples);
> }
>@@ -73,6 +88,20 @@ static void print_stats(AVFilterContext *ctx)
>     uint64_t nb_samples = 0, power = 0, nb_samples_shift = 0, sum = 0;
>     uint64_t histdb[MAX_DB + 1] = { 0 };
> 
>+    if (!vd->nb_samples)
>+        return;
>+    if (vd->is_float) {
>+        av_log(ctx, AV_LOG_INFO, "n_samples: %" PRId64 "\n", vd->nb_samples);
>+        av_log(ctx, AV_LOG_INFO, "mean_volume: %.1f dB\n", -logdb(vd->sum2 / vd->nb_samples, AV_SAMPLE_FMT_FLT));
>+        av_log(ctx, AV_LOG_INFO, "max_volume: %.1f dB\n", -2.0*logdb(vd->max, AV_SAMPLE_FMT_FLT));
>+        for (i = 0; i < HISTOGRAM_SIZE_FLT && !vd->histogram[i]; i++);
>+        for (; i >= 0 && sum < vd->nb_samples / 1000; i++) {
>+            if (!vd->histogram[i])
>+                continue;
>+            av_log(ctx, AV_LOG_INFO, "histogram_%ddb: %" PRId64 "\n", MAX_DB_FLT - i, vd->histogram[i]);
>+            sum += vd->histogram[i];
>+        }
>+    } else {
>     for (i = 0; i < 0x10000; i++)
>         nb_samples += vd->histogram[i];
>     av_log(ctx, AV_LOG_INFO, "n_samples: %"PRId64"\n", nb_samples);
>@@ -92,26 +121,61 @@ static void print_stats(AVFilterContext *ctx)
>         return;
>     power = (power + nb_samples_shift / 2) / nb_samples_shift;
>     av_assert0(power <= 0x8000 * 0x8000);
>-    av_log(ctx, AV_LOG_INFO, "mean_volume: %.1f dB\n", -logdb(power));
>+    av_log(ctx, AV_LOG_INFO, "mean_volume: %.1f dB\n", -logdb((double)power, AV_SAMPLE_FMT_S16));
> 
>     max_volume = 0x8000;
>     while (max_volume > 0 && !vd->histogram[0x8000 + max_volume] &&
>                              !vd->histogram[0x8000 - max_volume])
>         max_volume--;
>-    av_log(ctx, AV_LOG_INFO, "max_volume: %.1f dB\n", -logdb(max_volume * max_volume));
>+    av_log(ctx, AV_LOG_INFO, "max_volume: %.1f dB\n", -logdb((double)(max_volume * max_volume), AV_SAMPLE_FMT_S16));
> 
>     for (i = 0; i < 0x10000; i++)
>-        histdb[(int)logdb((i - 0x8000) * (i - 0x8000))] += vd->histogram[i];
>+        histdb[(int)logdb((double)(i - 0x8000) * (i - 0x8000), AV_SAMPLE_FMT_S16)] += vd->histogram[i];
>     for (i = 0; i <= MAX_DB && !histdb[i]; i++);
>     for (; i <= MAX_DB && sum < nb_samples / 1000; i++) {
>-        av_log(ctx, AV_LOG_INFO, "histogram_%ddb: %"PRId64"\n", i, histdb[i]);
>+        av_log(ctx, AV_LOG_INFO, "histogram_%ddb: %"PRId64"\n", -i, histdb[i]);
>         sum += histdb[i];
>     }
>+    }
>+}
>+
>+static int config_output(AVFilterLink *outlink)
>+{
>+    AVFilterContext *ctx = outlink->src;
>+    VolDetectContext *vd = ctx->priv;
>+    size_t histogram_size;
>+
>+    vd->is_float = outlink->format == AV_SAMPLE_FMT_FLT ||
>+                   outlink->format == AV_SAMPLE_FMT_FLTP;
>+
>+    if (!vd->is_float) {
>+        /*
>+        * Number of samples at each PCM value.
>+        * Only used for integer formats.
>+        * For 16 bit signed PCM there are 65536.
>+        * histogram[0x8000 + i] is the number of samples at value i.
>+        * The extra element is there for symmetry.
>+        */
>+        histogram_size = HISTOGRAM_SIZE + 1;
>+    } else {
>+        /*
>+        * The histogram is used to store the number of samples at each dB
>+        * instead of the number of samples at each PCM value.
>+        */
>+        histogram_size = HISTOGRAM_SIZE_FLT + 1;
>+    }
>+    vd->histogram = av_calloc(histogram_size, sizeof(uint64_t));
>+    if (!vd->histogram)
>+        return AVERROR(ENOMEM);
>+    return 0;
> }
> 
> static av_cold void uninit(AVFilterContext *ctx)
> {
>+    VolDetectContext *vd = ctx->priv;
>     print_stats(ctx);
>+    if (vd->histogram)
>+        av_freep(&vd->histogram);
> }
> 
> static const AVFilterPad volumedetect_inputs[] = {
>@@ -122,6 +186,14 @@ static const AVFilterPad volumedetect_inputs[] = {
>     },
> };
> 
>+static const AVFilterPad volumedetect_outputs[] = {
>+    {
>+        .name         = "default",
>+        .type         = AVMEDIA_TYPE_AUDIO,
>+        .config_props = config_output,
>+    },
>+};
>+
> const AVFilter ff_af_volumedetect = {
>     .name          = "volumedetect",
>     .description   = NULL_IF_CONFIG_SMALL("Detect audio volume."),
>@@ -129,6 +201,9 @@ const AVFilter ff_af_volumedetect = {
>     .uninit        = uninit,
>     .flags         = AVFILTER_FLAG_METADATA_ONLY,
>     FILTER_INPUTS(volumedetect_inputs),
>-    FILTER_OUTPUTS(ff_audio_default_filterpad),
>-    FILTER_SAMPLEFMTS(AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P),
>+    FILTER_OUTPUTS(volumedetect_outputs),
>+    FILTER_SAMPLEFMTS(AV_SAMPLE_FMT_S16,
>+                      AV_SAMPLE_FMT_S16P,
>+                      AV_SAMPLE_FMT_FLT,
>+                      AV_SAMPLE_FMT_FLTP),
> };
Yigithan Yigit July 2, 2024, 11:46 a.m. UTC | #4
> On Jul 2, 2024, at 8:51 AM, Rémi Denis-Courmont <remi@remlab.net> wrote:
> 
> 
> 
> Le 2 juillet 2024 04:33:51 GMT+03:00, Yigithan Yigit <yigithanyigitdevel@gmail.com <mailto:yigithanyigitdevel@gmail.com>> a écrit :
>> ---
>> libavfilter/af_volumedetect.c | 139 ++++++++++++++++++++++++++--------
>> 1 file changed, 107 insertions(+), 32 deletions(-)
>> 
>> diff --git a/libavfilter/af_volumedetect.c b/libavfilter/af_volumedetect.c
>> index 327801a7f9..edd2d56f7a 100644
>> --- a/libavfilter/af_volumedetect.c
>> +++ b/libavfilter/af_volumedetect.c
>> @@ -1,5 +1,6 @@
>> /*
>> * Copyright (c) 2012 Nicolas George
>> + * Copyright (c) 2024 Yigithan Yigit - 32 Bit Float Audio Support
>> *
>> * This file is part of FFmpeg.
>> *
>> @@ -20,48 +21,62 @@
>> 
>> #include "libavutil/channel_layout.h"
>> #include "libavutil/avassert.h"
>> +#include "libavutil/mem.h"
>> #include "audio.h"
>> #include "avfilter.h"
>> #include "internal.h"
>> 
>> +#define MAX_DB_FLT 1024
>> #define MAX_DB 91
>> +#define HISTOGRAM_SIZE 0x10000
>> +#define HISTOGRAM_SIZE_FLT (MAX_DB_FLT*2)
>> +
>> +typedef struct VolDetectContext VolDetectContext;
>> 
>> typedef struct VolDetectContext {
>> -    /**
>> -     * Number of samples at each PCM value.
>> -     * histogram[0x8000 + i] is the number of samples at value i.
>> -     * The extra element is there for symmetry.
>> -     */
>> -    uint64_t histogram[0x10001];
>> +    uint64_t* histogram; ///< for integer number of samples at each PCM value, for float number of samples at each dB
>> +    uint64_t nb_samples; ///< number of samples
>> +    double sum2;         ///< sum of the squares of the samples
>> +    double max;          ///< maximum sample value
>> +    int is_float;        ///< true if the input is in floating point
>> +    void (*process_samples)(VolDetectContext *vd, AVFrame *samples);
>> } VolDetectContext;
>> 
>> -static inline double logdb(uint64_t v)
>> +static inline double logdb(double v, enum AVSampleFormat sample_fmt)
>> {
>> -    double d = v / (double)(0x8000 * 0x8000);
>> -    if (!v)
>> -        return MAX_DB;
>> -    return -log10(d) * 10;
>> +    if (sample_fmt == AV_SAMPLE_FMT_FLT) {
> 
> There's no point in doing this. You've already up-converted to double precision and do all the calculations in double precision. Maybe that's fine or maybe not, but either way, this doesn't look sensible.
> 
>> +        if (!v)
>> +            return MAX_DB_FLT;
>> +        return -log10(v) * 10;
>> +    } else {
>> +        double d = v / (double)(0x8000 * 0x8000);
>> +        if (!v)
>> +            return MAX_DB;
>> +        return -log10(d) * 10;
>> +    }
>> +}
>> +

If I understand your concerns correctly, we should have a function like this:

> static inline double logdb(double v, enum AVSampleFormat sample_fmt)
> {
>     if (!v)
>         return sample_fmt == AV_SAMPLE_FMT_FLT ? MAX_DB_FLT : MAX_DB;
> 
>     if (sample_fmt == AV_SAMPLE_FMT_S16)
>       v = ldexp(v, -30);
> 
>     return -log10(v) * 10;
> }

What do you think about that?

Thanks for the feedback
Yigithan
diff mbox series

Patch

diff --git a/libavfilter/af_volumedetect.c b/libavfilter/af_volumedetect.c
index 327801a7f9..edd2d56f7a 100644
--- a/libavfilter/af_volumedetect.c
+++ b/libavfilter/af_volumedetect.c
@@ -1,5 +1,6 @@ 
 /*
  * Copyright (c) 2012 Nicolas George
+ * Copyright (c) 2024 Yigithan Yigit - 32 Bit Float Audio Support
  *
  * This file is part of FFmpeg.
  *
@@ -20,48 +21,62 @@ 
 
 #include "libavutil/channel_layout.h"
 #include "libavutil/avassert.h"
+#include "libavutil/mem.h"
 #include "audio.h"
 #include "avfilter.h"
 #include "internal.h"
 
+#define MAX_DB_FLT 1024
 #define MAX_DB 91
+#define HISTOGRAM_SIZE 0x10000
+#define HISTOGRAM_SIZE_FLT (MAX_DB_FLT*2)
+
+typedef struct VolDetectContext VolDetectContext;
 
 typedef struct VolDetectContext {
-    /**
-     * Number of samples at each PCM value.
-     * histogram[0x8000 + i] is the number of samples at value i.
-     * The extra element is there for symmetry.
-     */
-    uint64_t histogram[0x10001];
+    uint64_t* histogram; ///< for integer number of samples at each PCM value, for float number of samples at each dB
+    uint64_t nb_samples; ///< number of samples
+    double sum2;         ///< sum of the squares of the samples
+    double max;          ///< maximum sample value
+    int is_float;        ///< true if the input is in floating point
+    void (*process_samples)(VolDetectContext *vd, AVFrame *samples);
 } VolDetectContext;
 
-static inline double logdb(uint64_t v)
+static inline double logdb(double v, enum AVSampleFormat sample_fmt)
 {
-    double d = v / (double)(0x8000 * 0x8000);
-    if (!v)
-        return MAX_DB;
-    return -log10(d) * 10;
+    if (sample_fmt == AV_SAMPLE_FMT_FLT) {
+        if (!v)
+            return MAX_DB_FLT;
+        return -log10(v) * 10;
+    } else {
+        double d = v / (double)(0x8000 * 0x8000);
+        if (!v)
+            return MAX_DB;
+        return -log10(d) * 10;
+    }
+}
+
+static void update_float_stats(VolDetectContext *vd, float *audio_data)
+{
+    double sample;
+    int idx;
+    if(!isfinite(*audio_data) || isnan(*audio_data))
+        return;
+    sample = fabsf(*audio_data);
+    if (sample > vd->max)
+        vd->max = sample;
+    vd->sum2 += sample * sample;
+    idx = (int)floorf(logdb(sample * sample, AV_SAMPLE_FMT_FLT)) + MAX_DB_FLT;
+    vd->histogram[idx]++;
+    vd->nb_samples++;
 }
 
 static int filter_frame(AVFilterLink *inlink, AVFrame *samples)
 {
     AVFilterContext *ctx = inlink->dst;
     VolDetectContext *vd = ctx->priv;
-    int nb_samples  = samples->nb_samples;
-    int nb_channels = samples->ch_layout.nb_channels;
-    int nb_planes   = nb_channels;
-    int plane, i;
-    int16_t *pcm;
-
-    if (!av_sample_fmt_is_planar(samples->format)) {
-        nb_samples *= nb_channels;
-        nb_planes = 1;
-    }
-    for (plane = 0; plane < nb_planes; plane++) {
-        pcm = (int16_t *)samples->extended_data[plane];
-        for (i = 0; i < nb_samples; i++)
-            vd->histogram[pcm[i] + 0x8000]++;
-    }
+
+    vd->process_samples(vd, samples);
 
     return ff_filter_frame(inlink->dst->outputs[0], samples);
 }
@@ -73,6 +88,20 @@  static void print_stats(AVFilterContext *ctx)
     uint64_t nb_samples = 0, power = 0, nb_samples_shift = 0, sum = 0;
     uint64_t histdb[MAX_DB + 1] = { 0 };
 
+    if (!vd->nb_samples)
+        return;
+    if (vd->is_float) {
+        av_log(ctx, AV_LOG_INFO, "n_samples: %" PRId64 "\n", vd->nb_samples);
+        av_log(ctx, AV_LOG_INFO, "mean_volume: %.1f dB\n", -logdb(vd->sum2 / vd->nb_samples, AV_SAMPLE_FMT_FLT));
+        av_log(ctx, AV_LOG_INFO, "max_volume: %.1f dB\n", -2.0*logdb(vd->max, AV_SAMPLE_FMT_FLT));
+        for (i = 0; i < HISTOGRAM_SIZE_FLT && !vd->histogram[i]; i++);
+        for (; i >= 0 && sum < vd->nb_samples / 1000; i++) {
+            if (!vd->histogram[i])
+                continue;
+            av_log(ctx, AV_LOG_INFO, "histogram_%ddb: %" PRId64 "\n", MAX_DB_FLT - i, vd->histogram[i]);
+            sum += vd->histogram[i];
+        }
+    } else {
     for (i = 0; i < 0x10000; i++)
         nb_samples += vd->histogram[i];
     av_log(ctx, AV_LOG_INFO, "n_samples: %"PRId64"\n", nb_samples);
@@ -92,26 +121,61 @@  static void print_stats(AVFilterContext *ctx)
         return;
     power = (power + nb_samples_shift / 2) / nb_samples_shift;
     av_assert0(power <= 0x8000 * 0x8000);
-    av_log(ctx, AV_LOG_INFO, "mean_volume: %.1f dB\n", -logdb(power));
+    av_log(ctx, AV_LOG_INFO, "mean_volume: %.1f dB\n", -logdb((double)power, AV_SAMPLE_FMT_S16));
 
     max_volume = 0x8000;
     while (max_volume > 0 && !vd->histogram[0x8000 + max_volume] &&
                              !vd->histogram[0x8000 - max_volume])
         max_volume--;
-    av_log(ctx, AV_LOG_INFO, "max_volume: %.1f dB\n", -logdb(max_volume * max_volume));
+    av_log(ctx, AV_LOG_INFO, "max_volume: %.1f dB\n", -logdb((double)(max_volume * max_volume), AV_SAMPLE_FMT_S16));
 
     for (i = 0; i < 0x10000; i++)
-        histdb[(int)logdb((i - 0x8000) * (i - 0x8000))] += vd->histogram[i];
+        histdb[(int)logdb((double)(i - 0x8000) * (i - 0x8000), AV_SAMPLE_FMT_S16)] += vd->histogram[i];
     for (i = 0; i <= MAX_DB && !histdb[i]; i++);
     for (; i <= MAX_DB && sum < nb_samples / 1000; i++) {
-        av_log(ctx, AV_LOG_INFO, "histogram_%ddb: %"PRId64"\n", i, histdb[i]);
+        av_log(ctx, AV_LOG_INFO, "histogram_%ddb: %"PRId64"\n", -i, histdb[i]);
         sum += histdb[i];
     }
+    }
+}
+
+static int config_output(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    VolDetectContext *vd = ctx->priv;
+    size_t histogram_size;
+
+    vd->is_float = outlink->format == AV_SAMPLE_FMT_FLT ||
+                   outlink->format == AV_SAMPLE_FMT_FLTP;
+
+    if (!vd->is_float) {
+        /*
+        * Number of samples at each PCM value.
+        * Only used for integer formats.
+        * For 16 bit signed PCM there are 65536.
+        * histogram[0x8000 + i] is the number of samples at value i.
+        * The extra element is there for symmetry.
+        */
+        histogram_size = HISTOGRAM_SIZE + 1;
+    } else {
+        /*
+        * The histogram is used to store the number of samples at each dB
+        * instead of the number of samples at each PCM value.
+        */
+        histogram_size = HISTOGRAM_SIZE_FLT + 1;
+    }
+    vd->histogram = av_calloc(histogram_size, sizeof(uint64_t));
+    if (!vd->histogram)
+        return AVERROR(ENOMEM);
+    return 0;
 }
 
 static av_cold void uninit(AVFilterContext *ctx)
 {
+    VolDetectContext *vd = ctx->priv;
     print_stats(ctx);
+    if (vd->histogram)
+        av_freep(&vd->histogram);
 }
 
 static const AVFilterPad volumedetect_inputs[] = {
@@ -122,6 +186,14 @@  static const AVFilterPad volumedetect_inputs[] = {
     },
 };
 
+static const AVFilterPad volumedetect_outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_AUDIO,
+        .config_props = config_output,
+    },
+};
+
 const AVFilter ff_af_volumedetect = {
     .name          = "volumedetect",
     .description   = NULL_IF_CONFIG_SMALL("Detect audio volume."),
@@ -129,6 +201,9 @@  const AVFilter ff_af_volumedetect = {
     .uninit        = uninit,
     .flags         = AVFILTER_FLAG_METADATA_ONLY,
     FILTER_INPUTS(volumedetect_inputs),
-    FILTER_OUTPUTS(ff_audio_default_filterpad),
-    FILTER_SAMPLEFMTS(AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P),
+    FILTER_OUTPUTS(volumedetect_outputs),
+    FILTER_SAMPLEFMTS(AV_SAMPLE_FMT_S16,
+                      AV_SAMPLE_FMT_S16P,
+                      AV_SAMPLE_FMT_FLT,
+                      AV_SAMPLE_FMT_FLTP),
 };